Ancestry PRS Load

Author

Sarah Urbut

Published

March 7, 2024

LDL PRS analysis

Code

library(dplyr)
library(broom)
library(tidyr)
library(ggplot2)
library(data.table)
library(ggridges)
library(lubridate)
library(stringr)



prs = fread("./GLGC_2021_ALL_LDL_PRS_weights_PRS-CS.sortedID.sscore")
#ancestry = fread("mgbb_data/GSA_53K.hg38.1KGPCA.tsv.gz")[, c(1:3)]
#s = stringr::str_split_fixed(ancestry$IID, pattern = "-", n = 2)

pheno=fread("na29.na32.5399smp_updated220923.pheno")

ldl = pheno[,c("IID","ldl1_adj")]

## we will use consent date
pheno$CAD_censor=pheno$MI

sum(na.omit(pheno$CAD_censor == 1))

[1] 3321

Code

sum(na.omit(pheno$CAD_censor == 0))

[1] 2009

Code

pheno_all_with_covs = pheno[,c("IID","Age","Sex","PC1","PC2","PC3","PC4","PC5","PC6","PC7","PC8","PC9","PC10","CAD_censor")]

totaldata=merge(pheno_all_with_covs,prs[,c("IID","SCORE1_AVG")],by="IID")
totaldata$PRS=totaldata$SCORE1_AVG
totaldata=left_join(totaldata,ldl,by = "IID")

Histograms

We note that the distributions of these PRS are skewed for non EUR pops but will normalize for our analysis:

Using residual PRS

Code

library(ggridges)
pcmod = lm(PRS ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10, data = totaldata)
totaldata$predictscore = predict(object = pcmod, newdata = totaldata)
totaldata$residscore = totaldata$PRS - totaldata$predictscore
totaldata$scaled_resid = scale(totaldata$residscore)
## because only one populatino
totaldata$pop_scaled_prs = scale(totaldata$residscore)
totaldata$Population="Saudi"


ggplot(totaldata, aes(x = residscore, y = Population, fill = Population)) +
  geom_density_ridges(alpha = 0.7) +
  scale_fill_brewer(palette = "Set1") +

  labs(title = "Density Distribution of Sum Scores",
       x = "Sum Score",
       y = "Population")

Picking joint bandwidth of 1.71e-08

Code

ggplot(totaldata, aes(x = pop_scaled_prs, y = Population, fill = Population)) +
    geom_density_ridges(alpha = 0.7) +
    scale_fill_brewer(palette = "Set1") +
    # Optional: Adds a nice theme suitable for ridgeline plots
    labs(title = "Density Distribution of Sum Scores: Scaled for all",
         x = "Sum Score",
         y = "Population")

Picking joint bandwidth of 0.161

Code

library(dplyr)
totaldata%>%group_by(Population)%>%summarize(quantile(pop_scaled_prs,0.25),median(pop_scaled_prs),mean(pop_scaled_prs),quantile(pop_scaled_prs,0.75))

# A tibble: 1 × 5
  Population `quantile(pop_scaled_prs, 0.25)` `median(pop_scaled_prs)`
  <chr>                                 <dbl>                    <dbl>
1 Saudi                                -0.663                   0.0114
# ℹ 2 more variables: `mean(pop_scaled_prs)` <dbl>,
#   `quantile(pop_scaled_prs, 0.75)` <dbl>

Table1

Code

library(table1)


Attaching package: 'table1'

The following objects are masked from 'package:base':

    units, units<-

Code

dat <- totaldata


dat$pop_scaled_prs <- as.numeric(as.character(round(dat$pop_scaled_prs,2)))

dat <- totaldata
dat$CAD_censor <- as.factor(dat$CAD_censor)
label(dat$Sex) <- "Sex"
label(dat$ldl1_adj) <- "LDL-C (adjusted for Statin)"
label(dat$CAD_censor) <- "CAD Ever"
label(dat$scaled_resid) <- "Overall Scaled Residual PRS"
label(dat$pop_scaled_prs) <- "Pop Scaled Residual PRS"
label(dat$Age) = "Age Enrolled"

# Continue with creating the table
table1(~Sex + ldl1_adj + CAD_censor + Age + round(pop_scaled_prs,4) + scaled_resid , data = dat)

	Overall (N=5399)
Sex
Mean (SD)	0.643 (0.479)
Median [Min, Max]	1.00 [0, 1.00]
LDL-C (adjusted for Statin)
Mean (SD)	136 (53.4)
Median [Min, Max]	127 [3.87, 895]
Missing	952 (17.6%)
CAD Ever
0	2009 (37.2%)
1	3321 (61.5%)
Missing	69 (1.3%)
Age Enrolled
Mean (SD)	54.8 (14.8)
Median [Min, Max]	56.0 [13.0, 102]
Pop Scaled Residual PRS
Mean (SD)	-0.0000000185 (1.00)
Median [Min, Max]	0.0114 [-3.62, 3.38]
Overall Scaled Residual PRS
Mean (SD)	-0.0000000000000000354 (1.00)
Median [Min, Max]	0.0114 [-3.62, 3.38]

Impact of PRS

To assess the impact of LDL-C PRS on CAD risk, you need to establish a two-step relationship: first, between LDL-C PRS and LDL-C levels (per 40 mg/dL change), and then between LDL-C PRS and CAD outcomes. The approach allows you to understand how changes in LDL-C levels attributed to genetic risk (as quantified by PRS) relate to CAD risk.

Step 1: Associate LDL-C PRS with LDL-C Levels (per 40 mg/dL)

Linear Regression for LDL-C Levels: Fit a linear regression model with LDL-C levels as the outcome and LDL-C PRS as the predictor. You might want to adjust for relevant covariates.
Rescale the Coefficient for LDL-C PRS: Adjust the coefficient from tAMR model to reflect a 40 mg/dL change in LDL-C.

Step 2: Associate Rescaled LDL-C PRS with CAD

Incorporate Rescaled LDL-C PRS into Cox Model: Use the rescaled LDL-C PRS from Step 1 as a predictor in a Cox proportional hazards model to analyze its relationship with CAD risk.

Code

library(dplyr)
library(survival)
# Assuming 'df' is your dataset

# List of unique populations
pop="Saudi"

# Initialize an empty list for storing models and results
cox_models <- list()
glm_models = list()


results <- data.frame(
  Population = character(),
  PRS_Mean = numeric(),
  PRS_SD = numeric(),
  Percent_male = numeric(),
  LDL_Coef_CI_Lower = numeric(),
  LDL_Coef_CI_Upper = numeric(),
  Population_Scale_Factor = numeric(),
  LDL_Scaled_Coef_CI_Lower = numeric(),
  LDL_Scaled_Coef_CI_Upper = numeric(),
  HR = numeric(),
  HR_CI_Lower = numeric(),
  HR_CI_Upper = numeric(),
  OR = numeric(),
  OR_LCI = numeric(),
  OR_UCI = numeric(),
  stringsAsFactors = FALSE
)

  pop_data=totaldata
  pop_data$PRS = pop_data$pop_scaled_prs
  ##print("PRS Summary")
  #print(summary(pop_data$PRS))
  
  
  ldl_model <-
    lm(
      ldl1_adj ~ PRS + as.numeric(Age) + as.factor(Sex) + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10,
      data = pop_data[!is.na(pop_data$ldl1_adj),]
    )
  
  ldl_coef_ci <- confint(ldl_model, "PRS")
  #print(paste("LDL_coef_CI for population:", pop))
  if ("PRS" %in% names(coef(ldl_model))) {
    #print(confint(ldl_model, parm = "PRS"))
  } else {
    #print("PRS coefficient is not available in the model.")
  }

NULL

Code

  # Calculate the PRS scaling factor
  prs_scale_factor <- 40 / coef(ldl_model)["PRS"]
  #print(paste0(pop,":",prs_scale_factor))
  #prs_scale_factor=1
  # Scale the LDL-C PRS so that now the scaled translates to an increase of 40 mgDL
  pop_data$scaled_ldl_prs <- pop_data$PRS / prs_scale_factor
  
  
  ldl_model_2 <-
    lm(
      ldl1_adj ~ scaled_ldl_prs + Age + Sex + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10,
      data = pop_data[!is.na(pop_data$ldl1_adj),]
    )
  
  ldl_scaled_coef_ci <- confint(ldl_model_2, "scaled_ldl_prs")
  
  
 
  
  
  glm_model <- glm(
    CAD_censor ~ scaled_ldl_prs + Age+Sex + PC1 +
      PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10,
    data = pop_data,
    family = "binomial"
  )
  glm_models[[pop]] <- glm_model
  glm_models[[pop]] <- glm_model
  log_or = summary(glm_model)$coefficients["scaled_ldl_prs", "Estimate"]
  log_or_se = summary(glm_model)$coefficients["scaled_ldl_prs", "Std. Error"]
  or = round(exp(log_or), 2)
  lci = round(exp(log_or - 1.96 * log_or_se), 2)
  uci = round(exp(log_or + 1.96 * log_or_se), 2)
  
  glmci = confint(glm_model)

Waiting for profiling to be done...

Code

  glm_summary <- summary(glm_model)
  
  # Compile results into the data frame
  # Append to results
  results <-
    data.frame(
      Population = pop,
      PRS_Mean = mean(pop_data$PRS),
      PRS_SD = sd(pop_data$PRS),
      Percent_male = mean(pop_data$Sex == 1),
      
      
      LDL_Coef_CI_Lower = ldl_coef_ci[1],
      LDL_Coef_CI_Upper = ldl_coef_ci[2],
      Population_Scale_Factor = prs_scale_factor,
      LDL_Scaled_Coef_CI_Lower = ldl_scaled_coef_ci[1],
      # Adjust according to your calculations
      LDL_Scaled_Coef_CI_Upper = ldl_scaled_coef_ci[2],
      OR = or,
      OR_LCI = exp(glmci['scaled_ldl_prs', 1]),
      OR_UCI = exp(glmci['scaled_ldl_prs', 2])
    )
  



results

    Population      PRS_Mean PRS_SD Percent_male LDL_Coef_CI_Lower
PRS      Saudi -3.535379e-17      1    0.6432673          8.403882
    LDL_Coef_CI_Upper Population_Scale_Factor LDL_Scaled_Coef_CI_Lower
PRS          11.50445                4.018418                 33.77031
    LDL_Scaled_Coef_CI_Upper   OR   OR_LCI   OR_UCI
PRS                 46.22969 1.69 1.315959 2.180119

Code

saveRDS(results, "results_saudi.rds")