Ancestry PRS Load

Author

Sarah Urbut

Published

March 7, 2024

LDL PRS analyais

Code
library(dplyr)
library(broom)
library(tidyr)
library(ggplot2)
library(data.table)
library(ggridges)
library(lubridate)
library(stringr)
library(UpSetR)


prs = fread("~/Dropbox/yangstuff/GSA_53K_MID_407.txt")
ancestry=fread("~/Dropbox/yangstuff/HGDP_1KG_real.txt.gz")

s= stringr::str_split_fixed(ancestry$IID, pattern = "-", n = 2)


ldlwithinon = fread("mgbb_data/mgbb_ldl_consentdate.csv")
linker = fread("mgbb_data/all_cvd_phenos.mgbb.tsv")
ldl = merge(linker[, c("IID", "EMPI", "Subject_Id")], ldlwithinon, by =
              "EMPI", all.x = T)

## we will use consent date

pheno_incident = fread("mgbb_data/mgbb_incident_mi_consentdate.csv")
pheno_incident$incident_mi_date = pheno_incident$Date
pheno_has = fread("mgbb_data/mgbb_prevalent_mi_consentdate.csv")
pheno_has$prevalent_mi_date = pheno_has$Date



sum(pheno_incident$incident_mi == 1)
[1] 21358
Code
sum(pheno_has$prevalent_mi == 1)
[1] 11293
Code
pheno_has$inbiobank = ifelse(pheno_has$Subject_Id %in% s[, 2], 1, 0)
pheno_has %>% group_by(inbiobank) %>% summarise(mean(prevalent_mi))
# A tibble: 2 × 2
  inbiobank `mean(prevalent_mi)`
      <dbl>                <dbl>
1         0               0.0657
2         1               0.101 
Code
pheno_incident$inbiobank = ifelse(pheno_incident$Subject_Id %in% s[, 2], 1, 0)
pheno_incident %>% group_by(inbiobank) %>% summarise(mean(incident_mi))
# A tibble: 2 × 2
  inbiobank `mean(incident_mi)`
      <dbl>               <dbl>
1         0               0.121
2         1               0.198
Code
baseline_covs = fread("mgbb_data/mgbb_pce_consent_date.csv")
pheno_all = left_join(pheno_incident, pheno_has[, c("Subject_Id", "prevalent_mi_date", "prevalent_mi")], by = "Subject_Id")

pheno_all_with_covs = left_join(pheno_all, baseline_covs[, c("EMPI",
                                                             "mgbb_consent_date",
                                                             "enrollment_age",
                                                             "sex_imputed",
                                                             "lipidmed")])


## we can see that the rates are much higher for biobank particants
pheno_all$either = ifelse((pheno_all$prevalent_mi + pheno_all$incident_mi) >
                            0, 1, 0)
pheno_all %>% group_by(inbiobank) %>% summarise(mean(either))
# A tibble: 2 × 2
  inbiobank `mean(either)`
      <dbl>          <dbl>
1         0          0.156
2         1          0.237
Code
pheno_dat = merge(pheno_all_with_covs, ldl, by = "EMPI")
final_dat = merge(ancestry,pheno_dat, by = "IID")


final_dat <- final_dat %>%
  mutate(
    mgbb_consent_date = as.Date(mgbb_consent_date, "%Y-%m-%d"),
    incident_cad_date = as.Date(incident_mi_date, "%Y-%m-%d"),
    prevalent_cad_date = as.Date(prevalent_mi_date, "%Y-%m-%d"),
    birthdate = mgbb_consent_date - (enrollment_age * 365.25),
    # Approximating leap years
    age_at_incident_cad = as.numeric(difftime(incident_mi_date, birthdate, units = "days")) / 365.25,
    age_at_prevalent_cad = ifelse(is.na(prevalent_mi_date), NA, as.numeric(
      difftime(prevalent_cad_date, birthdate, units = "days")
    ) / 365.25)
  )


final_dat <- final_dat %>%
  mutate(
    # Calculate censor flag
    CAD_censor = ifelse(incident_mi == 1 | prevalent_mi == 1, 1, 0),
    # Calculate age at censoring or follow-up
    CAD_censor_age = case_when(
      # Use age at prevalent CAD if prevalent CAD exists!is.na(age_at_prevalent_cad) &
      prevalent_mi == 1 ~ age_at_prevalent_cad,
      # Use age at incident CAD if no prevalent available!is.na(age_at_incident_cad) &
      incident_mi == 1 ~ age_at_incident_cad,
      # Calculate age at follow-up end date if no CAD
      TRUE ~ as.numeric(difftime(as.Date("2023-06-15"), birthdate, units = "days")) / 365.25
    )
  )


### note that only 26000 people had LDL
final_dat$ldladj = ifelse(final_dat$lipidmed == "yes",
                          final_dat$ldlc * 5 / 4,
                          final_dat$ldlc)

Phenotyping

We validated previously that MGB pheno implementation of Hard CAD is matched to HARD Cad. So we will now use that here to be c/w AoU HARD CAD pheno.

  • Because we are using age as time scale (or GLM with ‘evere event’ follow up time is really just time from birth to death.

  • We know that time is inhomogenous: even if we use OR instead of HR, the OR will tend to decrease over time because there are more non-genetically mediated CAD events

Now we want to calculate ancestry specific thresholds for PRS. Let’s get the PRS for African, European, and Asian ancestry that causes a 40 mmHg increase in LDL cholesterol in the UK Biobank using the AFR, SAS, and EUR individauls present.

Code
MID_dat=final_dat[final_dat$knn_super%in%"MID",]

final_dat%>%group_by(knn_super)%>%summarise("cad_rate"=mean(CAD_censor),"mean_ldl"=mean(na.omit(ldlc)),"%M"=mean(na.omit(sex_imputed=="male")),"meanAge"=as.numeric(mean(enrollment_age)),"population_size"=length(knn_super))
# A tibble: 6 × 6
  knn_super cad_rate mean_ldl  `%M` meanAge population_size
  <chr>        <dbl>    <dbl> <dbl>   <dbl>           <int>
1 AFR          0.402     104. 0.375    48.9            2650
2 AMR          0.280     102. 0.341    42.7            3946
3 CSA          0.150     101. 0.527    41.6             566
4 EAS          0.101     102. 0.363    41.3            1223
5 EUR          0.227     102. 0.457    55.1           44373
6 MID          0.302     103. 0.528    51.9             517

Histograms

We note that the distributions of these PRS are skewed for non EUR pops but will normalize for our analysis:

Code
prs_merged = inner_join(ancestry, prs, by = c("IID" = "IID"))
prs_MID <- prs_merged %>%
  filter(knn_super == "MID") %>%
  select(IID, combined6 )


MID_dat = merge(prs_MID, final_dat[final_dat$knn_super %in% "MID", ], by = "IID")
totaldata=MID_dat
totaldata$PRS=totaldata$combined6


ggplot(totaldata, aes(x = PRS, y = knn_super, fill = knn_super)) +
  geom_density_ridges(alpha = 0.7) +
  scale_fill_brewer(palette = "Set1") +
  theme_ridges() + # Optional: Adds a nice theme suitable for ridgeline plots
  labs(title = "Density Distribution of Sum Scores",
       x = "Sum Score",
       y = "Population")
Picking joint bandwidth of 2.94

Using residual PRS

Code
pcmod = lm(PRS ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10, data = totaldata)
totaldata$predictscore = predict(object = pcmod, newdata = totaldata)
totaldata$residscore = totaldata$PRS - totaldata$predictscore
totaldata$Population = as.factor(totaldata$knn_super)
totaldata$scaled_resid = scale(totaldata$residscore)
totaldata$pop_scaled_prs = scale(totaldata$residscore)

ggplot(totaldata, aes(x = residscore, y = Population, fill = Population)) +
  geom_density_ridges(alpha = 0.7) +
  scale_fill_brewer(palette = "Set1") +
  theme_ridges() + # Optional: Adds a nice theme suitable for ridgeline plots
  labs(title = "Density Distribution of Sum Scores",
       x = "Sum Score",
       y = "Population")
Picking joint bandwidth of 2.84

Code
library(dplyr)
totaldata%>%group_by(Population)%>%summarize(quantile(pop_scaled_prs,0.25),median(pop_scaled_prs),mean(pop_scaled_prs),quantile(pop_scaled_prs,0.75))
# A tibble: 1 × 5
  Population `quantile(pop_scaled_prs, 0.25)` `median(pop_scaled_prs)`
  <fct>                                 <dbl>                    <dbl>
1 MID                                  -0.638                  -0.0216
# ℹ 2 more variables: `mean(pop_scaled_prs)` <dbl>,
#   `quantile(pop_scaled_prs, 0.75)` <dbl>

Let’s look at the departure from PH:

Code
library(survminer)
Loading required package: ggpubr
Code
library(survival)

Attaching package: 'survival'
The following object is masked from 'package:survminer':

    myeloma
Code
fit3 <- survfit( Surv(time = CAD_censor_age, CAD_censor) ~ Population,data=totaldata)
ggsurvplot(fit3, data = totaldata, conf.int = FALSE,fun = "event",
risk.table = TRUE, risk.table.col="strata",
ggtheme = theme_classic(),xlab="Age",palette = "nejm",xlim=c(0,90))

Code
totaldata%>%group_by(Population)%>%summarize("event_age"=mean(CAD_censor_age[CAD_censor==1]),"censor_age"=mean(CAD_censor_age[CAD_censor==0]),"event_rate"=mean(CAD_censor==1),"oldest_event"=max(CAD_censor_age[CAD_censor==1]),"oldest_control"=max(CAD_censor_age[CAD_censor==1]))
# A tibble: 1 × 6
  Population event_age censor_age event_rate oldest_event oldest_control
  <fct>          <dbl>      <dbl>      <dbl>        <dbl>          <dbl>
1 MID             60.1       54.8      0.302         91.1           91.1

Table1

Code
totaldata$followup=round(difftime(totaldata$CAD_censor_age,totaldata$enrollment_age),2)
library(table1)

Attaching package: 'table1'
The following objects are masked from 'package:base':

    units, units<-
Code
totaldata$followup <- round(difftime(totaldata$CAD_censor_age, totaldata$enrollment_age), 2)
dat <- totaldata

label(dat$sex_imputed)="sex"
label(dat$CAD_censor)="CAD"
dat$CAD_censor=as.factor(dat$CAD_censor)
#table1(~sex_imputed+ldladj+CAD_censor+enrollment_age+pop_scaled_prs+scaled_resid|knn_super,data = totaldata)
# Format the pop_scaled_prs variable to 2 decimal places
dat$pop_scaled_prs <- as.numeric(as.character(round(dat$pop_scaled_prs,2)))

dat <- totaldata
dat$CAD_censor <- as.factor(dat$CAD_censor)
label(dat$sex_imputed) <- "Sex"
label(dat$ldladj) <- "LDL-C (adjusted for Statin)"
label(dat$CAD_censor) <- "CAD Ever"
label(dat$scaled_resid) <- "Overall Scaled Residual PRS"
label(dat$pop_scaled_prs) <- "Pop Scaled Residual PRS"
label(dat$enrollment_age) = "Age Enrolled"

# Continue with creating the table
table1(~sex_imputed + ldladj + CAD_censor + enrollment_age + round(pop_scaled_prs,4) + scaled_resid | knn_super, data = dat)
MID
(N=407)
Overall
(N=407)
Sex
female 183 (45.0%) 183 (45.0%)
male 224 (55.0%) 224 (55.0%)
LDL-C (adjusted for Statin)
Mean (SD) 111 (37.9) 111 (37.9)
Median [Min, Max] 110 [15.0, 239] 110 [15.0, 239]
Missing 188 (46.2%) 188 (46.2%)
CAD Ever
0 284 (69.8%) 284 (69.8%)
1 123 (30.2%) 123 (30.2%)
Age Enrolled
Mean (SD) 52.2 (18.3) 52.2 (18.3)
Median [Min, Max] 53.9 [18.5, 91.1] 53.9 [18.5, 91.1]
Pop Scaled Residual PRS
Mean (SD) -0.000000737 (1.00) -0.000000737 (1.00)
Median [Min, Max] -0.0216 [-2.58, 2.85] -0.0216 [-2.58, 2.85]
Overall Scaled Residual PRS
Mean (SD) 0.0000000000000000300 (1.00) 0.0000000000000000300 (1.00)
Median [Min, Max] -0.0216 [-2.58, 2.85] -0.0216 [-2.58, 2.85]

Impact of PRS

To assess the impact of LDL-C PRS on CAD risk, you need to establish a two-step relationship: first, between LDL-C PRS and LDL-C levels (per 40 mg/dL change), and then between LDL-C PRS and CAD outcomes. TAMR approach allows you to understand how changes in LDL-C levels attributed to genetic risk (as quantified by PRS) relate to CAD risk. Here’s how you can approach tAMR analysis for both European (EUR) and African (AFR) populations:

Step 1: Associate LDL-C PRS with LDL-C Levels (per 40 mg/dL)

  1. Linear Regression for LDL-C Levels: Fit a linear regression model with LDL-C levels as the outcome and LDL-C PRS as the predictor. You might want to adjust for relevant covariates.

  2. Rescale the Coefficient for LDL-C PRS: Adjust the coefficient from tAMR model to reflect a 40 mg/dL change in LDL-C.

Step 2: Associate Rescaled LDL-C PRS with CAD

  1. Incorporate Rescaled LDL-C PRS into Cox Model: Use the rescaled LDL-C PRS from Step 1 as a predictor in a Cox proportional hazards model to analyze its relationship with CAD risk.

  2. Perform Analysis for Both EUR and AFR Populations: Repeat these steps separately for EUR and AFR populations to capture population-specific effects.

Here’s an example in R for the European population. The same steps would be repeated for the African population:

Code
library(dplyr)
library(survival)
# Assuming 'df' is your dataset

# List of unique populations
populations <- c("MID")

# Initialize an empty list for storing models and results
cox_models <- list()
glm_models = list()


results <- data.frame(
  Population = character(),
  PRS_Mean = numeric(),
  PRS_SD = numeric(),
  Percent_male = numeric(),
  LDL_Coef_CI_Lower = numeric(),
  LDL_Coef_CI_Upper = numeric(),
  Population_Scale_Factor = numeric(),
  LDL_Scaled_Coef_CI_Lower = numeric(),
  LDL_Scaled_Coef_CI_Upper = numeric(),
  HR = numeric(),
  HR_CI_Lower = numeric(),
  HR_CI_Upper = numeric(),
  OR = numeric(),
  OR_LCI = numeric(),
  OR_UCI = numeric(),
  stringsAsFactors = FALSE
)


pop="MID"
  # Subset data for the specific population
  pop_data = totaldata[totaldata$knn_super %in% pop,]
  
  # scale the residual score per pouplation
  #pop_data$PRS = scale(pop_data$residscore)
  pop_data$PRS = pop_data$pop_scaled_prs
  ##print("PRS Summary")
  #print(summary(pop_data$PRS))
  
  
  ldl_model <-
    lm(
      ldladj ~ PRS + as.numeric(enrollment_age) + as.factor(sex_imputed) + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10,
      data = pop_data[!is.na(pop_data$ldladj),]
    )
  
  ldl_coef_ci <- confint(ldl_model, "PRS")
  #print(paste("LDL_coef_CI for population:", pop))
  if ("PRS" %in% names(coef(ldl_model))) {
    #print(confint(ldl_model, parm = "PRS"))
  } else {
    #print("PRS coefficient is not available in the model.")
  }
NULL
Code
  # Calculate the PRS scaling factor
  prs_scale_factor <- 40 / coef(ldl_model)["PRS"]
  #print(paste0(pop,":",prs_scale_factor))
  #prs_scale_factor=1
  # Scale the LDL-C PRS so that now the scaled translates to an increase of 40 mgDL
  pop_data$scaled_ldl_prs <- pop_data$PRS / prs_scale_factor
  
  
  ldl_model_2 <-
    lm(
      ldladj ~ scaled_ldl_prs + as.numeric(enrollment_age) + as.factor(sex_imputed) + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10,
      data = pop_data[!is.na(pop_data$ldladj),]
    )
  
  ldl_scaled_coef_ci <- confint(ldl_model_2, "scaled_ldl_prs")
  
  
  #print(summary(pop_data$scaled_ldl_prs))
  # Step 2: Cox Model for CAD
  ## since age is time scale don't use
  cox_model <-
    coxph(
      Surv(CAD_censor_age, CAD_censor) ~ scaled_ldl_prs
      + as.factor(sex_imputed) + PC1 + PC2 + PC3 + PC4 +
        PC5 + PC6 + PC7 + PC8 + PC9 + PC10,
      data = pop_data
    )
  #cox_model <- glm(Cad_Sta ~ scaled_ldl_prs+sex, data = pop_data,family = "binomial")
  cox_models[[pop]] <- cox_model
  
  
  
  glm_model <- glm(
    CAD_censor ~ scaled_ldl_prs + enrollment_age+as.factor(sex_imputed) + PC1 +
      PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10,
    data = pop_data,
    family = "binomial"
  )
  glm_models[[pop]] <- glm_model
  glm_models[[pop]] <- glm_model
  log_or = summary(glm_model)$coefficients["scaled_ldl_prs", "Estimate"]
  log_or_se = summary(glm_model)$coefficients["scaled_ldl_prs", "Std. Error"]
  or = round(exp(log_or), 2)
  lci = round(exp(log_or - 1.96 * log_or_se), 2)
  uci = round(exp(log_or + 1.96 * log_or_se), 2)
  
  glmci = confint(glm_model)
Waiting for profiling to be done...
Code
  # Extract summary stats
  cox_summary <- summary(cox_model)
  glm_summary <- summary(glm_model)
  
  # Compile results into the data frame
  # Append to results
  results <- rbind(
    results,
    data.frame(
      Population = pop,
      PRS_Mean = mean(pop_data$PRS),
      PRS_SD = sd(pop_data$PRS),
      Percent_male = mean(pop_data$sex_imputed == "male"),
      
      
      LDL_Coef_CI_Lower = ldl_coef_ci[1],
      LDL_Coef_CI_Upper = ldl_coef_ci[2],
      Population_Scale_Factor = prs_scale_factor,
      LDL_Scaled_Coef_CI_Lower = ldl_scaled_coef_ci[1],
      # Adjust according to your calculations
      LDL_Scaled_Coef_CI_Upper = ldl_scaled_coef_ci[2],
      HR = cox_summary$conf.int["scaled_ldl_prs", 1],
      HR_CI_Lower = cox_summary$conf.int["scaled_ldl_prs", 3],
      # Assuming 'cox_summary' is properly defined
      HR_CI_Upper = cox_summary$conf.int["scaled_ldl_prs", 4],
      # Assuming 'cox_summary' is properly defined
      OR = or,
      OR_LCI = exp(glmci['scaled_ldl_prs', 1]),
      OR_UCI = exp(glmci['scaled_ldl_prs', 2])
    )
  )


results
    Population     PRS_Mean PRS_SD Percent_male LDL_Coef_CI_Lower
PRS        MID 3.000603e-17      1    0.5503686         0.3265832
    LDL_Coef_CI_Upper Population_Scale_Factor LDL_Scaled_Coef_CI_Lower
PRS          10.37096                7.478351                 2.442304
    LDL_Scaled_Coef_CI_Upper       HR HR_CI_Lower HR_CI_Upper   OR    OR_LCI
PRS                  77.5577 3.710259   0.9753146    14.11444 5.61 0.9612839
      OR_UCI
PRS 33.78524
Code
saveRDS(results, "results_mgb_mid.rds")

# Access and summarize the Cox models for each population
lapply(cox_models, function(x) {
  exp(coef(x)["scaled_ldl_prs"])
})
$MID
scaled_ldl_prs 
      3.710259