Ancestry PRS Load

Author

Sarah Urbut

Published

March 7, 2024

LDL PRS analyais

Code

library(dplyr)
library(broom)
library(tidyr)
library(ggplot2)
library(data.table)
library(ggridges)
library(lubridate)
library(stringr)
library(UpSetR)


prs = fread("~/Dropbox/yangstuff/GSA_53K_MID_407.txt")
ancestry=fread("~/Dropbox/yangstuff/HGDP_1KG_real.txt.gz")

s= stringr::str_split_fixed(ancestry$IID, pattern = "-", n = 2)


ldlwithinon = fread("mgbb_data/mgbb_ldl_consentdate.csv")
linker = fread("mgbb_data/all_cvd_phenos.mgbb.tsv")
ldl = merge(linker[, c("IID", "EMPI", "Subject_Id")], ldlwithinon, by =
              "EMPI", all.x = T)

## we will use consent date

pheno_incident = fread("mgbb_data/mgbb_incident_mi_consentdate.csv")
pheno_incident$incident_mi_date = pheno_incident$Date
pheno_has = fread("mgbb_data/mgbb_prevalent_mi_consentdate.csv")
pheno_has$prevalent_mi_date = pheno_has$Date



sum(pheno_incident$incident_mi == 1)

[1] 21358

Code

sum(pheno_has$prevalent_mi == 1)

[1] 11293

Code

pheno_has$inbiobank = ifelse(pheno_has$Subject_Id %in% s[, 2], 1, 0)
pheno_has %>% group_by(inbiobank) %>% summarise(mean(prevalent_mi))

# A tibble: 2 × 2
  inbiobank `mean(prevalent_mi)`
      <dbl>                <dbl>
1         0               0.0657
2         1               0.101

Code

pheno_incident$inbiobank = ifelse(pheno_incident$Subject_Id %in% s[, 2], 1, 0)
pheno_incident %>% group_by(inbiobank) %>% summarise(mean(incident_mi))

# A tibble: 2 × 2
  inbiobank `mean(incident_mi)`
      <dbl>               <dbl>
1         0               0.121
2         1               0.198

Code

baseline_covs = fread("mgbb_data/mgbb_pce_consent_date.csv")
pheno_all = left_join(pheno_incident, pheno_has[, c("Subject_Id", "prevalent_mi_date", "prevalent_mi")], by = "Subject_Id")

pheno_all_with_covs = left_join(pheno_all, baseline_covs[, c("EMPI",
                                                             "mgbb_consent_date",
                                                             "enrollment_age",
                                                             "sex_imputed",
                                                             "lipidmed")])


## we can see that the rates are much higher for biobank particants
pheno_all$either = ifelse((pheno_all$prevalent_mi + pheno_all$incident_mi) >
                            0, 1, 0)
pheno_all %>% group_by(inbiobank) %>% summarise(mean(either))

# A tibble: 2 × 2
  inbiobank `mean(either)`
      <dbl>          <dbl>
1         0          0.156
2         1          0.237

Code

pheno_dat = merge(pheno_all_with_covs, ldl, by = "EMPI")
final_dat = merge(ancestry,pheno_dat, by = "IID")


final_dat <- final_dat %>%
  mutate(
    mgbb_consent_date = as.Date(mgbb_consent_date, "%Y-%m-%d"),
    incident_cad_date = as.Date(incident_mi_date, "%Y-%m-%d"),
    prevalent_cad_date = as.Date(prevalent_mi_date, "%Y-%m-%d"),
    birthdate = mgbb_consent_date - (enrollment_age * 365.25),
    # Approximating leap years
    age_at_incident_cad = as.numeric(difftime(incident_mi_date, birthdate, units = "days")) / 365.25,
    age_at_prevalent_cad = ifelse(is.na(prevalent_mi_date), NA, as.numeric(
      difftime(prevalent_cad_date, birthdate, units = "days")
    ) / 365.25)
  )


final_dat <- final_dat %>%
  mutate(
    # Calculate censor flag
    CAD_censor = ifelse(incident_mi == 1 | prevalent_mi == 1, 1, 0),
    # Calculate age at censoring or follow-up
    CAD_censor_age = case_when(
      # Use age at prevalent CAD if prevalent CAD exists!is.na(age_at_prevalent_cad) &
      prevalent_mi == 1 ~ age_at_prevalent_cad,
      # Use age at incident CAD if no prevalent available!is.na(age_at_incident_cad) &
      incident_mi == 1 ~ age_at_incident_cad,
      # Calculate age at follow-up end date if no CAD
      TRUE ~ as.numeric(difftime(as.Date("2023-06-15"), birthdate, units = "days")) / 365.25
    )
  )


### note that only 26000 people had LDL
final_dat$ldladj = ifelse(final_dat$lipidmed == "yes",
                          final_dat$ldlc * 5 / 4,
                          final_dat$ldlc)

Phenotyping

We validated previously that MGB pheno implementation of Hard CAD is matched to HARD Cad. So we will now use that here to be c/w AoU HARD CAD pheno.

Because we are using age as time scale (or GLM with ‘evere event’ follow up time is really just time from birth to death.
We know that time is inhomogenous: even if we use OR instead of HR, the OR will tend to decrease over time because there are more non-genetically mediated CAD events

Now we want to calculate ancestry specific thresholds for PRS. Let’s get the PRS for African, European, and Asian ancestry that causes a 40 mmHg increase in LDL cholesterol in the UK Biobank using the AFR, SAS, and EUR individauls present.

Code

MID_dat=final_dat[final_dat$knn_super%in%"MID",]

final_dat%>%group_by(knn_super)%>%summarise("cad_rate"=mean(CAD_censor),"mean_ldl"=mean(na.omit(ldlc)),"%M"=mean(na.omit(sex_imputed=="male")),"meanAge"=as.numeric(mean(enrollment_age)),"population_size"=length(knn_super))

# A tibble: 6 × 6
  knn_super cad_rate mean_ldl  `%M` meanAge population_size
  <chr>        <dbl>    <dbl> <dbl>   <dbl>           <int>
1 AFR          0.402     104. 0.375    48.9            2650
2 AMR          0.280     102. 0.341    42.7            3946
3 CSA          0.150     101. 0.527    41.6             566
4 EAS          0.101     102. 0.363    41.3            1223
5 EUR          0.227     102. 0.457    55.1           44373
6 MID          0.302     103. 0.528    51.9             517

Histograms

We note that the distributions of these PRS are skewed for non EUR pops but will normalize for our analysis:

Code

prs_merged = inner_join(ancestry, prs, by = c("IID" = "IID"))
prs_MID <- prs_merged %>%
  filter(knn_super == "MID") %>%
  select(IID, combined6 )


MID_dat = merge(prs_MID, final_dat[final_dat$knn_super %in% "MID", ], by = "IID")
totaldata=MID_dat
totaldata$PRS=totaldata$combined6


ggplot(totaldata, aes(x = PRS, y = knn_super, fill = knn_super)) +
  geom_density_ridges(alpha = 0.7) +
  scale_fill_brewer(palette = "Set1") +
  theme_ridges() + # Optional: Adds a nice theme suitable for ridgeline plots
  labs(title = "Density Distribution of Sum Scores",
       x = "Sum Score",
       y = "Population")

Picking joint bandwidth of 2.94

Using residual PRS

Code

pcmod = lm(PRS ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10, data = totaldata)
totaldata$predictscore = predict(object = pcmod, newdata = totaldata)
totaldata$residscore = totaldata$PRS - totaldata$predictscore
totaldata$Population = as.factor(totaldata$knn_super)
totaldata$scaled_resid = scale(totaldata$residscore)
totaldata$pop_scaled_prs = scale(totaldata$residscore)

ggplot(totaldata, aes(x = residscore, y = Population, fill = Population)) +
  geom_density_ridges(alpha = 0.7) +
  scale_fill_brewer(palette = "Set1") +
  theme_ridges() + # Optional: Adds a nice theme suitable for ridgeline plots
  labs(title = "Density Distribution of Sum Scores",
       x = "Sum Score",
       y = "Population")

Picking joint bandwidth of 2.84

Code

library(dplyr)
totaldata%>%group_by(Population)%>%summarize(quantile(pop_scaled_prs,0.25),median(pop_scaled_prs),mean(pop_scaled_prs),quantile(pop_scaled_prs,0.75))

# A tibble: 1 × 5
  Population `quantile(pop_scaled_prs, 0.25)` `median(pop_scaled_prs)`
  <fct>                                 <dbl>                    <dbl>
1 MID                                  -0.638                  -0.0216
# ℹ 2 more variables: `mean(pop_scaled_prs)` <dbl>,
#   `quantile(pop_scaled_prs, 0.75)` <dbl>

Let’s look at the departure from PH:

Code

library(survminer)

Loading required package: ggpubr

Code

library(survival)


Attaching package: 'survival'

The following object is masked from 'package:survminer':

    myeloma

Code

fit3 <- survfit( Surv(time = CAD_censor_age, CAD_censor) ~ Population,data=totaldata)
ggsurvplot(fit3, data = totaldata, conf.int = FALSE,fun = "event",
risk.table = TRUE, risk.table.col="strata",
ggtheme = theme_classic(),xlab="Age",palette = "nejm",xlim=c(0,90))

Code

totaldata%>%group_by(Population)%>%summarize("event_age"=mean(CAD_censor_age[CAD_censor==1]),"censor_age"=mean(CAD_censor_age[CAD_censor==0]),"event_rate"=mean(CAD_censor==1),"oldest_event"=max(CAD_censor_age[CAD_censor==1]),"oldest_control"=max(CAD_censor_age[CAD_censor==1]))

# A tibble: 1 × 6
  Population event_age censor_age event_rate oldest_event oldest_control
  <fct>          <dbl>      <dbl>      <dbl>        <dbl>          <dbl>
1 MID             60.1       54.8      0.302         91.1           91.1

Table1

Code

totaldata$followup=round(difftime(totaldata$CAD_censor_age,totaldata$enrollment_age),2)
library(table1)


Attaching package: 'table1'

The following objects are masked from 'package:base':

    units, units<-

Code

totaldata$followup <- round(difftime(totaldata$CAD_censor_age, totaldata$enrollment_age), 2)
dat <- totaldata

label(dat$sex_imputed)="sex"
label(dat$CAD_censor)="CAD"
dat$CAD_censor=as.factor(dat$CAD_censor)
#table1(~sex_imputed+ldladj+CAD_censor+enrollment_age+pop_scaled_prs+scaled_resid|knn_super,data = totaldata)
# Format the pop_scaled_prs variable to 2 decimal places
dat$pop_scaled_prs <- as.numeric(as.character(round(dat$pop_scaled_prs,2)))

dat <- totaldata
dat$CAD_censor <- as.factor(dat$CAD_censor)
label(dat$sex_imputed) <- "Sex"
label(dat$ldladj) <- "LDL-C (adjusted for Statin)"
label(dat$CAD_censor) <- "CAD Ever"
label(dat$scaled_resid) <- "Overall Scaled Residual PRS"
label(dat$pop_scaled_prs) <- "Pop Scaled Residual PRS"
label(dat$enrollment_age) = "Age Enrolled"

# Continue with creating the table
table1(~sex_imputed + ldladj + CAD_censor + enrollment_age + round(pop_scaled_prs,4) + scaled_resid | knn_super, data = dat)

	MID (N=407)	Overall (N=407)
Sex
female	183 (45.0%)	183 (45.0%)
male	224 (55.0%)	224 (55.0%)
LDL-C (adjusted for Statin)
Mean (SD)	111 (37.9)	111 (37.9)
Median [Min, Max]	110 [15.0, 239]	110 [15.0, 239]
Missing	188 (46.2%)	188 (46.2%)
CAD Ever
0	284 (69.8%)	284 (69.8%)
1	123 (30.2%)	123 (30.2%)
Age Enrolled
Mean (SD)	52.2 (18.3)	52.2 (18.3)
Median [Min, Max]	53.9 [18.5, 91.1]	53.9 [18.5, 91.1]
Pop Scaled Residual PRS
Mean (SD)	-0.000000737 (1.00)	-0.000000737 (1.00)
Median [Min, Max]	-0.0216 [-2.58, 2.85]	-0.0216 [-2.58, 2.85]
Overall Scaled Residual PRS
Mean (SD)	0.0000000000000000300 (1.00)	0.0000000000000000300 (1.00)
Median [Min, Max]	-0.0216 [-2.58, 2.85]	-0.0216 [-2.58, 2.85]

Impact of PRS

To assess the impact of LDL-C PRS on CAD risk, you need to establish a two-step relationship: first, between LDL-C PRS and LDL-C levels (per 40 mg/dL change), and then between LDL-C PRS and CAD outcomes. TAMR approach allows you to understand how changes in LDL-C levels attributed to genetic risk (as quantified by PRS) relate to CAD risk. Here’s how you can approach tAMR analysis for both European (EUR) and African (AFR) populations:

Step 1: Associate LDL-C PRS with LDL-C Levels (per 40 mg/dL)

Linear Regression for LDL-C Levels: Fit a linear regression model with LDL-C levels as the outcome and LDL-C PRS as the predictor. You might want to adjust for relevant covariates.
Rescale the Coefficient for LDL-C PRS: Adjust the coefficient from tAMR model to reflect a 40 mg/dL change in LDL-C.

Step 2: Associate Rescaled LDL-C PRS with CAD

Incorporate Rescaled LDL-C PRS into Cox Model: Use the rescaled LDL-C PRS from Step 1 as a predictor in a Cox proportional hazards model to analyze its relationship with CAD risk.
Perform Analysis for Both EUR and AFR Populations: Repeat these steps separately for EUR and AFR populations to capture population-specific effects.

Here’s an example in R for the European population. The same steps would be repeated for the African population:

Code

library(dplyr)
library(survival)
# Assuming 'df' is your dataset

# List of unique populations
populations <- c("MID")

# Initialize an empty list for storing models and results
cox_models <- list()
glm_models = list()


results <- data.frame(
  Population = character(),
  PRS_Mean = numeric(),
  PRS_SD = numeric(),
  Percent_male = numeric(),
  LDL_Coef_CI_Lower = numeric(),
  LDL_Coef_CI_Upper = numeric(),
  Population_Scale_Factor = numeric(),
  LDL_Scaled_Coef_CI_Lower = numeric(),
  LDL_Scaled_Coef_CI_Upper = numeric(),
  HR = numeric(),
  HR_CI_Lower = numeric(),
  HR_CI_Upper = numeric(),
  OR = numeric(),
  OR_LCI = numeric(),
  OR_UCI = numeric(),
  stringsAsFactors = FALSE
)


pop="MID"
  # Subset data for the specific population
  pop_data = totaldata[totaldata$knn_super %in% pop,]
  
  # scale the residual score per pouplation
  #pop_data$PRS = scale(pop_data$residscore)
  pop_data$PRS = pop_data$pop_scaled_prs
  ##print("PRS Summary")
  #print(summary(pop_data$PRS))
  
  
  ldl_model <-
    lm(
      ldladj ~ PRS + as.numeric(enrollment_age) + as.factor(sex_imputed) + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10,
      data = pop_data[!is.na(pop_data$ldladj),]
    )
  
  ldl_coef_ci <- confint(ldl_model, "PRS")
  #print(paste("LDL_coef_CI for population:", pop))
  if ("PRS" %in% names(coef(ldl_model))) {
    #print(confint(ldl_model, parm = "PRS"))
  } else {
    #print("PRS coefficient is not available in the model.")
  }

NULL

Code

  # Calculate the PRS scaling factor
  prs_scale_factor <- 40 / coef(ldl_model)["PRS"]
  #print(paste0(pop,":",prs_scale_factor))
  #prs_scale_factor=1
  # Scale the LDL-C PRS so that now the scaled translates to an increase of 40 mgDL
  pop_data$scaled_ldl_prs <- pop_data$PRS / prs_scale_factor
  
  
  ldl_model_2 <-
    lm(
      ldladj ~ scaled_ldl_prs + as.numeric(enrollment_age) + as.factor(sex_imputed) + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10,
      data = pop_data[!is.na(pop_data$ldladj),]
    )
  
  ldl_scaled_coef_ci <- confint(ldl_model_2, "scaled_ldl_prs")
  
  
  #print(summary(pop_data$scaled_ldl_prs))
  # Step 2: Cox Model for CAD
  ## since age is time scale don't use
  cox_model <-
    coxph(
      Surv(CAD_censor_age, CAD_censor) ~ scaled_ldl_prs
      + as.factor(sex_imputed) + PC1 + PC2 + PC3 + PC4 +
        PC5 + PC6 + PC7 + PC8 + PC9 + PC10,
      data = pop_data
    )
  #cox_model <- glm(Cad_Sta ~ scaled_ldl_prs+sex, data = pop_data,family = "binomial")
  cox_models[[pop]] <- cox_model
  
  
  
  glm_model <- glm(
    CAD_censor ~ scaled_ldl_prs + enrollment_age+as.factor(sex_imputed) + PC1 +
      PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10,
    data = pop_data,
    family = "binomial"
  )
  glm_models[[pop]] <- glm_model
  glm_models[[pop]] <- glm_model
  log_or = summary(glm_model)$coefficients["scaled_ldl_prs", "Estimate"]
  log_or_se = summary(glm_model)$coefficients["scaled_ldl_prs", "Std. Error"]
  or = round(exp(log_or), 2)
  lci = round(exp(log_or - 1.96 * log_or_se), 2)
  uci = round(exp(log_or + 1.96 * log_or_se), 2)
  
  glmci = confint(glm_model)

Waiting for profiling to be done...

Code

  # Extract summary stats
  cox_summary <- summary(cox_model)
  glm_summary <- summary(glm_model)
  
  # Compile results into the data frame
  # Append to results
  results <- rbind(
    results,
    data.frame(
      Population = pop,
      PRS_Mean = mean(pop_data$PRS),
      PRS_SD = sd(pop_data$PRS),
      Percent_male = mean(pop_data$sex_imputed == "male"),
      
      
      LDL_Coef_CI_Lower = ldl_coef_ci[1],
      LDL_Coef_CI_Upper = ldl_coef_ci[2],
      Population_Scale_Factor = prs_scale_factor,
      LDL_Scaled_Coef_CI_Lower = ldl_scaled_coef_ci[1],
      # Adjust according to your calculations
      LDL_Scaled_Coef_CI_Upper = ldl_scaled_coef_ci[2],
      HR = cox_summary$conf.int["scaled_ldl_prs", 1],
      HR_CI_Lower = cox_summary$conf.int["scaled_ldl_prs", 3],
      # Assuming 'cox_summary' is properly defined
      HR_CI_Upper = cox_summary$conf.int["scaled_ldl_prs", 4],
      # Assuming 'cox_summary' is properly defined
      OR = or,
      OR_LCI = exp(glmci['scaled_ldl_prs', 1]),
      OR_UCI = exp(glmci['scaled_ldl_prs', 2])
    )
  )


results

    Population     PRS_Mean PRS_SD Percent_male LDL_Coef_CI_Lower
PRS        MID 3.000603e-17      1    0.5503686         0.3265832
    LDL_Coef_CI_Upper Population_Scale_Factor LDL_Scaled_Coef_CI_Lower
PRS          10.37096                7.478351                 2.442304
    LDL_Scaled_Coef_CI_Upper       HR HR_CI_Lower HR_CI_Upper   OR    OR_LCI
PRS                  77.5577 3.710259   0.9753146    14.11444 5.61 0.9612839
      OR_UCI
PRS 33.78524

Code

saveRDS(results, "results_mgb_mid.rds")

# Access and summarize the Cox models for each population
lapply(cox_models, function(x) {
  exp(coef(x)["scaled_ldl_prs"])
})

$MID
scaled_ldl_prs 
      3.710259