baseline_covs =fread("mgbb_data/mgbb_pce_consent_date.csv")pheno_all =left_join(pheno_incident, pheno_has[, c("Subject_Id", "prevalent_mi_date", "prevalent_mi")], by ="Subject_Id")pheno_all_with_covs =left_join(pheno_all, baseline_covs[, c("EMPI","mgbb_consent_date","enrollment_age","sex_imputed","lipidmed")])## we can see that the rates are much higher for biobank particantspheno_all$either =ifelse((pheno_all$prevalent_mi + pheno_all$incident_mi) >0, 1, 0)pheno_all %>%group_by(inbiobank) %>%summarise(mean(either))
pheno_dat =merge(pheno_all_with_covs, ldl, by ="EMPI")final_dat =merge(ancestry,pheno_dat, by ="IID")final_dat <- final_dat %>%mutate(mgbb_consent_date =as.Date(mgbb_consent_date, "%Y-%m-%d"),incident_cad_date =as.Date(incident_mi_date, "%Y-%m-%d"),prevalent_cad_date =as.Date(prevalent_mi_date, "%Y-%m-%d"),birthdate = mgbb_consent_date - (enrollment_age *365.25),# Approximating leap yearsage_at_incident_cad =as.numeric(difftime(incident_mi_date, birthdate, units ="days")) /365.25,age_at_prevalent_cad =ifelse(is.na(prevalent_mi_date), NA, as.numeric(difftime(prevalent_cad_date, birthdate, units ="days") ) /365.25) )final_dat <- final_dat %>%mutate(# Calculate censor flagCAD_censor =ifelse(incident_mi ==1| prevalent_mi ==1, 1, 0),# Calculate age at censoring or follow-upCAD_censor_age =case_when(# Use age at prevalent CAD if prevalent CAD exists!is.na(age_at_prevalent_cad) & prevalent_mi ==1~ age_at_prevalent_cad,# Use age at incident CAD if no prevalent available!is.na(age_at_incident_cad) & incident_mi ==1~ age_at_incident_cad,# Calculate age at follow-up end date if no CADTRUE~as.numeric(difftime(as.Date("2023-06-15"), birthdate, units ="days")) /365.25 ) )### note that only 26000 people had LDLfinal_dat$ldladj =ifelse(final_dat$lipidmed =="yes", final_dat$ldlc *5/4, final_dat$ldlc)
Phenotyping
We validated previously that MGB pheno implementation of Hard CAD is matched to HARD Cad. So we will now use that here to be c/w AoU HARD CAD pheno.
Because we are using age as time scale (or GLM with ‘evere event’ follow up time is really just time from birth to death.
We know that time is inhomogenous: even if we use OR instead of HR, the OR will tend to decrease over time because there are more non-genetically mediated CAD events
Now we want to calculate ancestry specific thresholds for PRS. Let’s get the PRS for African, European, and Asian ancestry that causes a 40 mmHg increase in LDL cholesterol in the UK Biobank using the AFR, SAS, and EUR individauls present.
The following objects are masked from 'package:base':
units, units<-
Code
totaldata$followup <-round(difftime(totaldata$CAD_censor_age, totaldata$enrollment_age), 2)dat <- totaldatalabel(dat$sex_imputed)="sex"label(dat$CAD_censor)="CAD"dat$CAD_censor=as.factor(dat$CAD_censor)#table1(~sex_imputed+ldladj+CAD_censor+enrollment_age+pop_scaled_prs+scaled_resid|knn_super,data = totaldata)# Format the pop_scaled_prs variable to 2 decimal placesdat$pop_scaled_prs <-as.numeric(as.character(round(dat$pop_scaled_prs,2)))dat <- totaldatadat$CAD_censor <-as.factor(dat$CAD_censor)label(dat$sex_imputed) <-"Sex"label(dat$ldladj) <-"LDL-C (adjusted for Statin)"label(dat$CAD_censor) <-"CAD Ever"label(dat$scaled_resid) <-"Overall Scaled Residual PRS"label(dat$pop_scaled_prs) <-"Pop Scaled Residual PRS"label(dat$enrollment_age) ="Age Enrolled"# Continue with creating the tabletable1(~sex_imputed + ldladj + CAD_censor + enrollment_age +round(pop_scaled_prs,4) + scaled_resid | knn_super, data = dat)
MID (N=407)
Overall (N=407)
Sex
female
183 (45.0%)
183 (45.0%)
male
224 (55.0%)
224 (55.0%)
LDL-C (adjusted for Statin)
Mean (SD)
111 (37.9)
111 (37.9)
Median [Min, Max]
110 [15.0, 239]
110 [15.0, 239]
Missing
188 (46.2%)
188 (46.2%)
CAD Ever
0
284 (69.8%)
284 (69.8%)
1
123 (30.2%)
123 (30.2%)
Age Enrolled
Mean (SD)
52.2 (18.3)
52.2 (18.3)
Median [Min, Max]
53.9 [18.5, 91.1]
53.9 [18.5, 91.1]
Pop Scaled Residual PRS
Mean (SD)
-0.000000737 (1.00)
-0.000000737 (1.00)
Median [Min, Max]
-0.0216 [-2.58, 2.85]
-0.0216 [-2.58, 2.85]
Overall Scaled Residual PRS
Mean (SD)
0.0000000000000000300 (1.00)
0.0000000000000000300 (1.00)
Median [Min, Max]
-0.0216 [-2.58, 2.85]
-0.0216 [-2.58, 2.85]
Impact of PRS
To assess the impact of LDL-C PRS on CAD risk, you need to establish a two-step relationship: first, between LDL-C PRS and LDL-C levels (per 40 mg/dL change), and then between LDL-C PRS and CAD outcomes. TAMR approach allows you to understand how changes in LDL-C levels attributed to genetic risk (as quantified by PRS) relate to CAD risk. Here’s how you can approach tAMR analysis for both European (EUR) and African (AFR) populations:
Linear Regression for LDL-C Levels: Fit a linear regression model with LDL-C levels as the outcome and LDL-C PRS as the predictor. You might want to adjust for relevant covariates.
Rescale the Coefficient for LDL-C PRS: Adjust the coefficient from tAMR model to reflect a 40 mg/dL change in LDL-C.
Step 2: Associate Rescaled LDL-C PRS with CAD
Incorporate Rescaled LDL-C PRS into Cox Model: Use the rescaled LDL-C PRS from Step 1 as a predictor in a Cox proportional hazards model to analyze its relationship with CAD risk.
Perform Analysis for Both EUR and AFR Populations: Repeat these steps separately for EUR and AFR populations to capture population-specific effects.
Here’s an example in R for the European population. The same steps would be repeated for the African population:
Code
library(dplyr)library(survival)# Assuming 'df' is your dataset# List of unique populationspopulations <-c("MID")# Initialize an empty list for storing models and resultscox_models <-list()glm_models =list()results <-data.frame(Population =character(),PRS_Mean =numeric(),PRS_SD =numeric(),Percent_male =numeric(),LDL_Coef_CI_Lower =numeric(),LDL_Coef_CI_Upper =numeric(),Population_Scale_Factor =numeric(),LDL_Scaled_Coef_CI_Lower =numeric(),LDL_Scaled_Coef_CI_Upper =numeric(),HR =numeric(),HR_CI_Lower =numeric(),HR_CI_Upper =numeric(),OR =numeric(),OR_LCI =numeric(),OR_UCI =numeric(),stringsAsFactors =FALSE)pop="MID"# Subset data for the specific population pop_data = totaldata[totaldata$knn_super %in% pop,]# scale the residual score per pouplation#pop_data$PRS = scale(pop_data$residscore) pop_data$PRS = pop_data$pop_scaled_prs##print("PRS Summary")#print(summary(pop_data$PRS)) ldl_model <-lm( ldladj ~ PRS +as.numeric(enrollment_age) +as.factor(sex_imputed) + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10,data = pop_data[!is.na(pop_data$ldladj),] ) ldl_coef_ci <-confint(ldl_model, "PRS")#print(paste("LDL_coef_CI for population:", pop))if ("PRS"%in%names(coef(ldl_model))) {#print(confint(ldl_model, parm = "PRS")) } else {#print("PRS coefficient is not available in the model.") }
NULL
Code
# Calculate the PRS scaling factor prs_scale_factor <-40/coef(ldl_model)["PRS"]#print(paste0(pop,":",prs_scale_factor))#prs_scale_factor=1# Scale the LDL-C PRS so that now the scaled translates to an increase of 40 mgDL pop_data$scaled_ldl_prs <- pop_data$PRS / prs_scale_factor ldl_model_2 <-lm( ldladj ~ scaled_ldl_prs +as.numeric(enrollment_age) +as.factor(sex_imputed) + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10,data = pop_data[!is.na(pop_data$ldladj),] ) ldl_scaled_coef_ci <-confint(ldl_model_2, "scaled_ldl_prs")#print(summary(pop_data$scaled_ldl_prs))# Step 2: Cox Model for CAD## since age is time scale don't use cox_model <-coxph(Surv(CAD_censor_age, CAD_censor) ~ scaled_ldl_prs+as.factor(sex_imputed) + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10,data = pop_data )#cox_model <- glm(Cad_Sta ~ scaled_ldl_prs+sex, data = pop_data,family = "binomial") cox_models[[pop]] <- cox_model glm_model <-glm( CAD_censor ~ scaled_ldl_prs + enrollment_age+as.factor(sex_imputed) + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10,data = pop_data,family ="binomial" ) glm_models[[pop]] <- glm_model glm_models[[pop]] <- glm_model log_or =summary(glm_model)$coefficients["scaled_ldl_prs", "Estimate"] log_or_se =summary(glm_model)$coefficients["scaled_ldl_prs", "Std. Error"] or =round(exp(log_or), 2) lci =round(exp(log_or -1.96* log_or_se), 2) uci =round(exp(log_or +1.96* log_or_se), 2) glmci =confint(glm_model)
Waiting for profiling to be done...
Code
# Extract summary stats cox_summary <-summary(cox_model) glm_summary <-summary(glm_model)# Compile results into the data frame# Append to results results <-rbind( results,data.frame(Population = pop,PRS_Mean =mean(pop_data$PRS),PRS_SD =sd(pop_data$PRS),Percent_male =mean(pop_data$sex_imputed =="male"),LDL_Coef_CI_Lower = ldl_coef_ci[1],LDL_Coef_CI_Upper = ldl_coef_ci[2],Population_Scale_Factor = prs_scale_factor,LDL_Scaled_Coef_CI_Lower = ldl_scaled_coef_ci[1],# Adjust according to your calculationsLDL_Scaled_Coef_CI_Upper = ldl_scaled_coef_ci[2],HR = cox_summary$conf.int["scaled_ldl_prs", 1],HR_CI_Lower = cox_summary$conf.int["scaled_ldl_prs", 3],# Assuming 'cox_summary' is properly definedHR_CI_Upper = cox_summary$conf.int["scaled_ldl_prs", 4],# Assuming 'cox_summary' is properly definedOR = or,OR_LCI =exp(glmci['scaled_ldl_prs', 1]),OR_UCI =exp(glmci['scaled_ldl_prs', 2]) ) )results
saveRDS(results, "results_mgb_mid.rds")# Access and summarize the Cox models for each populationlapply(cox_models, function(x) {exp(coef(x)["scaled_ldl_prs"])})