We validated previously that UKB pheno implementation of Hard CAD is matched to HARD Cad. So we will now use that here to be c/w AoU HARD CAD pheno.
Because we are using age as time scale (or GLM with ‘evere event’ follow up time is really just time from birth to death.
We know that time is inhomogenous: even if we use OR instead of HR, the OR will tend to decrease over time because there are more non-genetically mediated CAD events
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.04654 55.13210 62.57906 62.05194 70.03970 84.76112
Code
summary(m2$Cad_0_censor_age[m2$Cad_Sta==0])
Min. 1st Qu. Median Mean 3rd Qu. Max.
40.38 63.13 70.49 69.60 76.13 88.63
Code
## old phenotypes# df2=fread("~/Library/CloudStorage/Dropbox/UKBB_HardCAD_202109.tsv")# m2=merge(pheno_prs[,c("identifier","Birthdate","enrollage","f.31.0.0")],eth,by.x="identifier",by.y="identifier")# m2=merge(m2,df2[,c("sample_id","has_disease","censor_age")],by.x = "identifier",by.y="sample_id")# m2$Cad_Sta=m2$has_disease# m2$Cad_0_censor_age=m2$censor_age## get these labs readymed_codes=readRDS("~/Library/CloudStorage/Dropbox/medcodes.rds")#levels(as.factor(med_codes$meaning))## what do you do about meds? I think we need to eliminate, but will skew the distribtuionbaseline_meds=merge(med_codes,baseline_levs,by="identifier")baseline_meds$ldl_adj=ifelse(baseline_meds$meaning%in%"Cholesterol lowering medication",baseline_meds$ldl/0.8,baseline_meds$ldl)baseline_meds$sbp_adj=ifelse(baseline_meds$meaning%in%"Blood pressure medication",baseline_meds$f.4080.0.0+10,baseline_meds$f.4080.0.0)## merge allfinal_dat=merge(m2,baseline_meds[,c("identifier","sbp_adj","ldl_adj","hdl",paste0(rep("PC",10),seq(1:10)))],by.x="identifier",by.y="identifier")library(lubridate)
Attaching package: 'lubridate'
The following objects are masked from 'package:data.table':
hour, isoweek, mday, minute, month, quarter, second, wday, week,
yday, year
The following objects are masked from 'package:base':
date, intersect, setdiff, union
Now we want to calculate ancestry specific thresholds for PRS. Let’s get the PRS for African, European, and Asian ancestry that causes a 40 mmHg increase in LDL cholesterol in the UK Biobank using the AFR, SAS, and EUR individauls present.
Code
# final_dat <- final_dat %>%# mutate(recoded_ethnicity = case_when(# meaning %in% c("African", "Any other Black background", "Black or Black British","Caribbean","Any other mixed background","Mixed") ~ "AFR",# meaning %in% c("White", "Irish", "Any other white background","British") ~ "White",# meaning %in% c("Indian", "Pakistani","Bangladeshi") ~ "South Asian",# meaning == "Chinese" ~ "Chinese",# TRUE ~ "Other" # TAMR will be used for any ethnicity not covered above# ))# # final_dat$Cad_Sta=ifelse(final_dat$Cad_0_Any==2,1,0)# final_dat$sex=final_dat$f.31.0.0# # table(final_dat$recoded_ethnicity)#ggplot(lt,aes(x=Var1,y=value,fill=Var1))+geom_bar(stat="identity")# gae=fread("~/Library/CloudStorage/Dropbox/ukb.kgp_projected.tsv.gz")# final_dat=merge(final_dat,gae[,c("eid","rf80")],by.x="identifier",by.y="eid")# table(final_dat$rf80)gae=fread("~/Dropbox/yangstuff/UKB_HGDP_1KG_real.txt.gz")gae=data.frame("IID"=as.character(gae$IID),"knn"=gae$knn_super)final_dat=merge(final_dat,gae,by.y="IID",by.x="identifier")table(final_dat$knn)
AFR AMR CSA EAS EUR MID OCE
9152 1637 9981 2809 456173 1136 1
# A tibble: 1 × 5
Population event_rate total_subjects percent_female enrollment_age
<fct> <dbl> <int> <dbl> <drtn>
1 MID 0.0977 1136 0.387 53.18859 days
Code
summary(totaldata$pop_scaled_prs)
Min. 1st Qu. Median Mean 3rd Qu. Max.
-2.827728 -0.641920 -0.007595 0.000000 0.657082 2.947659
Code
ggplot(totaldata, aes(x = pop_scaled_prs, y = Population, fill = Population)) +geom_density_ridges(alpha =0.7) +scale_fill_brewer(palette ="Set1") +theme_ridges() +# Optional: Adds a nice theme suitable for ridgeline plotslabs(title ="Density Distribution of Sum Scores: Scaled for Population",x ="Sum Score",y ="Population")
The following objects are masked from 'package:base':
units, units<-
Code
dat <- totaldatadat$Sex=ifelse(dat$f.31.0.0==1,"Male","Female")dat$Sex=as.factor(dat$Sex)dat$Cad_Sta=as.factor(dat$Cad_Sta)dat$pop_scaled_prs <-as.numeric(as.character(round(dat$pop_scaled_prs,2)))dat$enrollage=as.numeric(dat$enrollage)dat$CAD_censor <-as.factor(dat$Cad_Sta)label(dat$ldl_adj) <-"LDL-C (adjusted for Statin)"label(dat$Cad_Sta) <-"CAD Ever"label(dat$scaled_resid) <-"Overall Scaled Residual PRS"label(dat$pop_scaled_prs) <-"Pop Scaled Residual PRS"label(dat$enrollage) ="Age Enrolled"# Continue with creating the tabletable1(~ Sex + ldl_adj + Cad_Sta + enrollage + followup+round(pop_scaled_prs,4) + scaled_resid | Population, data = dat)
MID (N=1136)
Overall (N=1136)
Sex
Female
440 (38.7%)
440 (38.7%)
Male
696 (61.3%)
696 (61.3%)
LDL-C (adjusted for Statin)
Mean (SD)
139 (33.2)
139 (33.2)
Median [Min, Max]
139 [49.9, 315]
139 [49.9, 315]
Missing
73 (6.4%)
73 (6.4%)
CAD Ever
0
1025 (90.2%)
1025 (90.2%)
1
111 (9.8%)
111 (9.8%)
Age Enrolled
Mean (SD)
53.2 (8.04)
53.2 (8.04)
Median [Min, Max]
52.4 [40.3, 70.1]
52.4 [40.3, 70.1]
followup
Mean (SD)
11.8 (5.71)
11.8 (5.71)
Median [Min, Max]
13.0 [-53.6, 15.5]
13.0 [-53.6, 15.5]
Pop Scaled Residual PRS
Mean (SD)
0.0000528 (1.00)
0.0000528 (1.00)
Median [Min, Max]
-0.0100 [-2.83, 2.95]
-0.0100 [-2.83, 2.95]
Overall Scaled Residual PRS
Mean (SD)
-0.0000000000000000118 (1.00)
-0.0000000000000000118 (1.00)
Median [Min, Max]
-0.00760 [-2.83, 2.95]
-0.00760 [-2.83, 2.95]
Impact of PRS
To assess the impact of LDL-C PRS on CAD risk, you need to establish a two-step relationship: first, between LDL-C PRS and LDL-C levels (per 40 mg/dL change), and then between LDL-C PRS and CAD outcomes. TAMR approach allows you to understand how changes in LDL-C levels attributed to genetic risk (as quantified by PRS) relate to CAD risk. Here’s how you can approach tAMR analysis for both European (EUR) and African (AFR) populations:
Linear Regression for LDL-C Levels: Fit a linear regression model with LDL-C levels as the outcome and LDL-C PRS as the predictor. You might want to adjust for relevant covariates.
Rescale the Coefficient for LDL-C PRS: Adjust the coefficient from tAMR model to reflect a 40 mg/dL change in LDL-C.
Step 2: Associate Rescaled LDL-C PRS with CAD
Incorporate Rescaled LDL-C PRS into Cox Model: Use the rescaled LDL-C PRS from Step 1 as a predictor in a Cox proportional hazards model to analyze its relationship with CAD risk.
Perform Analysis for Both EUR and AFR Populations: Repeat these steps separately for EUR and AFR populations to capture population-specific effects.
Here’s an example in R for the European population. The same steps would be repeated for the African population:
Code
library(dplyr)library(survival)# Before the loopresults <-data.frame(Population =character(),PRS_Mean =numeric(),PRS_SD =numeric(),Percent_male =numeric(),LDL_Coef_CI_Lower =numeric(),LDL_Coef_CI_Upper =numeric(),Population_Scale_Factor =numeric(),LDL_Scaled_Coef_CI_Lower =numeric(),LDL_Scaled_Coef_CI_Upper =numeric(),HR=numeric(),HR_CI_Lower =numeric(),HR_CI_Upper =numeric(),OR =numeric(),OR_LCI =numeric(),OR_UCI =numeric(),stringsAsFactors =FALSE)# Assuming 'df' is your dataset# List of unique populationspopulations <-"MID"library(broom)library(dplyr)# Initialize an empty data frame to store results# Initialize an empty list for storing models and resultscox_models <-list()glm_models =list()pop ="MID"# Subset data for the specific populationpop_data = totaldata[totaldata$knn%in%pop, ]pop_data$PRS = pop_data$pop_scaled_prs# Step 1: Linear Regression for LDL-C Levels# Assuming 'pop' is defined elsewhere and 'pop_data' is your dataset#if (pop == "EUR") {ldl_model <-lm( ldl_adj ~ PRS +as.numeric(enrollage) +as.factor(f.31.0.0) + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10,data = pop_data)#} else {# pop_data$PRS=-1*pop_data$PRS# ldl_model <- lm(ldl_adj ~ PRS + as.numeric(enrollage) + as.factor(f.31.0.0) + PC1 + PC2 + #PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10, data = pop_data)#}ldl_coef_ci <-confint(ldl_model, "PRS")#print(paste("LDL_coef_CI for population:", pop))if ("PRS"%in%names(coef(ldl_model))) {#print(confint(ldl_model, parm = "PRS"))} else {#print("PRS coefficient is not available in the model.")}
NULL
Code
# Calculate the PRS scaling factorprs_scale_factor <-40/coef(ldl_model)["PRS"]#print(paste0(pop,":",prs_scale_factor))#prs_scale_factor=1# Scale the LDL-C PRS so that now the scaled translates to an increase of 40 mgDLpop_data$scaled_ldl_prs <- pop_data$PRS / prs_scale_factorldl_model_2 <-lm( ldl_adj ~ scaled_ldl_prs +as.numeric(enrollage) +as.factor(f.31.0.0) + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10,data = pop_data)ldl_scaled_coef_ci <-confint(ldl_model_2, "scaled_ldl_prs")#print(summary(pop_data$scaled_ldl_prs))# Step 2: Cox Model for CAD -- inherengly accounting for age by using age as time scalecox_model <-coxph(Surv(Cad_0_censor_age, Cad_Sta) ~ scaled_ldl_prs+as.factor(f.31.0.0) + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10,data = pop_data)#cox_model <- glm(Cad_Sta ~ scaled_ldl_prs+sex, data = pop_data,family = "binomial")cox_models[[pop]] <- cox_modelglm_model <-glm( Cad_Sta ~ scaled_ldl_prs +as.numeric(enrollage)+as.factor(f.31.0.0) + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10,data = pop_data,family ="binomial")glm_models[[pop]] <- glm_modellog_or =summary(glm_model)$coefficients["scaled_ldl_prs", "Estimate"]log_or_se =summary(glm_model)$coefficients["scaled_ldl_prs", "Std. Error"]or =round(exp(log_or), 2)lci =round(exp(log_or -1.96* log_or_se), 2)uci =round(exp(log_or +1.96* log_or_se), 2)glmci =confint(glm_model)
Waiting for profiling to be done...
Code
# Extract summary statscox_summary <-summary(cox_model)glm_summary <-summary(glm_model)# Predict the probability of CAD for each individual in the datasetpredicted_risk <-predict(glm_model, pop_data, type ="response")# Compile results into the data frame# Append to resultsresults <-data.frame(Population = pop,PRS_Mean =mean(pop_data$PRS),PRS_SD =sd(pop_data$PRS),Percent_male =mean(pop_data$f.31.0.0==1),LDL_Coef_CI_Lower = ldl_coef_ci[1],LDL_Coef_CI_Upper = ldl_coef_ci[2],Population_Scale_Factor = prs_scale_factor,LDL_Scaled_Coef_CI_Lower = ldl_scaled_coef_ci[1],LDL_Scaled_Coef_CI_Upper = ldl_scaled_coef_ci[2],HR = cox_summary$conf.int["scaled_ldl_prs", 1],HR_CI_Lower = cox_summary$conf.int["scaled_ldl_prs", 3],HR_CI_Upper = cox_summary$conf.int["scaled_ldl_prs", 4],OR = or,OR_LCI =exp(glmci['scaled_ldl_prs', 1]),OR_UCI =exp(glmci['scaled_ldl_prs', 2]))# Display resultsprint(results)