We previously validated that the UKB phenotype implementation of Hard CAD matches the HARD CAD definition, so we will use it here to be consistent with the AoU HARD CAD phenotype.
Because we are using age as the time scale (or a GLM with "ever event" status), follow-up time is really just time from birth to the event or censoring.
We know that time is inhomogeneous: even if we use the OR instead of the HR, the OR will tend to decrease with age because more non-genetically mediated CAD events accrue at older ages.
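To see this attenuation directly, one could estimate the per-SD OR within age bands. A minimal sketch, assuming `m2` as below with case status (`Cad_Sta`) and censor age (`Cad_0_censor_age`), plus a standardized score column (`prs_std` is a hypothetical name; the actual PRS are loaded later in this document):
Code
library(dplyr)

# Illustrative only: `prs_std` is a hypothetical standardized PRS column
or_by_age <- m2 %>%
  mutate(age_band = cut(Cad_0_censor_age, breaks = c(40, 55, 65, 75, 90))) %>%
  group_by(age_band) %>%
  group_modify(~ {
    fit <- glm(Cad_Sta ~ prs_std, data = .x, family = "binomial")
    data.frame(OR = exp(coef(fit)["prs_std"]))
  })
print(or_by_age)  # the per-SD OR should attenuate in the older bands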
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.04654 55.13689 62.57906 62.05269 70.03970 84.76112
Code
summary(m2$Cad_0_censor_age[m2$Cad_Sta==0])
Min. 1st Qu. Median Mean 3rd Qu. Max.
40.38 63.13 70.51 69.60 76.13 88.63
Code
## old phenotypes
# df2=fread("~/Library/CloudStorage/Dropbox-Personal/UKBB_HardCAD_202109.tsv")
# m2=merge(pheno_prs[,c("identifier","Birthdate","enrollage","f.31.0.0")],eth,by.x="identifier",by.y="identifier")
# m2=merge(m2,df2[,c("sample_id","has_disease","censor_age")],by.x = "identifier",by.y="sample_id")
# m2$Cad_Sta=m2$has_disease
# m2$Cad_0_censor_age=m2$censor_age

## get these labs ready
med_codes=readRDS("~/Library/CloudStorage/Dropbox-Personal/medcodes.rds")
#levels(as.factor(med_codes$meaning))

## what do you do about meds? I think we need to eliminate, but will skew the distribution
baseline_meds=merge(med_codes,baseline_levs,by="identifier")
baseline_meds$ldl_adj=ifelse(baseline_meds$meaning%in%"Cholesterol lowering medication",baseline_meds$ldl/0.8,baseline_meds$ldl)
baseline_meds$sbp_adj=ifelse(baseline_meds$meaning%in%"Blood pressure medication",baseline_meds$f.4080.0.0+10,baseline_meds$f.4080.0.0)

## merge all
final_dat=merge(m2,baseline_meds[,c("identifier","sbp_adj","ldl_adj","hdl",paste0(rep("PC",10),seq(1:10)))],by.x="identifier",by.y="identifier")
library(lubridate)
Attaching package: 'lubridate'
The following objects are masked from 'package:data.table':
hour, isoweek, mday, minute, month, quarter, second, wday, week,
yday, year
The following objects are masked from 'package:base':
date, intersect, setdiff, union
Now we want to calculate ancestry-specific thresholds for the PRS. Let's find the PRS increment that corresponds to a 40 mg/dL increase in LDL cholesterol for African, European, and Asian ancestry in the UK Biobank, using the AFR, SAS, and EUR individuals present.
Code
# final_dat <- final_dat %>%
#   mutate(recoded_ethnicity = case_when(
#     meaning %in% c("African", "Any other Black background", "Black or Black British","Caribbean","Any other mixed background","Mixed") ~ "AFR",
#     meaning %in% c("White", "Irish", "Any other white background","British") ~ "White",
#     meaning %in% c("Indian", "Pakistani","Bangladeshi") ~ "South Asian",
#     meaning == "Chinese" ~ "Chinese",
#     TRUE ~ "Other" # This will be used for any ethnicity not covered above
#   ))
#
# final_dat$Cad_Sta=ifelse(final_dat$Cad_0_Any==2,1,0)
# final_dat$sex=final_dat$f.31.0.0
#
# table(final_dat$recoded_ethnicity)
#ggplot(lt,aes(x=Var1,y=value,fill=Var1))+geom_bar(stat="identity")

gae=fread("~/Library/CloudStorage/Dropbox-Personal/ukb.kgp_projected.tsv.gz")
final_dat=merge(final_dat,gae[,c("eid","rf80")],by.x="identifier",by.y="eid")
table(final_dat$rf80)
AFR AMR EAS EUR SAS
8426 707 2249 448106 9085
Code
## with wallace's
prs_SAS=fread("~/Library/CloudStorage/Dropbox-Personal/final_mixed_ethnicity_scores/SAS_SUM_SCORE_FINAL_mixed")
prs_AFR=fread("~/Library/CloudStorage/Dropbox-Personal/final_mixed_ethnicity_scores/AFR_SUM_SCORE_FINAL_mixed")
prs_EUR=fread("~/Library/CloudStorage/Dropbox-Personal/final_mixed_ethnicity_scores/EUR_SUM_SCORE_FINAL_mixed")
prs_EAS=fread("~/Library/CloudStorage/Dropbox-Personal/final_mixed_ethnicity_scores/EAS_SUM_SCORE_FINAL_mixed")
prs_AMR=fread("~/Library/CloudStorage/Dropbox-Personal/final_mixed_ethnicity_scores/HISP_SUM_SCORE_FINAL_mixed")

AFR=final_dat$identifier[final_dat$rf80%in%"AFR"]
EUR=final_dat$identifier[final_dat$rf80%in%"EUR"]
SAS=final_dat$identifier[final_dat$rf80%in%"SAS"]
AMR=final_dat$identifier[final_dat$rf80%in%"AMR"]
EAS=final_dat$identifier[final_dat$rf80%in%"EAS"]

colnames(prs_SAS)=colnames(prs_AFR)=colnames(prs_EUR)=colnames(prs_EAS)=colnames(prs_AMR)=c("IID","PRS")

SAS_dat=merge(prs_SAS,final_dat[final_dat$rf80%in%"SAS",],by.x="IID",by.y="identifier")
AFR_dat=merge(prs_AFR,final_dat[final_dat$rf80%in%"AFR",],by.x="IID",by.y="identifier")
EUR_dat=merge(prs_EUR,final_dat[final_dat$rf80%in%"EUR",],by.x="IID",by.y="identifier")
AMR_dat=merge(prs_AMR,final_dat[final_dat$rf80%in%"AMR",],by.x="IID",by.y="identifier")
EAS_dat=merge(prs_EAS,final_dat[final_dat$rf80%in%"EAS",],by.x="IID",by.y="identifier")

pop_data_list=list(SAS_dat,AFR_dat,EUR_dat,AMR_dat,EAS_dat)
names(pop_data_list)=c("SAS","AFR","EUR","AMR","EAS")

final_dat=final_dat[!is.na(final_dat$rf80),]
final_dat%>%group_by(rf80)%>%summarise("cad_rate"=mean(Cad_Sta),"mean_ldl"=mean(na.omit(ldl_adj)),"%M"=mean(na.omit(as.numeric(f.31.0.0))),"meanAge"=as.numeric(mean(enrollage)))
We note that the distributions of these PRS are skewed for the non-EUR populations; we will normalize them for our analysis:
Code
library(ggridges) # needed for geom_density_ridges() and theme_ridges()

prs_AFR$Population <- 'AFR'; prs_AFR=prs_AFR[prs_AFR$IID%in%AFR]
prs_EUR$Population <- 'EUR'; prs_EUR=prs_EUR[prs_EUR$IID%in%EUR]
prs_SAS$Population <- 'SAS'; prs_SAS=prs_SAS[prs_SAS$IID%in%SAS]
prs_AMR$Population = 'AMR'; prs_AMR=prs_AMR[prs_AMR$IID%in%AMR]
prs_EAS$Population = 'EAS'; prs_EAS=prs_EAS[prs_EAS$IID%in%EAS]

combined_data <- rbind(prs_AFR, prs_EUR, prs_SAS, prs_AMR, prs_EAS)

ggplot(combined_data, aes(x = PRS, y = Population, fill = Population)) +
  geom_density_ridges(alpha = 0.7) +
  scale_fill_brewer(palette = "Set1") +
  theme_ridges() + # Optional: adds a theme suited to ridgeline plots
  labs(title = "Density Distribution of Sum Scores",
       x = "Sum Score",
       y = "Population")
Picking joint bandwidth of 0.0469
Using the residual PRS (the PRS residualized on the first 10 genetic PCs):
Code
totaldata=do.call(rbind,pop_data_list)
pcmod=lm(PRS~PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10,data = totaldata)
totaldata$predictscore=predict(object = pcmod,newdata=totaldata)
totaldata$residscore=totaldata$PRS-totaldata$predictscore
totaldata$Population=as.factor(totaldata$rf80)
totaldata$scaled_resid=scale(totaldata$residscore)

ggplot(totaldata, aes(x = residscore, y = Population, fill = Population)) +
  geom_density_ridges(alpha = 0.7) +
  scale_fill_brewer(palette = "Set1") +
  theme_ridges() +
  labs(title = "Density Distribution of Residual Scores",
       x = "Residual Score",
       y = "Population")
Picking joint bandwidth of 0.0473
Code
totaldata$rf80[totaldata$rf80=="AMR"]="AMR" # no-op; retained from the original recoding step
Impact of PRS
To assess the impact of LDL-C PRS on CAD risk, you need to establish a two-step relationship: first, between LDL-C PRS and LDL-C levels (per 40 mg/dL change), and then between LDL-C PRS and CAD outcomes. This approach allows you to understand how changes in LDL-C levels attributed to genetic risk (as quantified by PRS) relate to CAD risk. Here's how you can approach this analysis for both European (EUR) and African (AFR) populations:
Step 1: Associate LDL-C PRS with LDL-C Levels
Linear Regression for LDL-C Levels: Fit a linear regression model with LDL-C levels as the outcome and LDL-C PRS as the predictor. You might want to adjust for relevant covariates.
Rescale the Coefficient for LDL-C PRS: Adjust the coefficient from this model to reflect a 40 mg/dL change in LDL-C.
Step 2: Associate Rescaled LDL-C PRS with CAD
Incorporate Rescaled LDL-C PRS into Cox Model: Use the rescaled LDL-C PRS from Step 1 as a predictor in a Cox proportional hazards model to analyze its relationship with CAD risk.
Perform Analysis for Both EUR and AFR Populations: Repeat these steps separately for EUR and AFR populations to capture population-specific effects.
Here's an example in R; the loop below repeats the same steps for each population:
Code
library(dplyr)
library(survival)

# Uses the 'totaldata' frame assembled above
# List of unique populations
populations <- c("AFR","SAS","EUR","AMR","EAS")

# Initialize empty lists for storing models and results
cox_models <- list()
glm_models <- list()

for (pop in populations) {
  # Subset data for the specific population
  pop_data = totaldata[totaldata$rf80 %in% pop,]

  # Choose residual score or PRS (the residual score is what is ultimately used)
  pop_data$PRS = scale(pop_data$PRS)
  pop_data$PRS = scale(pop_data$residscore)
  #pop_data$PRS = pop_data$residscore

  print("PRS Summary")
  print(summary(pop_data$PRS))

  # Step 1: Linear regression for LDL-C levels
  #if (pop == "EUR") {
  ldl_model <- lm(ldl_adj ~ PRS + as.numeric(enrollage) + as.factor(f.31.0.0) + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10, data = pop_data)
  #} else {
  #  pop_data$PRS = -1*pop_data$PRS
  #  ldl_model <- lm(ldl_adj ~ PRS + as.numeric(enrollage) + as.factor(f.31.0.0) + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10, data = pop_data)
  #}

  print(paste("LDL_coef_CI for population:", pop))
  if ("PRS" %in% names(coef(ldl_model))) {
    print(confint(ldl_model, parm = "PRS"))
  } else {
    print("PRS coefficient is not available in the model.")
  }

  # Calculate the PRS scaling factor
  prs_scale_factor <- 40/coef(ldl_model)["PRS"]
  print(paste0(pop, ":", prs_scale_factor))
  #prs_scale_factor = 1

  # Scale the LDL-C PRS so that one unit of the scaled score corresponds to a 40 mg/dL increase in LDL-C
  pop_data$scaled_ldl_prs <- pop_data$PRS / prs_scale_factor
  print(summary(pop_data$scaled_ldl_prs))

  # Step 2: Cox model for CAD -- inherently accounting for age by using age as the time scale
  cox_model <- coxph(Surv(Cad_0_censor_age, Cad_Sta) ~ scaled_ldl_prs + as.factor(f.31.0.0) + PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10, data = pop_data)
  #cox_model <- glm(Cad_Sta ~ scaled_ldl_prs + sex, data = pop_data, family = "binomial")
  cox_models[[pop]] <- cox_model

  glm_model <- glm(Cad_Sta ~ scaled_ldl_prs + as.numeric(enrollage) + as.factor(f.31.0.0) + PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10, data = pop_data, family = "binomial")
  glm_models[[pop]] <- glm_model
}
[1] "PRS Summary"
Min. 1st Qu. Median Mean 3rd Qu. Max.
-4.89093 -0.62501 0.07598 0.00000 0.68240 3.51648
[1] "LDL_coef_CI for population: AFR"
2.5 % 97.5 %
PRS 12.11459 13.50644
[1] "AFR:3.12243539531292"
Min. 1st Qu. Median Mean 3rd Qu. Max.
-1.56638 -0.20017 0.02433 0.00000 0.21855 1.12620
[1] "PRS Summary"
Min. 1st Qu. Median Mean 3rd Qu. Max.
-4.31027 -0.64594 0.03341 0.00000 0.70225 3.22404
[1] "LDL_coef_CI for population: SAS"
2.5 % 97.5 %
PRS 8.15468 9.495848
[1] "SAS:4.53244224048928"
Min. 1st Qu. Median Mean 3rd Qu. Max.
-0.950982 -0.142514 0.007371 0.000000 0.154937 0.711326
[1] "PRS Summary"
Min. 1st Qu. Median Mean 3rd Qu. Max.
-4.89947 -0.65269 0.02883 0.00000 0.68557 4.22746
[1] "LDL_coef_CI for population: EUR"
2.5 % 97.5 %
PRS 11.58239 11.75977
[1] "EUR:3.42727457116371"
Min. 1st Qu. Median Mean 3rd Qu. Max.
-1.429552 -0.190441 0.008413 0.000000 0.200034 1.233475
[1] "PRS Summary"
Min. 1st Qu. Median Mean 3rd Qu. Max.
-3.83980 -0.60837 0.05459 0.00000 0.66863 2.91596
[1] "LDL_coef_CI for population: AMR"
2.5 % 97.5 %
PRS 8.792547 13.88534
[1] "AMR:3.52766605506817"
Min. 1st Qu. Median Mean 3rd Qu. Max.
-1.08848 -0.17246 0.01547 0.00000 0.18954 0.82660
[1] "PRS Summary"
Min. 1st Qu. Median Mean 3rd Qu. Max.
-4.04886 -0.57366 0.09183 0.00000 0.69031 2.75622
[1] "LDL_coef_CI for population: EAS"
2.5 % 97.5 %
PRS 8.997966 11.40513
[1] "EAS:3.9209732005657"
Min. 1st Qu. Median Mean 3rd Qu. Max.
-1.03262 -0.14631 0.02342 0.00000 0.17606 0.70294
Code
# Access and summarize the Cox models for each population
lapply(cox_models, function(x){exp(coef(x)["scaled_ldl_prs"])})
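The Bayesian steps below assume a data frame `results` with columns `Population`, `OR`, `Lower`, and `Upper`, which is not constructed explicitly above. One way to assemble it from the fitted Cox models (a sketch, not the original code; the Cox HRs are placed in the `OR` column, consistent with the HR/OR language below):
Code
# Sketch: build the 'results' data frame assumed by the meta-analysis code.
# confint() on a coxph fit returns 95% CIs on the log-HR scale.
results <- do.call(rbind, lapply(names(cox_models), function(pop) {
  fit <- cox_models[[pop]]
  ci <- confint(fit)["scaled_ldl_prs", ]
  data.frame(Population = pop,
             OR = exp(coef(fit)["scaled_ldl_prs"]),
             Lower = exp(ci[1]),
             Upper = exp(ci[2]))
}))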
Now we want to take a two-step Bayesian approach: first calculate a global mean hazard ratio (HR) or odds ratio (OR) from the population-specific estimates, and then use this global mean as a prior in a Bayesian adjustment of the population-specific estimates. In such a case, weighting each population-specific estimate by its standard error (or variance) when forming the global mean is a good approach, as it accounts for the different levels of uncertainty in each population estimate.
Calculate Weighted Global Mean HR/OR:
For each population, you have an HR/OR and its standard error (SE).
You can weight each HR/OR by the inverse of its variance (the variance is SE²).
The weighted mean HR/OR is then the sum of each weighted HR/OR divided by the sum of the weights.
Use Weighted Mean as Prior for Bayesian Adjustment:
Use the weighted global mean HR/OR as the mean for your prior distribution in the Bayesian model for each population.
The variance of the prior could be based on the pooled standard errors or a measure of the variability of the HR/OR across populations.
Variance and Weights
The variance of each log-transformed odds ratio (OR) can be approximated from the confidence interval:
\[
\text{Variance} \approx \left( \frac{\log(\text{Upper CI}) - \log(\text{Lower CI})}{2 \times 1.96} \right)^2
\]
Based on this variance, the weight for each observation is:
\[
w_i = \frac{1}{\text{Variance}_i}
\]
Weighted Mean
The weighted mean is then calculated as:
\[
\bar{x}_w = \frac{\sum_{i=1}^{n} w_i x_i}{\sum_{i=1}^{n} w_i},
\]
where \(x_i\) is each individual log-transformed OR. Its weighted variance uses \(\sum_{i=1}^{n} w_i^2\), the sum of the squared weights:
\[
s_w^2 = \frac{\sum_{i=1}^{n} w_i}{\left(\sum_{i=1}^{n} w_i\right)^2 - \sum_{i=1}^{n} w_i^2} \sum_{i=1}^{n} w_i \left(x_i - \bar{x}_w\right)^2
\]
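For reference, the per-population Bayesian adjustment implemented in the code below is the standard conjugate normal-normal update of the prior (mean \(\mu_{\text{prior}} = \bar{x}_w\), variance \(\sigma_{\text{prior}}^2 = s_w^2\)) with each population's estimate:
\[
\sigma_{\text{post},i}^2 = \left( \frac{1}{\sigma_{\text{prior}}^2} + \frac{1}{\text{Variance}_i} \right)^{-1},
\qquad
\mu_{\text{post},i} = \sigma_{\text{post},i}^2 \left( \frac{\mu_{\text{prior}}}{\sigma_{\text{prior}}^2} + \frac{x_i}{\text{Variance}_i} \right)
\]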
Code
# Assuming you have a dataframe 'results' with columns 'Population', 'OR', 'Lower', and 'Upper'
# Calculate variance and weights
results <- results %>%
  mutate(
    log_OR = log(OR),
    log_Lower = log(Lower),
    log_Upper = log(Upper),
    Variance = ((log_Upper - log_Lower) / (2*1.96))^2,
    Weight = 1 / Variance
  )

# Calculate weighted mean and variance for the prior
sum_weights <- sum(results$Weight)
sum_squared_weights <- sum(results$Weight^2)
weighted_mean_log_OR <- sum(results$log_OR * results$Weight) / sum_weights
weighted_variance <- (sum_weights / (sum_weights^2 - sum_squared_weights)) * sum(results$Weight * (results$log_OR - weighted_mean_log_OR)^2)

prior_mean <- weighted_mean_log_OR
prior_variance <- weighted_variance

# Perform Bayesian updating for each population
results <- results %>%
  mutate(
    Posterior_Variance = 1 / (1/prior_variance + 1/Variance),
    Posterior_Mean = Posterior_Variance * (prior_mean / prior_variance + log_OR / Variance)
  )

# Exponentiate the posterior mean to get back to the OR scale
results$Posterior_OR <- exp(results$Posterior_Mean)

results <- results %>%
  mutate(
    Posterior_Lower_Log = Posterior_Mean - 1.96*sqrt(Posterior_Variance),
    Posterior_Upper_Log = Posterior_Mean + 1.96*sqrt(Posterior_Variance),
    Posterior_Lower_OR = exp(Posterior_Lower_Log),
    Posterior_Upper_OR = exp(Posterior_Upper_Log)
  )

# Now 'results' contains the posterior mean and variance for each population
print(results)
library(forestplot)
library(ggplot2)
library(dplyr)

# Assuming 'results' is already loaded with your data
# We need to create a new data frame for ggplot
# (earlier gather()-based approach, kept for reference:)
# plot_data <- results %>%
#   select(Population, OR, Lower, Upper, Posterior_OR, Posterior_Lower_OR, Posterior_Upper_OR) %>%
#   gather(key = "Type", value = "Value", -Population, -Lower, -Upper, -Posterior_Lower_OR, -Posterior_Upper_OR) %>%
#   mutate(
#     ymin = ifelse(Type == "OR", Lower, Posterior_Lower_OR),
#     ymax = ifelse(Type == "OR", Upper, Posterior_Upper_OR),
#     Type = factor(Type, levels = c("OR", "Posterior_OR"))
#   )
#
# ggplot(plot_data, aes(y = Population, x = Value, xmin = ymin, xmax = ymax, color = Type)) +
#   geom_errorbarh(height = 0.2) +
#   geom_point(size = 2.5) +
#   scale_color_manual(values = c("OR" = "black", "Posterior_OR" = "red")) +
#   theme_minimal() + lims(x=c(0.5,2.5)) +
#   labs(x = "Odds Ratio", y = "", color = "Estimate Type") +
#   theme(legend.position = "bottom")

# Adding an identifier for each row
results$ID <- seq_along(results$Population)

# Create a long-format data frame for plotting
plot_data <- tidyr::pivot_longer(
  results,
  cols = c("OR", "Posterior_OR"),
  names_to = "Estimate_Type",
  values_to = "Value") %>%
  mutate(
    ymin = ifelse(Estimate_Type == "OR", Lower, Posterior_Lower_OR),
    ymax = ifelse(Estimate_Type == "OR", Upper, Posterior_Upper_OR),
    Estimate_Type = factor(Estimate_Type, levels = c("OR", "Posterior_OR")),
    # Offset the y position so original and posterior estimates sit side by side
    ypos = as.numeric(ID) + ifelse(Estimate_Type == "OR", -0.1, 0.1)
  )

# Create the plot
ggplot(plot_data, aes(x = Value, y = ypos, color = Estimate_Type)) +
  geom_point(size = 3) +
  geom_errorbar(aes(xmin = ymin, xmax = ymax), width = 0.2) +
  scale_color_manual(values = c("OR" = "black", "Posterior_OR" = "red")) +
  labs(x = "Odds Ratio", y = "") +
  theme_minimal() +
  theme(axis.text.y = element_text(vjust = 0.5),
        axis.ticks.y = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        legend.position = "bottom") +
  scale_y_continuous(breaks = 1:length(unique(plot_data$ID)), labels = results$Population)
We can do this in Stan (via brms) and calculate heterogeneity as well:
Code
library(brms)

results <- results %>% mutate(se_log_OR = 1/sqrt(Weight))

# Given prior study's log OR and its standard error
prior_log_or <- log(1/0.76) # log of the hazard ratio from the study
prior_sd <- (log(1/0.64) - log(1/0.91)) / (2*1.96) # standard error of the log OR

# or try this:
# rosuvastatin_with_event <- 235
# rosuvastatin_without_event <- 6361 - rosuvastatin_with_event
#
# placebo_with_event <- 304
# placebo_without_event <- 6344 - placebo_with_event
#
# # Compute the OR
# odds_case <- rosuvastatin_with_event / rosuvastatin_without_event
# odds_control <- placebo_with_event / placebo_without_event
# OR <- odds_control / odds_case
#
# # Convert OR to log scale for the prior
# prior_log_or <- log(OR)
#
# # Calculate standard error of the log OR
# prior_log_or_se <- sqrt(1 / placebo_with_event +
#                         1 / placebo_without_event +
#                         1 / rosuvastatin_with_event +
#                         1 / rosuvastatin_without_event)
#
# # Print the prior log OR and its standard error
# print(prior_log_or)
# print(prior_log_or_se)
#
# # Print the OR
# print(OR)

# Define the priors for the global effect size
#prior_effect <- set_prior(paste("normal(", prior_log_or, ", ", prior_sd, ")"), class = "Intercept")
prior_effect <- set_prior(paste("normal(", weighted_mean_log_OR, ", ", sqrt(weighted_variance), ")"), class = "Intercept")
prior_global <- set_prior(paste0("normal(", prior_log_or, ", ", prior_sd, ")"), class = "Intercept")

# Define a non-informative prior for the heterogeneity
prior_heterogeneity <- set_prior("student_t(3, 0, 10)", class = "sd", group = "Population")

# Fit the model (note: the brm() call uses prior_global, not prior_effect)
meta_analysis_model <- brm(
  bf(log_OR | se(se_log_OR) ~ 1 + (1 | Population)),
  data = results,
  prior = c(prior_global, prior_heterogeneity),
  family = gaussian(),
  chains = 4,
  iter = 4000,
  warmup = 2000,
  control = list(adapt_delta = 0.999999) # adjust for convergence
)

# Check the summary of the model
summary(meta_analysis_model)

# Extract population-specific effects
ranef(meta_analysis_model)
global_effect = fixef(meta_analysis_model)[1,1]

# Get the random effects (population-specific deviations)
population_effects <- ranef(meta_analysis_model)

# Calculate the population-specific log ORs by adding the global effect to the random effects
population_specific_log_ORs <- global_effect + population_effects$Population[,,"Intercept"][,"Estimate"]

# Convert the log ORs to ORs
population_specific_ORs <- exp(population_specific_log_ORs)

# Now you have the population-specific ORs
print(population_specific_ORs)
Heterogeneity assessment:
Code
# Extract the standard deviation of the random effects
heterogeneity_estimate <- VarCorr(meta_analysis_model)$Population$sd

# Print the heterogeneity estimate
print(heterogeneity_estimate)

# Conduct posterior predictive checks
pp_check(meta_analysis_model)

# Plot the posterior distribution of the standard deviation of the random effects
posterior_samples <- posterior_samples(meta_analysis_model, pars = "sd_Population__Intercept")
hist(posterior_samples[, "sd_Population__Intercept"],
     breaks = 40,
     main = "Distribution of Heterogeneity (sd Population)",
     xlab = "Heterogeneity (SD)",
     col = "blue")

# Summarize the posterior distribution of the heterogeneity parameter
posterior_summary <- summary(posterior_samples[, "sd_Population__Intercept"])
print(posterior_summary)