library(dplyr)library(broom)library(tidyr)library(ggplot2)library(data.table)library(ggridges)load("~/Library/CloudStorage/Dropbox-Personal/pheno_dir/output/merged_pheno_censor_final_withdrugs_smoke.rds")pheno_prs=dfheth=readRDS("~/Library/CloudStorage/Dropbox-Personal/pheno_dir/ethnicity.rds")eth=eth[!is.na(eth$f.21000.0.0),]eth=eth[,c("identifier","meaning")]baseline_levs=readRDS("~/Library/CloudStorage/Dropbox-Personal/dfukb_chol_bp_smoke.rds")baseline_levs$hdl=baseline_levs$f.30760.0.0*38.4baseline_levs$ldl=baseline_levs$f.30780.0.0*38.4PCS=readRDS("~/Library/CloudStorage/Dropbox-Personal/pheno_dir/baseline_withPCS.rds")[,c(1,20:39)]colnames(PCS)=c("identifier",paste0(rep("PC",20),seq(1:20)))baseline_levs=merge(baseline_levs,PCS,by="identifier")m2=merge(pheno_prs[,c("identifier","Birthdate","enrollage","Cad_0_Any","Cad_0_censor_age","f.31.0.0")],eth,by.x="identifier",by.y="identifier")m2$Cad_Sta=ifelse(m2$Cad_0_Any==2,1,0)#df2=fread("~/Library/CloudStorage/Dropbox-Personal/UKBB_HardCAD_202109.tsv")# m2=merge(pheno_prs[,c("identifier","Birthdate","enrollage","f.31.0.0")],eth,by.x="identifier",by.y="identifier")#m2=merge(m2,df2[,c("sample_id","has_disease","censor_age")],by.x = "identifier",by.y="sample_id")#m2$Cad_Sta=m2$has_disease#m2$Cad_0_censor_age=m2$censor_age## get these labs readymed_codes=readRDS("~/Library/CloudStorage/Dropbox-Personal/medcodes.rds")#levels(as.factor(med_codes$meaning))## what do you do about meds? I think we need to eliminate, but will skew the distribtuionbaseline_meds=merge(med_codes,baseline_levs,by="identifier")baseline_meds$ldl_adj=ifelse(baseline_meds$meaning%in%"Cholesterol lowering medication",baseline_meds$ldl/0.8,baseline_meds$ldl)baseline_meds$sbp_adj=ifelse(baseline_meds$meaning%in%"Blood pressure medication",baseline_meds$f.4080.0.0+10,baseline_meds$f.4080.0.0)## merge allfinal_dat=merge(m2,baseline_meds[,c("identifier","sbp_adj","ldl_adj","hdl",paste0(rep("PC",10),seq(1:10)))],by.x="identifier",by.y="identifier")
Now we want to calculate ancestry specific thresholds for PRS. Let’s get the PRS for African, European, and Asian ancestry that causes a 40 mmHg increase in LDL cholesterol in the UK Biobank using the AFR, SAS, and EUR individauls present.
Code
# final_dat <- final_dat %>%# mutate(recoded_ethnicity = case_when(# meaning %in% c("African", "Any other Black background", "Black or Black British","Caribbean","Any other mixed background","Mixed") ~ "AFR",# meaning %in% c("White", "Irish", "Any other white background","British") ~ "White",# meaning %in% c("Indian", "Pakistani","Bangladeshi") ~ "South Asian",# meaning == "Chinese" ~ "Chinese",# TRUE ~ "Other" # This will be used for any ethnicity not covered above# ))# # final_dat$Cad_Sta=ifelse(final_dat$Cad_0_Any==2,1,0)# final_dat$sex=final_dat$f.31.0.0# # table(final_dat$recoded_ethnicity)#ggplot(lt,aes(x=Var1,y=value,fill=Var1))+geom_bar(stat="identity")gae=fread("~/Library/CloudStorage/Dropbox-Personal/ukb.kgp_projected.tsv.gz")final_dat=merge(final_dat,gae[,c("eid","rf80")],by.x="identifier",by.y="eid")table(final_dat$rf80)
AFR AMR EAS EUR SAS
8426 707 2249 448138 9085
Code
# prs_SAS=fread("~/Library/CloudStorage/Dropbox-Personal/ukbb_ldlcprs.SAS.tsv")# prs_AFR=fread("~/Library/CloudStorage/Dropbox-Personal/ukbb_ldlcprs.AFR.tsv")# prs_EUR=fread("~/Library/CloudStorage/Dropbox-Personal/ukbb_ldlcprs.EUR.tsv")# SAS_dat=merge(prs_SAS[,c(1:5,20)],final_dat,by.x="eid",by.y="identifier")# AFR_dat=merge(prs_AFR[,c(1:5,20)],final_dat,by.x="eid",by.y="identifier")# EUR_dat=merge(prs_EUR[,c(1:5,20)],final_dat,by.x="eid",by.y="identifier")## with wallace's prs_SAS=fread("~/Library/CloudStorage/Dropbox-Personal/PRSethnicity/SAS_SUM_SCORE_FINAL")prs_AFR=fread("~/Library/CloudStorage/Dropbox-Personal/PRSethnicity/AFR_SUM_SCORE_FINAL")prs_EUR=fread("~/Library/CloudStorage/Dropbox-Personal/PRSethnicity/EUR_SUM_SCORE_FINAL")AFR=final_dat$identifier[final_dat$rf80%in%"AFR"]EUR=final_dat$identifier[final_dat$rf80%in%"EUR"]SAS=final_dat$identifier[final_dat$rf80%in%"SAS"]colnames(prs_SAS)=colnames(prs_AFR)=colnames(prs_EUR)=c("IID","PRS")SAS_dat=merge(prs_SAS,final_dat[final_dat$rf80%in%"SAS",],by.x="IID",by.y="identifier")AFR_dat=merge(prs_AFR,final_dat[final_dat$rf80%in%"AFR",],by.x="IID",by.y="identifier")EUR_dat=merge(prs_EUR,final_dat[final_dat$rf80%in%"EUR",],by.x="IID",by.y="identifier")pop_data_list=list(SAS_dat,AFR_dat,EUR_dat)names(pop_data_list)=c("SAS","AFR","EUR")
Histograms
We note that the distributions of these PRS are skewed for non EUR pops but will normalize for our analysis:
Code
prs_AFR$Population <-'AFR';prs_AFR=prs_AFR[prs_AFR$IID%in%AFR]prs_EUR$Population <-'EUR';prs_EUR=prs_EUR[prs_EUR$IID%in%EUR]prs_SAS$Population <-'SAS';prs_SAS=prs_SAS[prs_SAS$IID%in%SAS]combined_data <-rbind(prs_AFR, prs_EUR, prs_SAS)ggplot(combined_data, aes(x = PRS, y = Population, fill = Population)) +geom_density_ridges(alpha =0.7) +scale_fill_brewer(palette ="Set1") +theme_ridges() +# Optional: Adds a nice theme suitable for ridgeline plotslabs(title ="Density Distribution of Sum Scores",x ="Sum Score",y ="Population")
Picking joint bandwidth of 0.0353
Using residual PRS
Code
totaldata=do.call(rbind,pop_data_list)pcmod=lm(PRS~PC1+PC2+PC3+PC4+PC5,data = totaldata)totaldata$predictscore=predict(object = pcmod,newdata=totaldata)totaldata$residscore=totaldata$PRS-totaldata$predictscoretotaldata$Population=as.factor(totaldata$rf80)totaldata$scaled_resid=scale(totaldata$residscore)ggplot(totaldata, aes(x = residscore, y = Population, fill = Population)) +geom_density_ridges(alpha =0.7) +scale_fill_brewer(palette ="Set1") +theme_ridges() +# Optional: Adds a nice theme suitable for ridgeline plotslabs(title ="Density Distribution of Sum Scores",x ="Sum Score",y ="Population")
Picking joint bandwidth of 0.0353
Impact of PRS
To assess the impact of LDL-C PRS on CAD risk, you need to establish a two-step relationship: first, between LDL-C PRS and LDL-C levels (per 40 mg/dL change), and then between LDL-C PRS and CAD outcomes. This approach allows you to understand how changes in LDL-C levels attributed to genetic risk (as quantified by PRS) relate to CAD risk. Here’s how you can approach this analysis for both European (EUR) and African (AFR) populations:
Linear Regression for LDL-C Levels: Fit a linear regression model with LDL-C levels as the outcome and LDL-C PRS as the predictor. You might want to adjust for relevant covariates.
Rescale the Coefficient for LDL-C PRS: Adjust the coefficient from this model to reflect a 40 mg/dL change in LDL-C.
Step 2: Associate Rescaled LDL-C PRS with CAD
Incorporate Rescaled LDL-C PRS into Cox Model: Use the rescaled LDL-C PRS from Step 1 as a predictor in a Cox proportional hazards model to analyze its relationship with CAD risk.
Perform Analysis for Both EUR and AFR Populations: Repeat these steps separately for EUR and AFR populations to capture population-specific effects.
Here’s an example in R for the European population. The same steps would be repeated for the African population:
Code
library(dplyr)library(survival)# Assuming 'df' is your dataset# List of unique populationspopulations <-c("AFR","SAS","EUR")# Initialize an empty list for storing models and resultscox_models <-list()glm_models =list()for(pop in populations) {# Subset data for the specific population pop_data=totaldata[totaldata$rf80%in%pop,]# Choose residual score or PRS#pop_data$PRS=scale(pop_data$PRS) pop_data$PRS=scale(pop_data$residscore)print("PRS Summary")summary(pop_data$PRS)# Step 1: Linear Regression for LDL-C Levelsif(pop=="AFR"){ ldl_model <-lm(ldl_adj ~ PRS +as.numeric(enrollage) +as.factor(f.31.0.0)+PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10, data = pop_data)print(paste("LDL_coef_CI:",pop,":",confint(ldl_model,parm ="PRS"))) }else{ ldl_model <-lm(ldl_adj ~ PRS +as.numeric(enrollage) +as.factor(f.31.0.0)+PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10#+as.factor(array) , data = pop_data)print(paste("LDL_coef_CI:",pop,":",confint(ldl_model,parm ="PRS"))) }# Calculate the PRS scaling factor prs_scale_factor <-40/coef(ldl_model)["PRS"]# Scale the LDL-C PRS so that now the scaled translates to an increase of 40 mgDL pop_data$scaled_ldl_prs <- pop_data$PRS / prs_scale_factor# Step 2: Cox Model for CAD cox_model <-coxph(Surv(Cad_0_censor_age, Cad_Sta) ~ scaled_ldl_prs+as.numeric(enrollage)+as.factor(f.31.0.0)+PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10, data = pop_data)#cox_model <- glm(Cad_Sta ~ scaled_ldl_prs+sex, data = pop_data,family = "binomial") cox_models[[pop]] <- cox_model glm_model <-glm(Cad_Sta ~ scaled_ldl_prs+as.numeric(enrollage) +as.factor(f.31.0.0)+PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10, data = pop_data,family ="binomial") glm_models[[pop]] <- glm_model}
Now we want to take a two-step Bayesian approach where you first calculate a global mean hazard ratio (HR) or odds ratio (OR) from population-specific estimates and then use this global mean as a prior in a Bayesian adjustment of the population-specific estimates. In such a case, weighting the global mean by the standard error (or variance) of each population-specific estimate is indeed a good approach, as it accounts for the different levels of uncertainty in each population estimate.
Calculate Weighted Global Mean HR/OR:
For each population, you have an HR/OR and its standard error (SE).
You can weight the HR/OR by the inverse of the variance (which is SE²).
The weighted mean HR/OR is then the sum of each weighted HR/OR divided by the sum of the weights.
Use Weighted Mean as Prior for Bayesian Adjustment:
Use the weighted global mean HR/OR as the mean for your prior distribution in the Bayesian model for each population.
The variance of the prior could be based on the pooled standard errors or a measure of the variability of the HR/OR across populations.
Variance and Weights
$$$$The variance of each log-transformed odds ratio (OR) can be approximated from the confidence interval: \[\text{Variance} \approx \left( \frac{\log(\text{Upper CI}) - \log(\text{Lower CI})}{2 \times 1.96} \right)^2
\]
Based on this variance, the weight for each observation is:
\[
w_i = \frac{1}{\text{Variance}_i}
\]
Weighted Mean The weighted mean is then calculated as:
\[ \sum_{i=1}^{n} w_i^2 \] is the sum of the squared weights, and \[ x_i \] is each individual log-transformed OR.
Code
# Assuming you have a dataframe 'results' with columns 'Population', 'OR', and 'SE'# Calculate variance and weightsresults <- results %>%mutate(log_OR =log(OR),log_Lower =log(Lower),log_Upper =log(Upper),Variance = ((log_Upper - log_Lower) / (2*1.96))^2,Weight =1/ Variance )# Calculate weighted mean and variance for the priorsum_weights <-sum(results$Weight)sum_squared_weights <-sum(results$Weight^2)weighted_mean_log_OR <-sum(results$log_OR * results$Weight) / sum_weightsweighted_variance <- (sum_weights / (sum_weights^2- sum_squared_weights)) *sum(results$Weight * (results$log_OR - weighted_mean_log_OR)^2)prior_mean <- weighted_mean_log_ORprior_variance <- weighted_variance# Perform Bayesian updating for each populationresults <- results %>%mutate(Posterior_Variance =1/ (1/ prior_variance +1/ Variance),Posterior_Mean = Posterior_Variance * (prior_mean / prior_variance + log_OR / Variance) )# Exponentiating the posterior mean to get back to the OR scaleresults$Posterior_OR <-exp(results$Posterior_Mean)results <- results %>%mutate(Posterior_Lower_Log = Posterior_Mean -1.96*sqrt(Posterior_Variance),Posterior_Upper_Log = Posterior_Mean +1.96*sqrt(Posterior_Variance),Posterior_Lower_OR =exp(Posterior_Lower_Log),Posterior_Upper_OR =exp(Posterior_Upper_Log) )# Now 'results' contains the posterior mean and variance for each populationprint(results)
library(forestplot)library(ggplot2)library(dplyr)# Assuming 'results' is already loaded with your data# We need to create a new data frame for ggplot# First, we create two separate data frames for original and posterior results and then bind them together# plot_data <- results %>%# select(Population, OR, Lower, Upper, Posterior_OR, Posterior_Lower_OR, Posterior_Upper_OR) %>%# gather(key = "Type", value = "Value", -Population, -Lower, -Upper, -Posterior_Lower_OR, -Posterior_Upper_OR) %>%# mutate(# ymin = ifelse(Type == "OR", Lower, Posterior_Lower_OR),# ymax = ifelse(Type == "OR", Upper, Posterior_Upper_OR),# Type = factor(Type, levels = c("OR", "Posterior_OR"))# )# # # Now we plot with ggplot2# ggplot(plot_data, aes(y = Population, x = Value, xmin = ymin, xmax = ymax, color = Type)) +# geom_errorbarh(height = 0.2) +# geom_point(size = 2.5) +# scale_color_manual(values = c("OR" = "black", "Posterior_OR" = "red")) +# theme_minimal() + lims(x=c(0.5,2.5))+# labs(x = "Odds Ratio", y = "", color = "Estimate Type") +# theme(legend.position = "bottom")# Adding an identifier for each rowresults$ID <-seq_along(results$Population)results$ID <-seq_along(results$Population)# Create a long-format data frame for plottingplot_data <- tidyr::pivot_longer( results, cols =c("OR", "Posterior_OR"), names_to ="Estimate_Type", values_to ="Value") %>%mutate(ymin =ifelse(Estimate_Type =="OR", Lower, Posterior_Lower_OR),ymax =ifelse(Estimate_Type =="OR", Upper, Posterior_Upper_OR),Estimate_Type =factor(Estimate_Type, levels =c("OR", "Posterior_OR")),# Adjust the y position of the posterior estimatesypos =as.numeric(ID) +ifelse(Estimate_Type =="OR", -0.1, 0.1) )# Create the plotggplot(plot_data, aes(x = Value, y = ypos, color = Estimate_Type)) +geom_point(size =3) +geom_errorbar(aes(xmin = ymin, xmax = ymax), width =0.2) +scale_color_manual(values =c("OR"="black", "Posterior_OR"="red")) +labs(x ="Odds Ratio", y ="") +theme_minimal() +theme(axis.text.y =element_text(vjust =0.5), axis.ticks.y =element_blank(),panel.grid.major =element_blank(),panel.grid.minor =element_blank(),legend.position ="bottom") +scale_y_continuous(breaks =1:length(unique(plot_data$ID)), labels = results$Population) +xlim(0.8, 2.5) # Set x-axis limits
We can do this in stan and calculate heterogneiety as well:
Code
library(brms)results <- results %>%mutate(se_log_OR =1/sqrt(Weight))# Given prior study's log OR and its standard errorprior_log_or <-log(1/0.76) # log of the hazard ratio from the studyprior_sd <- (log(1/0.64) -log(1/0.91)) / (2*1.96) # standard error of the log OR# or try this:# # rosuvastatin_with_event <- 235# rosuvastatin_without_event <- 6361 - rosuvastatin_with_event# # placebo_with_event <- 304# placebo_without_event <- 6344 - placebo_with_event# # # Compute the OR# odds_case <- rosuvastatin_with_event / rosuvastatin_without_event# odds_control <- placebo_with_event / placebo_without_event# OR <- odds_control / odds_case# # # Convert OR to log scale for the prior# prior_log_or <- log(OR)# # # Calculate standard error of the log OR# prior_log_or_se <- sqrt(1 / placebo_with_event +# 1 / placebo_without_event +# 1 / rosuvastatin_with_event +# 1 / rosuvastatin_without_event)# # # Print the prior log OR and its standard error# print(prior_log_or)# print(prior_log_or_se)# # # Print the OR# print(OR)# # Define the priors based on the prior study for the global effect size#prior_effect <- set_prior(paste("normal(", prior_log_or, ", ", prior_log_or_se, ")"), class = "Intercept")prior_effect <-set_prior(paste("normal(", weighted_mean_log_OR, ", ", sqrt(weighted_variance), ")"), class ="Intercept")# Define a non-informative prior for the heterogeneityprior_heterogeneity <-set_prior("student_t(3, 0, 10)", class ="sd", group ="Population")prior_global <-set_prior(paste0("normal(", prior_log_or, ", ", prior_log_or_se, ")"), class ="Intercept")prior_heterogeneity <-set_prior("student_t(3, 0, 10)", class ="sd",group ="Population")# Fit the modelmeta_analysis_model <-brm(bf(log_OR |se(se_log_OR) ~1+ (1| Population)),data = results,prior =c(prior_global, prior_heterogeneity),family =gaussian(),chains =4,iter =4000,warmup =2000,control =list(adapt_delta =0.999999) # adjust for convergence)# Check the summary of the modelsummary(meta_analysis_model)# Extract population-specific effectsranef(meta_analysis_model)global_effect=fixef(meta_analysis_model)[1,1]# Get the random effects (population-specific deviations)population_effects <-ranef(meta_analysis_model)# Calculate the population-specific log ORs by adding the global effect to the random effectspopulation_specific_log_ORs <- global_effect + population_effects$Population[,,"Intercept"][,"Estimate"]# Convert the log ORs to ORspopulation_specific_ORs <-exp(population_specific_log_ORs)# Now you have the population-specific ORsprint(population_specific_ORs)
Heterogeneity assessment:
Code
# Extract the standard deviation of the random effectsheterogeneity_estimate <-VarCorr(meta_analysis_model)$Population$sd# Print the heterogeneity estimateprint(heterogeneity_estimate)# Conduct posterior predictive checkspp_check(meta_analysis_model)# If you want to plot the distribution of the standard deviation of the random effectsposterior_samples <-posterior_samples(meta_analysis_model, pars ="sd_Population__Intercept")hist(posterior_samples[, "sd_Population__Intercept"], breaks =40, main ="Distribution of Heterogeneity (sd Population)",x="Heterogeneity (SD)",fill="blue")# Summarize the posterior distribution of the heterogeneity parameterposterior_summary <-summary(posterior_samples[, "sd_Population__Intercept"])print(posterior_summary)