We previously validated that the UKB phenotype implementation of Hard CAD matches the HARD CAD definition, so we will use it here to be consistent with the AoU HARD CAD phenotype.
Because we are using age as the time scale (or a GLM with "ever event" status), follow-up time is really just time from birth to the event or censoring.
We know that time is inhomogeneous: even if we use the OR instead of the HR, the OR will tend to decrease with age because more non-genetically mediated CAD events accrue at older ages.
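To see this attenuation directly, one could estimate the per-SD OR within age bands. A minimal sketch, assuming `m2` as below with case status (`Cad_Sta`) and censor age (`Cad_0_censor_age`), plus a standardized score column (`prs_std` is a hypothetical name; the actual PRS are loaded later in this document):
Code
library(dplyr)

# Illustrative only: `prs_std` is a hypothetical standardized PRS column
or_by_age <- m2 %>%
  mutate(age_band = cut(Cad_0_censor_age, breaks = c(40, 55, 65, 75, 90))) %>%
  group_by(age_band) %>%
  group_modify(~ {
    fit <- glm(Cad_Sta ~ prs_std, data = .x, family = "binomial")
    data.frame(OR = exp(coef(fit)["prs_std"]))
  })
print(or_by_age)  # the per-SD OR should attenuate in the older bands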
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.04654 55.13689 62.57906 62.05269 70.03970 84.76112
Code
summary(m2$Cad_0_censor_age[m2$Cad_Sta==0])
Min. 1st Qu. Median Mean 3rd Qu. Max.
40.38 63.13 70.51 69.60 76.13 88.63
Code
## old phenotypes
# df2=fread("~/Library/CloudStorage/Dropbox-Personal/UKBB_HardCAD_202109.tsv")
# m2=merge(pheno_prs[,c("identifier","Birthdate","enrollage","f.31.0.0")],eth,by.x="identifier",by.y="identifier")
# m2=merge(m2,df2[,c("sample_id","has_disease","censor_age")],by.x = "identifier",by.y="sample_id")
# m2$Cad_Sta=m2$has_disease
# m2$Cad_0_censor_age=m2$censor_age

## get these labs ready
med_codes=readRDS("~/Library/CloudStorage/Dropbox-Personal/medcodes.rds")
#levels(as.factor(med_codes$meaning))

## what do you do about meds? I think we need to eliminate, but will skew the distribution
baseline_meds=merge(med_codes,baseline_levs,by="identifier")
baseline_meds$ldl_adj=ifelse(baseline_meds$meaning%in%"Cholesterol lowering medication",baseline_meds$ldl/0.8,baseline_meds$ldl)
baseline_meds$sbp_adj=ifelse(baseline_meds$meaning%in%"Blood pressure medication",baseline_meds$f.4080.0.0+10,baseline_meds$f.4080.0.0)

## merge all
final_dat=merge(m2,baseline_meds[,c("identifier","sbp_adj","ldl_adj","hdl",paste0(rep("PC",10),seq(1:10)))],by.x="identifier",by.y="identifier")
library(lubridate)
Attaching package: 'lubridate'
The following objects are masked from 'package:data.table':
hour, isoweek, mday, minute, month, quarter, second, wday, week,
yday, year
The following objects are masked from 'package:base':
date, intersect, setdiff, union
Now we want to calculate ancestry-specific thresholds for the PRS. Let's find the PRS increment that corresponds to a 40 mg/dL increase in LDL cholesterol for African, European, and Asian ancestry in the UK Biobank, using the AFR, SAS, and EUR individuals present.
Code
# final_dat <- final_dat %>%
#   mutate(recoded_ethnicity = case_when(
#     meaning %in% c("African", "Any other Black background", "Black or Black British","Caribbean","Any other mixed background","Mixed") ~ "AFR",
#     meaning %in% c("White", "Irish", "Any other white background","British") ~ "White",
#     meaning %in% c("Indian", "Pakistani","Bangladeshi") ~ "South Asian",
#     meaning == "Chinese" ~ "Chinese",
#     TRUE ~ "Other" # This will be used for any ethnicity not covered above
#   ))
#
# final_dat$Cad_Sta=ifelse(final_dat$Cad_0_Any==2,1,0)
# final_dat$sex=final_dat$f.31.0.0
#
# table(final_dat$recoded_ethnicity)
#ggplot(lt,aes(x=Var1,y=value,fill=Var1))+geom_bar(stat="identity")

gae=fread("~/Library/CloudStorage/Dropbox-Personal/ukb.kgp_projected.tsv.gz")
final_dat=merge(final_dat,gae[,c("eid","rf80")],by.x="identifier",by.y="eid")
table(final_dat$rf80)
AFR AMR EAS EUR SAS
8426 707 2249 448106 9085
Code
## with wallace's
prs_SAS=fread("~/Library/CloudStorage/Dropbox-Personal/final_mixed_ethnicity_scores/SAS_SUM_SCORE_FINAL_mixed")
prs_AFR=fread("~/Library/CloudStorage/Dropbox-Personal/final_mixed_ethnicity_scores/AFR_SUM_SCORE_FINAL_mixed")
prs_EUR=fread("~/Library/CloudStorage/Dropbox-Personal/final_mixed_ethnicity_scores/EUR_SUM_SCORE_FINAL_mixed")
prs_EAS=fread("~/Library/CloudStorage/Dropbox-Personal/final_mixed_ethnicity_scores/EAS_SUM_SCORE_FINAL_mixed")
prs_AMR=fread("~/Library/CloudStorage/Dropbox-Personal/final_mixed_ethnicity_scores/HISP_SUM_SCORE_FINAL_mixed")

AFR=final_dat$identifier[final_dat$rf80%in%"AFR"]
EUR=final_dat$identifier[final_dat$rf80%in%"EUR"]
SAS=final_dat$identifier[final_dat$rf80%in%"SAS"]
AMR=final_dat$identifier[final_dat$rf80%in%"AMR"]
EAS=final_dat$identifier[final_dat$rf80%in%"EAS"]

colnames(prs_SAS)=colnames(prs_AFR)=colnames(prs_EUR)=colnames(prs_EAS)=colnames(prs_AMR)=c("IID","PRS")

SAS_dat=merge(prs_SAS,final_dat[final_dat$rf80%in%"SAS",],by.x="IID",by.y="identifier")
AFR_dat=merge(prs_AFR,final_dat[final_dat$rf80%in%"AFR",],by.x="IID",by.y="identifier")
EUR_dat=merge(prs_EUR,final_dat[final_dat$rf80%in%"EUR",],by.x="IID",by.y="identifier")
AMR_dat=merge(prs_AMR,final_dat[final_dat$rf80%in%"AMR",],by.x="IID",by.y="identifier")
EAS_dat=merge(prs_EAS,final_dat[final_dat$rf80%in%"EAS",],by.x="IID",by.y="identifier")

pop_data_list=list(SAS_dat,AFR_dat,EUR_dat,AMR_dat,EAS_dat)
names(pop_data_list)=c("SAS","AFR","EUR","AMR","EAS")

final_dat=final_dat[!is.na(final_dat$rf80),]
final_dat%>%group_by(rf80)%>%summarise("cad_rate"=mean(Cad_Sta),"mean_ldl"=mean(na.omit(ldl_adj)),"%M"=mean(na.omit(as.numeric(f.31.0.0))),"meanAge"=as.numeric(mean(enrollage)))
We note that the distributions of these PRS are skewed for the non-EUR populations; we will normalize them for our analysis:
Code
library(ggridges) # needed for geom_density_ridges() and theme_ridges()

prs_AFR$Population <- 'AFR'; prs_AFR=prs_AFR[prs_AFR$IID%in%AFR]
prs_EUR$Population <- 'EUR'; prs_EUR=prs_EUR[prs_EUR$IID%in%EUR]
prs_SAS$Population <- 'SAS'; prs_SAS=prs_SAS[prs_SAS$IID%in%SAS]
prs_AMR$Population = 'AMR'; prs_AMR=prs_AMR[prs_AMR$IID%in%AMR]
prs_EAS$Population = 'EAS'; prs_EAS=prs_EAS[prs_EAS$IID%in%EAS]

combined_data <- rbind(prs_AFR, prs_EUR, prs_SAS, prs_AMR, prs_EAS)

ggplot(combined_data, aes(x = PRS, y = Population, fill = Population)) +
  geom_density_ridges(alpha = 0.7) +
  scale_fill_brewer(palette = "Set1") +
  theme_ridges() + # Optional: adds a theme suited to ridgeline plots
  labs(title = "Density Distribution of Sum Scores",
       x = "Sum Score",
       y = "Population")
Picking joint bandwidth of 0.0469
Using the residual PRS (the PRS residualized on the first 10 genetic PCs):
Code
totaldata=do.call(rbind,pop_data_list)
pcmod=lm(PRS~PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10,data = totaldata)
totaldata$predictscore=predict(object = pcmod,newdata=totaldata)
totaldata$residscore=totaldata$PRS-totaldata$predictscore
totaldata$Population=as.factor(totaldata$rf80)
totaldata$scaled_resid=scale(totaldata$residscore)

ggplot(totaldata, aes(x = residscore, y = Population, fill = Population)) +
  geom_density_ridges(alpha = 0.7) +
  scale_fill_brewer(palette = "Set1") +
  theme_ridges() +
  labs(title = "Density Distribution of Residual Scores",
       x = "Residual Score",
       y = "Population")
Picking joint bandwidth of 0.0473
Code
totaldata$rf80[totaldata$rf80=="AMR"]="AMR" # no-op; retained from the original recoding step
Impact of PRS
To assess the impact of LDL-C PRS on CAD risk, you need to establish a two-step relationship: first, between LDL-C PRS and LDL-C levels (per 40 mg/dL change), and then between LDL-C PRS and CAD outcomes. This approach allows you to understand how changes in LDL-C levels attributed to genetic risk (as quantified by PRS) relate to CAD risk. Here's how you can approach this analysis for both European (EUR) and African (AFR) populations:
Step 1: Associate LDL-C PRS with LDL-C Levels
Linear Regression for LDL-C Levels: Fit a linear regression model with LDL-C levels as the outcome and LDL-C PRS as the predictor. You might want to adjust for relevant covariates.
Rescale the Coefficient for LDL-C PRS: Adjust the coefficient from this model to reflect a 40 mg/dL change in LDL-C.
Step 2: Associate Rescaled LDL-C PRS with CAD
Incorporate Rescaled LDL-C PRS into Cox Model: Use the rescaled LDL-C PRS from Step 1 as a predictor in a Cox proportional hazards model to analyze its relationship with CAD risk.
Perform Analysis for Both EUR and AFR Populations: Repeat these steps separately for EUR and AFR populations to capture population-specific effects.
Here's an example in R; the loop below repeats the same steps for each population:
Code
library(dplyr)
library(survival)

# Uses the 'totaldata' frame assembled above
# List of unique populations
populations <- c("AFR","SAS","EUR","AMR","EAS")

# Initialize empty lists for storing models and results
cox_models <- list()
glm_models <- list()

for (pop in populations) {
  # Subset data for the specific population
  pop_data = totaldata[totaldata$rf80 %in% pop,]

  # Choose residual score or PRS (the residual score is what is ultimately used)
  pop_data$PRS = scale(pop_data$PRS)
  pop_data$PRS = scale(pop_data$residscore)
  #pop_data$PRS = pop_data$residscore

  print("PRS Summary")
  print(summary(pop_data$PRS))

  # Step 1: Linear regression for LDL-C levels
  #if (pop == "EUR") {
  ldl_model <- lm(ldl_adj ~ PRS + as.numeric(enrollage) + as.factor(f.31.0.0) + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10, data = pop_data)
  #} else {
  #  pop_data$PRS = -1*pop_data$PRS
  #  ldl_model <- lm(ldl_adj ~ PRS + as.numeric(enrollage) + as.factor(f.31.0.0) + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10, data = pop_data)
  #}

  print(paste("LDL_coef_CI for population:", pop))
  if ("PRS" %in% names(coef(ldl_model))) {
    print(confint(ldl_model, parm = "PRS"))
  } else {
    print("PRS coefficient is not available in the model.")
  }

  # Calculate the PRS scaling factor
  prs_scale_factor <- 40/coef(ldl_model)["PRS"]
  print(paste0(pop, ":", prs_scale_factor))
  #prs_scale_factor = 1

  # Scale the LDL-C PRS so that one unit of the scaled score corresponds to a 40 mg/dL increase in LDL-C
  pop_data$scaled_ldl_prs <- pop_data$PRS / prs_scale_factor
  print(summary(pop_data$scaled_ldl_prs))

  # Step 2: Cox model for CAD -- inherently accounting for age by using age as the time scale
  cox_model <- coxph(Surv(Cad_0_censor_age, Cad_Sta) ~ scaled_ldl_prs + as.factor(f.31.0.0) + PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10, data = pop_data)
  #cox_model <- glm(Cad_Sta ~ scaled_ldl_prs + sex, data = pop_data, family = "binomial")
  cox_models[[pop]] <- cox_model

  glm_model <- glm(Cad_Sta ~ scaled_ldl_prs + as.numeric(enrollage) + as.factor(f.31.0.0) + PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10, data = pop_data, family = "binomial")
  glm_models[[pop]] <- glm_model
}
[1] "PRS Summary"
Min. 1st Qu. Median Mean 3rd Qu. Max.
-4.89093 -0.62501 0.07598 0.00000 0.68240 3.51648
[1] "LDL_coef_CI for population: AFR"
2.5 % 97.5 %
PRS 12.11459 13.50644
[1] "AFR:3.12243539531292"
Min. 1st Qu. Median Mean 3rd Qu. Max.
-1.56638 -0.20017 0.02433 0.00000 0.21855 1.12620
[1] "PRS Summary"
Min. 1st Qu. Median Mean 3rd Qu. Max.
-4.31027 -0.64594 0.03341 0.00000 0.70225 3.22404
[1] "LDL_coef_CI for population: SAS"
2.5 % 97.5 %
PRS 8.15468 9.495848
[1] "SAS:4.53244224048928"
Min. 1st Qu. Median Mean 3rd Qu. Max.
-0.950982 -0.142514 0.007371 0.000000 0.154937 0.711326
[1] "PRS Summary"
Min. 1st Qu. Median Mean 3rd Qu. Max.
-4.89947 -0.65269 0.02883 0.00000 0.68557 4.22746
[1] "LDL_coef_CI for population: EUR"
2.5 % 97.5 %
PRS 11.58239 11.75977
[1] "EUR:3.42727457116371"
Min. 1st Qu. Median Mean 3rd Qu. Max.
-1.429552 -0.190441 0.008413 0.000000 0.200034 1.233475
[1] "PRS Summary"
Min. 1st Qu. Median Mean 3rd Qu. Max.
-3.83980 -0.60837 0.05459 0.00000 0.66863 2.91596
[1] "LDL_coef_CI for population: AMR"
2.5 % 97.5 %
PRS 8.792547 13.88534
[1] "AMR:3.52766605506817"
Min. 1st Qu. Median Mean 3rd Qu. Max.
-1.08848 -0.17246 0.01547 0.00000 0.18954 0.82660
[1] "PRS Summary"
Min. 1st Qu. Median Mean 3rd Qu. Max.
-4.04886 -0.57366 0.09183 0.00000 0.69031 2.75622
[1] "LDL_coef_CI for population: EAS"
2.5 % 97.5 %
PRS 8.997966 11.40513
[1] "EAS:3.9209732005657"
Min. 1st Qu. Median Mean 3rd Qu. Max.
-1.03262 -0.14631 0.02342 0.00000 0.17606 0.70294
Code
# Access and summarize the Cox models for each population
lapply(cox_models, function(x){exp(coef(x)["scaled_ldl_prs"])})
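The Bayesian steps below assume a data frame `results` with columns `Population`, `OR`, `Lower`, and `Upper`, which is not constructed explicitly above. One way to assemble it from the fitted Cox models (a sketch, not the original code; the Cox HRs are placed in the `OR` column, consistent with the HR/OR language below):
Code
# Sketch: build the 'results' data frame assumed by the meta-analysis code.
# confint() on a coxph fit returns 95% CIs on the log-HR scale.
results <- do.call(rbind, lapply(names(cox_models), function(pop) {
  fit <- cox_models[[pop]]
  ci <- confint(fit)["scaled_ldl_prs", ]
  data.frame(Population = pop,
             OR = exp(coef(fit)["scaled_ldl_prs"]),
             Lower = exp(ci[1]),
             Upper = exp(ci[2]))
}))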
Now we want to take a two-step Bayesian approach: first calculate a global mean hazard ratio (HR) or odds ratio (OR) from the population-specific estimates, and then use this global mean as a prior in a Bayesian adjustment of the population-specific estimates. In such a case, weighting each population-specific estimate by its standard error (or variance) when forming the global mean is a good approach, as it accounts for the different levels of uncertainty in each population estimate.
Calculate Weighted Global Mean HR/OR:
For each population, you have an HR/OR and its standard error (SE).
You can weight each HR/OR by the inverse of its variance (the variance is SE²).
The weighted mean HR/OR is then the sum of each weighted HR/OR divided by the sum of the weights.
Use Weighted Mean as Prior for Bayesian Adjustment:
Use the weighted global mean HR/OR as the mean for your prior distribution in the Bayesian model for each population.
The variance of the prior could be based on the pooled standard errors or a measure of the variability of the HR/OR across populations.
Variance and Weights
The variance of each log-transformed odds ratio (OR) can be approximated from the confidence interval:
\[
\text{Variance} \approx \left( \frac{\log(\text{Upper CI}) - \log(\text{Lower CI})}{2 \times 1.96} \right)^2
\]
Based on this variance, the weight for each observation is:
\[
w_i = \frac{1}{\text{Variance}_i}
\]
Weighted Mean
The weighted mean is then calculated as:
\[
\bar{x}_w = \frac{\sum_{i=1}^{n} w_i x_i}{\sum_{i=1}^{n} w_i},
\]
where \(x_i\) is each individual log-transformed OR. Its weighted variance uses \(\sum_{i=1}^{n} w_i^2\), the sum of the squared weights:
\[
s_w^2 = \frac{\sum_{i=1}^{n} w_i}{\left(\sum_{i=1}^{n} w_i\right)^2 - \sum_{i=1}^{n} w_i^2} \sum_{i=1}^{n} w_i \left(x_i - \bar{x}_w\right)^2
\]
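For reference, the per-population Bayesian adjustment implemented in the code below is the standard conjugate normal-normal update of the prior (mean \(\mu_{\text{prior}} = \bar{x}_w\), variance \(\sigma_{\text{prior}}^2 = s_w^2\)) with each population's estimate:
\[
\sigma_{\text{post},i}^2 = \left( \frac{1}{\sigma_{\text{prior}}^2} + \frac{1}{\text{Variance}_i} \right)^{-1},
\qquad
\mu_{\text{post},i} = \sigma_{\text{post},i}^2 \left( \frac{\mu_{\text{prior}}}{\sigma_{\text{prior}}^2} + \frac{x_i}{\text{Variance}_i} \right)
\]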
Code
# Assuming you have a dataframe 'results' with columns 'Population', 'OR', 'Lower', and 'Upper'
# Calculate variance and weights
results <- results %>%
  mutate(
    log_OR = log(OR),
    log_Lower = log(Lower),
    log_Upper = log(Upper),
    Variance = ((log_Upper - log_Lower) / (2*1.96))^2,
    Weight = 1 / Variance
  )

# Calculate weighted mean and variance for the prior
sum_weights <- sum(results$Weight)
sum_squared_weights <- sum(results$Weight^2)
weighted_mean_log_OR <- sum(results$log_OR * results$Weight) / sum_weights
weighted_variance <- (sum_weights / (sum_weights^2 - sum_squared_weights)) * sum(results$Weight * (results$log_OR - weighted_mean_log_OR)^2)

prior_mean <- weighted_mean_log_OR
prior_variance <- weighted_variance

# Perform Bayesian updating for each population
results <- results %>%
  mutate(
    Posterior_Variance = 1 / (1/prior_variance + 1/Variance),
    Posterior_Mean = Posterior_Variance * (prior_mean / prior_variance + log_OR / Variance)
  )

# Exponentiate the posterior mean to get back to the OR scale
results$Posterior_OR <- exp(results$Posterior_Mean)

results <- results %>%
  mutate(
    Posterior_Lower_Log = Posterior_Mean - 1.96*sqrt(Posterior_Variance),
    Posterior_Upper_Log = Posterior_Mean + 1.96*sqrt(Posterior_Variance),
    Posterior_Lower_OR = exp(Posterior_Lower_Log),
    Posterior_Upper_OR = exp(Posterior_Upper_Log)
  )

# Now 'results' contains the posterior mean and variance for each population
print(results)
library(forestplot)
library(ggplot2)
library(dplyr)

# Assuming 'results' is already loaded with your data
# We need to create a new data frame for ggplot
# (earlier gather()-based approach, kept for reference:)
# plot_data <- results %>%
#   select(Population, OR, Lower, Upper, Posterior_OR, Posterior_Lower_OR, Posterior_Upper_OR) %>%
#   gather(key = "Type", value = "Value", -Population, -Lower, -Upper, -Posterior_Lower_OR, -Posterior_Upper_OR) %>%
#   mutate(
#     ymin = ifelse(Type == "OR", Lower, Posterior_Lower_OR),
#     ymax = ifelse(Type == "OR", Upper, Posterior_Upper_OR),
#     Type = factor(Type, levels = c("OR", "Posterior_OR"))
#   )
#
# ggplot(plot_data, aes(y = Population, x = Value, xmin = ymin, xmax = ymax, color = Type)) +
#   geom_errorbarh(height = 0.2) +
#   geom_point(size = 2.5) +
#   scale_color_manual(values = c("OR" = "black", "Posterior_OR" = "red")) +
#   theme_minimal() + lims(x=c(0.5,2.5)) +
#   labs(x = "Odds Ratio", y = "", color = "Estimate Type") +
#   theme(legend.position = "bottom")

# Adding an identifier for each row
results$ID <- seq_along(results$Population)

# Create a long-format data frame for plotting
plot_data <- tidyr::pivot_longer(
  results,
  cols = c("OR", "Posterior_OR"),
  names_to = "Estimate_Type",
  values_to = "Value") %>%
  mutate(
    ymin = ifelse(Estimate_Type == "OR", Lower, Posterior_Lower_OR),
    ymax = ifelse(Estimate_Type == "OR", Upper, Posterior_Upper_OR),
    Estimate_Type = factor(Estimate_Type, levels = c("OR", "Posterior_OR")),
    # Offset the y position so original and posterior estimates sit side by side
    ypos = as.numeric(ID) + ifelse(Estimate_Type == "OR", -0.1, 0.1)
  )

# Create the plot
ggplot(plot_data, aes(x = Value, y = ypos, color = Estimate_Type)) +
  geom_point(size = 3) +
  geom_errorbar(aes(xmin = ymin, xmax = ymax), width = 0.2) +
  scale_color_manual(values = c("OR" = "black", "Posterior_OR" = "red")) +
  labs(x = "Odds Ratio", y = "") +
  theme_minimal() +
  theme(axis.text.y = element_text(vjust = 0.5),
        axis.ticks.y = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        legend.position = "bottom") +
  scale_y_continuous(breaks = 1:length(unique(plot_data$ID)), labels = results$Population)
We can do this in Stan (via brms) and calculate heterogeneity as well:
Code
library(brms)

results <- results %>% mutate(se_log_OR = 1/sqrt(Weight))

# Given prior study's log OR and its standard error
prior_log_or <- log(1/0.76) # log of the hazard ratio from the study
prior_sd <- (log(1/0.64) - log(1/0.91)) / (2*1.96) # standard error of the log OR

# or try this:
# rosuvastatin_with_event <- 235
# rosuvastatin_without_event <- 6361 - rosuvastatin_with_event
#
# placebo_with_event <- 304
# placebo_without_event <- 6344 - placebo_with_event
#
# # Compute the OR
# odds_case <- rosuvastatin_with_event / rosuvastatin_without_event
# odds_control <- placebo_with_event / placebo_without_event
# OR <- odds_control / odds_case
#
# # Convert OR to log scale for the prior
# prior_log_or <- log(OR)
#
# # Calculate standard error of the log OR
# prior_log_or_se <- sqrt(1 / placebo_with_event +
#                         1 / placebo_without_event +
#                         1 / rosuvastatin_with_event +
#                         1 / rosuvastatin_without_event)
#
# # Print the prior log OR and its standard error
# print(prior_log_or)
# print(prior_log_or_se)
#
# # Print the OR
# print(OR)

# Define the priors for the global effect size
#prior_effect <- set_prior(paste("normal(", prior_log_or, ", ", prior_sd, ")"), class = "Intercept")
prior_effect <- set_prior(paste("normal(", weighted_mean_log_OR, ", ", sqrt(weighted_variance), ")"), class = "Intercept")
prior_global <- set_prior(paste0("normal(", prior_log_or, ", ", prior_sd, ")"), class = "Intercept")

# Define a non-informative prior for the heterogeneity
prior_heterogeneity <- set_prior("student_t(3, 0, 10)", class = "sd", group = "Population")

# Fit the model (note: the brm() call uses prior_global, not prior_effect)
meta_analysis_model <- brm(
  bf(log_OR | se(se_log_OR) ~ 1 + (1 | Population)),
  data = results,
  prior = c(prior_global, prior_heterogeneity),
  family = gaussian(),
  chains = 4,
  iter = 4000,
  warmup = 2000,
  control = list(adapt_delta = 0.999999) # adjust for convergence
)

# Check the summary of the model
summary(meta_analysis_model)

# Extract population-specific effects
ranef(meta_analysis_model)
global_effect = fixef(meta_analysis_model)[1,1]

# Get the random effects (population-specific deviations)
population_effects <- ranef(meta_analysis_model)

# Calculate the population-specific log ORs by adding the global effect to the random effects
population_specific_log_ORs <- global_effect + population_effects$Population[,,"Intercept"][,"Estimate"]

# Convert the log ORs to ORs
population_specific_ORs <- exp(population_specific_log_ORs)

# Now you have the population-specific ORs
print(population_specific_ORs)
Heterogeneity assessment:
Code
# Extract the standard deviation of the random effects
heterogeneity_estimate <- VarCorr(meta_analysis_model)$Population$sd

# Print the heterogeneity estimate
print(heterogeneity_estimate)

# Conduct posterior predictive checks
pp_check(meta_analysis_model)

# Plot the posterior distribution of the standard deviation of the random effects
posterior_samples <- posterior_samples(meta_analysis_model, pars = "sd_Population__Intercept")
hist(posterior_samples[, "sd_Population__Intercept"],
     breaks = 40,
     main = "Distribution of Heterogeneity (sd Population)",
     xlab = "Heterogeneity (SD)",
     col = "blue")

# Summarize the posterior distribution of the heterogeneity parameter
posterior_summary <- summary(posterior_samples[, "sd_Population__Intercept"])
print(posterior_summary)