Ancestry PRS Load

Author

Sarah Urbut

Published

March 7, 2024

LDL PRS analysis

Code
library(dplyr)
library(broom)
library(tidyr)
library(ggplot2)
library(data.table)
library(ggridges)
load("~/Library/CloudStorage/Dropbox-Personal/pheno_dir/output/merged_pheno_censor_final_withdrugs_smoke.rds")
pheno_prs=dfh
pheno2=readRDS("~/Library/CloudStorage/Dropbox-Personal/df_ukb_pheno_updated.rds")

pheno_prs=merge(pheno_prs,pheno2[,c("Identifier","Cad_hard_Any","Cad_hard_censor_age")],by.x="identifier",by.y="Identifier")
eth=readRDS("~/Library/CloudStorage/Dropbox-Personal/pheno_dir/ethnicity.rds")
eth=eth[!is.na(eth$f.21000.0.0),]
eth=eth[,c("identifier","meaning")]


baseline_levs=readRDS("~/Library/CloudStorage/Dropbox-Personal/dfukb_chol_bp_smoke.rds")
baseline_levs$hdl=baseline_levs$f.30760.0.0*38.4
baseline_levs$ldl=baseline_levs$f.30780.0.0*38.4

PCS=readRDS("~/Library/CloudStorage/Dropbox-Personal/pheno_dir/baseline_withPCS.rds")[,c(1,20:39)]
colnames(PCS)=c("identifier",paste0(rep("PC",20),seq(1:20)))
baseline_levs=merge(baseline_levs,PCS,by="identifier")

Phenotyping

We previously validated that the UKB phenotype implementation of Hard CAD matches the HARD CAD definition, so we will use it here to be consistent with the AoU HARD CAD phenotype.

  • Because we are using age as the time scale (or a GLM with an ‘ever event’ outcome), follow-up time is really just time from birth to death or censoring.

  • We know that time is inhomogeneous: even if we use an OR instead of an HR, the OR will tend to decrease over time because more non-genetically mediated CAD events accumulate (see the proportional-hazards check sketched below).
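As a minimal sketch of what this means in practice (an assumption for illustration only: `prs` is a standardized PRS column already merged into m2, which is only constructed later in this document):

Code
library(survival)

# With age as the time scale, entry is birth (time 0) and exit is the CAD/censoring age.
fit <- coxph(Surv(Cad_0_censor_age, Cad_Sta) ~ prs + as.factor(f.31.0.0), data = m2)

# cox.zph tests whether the (log) hazard ratio for the PRS drifts with age,
# which is the time inhomogeneity described in the bullet above.
cox.zph(fit)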

Code
# m2=merge(pheno_prs[,c("identifier","Birthdate","enrollage","Cad_0_Any","Cad_0_censor_age","f.31.0.0")],eth,by.x="identifier",by.y="identifier")
# m2$Cad_Sta=ifelse(m2$Cad_0_Any==2,1,0)


m2=merge(pheno_prs[,c("identifier","Birthdate","enrollage","Cad_hard_Any","Cad_hard_censor_age","f.31.0.0")],eth,by.x="identifier",by.y="identifier")


m2$Cad_Sta=ifelse(m2$Cad_hard_Any==2,1,0)
m2$Cad_0_censor_age=m2$Cad_hard_censor_age



m2=m2[!is.na(m2$Cad_Sta),]
m2=m2[!is.na(m2$Cad_0_censor_age),]
summary(m2$Cad_0_censor_age[m2$Cad_Sta==1])
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
 0.04654 55.13689 62.57906 62.05269 70.03970 84.76112 
Code
summary(m2$Cad_0_censor_age[m2$Cad_Sta==0])
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  40.38   63.13   70.51   69.60   76.13   88.63 
Code
## old phenotypes
# df2=fread("~/Library/CloudStorage/Dropbox-Personal/UKBB_HardCAD_202109.tsv")
# m2=merge(pheno_prs[,c("identifier","Birthdate","enrollage","f.31.0.0")],eth,by.x="identifier",by.y="identifier")
# m2=merge(m2,df2[,c("sample_id","has_disease","censor_age")],by.x = "identifier",by.y="sample_id")
# m2$Cad_Sta=m2$has_disease
# m2$Cad_0_censor_age=m2$censor_age

## get these labs ready
med_codes=readRDS("~/Library/CloudStorage/Dropbox-Personal/medcodes.rds")
#levels(as.factor(med_codes$meaning))
## what do you do about meds? I think we need to eliminate them, but this will skew the distribution
baseline_meds=merge(med_codes,baseline_levs,by="identifier")
baseline_meds$ldl_adj=ifelse(baseline_meds$meaning%in%"Cholesterol lowering medication",baseline_meds$ldl/0.8,baseline_meds$ldl)
baseline_meds$sbp_adj=ifelse(baseline_meds$meaning%in%"Blood pressure medication",baseline_meds$f.4080.0.0+10,baseline_meds$f.4080.0.0)

## merge all
final_dat=merge(m2,baseline_meds[,c("identifier","sbp_adj","ldl_adj","hdl",paste0(rep("PC",10),seq(1:10)))],by.x="identifier",by.y="identifier")
library(lubridate)

Attaching package: 'lubridate'
The following objects are masked from 'package:data.table':

    hour, isoweek, mday, minute, month, quarter, second, wday, week,
    yday, year
The following objects are masked from 'package:base':

    date, intersect, setdiff, union
Code
final_dat$yob=year(final_dat$Birthdate)

#final_dat=final_dat[year(final_dat$Birthdate)>1950,]

Now we want to calculate ancestry-specific scalings for the PRS. Let’s get, for African, European, and Asian ancestry, the PRS increment that corresponds to a 40 mg/dL increase in LDL cholesterol in the UK Biobank, using the AFR, SAS, EUR, AMR, and EAS individuals present.

Code
# final_dat <- final_dat %>%
#   mutate(recoded_ethnicity = case_when(
#     meaning %in% c("African", "Any other Black background", "Black or Black British","Caribbean","Any other mixed background","Mixed") ~ "AFR",
#     meaning %in% c("White", "Irish", "Any other white background","British") ~ "White",
#     meaning %in% c("Indian", "Pakistani","Bangladeshi") ~ "South Asian",
#     meaning == "Chinese" ~ "Chinese",
#     TRUE ~ "Other" # This will be used for any ethnicity not covered above
#   ))
# 
# final_dat$Cad_Sta=ifelse(final_dat$Cad_0_Any==2,1,0)
# final_dat$sex=final_dat$f.31.0.0
# 
# table(final_dat$recoded_ethnicity)
#ggplot(lt,aes(x=Var1,y=value,fill=Var1))+geom_bar(stat="identity")

gae=fread("~/Library/CloudStorage/Dropbox-Personal/ukb.kgp_projected.tsv.gz")
final_dat=merge(final_dat,gae[,c("eid","rf80")],by.x="identifier",by.y="eid")
table(final_dat$rf80)

   AFR    AMR    EAS    EUR    SAS 
  8426    707   2249 448106   9085 
Code
## with wallace's 

prs_SAS=fread("~/Library/CloudStorage/Dropbox-Personal/final_mixed_ethnicity_scores/SAS_SUM_SCORE_FINAL_mixed")
prs_AFR=fread("~/Library/CloudStorage/Dropbox-Personal/final_mixed_ethnicity_scores/AFR_SUM_SCORE_FINAL_mixed")
prs_EUR=fread("~/Library/CloudStorage/Dropbox-Personal/final_mixed_ethnicity_scores/EUR_SUM_SCORE_FINAL_mixed")
prs_EAS=fread("~/Library/CloudStorage/Dropbox-Personal/final_mixed_ethnicity_scores/EAS_SUM_SCORE_FINAL_mixed")
prs_AMR=fread("~/Library/CloudStorage/Dropbox-Personal/final_mixed_ethnicity_scores/HISP_SUM_SCORE_FINAL_mixed")


AFR=final_dat$identifier[final_dat$rf80%in%"AFR"]
EUR=final_dat$identifier[final_dat$rf80%in%"EUR"]
SAS=final_dat$identifier[final_dat$rf80%in%"SAS"]
AMR=final_dat$identifier[final_dat$rf80%in%"AMR"]
EAS=final_dat$identifier[final_dat$rf80%in%"EAS"]

colnames(prs_SAS)=colnames(prs_AFR)=colnames(prs_EUR)=colnames(prs_EAS)=colnames(prs_AMR)=c("IID","PRS")

SAS_dat=merge(prs_SAS,final_dat[final_dat$rf80%in%"SAS",],by.x="IID",by.y="identifier")
AFR_dat=merge(prs_AFR,final_dat[final_dat$rf80%in%"AFR",],by.x="IID",by.y="identifier")
EUR_dat=merge(prs_EUR,final_dat[final_dat$rf80%in%"EUR",],by.x="IID",by.y="identifier")
AMR_dat=merge(prs_AMR,final_dat[final_dat$rf80%in%"AMR",],by.x="IID",by.y="identifier")
EAS_dat=merge(prs_EAS,final_dat[final_dat$rf80%in%"EAS",],by.x="IID",by.y="identifier")


pop_data_list=list(SAS_dat,AFR_dat,EUR_dat,AMR_dat,EAS_dat)
names(pop_data_list)=c("SAS","AFR","EUR","AMR","EAS")

final_dat=final_dat[!is.na(final_dat$rf80),]
final_dat%>%group_by(rf80)%>%summarise("cad_rate"=mean(Cad_Sta),"mean_ldl"=mean(na.omit(ldl_adj)),"%M"=mean(na.omit(as.numeric(f.31.0.0))),"meanAge"=as.numeric(mean(enrollage)))
# A tibble: 5 × 5
  rf80  cad_rate mean_ldl  `%M` meanAge
  <chr>    <dbl>    <dbl> <dbl>   <dbl>
1 AFR     0.0405     129. 0.417    52.4
2 AMR     0.0368     142. 0.332    52.5
3 EAS     0.0293     135. 0.311    52.9
4 EUR     0.0772     142. 0.458    57.3
5 SAS     0.132      135. 0.539    54.0

Histograms

We note that the distributions of these PRS are skewed for non-EUR populations, but we will normalize them within each ancestry group for our analysis (a short sketch of this follows the plot below):

Code
prs_AFR$Population <- 'AFR';prs_AFR=prs_AFR[prs_AFR$IID%in%AFR]
prs_EUR$Population <- 'EUR';prs_EUR=prs_EUR[prs_EUR$IID%in%EUR]
prs_SAS$Population <- 'SAS';prs_SAS=prs_SAS[prs_SAS$IID%in%SAS]
prs_AMR$Population = 'AMR'; prs_AMR=prs_AMR[prs_AMR$IID%in%AMR]
prs_EAS$Population = 'EAS'; prs_EAS=prs_EAS[prs_EAS$IID%in%EAS]


combined_data <- rbind(prs_AFR, prs_EUR, prs_SAS,prs_AMR,prs_EAS)

ggplot(combined_data, aes(x = PRS, y = Population, fill = Population)) +
    geom_density_ridges(alpha = 0.7) +
    scale_fill_brewer(palette = "Set1") +
    theme_ridges() + # Optional: Adds a nice theme suitable for ridgeline plots
    labs(title = "Density Distribution of Sum Scores",
         x = "Sum Score",
         y = "Population")
Picking joint bandwidth of 0.0469
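As a minimal sketch of that within-ancestry normalization (assuming the combined_data frame built just above; `PRS_z` is a hypothetical new column, and this mirrors the per-population scale() call used in the modeling loop later):

Code
# Standardize the raw sum scores within each ancestry group so that
# each population's PRS has mean 0 and SD 1 before modeling.
combined_scaled <- combined_data %>%
  group_by(Population) %>%
  mutate(PRS_z = as.numeric(scale(PRS))) %>%
  ungroup()

summary(combined_scaled$PRS_z)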

Using residual PRS

Code
totaldata=do.call(rbind,pop_data_list)
pcmod=lm(PRS~PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10,data = totaldata)
totaldata$predictscore=predict(object = pcmod,newdata=totaldata)
totaldata$residscore=totaldata$PRS-totaldata$predictscore
totaldata$Population=as.factor(totaldata$rf80)
totaldata$scaled_resid=scale(totaldata$residscore)

ggplot(totaldata, aes(x = residscore, y = Population, fill = Population)) +
    geom_density_ridges(alpha = 0.7) +
    scale_fill_brewer(palette = "Set1") +
    theme_ridges() + # Optional: Adds a nice theme suitable for ridgeline plots
    labs(title = "Density Distribution of Residualized Scores",
         x = "Residual Score",
         y = "Population")
Picking joint bandwidth of 0.0473

Code
totaldata$rf80[totaldata$rf80=="AMR"]="AMR" # no-op: rf80 already uses the "AMR" label

Impact of PRS

To assess the impact of LDL-C PRS on CAD risk, you need to establish a two-step relationship: first, between LDL-C PRS and LDL-C levels (per 40 mg/dL change), and then between the rescaled LDL-C PRS and CAD outcomes. This approach lets you understand how changes in LDL-C levels attributed to genetic risk (as quantified by PRS) relate to CAD risk. Here’s how you can approach this analysis for each ancestry group (EUR, AFR, SAS, AMR, EAS):

Step 1: Associate LDL-C PRS with LDL-C Levels (per 40 mg/dL)

  1. Linear Regression for LDL-C Levels: Fit a linear regression model with LDL-C levels as the outcome and LDL-C PRS as the predictor. You might want to adjust for relevant covariates.

  2. Rescale the Coefficient for LDL-C PRS: Adjust the coefficient from this model to reflect a 40 mg/dL change in LDL-C (a worked numeric example follows these steps).

Step 2: Associate Rescaled LDL-C PRS with CAD

  1. Incorporate Rescaled LDL-C PRS into Cox Model: Use the rescaled LDL-C PRS from Step 1 as a predictor in a Cox proportional hazards model to analyze its relationship with CAD risk.

  2. Perform Analysis for Each Population: Repeat these steps separately for each ancestry group (AFR, SAS, EUR, AMR, EAS) to capture population-specific effects.
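For intuition, here is the rescaling arithmetic using the AFR estimates printed by the loop below (values taken from that output; treat them as illustrative):

\[ \hat\beta_{\text{PRS}} \approx 12.8 \text{ mg/dL per SD}, \qquad \text{scale factor} = \frac{40}{\hat\beta_{\text{PRS}}} \approx \frac{40}{12.8} \approx 3.12, \qquad \text{scaled PRS} = \frac{\text{PRS}}{\text{scale factor}} \]

A hazard ratio per one-unit change in the scaled PRS can then be read as the risk associated with a genetically predicted 40 mg/dL difference in LDL-C.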

Here’s an example in R that loops over all five populations:

Code
library(dplyr)
library(survival)

# Assuming 'df' is your dataset

# List of unique populations
populations <- c("AFR","SAS","EUR","AMR","EAS")

# Initialize an empty list for storing models and results
cox_models <- list()
glm_models = list()
for(pop in populations) {
  # Subset data for the specific population
  pop_data=totaldata[totaldata$rf80%in%pop,]
  
  # Choose residual score or PRS
  
  pop_data$PRS=scale(pop_data$PRS)        # standardized raw sum score (overridden on the next line)
  pop_data$PRS=scale(pop_data$residscore) # use the standardized PC-residualized score
  #pop_data$PRS=pop_data$residscore
   print("PRS Summary")
   print(summary(pop_data$PRS))
  
  # Step 1: Linear Regression for LDL-C Levels
# Assuming 'pop' is defined elsewhere and 'pop_data' is your dataset
#if (pop == "EUR") {
    ldl_model <- lm(ldl_adj ~ PRS + as.numeric(enrollage) + as.factor(f.31.0.0) + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10, data = pop_data)
#} else {
#  pop_data$PRS=-1*pop_data$PRS
#    ldl_model <- lm(ldl_adj ~  PRS + as.numeric(enrollage) + as.factor(f.31.0.0) + PC1 + PC2 + #PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10, data = pop_data)
#}

print(paste("LDL_coef_CI for population:", pop))
if ("PRS" %in% names(coef(ldl_model))) {
    print(confint(ldl_model, parm = "PRS"))
} else {
    print("PRS coefficient is not available in the model.")
}


  
  # Calculate the PRS scaling factor
  prs_scale_factor <- 40 / coef(ldl_model)["PRS"]
  print(paste0(pop,":",prs_scale_factor))
  #prs_scale_factor=1
  # Scale the LDL-C PRS so that a one-unit change in the scaled score corresponds to a 40 mg/dL increase in LDL-C
  pop_data$scaled_ldl_prs <- pop_data$PRS / prs_scale_factor
  
  print(summary(pop_data$scaled_ldl_prs))
  # Step 2: Cox model for CAD -- inherently accounting for age by using age as the time scale
  cox_model <- coxph(Surv(Cad_0_censor_age, Cad_Sta) ~ scaled_ldl_prs
                     +as.factor(f.31.0.0)+PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10, data = pop_data)
  #cox_model <- glm(Cad_Sta ~ scaled_ldl_prs+sex, data = pop_data,family = "binomial")
  cox_models[[pop]] <- cox_model
  
    

    glm_model <- glm(Cad_Sta ~ scaled_ldl_prs+as.numeric(enrollage) 
                     + as.factor(f.31.0.0)+PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10, data = pop_data,family = "binomial")
  glm_models[[pop]] <- glm_model
}
[1] "PRS Summary"
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
-4.89093 -0.62501  0.07598  0.00000  0.68240  3.51648 
[1] "LDL_coef_CI for population: AFR"
       2.5 %   97.5 %
PRS 12.11459 13.50644
[1] "AFR:3.12243539531292"
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
-1.56638 -0.20017  0.02433  0.00000  0.21855  1.12620 
[1] "PRS Summary"
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
-4.31027 -0.64594  0.03341  0.00000  0.70225  3.22404 
[1] "LDL_coef_CI for population: SAS"
      2.5 %   97.5 %
PRS 8.15468 9.495848
[1] "SAS:4.53244224048928"
     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
-0.950982 -0.142514  0.007371  0.000000  0.154937  0.711326 
[1] "PRS Summary"
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
-4.89947 -0.65269  0.02883  0.00000  0.68557  4.22746 
[1] "LDL_coef_CI for population: EUR"
       2.5 %   97.5 %
PRS 11.58239 11.75977
[1] "EUR:3.42727457116371"
     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
-1.429552 -0.190441  0.008413  0.000000  0.200034  1.233475 
[1] "PRS Summary"
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
-3.83980 -0.60837  0.05459  0.00000  0.66863  2.91596 
[1] "LDL_coef_CI for population: AMR"
       2.5 %   97.5 %
PRS 8.792547 13.88534
[1] "AMR:3.52766605506817"
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
-1.08848 -0.17246  0.01547  0.00000  0.18954  0.82660 
[1] "PRS Summary"
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
-4.04886 -0.57366  0.09183  0.00000  0.69031  2.75622 
[1] "LDL_coef_CI for population: EAS"
       2.5 %   97.5 %
PRS 8.997966 11.40513
[1] "EAS:3.9209732005657"
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
-1.03262 -0.14631  0.02342  0.00000  0.17606  0.70294 
Code
# Access and summarize the Cox models for each population
lapply(cox_models, function(x){exp(coef(x)["scaled_ldl_prs"])})
$AFR
scaled_ldl_prs 
      1.370463 

$SAS
scaled_ldl_prs 
      1.802675 

$EUR
scaled_ldl_prs 
      1.697199 

$AMR
scaled_ldl_prs 
      3.396973 

$EAS
scaled_ldl_prs 
     0.9217767 

Now let’s collect the population-specific hazard ratios and display them in a forest plot:

Code
# Create a data frame to store the results
results <- data.frame(Population = character(),
                      HR = numeric(),
                      Lower = numeric(),
                      Upper = numeric(),
                      stringsAsFactors = FALSE)

 #Extract the results from each model
 for (pop in names(cox_models)) {
   model_summary <- summary(cox_models[[pop]])
   hr <- model_summary$coefficients["scaled_ldl_prs", "exp(coef)"]
  lower <- model_summary$conf.int["scaled_ldl_prs","lower .95"]
  upper <- model_summary$conf.int["scaled_ldl_prs","upper .95"]

  # Add to the results data frame
  results <- rbind(results, data.frame(Population = pop, HR = hr, Lower = lower, Upper = upper))
}

ggplot(results, aes(x = Population, y = HR, ymin = Lower, ymax = Upper)) +
  geom_point() + # Add points for hazard ratios
  geom_errorbar(aes(ymin = Lower, ymax = Upper), width = 0.1) + # Add error bars
  coord_flip() + # Flip coordinates for a horizontal plot
  xlab("Population") +
  ylab("Hazard Ratio (HR)") +
  ggtitle("HR for LDL-C_40 on CAD across Populations:UKB") +
  theme_minimal()

And now with OR/GLM:

Code
lapply(glm_models, function(x){exp(coef(x)["scaled_ldl_prs"])})
$AFR
scaled_ldl_prs 
      1.402505 

$SAS
scaled_ldl_prs 
      1.928061 

$EUR
scaled_ldl_prs 
      1.748224 

$AMR
scaled_ldl_prs 
       2.43257 

$EAS
scaled_ldl_prs 
     0.8964945 
Code
# 
results <- data.frame(Population = character(),
                      OR = numeric(),
                      Lower = numeric(),
                      Upper = numeric(),
                      stringsAsFactors = FALSE)

for (pop in names(cox_models)) {
    model_summary <- tidy(glm_models[[pop]], conf.int = TRUE, exponentiate = TRUE)

    if ("scaled_ldl_prs" %in% model_summary$term) {
        term_row <- model_summary %>% filter(term == "scaled_ldl_prs")

        results <- rbind(results, data.frame(Population = pop,
                                             OR = term_row$estimate,
                                             Lower = term_row$conf.low,
                                             Upper = term_row$conf.high))
    } else {
        results <- rbind(results, data.frame(Population = pop, OR = NA, Lower = NA, Upper = NA))
    }
}
# # Create the forest plot using ggplot2
# 
ggplot(results, aes(x = Population, y = OR, ymin = Lower, ymax = Upper)) +
    geom_point() +
    geom_errorbar(aes(ymin = Lower, ymax = Upper), width = 0.1) +
    coord_flip()+
    xlab("Population") +
    ylab("Odds Ratio (OR)") +
  ggtitle("OR for LDL-C PRS_40 on CAD:UKB")  +theme_minimal()

Bayesian Meta-analysis

Now we want to take a two-step Bayesian approach: first calculate a global mean hazard ratio (HR) or odds ratio (OR) from the population-specific estimates, and then use this global mean as a prior in a Bayesian adjustment of each population-specific estimate. In such a case, weighting each population-specific estimate by the inverse of its variance (standard error squared) when forming the global mean is a good approach, as it accounts for the different levels of uncertainty in each population estimate.

  1. Calculate Weighted Global Mean HR/OR:

    • For each population, you have an HR/OR and its standard error (SE).

    • You can weight the HR/OR by the inverse of the variance (which is SE²).

    • The weighted mean HR/OR is then the sum of each weighted HR/OR divided by the sum of the weights.

  2. Use Weighted Mean as Prior for Bayesian Adjustment:

    • Use the weighted global mean HR/OR as the mean of your prior distribution in the Bayesian model for each population (the precision-weighted update is written out after this list).

    • The variance of the prior could be based on the pooled standard errors or a measure of the variability of the HR/OR across populations.
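Concretely, on the log-OR scale with a normal prior and (approximately) normal likelihood, the per-population update implemented in the code below is the standard precision-weighted combination:

\[\text{Posterior Variance}_i = \left( \frac{1}{\text{Prior Variance}} + \frac{1}{\text{Variance}_i} \right)^{-1}, \qquad \text{Posterior Mean}_i = \text{Posterior Variance}_i \left( \frac{\text{Prior Mean}}{\text{Prior Variance}} + \frac{\log(\text{OR}_i)}{\text{Variance}_i} \right)\]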

Variance and Weights

The variance of each log-transformed odds ratio (OR) can be approximated from the confidence interval:

\[\text{Variance} \approx \left( \frac{\log(\text{Upper CI}) - \log(\text{Lower CI})}{2 \times 1.96} \right)^2 \]

Based on this variance, the weight for each observation is:

\[ w_i = \frac{1}{\text{Variance}_i} \]

Weighted Mean

The weighted mean is then calculated as:

\[\text{Weighted Mean} = \frac{\sum_{i=1}^{n} w_i x_i}{\sum_{i=1}^{n} w_i}\]

where \(x_i\) is the log(OR) for each observation \(i\).

Weighted Variance

The weighted variance is given by:

\[\text{Weighted Variance} = \frac{\sum_{i=1}^{n} w_i}{\left( \sum_{i=1}^{n} w_i \right)^2 - \sum_{i=1}^{n} w_i^2} \sum_{i=1}^{n} w_i (x_i - \text{Weighted Mean})^2\]

Here, \(\sum_{i=1}^{n} w_i\) is the sum of the weights, \(\sum_{i=1}^{n} w_i^2\) is the sum of the squared weights, and \(x_i\) is each individual log-transformed OR.
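As a quick numeric check against the AFR row of the results printed further below (log CI bounds of roughly -0.021 and 0.703):

\[\text{Variance}_{\text{AFR}} \approx \left( \frac{0.703 - (-0.021)}{2 \times 1.96} \right)^2 \approx 0.0341, \qquad w_{\text{AFR}} = \frac{1}{0.0341} \approx 29.3,\]

which matches the Variance and Weight columns in the printed results.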

Code
# Assuming you have a dataframe 'results' with columns 'Population', 'OR', and 'SE'

# Calculate variance and weights
results <- results %>%
  mutate(
    log_OR = log(OR),
    log_Lower = log(Lower),
    log_Upper = log(Upper),
    Variance = ((log_Upper - log_Lower) / (2 * 1.96))^2,
    Weight = 1 / Variance
  )


# Calculate weighted mean and variance for the prior
sum_weights <- sum(results$Weight)
sum_squared_weights <- sum(results$Weight^2)

weighted_mean_log_OR <- sum(results$log_OR * results$Weight) / sum_weights
weighted_variance <- (sum_weights / (sum_weights^2 - sum_squared_weights)) * 
                     sum(results$Weight * (results$log_OR - weighted_mean_log_OR)^2)

prior_mean <- weighted_mean_log_OR
prior_variance <- weighted_variance

# Perform Bayesian updating for each population
results <- results %>%
  mutate(
    Posterior_Variance = 1 / (1 / prior_variance + 1 / Variance),
    Posterior_Mean = Posterior_Variance * (prior_mean / prior_variance + log_OR / Variance)
  )

# Exponentiating the posterior mean to get back to the OR scale
results$Posterior_OR <- exp(results$Posterior_Mean)

results <- results %>%
  mutate(
    Posterior_Lower_Log = Posterior_Mean - 1.96 * sqrt(Posterior_Variance),
    Posterior_Upper_Log = Posterior_Mean + 1.96 * sqrt(Posterior_Variance),
    Posterior_Lower_OR = exp(Posterior_Lower_Log),
    Posterior_Upper_OR = exp(Posterior_Upper_Log)
  )
# Now 'results' contains the posterior mean and variance for each population
print(results)
  Population        OR     Lower     Upper     log_OR   log_Lower log_Upper
1        AFR 1.4025052 0.9793945  2.020129  0.3382601 -0.02082079 0.7031616
2        SAS 1.9280613 1.4216083  2.619395  0.6565150  0.35178885 0.9629436
3        EUR 1.7482240 1.6809276  1.818275  0.5586004  0.51934580 0.5978884
4        AMR 2.4325701 0.5411120 11.516530  0.8889483 -0.61412902 2.4437834
5        EAS 0.8964945 0.3414912  2.440573 -0.1092631 -1.07443323 0.8922330
     Variance      Weight Posterior_Variance Posterior_Mean Posterior_OR
1 0.034110165   29.316774       0.0145130506      0.4638344     1.590160
2 0.024306935   41.140523       0.0123873908      0.6076324     1.836079
3 0.000401456 2490.932910       0.0003951758      0.5585727     1.748176
4 0.608524320    1.643320       0.0242541561      0.5700684     1.768388
5 0.251703457    3.972929       0.0229570221      0.4960789     1.642269
  Posterior_Lower_Log Posterior_Upper_Log Posterior_Lower_OR Posterior_Upper_OR
1           0.2277129           0.6999558           1.255725           2.013664
2           0.3894870           0.8257777           1.476223           2.283656
3           0.5196098           0.5975356           1.681371           1.817634
4           0.2648229           0.8753138           1.303200           2.399628
5           0.1991080           0.7930497           1.220314           2.210126

Forest Plot

We can then plot the Bayesian meta-analysis, comparing the original and posterior estimates in a forest plot:

Code
library(dplyr)
library(ggplot2)
library(forestplot)
Loading required package: grid
Loading required package: checkmate
Loading required package: abind
Code
library(forestplot)
library(ggplot2)
library(dplyr)

# Assuming 'results' is already loaded with your data

# We need to create a new data frame for ggplot
# First, we create two separate data frames for original and posterior results and then bind them together
# plot_data <- results %>%
#   select(Population, OR, Lower, Upper, Posterior_OR, Posterior_Lower_OR, Posterior_Upper_OR) %>%
#   gather(key = "Type", value = "Value", -Population, -Lower, -Upper, -Posterior_Lower_OR, -Posterior_Upper_OR) %>%
#   mutate(
#     ymin = ifelse(Type == "OR", Lower, Posterior_Lower_OR),
#     ymax = ifelse(Type == "OR", Upper, Posterior_Upper_OR),
#     Type = factor(Type, levels = c("OR", "Posterior_OR"))
#   )
# 
# # Now we plot with ggplot2
# ggplot(plot_data, aes(y = Population, x = Value, xmin = ymin, xmax = ymax, color = Type)) +
#   geom_errorbarh(height = 0.2) +
#   geom_point(size = 2.5) +
#   scale_color_manual(values = c("OR" = "black", "Posterior_OR" = "red")) +
#   theme_minimal() + lims(x=c(0.5,2.5))+
#   labs(x = "Odds Ratio", y = "", color = "Estimate Type") +
#   theme(legend.position = "bottom")



# Adding an identifier for each row
results$ID <- seq_along(results$Population)

# Create a long-format data frame for plotting
plot_data <- tidyr::pivot_longer(
  results, 
  cols = c("OR", "Posterior_OR"), 
  names_to = "Estimate_Type", 
  values_to = "Value"
) %>%
  mutate(
    ymin = ifelse(Estimate_Type == "OR", Lower, Posterior_Lower_OR),
    ymax = ifelse(Estimate_Type == "OR", Upper, Posterior_Upper_OR),
    Estimate_Type = factor(Estimate_Type, levels = c("OR", "Posterior_OR")),
    # Adjust the y position of the posterior estimates
    ypos = as.numeric(ID) + ifelse(Estimate_Type == "OR", -0.1, 0.1)
  )

# Create the plot
ggplot(plot_data, aes(x = Value, y = ypos, color = Estimate_Type)) +
  geom_point(size = 3) +
  geom_errorbar(aes(xmin = ymin, xmax = ymax), width = 0.2) +
  scale_color_manual(values = c("OR" = "black", "Posterior_OR" = "red")) +
  labs(x = "Odds Ratio", y = "") +
  theme_minimal() +
  theme(axis.text.y = element_text(vjust = 0.5), 
        axis.ticks.y = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        legend.position = "bottom") +
  scale_y_continuous(breaks = 1:length(unique(plot_data$ID)), labels = results$Population) 

We can do this in Stan (via brms) and calculate heterogeneity as well:

Code
library(brms)

results <- results %>%
mutate(se_log_OR = 1 / sqrt(Weight))

# Given prior study's log OR and its standard error
prior_log_or <- log(1/0.76)  # log of the hazard ratio from the study
prior_sd <- (log(1/0.64) - log(1/0.91)) / (2 * 1.96)  # standard error of the log OR


# or try this:
#   
# rosuvastatin_with_event <- 235
# rosuvastatin_without_event <- 6361 - rosuvastatin_with_event
# 
# placebo_with_event <- 304
# placebo_without_event <- 6344 - placebo_with_event
# 
# # Compute the OR
# odds_case <- rosuvastatin_with_event / rosuvastatin_without_event
# odds_control <- placebo_with_event / placebo_without_event
# OR <- odds_control / odds_case
# 
# # Convert OR to log scale for the prior
# prior_log_or <- log(OR)
# 
# # Calculate standard error of the log OR
# prior_log_or_se <- sqrt(1 / placebo_with_event +
#                         1 / placebo_without_event +
#                         1 / rosuvastatin_with_event +
#                         1 / rosuvastatin_without_event)
# 
# # Print the prior log OR and its standard error
# print(prior_log_or)
# print(prior_log_or_se)
# 
# # Print the OR
# print(OR)
# # Define the priors based on the prior study for the global effect size
#prior_effect <- set_prior(paste("normal(", prior_log_or, ", ", prior_log_or_se, ")"), class = "Intercept")

# Prior on the global effect centered on the weighted mean computed above
prior_effect <- set_prior(paste("normal(", weighted_mean_log_OR, ", ", sqrt(weighted_variance), ")"), class = "Intercept")

# Define a non-informative prior for the heterogeneity
prior_heterogeneity <- set_prior("student_t(3, 0, 10)", class = "sd", group = "Population")

# Alternative: center the global prior on the external (statin-trial) estimate defined above
prior_global <- set_prior(paste0("normal(", prior_log_or, ", ", prior_sd, ")"), class = "Intercept")

# Fit the model
meta_analysis_model <- brm(
  bf(log_OR | se(se_log_OR) ~ 1 + (1 | Population)),
  data = results,
  prior = c(prior_global, prior_heterogeneity),
  family = gaussian(),
  chains = 4,
  iter = 4000,
  warmup = 2000,
  control = list(adapt_delta = 0.999999) # adjust for convergence
)

# Check the summary of the model
summary(meta_analysis_model)

# Extract population-specific effects
ranef(meta_analysis_model)

global_effect= fixef(meta_analysis_model)[1,1]

# Get the random effects (population-specific deviations)
population_effects <- ranef(meta_analysis_model)

# Calculate the population-specific log ORs by adding the global effect to the random effects
population_specific_log_ORs <- global_effect + population_effects$Population[,,"Intercept"][,"Estimate"]

# Convert the log ORs to ORs
population_specific_ORs <- exp(population_specific_log_ORs)

# Now you have the population-specific ORs
print(population_specific_ORs)

Heterogeneity assessment:

Code
# Extract the standard deviation of the random effects
heterogeneity_estimate <- VarCorr(meta_analysis_model)$Population$sd

# Print the heterogeneity estimate
print(heterogeneity_estimate)

# Conduct posterior predictive checks
pp_check(meta_analysis_model)

# If you want to plot the distribution of the standard deviation of the random effects
posterior_samples <- posterior_samples(meta_analysis_model, pars = "sd_Population__Intercept")
hist(posterior_samples[, "sd_Population__Intercept"], breaks = 40, main = "Distribution of Heterogeneity (sd Population)", xlab = "Heterogeneity (SD)", col = "blue")

# Summarize the posterior distribution of the heterogeneity parameter
posterior_summary <- summary(posterior_samples[, "sd_Population__Intercept"])
print(posterior_summary)