Ancestry PRS Load

Author

Sarah Urbut

Published

January 7, 2024

LDL PRS analyais

Code

library(dplyr)
library(broom)
library(tidyr)
library(ggplot2)
library(data.table)
library(ggridges)
load("~/Library/CloudStorage/Dropbox-Personal/pheno_dir/output/merged_pheno_censor_final_withdrugs_smoke.rds")
pheno_prs=dfh



eth=readRDS("~/Library/CloudStorage/Dropbox-Personal/pheno_dir/ethnicity.rds")
eth=eth[!is.na(eth$f.21000.0.0),]
eth=eth[,c("identifier","meaning")]


baseline_levs=readRDS("~/Library/CloudStorage/Dropbox-Personal/dfukb_chol_bp_smoke.rds")
baseline_levs$hdl=baseline_levs$f.30760.0.0*38.4
baseline_levs$ldl=baseline_levs$f.30780.0.0*38.4

PCS=readRDS("~/Library/CloudStorage/Dropbox-Personal/pheno_dir/baseline_withPCS.rds")[,c(1,20:39)]
colnames(PCS)=c("identifier",paste0(rep("PC",20),seq(1:20)))
baseline_levs=merge(baseline_levs,PCS,by="identifier")
m2=merge(pheno_prs[,c("identifier","Birthdate","enrollage","Cad_0_Any","Cad_0_censor_age","f.31.0.0")],eth,by.x="identifier",by.y="identifier")
m2$Cad_Sta=ifelse(m2$Cad_0_Any==2,1,0)

#df2=fread("~/Library/CloudStorage/Dropbox-Personal/UKBB_HardCAD_202109.tsv")
# m2=merge(pheno_prs[,c("identifier","Birthdate","enrollage","f.31.0.0")],eth,by.x="identifier",by.y="identifier")
 #m2=merge(m2,df2[,c("sample_id","has_disease","censor_age")],by.x = "identifier",by.y="sample_id")
       
 #m2$Cad_Sta=m2$has_disease
 #m2$Cad_0_censor_age=m2$censor_age

## get these labs ready
med_codes=readRDS("~/Library/CloudStorage/Dropbox-Personal/medcodes.rds")
#levels(as.factor(med_codes$meaning))
## what do you do about meds? I think we need to eliminate, but will skew the distribtuion
baseline_meds=merge(med_codes,baseline_levs,by="identifier")
baseline_meds$ldl_adj=ifelse(baseline_meds$meaning%in%"Cholesterol lowering medication",baseline_meds$ldl/0.8,baseline_meds$ldl)
baseline_meds$sbp_adj=ifelse(baseline_meds$meaning%in%"Blood pressure medication",baseline_meds$f.4080.0.0+10,baseline_meds$f.4080.0.0)

## merge all
final_dat=merge(m2,baseline_meds[,c("identifier","sbp_adj","ldl_adj","hdl",paste0(rep("PC",10),seq(1:10)))],by.x="identifier",by.y="identifier")

Now we want to calculate ancestry specific thresholds for PRS. Let’s get the PRS for African, European, and Asian ancestry that causes a 40 mmHg increase in LDL cholesterol in the UK Biobank using the AFR, SAS, and EUR individauls present.

Code

# final_dat <- final_dat %>%
#   mutate(recoded_ethnicity = case_when(
#     meaning %in% c("African", "Any other Black background", "Black or Black British","Caribbean","Any other mixed background","Mixed") ~ "AFR",
#     meaning %in% c("White", "Irish", "Any other white background","British") ~ "White",
#     meaning %in% c("Indian", "Pakistani","Bangladeshi") ~ "South Asian",
#     meaning == "Chinese" ~ "Chinese",
#     TRUE ~ "Other" # This will be used for any ethnicity not covered above
#   ))
# 
# final_dat$Cad_Sta=ifelse(final_dat$Cad_0_Any==2,1,0)
# final_dat$sex=final_dat$f.31.0.0
# 
# table(final_dat$recoded_ethnicity)
#ggplot(lt,aes(x=Var1,y=value,fill=Var1))+geom_bar(stat="identity")

gae=fread("~/Library/CloudStorage/Dropbox-Personal/ukb.kgp_projected.tsv.gz")
final_dat=merge(final_dat,gae[,c("eid","rf80")],by.x="identifier",by.y="eid")
table(final_dat$rf80)


   AFR    AMR    EAS    EUR    SAS 
  8426    707   2249 448138   9085

Code

# prs_SAS=fread("~/Library/CloudStorage/Dropbox-Personal/ukbb_ldlcprs.SAS.tsv")
# prs_AFR=fread("~/Library/CloudStorage/Dropbox-Personal/ukbb_ldlcprs.AFR.tsv")
# prs_EUR=fread("~/Library/CloudStorage/Dropbox-Personal/ukbb_ldlcprs.EUR.tsv")

# SAS_dat=merge(prs_SAS[,c(1:5,20)],final_dat,by.x="eid",by.y="identifier")
# AFR_dat=merge(prs_AFR[,c(1:5,20)],final_dat,by.x="eid",by.y="identifier")
# EUR_dat=merge(prs_EUR[,c(1:5,20)],final_dat,by.x="eid",by.y="identifier")

## with wallace's 

prs_SAS=fread("~/Library/CloudStorage/Dropbox-Personal/PRSethnicity/SAS_SUM_SCORE_FINAL")
prs_AFR=fread("~/Library/CloudStorage/Dropbox-Personal/PRSethnicity/AFR_SUM_SCORE_FINAL")
prs_EUR=fread("~/Library/CloudStorage/Dropbox-Personal/PRSethnicity/EUR_SUM_SCORE_FINAL")

AFR=final_dat$identifier[final_dat$rf80%in%"AFR"]
EUR=final_dat$identifier[final_dat$rf80%in%"EUR"]
SAS=final_dat$identifier[final_dat$rf80%in%"SAS"]

colnames(prs_SAS)=colnames(prs_AFR)=colnames(prs_EUR)=c("IID","PRS")

SAS_dat=merge(prs_SAS,final_dat[final_dat$rf80%in%"SAS",],by.x="IID",by.y="identifier")
AFR_dat=merge(prs_AFR,final_dat[final_dat$rf80%in%"AFR",],by.x="IID",by.y="identifier")
EUR_dat=merge(prs_EUR,final_dat[final_dat$rf80%in%"EUR",],by.x="IID",by.y="identifier")


pop_data_list=list(SAS_dat,AFR_dat,EUR_dat)
names(pop_data_list)=c("SAS","AFR","EUR")

Histograms

We note that the distributions of these PRS are skewed for non EUR pops but will normalize for our analysis:

Code

prs_AFR$Population <- 'AFR';prs_AFR=prs_AFR[prs_AFR$IID%in%AFR]
prs_EUR$Population <- 'EUR';prs_EUR=prs_EUR[prs_EUR$IID%in%EUR]
prs_SAS$Population <- 'SAS';prs_SAS=prs_SAS[prs_SAS$IID%in%SAS]

combined_data <- rbind(prs_AFR, prs_EUR, prs_SAS)

ggplot(combined_data, aes(x = PRS, y = Population, fill = Population)) +
    geom_density_ridges(alpha = 0.7) +
    scale_fill_brewer(palette = "Set1") +
    theme_ridges() + # Optional: Adds a nice theme suitable for ridgeline plots
    labs(title = "Density Distribution of Sum Scores",
         x = "Sum Score",
         y = "Population")

Picking joint bandwidth of 0.0353

Using residual PRS

Code

totaldata=do.call(rbind,pop_data_list)
pcmod=lm(PRS~PC1+PC2+PC3+PC4+PC5,data = totaldata)
totaldata$predictscore=predict(object = pcmod,newdata=totaldata)
totaldata$residscore=totaldata$PRS-totaldata$predictscore
totaldata$Population=as.factor(totaldata$rf80)
totaldata$scaled_resid=scale(totaldata$residscore)

ggplot(totaldata, aes(x = residscore, y = Population, fill = Population)) +
    geom_density_ridges(alpha = 0.7) +
    scale_fill_brewer(palette = "Set1") +
    theme_ridges() + # Optional: Adds a nice theme suitable for ridgeline plots
    labs(title = "Density Distribution of Sum Scores",
         x = "Sum Score",
         y = "Population")

Picking joint bandwidth of 0.0353

Impact of PRS

To assess the impact of LDL-C PRS on CAD risk, you need to establish a two-step relationship: first, between LDL-C PRS and LDL-C levels (per 40 mg/dL change), and then between LDL-C PRS and CAD outcomes. This approach allows you to understand how changes in LDL-C levels attributed to genetic risk (as quantified by PRS) relate to CAD risk. Here’s how you can approach this analysis for both European (EUR) and African (AFR) populations:

Step 1: Associate LDL-C PRS with LDL-C Levels (per 40 mg/dL)

Linear Regression for LDL-C Levels: Fit a linear regression model with LDL-C levels as the outcome and LDL-C PRS as the predictor. You might want to adjust for relevant covariates.
Rescale the Coefficient for LDL-C PRS: Adjust the coefficient from this model to reflect a 40 mg/dL change in LDL-C.

Step 2: Associate Rescaled LDL-C PRS with CAD

Incorporate Rescaled LDL-C PRS into Cox Model: Use the rescaled LDL-C PRS from Step 1 as a predictor in a Cox proportional hazards model to analyze its relationship with CAD risk.
Perform Analysis for Both EUR and AFR Populations: Repeat these steps separately for EUR and AFR populations to capture population-specific effects.

Here’s an example in R for the European population. The same steps would be repeated for the African population:

Code

library(dplyr)
library(survival)

# Assuming 'df' is your dataset

# List of unique populations
populations <- c("AFR","SAS","EUR")

# Initialize an empty list for storing models and results
cox_models <- list()
glm_models = list()
for(pop in populations) {
  # Subset data for the specific population
  pop_data=totaldata[totaldata$rf80%in%pop,]
  
  # Choose residual score or PRS
  
  #pop_data$PRS=scale(pop_data$PRS)
   pop_data$PRS=scale(pop_data$residscore)
   print("PRS Summary")
   summary(pop_data$PRS)
  
  # Step 1: Linear Regression for LDL-C Levels
  if(pop=="AFR"){
     ldl_model <- lm(ldl_adj ~ PRS + as.numeric(enrollage) + as.factor(f.31.0.0)+PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10, data = pop_data)
     print(paste("LDL_coef_CI:",pop,":",confint(ldl_model,parm = "PRS")))
  }
  else{
    ldl_model <- lm(ldl_adj ~ PRS + as.numeric(enrollage) + as.factor(f.31.0.0)+PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10#+as.factor(array)
                    , data = pop_data)
  print(paste("LDL_coef_CI:",pop,":",confint(ldl_model,parm = "PRS")))
  }
  
  # Calculate the PRS scaling factor
  prs_scale_factor <- 40 / coef(ldl_model)["PRS"]
  
  # Scale the LDL-C PRS so that now the scaled translates to an increase of 40 mgDL
  pop_data$scaled_ldl_prs <- pop_data$PRS / prs_scale_factor
  
  # Step 2: Cox Model for CAD
  cox_model <- coxph(Surv(Cad_0_censor_age, Cad_Sta) ~ scaled_ldl_prs+ as.numeric(enrollage)+as.factor(f.31.0.0)+PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10, data = pop_data)
  #cox_model <- glm(Cad_Sta ~ scaled_ldl_prs+sex, data = pop_data,family = "binomial")
  cox_models[[pop]] <- cox_model
  
    

    glm_model <- glm(Cad_Sta ~ scaled_ldl_prs+as.numeric(enrollage) + as.factor(f.31.0.0)+PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10, data = pop_data,family = "binomial")
  glm_models[[pop]] <- glm_model
}

[1] "PRS Summary"
[1] "LDL_coef_CI: AFR : 11.4140581248914" "LDL_coef_CI: AFR : 12.8035879501732"
[1] "PRS Summary"
[1] "LDL_coef_CI: SAS : 6.56497515444643" "LDL_coef_CI: SAS : 7.88646425331043"
[1] "PRS Summary"
[1] "LDL_coef_CI: EUR : 11.5681725787802" "LDL_coef_CI: EUR : 11.7455973839658"

Code

# Access and summarize the Cox models for each population
lapply(cox_models, function(x){exp(coef(x)["scaled_ldl_prs"])})

$AFR
scaled_ldl_prs 
      1.156804 

$SAS
scaled_ldl_prs 
      1.644979 

$EUR
scaled_ldl_prs 
      1.506107

Now let’s look at a meta analysis:

Code

# Create a data frame to store the results
results <- data.frame(Population = character(),
                      HR = numeric(),
                      Lower = numeric(),
                      Upper = numeric(),
                      stringsAsFactors = FALSE)

 #Extract the results from each model
 for (pop in names(cox_models)) {
   model_summary <- summary(cox_models[[pop]])
   hr <- model_summary$coefficients["scaled_ldl_prs", "exp(coef)"]
  lower <- model_summary$conf.int["scaled_ldl_prs","lower .95"]
  upper <- model_summary$conf.int["scaled_ldl_prs","upper .95"]

  # Add to the results data frame
  results <- rbind(results, data.frame(Population = pop, HR = hr, Lower = lower, Upper = upper))
}

ggplot(results, aes(x = Population, y = HR, ymin = Lower, ymax = Upper)) +
  geom_point() + # Add points for hazard ratios
  geom_errorbar(aes(ymin = Lower, ymax = Upper), width = 0.1) + # Add error bars
  coord_flip() + # Flip coordinates for a horizontal plot
  xlab("Population") +ylim(c(0.5,2.5))+
  ylab("Hazard Ratio (HR)") +
  ggtitle("Forest Plot of Hazard Ratios for LDL-C PRS (per 40 mg/dL) on CAD  across Populations") +
  theme_minimal()

And now with OR/GLM:

Code

# # Extract the results from each model
# # library(broom)
# # library(dplyr)
# 
# for(pop in populations) {
# 
#   pop_data=totaldata[totaldata$rf80%in%pop,]
#   
#   # Choose residual score or PRS
#   
#   #pop_data$PRS=scale(pop_data$PRS)
#    pop_data$PRS=scale(pop_data$residscore)
#    print("PRS Summary")
#    print(summary(pop_data$PRS))
#   
#    if(pop=="AFR"){
#      ldl_model <- lm(ldl_adj ~ PRS + as.numeric(enrollage) + as.factor(f.31.0.0)+PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10, data = pop_data)
#      print(length(coef(ldl_model)))
#   }
#   else{ldl_model <- lm(ldl_adj ~ PRS + as.numeric(enrollage) + as.factor(f.31.0.0)+PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10#+as.factor(array),
#                        ,data = pop_data)
#   print(length(coef(ldl_model)))}
#   
#   # Calculate the PRS scaling factor
#   prs_scale_factor <- 40 / coef(ldl_model)["PRS"]
#   print(paste0(pop,"population scaling:",prs_scale_factor))
#   
#   # Scale the LDL-C PRS so that now the scaled translates to an increase of 40 mgDL
#   pop_data$scaled_ldl_prs <- pop_data$PRS / prs_scale_factor
#   
# 
#     cox_model <- glm(Cad_Sta ~ scaled_ldl_prs+as.numeric(enrollage) + as.factor(f.31.0.0)+PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10, data = pop_data,family = "binomial")
#   cox_models[[pop]] <- cox_model
# }

# Access and summarize the Cox models for each population
lapply(glm_models, function(x){exp(coef(x)["scaled_ldl_prs"])})

$AFR
scaled_ldl_prs 
       1.16974 

$SAS
scaled_ldl_prs 
      1.725201 

$EUR
scaled_ldl_prs 
      1.549232

Code

# 
results <- data.frame(Population = character(),
                      OR = numeric(),
                      Lower = numeric(),
                      Upper = numeric(),
                      stringsAsFactors = FALSE)

for (pop in names(cox_models)) {
    model_summary <- tidy(cox_models[[pop]], conf.int = TRUE, exponentiate = TRUE)

    if ("scaled_ldl_prs" %in% model_summary$term) {
        term_row <- model_summary %>% filter(term == "scaled_ldl_prs")

        results <- rbind(results, data.frame(Population = pop,
                                             OR = term_row$estimate,
                                             Lower = term_row$conf.low,
                                             Upper = term_row$conf.high))
    } else {
        results <- rbind(results, data.frame(Population = pop, OR = NA, Lower = NA, Upper = NA))
    }
}
# # Create the forest plot using ggplot2
# 
ggplot(results, aes(x = Population, y = OR, ymin = Lower, ymax = Upper)) +
    geom_point() +
    geom_errorbar(aes(ymin = Lower, ymax = Upper), width = 0.1) +
    coord_flip()+ylim(c(0.5,2.5))+
    xlab("Population") +
    ylab("Odds Ratio (OR)") +
    ggtitle("Forest Plot of Odds Ratios for LDL-C PRS (per 40 mg/dL) on CAD across Populations") +theme_minimal()

Bayesian Meta-analysis

Now we want to take a two-step Bayesian approach where you first calculate a global mean hazard ratio (HR) or odds ratio (OR) from population-specific estimates and then use this global mean as a prior in a Bayesian adjustment of the population-specific estimates. In such a case, weighting the global mean by the standard error (or variance) of each population-specific estimate is indeed a good approach, as it accounts for the different levels of uncertainty in each population estimate.

Calculate Weighted Global Mean HR/OR:
- For each population, you have an HR/OR and its standard error (SE).
- You can weight the HR/OR by the inverse of the variance (which is SE²).
- The weighted mean HR/OR is then the sum of each weighted HR/OR divided by the sum of the weights.
Use Weighted Mean as Prior for Bayesian Adjustment:
- Use the weighted global mean HR/OR as the mean for your prior distribution in the Bayesian model for each population.
- The variance of the prior could be based on the pooled standard errors or a measure of the variability of the HR/OR across populations.

Variance and Weights

$$$$The variance of each log-transformed odds ratio (OR) can be approximated from the confidence interval: \[\text{Variance} \approx \left( \frac{\log(\text{Upper CI}) - \log(\text{Lower CI})}{2 \times 1.96} \right)^2 \]

Based on this variance, the weight for each observation is:

\[ w_i = \frac{1}{\text{Variance}_i} \]

Weighted Mean The weighted mean is then calculated as:

\[\ \text{Weighted Mean} = \frac{\sum_{i=1}^{n} w_i x_i}{\sum_{i=1}^{n} w_i} \ \]

where \[( x_i ) \] is the log(OR) for each observation \[i\].

Weighted Variance The weighted variance is given by:

\[ \text{Weighted Variance} = \frac{\sum w_i}{\left( \sum w_i \right)^2 - \sum w_i^2} \sum w_i (x_i - \text{Weighted Mean})^2 \]

The weighted variance is given by:

\[\text{Weighted Variance} = \frac{\sum_{i=1}^{n} w_i}{\left( \sum_{i=1}^{n} w_i \right)^2 - \sum_{i=1}^{n} w_i^2} \sum_{i=1}^{n} w_i (x_i - \text{Weighted Mean})^2\]Here,

\[ \sum_{i=1}^{n} w_i \]

is the sum of the weights,

\[ \sum_{i=1}^{n} w_i^2 \] is the sum of the squared weights, and \[ x_i \] is each individual log-transformed OR.

Code

# Assuming you have a dataframe 'results' with columns 'Population', 'OR', and 'SE'

# Calculate variance and weights
results <- results %>%
  mutate(
    log_OR = log(OR),
    log_Lower = log(Lower),
    log_Upper = log(Upper),
    Variance = ((log_Upper - log_Lower) / (2 * 1.96))^2,
    Weight = 1 / Variance
  )


# Calculate weighted mean and variance for the prior
sum_weights <- sum(results$Weight)
sum_squared_weights <- sum(results$Weight^2)

weighted_mean_log_OR <- sum(results$log_OR * results$Weight) / sum_weights
weighted_variance <- (sum_weights / (sum_weights^2 - sum_squared_weights)) * 
                     sum(results$Weight * (results$log_OR - weighted_mean_log_OR)^2)

prior_mean <- weighted_mean_log_OR
prior_variance <- weighted_variance

# Perform Bayesian updating for each population
results <- results %>%
  mutate(
    Posterior_Variance = 1 / (1 / prior_variance + 1 / Variance),
    Posterior_Mean = Posterior_Variance * (prior_mean / prior_variance + log_OR / Variance)
  )

# Exponentiating the posterior mean to get back to the OR scale
results$Posterior_OR <- exp(results$Posterior_Mean)

results <- results %>%
  mutate(
    Posterior_Lower_Log = Posterior_Mean - 1.96 * sqrt(Posterior_Variance),
    Posterior_Upper_Log = Posterior_Mean + 1.96 * sqrt(Posterior_Variance),
    Posterior_Lower_OR = exp(Posterior_Lower_Log),
    Posterior_Upper_OR = exp(Posterior_Upper_Log)
  )
# Now 'results' contains the posterior mean and variance for each population
print(results)

  Population       OR     Lower    Upper    log_OR  log_Lower log_Upper
1        AFR 1.156804 0.8871046 1.508499 0.1456612 -0.1197924 0.4111149
2        SAS 1.644979 1.2496310 2.165403 0.4977274  0.2228483 0.7726065
3        EUR 1.506107 1.4614119 1.552170 0.4095283  0.3794030 0.4396536
      Variance     Weight Posterior_Variance Posterior_Mean Posterior_OR
1 0.0183427860   54.51735       0.0096020811      0.2703117     1.310373
2 0.0196685017   50.84271       0.0099532735      0.4530343     1.573078
3 0.0002362384 4233.01248       0.0002335009      0.4095019     1.506067
  Posterior_Lower_Log Posterior_Upper_Log Posterior_Lower_OR Posterior_Upper_OR
1          0.07825088           0.4623725           1.081394           1.587837
2          0.25749277           0.6485759           1.293682           1.912815
3          0.37955165           0.4394521           1.461629           1.551857

Forest Plot

We can then make the Bayesian meta analysis

Code

library(dplyr)
library(ggplot2)
library(forestplot)

Loading required package: grid

Loading required package: checkmate

Loading required package: abind

Code

library(forestplot)
library(ggplot2)
library(dplyr)

# Assuming 'results' is already loaded with your data

# We need to create a new data frame for ggplot
# First, we create two separate data frames for original and posterior results and then bind them together
# plot_data <- results %>%
#   select(Population, OR, Lower, Upper, Posterior_OR, Posterior_Lower_OR, Posterior_Upper_OR) %>%
#   gather(key = "Type", value = "Value", -Population, -Lower, -Upper, -Posterior_Lower_OR, -Posterior_Upper_OR) %>%
#   mutate(
#     ymin = ifelse(Type == "OR", Lower, Posterior_Lower_OR),
#     ymax = ifelse(Type == "OR", Upper, Posterior_Upper_OR),
#     Type = factor(Type, levels = c("OR", "Posterior_OR"))
#   )
# 
# # Now we plot with ggplot2
# ggplot(plot_data, aes(y = Population, x = Value, xmin = ymin, xmax = ymax, color = Type)) +
#   geom_errorbarh(height = 0.2) +
#   geom_point(size = 2.5) +
#   scale_color_manual(values = c("OR" = "black", "Posterior_OR" = "red")) +
#   theme_minimal() + lims(x=c(0.5,2.5))+
#   labs(x = "Odds Ratio", y = "", color = "Estimate Type") +
#   theme(legend.position = "bottom")



# Adding an identifier for each row
results$ID <- seq_along(results$Population)

results$ID <- seq_along(results$Population)

# Create a long-format data frame for plotting
plot_data <- tidyr::pivot_longer(
  results, 
  cols = c("OR", "Posterior_OR"), 
  names_to = "Estimate_Type", 
  values_to = "Value"
) %>%
  mutate(
    ymin = ifelse(Estimate_Type == "OR", Lower, Posterior_Lower_OR),
    ymax = ifelse(Estimate_Type == "OR", Upper, Posterior_Upper_OR),
    Estimate_Type = factor(Estimate_Type, levels = c("OR", "Posterior_OR")),
    # Adjust the y position of the posterior estimates
    ypos = as.numeric(ID) + ifelse(Estimate_Type == "OR", -0.1, 0.1)
  )

# Create the plot
ggplot(plot_data, aes(x = Value, y = ypos, color = Estimate_Type)) +
  geom_point(size = 3) +
  geom_errorbar(aes(xmin = ymin, xmax = ymax), width = 0.2) +
  scale_color_manual(values = c("OR" = "black", "Posterior_OR" = "red")) +
  labs(x = "Odds Ratio", y = "") +
  theme_minimal() +
  theme(axis.text.y = element_text(vjust = 0.5), 
        axis.ticks.y = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        legend.position = "bottom") +
  scale_y_continuous(breaks = 1:length(unique(plot_data$ID)), labels = results$Population) +
  xlim(0.8, 2.5)  # Set x-axis limits

We can do this in stan and calculate heterogneiety as well:

Code

library(brms)

results <- results %>%
mutate(se_log_OR = 1 / sqrt(Weight))

# Given prior study's log OR and its standard error
prior_log_or <- log(1/0.76)  # log of the hazard ratio from the study
prior_sd <- (log(1/0.64) - log(1/0.91)) / (2 * 1.96)  # standard error of the log OR


# or try this:
#   
# rosuvastatin_with_event <- 235
# rosuvastatin_without_event <- 6361 - rosuvastatin_with_event
# 
# placebo_with_event <- 304
# placebo_without_event <- 6344 - placebo_with_event
# 
# # Compute the OR
# odds_case <- rosuvastatin_with_event / rosuvastatin_without_event
# odds_control <- placebo_with_event / placebo_without_event
# OR <- odds_control / odds_case
# 
# # Convert OR to log scale for the prior
# prior_log_or <- log(OR)
# 
# # Calculate standard error of the log OR
# prior_log_or_se <- sqrt(1 / placebo_with_event +
#                         1 / placebo_without_event +
#                         1 / rosuvastatin_with_event +
#                         1 / rosuvastatin_without_event)
# 
# # Print the prior log OR and its standard error
# print(prior_log_or)
# print(prior_log_or_se)
# 
# # Print the OR
# print(OR)
# # Define the priors based on the prior study for the global effect size
#prior_effect <- set_prior(paste("normal(", prior_log_or, ", ", prior_log_or_se, ")"), class = "Intercept")

prior_effect <- set_prior(paste("normal(", weighted_mean_log_OR, ", ", sqrt(weighted_variance), ")"), class = "Intercept")

# Define a non-informative prior for the heterogeneity
prior_heterogeneity <- set_prior("student_t(3, 0, 10)", class = "sd", group = "Population")

prior_global <- set_prior(paste0("normal(", prior_log_or, ", ", prior_log_or_se, ")"), class = "Intercept")
prior_heterogeneity <- set_prior("student_t(3, 0, 10)", class = "sd",group = "Population")

# Fit the model
meta_analysis_model <- brm(
  bf(log_OR | se(se_log_OR) ~ 1 + (1 | Population)),
  data = results,
  prior = c(prior_global, prior_heterogeneity),
  family = gaussian(),
  chains = 4,
  iter = 4000,
  warmup = 2000,
  control = list(adapt_delta = 0.999999) # adjust for convergence
)

# Check the summary of the model
summary(meta_analysis_model)

# Extract population-specific effects
ranef(meta_analysis_model)

global_effect= fixef(meta_analysis_model)[1,1]

# Get the random effects (population-specific deviations)
population_effects <- ranef(meta_analysis_model)

# Calculate the population-specific log ORs by adding the global effect to the random effects
population_specific_log_ORs <- global_effect + population_effects$Population[,,"Intercept"][,"Estimate"]

# Convert the log ORs to ORs
population_specific_ORs <- exp(population_specific_log_ORs)

# Now you have the population-specific ORs
print(population_specific_ORs)

Heterogeneity assessment:

Code

# Extract the standard deviation of the random effects
heterogeneity_estimate <- VarCorr(meta_analysis_model)$Population$sd

# Print the heterogeneity estimate
print(heterogeneity_estimate)

# Conduct posterior predictive checks
pp_check(meta_analysis_model)

# If you want to plot the distribution of the standard deviation of the random effects
posterior_samples <- posterior_samples(meta_analysis_model, pars = "sd_Population__Intercept")
hist(posterior_samples[, "sd_Population__Intercept"], breaks = 40, main = "Distribution of Heterogeneity (sd Population)",x="Heterogeneity (SD)",fill="blue")

# Summarize the posterior distribution of the heterogeneity parameter
posterior_summary <- summary(posterior_samples[, "sd_Population__Intercept"])
print(posterior_summary)