Ancestry PRS Load

Author

Sarah Urbut

Published

March 7, 2024

LDL PRS analyais

Code
library(dplyr)
library(broom)
library(tidyr)
library(ggplot2)
library(data.table)
library(ggridges)
load("~/Library/CloudStorage/Dropbox/pheno_dir/output/merged_pheno_censor_final_withdrugs_smoke.rds")
pheno_prs=dfh
pheno2=readRDS("~/Library/CloudStorage/Dropbox/df_ukb_pheno_updated.rds")

pheno_prs=merge(pheno_prs,pheno2[,c("Identifier","Cad_hard_Any","Cad_hard_censor_age")],by.x="identifier",by.y="Identifier")
# eth=readRDS("~/Library/CloudStorage/Dropbox/pheno_dir/ethnicity.rds")
# eth=eth[!is.na(eth$f.21000.0.0),]
# eth=eth[,c("identifier","meaning")]


baseline_levs=readRDS("~/Library/CloudStorage/Dropbox/dfukb_chol_bp_smoke.rds")
baseline_levs$hdl=baseline_levs$f.30760.0.0*38.4
baseline_levs$ldl=baseline_levs$f.30780.0.0*38.4

PCS=readRDS("~/Library/CloudStorage/Dropbox/data_forDynamicHRpaper/baseline_withPCS.rds")[,c(1,20:39)]
colnames(PCS)=c("identifier",paste0(rep("PC",20),seq(1:20)))
baseline_levs=merge(baseline_levs,PCS,by="identifier")

Phenotyping

We validated previously that UKB pheno implementation of Hard CAD is matched to HARD Cad. So we will now use that here to be c/w AoU HARD CAD pheno.

  • Because we are using age as time scale (or GLM with ‘evere event’ follow up time is really just time from birth to death.

  • We know that time is inhomogenous: even if we use OR instead of HR, the OR will tend to decrease over time because there are more non-genetically mediated CAD events

Code
m2=pheno_prs[,c("identifier","Birthdate","enrollage","Cad_hard_Any","Cad_hard_censor_age","f.31.0.0")]


m2$Cad_Sta=ifelse(m2$Cad_hard_Any==2,1,0)
m2$Cad_0_censor_age=m2$Cad_hard_censor_age



m2=m2[!is.na(m2$Cad_Sta),]
m2=m2[!is.na(m2$Cad_0_censor_age),]
summary(m2$Cad_0_censor_age[m2$Cad_Sta==1])
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
 0.04654 55.13210 62.57906 62.05194 70.03970 84.76112 
Code
summary(m2$Cad_0_censor_age[m2$Cad_Sta==0])
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  40.38   63.13   70.49   69.60   76.13   88.63 
Code
## old phenotypes
# df2=fread("~/Library/CloudStorage/Dropbox/UKBB_HardCAD_202109.tsv")
# m2=merge(pheno_prs[,c("identifier","Birthdate","enrollage","f.31.0.0")],eth,by.x="identifier",by.y="identifier")
# m2=merge(m2,df2[,c("sample_id","has_disease","censor_age")],by.x = "identifier",by.y="sample_id")
# m2$Cad_Sta=m2$has_disease
# m2$Cad_0_censor_age=m2$censor_age

## get these labs ready
med_codes=readRDS("~/Library/CloudStorage/Dropbox/medcodes.rds")
#levels(as.factor(med_codes$meaning))
## what do you do about meds? I think we need to eliminate, but will skew the distribtuion
baseline_meds=merge(med_codes,baseline_levs,by="identifier")
baseline_meds$ldl_adj=ifelse(baseline_meds$meaning%in%"Cholesterol lowering medication",baseline_meds$ldl/0.8,baseline_meds$ldl)
baseline_meds$sbp_adj=ifelse(baseline_meds$meaning%in%"Blood pressure medication",baseline_meds$f.4080.0.0+10,baseline_meds$f.4080.0.0)

## merge all
final_dat=merge(m2,baseline_meds[,c("identifier","sbp_adj","ldl_adj","hdl",paste0(rep("PC",10),seq(1:10)))],by.x="identifier",by.y="identifier")
library(lubridate)

Attaching package: 'lubridate'
The following objects are masked from 'package:data.table':

    hour, isoweek, mday, minute, month, quarter, second, wday, week,
    yday, year
The following objects are masked from 'package:base':

    date, intersect, setdiff, union
Code
final_dat$yob=year(final_dat$Birthdate)

#final_dat=final_dat[year(final_dat$Birthdate)>1950,]

Now we want to calculate ancestry specific thresholds for PRS. Let’s get the PRS for African, European, and Asian ancestry that causes a 40 mmHg increase in LDL cholesterol in the UK Biobank using the AFR, SAS, and EUR individauls present.

Code
# final_dat <- final_dat %>%
#   mutate(recoded_ethnicity = case_when(
#     meaning %in% c("African", "Any other Black background", "Black or Black British","Caribbean","Any other mixed background","Mixed") ~ "AFR",
#     meaning %in% c("White", "Irish", "Any other white background","British") ~ "White",
#     meaning %in% c("Indian", "Pakistani","Bangladeshi") ~ "South Asian",
#     meaning == "Chinese" ~ "Chinese",
#     TRUE ~ "Other" # TAMR will be used for any ethnicity not covered above
#   ))
# 
# final_dat$Cad_Sta=ifelse(final_dat$Cad_0_Any==2,1,0)
# final_dat$sex=final_dat$f.31.0.0
# 
# table(final_dat$recoded_ethnicity)
#ggplot(lt,aes(x=Var1,y=value,fill=Var1))+geom_bar(stat="identity")

# gae=fread("~/Library/CloudStorage/Dropbox/ukb.kgp_projected.tsv.gz")
# final_dat=merge(final_dat,gae[,c("eid","rf80")],by.x="identifier",by.y="eid")
# table(final_dat$rf80)

gae=fread("~/Dropbox/yangstuff/UKB_HGDP_1KG_real.txt.gz")
gae=data.frame("IID"=as.character(gae$IID),"knn"=gae$knn_super)
final_dat=merge(final_dat,gae,by.y="IID",by.x="identifier")
table(final_dat$knn)

   AFR    AMR    CSA    EAS    EUR    MID    OCE 
  9152   1637   9981   2809 456173   1136      1 
Code
prs_MID=fread("~/Dropbox/yangstuff/UKB_MID_1157.txt")[,c("IID","combined6")]
MID=final_dat$identifier[final_dat$knn%in%"MID"]

colnames(prs_MID)=c("IID","PRS")

totaldata=MID_dat=merge(prs_MID,final_dat[final_dat$knn%in%"MID",],by.x="IID",by.y="identifier")



final_dat%>%group_by(knn)%>%summarise("cad_rate"=mean(Cad_Sta),"mean_ldl"=mean(na.omit(ldl_adj)),"%M"=mean(na.omit(as.numeric(f.31.0.0))),"meanAge"=as.numeric(mean(enrollage)))
# A tibble: 7 × 5
  knn   cad_rate mean_ldl  `%M` meanAge
  <chr>    <dbl>    <dbl> <dbl>   <dbl>
1 AFR     0.0412     130. 0.412    52.4
2 AMR     0.0531     139. 0.405    52.6
3 CSA     0.128      136. 0.530    54.0
4 EAS     0.0356     136. 0.345    52.8
5 EUR     0.0771     142. 0.458    57.3
6 MID     0.0977     139. 0.613    53.2
7 OCE     0          124. 1        40.4

Histograms

We note that the distributions of these PRS are skewed for non EUR pops but will normalize for our analysis:

Code
prs_MID$Population = 'MID'; prs_MID=prs_MID[prs_MID$IID%in%MID]

Using residual PRS

Code
pcmod=lm(PRS~PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10,data = totaldata)
totaldata$predictscore=predict(object = pcmod,newdata=totaldata)
totaldata$residscore=totaldata$PRS-totaldata$predictscore
totaldata$Population=as.factor(totaldata$knn)
totaldata$pop_scaled_prs=totaldata$scaled_resid=scale(totaldata$residscore)

ggplot(totaldata, aes(x = residscore, y = Population, fill = Population)) +
    geom_density_ridges(alpha = 0.7) +
    scale_fill_brewer(palette = "Set1") +
    theme_ridges() + # Optional: Adds a nice theme suitable for ridgeline plots
    labs(title = "Density Distribution of Sum Scores",
         x = "Sum Score",
         y = "Population")
Picking joint bandwidth of 2.35

Code
totaldata%>%group_by(Population)%>%
summarise("event_rate"=mean(Cad_Sta),"total_subjects"=length(Cad_Sta),"percent_female"=mean(f.31.0.0==0),"enrollment_age"=mean(enrollage))
# A tibble: 1 × 5
  Population event_rate total_subjects percent_female enrollment_age
  <fct>           <dbl>          <int>          <dbl> <drtn>        
1 MID            0.0977           1136          0.387 53.18859 days 
Code
summary(totaldata$pop_scaled_prs)
     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
-2.827728 -0.641920 -0.007595  0.000000  0.657082  2.947659 
Code
ggplot(totaldata, aes(x = pop_scaled_prs, y = Population, fill = Population)) +
    geom_density_ridges(alpha = 0.7) +
    scale_fill_brewer(palette = "Set1") +
    theme_ridges() + # Optional: Adds a nice theme suitable for ridgeline plots
    labs(title = "Density Distribution of Sum Scores: Scaled for Population",
         x = "Sum Score",
         y = "Population")
Picking joint bandwidth of 0.214

Code
library(dplyr)
totaldata$enrollage=as.numeric(totaldata$enrollage)
totaldata%>%group_by(Population)%>%summarize(quantile(pop_scaled_prs,0.25),median(pop_scaled_prs),mean(pop_scaled_prs),quantile(pop_scaled_prs,0.75))
# A tibble: 1 × 5
  Population `quantile(pop_scaled_prs, 0.25)` `median(pop_scaled_prs)`
  <fct>                                 <dbl>                    <dbl>
1 MID                                  -0.642                 -0.00760
# ℹ 2 more variables: `mean(pop_scaled_prs)` <dbl>,
#   `quantile(pop_scaled_prs, 0.75)` <dbl>

Survival plot

Code
library(survival)
library(survminer)
Loading required package: ggpubr

Attaching package: 'survminer'
The following object is masked from 'package:survival':

    myeloma
Code
fit3 <- survfit( Surv(Cad_0_censor_age, Cad_Sta) ~ Population,data=totaldata)
ggsurvplot(fit3, data = totaldata, conf.int = FALSE,fun="event",
risk.table = TRUE, risk.table.col="strata",
ggtheme = theme_classic(),xlab="Age",palette = "nejm")

Table 1

Code
totaldata$followup=round(as.numeric(totaldata$Cad_0_censor_age)-as.numeric(totaldata$enrollage),2)
library(table1)

Attaching package: 'table1'
The following objects are masked from 'package:base':

    units, units<-
Code
dat <- totaldata
dat$Sex=ifelse(dat$f.31.0.0==1,"Male","Female")
dat$Sex=as.factor(dat$Sex)
dat$Cad_Sta=as.factor(dat$Cad_Sta)
dat$pop_scaled_prs <- as.numeric(as.character(round(dat$pop_scaled_prs,2)))
dat$enrollage=as.numeric(dat$enrollage)
dat$CAD_censor <- as.factor(dat$Cad_Sta)
label(dat$ldl_adj) <- "LDL-C (adjusted for Statin)"
label(dat$Cad_Sta) <- "CAD Ever"
label(dat$scaled_resid) <- "Overall Scaled Residual PRS"
label(dat$pop_scaled_prs) <- "Pop Scaled Residual PRS"
label(dat$enrollage) = "Age Enrolled"

# Continue with creating the table
table1(~ Sex + ldl_adj + Cad_Sta + enrollage + followup+round(pop_scaled_prs,4) + scaled_resid | Population, data = dat)
MID
(N=1136)
Overall
(N=1136)
Sex
Female 440 (38.7%) 440 (38.7%)
Male 696 (61.3%) 696 (61.3%)
LDL-C (adjusted for Statin)
Mean (SD) 139 (33.2) 139 (33.2)
Median [Min, Max] 139 [49.9, 315] 139 [49.9, 315]
Missing 73 (6.4%) 73 (6.4%)
CAD Ever
0 1025 (90.2%) 1025 (90.2%)
1 111 (9.8%) 111 (9.8%)
Age Enrolled
Mean (SD) 53.2 (8.04) 53.2 (8.04)
Median [Min, Max] 52.4 [40.3, 70.1] 52.4 [40.3, 70.1]
followup
Mean (SD) 11.8 (5.71) 11.8 (5.71)
Median [Min, Max] 13.0 [-53.6, 15.5] 13.0 [-53.6, 15.5]
Pop Scaled Residual PRS
Mean (SD) 0.0000528 (1.00) 0.0000528 (1.00)
Median [Min, Max] -0.0100 [-2.83, 2.95] -0.0100 [-2.83, 2.95]
Overall Scaled Residual PRS
Mean (SD) -0.0000000000000000118 (1.00) -0.0000000000000000118 (1.00)
Median [Min, Max] -0.00760 [-2.83, 2.95] -0.00760 [-2.83, 2.95]

Impact of PRS

To assess the impact of LDL-C PRS on CAD risk, you need to establish a two-step relationship: first, between LDL-C PRS and LDL-C levels (per 40 mg/dL change), and then between LDL-C PRS and CAD outcomes. TAMR approach allows you to understand how changes in LDL-C levels attributed to genetic risk (as quantified by PRS) relate to CAD risk. Here’s how you can approach tAMR analysis for both European (EUR) and African (AFR) populations:

Step 1: Associate LDL-C PRS with LDL-C Levels (per 40 mg/dL)

  1. Linear Regression for LDL-C Levels: Fit a linear regression model with LDL-C levels as the outcome and LDL-C PRS as the predictor. You might want to adjust for relevant covariates.

  2. Rescale the Coefficient for LDL-C PRS: Adjust the coefficient from tAMR model to reflect a 40 mg/dL change in LDL-C.

Step 2: Associate Rescaled LDL-C PRS with CAD

  1. Incorporate Rescaled LDL-C PRS into Cox Model: Use the rescaled LDL-C PRS from Step 1 as a predictor in a Cox proportional hazards model to analyze its relationship with CAD risk.

  2. Perform Analysis for Both EUR and AFR Populations: Repeat these steps separately for EUR and AFR populations to capture population-specific effects.

Here’s an example in R for the European population. The same steps would be repeated for the African population:

Code
library(dplyr)
library(survival)

# Before the loop
results <- data.frame(
  Population = character(),
  PRS_Mean = numeric(),
  PRS_SD = numeric(),
  Percent_male = numeric(),
  LDL_Coef_CI_Lower = numeric(),
  LDL_Coef_CI_Upper = numeric(),
  Population_Scale_Factor = numeric(),
  LDL_Scaled_Coef_CI_Lower = numeric(),
  LDL_Scaled_Coef_CI_Upper = numeric(),
  HR=numeric(),
  HR_CI_Lower = numeric(),
  HR_CI_Upper = numeric(),
  OR = numeric(),
  OR_LCI = numeric(),
  OR_UCI = numeric(),
  stringsAsFactors = FALSE
)

# Assuming 'df' is your dataset

# List of unique populations
populations <- "MID"
library(broom)
library(dplyr)
# Initialize an empty data frame to store results

# Initialize an empty list for storing models and results
cox_models <- list()
glm_models = list()



pop = "MID"
# Subset data for the specific population
pop_data = totaldata[totaldata$knn%in%pop, ]


pop_data$PRS = pop_data$pop_scaled_prs


# Step 1: Linear Regression for LDL-C Levels
# Assuming 'pop' is defined elsewhere and 'pop_data' is your dataset
#if (pop == "EUR") {
ldl_model <- lm(
  ldl_adj ~ PRS + as.numeric(enrollage) + as.factor(f.31.0.0) + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10,
  data = pop_data
)
#} else {
#  pop_data$PRS=-1*pop_data$PRS
#    ldl_model <- lm(ldl_adj ~  PRS + as.numeric(enrollage) + as.factor(f.31.0.0) + PC1 + PC2 + #PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10, data = pop_data)
#}
ldl_coef_ci <- confint(ldl_model, "PRS")

#print(paste("LDL_coef_CI for population:", pop))
if ("PRS" %in% names(coef(ldl_model))) {
  #print(confint(ldl_model, parm = "PRS"))
} else {
  #print("PRS coefficient is not available in the model.")
}
NULL
Code
# Calculate the PRS scaling factor
prs_scale_factor <- 40 / coef(ldl_model)["PRS"]
#print(paste0(pop,":",prs_scale_factor))
#prs_scale_factor=1
# Scale the LDL-C PRS so that now the scaled translates to an increase of 40 mgDL
pop_data$scaled_ldl_prs <- pop_data$PRS / prs_scale_factor


ldl_model_2 <- lm(
  ldl_adj ~ scaled_ldl_prs + as.numeric(enrollage) + as.factor(f.31.0.0) + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10,
  data = pop_data
)

ldl_scaled_coef_ci <- confint(ldl_model_2, "scaled_ldl_prs")

#print(summary(pop_data$scaled_ldl_prs))
# Step 2: Cox Model for CAD -- inherengly accounting for age by using age as time scale
cox_model <- coxph(
  Surv(Cad_0_censor_age, Cad_Sta) ~ scaled_ldl_prs
  + as.factor(f.31.0.0) + PC1 + PC2 + PC3 + PC4 + PC5 +
    PC6 + PC7 + PC8 + PC9 + PC10,
  data = pop_data
)
#cox_model <- glm(Cad_Sta ~ scaled_ldl_prs+sex, data = pop_data,family = "binomial")
cox_models[[pop]] <- cox_model




glm_model <- glm(
  Cad_Sta ~ scaled_ldl_prs + as.numeric(enrollage)
  + as.factor(f.31.0.0) + PC1 + PC2 + PC3 + PC4 + PC5 +
    PC6 + PC7 + PC8 + PC9 + PC10,
  data = pop_data,
  family = "binomial"
)
glm_models[[pop]] <- glm_model
log_or = summary(glm_model)$coefficients["scaled_ldl_prs", "Estimate"]
log_or_se = summary(glm_model)$coefficients["scaled_ldl_prs", "Std. Error"]
or = round(exp(log_or), 2)
lci = round(exp(log_or - 1.96 * log_or_se), 2)
uci = round(exp(log_or + 1.96 * log_or_se), 2)

glmci = confint(glm_model)
Waiting for profiling to be done...
Code
# Extract summary stats
cox_summary <- summary(cox_model)
glm_summary <- summary(glm_model)
# Predict the probability of CAD for each individual in the dataset
predicted_risk <- predict(glm_model, pop_data, type = "response")


# Compile results into the data frame
# Append to results



results <- data.frame(
  Population = pop,
  PRS_Mean = mean(pop_data$PRS),
  PRS_SD = sd(pop_data$PRS),
  Percent_male = mean(pop_data$f.31.0.0 == 1),
  LDL_Coef_CI_Lower = ldl_coef_ci[1],
  LDL_Coef_CI_Upper = ldl_coef_ci[2],
  Population_Scale_Factor = prs_scale_factor,
  LDL_Scaled_Coef_CI_Lower = ldl_scaled_coef_ci[1],
  LDL_Scaled_Coef_CI_Upper = ldl_scaled_coef_ci[2],
  HR = cox_summary$conf.int["scaled_ldl_prs", 1],
  HR_CI_Lower = cox_summary$conf.int["scaled_ldl_prs", 3],
  HR_CI_Upper = cox_summary$conf.int["scaled_ldl_prs", 4],
  OR = or,
  OR_LCI = exp(glmci['scaled_ldl_prs', 1]),
  OR_UCI = exp(glmci['scaled_ldl_prs', 2])
)

# Display results
print(results)
    Population      PRS_Mean PRS_SD Percent_male LDL_Coef_CI_Lower
PRS        MID -1.182544e-17      1    0.6126761          6.078461
    LDL_Coef_CI_Upper Population_Scale_Factor LDL_Scaled_Coef_CI_Lower
PRS          9.939555                4.994376                 30.35812
    LDL_Scaled_Coef_CI_Upper       HR HR_CI_Lower HR_CI_Upper   OR   OR_LCI
PRS                 49.64188 4.500511    1.794136    11.28933 4.66 1.688755
      OR_UCI
PRS 13.07784
Code
results
    Population      PRS_Mean PRS_SD Percent_male LDL_Coef_CI_Lower
PRS        MID -1.182544e-17      1    0.6126761          6.078461
    LDL_Coef_CI_Upper Population_Scale_Factor LDL_Scaled_Coef_CI_Lower
PRS          9.939555                4.994376                 30.35812
    LDL_Scaled_Coef_CI_Upper       HR HR_CI_Lower HR_CI_Upper   OR   OR_LCI
PRS                 49.64188 4.500511    1.794136    11.28933 4.66 1.688755
      OR_UCI
PRS 13.07784
Code
saveRDS(results, 'results_UKB_MID.rds')