Ancestry PRS Load

Author

Sarah Urbut

Published

March 7, 2024

LDL PRS analyais

Code

library(dplyr)
library(broom)
library(tidyr)
library(ggplot2)
library(data.table)
library(ggridges)
load("~/Library/CloudStorage/Dropbox/pheno_dir/output/merged_pheno_censor_final_withdrugs_smoke.rds")
pheno_prs=dfh
pheno2=readRDS("~/Library/CloudStorage/Dropbox/df_ukb_pheno_updated.rds")

pheno_prs=merge(pheno_prs,pheno2[,c("Identifier","Cad_hard_Any","Cad_hard_censor_age")],by.x="identifier",by.y="Identifier")
# eth=readRDS("~/Library/CloudStorage/Dropbox/pheno_dir/ethnicity.rds")
# eth=eth[!is.na(eth$f.21000.0.0),]
# eth=eth[,c("identifier","meaning")]


baseline_levs=readRDS("~/Library/CloudStorage/Dropbox/dfukb_chol_bp_smoke.rds")
baseline_levs$hdl=baseline_levs$f.30760.0.0*38.4
baseline_levs$ldl=baseline_levs$f.30780.0.0*38.4

PCS=readRDS("~/Library/CloudStorage/Dropbox/data_forDynamicHRpaper/baseline_withPCS.rds")[,c(1,20:39)]
colnames(PCS)=c("identifier",paste0(rep("PC",20),seq(1:20)))
baseline_levs=merge(baseline_levs,PCS,by="identifier")

Phenotyping

We validated previously that UKB pheno implementation of Hard CAD is matched to HARD Cad. So we will now use that here to be c/w AoU HARD CAD pheno.

Because we are using age as time scale (or GLM with ‘evere event’ follow up time is really just time from birth to death.
We know that time is inhomogenous: even if we use OR instead of HR, the OR will tend to decrease over time because there are more non-genetically mediated CAD events

Code

m2=pheno_prs[,c("identifier","Birthdate","enrollage","Cad_hard_Any","Cad_hard_censor_age","f.31.0.0")]


m2$Cad_Sta=ifelse(m2$Cad_hard_Any==2,1,0)
m2$Cad_0_censor_age=m2$Cad_hard_censor_age



m2=m2[!is.na(m2$Cad_Sta),]
m2=m2[!is.na(m2$Cad_0_censor_age),]
summary(m2$Cad_0_censor_age[m2$Cad_Sta==1])

    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
 0.04654 55.13210 62.57906 62.05194 70.03970 84.76112

Code

summary(m2$Cad_0_censor_age[m2$Cad_Sta==0])

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  40.38   63.13   70.49   69.60   76.13   88.63

Code

## old phenotypes
# df2=fread("~/Library/CloudStorage/Dropbox/UKBB_HardCAD_202109.tsv")
# m2=merge(pheno_prs[,c("identifier","Birthdate","enrollage","f.31.0.0")],eth,by.x="identifier",by.y="identifier")
# m2=merge(m2,df2[,c("sample_id","has_disease","censor_age")],by.x = "identifier",by.y="sample_id")
# m2$Cad_Sta=m2$has_disease
# m2$Cad_0_censor_age=m2$censor_age

## get these labs ready
med_codes=readRDS("~/Library/CloudStorage/Dropbox/medcodes.rds")
#levels(as.factor(med_codes$meaning))
## what do you do about meds? I think we need to eliminate, but will skew the distribtuion
baseline_meds=merge(med_codes,baseline_levs,by="identifier")
baseline_meds$ldl_adj=ifelse(baseline_meds$meaning%in%"Cholesterol lowering medication",baseline_meds$ldl/0.8,baseline_meds$ldl)
baseline_meds$sbp_adj=ifelse(baseline_meds$meaning%in%"Blood pressure medication",baseline_meds$f.4080.0.0+10,baseline_meds$f.4080.0.0)

## merge all
final_dat=merge(m2,baseline_meds[,c("identifier","sbp_adj","ldl_adj","hdl",paste0(rep("PC",10),seq(1:10)))],by.x="identifier",by.y="identifier")
library(lubridate)


Attaching package: 'lubridate'

The following objects are masked from 'package:data.table':

    hour, isoweek, mday, minute, month, quarter, second, wday, week,
    yday, year

The following objects are masked from 'package:base':

    date, intersect, setdiff, union

Code

final_dat$yob=year(final_dat$Birthdate)

#final_dat=final_dat[year(final_dat$Birthdate)>1950,]

Now we want to calculate ancestry specific thresholds for PRS. Let’s get the PRS for African, European, and Asian ancestry that causes a 40 mmHg increase in LDL cholesterol in the UK Biobank using the AFR, SAS, and EUR individauls present.

Code

# final_dat <- final_dat %>%
#   mutate(recoded_ethnicity = case_when(
#     meaning %in% c("African", "Any other Black background", "Black or Black British","Caribbean","Any other mixed background","Mixed") ~ "AFR",
#     meaning %in% c("White", "Irish", "Any other white background","British") ~ "White",
#     meaning %in% c("Indian", "Pakistani","Bangladeshi") ~ "South Asian",
#     meaning == "Chinese" ~ "Chinese",
#     TRUE ~ "Other" # TAMR will be used for any ethnicity not covered above
#   ))
# 
# final_dat$Cad_Sta=ifelse(final_dat$Cad_0_Any==2,1,0)
# final_dat$sex=final_dat$f.31.0.0
# 
# table(final_dat$recoded_ethnicity)
#ggplot(lt,aes(x=Var1,y=value,fill=Var1))+geom_bar(stat="identity")

# gae=fread("~/Library/CloudStorage/Dropbox/ukb.kgp_projected.tsv.gz")
# final_dat=merge(final_dat,gae[,c("eid","rf80")],by.x="identifier",by.y="eid")
# table(final_dat$rf80)

gae=fread("~/Dropbox/yangstuff/UKB_HGDP_1KG_real.txt.gz")
gae=data.frame("IID"=as.character(gae$IID),"knn"=gae$knn_super)
final_dat=merge(final_dat,gae,by.y="IID",by.x="identifier")
table(final_dat$knn)


   AFR    AMR    CSA    EAS    EUR    MID    OCE 
  9152   1637   9981   2809 456173   1136      1

Code

prs_MID=fread("~/Dropbox/yangstuff/UKB_MID_1157.txt")[,c("IID","combined6")]
MID=final_dat$identifier[final_dat$knn%in%"MID"]

colnames(prs_MID)=c("IID","PRS")

totaldata=MID_dat=merge(prs_MID,final_dat[final_dat$knn%in%"MID",],by.x="IID",by.y="identifier")



final_dat%>%group_by(knn)%>%summarise("cad_rate"=mean(Cad_Sta),"mean_ldl"=mean(na.omit(ldl_adj)),"%M"=mean(na.omit(as.numeric(f.31.0.0))),"meanAge"=as.numeric(mean(enrollage)))

# A tibble: 7 × 5
  knn   cad_rate mean_ldl  `%M` meanAge
  <chr>    <dbl>    <dbl> <dbl>   <dbl>
1 AFR     0.0412     130. 0.412    52.4
2 AMR     0.0531     139. 0.405    52.6
3 CSA     0.128      136. 0.530    54.0
4 EAS     0.0356     136. 0.345    52.8
5 EUR     0.0771     142. 0.458    57.3
6 MID     0.0977     139. 0.613    53.2
7 OCE     0          124. 1        40.4

Histograms

We note that the distributions of these PRS are skewed for non EUR pops but will normalize for our analysis:

Code

prs_MID$Population = 'MID'; prs_MID=prs_MID[prs_MID$IID%in%MID]

Using residual PRS

Code

pcmod=lm(PRS~PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10,data = totaldata)
totaldata$predictscore=predict(object = pcmod,newdata=totaldata)
totaldata$residscore=totaldata$PRS-totaldata$predictscore
totaldata$Population=as.factor(totaldata$knn)
totaldata$pop_scaled_prs=totaldata$scaled_resid=scale(totaldata$residscore)

ggplot(totaldata, aes(x = residscore, y = Population, fill = Population)) +
    geom_density_ridges(alpha = 0.7) +
    scale_fill_brewer(palette = "Set1") +
    theme_ridges() + # Optional: Adds a nice theme suitable for ridgeline plots
    labs(title = "Density Distribution of Sum Scores",
         x = "Sum Score",
         y = "Population")

Picking joint bandwidth of 2.35

Code

totaldata%>%group_by(Population)%>%
summarise("event_rate"=mean(Cad_Sta),"total_subjects"=length(Cad_Sta),"percent_female"=mean(f.31.0.0==0),"enrollment_age"=mean(enrollage))

# A tibble: 1 × 5
  Population event_rate total_subjects percent_female enrollment_age
  <fct>           <dbl>          <int>          <dbl> <drtn>        
1 MID            0.0977           1136          0.387 53.18859 days

Code

summary(totaldata$pop_scaled_prs)

     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
-2.827728 -0.641920 -0.007595  0.000000  0.657082  2.947659

Code

ggplot(totaldata, aes(x = pop_scaled_prs, y = Population, fill = Population)) +
    geom_density_ridges(alpha = 0.7) +
    scale_fill_brewer(palette = "Set1") +
    theme_ridges() + # Optional: Adds a nice theme suitable for ridgeline plots
    labs(title = "Density Distribution of Sum Scores: Scaled for Population",
         x = "Sum Score",
         y = "Population")

Picking joint bandwidth of 0.214

Code

library(dplyr)
totaldata$enrollage=as.numeric(totaldata$enrollage)
totaldata%>%group_by(Population)%>%summarize(quantile(pop_scaled_prs,0.25),median(pop_scaled_prs),mean(pop_scaled_prs),quantile(pop_scaled_prs,0.75))

# A tibble: 1 × 5
  Population `quantile(pop_scaled_prs, 0.25)` `median(pop_scaled_prs)`
  <fct>                                 <dbl>                    <dbl>
1 MID                                  -0.642                 -0.00760
# ℹ 2 more variables: `mean(pop_scaled_prs)` <dbl>,
#   `quantile(pop_scaled_prs, 0.75)` <dbl>

Survival plot

Code

library(survival)
library(survminer)

Loading required package: ggpubr


Attaching package: 'survminer'

The following object is masked from 'package:survival':

    myeloma

Code

fit3 <- survfit( Surv(Cad_0_censor_age, Cad_Sta) ~ Population,data=totaldata)
ggsurvplot(fit3, data = totaldata, conf.int = FALSE,fun="event",
risk.table = TRUE, risk.table.col="strata",
ggtheme = theme_classic(),xlab="Age",palette = "nejm")

Table 1

Code

totaldata$followup=round(as.numeric(totaldata$Cad_0_censor_age)-as.numeric(totaldata$enrollage),2)
library(table1)


Attaching package: 'table1'

The following objects are masked from 'package:base':

    units, units<-

Code

dat <- totaldata
dat$Sex=ifelse(dat$f.31.0.0==1,"Male","Female")
dat$Sex=as.factor(dat$Sex)
dat$Cad_Sta=as.factor(dat$Cad_Sta)
dat$pop_scaled_prs <- as.numeric(as.character(round(dat$pop_scaled_prs,2)))
dat$enrollage=as.numeric(dat$enrollage)
dat$CAD_censor <- as.factor(dat$Cad_Sta)
label(dat$ldl_adj) <- "LDL-C (adjusted for Statin)"
label(dat$Cad_Sta) <- "CAD Ever"
label(dat$scaled_resid) <- "Overall Scaled Residual PRS"
label(dat$pop_scaled_prs) <- "Pop Scaled Residual PRS"
label(dat$enrollage) = "Age Enrolled"

# Continue with creating the table
table1(~ Sex + ldl_adj + Cad_Sta + enrollage + followup+round(pop_scaled_prs,4) + scaled_resid | Population, data = dat)

	MID (N=1136)	Overall (N=1136)
Sex
Female	440 (38.7%)	440 (38.7%)
Male	696 (61.3%)	696 (61.3%)
LDL-C (adjusted for Statin)
Mean (SD)	139 (33.2)	139 (33.2)
Median [Min, Max]	139 [49.9, 315]	139 [49.9, 315]
Missing	73 (6.4%)	73 (6.4%)
CAD Ever
0	1025 (90.2%)	1025 (90.2%)
1	111 (9.8%)	111 (9.8%)
Age Enrolled
Mean (SD)	53.2 (8.04)	53.2 (8.04)
Median [Min, Max]	52.4 [40.3, 70.1]	52.4 [40.3, 70.1]
followup
Mean (SD)	11.8 (5.71)	11.8 (5.71)
Median [Min, Max]	13.0 [-53.6, 15.5]	13.0 [-53.6, 15.5]
Pop Scaled Residual PRS
Mean (SD)	0.0000528 (1.00)	0.0000528 (1.00)
Median [Min, Max]	-0.0100 [-2.83, 2.95]	-0.0100 [-2.83, 2.95]
Overall Scaled Residual PRS
Mean (SD)	-0.0000000000000000118 (1.00)	-0.0000000000000000118 (1.00)
Median [Min, Max]	-0.00760 [-2.83, 2.95]	-0.00760 [-2.83, 2.95]

Impact of PRS

To assess the impact of LDL-C PRS on CAD risk, you need to establish a two-step relationship: first, between LDL-C PRS and LDL-C levels (per 40 mg/dL change), and then between LDL-C PRS and CAD outcomes. TAMR approach allows you to understand how changes in LDL-C levels attributed to genetic risk (as quantified by PRS) relate to CAD risk. Here’s how you can approach tAMR analysis for both European (EUR) and African (AFR) populations:

Step 1: Associate LDL-C PRS with LDL-C Levels (per 40 mg/dL)

Linear Regression for LDL-C Levels: Fit a linear regression model with LDL-C levels as the outcome and LDL-C PRS as the predictor. You might want to adjust for relevant covariates.
Rescale the Coefficient for LDL-C PRS: Adjust the coefficient from tAMR model to reflect a 40 mg/dL change in LDL-C.

Step 2: Associate Rescaled LDL-C PRS with CAD

Incorporate Rescaled LDL-C PRS into Cox Model: Use the rescaled LDL-C PRS from Step 1 as a predictor in a Cox proportional hazards model to analyze its relationship with CAD risk.
Perform Analysis for Both EUR and AFR Populations: Repeat these steps separately for EUR and AFR populations to capture population-specific effects.

Here’s an example in R for the European population. The same steps would be repeated for the African population:

Code

library(dplyr)
library(survival)

# Before the loop
results <- data.frame(
  Population = character(),
  PRS_Mean = numeric(),
  PRS_SD = numeric(),
  Percent_male = numeric(),
  LDL_Coef_CI_Lower = numeric(),
  LDL_Coef_CI_Upper = numeric(),
  Population_Scale_Factor = numeric(),
  LDL_Scaled_Coef_CI_Lower = numeric(),
  LDL_Scaled_Coef_CI_Upper = numeric(),
  HR=numeric(),
  HR_CI_Lower = numeric(),
  HR_CI_Upper = numeric(),
  OR = numeric(),
  OR_LCI = numeric(),
  OR_UCI = numeric(),
  stringsAsFactors = FALSE
)

# Assuming 'df' is your dataset

# List of unique populations
populations <- "MID"
library(broom)
library(dplyr)
# Initialize an empty data frame to store results

# Initialize an empty list for storing models and results
cox_models <- list()
glm_models = list()



pop = "MID"
# Subset data for the specific population
pop_data = totaldata[totaldata$knn%in%pop, ]


pop_data$PRS = pop_data$pop_scaled_prs


# Step 1: Linear Regression for LDL-C Levels
# Assuming 'pop' is defined elsewhere and 'pop_data' is your dataset
#if (pop == "EUR") {
ldl_model <- lm(
  ldl_adj ~ PRS + as.numeric(enrollage) + as.factor(f.31.0.0) + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10,
  data = pop_data
)
#} else {
#  pop_data$PRS=-1*pop_data$PRS
#    ldl_model <- lm(ldl_adj ~  PRS + as.numeric(enrollage) + as.factor(f.31.0.0) + PC1 + PC2 + #PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10, data = pop_data)
#}
ldl_coef_ci <- confint(ldl_model, "PRS")

#print(paste("LDL_coef_CI for population:", pop))
if ("PRS" %in% names(coef(ldl_model))) {
  #print(confint(ldl_model, parm = "PRS"))
} else {
  #print("PRS coefficient is not available in the model.")
}

NULL

Code

# Calculate the PRS scaling factor
prs_scale_factor <- 40 / coef(ldl_model)["PRS"]
#print(paste0(pop,":",prs_scale_factor))
#prs_scale_factor=1
# Scale the LDL-C PRS so that now the scaled translates to an increase of 40 mgDL
pop_data$scaled_ldl_prs <- pop_data$PRS / prs_scale_factor


ldl_model_2 <- lm(
  ldl_adj ~ scaled_ldl_prs + as.numeric(enrollage) + as.factor(f.31.0.0) + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10,
  data = pop_data
)

ldl_scaled_coef_ci <- confint(ldl_model_2, "scaled_ldl_prs")

#print(summary(pop_data$scaled_ldl_prs))
# Step 2: Cox Model for CAD -- inherengly accounting for age by using age as time scale
cox_model <- coxph(
  Surv(Cad_0_censor_age, Cad_Sta) ~ scaled_ldl_prs
  + as.factor(f.31.0.0) + PC1 + PC2 + PC3 + PC4 + PC5 +
    PC6 + PC7 + PC8 + PC9 + PC10,
  data = pop_data
)
#cox_model <- glm(Cad_Sta ~ scaled_ldl_prs+sex, data = pop_data,family = "binomial")
cox_models[[pop]] <- cox_model




glm_model <- glm(
  Cad_Sta ~ scaled_ldl_prs + as.numeric(enrollage)
  + as.factor(f.31.0.0) + PC1 + PC2 + PC3 + PC4 + PC5 +
    PC6 + PC7 + PC8 + PC9 + PC10,
  data = pop_data,
  family = "binomial"
)
glm_models[[pop]] <- glm_model
log_or = summary(glm_model)$coefficients["scaled_ldl_prs", "Estimate"]
log_or_se = summary(glm_model)$coefficients["scaled_ldl_prs", "Std. Error"]
or = round(exp(log_or), 2)
lci = round(exp(log_or - 1.96 * log_or_se), 2)
uci = round(exp(log_or + 1.96 * log_or_se), 2)

glmci = confint(glm_model)

Waiting for profiling to be done...

Code

# Extract summary stats
cox_summary <- summary(cox_model)
glm_summary <- summary(glm_model)
# Predict the probability of CAD for each individual in the dataset
predicted_risk <- predict(glm_model, pop_data, type = "response")


# Compile results into the data frame
# Append to results



results <- data.frame(
  Population = pop,
  PRS_Mean = mean(pop_data$PRS),
  PRS_SD = sd(pop_data$PRS),
  Percent_male = mean(pop_data$f.31.0.0 == 1),
  LDL_Coef_CI_Lower = ldl_coef_ci[1],
  LDL_Coef_CI_Upper = ldl_coef_ci[2],
  Population_Scale_Factor = prs_scale_factor,
  LDL_Scaled_Coef_CI_Lower = ldl_scaled_coef_ci[1],
  LDL_Scaled_Coef_CI_Upper = ldl_scaled_coef_ci[2],
  HR = cox_summary$conf.int["scaled_ldl_prs", 1],
  HR_CI_Lower = cox_summary$conf.int["scaled_ldl_prs", 3],
  HR_CI_Upper = cox_summary$conf.int["scaled_ldl_prs", 4],
  OR = or,
  OR_LCI = exp(glmci['scaled_ldl_prs', 1]),
  OR_UCI = exp(glmci['scaled_ldl_prs', 2])
)

# Display results
print(results)

    Population      PRS_Mean PRS_SD Percent_male LDL_Coef_CI_Lower
PRS        MID -1.182544e-17      1    0.6126761          6.078461
    LDL_Coef_CI_Upper Population_Scale_Factor LDL_Scaled_Coef_CI_Lower
PRS          9.939555                4.994376                 30.35812
    LDL_Scaled_Coef_CI_Upper       HR HR_CI_Lower HR_CI_Upper   OR   OR_LCI
PRS                 49.64188 4.500511    1.794136    11.28933 4.66 1.688755
      OR_UCI
PRS 13.07784

Code

results

    Population      PRS_Mean PRS_SD Percent_male LDL_Coef_CI_Lower
PRS        MID -1.182544e-17      1    0.6126761          6.078461
    LDL_Coef_CI_Upper Population_Scale_Factor LDL_Scaled_Coef_CI_Lower
PRS          9.939555                4.994376                 30.35812
    LDL_Scaled_Coef_CI_Upper       HR HR_CI_Lower HR_CI_Upper   OR   OR_LCI
PRS                 49.64188 4.500511    1.794136    11.28933 4.66 1.688755
      OR_UCI
PRS 13.07784

Code

saveRDS(results, 'results_UKB_MID.rds')