Threshold Analysis

Weibull Exploration

Author

Affiliation

Sarah Urbut

MGH

Published

February 24, 2001

Rare and Common Event Timing

In general, individuals with both high common and rare variant burden have ‘earlier’ onset disease, but how can we quantify the proportion of rare/common heritability by time of event. Let’s first simulate some data:

We’ll simulate 10 (MAF 1e-1) rare and 100 (MAF 5e-2) common variants with rare variant effect sizes \N (15,0.5) and \N (0.1,1) respectively.
Every individuals total genetic load will then be the sum of this rare and common genetic load which we can sum, and normalize N(0,1)

Code

library(survival)
library(gridExtra)
library(data.table)
library(ggsci)
library(dplyr)
library(reshape2)
library(dplyr)
library(ggplot2)
library(ggridges)

set.seed(123)


list.files(path="out/")

 [1] "APOB.pLOF.snp"                 "FH.1.long.txt.gz"             
 [3] "FH.19.long.txt.gz"             "FH.2.long.txt.gz"             
 [5] "FH.coding.txt"                 "FH.coding.txt.gz"             
 [7] "FH.ldlc.coding.txt.gz"         "FH.pathogenic.snp"            
 [9] "FH.variant.txt"                "LDLR.deleterious.snp"         
[11] "LDLR.pLOF.snp"                 "PCSK9.pLOF.snp"               
[13] "README.md"                     "ukb_exome_450k_fh.carrier.txt"
[15] "ukb_exome_450k_fh.qced.txt"

Code

fhp=fread("out/ukb_exome_450k_fh.qced.txt")
fhc=fread("out/ukb_exome_450k_fh.carrier.txt")
m=merge(fhp,fhc,all.x=T,by.x="x",by.y="IID")
m$carrier=ifelse(!is.na(m$snp),1,0)
prs=readRDS("~/Library/CloudStorage/Dropbox-Personal/pheno_dir/prs_subset.rds")
prs=prs[,c("Identifier","CAD","LDL_SF","HDL","CVD","ISS")]
prs$CAD=scale(prs$CAD)
prs$LDL_SF=scale(prs$LDL_SF)
prs$HDL=scale(prs$HDL)
prs$ISS=scale(prs$ISS)
m2=merge(prs,m,by.x = "Identifier",by.y="x")
phenos=readRDS("~/Library/CloudStorage/Dropbox-Personal/df_ukb_pheno_updated.rds")
df=merge(phenos[,c("Identifier","Cad_Any","Cad_censor_age")],m2,by="Identifier")
df=df[!is.na(df$Cad_Any),]

bottom_5 <- df %>% 
  filter(CAD >= quantile(CAD, 0.0),
         CAD < quantile(CAD, 0.2),
         Cad_Any == 2)
bottom_5$group="Low 5 % PRS"

middle_75 <- df %>% 
  filter(CAD >= quantile(CAD, 0.2),
         CAD < quantile(CAD, 0.95),
         Cad_Any == 2)
middle_75$group="Mid 75 % PRS"

top_5 <- df %>% 
  filter(CAD >= quantile(CAD, 0.95),
         Cad_Any == 2)
top_5$group="Top 5 % PRS"

fh_carrier <- df %>% 
  filter(carrier == 1,
         Cad_Any == 2)
fh_carrier$group="FH"

fh_carrier_top <- df %>% 
  filter(carrier == 1&CAD >= quantile(CAD, 0.95))
fh_carrier_top$group="FH&top20% Cad PRS"

combined=rbind(bottom_5,middle_75,top_5,fh_carrier,fh_carrier_top)

Code

df <- combined %>%
  mutate(group = factor(group, levels = c("FH&top20% Cad PRS","FH", "Top 5 % PRS", "Mid 75 % PRS", "Low 5 % PRS")))

# Create geom_ridge plot
p <- ggplot() +
  geom_density_ridges(data = df, aes(x = Cad_censor_age, y = group, fill = group), alpha = 0.5) +
  labs(title = "Distribution of Age of CAD by CAD-PRS and FH Carrier Status",
       x = "Cad_Censor_age",
       y = "CAD",
       fill = "PRS of FH") +
  theme_minimal() +
  scale_fill_jama()

print(p)

Picking joint bandwidth of 2.22

Warning: Removed 87 rows containing non-finite values
(`stat_density_ridges()`).

Forest Plots

Code

phenos=readRDS("~/Library/CloudStorage/Dropbox-Personal/df_ukb_pheno_updated.rds")
df=merge(phenos[,c("Identifier","Cad_Any","Cad_censor_age")],m2,by="Identifier")
df=df[!is.na(df$Cad_Any),]

bottom_5 <- df %>% 
  filter(CAD >= quantile(CAD, 0.0),
         CAD < quantile(CAD, 0.2))
bottom_5$group="Low 5% PRS"

middle_75 <- df %>% 
  filter(CAD >= quantile(CAD, 0.2),
         CAD < quantile(CAD, 0.95))
middle_75$group="Mid 75% PRS"

top_5 <- df %>% 
  filter(CAD >= quantile(CAD, 0.95))
top_5$group="Top 5% PRS"

fh_carrier <- df %>% 
  filter(carrier == 1)
fh_carrier$group="FH"

fh_carrier_top <- df %>% 
  filter(carrier == 1&CAD >= quantile(CAD, 0.95))
fh_carrier_top$group="FH&top20% Cad PRS"

combined=rbind(bottom_5,middle_75,top_5,fh_carrier,fh_carrier_top)



library(dplyr)
library(ggplot2)

# Summarize age of event for each group
summary_age <- combined %>%
  filter(Cad_Any == 2) %>%
  group_by(group) %>%
  summarize(mean_age = mean(Cad_censor_age, na.rm = TRUE))

# Summarize mean probability of event being 2 for each group
summary_prob <- combined %>%
  group_by(group) %>%
  summarize(mean_prob = mean(Cad_Any == 2, na.rm = TRUE))

# Combine summary statistics into one dataframe
summary_df <- merge(summary_age, summary_prob, by = "group")

summary_df <- summary_df %>%
  mutate(group = factor(group, levels = c("FH&top20% Cad PRS","FH", "Top 5% PRS", "Mid 75% PRS", "Low 5% PRS")))

m=melt(summary_df,id.vars="group")
# Create forest plot
p_forest <- ggplot(summary_df, aes(x = mean_age, y = order(group,mean_age),col=group)) +
  geom_point(aes(size = mean_prob)) +
  geom_errorbarh(aes(xmin = mean_age - 1.96 * sd(mean_age), xmax = mean_age + 1.96 * sd(mean_age))) +
  scale_size_continuous(name = "Mean Probability of CAD",
                        labels = scales::percent_format(accuracy = 0.1)) +scale_color_jama()+
  labs(title = "Forest Plot of Age of Event and Mean Probability by Group",
       x = "Mean Age of Event",
       y = "Genetic Grouping",color="Genetic Status") +
  theme_minimal()

print(p_forest)

Furthermore, we can see that rare event (FH) carriers are almost doubly enriched (overrepresented) in high overall score: