Threshold Analysis

Weibull Exploration

Author

Affiliation

Sarah Urbut

MGH

Published

January 7, 2024

Overview of the trial and intervention

Code

library(survival)
library(dplyr)
library(reshape2)
library(dplyr)
library(ggplot2)

# Summarize by age
# Define the number of individuals
n <- 10000
n_rare_variants <- 10 # Number of rare variants per individual
n_common_variants <- 100 # Number of common variants per individual

# Simulate rare and common variant effects as before
set.seed(123)  # For reproducibility

# Rare variants (assume 15 effect size, 1e-3 frequency)
rare_variants <- matrix(rbinom(n * n_rare_variants, 1, 1e-3), ncol = n_rare_variants) 
rare_effects=t(matrix(rnorm(n_rare_variants,mean = 15,0.5),ncol=n_rare_variants))
rare_load=rare_variants%*%rare_effects

# Common variants (assume 0.1 effect size, 0.5 frequency)
common_variants <- matrix(rbinom(n * n_common_variants, 1, 0.5), ncol = n_common_variants) 
common_effects=t(matrix(rnorm(n_common_variants,mean = 0,1),ncol = n_common_variants))
common_effects=scale(common_effects)
## on the order of 10% for rare variant burden of heritability

common_load=common_variants%*%common_effects
# Calculate the genetic risk score for each individual as the sum of effects
genetic_risk_score <- common_load+rare_load

# Assuming genetic_risk_score is already calculated
# quantile normalizing the distribution of genetic risk score
genetic_risk_score_normalized <- scale(genetic_risk_score)

hist(genetic_risk_score)

Code

hist(genetic_risk_score_normalized)

Code

hist(rare_load)

Code

hist(common_load)

Code

## confrim that the individauls at the extreme of the normalized score also have highest rare burden
plot(genetic_risk_score_normalized,rare_load)

Let’s get some empirical Data

Here we grab event ages (minimum event) from UKB and show that the simulated distribtuion matches the actual distribution quite well when using weibull fit:

Code

# mi from ukb
mi=readRDS("~/Library/CloudStorage/Dropbox-Personal/mi.rds")
hist(mi$`min(age_diag)`,main="From UKB")

Code

summary(mi$`min(age_diag)`)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  31.50   59.40   66.60   65.72   72.80   86.20

Code

surv_obj <- Surv(mi$`min(age_diag)`)

# Fit the Weibull model
fit <- survreg(surv_obj ~ 1, dist = 'weibull')

# survreg scale parameter maps to 1/shape, linear predictor to log(scale) per 
## survreg.distributions {survival}

# Extract the shape and scale parameters
shape_parameter_ukb <- 1 / fit$scale
scale_parameter_ukb <- exp(coef(fit))

event_ages <- rweibull(n, shape = shape_parameter_ukb, scale = scale_parameter_ukb)

hist(event_ages,main="Using Weibull Simulation")

Adding genetics

Now, we’ll try and use these settings to adjust our simulations accordingly

Code

# Parameters for the Weibull distribution
shape_parameter <- shape_parameter_ukb  # Adjusted shape parameter for the desired 'J-shaped' distribution
#scale_base <- 55  # Base scale parameter around which to center the disease onset ages
# Adjust the scale parameter of the Weibull distribution based on the genetic risk score
# Higher genetic risk scores should lead to a lower scale parameter (earlier onset)
# Add a constant to ensure the scale parameter stays positive
 # Determines how strongly the risk score affects the scale
grm = 5
scale_parameter <- scale_parameter_ukb - grm*genetic_risk_score_normalized # Adjust the 10 multiplier as needed
# Ensure that the scale parameter does not go negative or too low, which could cause unrealistic ages
#scale_parameter[scale_parameter < 1] <- 1
# Simulate the disease onset ages
event_ages <- rweibull(n, shape = shape_parameter, scale = scale_parameter)
# Ensure that event ages are within a realistic range
min_age <- 20
max_age <- 100

variant_summary <- data.frame(
  event_age = round(event_ages),  # Round event ages to the nearest whole number for grouping
  # Count of common variants
  rare_burden = rare_load,  # Burden of rare variants
  common_burden = common_load,
  score=genetic_risk_score_normalized # Burden of common variants
)
#event_ages[event_ages > max_age] <- max_age
# Create a histogram to visualize the distribution of event ages
hist(event_ages[event_ages<100&event_ages>20], breaks = max_age - min_age, main = "Histogram of Event Ages", xlab = "Age", ylab = "Frequency")

Code

# You can also calculate and print summary statistics if needed
summary(event_ages)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  22.38   59.11   66.18   65.73   72.93  100.38

We’re not completely satisfied because we see that the distribtuion is more symmetric than we might hope. But let’s proceed with aggregate analyses.

Code

#Create a dataframe that includes the event age, counts, and burdens of variants for each individual}

# Aggregate the counts and burdens by event age
summary_by_age <- variant_summary%>%group_by(event_age)%>%
summarize(n=length(rare_burden),rare_burden=mean(rare_burden),common_burden=mean(common_burden),mean_score=mean(score))
# Rename the columns for clarity
names(summary_by_age) <- c("Age", "Event Count", "Rare Burden", "Common Burden", "Score")
# Print the summary table
#print(summary_by_age,n=60)
m=melt(summary_by_age,id.vars="Age")
ggplot(m,aes(Age,value,fill=variable))+geom_bar(stat="identity")+
facet_wrap(~variable,scales="free")+theme_classic()

Heritability like

Take the correlation between the normalized score and the binary phenotype of MI by age N

R2 is effectively the observed scaled narrow sense heritability.

Code

plot(sapply(c(40:70),function(x){cor(genetic_risk_score_normalized,ifelse(event_ages<x,1,0))}),xlab="Age",ylab="R2",pch=19)

Code

plot(x=40:70,sapply(c(40:70),function(x){cor(rare_load,ifelse(event_ages<x,1,0))}),xlab="Age",ylab="R2",pch=19,col="red",main="Rare Heritability",ylim=c(0,0.4))

points(x=40:70,sapply(c(40:70),function(x){cor(common_load,ifelse(event_ages<x,1,0))}),pch=19,col="blue")

Introducing non CAD competing events

In the normal data, we observe a plateau.

This plateau could indicate that there are competing risks present that affect the distribution of event ages. In survival analysis, competing risks occur when individuals are at risk of several different events and the occurrence of one event prevents the occurrence of the other event. This is common in older populations where the risk of death from non-CAD causes increases with age.

Here are some suggestions to account for competing risks in your simulation:

Competing Risks Model: Instead of using a single Weibull distribution, consider using a competing risks framework to simulate the time to CAD event and the time to other events separately. The cmprsk package in R can handle competing risks.
Subdistribution Hazard: Instead of focusing on the cause-specific hazard, which only considers the hazard for the event of interest, look at the subdistribution hazard, which takes into account the fact that some individuals will experience competing events. The survival package can fit subdistribution hazard models using the survreg function with the competing.risks argument.
Plateau Incorporation: Adjust your simulation to incorporate a plateau. For instance, you could simulate two separate age distributions: one for the initial rise and peak and another for the plateau, and then combine them.
Truncation or Censoring: You could also consider truncating the simulated distribution at an age where the plateau begins or treat the plateau as censoring due to competing risks.

To refine your simulation, you would take the following steps:

Simulate the non-CAD death times using another survival distribution or a constant rate of occurrence.
Generate CAD event times with the estimated Weibull parameters.
Combine the two sets of times, taking the minimum time for each individual as the observed time and noting the cause of the event.

Analyze the combined dataset considering the competing events.

Code

library(purrr)
n <- 10000  # The number of individuals to simulate
mean_non_cad_death_age <- 80  # The average age at non-CAD death
min_age_threshold <- 40  # The minimum realistic age for CAD onset


df_pheno=readRDS("~/Library/CloudStorage/Dropbox-Personal/pheno_dir/output/df_final_withddeath.rds")
# Simulate non-CAD death times with the corrected rate
non_cad_death_ages <- na.omit(df_pheno$Death_censor_age[df_pheno$Cad_0_Any==1])

fit_non_cad <- survreg(Surv(non_cad_death_ages) ~ 1, dist = 'weibull')

# Extract the shape and scale parameters
shape_non_cad <- 1 / fit_non_cad$scale
scale_non_cad <- exp(coef(fit_non_cad))


simulated_non_cad_death_times <- rweibull(n, shape = shape_non_cad, scale = scale_non_cad)

# Plot the histogram of the simulated non-CAD death times to compare with the empirical histogram
hist(simulated_non_cad_death_times, breaks = 100, main = "Simulated Non-CAD Death Times")

Code

combined_event_times <- pmin(event_ages, simulated_non_cad_death_times)


# Exclude unrealistic early deaths by setting a minimum age threshold
combined_event_times[combined_event_times < min_age_threshold] <- min_age_threshold

# Plot the distribution of the combined event times with the threshold
hist(combined_event_times, breaks = 50, main = "Histogram of Combined Event Times", xlim = c(min_age_threshold, 100))

make P (T>t| S(t)C*exp(k*t-1) — make 1/C a linear function of genetics
make weibull distribution with same shape parameter that yields mean age of interest and rate (1/Scale) = 1/population mean prevalence
1/2 both rate parameters to recover the desired behavior
- look up prevalence numbers for cancer and CAD