library(survival)library(dplyr)library(reshape2)library(dplyr)library(ggplot2)# Summarize by age# Define the number of individualsn <-10000n_rare_variants <-10# Number of rare variants per individualn_common_variants <-100# Number of common variants per individual# Simulate rare and common variant effects as beforeset.seed(123) # For reproducibility# Rare variants (assume 15 effect size, 1e-3 frequency)rare_variants <-matrix(rbinom(n * n_rare_variants, 1, 1e-3), ncol = n_rare_variants) rare_effects=t(matrix(rnorm(n_rare_variants,mean =15,0.5),ncol=n_rare_variants))rare_load=rare_variants%*%rare_effects# Common variants (assume 0.1 effect size, 0.5 frequency)common_variants <-matrix(rbinom(n * n_common_variants, 1, 0.5), ncol = n_common_variants) common_effects=t(matrix(rnorm(n_common_variants,mean =0,1),ncol = n_common_variants))common_effects=scale(common_effects)## on the order of 10% for rare variant burden of heritabilitycommon_load=common_variants%*%common_effects# Calculate the genetic risk score for each individual as the sum of effectsgenetic_risk_score <- common_load+rare_load# Assuming genetic_risk_score is already calculated# quantile normalizing the distribution of genetic risk scoregenetic_risk_score_normalized <-scale(genetic_risk_score)hist(genetic_risk_score)
Code
hist(genetic_risk_score_normalized)
Code
hist(rare_load)
Code
hist(common_load)
Code
## confrim that the individauls at the extreme of the normalized score also have highest rare burdenplot(genetic_risk_score_normalized,rare_load)
Let’s get some empirical Data
Here we grab event ages (minimum event) from UKB and show that the simulated distribtuion matches the actual distribution quite well when using weibull fit:
Code
# mi from ukbmi=readRDS("~/Library/CloudStorage/Dropbox-Personal/mi.rds")hist(mi$`min(age_diag)`,main="From UKB")
Code
summary(mi$`min(age_diag)`)
Min. 1st Qu. Median Mean 3rd Qu. Max.
31.50 59.40 66.60 65.72 72.80 86.20
Code
surv_obj <-Surv(mi$`min(age_diag)`)# Fit the Weibull modelfit <-survreg(surv_obj ~1, dist ='weibull')# survreg scale parameter maps to 1/shape, linear predictor to log(scale) per ## survreg.distributions {survival}# Extract the shape and scale parametersshape_parameter_ukb <-1/ fit$scalescale_parameter_ukb <-exp(coef(fit))event_ages <-rweibull(n, shape = shape_parameter_ukb, scale = scale_parameter_ukb)hist(event_ages,main="Using Weibull Simulation")
Adding genetics
Now, we’ll try and use these settings to adjust our simulations accordingly
Code
# Parameters for the Weibull distributionshape_parameter <- shape_parameter_ukb # Adjusted shape parameter for the desired 'J-shaped' distribution#scale_base <- 55 # Base scale parameter around which to center the disease onset ages# Adjust the scale parameter of the Weibull distribution based on the genetic risk score# Higher genetic risk scores should lead to a lower scale parameter (earlier onset)# Add a constant to ensure the scale parameter stays positive# Determines how strongly the risk score affects the scalegrm =5scale_parameter <- scale_parameter_ukb - grm*genetic_risk_score_normalized # Adjust the 10 multiplier as needed# Ensure that the scale parameter does not go negative or too low, which could cause unrealistic ages#scale_parameter[scale_parameter < 1] <- 1# Simulate the disease onset agesevent_ages <-rweibull(n, shape = shape_parameter, scale = scale_parameter)# Ensure that event ages are within a realistic rangemin_age <-20max_age <-100variant_summary <-data.frame(event_age =round(event_ages), # Round event ages to the nearest whole number for grouping# Count of common variantsrare_burden = rare_load, # Burden of rare variantscommon_burden = common_load,score=genetic_risk_score_normalized # Burden of common variants)#event_ages[event_ages > max_age] <- max_age# Create a histogram to visualize the distribution of event ageshist(event_ages[event_ages<100&event_ages>20], breaks = max_age - min_age, main ="Histogram of Event Ages", xlab ="Age", ylab ="Frequency")
Code
# You can also calculate and print summary statistics if neededsummary(event_ages)
Min. 1st Qu. Median Mean 3rd Qu. Max.
22.38 59.11 66.18 65.73 72.93 100.38
We’re not completely satisfied because we see that the distribtuion is more symmetric than we might hope. But let’s proceed with aggregate analyses.
Code
#Create a dataframe that includes the event age, counts, and burdens of variants for each individual}# Aggregate the counts and burdens by event agesummary_by_age <- variant_summary%>%group_by(event_age)%>%summarize(n=length(rare_burden),rare_burden=mean(rare_burden),common_burden=mean(common_burden),mean_score=mean(score))# Rename the columns for claritynames(summary_by_age) <-c("Age", "Event Count", "Rare Burden", "Common Burden", "Score")# Print the summary table#print(summary_by_age,n=60)m=melt(summary_by_age,id.vars="Age")ggplot(m,aes(Age,value,fill=variable))+geom_bar(stat="identity")+facet_wrap(~variable,scales="free")+theme_classic()
Heritability like
Take the correlation between the normalized score and the binary phenotype of MI by age N
R2 is effectively the observed scaled narrow sense heritability.
This plateau could indicate that there are competing risks present that affect the distribution of event ages. In survival analysis, competing risks occur when individuals are at risk of several different events and the occurrence of one event prevents the occurrence of the other event. This is common in older populations where the risk of death from non-CAD causes increases with age.
Here are some suggestions to account for competing risks in your simulation:
Competing Risks Model: Instead of using a single Weibull distribution, consider using a competing risks framework to simulate the time to CAD event and the time to other events separately. The cmprsk package in R can handle competing risks.
Subdistribution Hazard: Instead of focusing on the cause-specific hazard, which only considers the hazard for the event of interest, look at the subdistribution hazard, which takes into account the fact that some individuals will experience competing events. The survival package can fit subdistribution hazard models using the survreg function with the competing.risks argument.
Plateau Incorporation: Adjust your simulation to incorporate a plateau. For instance, you could simulate two separate age distributions: one for the initial rise and peak and another for the plateau, and then combine them.
Truncation or Censoring: You could also consider truncating the simulated distribution at an age where the plateau begins or treat the plateau as censoring due to competing risks.
To refine your simulation, you would take the following steps:
Simulate the non-CAD death times using another survival distribution or a constant rate of occurrence.
Generate CAD event times with the estimated Weibull parameters.
Combine the two sets of times, taking the minimum time for each individual as the observed time and noting the cause of the event.
Analyze the combined dataset considering the competing events.
Code
library(purrr)n <-10000# The number of individuals to simulatemean_non_cad_death_age <-80# The average age at non-CAD deathmin_age_threshold <-40# The minimum realistic age for CAD onsetdf_pheno=readRDS("~/Library/CloudStorage/Dropbox-Personal/pheno_dir/output/df_final_withddeath.rds")# Simulate non-CAD death times with the corrected ratenon_cad_death_ages <-na.omit(df_pheno$Death_censor_age[df_pheno$Cad_0_Any==1])fit_non_cad <-survreg(Surv(non_cad_death_ages) ~1, dist ='weibull')# Extract the shape and scale parametersshape_non_cad <-1/ fit_non_cad$scalescale_non_cad <-exp(coef(fit_non_cad))simulated_non_cad_death_times <-rweibull(n, shape = shape_non_cad, scale = scale_non_cad)# Plot the histogram of the simulated non-CAD death times to compare with the empirical histogramhist(simulated_non_cad_death_times, breaks =100, main ="Simulated Non-CAD Death Times")
Code
combined_event_times <-pmin(event_ages, simulated_non_cad_death_times)# Exclude unrealistic early deaths by setting a minimum age thresholdcombined_event_times[combined_event_times < min_age_threshold] <- min_age_threshold# Plot the distribution of the combined event times with the thresholdhist(combined_event_times, breaks =50, main ="Histogram of Combined Event Times", xlim =c(min_age_threshold, 100))
make P (T>t| S(t)C*exp(k*t-1) — make 1/C a linear function of genetics
make weibull distribution with same shape parameter that yields mean age of interest and rate (1/Scale) = 1/population mean prevalence
1/2 both rate parameters to recover the desired behavior