Investigations into grandchildren

Use the first generation from the multigen simulations to get grandchildren information

require(TwoSampleMR)
## Loading required package: TwoSampleMR
## TwoSampleMR version 0.5.6 
## [>] New: Option to use non-European LD reference panels for clumping etc
## [>] Some studies temporarily quarantined to verify effect allele
## [>] See news(package='TwoSampleMR') and https://gwas.mrcieu.ac.uk for further details
require(jsonlite)
## Loading required package: jsonlite
config <- read_json("~/config.json")
mr_sel_path <- config$mr_sel_path

Get data

dat<-read.table(paste0(mr_sel_path,"/Simulations/children/multigen/selStop_sim_1/dat_gc_0.txt"),header = TRUE)
hist(dat$grandchildren)

MR analysis for children and grandchildren from all sims over all generations

b_c_mat<-matrix(NA,100,19)
se_c_mat<-matrix(NA,100,19)
b_gc_mat<-matrix(NA,100,19)
se_gc_mat<-matrix(NA,100,19)
for (s in 1:100){
  setwd(paste0(mr_sel_path,"/Simulations/children/multigen/selStop_sim_",s))
  for (g in 0:18){
    dat<-read.table(paste0("dat_gc_",g,".txt"),header = TRUE)
    mod1<-summary(lm(dat$pheno~dat$geno))
    bgx<-mod1$coefficients[2,1]
    segx<-mod1$coefficients[2,2]
    mod2<-summary(glm(dat$children~dat$geno,family=poisson("log")))
    bgy<-mod2$coefficients[2,1]
    segy<-mod2$coefficients[2,2]
    mrw<-mr_wald_ratio(bgx,bgy,segx,segy)
    mod3<-summary(glm(dat$grandchildren~dat$geno,family=poisson("log")))
    bgygc<-mod3$coefficients[2,1]
    segygc<-mod3$coefficients[2,2]
    mrwgc<-mr_wald_ratio(bgx,bgygc,segx,segygc)
    b_c_mat[s,g+1]<-mrw$b
    se_c_mat[s,g+1]<-mrw$se
    b_gc_mat[s,g+1]<-mrwgc$b
    se_gc_mat[s,g+1]<-mrwgc$se
  }
}

box plot for children

boxplot(b_c_mat,xlab="Generations",ylab="Estimated selection strength")
abline(h=0.05,lty=2,col="red")
abline(h=0,lty=2,col="blue")

box plot for grandchildren

boxplot(b_gc_mat,xlab="Generations",ylab="Estimated selection strength")
abline(h=0.05*3/2,lty=2,col="green")
abline(h=0.05,lty=2,col="red")
abline(h=0,lty=2,col="blue")

Generation means

apply(b_gc_mat,2,mean)
##  [1]  0.0727543935  0.0737042107  0.0759795303  0.0767918777  0.0759407112
##  [6]  0.0746484503  0.0749540913  0.0760197467  0.0759419083  0.0513136685
## [11]  0.0015129301  0.0001732289 -0.0016388924 -0.0008433544  0.0004247945
## [16]  0.0008141813  0.0023013654  0.0010771072  0.0010805496

The selection strength is estimated as 1.5 times the actual selection strength when using grandchildren. This is because the beneficial allele is passed down through multiple generations.

If the selection strength is b, and the beneficial allele is a:

In generation 0 (grandparents):

AA: Fitness = 1

Aa: Fitness = 1 + b

aa: Fitness = 1 + 2b

Difference between the groups is b. This is the additional fitness for children for each allele.

The fitness in terms of grandchildren per allele also depends on the likelihood of passing on the beneficial allele:

AA: Fitness = 1 + 0 (no chance of passing on allele a) = 1

Aa: Fitness = 1 + b + 0.5*b (half a chance of passing on allele a) = 1 + 1.5b

aa: Fitness = 1 + 2b + b (Definitely will pass on allele a) = 1 + 3b

Difference between the groups is 1.5b. This is the additional fitness for grandchildren for each allele.

Is the estimate using grandparents more precise?

box<-list()
for(i in 1:19){
 box[[2*i-1]]<-se_c_mat[,i]
 box[[2*i]]<-se_gc_mat[,i]
}
boxplot(box,col=c("red","blue"),ylim=c(0.005,0.011),ylab="Standard error",xlab="Generation",xaxt="n")
axis(1,at=seq(1.5,37.5,2),1:19)
legend("bottom",horiz = TRUE,legend=c("child","grandchild"),fill=c("red","blue"))

The standard error is much smaller in each generation for the grandchild estimates, meaning the precision is greater. However, number of grandchildren is hard to ascertain.

Just grandparents with at least one child

Can we use imputed parents for the UKBioBank participants? Will this bias the data as we will only have information for imputed parents who have had at least one child (the UKBioBank participant)

b_c_mat_1<-matrix(NA,100,19)
se_c_mat_1<-matrix(NA,100,19)
b_gc_mat_1<-matrix(NA,100,19)
se_gc_mat_1<-matrix(NA,100,19)
for (s in 1:100){
  setwd(paste0(mr_sel_path,"/Simulations/children/multigen/selStop_sim_",s))
  for (g in 0:18){
    dat<-read.table(paste0("dat_gc_",g,".txt"),header = TRUE)
    dat<-dat[dat$children>0,]
    mod1<-summary(lm(dat$pheno~dat$geno))
    bgx<-mod1$coefficients[2,1]
    segx<-mod1$coefficients[2,2]
    mod2<-summary(glm(dat$children~dat$geno,family=poisson("log")))
    bgy<-mod2$coefficients[2,1]
    segy<-mod2$coefficients[2,2]
    mrw<-mr_wald_ratio(bgx,bgy,segx,segy)
    mod3<-summary(glm(dat$grandchildren~dat$geno,family=poisson("log")))
    bgygc<-mod3$coefficients[2,1]
    segygc<-mod3$coefficients[2,2]
    mrwgc<-mr_wald_ratio(bgx,bgygc,segx,segygc)
    b_c_mat_1[s,g+1]<-mrw$b
    se_c_mat_1[s,g+1]<-mrw$se
    b_gc_mat_1[s,g+1]<-mrwgc$b
    se_gc_mat_1[s,g+1]<-mrwgc$se
  }
}

box plot for grandchildren where they had at least one child

boxplot(b_gc_mat_1,xlab="Generations",ylab="Estimated selection strength")
abline(h=0.05*3/2,lty=2,col="green")
abline(h=0.05,lty=2,col="red")
abline(h=0,lty=2,col="blue")

Generation means

apply(b_gc_mat_1,2,mean)
##  [1]  5.828273e-02  5.778351e-02  6.006259e-02  6.169668e-02  6.026291e-02
##  [6]  5.895509e-02  5.927791e-02  6.086045e-02  6.048984e-02  3.522530e-02
## [11]  1.907662e-03 -7.578781e-05 -1.578331e-03 -3.968375e-04  6.082113e-04
## [16]  4.037947e-04  2.485320e-03  6.729823e-04  1.033728e-03

Estimates using parents with at least one child

boxplot(b_c_mat_1,xlab="Generations",ylab="Estimated selection strength")
abline(h=0.05,lty=2,col="red")
abline(h=0,lty=2,col="blue")

Generation means

apply(b_c_mat_1,2,mean)
##  [1]  3.411025e-02  3.203680e-02  3.444164e-02  3.613108e-02  3.498404e-02
##  [6]  3.337685e-02  3.388208e-02  3.582826e-02  3.572761e-02  3.439482e-02
## [11]  1.083901e-03  2.169723e-04 -1.245377e-03 -3.336023e-05  7.042251e-04
## [16]  6.263430e-05  1.556128e-03  1.095192e-03  7.243536e-04

This shows the result would be biased without including grandparents with no children.