Packages

Load the systemfit and TwoSampleMR packages

require(systemfit)
## Loading required package: systemfit
## Loading required package: Matrix
## Loading required package: car
## Loading required package: carData
## Loading required package: lmtest
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Please cite the 'systemfit' package as:
## Arne Henningsen and Jeff D. Hamann (2007). systemfit: A Package for Estimating Systems of Simultaneous Equations in R. Journal of Statistical Software 23(4), 1-40. http://www.jstatsoft.org/v23/i04/.
## 
## If you have questions, suggestions, or comments regarding the 'systemfit' package, please use a forum or 'tracker' at systemfit's R-Forge site:
## https://r-forge.r-project.org/projects/systemfit/
require(TwoSampleMR)
## Loading required package: TwoSampleMR
## TwoSampleMR version 0.5.6 
## [>] New: Option to use non-European LD reference panels for clumping etc
## [>] Some studies temporarily quarantined to verify effect allele
## [>] See news(package='TwoSampleMR') and https://gwas.mrcieu.ac.uk for further details
require(jsonlite)
## Loading required package: jsonlite

Get file path

config <- read_json("~/config.json")
mr_sel_path <- config$mr_sel_path

Basic simulation

Read in the Data

Get the SLiM data from the “children” simualation output.txt and put it into the “dat” data frame. Get the parent ID for all the children in the next generation from parent.txt and put it into the par data frame

dat<-read.table(paste0(mr_sel_path,"/Simulations/children/dat.txt"),header = TRUE)

Check the fitness of the individuals is as expected (0.05 higher for each allele)

tapply(log(dat$fit),dat$geno,mean)
##         0         1         2 
## 0.9986081 1.0490366 1.0996051

Get number of children

Look at the mean number of children per genotype

tapply(dat$children,dat$geno,mean)
##        0        1        2 
## 1.891735 2.028752 2.117777

Calculate fitness estimate using 2SLS method in systemfit

sf<-systemfit(children~pheno,inst = ~geno,method = "2SLS",data=dat)
summary(sf)
## 
## systemfit results 
## method: 2SLS 
## 
##            N   DF   SSR detRCov  OLS-R2 McElroy-R2
## system 10000 9998 19949  1.9953 0.00978    0.00978
## 
##         N   DF   SSR    MSE    RMSE      R2   Adj R2
## eq1 10000 9998 19949 1.9953 1.41255 0.00978 0.009681
## 
## The covariance matrix of the residuals
##        eq1
## eq1 1.9953
## 
## The correlations of the residuals
##     eq1
## eq1   1
## 
## 
## 2SLS estimates for 'eq1' (equation 1)
## Model Formula: children ~ pheno
## Instruments: ~geno
## 
##             Estimate Std. Error  t value   Pr(>|t|)    
## (Intercept) 1.903950   0.021843 87.16509 < 2.22e-16 ***
## pheno       0.115444   0.020025  5.76499 8.4084e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.41255 on 9998 degrees of freedom
## Number of observations: 10000 Degrees of Freedom: 9998 
## SSR: 19948.970679 MSE: 1.995296 Root MSE: 1.41255 
## Multiple R-Squared: 0.00978 Adjusted R-Squared: 0.009681

Is 2SLS the best way to do this?

This estimate could be biased due to the distribution of the number of children. Here is a bar plot of the number of children in the dataset:

barplot(table(dat$children),main="Number of children",xlab="Number of children",ylab="Frequency")

Does the number of children follow a Poisson distribution?

Are the mean and variance similar?

mean(dat$children)
## [1] 2
var(dat$children)
## [1] 2.014801

Here is the graph again with the poisson distribution overlayed on top/next to it:

c1 <- rgb(173,216,230,max = 255, alpha = 80, names = "lt.blue")
c2 <- rgb(255,192,203, max = 255, alpha = 80, names = "lt.pink")
barplot(table(dat$children),main="Number of children",xlab="Number of children",ylab="Frequency",col=c1)
pois<-dpois(0:10, lambda=2)*10000
barplot(pois,col=c2,add=TRUE)
legend("topright",legend=c("Observed","Poisson"),fill=c(c1,c2))

barplot(rbind(table(dat$children),pois),beside=TRUE,main="Number of children",xlab="Number of children",ylab="Frequency",col=c("black","red"))
## Warning in rbind(table(dat$children), pois): number of columns of result is not
## a multiple of vector length (arg 2)
legend("topright",legend=c("Observed","Poisson"),fill=c("black","red"))

This looks close. More formally, a chi-squared test can be used to see if the data is significantly different than expected given the Poisson distribution with = 2. Any values greater than 7 have been grouped.

chiChild<-as.vector(table(dat$children))
chiChild<-c(chiChild[1:8],sum(chiChild[9:length(chiChild)]))
chiPois<-dpois(0:7, lambda=2)*10000
chiPois<-c(chiPois,10000-sum(chiPois))

chisq.test(rbind(chiChild,chiPois))
## 
##  Pearson's Chi-squared test
## 
## data:  rbind(chiChild, chiPois)
## X-squared = 4.6452, df = 8, p-value = 0.7947

The chi-sqaured test did not show evidence to reject the null hypothesis (that the distributions are not different)

Two Sample MR method

Firstly, start by running using a normal linear model:

mod1<-summary(lm(dat$pheno~dat$geno))
bgx<-mod1$coefficients[2,1]
segx<-mod1$coefficients[2,2]
mod2<-summary(lm(dat$children~dat$geno))
bgy<-mod2$coefficients[2,1]
segy<-mod2$coefficients[2,2]
mr_wald_ratio(bgx,bgy,segx,segy)
## $b
## [1] 0.1154437
## 
## $se
## [1] 0.02009047
## 
## $pval
## [1] 9.127578e-09
## 
## $nsnp
## [1] 1

This returns the same estimate for iv as in the 2SLS method.

Now try changing the lm to a glm with a Poisson family distribution:

mod1<-summary(lm(dat$pheno~dat$geno))
bgx<-mod1$coefficients[2,1]
segx<-mod1$coefficients[2,2]
mod2<-summary(glm(dat$children~dat$geno,family=poisson("log")))
bgy<-mod2$coefficients[2,1]
segy<-mod2$coefficients[2,2]
mr_wald_ratio(bgx,bgy,segx,segy)
## $b
## [1] 0.05749412
## 
## $se
## [1] 0.009986124
## 
## $pval
## [1] 8.54189e-09
## 
## $nsnp
## [1] 1

Simulation with large n

setwd("~/Documents/MR-and-selection/Simulations/children/children_large_n/")
large_dat<-read.table("dat.txt",header = TRUE)

Check the fitness of the individuals is as expected (0.05 higher for each allele)

tapply(log(large_dat$fit),large_dat$geno,mean)
##         0         1         2 
## 0.9999216 1.0498728 1.0997450

Get number of children

Look at the mean number of children per genotype

tapply(large_dat$children,large_dat$geno,mean)
##        0        1        2 
## 1.918382 2.014508 2.112193

Calculate fitness estimate using 2SLS method in systemfit

sf<-systemfit(children~pheno,inst = ~geno,method = "2SLS",data=large_dat)
summary(sf)
## Warning in nObsEq[i] * nObsEq[j]: NAs produced by integer overflow
## 
## systemfit results 
## method: 2SLS 
## 
##            N     DF     SSR detRCov   OLS-R2 McElroy-R2
## system 1e+06 999998 1999624 1.99963 0.007547         NA
## 
##         N     DF     SSR     MSE    RMSE       R2   Adj R2
## eq1 1e+06 999998 1999624 1.99963 1.41408 0.007547 0.007546
## 
## The covariance matrix of the residuals
##         eq1
## eq1 1.99963
## 
## The correlations of the residuals
##     eq1
## eq1   1
## 
## 
## 2SLS estimates for 'eq1' (equation 1)
## Model Formula: children ~ pheno
## Instruments: ~geno
## 
##               Estimate Std. Error  t value   Pr(>|t|)    
## (Intercept) 1.91825324 0.00221696 865.2613 < 2.22e-16 ***
## pheno       0.09691848 0.00202431  47.8772 < 2.22e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.414082 on 999998 degrees of freedom
## Number of observations: 1e+06 Degrees of Freedom: 999998 
## SSR: 1999624.204212 MSE: 1.999628 Root MSE: 1.414082 
## Multiple R-Squared: 0.007547 Adjusted R-Squared: 0.007546

Firstly, start by running using a normal linear model:

mod1<-summary(lm(large_dat$pheno~large_dat$geno))
bgx<-mod1$coefficients[2,1]
segx<-mod1$coefficients[2,2]
mod2<-summary(lm(large_dat$children~large_dat$geno))
bgy<-mod2$coefficients[2,1]
segy<-mod2$coefficients[2,2]
mr_wald_ratio(bgx,bgy,segx,segy)
## $b
## [1] 0.09691848
## 
## $se
## [1] 0.002029683
## 
## $pval
## [1] 0
## 
## $nsnp
## [1] 1

Now try changing the lm to a glm with a Poisson family distribution:

mod1<-summary(lm(large_dat$pheno~large_dat$geno))
bgx<-mod1$coefficients[2,1]
segx<-mod1$coefficients[2,2]
mod2<-summary(glm(large_dat$children~large_dat$geno,family=poisson("log")))
bgy<-mod2$coefficients[2,1]
segy<-mod2$coefficients[2,2]
mr_wald_ratio(bgx,bgy,segx,segy)
## $b
## [1] 0.04828811
## 
## $se
## [1] 0.001008775
## 
## $pval
## [1] 0
## 
## $nsnp
## [1] 1
child_geno_prop<-prop.table(table(large_dat$geno,large_dat$children),1)
plot(child_geno_prop[1,],type="l",xlab="Number of children",ylab="Proportion")
lines(child_geno_prop[2,],col="blue")
lines(child_geno_prop[3,],col="red")
legend("topright",legend=c(0,1,2),col=c("black","blue","red"),lty=1,title = "Genotype")

Multiple simulations

For each simulation, extract the final wald ratio

setwd("~/Documents/MR-and-selection/Simulations/children/multisim/")
effectEsts<-rep(0,1000)
standardErrs<-rep(0,1000)
for (i in 1:1000){
  temp_dat<-read.table(paste0("sim_",i,"/dat.txt"),header = TRUE)
  mod1<-summary(lm(temp_dat$pheno~temp_dat$geno))
  bgx<-mod1$coefficients[2,1]
  segx<-mod1$coefficients[2,2]
  mod2<-summary(glm(temp_dat$children~temp_dat$geno,family=poisson("log")))
  bgy<-mod2$coefficients[2,1]
  segy<-mod2$coefficients[2,2]
  wr<-mr_wald_ratio(bgx,bgy,segx,segy)
  effectEsts[i]<-wr$b
  standardErrs[i]<-wr$se
}

Summarise causal effect estimates

summary(effectEsts)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.01877 0.04354 0.05016 0.05026 0.05712 0.08096

Histogram of effect estimates

hist(effectEsts, main="Effect estimates over 1,000 simulations")

Distribution of standard errors

hist(standardErrs)

Multiple simulations with linear model

For each simulation, extract the final wald ratio

setwd("~/Documents/MR-and-selection/Simulations/children/multisim/")
effectEsts<-rep(0,1000)
standardErrs<-rep(0,1000)
for (i in 1:1000){
  temp_dat<-read.table(paste0("sim_",i,"/dat.txt"),header = TRUE)
  mod1<-summary(lm(temp_dat$pheno~temp_dat$geno))
  bgx<-mod1$coefficients[2,1]
  segx<-mod1$coefficients[2,2]
  mod2<-summary(lm(temp_dat$children~temp_dat$geno))
  bgy<-mod2$coefficients[2,1]
  segy<-mod2$coefficients[2,2]
  wr<-mr_wald_ratio(bgx,bgy,segx,segy)
  effectEsts[i]<-wr$b
  standardErrs[i]<-wr$se
}

Summarise causal effect estimates

Divide by mean number of children

effectEsts<-effectEsts/2
summary(effectEsts)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.01879 0.04369 0.05034 0.05045 0.05735 0.08147

Histogram of effect estimates

hist(effectEsts, main="Effect estimates over 1,000 simulations")

Distribution of standard errors

hist(standardErrs)

Effect estimate can be recovered using a standard lm model rather than a glm, however the standard errors are larger.