Load the systemfit and TwoSampleMR packages
require(systemfit)
## Loading required package: systemfit
## Loading required package: Matrix
## Loading required package: car
## Loading required package: carData
## Loading required package: lmtest
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Please cite the 'systemfit' package as:
## Arne Henningsen and Jeff D. Hamann (2007). systemfit: A Package for Estimating Systems of Simultaneous Equations in R. Journal of Statistical Software 23(4), 1-40. http://www.jstatsoft.org/v23/i04/.
##
## If you have questions, suggestions, or comments regarding the 'systemfit' package, please use a forum or 'tracker' at systemfit's R-Forge site:
## https://r-forge.r-project.org/projects/systemfit/
require(TwoSampleMR)
## Loading required package: TwoSampleMR
## TwoSampleMR version 0.5.6
## [>] New: Option to use non-European LD reference panels for clumping etc
## [>] Some studies temporarily quarantined to verify effect allele
## [>] See news(package='TwoSampleMR') and https://gwas.mrcieu.ac.uk for further details
require(jsonlite)
## Loading required package: jsonlite
Get file path
config <- read_json("~/config.json")
mr_sel_path <- config$mr_sel_path
Get the SLiM data from the “children” simualation output.txt and put it into the “dat” data frame. Get the parent ID for all the children in the next generation from parent.txt and put it into the par data frame
dat<-read.table(paste0(mr_sel_path,"/Simulations/children/dat.txt"),header = TRUE)
tapply(log(dat$fit),dat$geno,mean)
## 0 1 2
## 0.9986081 1.0490366 1.0996051
Look at the mean number of children per genotype
tapply(dat$children,dat$geno,mean)
## 0 1 2
## 1.891735 2.028752 2.117777
sf<-systemfit(children~pheno,inst = ~geno,method = "2SLS",data=dat)
summary(sf)
##
## systemfit results
## method: 2SLS
##
## N DF SSR detRCov OLS-R2 McElroy-R2
## system 10000 9998 19949 1.9953 0.00978 0.00978
##
## N DF SSR MSE RMSE R2 Adj R2
## eq1 10000 9998 19949 1.9953 1.41255 0.00978 0.009681
##
## The covariance matrix of the residuals
## eq1
## eq1 1.9953
##
## The correlations of the residuals
## eq1
## eq1 1
##
##
## 2SLS estimates for 'eq1' (equation 1)
## Model Formula: children ~ pheno
## Instruments: ~geno
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.903950 0.021843 87.16509 < 2.22e-16 ***
## pheno 0.115444 0.020025 5.76499 8.4084e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.41255 on 9998 degrees of freedom
## Number of observations: 10000 Degrees of Freedom: 9998
## SSR: 19948.970679 MSE: 1.995296 Root MSE: 1.41255
## Multiple R-Squared: 0.00978 Adjusted R-Squared: 0.009681
This estimate could be biased due to the distribution of the number of children. Here is a bar plot of the number of children in the dataset:
barplot(table(dat$children),main="Number of children",xlab="Number of children",ylab="Frequency")
Are the mean and variance similar?
mean(dat$children)
## [1] 2
var(dat$children)
## [1] 2.014801
Here is the graph again with the poisson distribution overlayed on top/next to it:
c1 <- rgb(173,216,230,max = 255, alpha = 80, names = "lt.blue")
c2 <- rgb(255,192,203, max = 255, alpha = 80, names = "lt.pink")
barplot(table(dat$children),main="Number of children",xlab="Number of children",ylab="Frequency",col=c1)
pois<-dpois(0:10, lambda=2)*10000
barplot(pois,col=c2,add=TRUE)
legend("topright",legend=c("Observed","Poisson"),fill=c(c1,c2))
barplot(rbind(table(dat$children),pois),beside=TRUE,main="Number of children",xlab="Number of children",ylab="Frequency",col=c("black","red"))
## Warning in rbind(table(dat$children), pois): number of columns of result is not
## a multiple of vector length (arg 2)
legend("topright",legend=c("Observed","Poisson"),fill=c("black","red"))
This looks close. More formally, a chi-squared test can be used to see if the data is significantly different than expected given the Poisson distribution with = 2. Any values greater than 7 have been grouped.
chiChild<-as.vector(table(dat$children))
chiChild<-c(chiChild[1:8],sum(chiChild[9:length(chiChild)]))
chiPois<-dpois(0:7, lambda=2)*10000
chiPois<-c(chiPois,10000-sum(chiPois))
chisq.test(rbind(chiChild,chiPois))
##
## Pearson's Chi-squared test
##
## data: rbind(chiChild, chiPois)
## X-squared = 4.6452, df = 8, p-value = 0.7947
The chi-sqaured test did not show evidence to reject the null hypothesis (that the distributions are not different)
Firstly, start by running using a normal linear model:
mod1<-summary(lm(dat$pheno~dat$geno))
bgx<-mod1$coefficients[2,1]
segx<-mod1$coefficients[2,2]
mod2<-summary(lm(dat$children~dat$geno))
bgy<-mod2$coefficients[2,1]
segy<-mod2$coefficients[2,2]
mr_wald_ratio(bgx,bgy,segx,segy)
## $b
## [1] 0.1154437
##
## $se
## [1] 0.02009047
##
## $pval
## [1] 9.127578e-09
##
## $nsnp
## [1] 1
This returns the same estimate for iv as in the 2SLS method.
Now try changing the lm to a glm with a Poisson family distribution:
mod1<-summary(lm(dat$pheno~dat$geno))
bgx<-mod1$coefficients[2,1]
segx<-mod1$coefficients[2,2]
mod2<-summary(glm(dat$children~dat$geno,family=poisson("log")))
bgy<-mod2$coefficients[2,1]
segy<-mod2$coefficients[2,2]
mr_wald_ratio(bgx,bgy,segx,segy)
## $b
## [1] 0.05749412
##
## $se
## [1] 0.009986124
##
## $pval
## [1] 8.54189e-09
##
## $nsnp
## [1] 1
setwd("~/Documents/MR-and-selection/Simulations/children/children_large_n/")
large_dat<-read.table("dat.txt",header = TRUE)
tapply(log(large_dat$fit),large_dat$geno,mean)
## 0 1 2
## 0.9999216 1.0498728 1.0997450
Look at the mean number of children per genotype
tapply(large_dat$children,large_dat$geno,mean)
## 0 1 2
## 1.918382 2.014508 2.112193
sf<-systemfit(children~pheno,inst = ~geno,method = "2SLS",data=large_dat)
summary(sf)
## Warning in nObsEq[i] * nObsEq[j]: NAs produced by integer overflow
##
## systemfit results
## method: 2SLS
##
## N DF SSR detRCov OLS-R2 McElroy-R2
## system 1e+06 999998 1999624 1.99963 0.007547 NA
##
## N DF SSR MSE RMSE R2 Adj R2
## eq1 1e+06 999998 1999624 1.99963 1.41408 0.007547 0.007546
##
## The covariance matrix of the residuals
## eq1
## eq1 1.99963
##
## The correlations of the residuals
## eq1
## eq1 1
##
##
## 2SLS estimates for 'eq1' (equation 1)
## Model Formula: children ~ pheno
## Instruments: ~geno
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.91825324 0.00221696 865.2613 < 2.22e-16 ***
## pheno 0.09691848 0.00202431 47.8772 < 2.22e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.414082 on 999998 degrees of freedom
## Number of observations: 1e+06 Degrees of Freedom: 999998
## SSR: 1999624.204212 MSE: 1.999628 Root MSE: 1.414082
## Multiple R-Squared: 0.007547 Adjusted R-Squared: 0.007546
Firstly, start by running using a normal linear model:
mod1<-summary(lm(large_dat$pheno~large_dat$geno))
bgx<-mod1$coefficients[2,1]
segx<-mod1$coefficients[2,2]
mod2<-summary(lm(large_dat$children~large_dat$geno))
bgy<-mod2$coefficients[2,1]
segy<-mod2$coefficients[2,2]
mr_wald_ratio(bgx,bgy,segx,segy)
## $b
## [1] 0.09691848
##
## $se
## [1] 0.002029683
##
## $pval
## [1] 0
##
## $nsnp
## [1] 1
Now try changing the lm to a glm with a Poisson family distribution:
mod1<-summary(lm(large_dat$pheno~large_dat$geno))
bgx<-mod1$coefficients[2,1]
segx<-mod1$coefficients[2,2]
mod2<-summary(glm(large_dat$children~large_dat$geno,family=poisson("log")))
bgy<-mod2$coefficients[2,1]
segy<-mod2$coefficients[2,2]
mr_wald_ratio(bgx,bgy,segx,segy)
## $b
## [1] 0.04828811
##
## $se
## [1] 0.001008775
##
## $pval
## [1] 0
##
## $nsnp
## [1] 1
child_geno_prop<-prop.table(table(large_dat$geno,large_dat$children),1)
plot(child_geno_prop[1,],type="l",xlab="Number of children",ylab="Proportion")
lines(child_geno_prop[2,],col="blue")
lines(child_geno_prop[3,],col="red")
legend("topright",legend=c(0,1,2),col=c("black","blue","red"),lty=1,title = "Genotype")
setwd("~/Documents/MR-and-selection/Simulations/children/multisim/")
effectEsts<-rep(0,1000)
standardErrs<-rep(0,1000)
for (i in 1:1000){
temp_dat<-read.table(paste0("sim_",i,"/dat.txt"),header = TRUE)
mod1<-summary(lm(temp_dat$pheno~temp_dat$geno))
bgx<-mod1$coefficients[2,1]
segx<-mod1$coefficients[2,2]
mod2<-summary(glm(temp_dat$children~temp_dat$geno,family=poisson("log")))
bgy<-mod2$coefficients[2,1]
segy<-mod2$coefficients[2,2]
wr<-mr_wald_ratio(bgx,bgy,segx,segy)
effectEsts[i]<-wr$b
standardErrs[i]<-wr$se
}
summary(effectEsts)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.01877 0.04354 0.05016 0.05026 0.05712 0.08096
hist(effectEsts, main="Effect estimates over 1,000 simulations")
hist(standardErrs)
setwd("~/Documents/MR-and-selection/Simulations/children/multisim/")
effectEsts<-rep(0,1000)
standardErrs<-rep(0,1000)
for (i in 1:1000){
temp_dat<-read.table(paste0("sim_",i,"/dat.txt"),header = TRUE)
mod1<-summary(lm(temp_dat$pheno~temp_dat$geno))
bgx<-mod1$coefficients[2,1]
segx<-mod1$coefficients[2,2]
mod2<-summary(lm(temp_dat$children~temp_dat$geno))
bgy<-mod2$coefficients[2,1]
segy<-mod2$coefficients[2,2]
wr<-mr_wald_ratio(bgx,bgy,segx,segy)
effectEsts[i]<-wr$b
standardErrs[i]<-wr$se
}
Divide by mean number of children
effectEsts<-effectEsts/2
summary(effectEsts)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.01879 0.04369 0.05034 0.05045 0.05735 0.08147
hist(effectEsts, main="Effect estimates over 1,000 simulations")
hist(standardErrs)
Effect estimate can be recovered using a standard lm model rather than a glm, however the standard errors are larger.