The liability scale follows a standard linear model

\[ l_i = \beta_0 + \beta_1 g_{1,i} + \beta_2 g_{2,i} + e_i \] This is transformed to the expected data scale

\[ \nu_i = g^{-1}(l_i) \] where the function \(g()\) is the link function, which in the Poisson case is the natural \(log\). Therefore, the expected data scale will be the exponential transformation of the liability.

Finally, the actual number of counts is a random sample drawn with

\[ z_i = D(\nu_i) \]

Create 1000 genotypes for 10000 individuals

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

nid <- 10000
nsnp <- 1000
af <- runif(nsnp)
h2 <- 0.8
g <- lapply(1:nsnp, function(x) rbinom(nid, 2, af[x])) %>% do.call(cbind, .)

Generate true effect sizes for a trait with some known heritability

sigmag <- sqrt(2 * af * (1-af))
b <- rnorm(nsnp, mean=0, sd=sigmag)

Scale effects to be on a liability scale that will result in an exponential distribution of mean 2 and variance 2

\[ \mu_l = ln \left (\frac{\mu_\nu^2}{\sqrt{\mu_\nu^2+\sigma_\nu^2}} \right ) - \sum^M 2p_j\beta_j \] and

\[ \sigma^2_l = ln \left ( 1 + \frac{\sigma^2_\nu}{\mu^2_\nu} \right ) \]

sigmal <- sqrt(log(1+2^2/2^2))

vg <- sum(af * 2 * (1 - af) * b^2)
ve <- (vg - h2 * vg)/h2
vy <- vg + ve
b <- b/sqrt(vy) * sigmal
hist(b)

sd(b)

## [1] 0.03742696

mu <- log(2^2 / sqrt(2^2 + 2^2)) - sum(2*af*b)

Generate liability

gbv <- g %*% b
l <- mu + gbv + rnorm(nid, 0, sqrt(sigmal^2 * (1-h2)))
cor(gbv, l)^2

##           [,1]
## [1,] 0.8000013

nu <- exp(l)
mean(nu)

## [1] 2.027862

sd(nu)

## [1] 1.994783

hist(nu, breaks=100)

Transform to observed distribution

Sample counts

y <- rpois(nid, nu)
barplot(table(y))

mod1 <- lm(y ~ g)
mod2 <- lm(l ~ g)
plot(coefficients((mod1))[-1] ~ b)

plot(coefficients((mod2))[-1] ~ b)

summary(lm(coefficients(mod1)[-1] ~ b))

## 
## Call:
## lm(formula = coefficients(mod1)[-1] ~ b)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.40633 -0.02548  0.00102  0.02716  0.85045 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.001847   0.001844  -1.002    0.317    
## b            2.041119   0.049269  41.428   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.05828 on 997 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.6325, Adjusted R-squared:  0.6322 
## F-statistic:  1716 on 1 and 997 DF,  p-value: < 2.2e-16

simulate <- function(h2, nsnp, nid, nkids)
{
  af <- runif(nsnp)
  g <- lapply(1:nsnp, function(x) rbinom(nid, 2, af[x])) %>% do.call(cbind, .)
  sigmag <- sqrt(2 * af * (1-af))
  b <- rnorm(nsnp, mean=0, sd=sigmag)
  vg <- sum(af * 2 * (1 - af) * b^2)
  ve <- (vg - h2 * vg)/h2
  vy <- vg + ve
  b <- b/sqrt(vy) * sigmal
  mu <- log(nkids^2 / sqrt(nkids^2 + nkids^2)) - sum(2*af*b)
  sigmal <- sqrt(log(1+nkids^2/nkids^2))
  gbv <- g %*% b
  l <- mu + gbv + rnorm(nid, 0, sqrt(sigmal^2 * (1-h2)))
  nu <- exp(l)
  y <- rpois(nid, nu)
  mod <- sapply(1:nsnp, function(x) lm(y ~ g[,x])$coef[2])
  return(lm(mod ~ b)$coef[2])
}

param <- expand.grid(
  h2=c(0.1, 0.3, 0.5, 0.7, 0.9),
  nkids=c(2,3),
  nsnp=c(500),
  sim=c(1:3),
  nid=10000
)

for(i in 1:nrow(param))
{
  message(i)
  param$coef[i] <- simulate(param$h2[i], param$nsnp[i], param$nid[i], param$nkids[i])
}

## 1

## 2

## 3

## 4

## 5

## 6

## 7

## 8

## 9

## 10

## 11

## 12

## 13

## 14

## 15

## 16

## 17

## 18

## 19

## 20

## 21

## 22

## 23

## 24

## 25

## 26

## 27

## 28

## 29

## 30

str(param)

## 'data.frame':    30 obs. of  6 variables:
##  $ h2   : num  0.1 0.3 0.5 0.7 0.9 0.1 0.3 0.5 0.7 0.9 ...
##  $ nkids: num  2 2 2 2 2 3 3 3 3 3 ...
##  $ nsnp : num  500 500 500 500 500 500 500 500 500 500 ...
##  $ sim  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ nid  : num  10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 ...
##  $ coef : num  2.17 2.14 1.97 2.02 2.01 ...
##  - attr(*, "out.attrs")=List of 2
##   ..$ dim     : Named int [1:5] 5 2 1 3 1
##   .. ..- attr(*, "names")= chr [1:5] "h2" "nkids" "nsnp" "sim" ...
##   ..$ dimnames:List of 5
##   .. ..$ h2   : chr [1:5] "h2=0.1" "h2=0.3" "h2=0.5" "h2=0.7" ...
##   .. ..$ nkids: chr [1:2] "nkids=2" "nkids=3"
##   .. ..$ nsnp : chr "nsnp=500"
##   .. ..$ sim  : chr [1:3] "sim=1" "sim=2" "sim=3"
##   .. ..$ nid  : chr "nid=10000"

library(ggplot2)
ggplot(param, aes(h2, coef)) +
  geom_point(aes(colour=nkids))

Reconciling `sample` vs `rpois`

I think that once the liability is exponentiated then drawing numbers of children from the poisson, or using it as a probability in sample is going to be the same.

nsnp <- 1000
nid <- 10000
nkids <- 2
h2 <- 0.8

# sample allele frequencies
af <- runif(nsnp)

# generate genotypes
g <- lapply(1:nsnp, function(x) rbinom(nid, 2, af[x])) %>% do.call(cbind, .)

# variance of liability should scale to nkids once exponentiated
sigmal <- sqrt(log(1+nkids^2/nkids^2))

# generate effect sizes
sigmag <- sqrt(2 * af * (1-af))
b <- rnorm(nsnp, mean=0, sd=sigmag)
vg <- sum(af * 2 * (1 - af) * b^2)
ve <- (vg - h2 * vg)/h2
vy <- vg + ve
b <- b/sqrt(vy) * sigmal

# mean of liability should scale to nkids once exponentiated
mu <- log(nkids^2 / sqrt(nkids^2 + nkids^2)) - sum(2*af*b)

# generate liability (fitness)
gbv <- g %*% b
l <- mu + gbv + rnorm(nid, 0, sqrt(sigmal^2 * (1-h2)))

# transform to observed scale
nu <- exp(l)

# sample counts by drawing from poisson
y1 <- rpois(nid, nu)

# sample counts by assigning children using sample
child_parent <- sample(1:nid, nkids*nid, replace=TRUE, prob=nu)
y2 <- rep(0,nid)
tabParents <- table(child_parent)
y2[as.numeric(unlist(dimnames(tabParents)))] <- tabParents

# estimate effects of each SNP on poisson y
mod1 <- sapply(1:nsnp, function(x) lm(y1 ~ g[,x])$coef[2])

# estimate effects of each SNP on sample y
mod2 <- sapply(1:nsnp, function(x) lm(y2 ~ g[,x])$coef[2])

# how do the SNP effect estimates compare across the two models?
plot(mod1 ~ mod2)

The estimated effects should by proportional to the simulated effects on the liability scale, but need to be divided by the mean number of kids per person

plot(mod1/nkids ~ b)

lm(mod1/nkids ~ b)

## 
## Call:
## lm(formula = mod1/nkids ~ b)
## 
## Coefficients:
## (Intercept)            b  
##    0.001023     1.105031

Compare GLM and LM

nsnp <- 100
nid <- 10000
nkids <- 2
h2 <- 0.8

# sample allele frequencies
af <- runif(nsnp)

# generate genotypes
g <- lapply(1:nsnp, function(x) rbinom(nid, 2, af[x])) %>% do.call(cbind, .)

# variance of liability should scale to nkids once exponentiated
sigmal <- sqrt(log(1+nkids^2/nkids^2))

# generate effect sizes
sigmag <- sqrt(2 * af * (1-af))
b <- rnorm(nsnp, mean=0, sd=sigmag)
vg <- sum(af * 2 * (1 - af) * b^2)
ve <- (vg - h2 * vg)/h2
vy <- vg + ve
b <- b/sqrt(vy) * sigmal

# mean of liability should scale to nkids once exponentiated
mu <- log(nkids^2 / sqrt(nkids^2 + nkids^2)) - sum(2*af*b)

# generate liability (fitness)
gbv <- g %*% b
l <- mu + gbv + rnorm(nid, 0, sqrt(sigmal^2 * (1-h2)))

# transform to observed scale
nu <- exp(l)

# sample counts by drawing from poisson
y1 <- rpois(nid, nu)

# estimate effects of each SNP using lm
mod1 <- lapply(1:nsnp, function(x) summary(lm(y1 ~ g[,x])))

# estimate effects of each SNP using glm
mod2 <- lapply(1:nsnp, function(x) summary(glm(y1 ~ g[,x], family=poisson(link="log"))))

b1 <- sapply(mod1, function(x) { x$coef[2,1]})
b2 <- sapply(mod2, function(x) { x$coef[2,1]})
se1 <- sapply(mod1, function(x) { x$coef[2,2]})
se2 <- sapply(mod2, function(x) { x$coef[2,2]})
pval1 <- sapply(mod1, function(x) { x$coef[2,4]})
pval2 <- sapply(mod2, function(x) { x$coef[2,4]})

# how do the SNP effect estimates compare across the two models?
plot(b1 ~ b2)

summary(lm(b2 ~ b1))

## 
## Call:
## lm(formula = b2 ~ b1)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.046623 -0.001283  0.000707  0.002402  0.026290 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.0006918  0.0009453  -0.732    0.466    
## b1           0.4949249  0.0033973 145.682   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.009448 on 98 degrees of freedom
## Multiple R-squared:  0.9954, Adjusted R-squared:  0.9954 
## F-statistic: 2.122e+04 on 1 and 98 DF,  p-value: < 2.2e-16

plot(se1 ~ se2)

summary(lm(se2 ~ se1))

## 
## Call:
## lm(formula = se2 ~ se1)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0027889 -0.0004799 -0.0002599  0.0003361  0.0064852 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.0014955  0.0001643   9.104 1.07e-14 ***
## se1         0.2382896  0.0018020 132.238  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.001095 on 98 degrees of freedom
## Multiple R-squared:  0.9944, Adjusted R-squared:  0.9944 
## F-statistic: 1.749e+04 on 1 and 98 DF,  p-value: < 2.2e-16

plot(pval1 ~ pval2)

table(pval1 < 0.05, pval2 < 0.05)

##        
##         FALSE TRUE
##   FALSE    24   11
##   TRUE      0   65

Looks like the effect estimates are extremely close - but need to divide the lm effect estimate by mean number of children in the sample
The standard error for the lm is relatively higher - which means that it has slightly lower power

Standard deviation units transformation

nsnp <- 100
nid <- 10000
nkids <- 2
h2 <- 0.8

# sample allele frequencies
af <- runif(nsnp)

# generate genotypes
g <- lapply(1:nsnp, function(x) rbinom(nid, 2, af[x])) %>% do.call(cbind, .)

# variance of liability should scale to nkids once exponentiated
sigmal <- sqrt(log(1+nkids^2/nkids^2))

# generate effect sizes
sigmag <- sqrt(2 * af * (1-af))
b <- rnorm(nsnp, mean=0, sd=sigmag)
vg <- sum(af * 2 * (1 - af) * b^2)
ve <- (vg - h2 * vg)/h2
vy <- vg + ve
b <- b/sqrt(vy) * sigmal

# mean of liability should scale to nkids once exponentiated
mu <- log(nkids^2 / sqrt(nkids^2 + nkids^2)) - sum(2*af*b)

# generate liability (fitness)
gbv <- g %*% b
l <- mu + gbv + rnorm(nid, 0, sqrt(sigmal^2 * (1-h2)))

# transform to observed scale
nu <- exp(l)

# sample counts by drawing from poisson
y1 <- rpois(nid, nu)

# estimate effects of each SNP on raw units
mod1 <- sapply(1:nsnp, function(x) lm(y1 ~ g[,x])$coef[2])

y2 <- scale(y1)

# estimate effects of each SNP on sd units
mod2 <- sapply(1:nsnp, function(x) lm(y2 ~ g[,x])$coef[2])

# how do the SNP effect estimates compare across the two models?
plot(mod1 ~ mod2)

Now scale the effect estimate by sd

mod2a <- mod2 * sd(y1)
plot(mod1 ~ mod2a)

Rank Transform

library(RNOmni)
y3 <- RankNorm(y1)
mod3 <- sapply(1:nsnp, function(x) lm(y3 ~ g[,x])$coef[2])

# how do the SNP effect estimates compare across the two models?
plot(mod1 ~ mod3)

mod3a <- mod3 * sd(y1)
plot(mod1 ~ mod3a)

Intercept, heritability, lambda

Reconciling sample vs rpois

Compare GLM and LM

Standard deviation units transformation

Rank Transform

Reconciling `sample` vs `rpois`