library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.5     v dplyr   1.0.3
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
set.seed(31415)

Baseline parameters

# sample size
n <- 100000

# effect allele frequency
eaf <- 0.3

# genotype - diploid additive model
g <- rbinom(n, size = 2, prob = eaf)

Check things on the continuous scale

Basic model:

\[ x_i = \alpha + \beta g_i + e_i \]

where \(g\) is binomially distributed with size 2 (i.e. diploid genotypes) and the effect allele frequency is \(p\)

\[ g \sim Binom(2, p) \]

and the error term

\[ e \sim N(0, 1) \]

Note that if we centre \(g\) to have mean of 0 (i.e. \(g - 2p\)) then the \(\alpha\) term disappears.
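As a quick sketch (not part of the original script; the \(\beta\) value here is arbitrary), fitting the model with the raw and the centred genotype shows that centring only shifts the intercept and leaves the slope unchanged:

# Illustrative beta, chosen arbitrarily for this check
beta_demo <- 0.5
x_demo <- beta_demo * (g - 2 * eaf) + rnorm(n)
coefficients(lm(x_demo ~ g))                # intercept approx -2 * eaf * beta_demo
coefficients(lm(x_demo ~ I(g - 2 * eaf)))   # intercept approx 0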

We expect that the variance explained in \(x\) by \(g\) is

\[ h^2_x = 2 \beta^2 p (1-p) / var(x) \]
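As a worked example (using an illustrative \(\beta\), not one of the simulated effects below): with residual variance 1 we have \(var(x) = 2 \beta^2 p (1-p) + 1\), so

# Expected h2 for an illustrative beta of 0.5 and eaf of 0.3
beta_demo <- 0.5
vg <- 2 * beta_demo^2 * eaf * (1 - eaf)   # genetic variance, 0.105 here
vg / (vg + 1)                             # expected h2_x, roughly 0.095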

Run a few simulations on the simple continuous / liability scale:

# Number of simulations
nsim <- 100

# Sample a SNP effect for each simulation
b <- rnorm(nsim)

For each of the 100 simulations, generate \(x\) and estimate the effect of \(g\) on \(x\):

res <- lapply(b, function(B)
{
    # Generate x - the residual variance will be 1 given g
    x <- rnorm(n, (g-mean(g)) * B, 1)
    tibble(
        b = B,
        vx = var(x),                                # Estimate variance of x
        r = cor(x, g),                              # Correlation between x and g
        rsq = r^2,                                  # Variance explained in x by g
        bhat = cov(x, g) / var(g),                  # Effect estimate of g on x
        h2 = bhat^2 * 2 * eaf * (1-eaf) / var(x)    # Variance explained implied by the effect estimate
    )
}) %>% bind_rows()

Check that estimated \(\hat{\beta} \approx \beta\)

ggplot(res, aes(x = b, y = bhat)) +
geom_point()

Check that the estimated \(R^2\) matches \(h^2_x\) computed from the effect estimate

ggplot(res, aes(x = rsq, y = h2)) +
geom_point()

Check that variance in x changes with simulated \(\beta\)

ggplot(res, aes(x = b, y = vx)) +
geom_point()

Generating genetic effects on count data

To make a Poisson variable, for each individual \(i\) the count is based on a \(\lambda_i\) value that is related to the individual's genotype \(g_i\) and the expected number of children \(\alpha\)

\[ \lambda_i = \alpha + \beta (g_i - 2p) \]

In this regard, \(\lambda_i\) is equivalent to the expected value of the liability model depicted above as \(x\). The number of children that an individual has is then drawn as

\[ y_i \sim Pois(\lambda_i) \]

e.g.

# average number of kids per person
a <- 2

# genetic effect on number of children
b <- 0.2

# Generate liability
lambdai <- a + b * (g - mean(g))

# Number of kids
y <- rpois(n, lambdai)

We expect the average number of kids to increase by 0.2 per copy of the effect allele. On the liability scale:

# lambda value by genotype
tapply(lambdai, g, mean)
##        0        1        2 
## 1.879528 2.079528 2.279528

On the observed scale?

tapply(y, g, mean)
##        0        1        2 
## 1.877963 2.086697 2.281568
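A quick way to see the increments directly (not in the original script) is to difference the genotype class means, which should both be close to \(b = 0.2\):

# Differences in mean count between adjacent genotype classes
diff(tapply(y, g, mean))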

Seems to work

Linear and generalised linear models

Can we retrieve the simulated genetic effects through linear or Poisson regression?

b <- 0.2
y <- rpois(n, 2 + b * (g-mean(g)))
summary(lm(y ~ g))
## 
## Call:
## lm(formula = y ~ g)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.2716 -1.0772 -0.0772  0.9228  8.7284 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.882724   0.006116  307.85   <2e-16 ***
## g           0.194428   0.006903   28.17   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.418 on 99998 degrees of freedom
## Multiple R-squared:  0.007872,   Adjusted R-squared:  0.007862 
## F-statistic: 793.4 on 1 and 99998 DF,  p-value: < 2.2e-16

This gives a pretty good estimate for the effect of \(g\) on the number of kids, but the intercept is off because \(g\) is not centred: the regression intercept estimates the expected count at \(g = 0\), i.e. \(\alpha - 2p\beta\). Can we recover it?

2 - 2 * eaf * b   # alpha - 2 * p * beta
## [1] 1.88
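Equivalently (a quick check that is not in the original script), centring \(g\) in the regression should recover the simulated intercept of 2 directly:

# Refit with g centred at its expected value 2 * eaf
coefficients(lm(y ~ I(g - 2 * eaf)))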

What about Poisson regression?

mod <- summary(glm(y ~ g, family=poisson))
mod
## 
## Call:
## glm(formula = y ~ g, family = poisson)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1358  -0.8296  -0.0511   0.6030   4.1444  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) 0.633631   0.003119  203.14   <2e-16 ***
## g           0.095441   0.003380   28.24   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 115193  on 99999  degrees of freedom
## Residual deviance: 114405  on 99998  degrees of freedom
## AIC: 341082
## 
## Number of Fisher Scoring iterations: 5

Poisson regression uses a natural log link function, so the coefficients are estimated on the multiplicative scale rather than the additive scale used in the continuous / liability model. Can we retrieve the effects on the liability scale? Call the Poisson coefficients \(b_0\) for the intercept and \(b_1\) for the effect of \(g\). Under the log link \(E[y \mid g] = e^{b_0 + b_1 g}\), so the baseline expected count (at \(g = 0\)) is \(e^{b_0}\) and the change in expected count per effect allele at that baseline is

\[ \beta = e^{b_0} (e^{b_1} - 1) \]

so the intercept:

exp(coefficients(mod)[1,1])
## [1] 1.88444

and the effect estimate:

exp(coefficients(mod)[1,1]) * (exp(coefficients(mod)[2,1]) - 1)
## [1] 0.1887149

Run a few simulations to check

nsim <- 100
b <- rnorm(nsim, sd=0.1) # sample genetic effects for each simulation
a <- runif(nsim, 2, 4) # mean kids range from 2 to 4

res <- lapply(1:nsim, function(i)
{
    y <- rpois(n, a[i] + b[i] * (g-mean(g)))
    mod1 <- lm(y ~ g)
    mod2 <- glm(y ~ g, family=poisson)
    tibble(
        a = a[i],
        b = b[i],
        ahat1 = coefficients(mod1)[1],
        ahat2 = coefficients(mod2)[1],
        bhat1 = coefficients(mod1)[2],
        bhat2 = coefficients(mod2)[2],
        betahat2 = exp(ahat2) * (exp(bhat2) - 1)
    )
}) %>% bind_rows()

Linear model estimates vs true effects:

ggplot(res, aes(x=b, y=bhat1)) +
geom_point()

Poisson model estimates vs true effects:

ggplot(res, aes(x=b, y=betahat2)) +
geom_point()

Linear model vs Poisson model estimates:

ggplot(res, aes(x=bhat1, y=betahat2)) +
geom_point()

OK, so they look identical. Is there any value in using the Poisson model, e.g. for standard errors?

nsim <- 100
b <- c(rnorm(nsim/2, sd=0.1), rep(0, nsim/2)) # mixture of real and null effects
a <- runif(nsim, 2, 4) # mean kids range from 2 to 4

res <- lapply(1:nsim, function(i)
{
    y <- rpois(n, a[i] + b[i] * (g-mean(g)))
    mod1 <- summary(lm(y ~ g))
    mod2 <- summary(glm(y ~ g, family=poisson))
    tibble(
        a = a[i],
        b = b[i],
        pval1 = coefficients(mod1)[2,4],
        pval2 = coefficients(mod2)[2,4]
    )
}) %>% bind_rows()

Compare p-values: for the null simulations we want the p-values to be uniformly distributed, and for the non-null simulations we want small p-values.

ggplot(res, aes(x = pval1, y = pval2)) +
geom_point() +
facet_grid(. ~ b == 0)
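As an extra check (not part of the original analysis), we can compare the empirical rejection rates at the 0.05 level in the null simulations; both should be close to 0.05 if the tests are well calibrated:

# Proportion of null simulations rejected at p < 0.05 by each model
res %>%
    filter(b == 0) %>%
    summarise(
        reject_lm = mean(pval1 < 0.05),
        reject_poisson = mean(pval2 < 0.05)
    )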

OK, they are identical, so can we just use the linear model instead of the Poisson?