library(knitr)
library(abd)
Exercise 3 Solution
L01: Simulations to understand sampling distributions
Includes:
- Lion noses linear regression
- Data generation consistent with model
- Linear regression of this first dataset
- In-class Sampling Distribution Simulation Assignment
Document Preamble
Load libraries
Settings for Knitr (optional)
opts_chunk$set(fig.width = 8, fig.height = 6)
1. Lion noses linear regression:
Data entry
data(LionNoses)
head(LionNoses)
age proportion.black
1 1.1 0.21
2 1.5 0.14
3 1.9 0.11
4 2.2 0.13
5 2.6 0.12
6 3.2 0.13
Fit linear model
lm.nose<-lm(age~proportion.black, data=LionNoses)
Parameters:
Coefficients and residual variation are stored in lm.nose:
coef(lm.nose)
(Intercept) proportion.black
0.8790062 10.6471194
summary(lm.nose)$sigma # residual variation
[1] 1.668764
What else is stored in lm.nose? (residuals, variance-covariance matrix, etc.)
names(lm.nose)
[1] "coefficients" "residuals" "effects" "rank"
[5] "fitted.values" "assign" "qr" "df.residual"
[9] "xlevels" "call" "terms" "model"
names(summary(lm.nose))
[1] "call" "terms" "residuals" "coefficients"
[5] "aliased" "sigma" "df" "r.squared"
[9] "adj.r.squared" "fstatistic" "cov.unscaled"
2. Data generation consistent with fitted model
## Sample size: taken from the original data so the simulated data match it
n <- nrow(LionNoses)
## Predictor: the observed proportion-black values, pulled out as a vector
p.black <- LionNoses[["proportion.black"]]
## Parameters taken from the model fitted to the original data
sigma <- summary(lm.nose)[["sigma"]] # residual standard deviation
betas <- coef(lm.nose) # regression coefficients (intercept, slope)
## Errors and response
# Residual errors are drawn from N(0, sigma), sigma being the standard deviation
epsilon <- rnorm(n = n, mean = 0, sd = sigma)
# Response = regression line evaluated at p.black + residual error
y <- betas[1] + betas[2] * p.black + epsilon
3. Linear regression of this generated dataset
# Fit of model to simulated data:
# regress the simulated response y on the original predictor p.black
lmfit.generated <- lm(y ~ p.black)
# Print the fit summary (the transcript that follows in this document)
summary(lmfit.generated)
Call:
lm(formula = y ~ p.black)
Residuals:
Min 1Q Median 3Q Max
-3.8583 -1.6573 0.5004 1.5111 3.4417
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.9734 0.6711 1.450 0.157
p.black 9.7032 1.7810 5.448 6.57e-06 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.969 on 30 degrees of freedom
Multiple R-squared: 0.4974, Adjusted R-squared: 0.4806
F-statistic: 29.68 on 1 and 30 DF, p-value: 6.57e-06
In-Class Sampling Distribution Simulation Assignment
Exercise 3:
- Generate 5000 datasets using the same code
- Fit a linear regression model to each dataset “lm.temp”
- Store the estimates of \(\beta_1\) and t-statistics
- Calculate confidence limits for each simulation and determine how many include the true parameter used to simulate the data.
Hint: if you get stuck, try starting with a small number of simulations (less than 5000) until you get the code right.
# Set up matrices to hold the results (one row per simulated dataset)
nsims <- 5000 # number of simulations
beta.hat <- matrix(NA, nrow = nsims, ncol = 1) # estimates of beta_1
tsamp.dist <- matrix(NA, nrow = nsims, ncol = 1) # matrix to hold t-statistics
limits <- matrix(NA, nrow = nsims, ncol = 2) # matrix to hold CI limits
colnames(limits) <- c("LL.slope", "UL.slope") # label columns
# Simulation: each pass draws fresh errors, rebuilds the response from the
# generating parameters (betas, sigma), refits the regression, and stores the
# slope estimate, its t-statistic, and its confidence limits in row i.
# seq_len() is used instead of 1:nsims so that the loop body is skipped,
# rather than run for i = 1 and i = 0, if nsims is ever set to 0.
for (i in seq_len(nsims)) {
  epsilon <- rnorm(n, 0, sigma) # random errors
  y <- betas[1] + betas[2] * p.black + epsilon # response
  lm.temp <- lm(y ~ p.black)
  ## extract beta-hat (the estimated slope)
  beta.hat[i] <- coef(lm.temp)[2]
  # t-statistic for this sample: (estimate - generating slope) / standard
  # error, where the standard error is the square root of the slope's
  # entry on the diagonal of the coefficient variance-covariance matrix
  tsamp.dist[i] <- (beta.hat[i] - betas[2]) / sqrt(vcov(lm.temp)[2, 2])
  # Confidence limits for the slope (confint() uses its default level)
  limits[i, ] <- confint(lm.temp)[2, ]
}
How many CI include the parameter used to generate the data?
# Flag each simulation whose interval brackets the slope used to generate
# the data (lower limit <= slope <= upper limit)
I.in <- betas[2] >= limits[, "LL.slope"] & betas[2] <= limits[, "UL.slope"]
# Share of the simulated confidence intervals that contain the true beta
sum(I.in) / nsims
[1] 0.9504
Plot earlier results
# Two panels side by side: slope estimates (left) and t-statistics (right)
par(mfrow = c(1, 2))
# Left: histogram of the simulated slope estimates
hist(beta.hat, col = "gray", xlab = "",
     main = expression(paste("Sampling Distribution of ", hat(beta)[1])))
abline(v = betas[2]) # add population parameter
# Right: histogram of the simulated t-statistics on the density scale, so the
# theoretical t density can be drawn over it
hist(tsamp.dist, xlab = "",
     main = expression(t == frac(hat(beta) - beta, se(hat(beta)))),
     freq = FALSE)
tvalues <- seq(-3, 3, length.out = 1000) # x values to evaluate t-distribution
# Residual degrees of freedom for a simple linear regression are n - 2
# (intercept and slope estimated); derived from n rather than hard-coded so
# the overlay stays correct if the sample size changes.
lines(tvalues, dt(tvalues, df = n - 2)) # overlay t-distribution
Plot results of confidence limits (first 100 of them)
# Combine slope estimates, interval limits, and coverage flags into one data
# frame for plotting
sim.dat <- data.frame(est.slope = beta.hat, limits, In = I.in)
# Draw the first 100 intervals as horizontal segments (one per row) with the
# point estimate marked, coloured by the coverage flag; the vertical line
# marks the slope used to generate the data.
# NOTE(review): ggplot2 is not loaded explicitly in the visible code;
# presumably it is attached through another package - confirm.
ggplot(
  sim.dat[1:100, ],
  aes(x = est.slope, y = 1:100, colour = as.factor(In))
) +
  # y and colour are inherited from the plot-level mapping above
  geom_segment(aes(x = LL.slope, xend = UL.slope, yend = 1:100)) +
  scale_colour_discrete(name = expression(paste("Contains ", beta, "?"))) +
  geom_point() +
  theme(axis.text.y = element_blank()) +
  geom_vline(xintercept = betas[2]) +
  labs(
    x = "Estimate",
    y = " ",
    alt = "Plot of 100 confidence intervals showing whether or not they contain the true parameter"
  )