Chapter 11

NOTHING HERE REALLY YET

Goals

To finish the ANOVA/ANCOVA section with our dignity intact.

Set up

pacman::p_load(dplyr,
               tidyr,
               broom,
               stringr,
               emmeans,
               modelsummary,
               marginaleffects,
               ggeffects,
               tinyplot,
               ggplot2)
tinytheme("ipsum",
          family = "Roboto Condensed",
          palette.qualitative = "Tableau 10",
          palette.sequential = "agSunset")
# Match tinyplot ipsum style
theme_set(
  theme_minimal(base_family = "Roboto Condensed") +
    theme(panel.grid.minor = element_blank())
)

# Match Tableau 10 palette used by tinytheme (hex codes from ggthemes)
tableau10 <- c("#5778a4", "#e49444", "#d1615d", "#85b6b2", "#6a9f58",
               "#e7ca60", "#a87c9f", "#f1a2a9", "#967662", "#b8b0ac")

options(
  ggplot2.discrete.colour = \() scale_color_manual(values = tableau10),
  ggplot2.discrete.fill   = \() scale_fill_manual(values = tableau10)
)

gss <- readRDS("data/gss2024.rds") |> 
  select(id, tvhours, sex, degree, age) |> 
  drop_na() |> 
  haven::zap_labels()
m1 <- lm(tvhours ~ poly(age, 5) * factor(sex),
         data = gss)
summary(m1)

Call:
lm(formula = tvhours ~ poly(age, 5) * factor(sex), data = gss)

Residuals:
    Min      1Q  Median      3Q     Max 
-5.5857 -1.7370 -0.7111  0.7383 21.3271 

Coefficients:
                           Estimate Std. Error t value Pr(>|t|)    
(Intercept)                  3.2063     0.1074  29.865  < 2e-16 ***
poly(age, 5)1               32.5273     4.8049   6.770 1.67e-11 ***
poly(age, 5)2                9.3945     4.9895   1.883   0.0599 .  
poly(age, 5)3               -8.8778     5.0950  -1.742   0.0816 .  
poly(age, 5)4               -5.7797     5.1326  -1.126   0.2603    
poly(age, 5)5               12.3663     4.9872   2.480   0.0132 *  
factor(sex)2                 0.1949     0.1432   1.361   0.1737    
poly(age, 5)1:factor(sex)2  -5.5344     6.5316  -0.847   0.3969    
poly(age, 5)2:factor(sex)2   7.1800     6.5927   1.089   0.2762    
poly(age, 5)3:factor(sex)2   2.2800     6.6310   0.344   0.7310    
poly(age, 5)4:factor(sex)2   6.4230     6.6633   0.964   0.3352    
poly(age, 5)5:factor(sex)2  -0.3673     6.6066  -0.056   0.9557    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 3.235 on 2074 degrees of freedom
Multiple R-squared:  0.05789,   Adjusted R-squared:  0.05289 
F-statistic: 11.58 on 11 and 2074 DF,  p-value: < 2.2e-16
ggpredict(m1, terms = c("age", "sex")) |> plot()

checkpoly <- function(n) {
  
  tmp <- lm(tvhours ~ poly(age, n) * factor(sex),
         data = gss)
  
  glance(tmp)[1,8]
  
}

Let’s look at an experiment.

d <- readRDS("data/lalonde_exp.rds") |> 
  mutate(treat_fac = factor(treat,
                            labels = c("control", "treat")))
plt(re78 ~ treat_fac,
    data = d,
    type = "box")

plt(re78 ~ treat_fac,
    data = d,
    type = type_jitter(factor = .5))

m1 <- lm(re78 ~ treat_fac,
         data = d)
summary(m1)

Call:
lm(formula = re78 ~ treat_fac, data = d)

Residuals:
   Min     1Q Median     3Q    Max 
-5.976 -5.090 -1.519  3.361 54.332 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)      5.0900     0.3028  16.811   <2e-16 ***
treat_factreat   0.8863     0.4721   1.877   0.0609 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 6.242 on 720 degrees of freedom
Multiple R-squared:  0.004872,  Adjusted R-squared:  0.003489 
F-statistic: 3.525 on 1 and 720 DF,  p-value: 0.06086
m2 <- lm(re78 ~ treat_fac + re75,
         data = d)
summary(m2)

Call:
lm(formula = re78 ~ treat_fac + re75, data = d)

Residuals:
    Min      1Q  Median      3Q     Max 
-10.188  -4.512  -1.498   3.104  54.672 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)     4.51238    0.32932  13.702  < 2e-16 ***
treat_factreat  0.87878    0.46671   1.883   0.0601 .  
re75            0.19086    0.04536   4.207 2.91e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 6.171 on 719 degrees of freedom
Multiple R-squared:  0.02878,   Adjusted R-squared:  0.02608 
F-statistic: 10.65 on 2 and 719 DF,  p-value: 2.756e-05
m_diff <- lm(I(re78 - re75) ~ treat_fac,
             data = d)
summary(m_diff)

Call:
lm(formula = I(re78 - re75) ~ treat_fac, data = d)

Residuals:
    Min      1Q  Median      3Q     Max 
-37.995  -3.198  -1.442   3.677  56.114 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)      2.0634     0.3593   5.743 1.37e-08 ***
treat_factreat   0.8469     0.5601   1.512    0.131    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 7.406 on 720 degrees of freedom
Multiple R-squared:  0.003165,  Adjusted R-squared:  0.00178 
F-statistic: 2.286 on 1 and 720 DF,  p-value: 0.131
m2_constrained <- lm(re78 ~ treat_fac,
                     offset = re75,
                     data = d)
summary(m2_constrained)

Call:
lm(formula = re78 ~ treat_fac, data = d, offset = re75)

Residuals:
    Min      1Q  Median      3Q     Max 
-37.995  -3.198  -1.442   3.677  56.114 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)      2.0634     0.3593   5.743 1.37e-08 ***
treat_factreat   0.8469     0.5601   1.512    0.131    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 7.406 on 720 degrees of freedom
Multiple R-squared:  0.003165,  Adjusted R-squared:  0.00178 
F-statistic: 2.286 on 1 and 720 DF,  p-value: 0.131
m3 <- lm(re78 ~ treat_fac + re75 + re74 + age + educ + 
           black + hisp + nodegr + u74 + u75 + married,
         data = d)
summary(m3)

Call:
lm(formula = re78 ~ treat_fac + re75 + re74 + age + educ + black + 
    hisp + nodegr + u74 + u75 + married, data = d)

Residuals:
    Min      1Q  Median      3Q     Max 
-10.395  -4.431  -1.455   3.189  53.911 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)  
(Intercept)     3.18568    2.63786   1.208   0.2276  
treat_factreat  0.82365    0.46846   1.758   0.0791 .
re75            0.06472    0.09145   0.708   0.4794  
re74            0.12741    0.07526   1.693   0.0909 .
age             0.01024    0.03707   0.276   0.7824  
educ            0.20059    0.18052   1.111   0.2669  
black          -1.41919    0.80195  -1.770   0.0772 .
hisp            0.29929    1.05046   0.285   0.7758  
nodegr         -0.29961    0.74798  -0.401   0.6889  
u74             1.52993    0.93782   1.631   0.1033  
u75            -1.00584    0.91702  -1.097   0.2731  
married         0.04854    0.65220   0.074   0.9407  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 6.147 on 710 degrees of freedom
Multiple R-squared:  0.04827,   Adjusted R-squared:  0.03353 
F-statistic: 3.274 on 11 and 710 DF,  p-value: 0.0002169
m4 <- lm(re78 ~ treat + re75 + I(re75^2) + re74 + I(re74^2) 
           + age + I(age^2) + educ + I(educ^2) + black + hisp + married
           + nodegr + u74 + u75,
         data = d)
summary(m4)

Call:
lm(formula = re78 ~ treat + re75 + I(re75^2) + re74 + I(re74^2) + 
    age + I(age^2) + educ + I(educ^2) + black + hisp + married + 
    nodegr + u74 + u75, data = d)

Residuals:
    Min      1Q  Median      3Q     Max 
-11.465  -4.435  -1.491   3.296  53.923 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)  
(Intercept) 10.985771   5.330026   2.061   0.0397 *
treat        0.813809   0.468155   1.738   0.0826 .
re75         0.181010   0.193615   0.935   0.3502  
I(re75^2)   -0.003420   0.006532  -0.524   0.6007  
re74        -0.182819   0.180101  -1.015   0.3104  
I(re74^2)    0.010120   0.005322   1.902   0.0576 .
age         -0.134437   0.216422  -0.621   0.5347  
I(age^2)     0.002486   0.003606   0.689   0.4909  
educ        -1.181805   1.014489  -1.165   0.2444  
I(educ^2)    0.079069   0.055673   1.420   0.1560  
black       -1.418940   0.800419  -1.773   0.0767 .
hisp         0.265324   1.049633   0.253   0.8005  
married      0.152768   0.654873   0.233   0.8156  
nodegr       0.312865   0.910017   0.344   0.7311  
u74          0.712339   1.036152   0.687   0.4920  
u75         -0.947228   0.968743  -0.978   0.3285  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 6.132 on 706 degrees of freedom
Multiple R-squared:  0.05846,   Adjusted R-squared:  0.03846 
F-statistic: 2.922 on 15 and 706 DF,  p-value: 0.0001621
m5 <- lm(re78 ~ treat * (re75 + re74 + age + educ + nodegr +
                           black + hisp + married),
         data = d)
summary(m5)

Call:
lm(formula = re78 ~ treat * (re75 + re74 + age + educ + nodegr + 
    black + hisp + married), data = d)

Residuals:
    Min      1Q  Median      3Q     Max 
-11.444  -4.335  -1.603   3.298  54.374 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)  
(Intercept)    6.517070   3.437699   1.896   0.0584 .
treat         -6.551504   5.241558  -1.250   0.2117  
re75           0.210366   0.112566   1.869   0.0621 .
re74           0.068945   0.088207   0.782   0.4347  
age           -0.011283   0.046925  -0.240   0.8101  
educ          -0.015309   0.242965  -0.063   0.9498  
nodegr        -0.632323   1.004967  -0.629   0.5294  
black         -1.589615   1.084345  -1.466   0.1431  
hisp           0.539836   1.371852   0.394   0.6941  
married       -1.012499   0.857109  -1.181   0.2379  
treat:re75    -0.261151   0.181527  -1.439   0.1507  
treat:re74     0.000318   0.149470   0.002   0.9983  
treat:age      0.057136   0.073886   0.773   0.4396  
treat:educ     0.517990   0.361004   1.435   0.1518  
treat:nodegr   0.852859   1.502433   0.568   0.5705  
treat:black    0.589770   1.606731   0.367   0.7137  
treat:hisp    -0.801334   2.132576  -0.376   0.7072  
treat:married  2.418215   1.320962   1.831   0.0676 .
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 6.123 on 704 degrees of freedom
Multiple R-squared:  0.06363,   Adjusted R-squared:  0.04101 
F-statistic: 2.814 on 17 and 704 DF,  p-value: 0.0001321
avg_slopes(m5, variables = "treat")

 Estimate Std. Error    z Pr(>|z|)   S   2.5 % 97.5 %
    0.819      0.466 1.76    0.079 3.7 -0.0947   1.73

Term: treat
Type: response
Comparison: 1 - 0
m1 <- glm(tvhours ~ age,
          data = gss,
          family = poisson())
summary(m1)

Call:
glm(formula = tvhours ~ age, family = poisson(), data = gss)

Coefficients:
             Estimate Std. Error z value Pr(>|z|)    
(Intercept) 0.6199166  0.0385197   16.09   <2e-16 ***
age         0.0111123  0.0006796   16.35   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 5366.4  on 2085  degrees of freedom
Residual deviance: 5098.1  on 2084  degrees of freedom
AIC: 10657

Number of Fisher Scoring iterations: 5
ggpredict(m1, terms = "age") |> plot()

d <- d |> 
  mutate(re78 = floor(re78 * 1000),
         re75 = floor(re75 * 1000),
         re74 = floor(re74 * 1000))

m1 <- glm(re78 ~ treat_fac,
          data = d,
          family = poisson())
summary(m1)

Call:
glm(formula = re78 ~ treat_fac, family = poisson(), data = d)

Coefficients:
                Estimate Std. Error z value Pr(>|z|)    
(Intercept)    8.5349735  0.0006799 12553.4   <2e-16 ***
treat_factreat 0.1605231  0.0010127   158.5   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 4851711  on 721  degrees of freedom
Residual deviance: 4826748  on 720  degrees of freedom
AIC: 4832200

Number of Fisher Scoring iterations: 5