BTS 510 Lab 9

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Fix the random-number seed so that any random draws are reproducible
# NOTE(review): no later code visible here appears to draw random numbers
set.seed(12345)
# Make theme_classic with a base font size of 16 the default ggplot2 theme
theme_set(theme_classic(base_size = 16))

1 Learning objectives

Interpret tests comparing two unrelated samples
Summarize data using contingency tables
Describe different study designs for contingency tables

2 Data

Pulse dataset from the Stat2Data package
- A dataset with n = 232 observations on the following 7 variables.
  - Active: Pulse rate (beats per minute) after exercise
  - Rest: Resting pulse rate (beats per minute)
  - Smoke: 1=smoker or 0=nonsmoker
  - Sex: 1=female or 0=male
  - Exercise: Typical hours of exercise (per week)
  - Hgt: Height (in inches)
  - Wgt: Weight (in pounds)

3 Tasks

Make plots of variables as needed (e.g., to assess assumptions)
Conduct a z-test, t-test, and Welch’s t-test
- What is/are your conclusion(s) based on the tests?
- Are the assumptions met?
  - e.g., large enough sample to justify z test using sample variance
  - e.g., equal variances in both groups
- Which test seems the best choice? (Don’t make this decision based on what is significant – here or elsewhere)
  - Do you think a non-parametric test might be a good option?

3.1 Some useful code

The following code splits the dataset into two subsets: smokers (Smoke = 1) and non-smokers (Smoke = 0)
- There are other ways to do this, so you don’t need to use this code

library(Stat2Data)
data(Pulse)
library(tidyverse)
# Split Pulse by smoking status: Smoke == 0 is a non-smoker, Smoke == 1 a smoker
Pulse_nosmoke <- filter(Pulse, Smoke == 0)
Pulse_smoke <- filter(Pulse, Smoke == 1)
# Preview the first six rows of the smoker subset
head(Pulse_smoke)

  Active Rest Smoke Sex Exercise Hgt Wgt
1     82   68     1   0        3  70 225
2     86   68     1   0        2  73 195
3     87   72     1   0        2  70 173
4    102   77     1   0        2  72 200
5     80   67     1   1        2  65 133
6     99   78     1   0        3  71 165

# Preview the first six rows of the non-smoker subset
head(Pulse_nosmoke)

  Active Rest Smoke Sex Exercise Hgt Wgt
1     97   78     0   1        1  63 119
2     88   62     0   0        3  72 175
3    106   74     0   0        3  72 170
4     78   63     0   1        3  67 125
5    109   65     0   0        3  74 188
6     66   43     0   1        3  67 140

Use alternative = "greater" if H_1: \mu_1 > \mu_2
- Use alternative = "less" if H_1: \mu_1 < \mu_2
- Where \mu_1 is the mean for the first-entered group (x)
- The order in which you enter the groups (x vs. y) doesn’t matter; just make sure you set up the directional hypothesis accordingly

3.2 Active pulse rate

# Mean active (post-exercise) pulse rate among smokers
mean(Pulse_smoke$Active)

[1] 97.46154

# Mean active (post-exercise) pulse rate among non-smokers
mean(Pulse_nosmoke$Active)

[1] 90.51942

# Sample variance of active pulse rate among smokers
var(Pulse_smoke$Active)

[1] 357.2185

# Sample variance of active pulse rate among non-smokers
# (compare with the smoker variance to judge the equal-variance assumption)
var(Pulse_nosmoke$Active)

[1] 350.1338

# Overlaid histograms of active pulse rate:
# non-smokers in semi-transparent red, smokers in solid black drawn on top
ggplot(Pulse_nosmoke, aes(Active)) +
  geom_histogram(bins = 30, alpha = 0.5, fill = "red") +
  geom_histogram(aes(Active), data = Pulse_smoke, bins = 30, fill = "black")

Is active pulse rate higher among smokers than non-smokers?

library(BSDA)

Loading required package: lattice


Attaching package: 'BSDA'

The following object is masked from 'package:datasets':

    Orange

# One-sided two-sample z-test on active pulse rate.
# H1: mean for smokers (x) is greater than mean for non-smokers (y).
# The sample standard deviations are supplied as sigma.x / sigma.y, which
# relies on the samples being large enough to treat them as known.
ztest1 <- z.test(
  x = Pulse_smoke$Active,
  y = Pulse_nosmoke$Active,
  alternative = "greater",
  sigma.x = sd(Pulse_smoke$Active),
  sigma.y = sd(Pulse_nosmoke$Active)
)
ztest1


    Two-sample z-Test

data:  Pulse_smoke$Active and Pulse_nosmoke$Active
z = 1.7668, p-value = 0.03863
alternative hypothesis: true difference in means is greater than 0
95 percent confidence interval:
 0.4791124        NA
sample estimates:
mean of x mean of y 
 97.46154  90.51942

# One-sided two-sample t-test on active pulse rate with pooled variance
# (var.equal = TRUE). H1: smokers (x) have a greater mean than non-smokers (y).
ttest1 <- t.test(
  x = Pulse_smoke$Active,
  y = Pulse_nosmoke$Active,
  var.equal = TRUE,
  alternative = "greater"
)
ttest1


    Two Sample t-test

data:  Pulse_smoke$Active and Pulse_nosmoke$Active
t = 1.7806, df = 230, p-value = 0.03815
alternative hypothesis: true difference in means is greater than 0
95 percent confidence interval:
 0.5034263       Inf
sample estimates:
mean of x mean of y 
 97.46154  90.51942

# One-sided Welch t-test on active pulse rate: var.equal = FALSE, so the two
# groups are not assumed to share a variance.
# H1: smokers (x) have a greater mean than non-smokers (y).
ttest1b <- t.test(
  x = Pulse_smoke$Active,
  y = Pulse_nosmoke$Active,
  var.equal = FALSE,
  alternative = "greater"
)
ttest1b


    Welch Two Sample t-test

data:  Pulse_smoke$Active and Pulse_nosmoke$Active
t = 1.7668, df = 31.509, p-value = 0.04348
alternative hypothesis: true difference in means is greater than 0
95 percent confidence interval:
 0.2833517       Inf
sample estimates:
mean of x mean of y 
 97.46154  90.51942

3.3 Weight

# Mean weight (in pounds) among smokers
mean(Pulse_smoke$Wgt)

[1] 172.0385

# Mean weight (in pounds) among non-smokers
mean(Pulse_nosmoke$Wgt)

[1] 156.1359

# Sample variance of weight among smokers
var(Pulse_smoke$Wgt)

[1] 1347.718

# Sample variance of weight among non-smokers
# (compare with the smoker variance to judge the equal-variance assumption)
var(Pulse_nosmoke$Wgt)

[1] 948.9961

# Overlaid histograms of weight:
# non-smokers in semi-transparent red, smokers in solid black drawn on top
ggplot(Pulse_nosmoke, aes(Wgt)) +
  geom_histogram(bins = 30, alpha = 0.5, fill = "red") +
  geom_histogram(aes(Wgt), data = Pulse_smoke, bins = 30, fill = "black")

Do smokers weigh less than non-smokers?

# One-sided two-sample z-test on weight.
# H1: mean for smokers (x) is less than mean for non-smokers (y).
# The sample standard deviations are supplied as sigma.x / sigma.y, which
# relies on the samples being large enough to treat them as known.
ztest2 <- z.test(
  x = Pulse_smoke$Wgt,
  y = Pulse_nosmoke$Wgt,
  alternative = "less",
  sigma.x = sd(Pulse_smoke$Wgt),
  sigma.y = sd(Pulse_nosmoke$Wgt)
)
ztest2


    Two-sample z-Test

data:  Pulse_smoke$Wgt and Pulse_nosmoke$Wgt
z = 2.1167, p-value = 0.9829
alternative hypothesis: true difference in means is less than 0
95 percent confidence interval:
       NA 28.25999
sample estimates:
mean of x mean of y 
 172.0385  156.1359

# One-sided two-sample t-test on weight with pooled variance
# (var.equal = TRUE). H1: smokers (x) have a smaller mean than non-smokers (y).
ttest2 <- t.test(
  x = Pulse_smoke$Wgt,
  y = Pulse_nosmoke$Wgt,
  var.equal = TRUE,
  alternative = "less"
)
ttest2


    Two Sample t-test

data:  Pulse_smoke$Wgt and Pulse_nosmoke$Wgt
t = 2.4256, df = 230, p-value = 0.992
alternative hypothesis: true difference in means is less than 0
95 percent confidence interval:
     -Inf 26.73016
sample estimates:
mean of x mean of y 
 172.0385  156.1359

# One-sided Welch t-test on weight: var.equal = FALSE, so the two groups are
# not assumed to share a variance.
# H1: smokers (x) have a smaller mean than non-smokers (y).
ttest2b <- t.test(
  x = Pulse_smoke$Wgt,
  y = Pulse_nosmoke$Wgt,
  var.equal = FALSE,
  alternative = "less"
)
ttest2b


    Welch Two Sample t-test

data:  Pulse_smoke$Wgt and Pulse_nosmoke$Wgt
t = 2.1167, df = 29.613, p-value = 0.9786
alternative hypothesis: true difference in means is less than 0
95 percent confidence interval:
     -Inf 28.65903
sample estimates:
mean of x mean of y 
 172.0385  156.1359

3.4 Exercise

# Mean of the Exercise variable (typical hours of exercise per week), smokers
mean(Pulse_smoke$Exercise)

[1] 1.807692

# Mean of the Exercise variable (typical hours of exercise per week), non-smokers
mean(Pulse_nosmoke$Exercise)

[1] 2.31068

# Sample variance of Exercise among smokers
var(Pulse_smoke$Exercise)

[1] 0.4815385

# Sample variance of Exercise among non-smokers
# (compare with the smoker variance to judge the equal-variance assumption)
var(Pulse_nosmoke$Exercise)

[1] 0.5273976

# Overlaid histograms of Exercise:
# non-smokers in semi-transparent red, smokers in solid black drawn on top
ggplot(Pulse_nosmoke, aes(Exercise)) +
  geom_histogram(bins = 30, alpha = 0.5, fill = "red") +
  geom_histogram(aes(Exercise), data = Pulse_smoke, bins = 30, fill = "black")

Do smokers and non-smokers exercise the same amount?

# Two-sided two-sample z-test on Exercise.
# H1: the means for smokers (x) and non-smokers (y) differ.
# The sample standard deviations are supplied as sigma.x / sigma.y, which
# relies on the samples being large enough to treat them as known.
ztest3 <- z.test(
  x = Pulse_smoke$Exercise,
  y = Pulse_nosmoke$Exercise,
  alternative = "two.sided",
  sigma.x = sd(Pulse_smoke$Exercise),
  sigma.y = sd(Pulse_nosmoke$Exercise)
)
ztest3


    Two-sample z-Test

data:  Pulse_smoke$Exercise and Pulse_nosmoke$Exercise
z = -3.4643, p-value = 0.0005317
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.7875596 -0.2184150
sample estimates:
mean of x mean of y 
 1.807692  2.310680

# Two-sided two-sample t-test on Exercise with pooled variance
# (var.equal = TRUE). H1: the means for smokers (x) and non-smokers (y) differ.
ttest3 <- t.test(
  x = Pulse_smoke$Exercise,
  y = Pulse_nosmoke$Exercise,
  var.equal = TRUE,
  alternative = "two.sided"
)
ttest3


    Two Sample t-test

data:  Pulse_smoke$Exercise and Pulse_nosmoke$Exercise
t = -3.3437, df = 230, p-value = 0.0009651
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.7993817 -0.2065929
sample estimates:
mean of x mean of y 
 1.807692  2.310680

# Two-sided Welch t-test on Exercise: var.equal = FALSE, so the two groups are
# not assumed to share a variance.
# H1: the means for smokers (x) and non-smokers (y) differ.
ttest3b <- t.test(
  x = Pulse_smoke$Exercise,
  y = Pulse_nosmoke$Exercise,
  var.equal = FALSE,
  alternative = "two.sided"
)
ttest3b


    Welch Two Sample t-test

data:  Pulse_smoke$Exercise and Pulse_nosmoke$Exercise
t = -3.4643, df = 32.314, p-value = 0.001521
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.7986223 -0.2073523
sample estimates:
mean of x mean of y 
 1.807692  2.310680

library(coin)

Loading required package: survival

# Brown-Mood median test (coin package) comparing Exercise between the two
# Smoke groups; Smoke is converted to a factor so it is treated as a grouping
# variable. Uses the full Pulse data rather than the split subsets.
median_test(Exercise ~ as.factor(Smoke), data = Pulse)


    Asymptotic Two-Sample Brown-Mood Median Test

data:  Exercise by as.factor(Smoke) (0, 1)
Z = 3.0223, p-value = 0.002509
alternative hypothesis: true mu is not equal to 0

# Wilcoxon rank sum test comparing Exercise between the two Smoke groups:
# a rank-based alternative to the t-tests that does not assume normality.
# Uses the full Pulse data rather than the split subsets.
wilcox.test(Exercise ~ as.factor(Smoke), data = Pulse)


    Wilcoxon rank sum test with continuity correction

data:  Exercise by as.factor(Smoke)
W = 3657, p-value = 0.001026
alternative hypothesis: true location shift is not equal to 0