library(Hmisc)
library(janitor)
library(naniar)
library(xfun) ## or, if you prefer use library(sessioninfo)
library(googlesheets4)
library(gt)
library(gtExtras)
library(mosaic)
library(broom)
library(patchwork)
library(tidyverse)
theme_set(theme_bw())
::opts_chunk$set(comment = NA) knitr
Your Favorite Movies: Analysis 3
An Example for 431 Project A
1 R Packages
2 Data Ingest
gs4_deauth()
<-
movies3 read_sheet("https://docs.google.com/spreadsheets/d/1qJnQWSjXyOXFZOO8VZixpgZWbraUW66SfP5hE5bKW4k") |>
select(film_id, film, rt_critics, rt_audience) |>
mutate(film_id = as.character(film_id))
✔ Reading from "movies_2023-09-14".
✔ Range 'Initial Data'.
<- movies3 |>
movies3_cc drop_na()
dim(movies3_cc)
[1] 53 4
3 CWRU Colors
I decided to show off here and make use of some of the 2023 official CWRU colors.
- Note that once I’ve named these in this way, based on their hexadecimal representations, I do not include quotes around them in ggplot-based plots.
## CWRU colors
<- '#003071'
cwru_blue <- '#09143A'
cwru_darkblue <- '#007AB8'
cwru_trueblue <- '#A6D2E6'
cwru_lightblue <- '#999999'
cwru_darkgray <- '#D3D3D3'
cwru_lightgray <- '#D63D1F'
cwru_terracottaorange <- '#E69E40'
cwru_fallyellow <- '#377E72'
cwru_bluegreen <- '#692C95'
cwru_violetpurple <- '#61A530' cwru_vividgreen
4 An Important Message
I wrote this document to help you get a feel for what we are looking for in Analysis 3 for Project A, and to make the scope of that work a bit clearer.
Use your own words, not mine, in preparing your analytic work for Project A. Thanks.
5 Research Question
For movies mentioned by students in the Fall 2023 survey of “favorite movies”, how large a a difference exists in the percentage of people with a favorable rating of the film (as aggregated by Rotten Tomatoes) when comparing professional critics to general audience members?
6 Analysis
6.1 Variables
Here our key variables include two measures of a quantitative outcome, which we will combine to form a paired difference. We are interested in understanding something about the difference in percentages with favorable ratings comparing professional critics to the general audience.
Our outcomes are shown in the rt_critics
and rt_audience
variables, each of which expresses the percentage of people in that group who described the movie favorably.
We needed to deal with the missing values of rt_critics
for a couple of the movies which had fewer than 10 critic reviews, according to the Rotten Tomatoes site. So, immediately after importing the data, I filtered to the 53 movies with complete data on all four variables included in movies3
.
In Project A, I would begin Analysis 3 by filtering to complete cases (as I have) and then make a statement about what you are assuming the appropriate missing data mechanism to be (which I haven’t, here.) I would not impute for Analysis 3.
If you don’t have missing values in the variables used in this analysis, there is no reason to specify a missing data mechanism or do any filtering or imputation.
Our sample therefore consists of the 55 movies with complete data on rt_critics
and rt_audience
, all of whom were also mentioned by at least one Fall 2023 student.
This Analysis 3 is about comparing two means with paired samples. You should be able to explain, in a clear English sentence or two, why your data were gathered using an paired-samples design. I’ll leave that to you, but it’s very important.
6.2 Summaries
6.2.1 Distribution of the Paired Differences
First, we’ll create the paired differences as rt_audience
- rt_critics
.
<- movies3_cc |> mutate(dif = rt_audience - rt_critics) movies3_cc
Next, we’ll assess the distribution of those paired differences with our usual set of plots.
## Normal Q-Q plot
<- ggplot(movies3_cc, aes(sample = dif)) +
p1 geom_qq(col = cwru_blue, size = 2) + geom_qq_line(col = cwru_trueblue) +
theme(aspect.ratio = 1) +
labs(title = "Normal Q-Q plot",
y = "Audience - Critics Response",
x = "Expectation under Standard Normal")
## Histogram with Normal density superimposed
<- ggplot(movies3_cc, aes(dif)) +
p2 geom_histogram(aes(y = after_stat(density)),
bins = 7, fill = cwru_blue, col = cwru_lightgray) +
stat_function(fun = dnorm,
args = list(mean = mean(movies3_cc$dif, na.rm = TRUE),
sd = sd(movies3_cc$dif, na.rm = TRUE)),
col = cwru_darkgray, lwd = 1.5) +
labs(title = "Histogram with Normal Density",
x = "Audience - Critics Response")
## Boxplot with notch and rug
<- ggplot(movies3_cc, aes(x = dif, y = "")) +
p3 geom_boxplot(fill = cwru_blue, notch = TRUE,
outlier.color = cwru_blue, outlier.size = 2) +
stat_summary(fun = "mean", geom = "point",
shape = 23, size = 3, fill = cwru_lightgray) +
geom_rug(sides = "b") +
labs(title = "Boxplot with Notch and Rug",
x = "Audience - Critics Response",
y = "")
+ (p2 / p3 + plot_layout(heights = c(4,1))) +
p1 plot_annotation(title = "Audience - Critics % Favorable",
subtitle = str_glue("Fall 2023 Favorite Movies: (n = ",
nrow(movies3_cc), ")"),
caption = str_glue("Paired Differences: Sample Size = ", nrow(movies3_cc),
", Sample Median = ", round_half_up(median(movies3_cc$dif),2),
", Mean = ", round_half_up(mean(movies3_cc$dif),2),
" and SD = ", round_half_up(sd(movies3_cc$dif),2)))
This distribution looks to be centered very close to zero. This suggests that, on average, the ratings by critics and the ratings by the general audience might be similar.
Here would be a smart place to describe whether or not these data can be assumed to come from a Normally distributed population.
Here are the movies with the five highest and five lowest paired (audience - critics) differences in ratings in our sample.
|> arrange(dif) |> head(5) movies3_cc
# A tibble: 5 × 5
film_id film rt_critics rt_audience dif
<chr> <chr> <dbl> <dbl> <dbl>
1 76 Hereditary 90 70 -20
2 186 Titanic 88 69 -19
3 124 Mean Girls 84 66 -18
4 128 Mission Impossible: Ghost Protocol 93 76 -17
5 37 Crazy Rich Asians 91 76 -15
|> arrange(dif) |> tail(5) movies3_cc
# A tibble: 5 × 5
film_id film rt_critics rt_audience dif
<chr> <chr> <dbl> <dbl> <dbl>
1 136 National Lampoon's Christmas Vacation 70 86 16
2 146 Pirates of the Caribbean: Dead Man's Che… 53 72 19
3 106 The Little Mermaid (2023) 67 94 27
4 118 A Man Called Otto 70 97 27
5 44 Divergent 41 69 28
Not in Project A, no.
6.2.2 Did pairing help reduce nuisance variation?
The best way to see this is with a scatterplot of the original data, where we hope to see a strong positive correlation.
ggplot(movies3_cc, aes(x = rt_critics, y = rt_audience)) +
geom_point(col = "black") +
geom_smooth(method = "lm", col = cwru_trueblue, formula = y ~ x, se = TRUE) +
geom_smooth(method = "loess", col = cwru_darkblue, formula = y ~ x, se = FALSE) +
labs(x = "Audience: % favorable", y = "Critics: % favorable",
title = "Favorite Movies Sample for Fall 2023",
caption = str_glue("Pearson correlation = ",
round_half_up(cor(movies3_cc$rt_critics,
$rt_audience), 3)),
movies3_ccsubtitle = "Strong positive relationship between Audience and Critic ratings")
We see a strong, positive Pearson correlation (0.71) between the audience and critic ratings, so we conclude that pairing was helpful in this setting to reduce nuisance variation caused by differences between the quality of the individual movies in the sample.
6.2.3 Numerical Summaries
Here are numerical summaries of the distributions of the paired differences, as well as each of the original samples.
<- bind_rows(
key3 favstats(~ dif, data = movies3_cc),
favstats(~ rt_audience, data = movies3_cc),
favstats(~ rt_critics, data = movies3_cc)) |>
mutate(group = c("A-C difference", "rt_audience", "rt_critics")) |>
relocate(group)
|> gt() |> gt_theme_dark() key3
group | min | Q1 | median | Q3 | max | mean | sd | n | missing |
---|---|---|---|---|---|---|---|---|---|
A-C difference | -20 | -5 | 0 | 6 | 28 | 0.8679245 | 10.96329 | 53 | 0 |
rt_audience | 45 | 73 | 87 | 93 | 98 | 82.9433962 | 13.23663 | 53 | 0 |
rt_critics | 41 | 73 | 88 | 93 | 100 | 82.0754717 | 15.15795 | 53 | 0 |
The mean audience-critics difference in ratings is just barely positive.
Note that the mean of the paired differences is the same as the difference of the means for audience and critic responses, but this isn’t true for other summary statistics.
6.3 Approach
6.3.1 90% CI for mean of paired differences via t-based procedure
Here’s the t-based result, which is appropriate if we believe the paired differences can be assumed to come from a Normal population.
<- lm(dif ~ 1, data = movies3_cc)
m3
tidy(m3, conf.int = TRUE, conf.level = 0.90) |>
gt() |> fmt_number(columns = where(is.numeric), decimals = 3) |>
gt_theme_nytimes()
term | estimate | std.error | statistic | p.value | conf.low | conf.high |
---|---|---|---|---|---|---|
(Intercept) | 0.868 | 1.506 | 0.576 | 0.567 | −1.654 | 3.390 |
6.3.2 90% CI for mean of paired differences via bootstrap
Here’s the bootstrap result, which is more appropriate than the t procedure if we believe the paired differences cannot be assumed to come from a Normal population.
set.seed(4312023)
smean.cl.boot(movies3_cc$dif, conf.int = 0.90, B = 2000)
Mean Lower Upper
0.8679245 -1.6047170 3.3773585
As the instructions suggest, you should show your work and your reasoning (not just your code), and comment on any analytic decisions you make. Be sure to actively present and justify any assumptions you are making.
I’ve also left it to you to build appropriate interpretations of these confidence interval estimates.
6.4 Conclusions
I’ll just reprint the Advice on Analysis 3 Conclusions from the Instructions
For Analysis 3, you’ll write two paragraphs.
In the first paragraph, you should provide a clear restatement of your research question, followed by a clear and appropriate response to your research question, motivated by your results. Interpret your chosen 90% confidence interval’s endpoints and width in context. You should also reflect on your pre-existing belief about what would happen, (as discussed in Section 12) in light of your results.
Then, write a paragraph which summarizes the key limitations of your work in Analysis 3.
- If you see problems with the assumptions behind your choice of interval, that would be a good thing to talk about here, for instance.
- If pairing didn’t “help” (in the sense that there was no substantial positive correlation between the 2018 and 2023 reports), that would be worth discussing here.
- Another issue that is worth discussing is your target population, and what evidence you can describe that might indicate whether your selected states are a representative sample of the US as a whole, or perhaps some particular part of the United States.
- You should also provide at least one useful “next step” that you could take to improve this analysis (just saying “get more data” isn’t a sufficient next step.)
7 Session Information
session_info()
R version 4.3.1 (2023-06-16 ucrt)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 11 x64 (build 22621)
Locale:
LC_COLLATE=English_United States.utf8
LC_CTYPE=English_United States.utf8
LC_MONETARY=English_United States.utf8
LC_NUMERIC=C
LC_TIME=English_United States.utf8
time zone: America/New_York
tzcode source: internal
Package version:
askpass_1.2.0 backports_1.4.1 base64enc_0.1-3
bigD_0.2.0 bit_4.0.5 bit64_4.0.5
bitops_1.0.7 blob_1.2.4 broom_1.0.5
bslib_0.5.1 cachem_1.0.8 callr_3.7.3
cellranger_1.1.0 checkmate_2.2.0 cli_3.6.1
clipr_0.8.0 cluster_2.1.4 colorspace_2.1-0
commonmark_1.9.0 compiler_4.3.1 conflicted_1.2.0
cpp11_0.4.6 crayon_1.5.2 curl_5.1.0
data.table_1.14.8 DBI_1.1.3 dbplyr_2.3.4
digest_0.6.33 dplyr_1.1.3 dtplyr_1.3.1
ellipsis_0.3.2 evaluate_0.22 fansi_1.0.5
farver_2.1.1 fastmap_1.1.1 fontawesome_0.5.2
forcats_1.0.0 foreign_0.8-84 Formula_1.2-5
fs_1.6.3 gargle_1.5.2 generics_0.1.3
ggforce_0.4.1 ggformula_0.10.4 ggplot2_3.4.4
ggridges_0.5.4 ggstance_0.3.6 glue_1.6.2
googledrive_2.1.1 googlesheets4_1.1.1 graphics_4.3.1
grDevices_4.3.1 grid_4.3.1 gridExtra_2.3
gt_0.10.0 gtable_0.3.4 gtExtras_0.5.0
haven_2.5.3 highr_0.10 Hmisc_5.1-1
hms_1.1.3 htmlTable_2.4.1 htmltools_0.5.6.1
htmlwidgets_1.6.2 httr_1.4.7 ids_1.0.1
isoband_0.2.7 janitor_2.2.0 jquerylib_0.1.4
jsonlite_1.8.7 juicyjuice_0.1.0 knitr_1.44
labeling_0.4.3 labelled_2.12.0 lattice_0.21-8
lifecycle_1.0.3 lubridate_1.9.3 magrittr_2.0.3
markdown_1.10 MASS_7.3-60 Matrix_1.6-1.1
memoise_2.0.1 methods_4.3.1 mgcv_1.8-42
mime_0.12 modelr_0.1.11 mosaic_1.8.4.2
mosaicCore_0.9.2.1 mosaicData_0.20.3 munsell_0.5.0
naniar_1.0.0 nlme_3.1-162 nnet_7.3-19
norm_1.0.11.1 openssl_2.1.1 paletteer_1.5.0
patchwork_1.1.3 pillar_1.9.0 pkgconfig_2.0.3
plyr_1.8.9 polyclip_1.10-6 prettyunits_1.2.0
prismatic_1.1.1 processx_3.8.2 progress_1.2.2
ps_1.7.5 purrr_1.0.2 R6_2.5.1
ragg_1.2.6 rappdirs_0.3.3 RColorBrewer_1.1.3
Rcpp_1.0.11 RcppEigen_0.3.3.9.3 reactable_0.4.4
reactR_0.5.0 readr_2.1.4 readxl_1.4.3
rematch_2.0.0 rematch2_2.1.2 reprex_2.0.2
rlang_1.1.1 rmarkdown_2.25 rpart_4.1.19
rstudioapi_0.15.0 rvest_1.0.3 sass_0.4.7
scales_1.2.1 selectr_0.4.2 snakecase_0.11.1
splines_4.3.1 stats_4.3.1 stringi_1.7.12
stringr_1.5.0 sys_3.4.2 systemfonts_1.0.5
textshaping_0.3.7 tibble_3.2.1 tidyr_1.3.0
tidyselect_1.2.0 tidyverse_2.0.0 timechange_0.2.0
tinytex_0.48 tools_4.3.1 tweenr_2.0.2
tzdb_0.4.0 UpSetR_1.4.0 utf8_1.2.3
utils_4.3.1 uuid_1.1.1 V8_4.4.0
vctrs_0.6.4 viridis_0.6.4 viridisLite_0.4.2
visdat_0.6.0 vroom_1.6.4 withr_2.5.1
xfun_0.40 xml2_1.3.5 yaml_2.3.7