library(tidyverse)
library(haven)
library(stringr)
library(lubridate)
library(hrbrthemes)
library(viridis)
library(forcats)
library(lazyeval)
# Loading demographic data
grades_data <- read_sav("~/Dropbox/1_research/Engagement/data/all_three_semesters.sav")
grades_data <- select(grades_data, ID, contains("Exam"), FinalGrade)
grades_data$ID <- as.integer(grades_data$ID)
IDs_data <- read_csv("~/Dropbox/1_Research/Engagement/Archive/ss15_key.csv")
## Warning: Missing column names filled in: 'X12' [12]
IDs_data <- select(IDs_data, ID, username = Username)
demographic_data <- left_join(IDs_data, grades_data, by = "ID")
# Log data
file_names <- list.files("~/dropbox/1_research/engagement/data/s15data/csv")
file_names <- paste0("~/dropbox/1_research/engagement/data/s15data/csv/", file_names)
log_data <- map_df(file_names, read_csv, .id = "video_ID")
video_ID_corrected <- as.numeric(str_sub(file_names, start = -6, end = -5))
video_ID <- unique(log_data$video_ID)
lookup <- data_frame(video_ID_corrected, video_ID)
log_data <- log_data %>%
left_join(., lookup, by = "video_ID") %>%
select(-video_ID) %>%
select(video_ID = video_ID_corrected, everything())
log_data$UserName <- sapply(str_split(log_data$UserName, "\\\\"), function(x) x[2])
log_data <- rename(log_data, username = UserName)
# Joining data
df <- left_join(log_data, demographic_data, by = "username")
# Processing data
# quantile(demographic_data$Percent_FinalExam, c(1/3, 2/3), na.rm = T)
# quantile(demographic_data$FinalGrade, c(1/3, 2/3), na.rm = T)
df <- df %>%
mutate(Timestamp_n = mdy_hms(Timestamp, tz = "EST"),
Timestamp_n = ifelse(is.na(Timestamp_n), mdy_hm(Timestamp, tz = "EST"), Timestamp_n),
Timestamp_n = as.POSIXct(Timestamp_n, origin = "1970-01-01", tz = "EST")) %>%
rename(date = Timestamp_n) %>%
select(-Timestamp) %>%
mutate(grade_quartile = ifelse(FinalGrade <= 951.5, "Low",
ifelse(FinalGrade > 951.5 &
FinalGrade <= 1070.6, "Middle",
ifelse(FinalGrade > 1070.6 , "High", NA)))) %>%
# mutate(grade_quartile = ifelse(Percent_FinalExam <= 73.33333, "Low",
# ifelse(Percent_FinalExam > 73.33333 &
# Percent_FinalExam <= 85.55556, "Middle",
# ifelse(Percent_FinalExam > 85.55556, "High", NA)))) %>%
mutate(yday = yday(date),
yday = yday - min(yday),
wday = wday(date, label = T),
wday = as_factor(wday),
wday = fct_relevel(wday,
"Sun", "Mon", "Tues", "Wed", "Thurs", "Fri", "Sat"),
hour = as.numeric(hour(date))) %>%
filter(yday < 131) %>%
mutate(days_until_exam = ifelse(yday <= 47, 47 - yday,
ifelse(yday > 47 & yday <= 90, 90 - yday,
ifelse(yday > 90 & yday <= 112, 112 - yday, NA)))) %>%
select(username,
video_ID,
twm = `Minutes Viewed`,
yday,
wday,
hour,
days_until_exam,
grade_quartile)
## Warning: 6206 failed to parse.
## Warning: 34625 failed to parse.
video_length <- df %>%
group_by(video_ID) %>%
summarize(video_length = max(twm))
df <- left_join(df, video_length)
df$proportion_watched <- df$twm / df$video_length
df
## # A tibble: 40,805 x 10
## username video_ID twm yday wday hour days_until_exam
## <chr> <dbl> <dbl> <dbl> <ord> <dbl> <dbl>
## 1 levei1rl 2 0.32 0 Wed 8 47
## 2 jehnz1ce 2 3.37 0 Wed 10 47
## 3 jehnz1ce 2 3.68 0 Wed 10 47
## 4 jehnz1ce 2 0.15 0 Wed 10 47
## 5 jehnz1ce 2 0.05 0 Wed 10 47
## 6 jehnz1ce 2 0.04 0 Wed 10 47
## 7 hines1ha 2 0.26 33 Mon 19 14
## 8 hines1ha 2 0.22 33 Mon 19 14
## 9 cooks1e 2 11.53 109 Sun 13 3
## 10 stite1km 2 6.01 18 Sun 14 29
## # ... with 40,795 more rows, and 3 more variables: grade_quartile <chr>,
## # video_length <dbl>, proportion_watched <dbl>
create_individual_df <- function(df, which_var1, which_var2) {
out <- df %>%
select_("username", which_var1, which_var2, "twm") %>%
group_by_("username", which_var1, which_var2) %>%
summarize(twm_sum = sum(twm)) %>%
spread_(which_var1, "twm_sum", fill = 0) %>%
gather(key, twm_sum, -username, -grade_quartile)
return(out)
}
create_aggregate_df <- function(df, which_var1, which_var2) {
out <- df %>%
filter(!is.na(grade_quartile)) %>%
create_individual_df(which_var1, which_var2) %>%
group_by(grade_quartile, key) %>%
summarize(twm_mean_g = mean(twm_sum),
twm_se_g = sd(twm_sum) / (n() - 1))
return(out)
}
# for proportion
wday_df <- create_aggregate_df(df, which_var1 = "wday", which_var2 = "grade_quartile")
yday_df <- create_aggregate_df(df, which_var1 = "yday", which_var2 = "grade_quartile")
hour_df <- create_aggregate_df(df, which_var1 = "hour", which_var2 = "grade_quartile")
exam_df <- create_aggregate_df(df, which_var1 = "days_until_exam", which_var2 = "grade_quartile")
video_df <- create_aggregate_df(df, which_var1 = "video_ID", which_var2 = "grade_quartile")
video_prop_df <- create_individual_df(df, which_var1 = "video_ID", which_var2 = "grade_quartile")
video_prop_df <- rename(video_prop_df, video_ID = key)
video_prop_df$video_ID <- as.integer(video_prop_df$video_ID)
video_prop_df <- left_join(video_prop_df, video_length)
## Joining, by = "video_ID"
video_prop_df <- mutate(video_prop_df, twp = twm_sum / video_length)
video_prop_to_plot <- video_prop_df %>%
filter(!is.na(grade_quartile)) %>%
group_by(grade_quartile, video_ID) %>%
summarize(twm_mean_g = mean(twp),
twm_se_g = sd(twp) / (n() - 1))
x <- df %>% unite(week_hour, wday, hour)
week_day_df <- create_aggregate_df(x, which_var1 = "week_hour", which_var2 = "grade_quartile")
week_day_df_p <- week_day_df %>%
ungroup() %>%
separate(key, c("wday", "hour")) %>%
mutate(wday = factor(wday, levels = c("Sun", "Mon", "Tues", "Wed", "Thurs", "Fri", "Sat")),
hour = as.numeric(hour)) %>%
arrange(wday, hour) %>%
unite(wday_hour, wday, hour)
So, this is showing how to plot.
plot_the_dfs <- function(df, plot_SE = F) {
p <- ggplot(df, aes_string(x = "key", y = "twm_mean_g", group = "grade_quartile", color = "grade_quartile")) +
geom_point() + geom_line() +
theme_ipsum() +
scale_color_viridis("", discrete = T)
if (plot_SE == T) {
p <- p + geom_errorbar(aes(ymin = twm_mean_g - twm_se_g, ymax = twm_mean_g + twm_se_g))
}
p <- p + xlab(NULL) + ylab("Mean Minutes Viewed")
p <- p + theme(text = element_text(size = 14))
return(p)
}
# yday_df %>%
# group_by(key) %>%
# summarize(tot_twm = mean(twm_mean_g)) %>%
# arrange(desc(tot_twm))
#
# video_df$key <- as.numeric(video_df$key)
# video_p <- plot_the_dfs(df = video_df)
# video_p
Plot: Proportion of video viewed
Note: Need to think through how we test these and other hypotheses.
video_prop_to_plot <- rename(video_prop_to_plot, key = video_ID)
video_prop_p <- plot_the_dfs(df = video_prop_to_plot, plot_SE = T)
video_prop_p <- video_prop_p + xlab(NULL) + ylab("Proportion of Video Viewed")
video_prop_p <- video_prop_p + theme(text = element_text(size = 14))
video_prop_p
Plot: Day of the semester (with days until exam as a time-varying covariate)
yday_df$key <- as.numeric(yday_df$key)
yday_p <- plot_the_dfs(df = yday_df, plot_SE = T)
yday_p
exam_df$key <- as.numeric(exam_df$key)
## Warning: NAs introduced by coercion
exam_p <- plot_the_dfs(df = exam_df, plot_SE = T)
exam_p
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing missing values (geom_path).
## Warning: Removed 3 rows containing missing values (geom_errorbar).
Plot: Weekday and hour (separate and combined)
Note: the last one’s labels are a mess, but the order of the weekday and hour should be correct (Sunday early morning through Saturday late night). ALso, there are just a few hours with no responses (i.e. early in the morning) that need to be changed to 0s.
hour_df$key <- as.numeric(hour_df$key)
hour_p <- plot_the_dfs(df = hour_df, plot_SE = T)
hour_p
wday_df$key <- as_factor(wday_df$key)
wday_df$key <- fct_relevel(wday_df$key,
"Sun", "Mon", "Tues", "Wed", "Thurs", "Fri", "Sat")
wday_p <- plot_the_dfs(df = wday_df, plot_SE = T)
wday_p
x <- unique(week_day_df_p$wday_hour)
week_day_df_p <- rename(week_day_df_p, key = wday_hour)
week_day_df_p$key <- factor(week_day_df_p$key,
levels = x)
week_hour_p <- plot_the_dfs(df = week_day_df_p, plot_SE = T)
week_hour_p <- week_hour_p + theme(axis.text.x = element_text(angle = 90, hjust = 1))
week_hour_p