library(tidyverse)
library(haven)
library(stringr)
library(lubridate)
library(hrbrthemes)
library(viridis)
library(forcats)
library(lazyeval)
# Loading demographic data
grades_data <- read_sav("~/Dropbox/1_research/Engagement/data/all_three_semesters.sav")
grades_data <- select(grades_data, ID, contains("Exam"), FinalGrade)
grades_data$ID <- as.integer(grades_data$ID)
IDs_data <- read_csv("~/Dropbox/1_Research/Engagement/Archive/ss15_key.csv")
## Warning: Missing column names filled in: 'X12' [12]
IDs_data <- select(IDs_data, ID, username = Username)
demographic_data <- left_join(IDs_data, grades_data, by = "ID")
# Log data
file_names <- list.files("~/dropbox/1_research/engagement/data/s15data/csv")
file_names <- paste0("~/dropbox/1_research/engagement/data/s15data/csv/", file_names)
log_data <- map_df(file_names, read_csv, .id = "video_ID")
video_ID_corrected <- as.numeric(str_sub(file_names, start = -6, end = -5))
video_ID <- unique(log_data$video_ID)
lookup <- data_frame(video_ID_corrected, video_ID)
log_data <- log_data %>%
left_join(., lookup, by = "video_ID") %>%
select(-video_ID) %>%
select(video_ID = video_ID_corrected, everything())
log_data$UserName <- sapply(str_split(log_data$UserName, "\\\\"), function(x) x[2])
log_data <- rename(log_data, username = UserName)
# Joining data
df <- left_join(log_data, demographic_data, by = "username")
# Processing data
# quantile(demographic_data$Percent_FinalExam, c(1/3, 2/3), na.rm = T)
# quantile(demographic_data$FinalGrade, c(1/3, 2/3), na.rm = T)
df <- df %>%
mutate(Timestamp_n = mdy_hms(Timestamp, tz = "EST"),
Timestamp_n = ifelse(is.na(Timestamp_n), mdy_hm(Timestamp, tz = "EST"), Timestamp_n),
Timestamp_n = as.POSIXct(Timestamp_n, origin = "1970-01-01", tz = "EST")) %>%
rename(date = Timestamp_n) %>%
select(-Timestamp) %>%
mutate(grade_quartile = ifelse(FinalGrade <= 951.5, "Low",
ifelse(FinalGrade > 951.5 &
FinalGrade <= 1070.6, "Middle",
ifelse(FinalGrade > 1070.6 , "High", NA)))) %>%
# mutate(grade_quartile = ifelse(Percent_FinalExam <= 73.33333, "Low",
# ifelse(Percent_FinalExam > 73.33333 &
# Percent_FinalExam <= 85.55556, "Middle",
# ifelse(Percent_FinalExam > 85.55556, "High", NA)))) %>%
mutate(yday = yday(date),
yday = yday - min(yday),
wday = wday(date, label = T),
wday = as_factor(wday),
wday = fct_relevel(wday,
"Sun", "Mon", "Tues", "Wed", "Thurs", "Fri", "Sat"),
hour = as.numeric(hour(date))) %>%
filter(yday < 131) %>%
mutate(days_until_exam = ifelse(yday <= 47, 47 - yday,
ifelse(yday > 47 & yday <= 90, 90 - yday,
ifelse(yday > 90 & yday <= 112, 112 - yday, NA)))) %>%
rename(twm = `Minutes Viewed`)
## Warning: 6206 failed to parse.
## Warning: 34625 failed to parse.
video_length <- df %>%
group_by(video_ID) %>%
summarize(video_length = max(twm))
df <- left_join(df, video_length)
df$proportion_watched <- df$twm / df$video_length
df
## # A tibble: 40,805 x 23
## video_ID `Start Position` twm username ID Exam1 Exam2 Exam3
## <dbl> <dbl> <dbl> <chr> <int> <dbl> <dbl> <dbl>
## 1 2 0 0.32 levei1rl 887723 100 95.0 90.1
## 2 2 0 3.37 jehnz1ce 888624 100 85.5 85.0
## 3 2 165 3.68 jehnz1ce 888624 100 85.5 85.0
## 4 2 348 0.15 jehnz1ce 888624 100 85.5 85.0
## 5 2 294 0.05 jehnz1ce 888624 100 85.5 85.0
## 6 2 321 0.04 jehnz1ce 888624 100 85.5 85.0
## 7 2 268 0.26 hines1ha 887924 60 77.9 64.6
## 8 2 400 0.22 hines1ha 887924 60 77.9 64.6
## 9 2 0 11.53 cooks1e 887715 80 93.1 81.6
## 10 2 331 6.01 stite1km 887829 92 93.1 83.3
## # ... with 40,795 more rows, and 15 more variables:
## # Percent_FinalExam <dbl>, FinalExam <dbl>, ZExam1 <dbl>, ZExam2 <dbl>,
## # ZExam3 <dbl>, ZFinalExam <dbl>, FinalGrade <dbl>, date <dttm>,
## # grade_quartile <chr>, yday <dbl>, wday <ord>, hour <dbl>,
## # days_until_exam <dbl>, video_length <dbl>, proportion_watched <dbl>
create_individual_df <- function(df, which_var1, which_var2) {
out <- df %>%
select_("ID", which_var1, which_var2, "twm") %>%
group_by_("ID", which_var1, which_var2) %>%
summarize(twm_sum = sum(twm)) %>%
spread_(which_var1, "twm_sum", fill = 0) %>%
gather_(key_col = "key",
value_col = "twm_sum",
gather_cols = colnames(.)[!colnames(.) %in% c("ID", which_var2)]) %>%
ungroup()
return(out)
}
yday_model_df <- df %>%
create_individual_df("yday", "FinalGrade") %>%
rename(yday = key) %>%
mutate(yday = as.numeric(yday),
days_until_exam = ifelse(yday <= 47, 47 - yday,
ifelse(yday > 47 & yday <= 90, 90 - yday,
ifelse(yday > 90 & yday <= 112, 112 - yday, NA)))) %>%
left_join(demographic_data, by = "ID") %>%
select(student_ID = ID, final_grade = FinalGrade.x, final_exam = FinalExam, time_watched_minutes = twm_sum, yday, days_until_exam)
write_csv(yday_model_df, "yday.csv")
wday_model_df <- df %>%
create_individual_df("wday", "FinalGrade") %>%
rename(wday = key) %>%
left_join(demographic_data, by = "ID") %>%
select(student_ID = ID, final_grade = FinalGrade.x, final_exam = FinalExam, time_watched_minutes = twm_sum, wday)
write_csv(wday_model_df, "wday.csv")
hour_model_df <- df %>%
create_individual_df("hour", "FinalGrade") %>%
rename(hour = key) %>%
mutate(hour = as.numeric(hour)) %>%
left_join(demographic_data, by = "ID") %>%
select(student_ID = ID, final_grade = FinalGrade.x, final_exam = FinalExam, time_watched_minutes = twm_sum, hour)
write_csv(hour_model_df, "hour.csv")
create_individual_df <- function(df, which_var1, which_var2) {
out <- df %>%
select_("username", which_var1, which_var2, "twm") %>%
group_by_("username", which_var1, which_var2) %>%
summarize(twm_sum = sum(twm)) %>%
spread_(which_var1, "twm_sum", fill = 0) %>%
gather(key, twm_sum, -username, -grade_quartile)
return(out)
}
create_aggregate_df <- function(df, which_var1, which_var2) {
out <- df %>%
filter(!is.na(grade_quartile)) %>%
create_individual_df(which_var1, which_var2) %>%
group_by(grade_quartile, key) %>%
summarize(twm_mean_g = mean(twm_sum),
twm_se_g = sd(twm_sum) / (n() - 1))
return(out)
}
# for proportion
wday_df <- create_aggregate_df(df, which_var1 = "wday", which_var2 = "grade_quartile")
yday_df <- create_aggregate_df(df, which_var1 = "yday", which_var2 = "grade_quartile")
hour_df <- create_aggregate_df(df, which_var1 = "hour", which_var2 = "grade_quartile")
exam_df <- create_aggregate_df(df, which_var1 = "days_until_exam", which_var2 = "grade_quartile")
video_df <- create_aggregate_df(df, which_var1 = "video_ID", which_var2 = "grade_quartile")
video_prop_df <- create_individual_df(df, which_var1 = "video_ID", which_var2 = "grade_quartile")
video_prop_df <- rename(video_prop_df, video_ID = key)
video_prop_df$video_ID <- as.integer(video_prop_df$video_ID)
video_prop_df <- left_join(video_prop_df, video_length)
## Joining, by = "video_ID"
video_prop_df <- mutate(video_prop_df, twp = twm_sum / video_length)
video_prop_to_plot <- video_prop_df %>%
filter(!is.na(grade_quartile)) %>%
group_by(grade_quartile, video_ID) %>%
summarize(twm_mean_g = mean(twp),
twm_se_g = sd(twp) / (n() - 1))
x <- df %>% unite(week_hour, wday, hour)
week_day_df <- create_aggregate_df(x, which_var1 = "week_hour", which_var2 = "grade_quartile")
week_day_df_p <- week_day_df %>%
ungroup() %>%
separate(key, c("wday", "hour")) %>%
mutate(wday = factor(wday, levels = c("Sun", "Mon", "Tues", "Wed", "Thurs", "Fri", "Sat")),
hour = as.numeric(hour)) %>%
arrange(wday, hour) %>%
unite(wday_hour, wday, hour)
So, this is showing how to plot.
plot_the_dfs <- function(df, plot_SE = F) {
p <- ggplot(df, aes_string(x = "key", y = "twm_mean_g", group = "grade_quartile", color = "grade_quartile")) +
geom_point() + geom_line() +
theme_ipsum() +
scale_color_viridis("", discrete = T)
if (plot_SE == T) {
p <- p + geom_errorbar(aes(ymin = twm_mean_g - twm_se_g, ymax = twm_mean_g + twm_se_g))
}
p <- p + xlab(NULL) + ylab("Mean Minutes Viewed")
p <- p + theme(text = element_text(size = 14))
return(p)
}
# yday_df %>%
# group_by(key) %>%
# summarize(tot_twm = mean(twm_mean_g)) %>%
# arrange(desc(tot_twm))
#
# video_df$key <- as.numeric(video_df$key)
# video_p <- plot_the_dfs(df = video_df)
# video_p
Plot: Proportion of video viewed
Note: Need to think through how we test these and other hypotheses.
video_prop_to_plot <- rename(video_prop_to_plot, key = video_ID)
video_prop_p <- plot_the_dfs(df = video_prop_to_plot, plot_SE = T)
video_prop_p <- video_prop_p + xlab(NULL) + ylab("Proportion of Video Viewed")
video_prop_p <- video_prop_p + theme(text = element_text(size = 14))
video_prop_p
Plot: Day of the semester (with days until exam as a time-varying covariate)
yday_df$key <- as.numeric(yday_df$key)
yday_p <- plot_the_dfs(df = yday_df, plot_SE = T)
yday_p
exam_df$key <- as.numeric(exam_df$key)
## Warning: NAs introduced by coercion
exam_p <- plot_the_dfs(df = exam_df, plot_SE = T)
exam_p
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing missing values (geom_path).
## Warning: Removed 3 rows containing missing values (geom_errorbar).
Plot: Weekday and hour (separate and combined)
Note: the last one’s labels are a mess, but the order of the weekday and hour should be correct (Sunday early morning through Saturday late night). ALso, there are just a few hours with no responses (i.e. early in the morning) that need to be changed to 0s.
hour_df$key <- as.numeric(hour_df$key)
hour_p <- plot_the_dfs(df = hour_df, plot_SE = T)
hour_p
wday_df$key <- as_factor(wday_df$key)
wday_df$key <- fct_relevel(wday_df$key,
"Sun", "Mon", "Tues", "Wed", "Thurs", "Fri", "Sat")
wday_p <- plot_the_dfs(df = wday_df, plot_SE = T)
wday_p
x <- unique(week_day_df_p$wday_hour)
week_day_df_p <- rename(week_day_df_p, key = wday_hour)
week_day_df_p$key <- factor(week_day_df_p$key,
levels = x)
week_hour_p <- plot_the_dfs(df = week_day_df_p, plot_SE = T)
week_hour_p <- week_hour_p + theme(axis.text.x = element_text(angle = 90, hjust = 1))
week_hour_p