1. Loading the data and some pre-processing

library(tidyverse)
library(haven)
library(stringr)
library(lubridate)
library(hrbrthemes)
library(viridis)
library(forcats)
library(lazyeval)

# Loading demographic data

grades_data <- read_sav("~/Dropbox/1_research/Engagement/data/all_three_semesters.sav")
grades_data <- select(grades_data, ID, contains("Exam"), FinalGrade)
grades_data$ID <- as.integer(grades_data$ID)

IDs_data <- read_csv("~/Dropbox/1_Research/Engagement/Archive/ss15_key.csv")
## Warning: Missing column names filled in: 'X12' [12]
IDs_data <- select(IDs_data, ID, username = Username)

demographic_data <- left_join(IDs_data, grades_data, by = "ID")

# Log data

file_names <- list.files("~/dropbox/1_research/engagement/data/s15data/csv")
file_names <- paste0("~/dropbox/1_research/engagement/data/s15data/csv/", file_names)

log_data <- map_df(file_names, read_csv, .id = "video_ID")

video_ID_corrected <- as.numeric(str_sub(file_names, start = -6, end = -5))
video_ID <- unique(log_data$video_ID)

lookup <- data_frame(video_ID_corrected, video_ID)

log_data <- log_data %>% 
  left_join(., lookup, by = "video_ID") %>% 
  select(-video_ID) %>% 
  select(video_ID = video_ID_corrected, everything())

log_data$UserName <- sapply(str_split(log_data$UserName, "\\\\"), function(x) x[2])

log_data <- rename(log_data, username = UserName)

# Joining data

df <- left_join(log_data, demographic_data, by = "username")

# Processing data

# quantile(demographic_data$Percent_FinalExam, c(1/3, 2/3), na.rm = T)
# quantile(demographic_data$FinalGrade, c(1/3, 2/3), na.rm = T)

df <- df %>% 
  mutate(Timestamp_n = mdy_hms(Timestamp, tz = "EST"),
         Timestamp_n = ifelse(is.na(Timestamp_n), mdy_hm(Timestamp, tz = "EST"), Timestamp_n),
         Timestamp_n = as.POSIXct(Timestamp_n, origin = "1970-01-01", tz = "EST")) %>% 
  rename(date = Timestamp_n) %>% 
  select(-Timestamp) %>% 
  mutate(grade_quartile = ifelse(FinalGrade <= 951.5, "Low",
                                 ifelse(FinalGrade > 951.5 &
                                          FinalGrade <= 1070.6, "Middle",
                                        ifelse(FinalGrade > 1070.6 , "High", NA)))) %>%
  # mutate(grade_quartile = ifelse(Percent_FinalExam <= 73.33333, "Low",
  #                                ifelse(Percent_FinalExam > 73.33333 &
  #                                         Percent_FinalExam <= 85.55556, "Middle",
  #                                       ifelse(Percent_FinalExam > 85.55556, "High", NA)))) %>%
  mutate(yday = yday(date),
         yday = yday - min(yday),
         wday = wday(date, label = T),
         wday = as_factor(wday),
         wday = fct_relevel(wday, 
                            "Sun", "Mon", "Tues", "Wed", "Thurs", "Fri", "Sat"),
         hour = as.numeric(hour(date))) %>% 
  filter(yday < 131) %>% 
  mutate(days_until_exam = ifelse(yday <= 47, 47 - yday,
                                  ifelse(yday > 47 & yday <= 90, 90 - yday,
                                         ifelse(yday > 90 & yday <= 112, 112 - yday, NA)))) %>%
  rename(twm = `Minutes Viewed`)
## Warning: 6206 failed to parse.
## Warning: 34625 failed to parse.
video_length <- df %>% 
  group_by(video_ID) %>% 
  summarize(video_length = max(twm))
df <- left_join(df, video_length)
df$proportion_watched <- df$twm / df$video_length

2. Processed data (just to get a look at what the plots and data are made from)

df
## # A tibble: 40,805 x 23
##    video_ID `Start Position`   twm username     ID Exam1 Exam2 Exam3
##       <dbl>            <dbl> <dbl>    <chr>  <int> <dbl> <dbl> <dbl>
##  1        2                0  0.32 levei1rl 887723   100  95.0  90.1
##  2        2                0  3.37 jehnz1ce 888624   100  85.5  85.0
##  3        2              165  3.68 jehnz1ce 888624   100  85.5  85.0
##  4        2              348  0.15 jehnz1ce 888624   100  85.5  85.0
##  5        2              294  0.05 jehnz1ce 888624   100  85.5  85.0
##  6        2              321  0.04 jehnz1ce 888624   100  85.5  85.0
##  7        2              268  0.26 hines1ha 887924    60  77.9  64.6
##  8        2              400  0.22 hines1ha 887924    60  77.9  64.6
##  9        2                0 11.53  cooks1e 887715    80  93.1  81.6
## 10        2              331  6.01 stite1km 887829    92  93.1  83.3
## # ... with 40,795 more rows, and 15 more variables:
## #   Percent_FinalExam <dbl>, FinalExam <dbl>, ZExam1 <dbl>, ZExam2 <dbl>,
## #   ZExam3 <dbl>, ZFinalExam <dbl>, FinalGrade <dbl>, date <dttm>,
## #   grade_quartile <chr>, yday <dbl>, wday <ord>, hour <dbl>,
## #   days_until_exam <dbl>, video_length <dbl>, proportion_watched <dbl>

3.back: Data just for modeling

create_individual_df <- function(df, which_var1, which_var2) {
  out <- df %>% 
    select_("ID", which_var1, which_var2, "twm") %>% 
    group_by_("ID", which_var1, which_var2) %>% 
    summarize(twm_sum = sum(twm)) %>% 
    spread_(which_var1, "twm_sum", fill = 0) %>% 
    gather_(key_col = "key",
            value_col = "twm_sum",
            gather_cols = colnames(.)[!colnames(.) %in% c("ID", which_var2)]) %>% 
    ungroup()
  return(out)
}

yday_model_df <- df %>% 
  create_individual_df("yday", "FinalGrade") %>% 
  rename(yday = key) %>% 
  mutate(yday = as.numeric(yday),
         days_until_exam = ifelse(yday <= 47, 47 - yday,
                                  ifelse(yday > 47 & yday <= 90, 90 - yday,
                                         ifelse(yday > 90 & yday <= 112, 112 - yday, NA)))) %>%
  left_join(demographic_data, by = "ID") %>% 
  select(student_ID = ID, final_grade = FinalGrade.x, final_exam = FinalExam, time_watched_minutes = twm_sum, yday, days_until_exam)

write_csv(yday_model_df, "yday.csv")

wday_model_df <- df %>% 
  create_individual_df("wday", "FinalGrade") %>% 
  rename(wday = key) %>% 
  left_join(demographic_data, by = "ID") %>% 
  select(student_ID = ID, final_grade = FinalGrade.x, final_exam = FinalExam, time_watched_minutes = twm_sum, wday)

write_csv(wday_model_df, "wday.csv")

hour_model_df <- df %>% 
  create_individual_df("hour", "FinalGrade") %>% 
  rename(hour = key) %>% 
  mutate(hour = as.numeric(hour)) %>% 
  left_join(demographic_data, by = "ID") %>% 
  select(student_ID = ID, final_grade = FinalGrade.x, final_exam = FinalExam, time_watched_minutes = twm_sum, hour)

write_csv(hour_model_df, "hour.csv")

3. Functions to create dfs for plotting and creating dfs

create_individual_df <- function(df, which_var1, which_var2) {
  out <- df %>% 
    select_("username", which_var1, which_var2, "twm") %>% 
    group_by_("username", which_var1, which_var2) %>% 
    summarize(twm_sum = sum(twm)) %>% 
    spread_(which_var1, "twm_sum", fill = 0) %>% 
    gather(key, twm_sum, -username, -grade_quartile)
  return(out)
}

create_aggregate_df <- function(df, which_var1, which_var2) {
  out <- df %>% 
    filter(!is.na(grade_quartile)) %>% 
    create_individual_df(which_var1, which_var2) %>% 
    group_by(grade_quartile, key) %>%
    summarize(twm_mean_g = mean(twm_sum),
              twm_se_g = sd(twm_sum) / (n() - 1))
  return(out)
}

# for proportion

wday_df <- create_aggregate_df(df, which_var1 = "wday", which_var2 = "grade_quartile")
yday_df <- create_aggregate_df(df, which_var1 = "yday", which_var2 = "grade_quartile")
hour_df <- create_aggregate_df(df, which_var1 = "hour", which_var2 = "grade_quartile")
exam_df <- create_aggregate_df(df, which_var1 = "days_until_exam", which_var2 = "grade_quartile")
video_df <- create_aggregate_df(df, which_var1 = "video_ID", which_var2 = "grade_quartile")

video_prop_df <- create_individual_df(df, which_var1 = "video_ID", which_var2 = "grade_quartile")
video_prop_df <- rename(video_prop_df, video_ID = key)
video_prop_df$video_ID <- as.integer(video_prop_df$video_ID)
video_prop_df <- left_join(video_prop_df, video_length)
## Joining, by = "video_ID"
video_prop_df <- mutate(video_prop_df, twp = twm_sum / video_length)

video_prop_to_plot <- video_prop_df %>%
  filter(!is.na(grade_quartile)) %>% 
  group_by(grade_quartile, video_ID) %>% 
  summarize(twm_mean_g = mean(twp),
            twm_se_g = sd(twp) / (n() - 1))

x <- df %>% unite(week_hour, wday, hour)

week_day_df <- create_aggregate_df(x, which_var1 = "week_hour", which_var2 = "grade_quartile")

week_day_df_p <- week_day_df %>% 
  ungroup() %>% 
  separate(key, c("wday", "hour")) %>% 
  mutate(wday = factor(wday, levels = c("Sun", "Mon", "Tues", "Wed", "Thurs", "Fri", "Sat")),
         hour = as.numeric(hour)) %>% 
  arrange(wday, hour) %>% 
  unite(wday_hour, wday, hour)

4. Plotting function

So, this is showing how to plot.

plot_the_dfs <- function(df, plot_SE = F) {
  p <- ggplot(df, aes_string(x = "key", y = "twm_mean_g", group = "grade_quartile", color = "grade_quartile")) +
    geom_point() + geom_line() +
    theme_ipsum() +
    scale_color_viridis("", discrete = T)
  if (plot_SE == T) {
    p <- p + geom_errorbar(aes(ymin = twm_mean_g - twm_se_g, ymax = twm_mean_g + twm_se_g))
  }
  p <- p + xlab(NULL) + ylab("Mean Minutes Viewed")
  p <- p + theme(text = element_text(size = 14))
  return(p)
}

# yday_df %>% 
#   group_by(key) %>% 
#   summarize(tot_twm = mean(twm_mean_g)) %>% 
#   arrange(desc(tot_twm))
# 
# video_df$key <- as.numeric(video_df$key)
# video_p <- plot_the_dfs(df = video_df)
# video_p

5. Research Questions and Associated Plots

How does engagement outside-of-class relate to the time in the semester?

How do these relations differ by final exam group?

Plot: Proportion of video viewed

Note: Need to think through how we test these and other hypotheses.

video_prop_to_plot <- rename(video_prop_to_plot, key = video_ID)
video_prop_p <- plot_the_dfs(df = video_prop_to_plot, plot_SE = T)
video_prop_p <- video_prop_p + xlab(NULL) + ylab("Proportion of Video Viewed")
video_prop_p <- video_prop_p + theme(text = element_text(size = 14))
video_prop_p

How does engagement outside-of-class relate to the time before an exam?

How do these relations differ by final exam group?

Plot: Day of the semester (with days until exam as a time-varying covariate)

yday_df$key <- as.numeric(yday_df$key)
yday_p <- plot_the_dfs(df = yday_df, plot_SE = T) 
yday_p

exam_df$key <- as.numeric(exam_df$key)
## Warning: NAs introduced by coercion
exam_p <- plot_the_dfs(df = exam_df, plot_SE = T)
exam_p
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing missing values (geom_path).
## Warning: Removed 3 rows containing missing values (geom_errorbar).

How does engagement with out-of-class videos in a flipped class change throughout the week?

How do these relations differ by final exam group?

Plot: Weekday and hour (separate and combined)

Note: the last one’s labels are a mess, but the order of the weekday and hour should be correct (Sunday early morning through Saturday late night). ALso, there are just a few hours with no responses (i.e. early in the morning) that need to be changed to 0s.

hour_df$key <- as.numeric(hour_df$key)
hour_p <- plot_the_dfs(df = hour_df, plot_SE = T)
hour_p

wday_df$key <- as_factor(wday_df$key)
wday_df$key <- fct_relevel(wday_df$key,
                           "Sun", "Mon", "Tues", "Wed", "Thurs", "Fri", "Sat")
wday_p <- plot_the_dfs(df = wday_df, plot_SE = T)
wday_p 

x <- unique(week_day_df_p$wday_hour)
week_day_df_p <- rename(week_day_df_p, key = wday_hour)
week_day_df_p$key <- factor(week_day_df_p$key,
                            levels = x)
week_hour_p <- plot_the_dfs(df = week_day_df_p, plot_SE = T)
week_hour_p <- week_hour_p + theme(axis.text.x = element_text(angle = 90, hjust = 1))
week_hour_p