1. Loading the data and some pre-processing

library(tidyverse)
library(haven)
library(stringr)
library(lubridate)
library(hrbrthemes)
library(viridis)
library(forcats)
library(lazyeval)

# Loading demographic data

grades_data <- read_sav("~/Dropbox/1_research/Engagement/data/all_three_semesters.sav")
grades_data <- select(grades_data, ID, contains("Exam"), FinalGrade)
grades_data$ID <- as.integer(grades_data$ID)

IDs_data <- read_csv("~/Dropbox/1_Research/Engagement/Archive/ss15_key.csv")

## Warning: Missing column names filled in: 'X12' [12]

IDs_data <- select(IDs_data, ID, username = Username)

demographic_data <- left_join(IDs_data, grades_data, by = "ID")

# Log data

file_names <- list.files("~/dropbox/1_research/engagement/data/s15data/csv")
file_names <- paste0("~/dropbox/1_research/engagement/data/s15data/csv/", file_names)

log_data <- map_df(file_names, read_csv, .id = "video_ID")

video_ID_corrected <- as.numeric(str_sub(file_names, start = -6, end = -5))
video_ID <- unique(log_data$video_ID)

lookup <- data_frame(video_ID_corrected, video_ID)

log_data <- log_data %>% 
  left_join(., lookup, by = "video_ID") %>% 
  select(-video_ID) %>% 
  select(video_ID = video_ID_corrected, everything())

log_data$UserName <- sapply(str_split(log_data$UserName, "\\\\"), function(x) x[2])

log_data <- rename(log_data, username = UserName)

# Joining data

df <- left_join(log_data, demographic_data, by = "username")

# Processing data

# quantile(demographic_data$Percent_FinalExam, c(1/3, 2/3), na.rm = T)
# quantile(demographic_data$FinalGrade, c(1/3, 2/3), na.rm = T)

df <- df %>% 
  mutate(Timestamp_n = mdy_hms(Timestamp, tz = "EST"),
         Timestamp_n = ifelse(is.na(Timestamp_n), mdy_hm(Timestamp, tz = "EST"), Timestamp_n),
         Timestamp_n = as.POSIXct(Timestamp_n, origin = "1970-01-01", tz = "EST")) %>% 
  rename(date = Timestamp_n) %>% 
  select(-Timestamp) %>% 
  mutate(grade_quartile = ifelse(FinalGrade <= 951.5, "Low",
                                 ifelse(FinalGrade > 951.5 &
                                          FinalGrade <= 1070.6, "Middle",
                                        ifelse(FinalGrade > 1070.6 , "High", NA)))) %>%
  # mutate(grade_quartile = ifelse(Percent_FinalExam <= 73.33333, "Low",
  #                                ifelse(Percent_FinalExam > 73.33333 &
  #                                         Percent_FinalExam <= 85.55556, "Middle",
  #                                       ifelse(Percent_FinalExam > 85.55556, "High", NA)))) %>%
  mutate(yday = yday(date),
         yday = yday - min(yday),
         wday = wday(date, label = T),
         wday = as_factor(wday),
         wday = fct_relevel(wday, 
                            "Sun", "Mon", "Tues", "Wed", "Thurs", "Fri", "Sat"),
         hour = as.numeric(hour(date))) %>% 
  filter(yday < 131) %>% 
  mutate(days_until_exam = ifelse(yday <= 47, 47 - yday,
                                  ifelse(yday > 47 & yday <= 90, 90 - yday,
                                         ifelse(yday > 90 & yday <= 112, 112 - yday, NA)))) %>%
  select(username,
         video_ID, 
         twm = `Minutes Viewed`,
         yday, 
         wday,
         hour,
         days_until_exam,
         grade_quartile)

## Warning: 6206 failed to parse.

## Warning: 34625 failed to parse.

video_length <- df %>% 
  group_by(video_ID) %>% 
  summarize(video_length = max(twm))
df <- left_join(df, video_length)
df$proportion_watched <- df$twm / df$video_length

2. Processed data (just to get a look at what the plots and data are made from)

df

## # A tibble: 40,805 x 10
##    username video_ID   twm  yday  wday  hour days_until_exam
##       <chr>    <dbl> <dbl> <dbl> <ord> <dbl>           <dbl>
##  1 levei1rl        2  0.32     0   Wed     8              47
##  2 jehnz1ce        2  3.37     0   Wed    10              47
##  3 jehnz1ce        2  3.68     0   Wed    10              47
##  4 jehnz1ce        2  0.15     0   Wed    10              47
##  5 jehnz1ce        2  0.05     0   Wed    10              47
##  6 jehnz1ce        2  0.04     0   Wed    10              47
##  7 hines1ha        2  0.26    33   Mon    19              14
##  8 hines1ha        2  0.22    33   Mon    19              14
##  9  cooks1e        2 11.53   109   Sun    13               3
## 10 stite1km        2  6.01    18   Sun    14              29
## # ... with 40,795 more rows, and 3 more variables: grade_quartile <chr>,
## #   video_length <dbl>, proportion_watched <dbl>

3. Functions to create dfs for plotting and creating dfs

create_individual_df <- function(df, which_var1, which_var2) {
  out <- df %>% 
    select_("username", which_var1, which_var2, "twm") %>% 
    group_by_("username", which_var1, which_var2) %>% 
    summarize(twm_sum = sum(twm)) %>% 
    spread_(which_var1, "twm_sum", fill = 0) %>% 
    gather(key, twm_sum, -username, -grade_quartile)
  return(out)
}

create_aggregate_df <- function(df, which_var1, which_var2) {
  out <- df %>% 
    filter(!is.na(grade_quartile)) %>% 
    create_individual_df(which_var1, which_var2) %>% 
    group_by(grade_quartile, key) %>%
    summarize(twm_mean_g = mean(twm_sum),
              twm_se_g = sd(twm_sum) / (n() - 1))
  return(out)
}

# for proportion

wday_df <- create_aggregate_df(df, which_var1 = "wday", which_var2 = "grade_quartile")
yday_df <- create_aggregate_df(df, which_var1 = "yday", which_var2 = "grade_quartile")
hour_df <- create_aggregate_df(df, which_var1 = "hour", which_var2 = "grade_quartile")
exam_df <- create_aggregate_df(df, which_var1 = "days_until_exam", which_var2 = "grade_quartile")
video_df <- create_aggregate_df(df, which_var1 = "video_ID", which_var2 = "grade_quartile")

video_prop_df <- create_individual_df(df, which_var1 = "video_ID", which_var2 = "grade_quartile")
video_prop_df <- rename(video_prop_df, video_ID = key)
video_prop_df$video_ID <- as.integer(video_prop_df$video_ID)
video_prop_df <- left_join(video_prop_df, video_length)

## Joining, by = "video_ID"

video_prop_df <- mutate(video_prop_df, twp = twm_sum / video_length)

video_prop_to_plot <- video_prop_df %>%
  filter(!is.na(grade_quartile)) %>% 
  group_by(grade_quartile, video_ID) %>% 
  summarize(twm_mean_g = mean(twp),
            twm_se_g = sd(twp) / (n() - 1))

x <- df %>% unite(week_hour, wday, hour)

week_day_df <- create_aggregate_df(x, which_var1 = "week_hour", which_var2 = "grade_quartile")

week_day_df_p <- week_day_df %>% 
  ungroup() %>% 
  separate(key, c("wday", "hour")) %>% 
  mutate(wday = factor(wday, levels = c("Sun", "Mon", "Tues", "Wed", "Thurs", "Fri", "Sat")),
         hour = as.numeric(hour)) %>% 
  arrange(wday, hour) %>% 
  unite(wday_hour, wday, hour)

4. Plotting function

So, this is showing how to plot.

plot_the_dfs <- function(df, plot_SE = F) {
  p <- ggplot(df, aes_string(x = "key", y = "twm_mean_g", group = "grade_quartile", color = "grade_quartile")) +
    geom_point() + geom_line() +
    theme_ipsum() +
    scale_color_viridis("", discrete = T)
  if (plot_SE == T) {
    p <- p + geom_errorbar(aes(ymin = twm_mean_g - twm_se_g, ymax = twm_mean_g + twm_se_g))
  }
  p <- p + xlab(NULL) + ylab("Mean Minutes Viewed")
  p <- p + theme(text = element_text(size = 14))
  return(p)
}

# yday_df %>% 
#   group_by(key) %>% 
#   summarize(tot_twm = mean(twm_mean_g)) %>% 
#   arrange(desc(tot_twm))
# 
# video_df$key <- as.numeric(video_df$key)
# video_p <- plot_the_dfs(df = video_df)
# video_p

5. Research Questions and Associated Plots

How does engagement outside-of-class relate to the time in the semester?

How do these relations differ by final exam group?

Plot: Proportion of video viewed

Note: Need to think through how we test these and other hypotheses.

video_prop_to_plot <- rename(video_prop_to_plot, key = video_ID)
video_prop_p <- plot_the_dfs(df = video_prop_to_plot, plot_SE = T)
video_prop_p <- video_prop_p + xlab(NULL) + ylab("Proportion of Video Viewed")
video_prop_p <- video_prop_p + theme(text = element_text(size = 14))
video_prop_p

How does engagement outside-of-class relate to the time before an exam?

How do these relations differ by final exam group?

Plot: Day of the semester (with days until exam as a time-varying covariate)

yday_df$key <- as.numeric(yday_df$key)
yday_p <- plot_the_dfs(df = yday_df, plot_SE = T) 
yday_p

exam_df$key <- as.numeric(exam_df$key)

## Warning: NAs introduced by coercion

exam_p <- plot_the_dfs(df = exam_df, plot_SE = T)
exam_p

## Warning: Removed 3 rows containing missing values (geom_point).

## Warning: Removed 3 rows containing missing values (geom_path).

## Warning: Removed 3 rows containing missing values (geom_errorbar).

How does engagement with out-of-class videos in a flipped class change throughout the week?

How do these relations differ by final exam group?

Plot: Weekday and hour (separate and combined)

Note: the last one’s labels are a mess, but the order of the weekday and hour should be correct (Sunday early morning through Saturday late night). ALso, there are just a few hours with no responses (i.e. early in the morning) that need to be changed to 0s.

hour_df$key <- as.numeric(hour_df$key)
hour_p <- plot_the_dfs(df = hour_df, plot_SE = T)
hour_p

wday_df$key <- as_factor(wday_df$key)
wday_df$key <- fct_relevel(wday_df$key,
                           "Sun", "Mon", "Tues", "Wed", "Thurs", "Fri", "Sat")
wday_p <- plot_the_dfs(df = wday_df, plot_SE = T)
wday_p

x <- unique(week_day_df_p$wday_hour)
week_day_df_p <- rename(week_day_df_p, key = wday_hour)
week_day_df_p$key <- factor(week_day_df_p$key,
                            levels = x)
week_hour_p <- plot_the_dfs(df = week_day_df_p, plot_SE = T)
week_hour_p <- week_hour_p + theme(axis.text.x = element_text(angle = 90, hjust = 1))
week_hour_p

Engagement Views - AERA 2018

Joshua Rosenberg

6/2/2017

1. Loading the data and some pre-processing

2. Processed data (just to get a look at what the plots and data are made from)

3. Functions to create dfs for plotting and creating dfs

4. Plotting function

5. Research Questions and Associated Plots

How does engagement outside-of-class relate to the time in the semester?

How do these relations differ by final exam group?

How does engagement outside-of-class relate to the time before an exam?

How do these relations differ by final exam group?

How does engagement with out-of-class videos in a flipped class change throughout the week?

How do these relations differ by final exam group?