Configuration and datasets

# Installing
install.packages("fst")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
install.packages("readr")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
install.packages("tidyverse")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
install.packages("moderndive")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
# Loading
library(fst)
library(readr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6     ✔ dplyr   1.0.9
## ✔ tibble  3.1.7     ✔ stringr 1.4.0
## ✔ tidyr   1.2.0     ✔ forcats 0.5.1
## ✔ purrr   0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(moderndive)

# Reading
churn <- read_fst("churn.fst")
taiwan_real_estate <- read_fst("taiwan_real_estate2.fst")

auctions_raw <- read_csv("auction.csv")
## Rows: 10681 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): bidder, item, auction_type
## dbl (6): auctionid, bid, bidtime, bidderrate, openbid, price
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Palm pilot has most data, no outliers
auctions <- auctions_raw %>%
  filter(item == "Palm Pilot M515 PDA") %>%
  select(price, openbid, auction_type)

write_fst(auctions, "auctions.fst")

Parallel Slopes

# Fit a linear regr'n of price_twd_msq vs. n_convenience
mdl_price_vs_conv <- lm(price_twd_msq ~ n_convenience, taiwan_real_estate)

# See the result
mdl_price_vs_conv
## 
## Call:
## lm(formula = price_twd_msq ~ n_convenience, data = taiwan_real_estate)
## 
## Coefficients:
##   (Intercept)  n_convenience  
##        8.2242         0.7981
# Fit a linear regr'n of price_twd_msq vs. house_age_years, no intercept
mdl_price_vs_age <- lm(price_twd_msq ~ house_age_years + 0, taiwan_real_estate)

# See the result
mdl_price_vs_age
## 
## Call:
## lm(formula = price_twd_msq ~ house_age_years + 0, data = taiwan_real_estate)
## 
## Coefficients:
##  house_age_years0 to 15  house_age_years15 to 30  house_age_years30 to 45  
##                  12.637                    9.877                   11.393
# Fit a linear regr'n of price_twd_msq vs. n_convenience plus house_age_years, no intercept
mdl_price_vs_both <- lm(price_twd_msq ~ n_convenience + house_age_years + 0, taiwan_real_estate)

# See the result
mdl_price_vs_both
## 
## Call:
## lm(formula = price_twd_msq ~ n_convenience + house_age_years + 
##     0, data = taiwan_real_estate)
## 
## Coefficients:
##           n_convenience   house_age_years0 to 15  house_age_years15 to 30  
##                  0.7915                   9.4133                   7.0852  
## house_age_years30 to 45  
##                  7.5110
# Using taiwan_real_estate, plot price_twd_msq vs. n_convenience
ggplot(taiwan_real_estate, aes(n_convenience, price_twd_msq)) +
  # Add a point layer
  geom_point() +
  # Add a smooth trend line using linear regr'n, no ribbon
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

# Using taiwan_real_estate, plot price_twd_msq vs. house_age_years
ggplot(taiwan_real_estate, aes(house_age_years, price_twd_msq)) +
  # Add a box plot layer
  geom_boxplot() +  stat_summary(fun.y = mean, shape =15)
## Warning: `fun.y` is deprecated. Use `fun` instead.
## Warning: Removed 3 rows containing missing values (geom_segment).

# Using taiwan_real_estate, plot price_twd_msq vs. n_convenience colored by house_age_years
ggplot(taiwan_real_estate, aes(n_convenience, price_twd_msq, color = house_age_years)) +
  # Add a point layer
  geom_point() +
  # Add parallel slopes, no ribbon
  geom_parallel_slopes(se = FALSE)

# Make a grid of explanatory data
explanatory_data <- expand_grid(
  # Set n_convenience to zero to ten
  n_convenience = seq(0, 10, 1), # or n_convenience = 0:10,
  # Set house_age_years to the unique values of that variable
  house_age_years = unique(taiwan_real_estate$house_age_years)
)

# See the result
explanatory_data
## # A tibble: 33 × 2
##    n_convenience house_age_years
##            <dbl> <fct>          
##  1             0 30 to 45       
##  2             0 15 to 30       
##  3             0 0 to 15        
##  4             1 30 to 45       
##  5             1 15 to 30       
##  6             1 0 to 15        
##  7             2 30 to 45       
##  8             2 15 to 30       
##  9             2 0 to 15        
## 10             3 30 to 45       
## # … with 23 more rows
# Add predictions to the data frame
prediction_data <- explanatory_data %>% 
  mutate(
    price_twd_msq = predict(mdl_price_vs_both, explanatory_data)
  )

# See the result
prediction_data
## # A tibble: 33 × 3
##    n_convenience house_age_years price_twd_msq
##            <dbl> <fct>                   <dbl>
##  1             0 30 to 45                 7.51
##  2             0 15 to 30                 7.09
##  3             0 0 to 15                  9.41
##  4             1 30 to 45                 8.30
##  5             1 15 to 30                 7.88
##  6             1 0 to 15                 10.2 
##  7             2 30 to 45                 9.09
##  8             2 15 to 30                 8.67
##  9             2 0 to 15                 11.0 
## 10             3 30 to 45                 9.89
## # … with 23 more rows
# Plot result
taiwan_real_estate %>% 
  ggplot(aes(n_convenience, price_twd_msq, color = house_age_years)) +
  geom_point() +
  geom_parallel_slopes(se = FALSE) +
  # Add points using prediction_data, with size 5 and shape 15
  geom_point(    
    data = prediction_data,    
    size = 5, shape = 15)

# Get the coefficients from mdl_price_vs_both
coeffs <- coefficients(mdl_price_vs_both)

# Extract the slope coefficient
slope <- coeffs[1]

# Extract the intercept coefficient for 0 to 15
intercept_0_15 <- coeffs[2]

# Extract the intercept coefficient for 15 to 30
intercept_15_30 <- coeffs[3]

# Extract the intercept coefficient for 30 to 45
intercept_30_45 <- coeffs[4]

prediction_data <- explanatory_data %>% 
  mutate(
    # Consider the 3 cases to choose the intercept
    intercept = case_when(
      house_age_years == "0 to 15" ~ intercept_0_15,
      house_age_years == "15 to 30" ~ intercept_15_30,
      house_age_years == "30 to 45" ~ intercept_30_45,
    ),
    
    # Manually calculate the predictions
    price_twd_msq = intercept + slope * n_convenience
  )

# See the results
prediction_data
## # A tibble: 33 × 4
##    n_convenience house_age_years intercept price_twd_msq
##            <dbl> <fct>               <dbl>         <dbl>
##  1             0 30 to 45             7.51          7.51
##  2             0 15 to 30             7.09          7.09
##  3             0 0 to 15              9.41          9.41
##  4             1 30 to 45             7.51          8.30
##  5             1 15 to 30             7.09          7.88
##  6             1 0 to 15              9.41         10.2 
##  7             2 30 to 45             7.51          9.09
##  8             2 15 to 30             7.09          8.67
##  9             2 0 to 15              9.41         11.0 
## 10             3 30 to 45             7.51          9.89
## # … with 23 more rows

Interactions

Multiple Linear Regression

Multiple Logistic Regression