Configuration and datasets
# Installing
install.packages("fst")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
install.packages("readr")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
install.packages("tidyverse")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
install.packages("moderndive")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
# Loading
library(fst)
library(readr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6 ✔ dplyr 1.0.9
## ✔ tibble 3.1.7 ✔ stringr 1.4.0
## ✔ tidyr 1.2.0 ✔ forcats 0.5.1
## ✔ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(moderndive)
# Reading
churn <- read_fst("churn.fst")
taiwan_real_estate <- read_fst("taiwan_real_estate2.fst")
auctions_raw <- read_csv("auction.csv")
## Rows: 10681 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): bidder, item, auction_type
## dbl (6): auctionid, bid, bidtime, bidderrate, openbid, price
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Palm pilot has most data, no outliers
auctions <- auctions_raw %>%
filter(item == "Palm Pilot M515 PDA") %>%
select(price, openbid, auction_type)
write_fst(auctions, "auctions.fst")
Parallel Slopes
# Fit a linear regr'n of price_twd_msq vs. n_convenience
mdl_price_vs_conv <- lm(price_twd_msq ~ n_convenience, taiwan_real_estate)
# See the result
mdl_price_vs_conv
##
## Call:
## lm(formula = price_twd_msq ~ n_convenience, data = taiwan_real_estate)
##
## Coefficients:
## (Intercept) n_convenience
## 8.2242 0.7981
# Fit a linear regr'n of price_twd_msq vs. house_age_years, no intercept
mdl_price_vs_age <- lm(price_twd_msq ~ house_age_years + 0, taiwan_real_estate)
# See the result
mdl_price_vs_age
##
## Call:
## lm(formula = price_twd_msq ~ house_age_years + 0, data = taiwan_real_estate)
##
## Coefficients:
## house_age_years0 to 15 house_age_years15 to 30 house_age_years30 to 45
## 12.637 9.877 11.393
# Fit a linear regr'n of price_twd_msq vs. n_convenience plus house_age_years, no intercept
mdl_price_vs_both <- lm(price_twd_msq ~ n_convenience + house_age_years + 0, taiwan_real_estate)
# See the result
mdl_price_vs_both
##
## Call:
## lm(formula = price_twd_msq ~ n_convenience + house_age_years +
## 0, data = taiwan_real_estate)
##
## Coefficients:
## n_convenience house_age_years0 to 15 house_age_years15 to 30
## 0.7915 9.4133 7.0852
## house_age_years30 to 45
## 7.5110
# Using taiwan_real_estate, plot price_twd_msq vs. n_convenience
ggplot(taiwan_real_estate, aes(n_convenience, price_twd_msq)) +
# Add a point layer
geom_point() +
# Add a smooth trend line using linear regr'n, no ribbon
geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

# Using taiwan_real_estate, plot price_twd_msq vs. house_age_years
ggplot(taiwan_real_estate, aes(house_age_years, price_twd_msq)) +
# Add a box plot layer
geom_boxplot() + stat_summary(fun.y = mean, shape =15)
## Warning: `fun.y` is deprecated. Use `fun` instead.
## Warning: Removed 3 rows containing missing values (geom_segment).

# Using taiwan_real_estate, plot price_twd_msq vs. n_convenience colored by house_age_years
ggplot(taiwan_real_estate, aes(n_convenience, price_twd_msq, color = house_age_years)) +
# Add a point layer
geom_point() +
# Add parallel slopes, no ribbon
geom_parallel_slopes(se = FALSE)

# Make a grid of explanatory data
explanatory_data <- expand_grid(
# Set n_convenience to zero to ten
n_convenience = seq(0, 10, 1), # or n_convenience = 0:10,
# Set house_age_years to the unique values of that variable
house_age_years = unique(taiwan_real_estate$house_age_years)
)
# See the result
explanatory_data
## # A tibble: 33 × 2
## n_convenience house_age_years
## <dbl> <fct>
## 1 0 30 to 45
## 2 0 15 to 30
## 3 0 0 to 15
## 4 1 30 to 45
## 5 1 15 to 30
## 6 1 0 to 15
## 7 2 30 to 45
## 8 2 15 to 30
## 9 2 0 to 15
## 10 3 30 to 45
## # … with 23 more rows
# Add predictions to the data frame
prediction_data <- explanatory_data %>%
mutate(
price_twd_msq = predict(mdl_price_vs_both, explanatory_data)
)
# See the result
prediction_data
## # A tibble: 33 × 3
## n_convenience house_age_years price_twd_msq
## <dbl> <fct> <dbl>
## 1 0 30 to 45 7.51
## 2 0 15 to 30 7.09
## 3 0 0 to 15 9.41
## 4 1 30 to 45 8.30
## 5 1 15 to 30 7.88
## 6 1 0 to 15 10.2
## 7 2 30 to 45 9.09
## 8 2 15 to 30 8.67
## 9 2 0 to 15 11.0
## 10 3 30 to 45 9.89
## # … with 23 more rows
# Plot result
taiwan_real_estate %>%
ggplot(aes(n_convenience, price_twd_msq, color = house_age_years)) +
geom_point() +
geom_parallel_slopes(se = FALSE) +
# Add points using prediction_data, with size 5 and shape 15
geom_point(
data = prediction_data,
size = 5, shape = 15)

# Get the coefficients from mdl_price_vs_both
coeffs <- coefficients(mdl_price_vs_both)
# Extract the slope coefficient
slope <- coeffs[1]
# Extract the intercept coefficient for 0 to 15
intercept_0_15 <- coeffs[2]
# Extract the intercept coefficient for 15 to 30
intercept_15_30 <- coeffs[3]
# Extract the intercept coefficient for 30 to 45
intercept_30_45 <- coeffs[4]
prediction_data <- explanatory_data %>%
mutate(
# Consider the 3 cases to choose the intercept
intercept = case_when(
house_age_years == "0 to 15" ~ intercept_0_15,
house_age_years == "15 to 30" ~ intercept_15_30,
house_age_years == "30 to 45" ~ intercept_30_45,
),
# Manually calculate the predictions
price_twd_msq = intercept + slope * n_convenience
)
# See the results
prediction_data
## # A tibble: 33 × 4
## n_convenience house_age_years intercept price_twd_msq
## <dbl> <fct> <dbl> <dbl>
## 1 0 30 to 45 7.51 7.51
## 2 0 15 to 30 7.09 7.09
## 3 0 0 to 15 9.41 9.41
## 4 1 30 to 45 7.51 8.30
## 5 1 15 to 30 7.09 7.88
## 6 1 0 to 15 9.41 10.2
## 7 2 30 to 45 7.51 9.09
## 8 2 15 to 30 7.09 8.67
## 9 2 0 to 15 9.41 11.0
## 10 3 30 to 45 7.51 9.89
## # … with 23 more rows