Chapter 3 Digit Model
In preparation for neural networks, this brief chapter runs other models on the MNIST hand-written digit data. First we fit a separate binomial (one-vs-rest) GLM for each digit and take the digit with the highest predicted probability as the prediction; then we fit a multinomial GLM that models the probability of every digit simultaneously.
This chapter can safely be skipped.
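As a small sketch of the one-vs-rest decision rule used below (the object names here are illustrative, not from the code that follows): each of the ten fitted models returns a probability that an image is its digit, and the prediction is the digit whose model is most confident.

## p_hat: a length-10 vector of per-digit probabilities for one image,
## e.g. p_hat[1] is the probability from the "is this a 0?" model
p_hat <- c(0.02, 0.01, 0.05, 0.90, 0.01, 0.12, 0.03, 0.02, 0.08, 0.01)
predicted_digit <- which.max(p_hat) - 1   # digits are 0-9, R indices are 1-10
predicted_digit
## [1] 3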
3.1 Binomial Model
3.1.1 Setup
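For the code in this chapter to run on its own, the following packages are assumed to be attached (keras for dataset_mnist(), glmnet for cv.glmnet(), and dplyr for %>% and select()):

## packages assumed throughout this chapter
library(keras)
library(glmnet)
library(dplyr)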
# Loads the MNIST dataset, saves as an .RData file if not in WD
if (!(file.exists("mnist_data.RData"))) {
  # ## installs older python version
  # reticulate::install_python("3.10:latest")
  # keras::install_keras(python_version = "3.10")
  # ## re-loads keras
  # library(keras)
  ## get MNIST data
  mnist <- dataset_mnist()
  ## save to WD as .RData
  save(mnist, file = "mnist_data.RData")
} else {
  ## read-in MNIST data
  load(file = "mnist_data.RData")
}

# Access the training and testing sets
x_train <- mnist$train$x
y_train <- mnist$train$y
x_test <- mnist$test$x
y_test <- mnist$test$y

rm(mnist)
## plot function, from OG data
plot_mnist <- function(plt) {
  ## create image
  image(x = 1:28,
        y = 1:28,
        ## image is oriented incorrectly, this fixes it
        z = t(apply(plt, 2, rev)),
        ## 255:0 puts black on white canvas,
        ## changing to 0:255 puts white on black canvas
        col = gray((255:0)/255),
        axes = FALSE)
  ## create plot border
  rect(xleft = 0.5,
       ybottom = 0.5,
       xright = 28 + 0.5,
       ytop = 28 + 0.5,
       border = "black",
       lwd = 1)
}
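A quick usage check (a sketch, not part of the pipeline below): plot the first raw training image.

## plot the first training digit as a 28x28 grayscale image
plot_mnist(x_train[1, , ])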
## train data
# initialize matrix
x_train_2 <- matrix(nrow = nrow(x_train),
                    ncol = 28*28)

## likely a faster way to do this in the future
for (i in 1:nrow(x_train)) {
  ## get each layer's matrix image, stretch to 28^2 x 1
  x_train_2[i, ] <- matrix(x_train[i, , ], 1, 28*28)
}

x_train_2 <- x_train_2 %>%
  as.data.frame()

## test data
x_test_2 <- matrix(nrow = nrow(x_test),
                   ncol = 28*28)

for (i in 1:nrow(x_test)) {
  x_test_2[i, ] <- matrix(x_test[i, , ], 1, 28*28)
}

x_test_2 <- x_test_2 %>%
  as.data.frame()

## re-scale data
x_train_2 <- x_train_2 / 256
x_test_2 <- x_test_2 / 256

## response
# x_test_2$y <- y_test
# x_train_2$y <- y_train
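For the "faster way" noted in the loop above, the whole reshape can be done in one step by giving the array matrix dimensions; because R stores arrays column-major, this flattens each image in the same order as the loop. A sketch (not used for the results below):

## reshape all 60,000 training images at once: (n, 28, 28) array -> (n, 784) matrix
x_train_fast <- x_train
dim(x_train_fast) <- c(dim(x_train)[1], 28 * 28)
x_train_fast <- as.data.frame(x_train_fast) / 256

## should match the loop-built version
# all(as.matrix(x_train_fast) == as.matrix(x_train_2))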
3.1.2 Model
## for speed
# n <- nrow(x_train_2)
n <- 100

indices <- sample(x = 1:nrow(x_train_2),
                  size = n)

## init data
x_glm <- x_train_2[indices, ]
y_glm <- y_train[indices]
train_pred <- list()

## drop cols with all 0s
x_glm <- x_glm[, (colSums(x_glm) > 0)]

## 10 model method: one binomial lasso per digit
for (i in 0:9) {
  print(i)
  y_glm_i <- (y_glm == i)

  init_model <- cv.glmnet(x = x_glm %>% as.matrix,
                          y = y_glm_i,
                          family = "binomial",
                          alpha = 1)

  train_pred[[i + 1]] <- predict(init_model,
                                 x_glm %>% as.matrix,
                                 s = init_model$lambda.min,
                                 type = "response")
}
## [1] 0
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## format results
predictions <- data.frame(train_pred)
names(predictions) <- c("zero",
                        "one",
                        "two",
                        "three",
                        "four",
                        "five",
                        "six",
                        "seven",
                        "eight",
                        "nine")

#write.csv(predictions, "pred.csv", row.names = FALSE)

## convert to numeric: take the column (digit) with the highest probability
max_col <- apply(X = predictions,
                 MARGIN = 1,
                 FUN = function(x) names(x)[which.max(x)])

word_to_number <- c("zero" = 0,
                    "one" = 1,
                    "two" = 2,
                    "three" = 3,
                    "four" = 4,
                    "five" = 5,
                    "six" = 6,
                    "seven" = 7,
                    "eight" = 8,
                    "nine" = 9)

preds <- word_to_number[max_col] %>% as.numeric
## confusion matrix
table(y_glm, preds)
## preds
## y_glm 0 1 2 3 4 5 6 7 8 9
## 0 10 0 0 0 0 0 0 0 0 0
## 1 0 12 0 0 0 0 0 0 0 0
## 2 0 0 7 0 0 0 0 0 0 0
## 3 0 0 0 9 0 0 0 0 0 0
## 4 0 0 0 0 10 0 0 0 0 0
## 5 0 0 0 0 0 16 0 0 0 0
## 6 0 0 0 0 0 0 12 0 0 0
## 7 0 0 0 0 0 0 0 11 0 0
## 8 2 0 1 0 0 3 0 0 3 0
## 9 0 0 0 0 2 0 0 0 0 2
## misclassification rate
mean(!(y_glm == preds))
## [1] 0.08
3.2 Multinomial Model
3.2.1 Setup
# Loads the MNIST dataset, saves as an .RData file if not in WD
if (!(file.exists("mnist_data.RData"))) {
  # ## installs older python version
  # reticulate::install_python("3.10:latest")
  # keras::install_keras(python_version = "3.10")
  # ## re-loads keras
  # library(keras)
  ## get MNIST data
  mnist <- dataset_mnist()
  ## save to WD as .RData
  save(mnist, file = "mnist_data.RData")
} else {
  ## read-in MNIST data
  load(file = "mnist_data.RData")
}

# Access the training and testing sets
x_train <- mnist$train$x
y_train <- mnist$train$y
x_test <- mnist$test$x
y_test <- mnist$test$y

rm(mnist)
## plot function
plot_mnist_array <- function(plt, main_label = NA, color = FALSE, dim_n = 28) {

  ## setup color
  if (color == TRUE) {
    colfunc <- colorRampPalette(c("red", "white", "blue"))

    min_abs <- -max(abs(range(plt)))
    max_abs <- max(abs(range(plt)))

    col <- colfunc(256)
  } else {
    col <- gray((255:0)/255)
    min_abs <- 0
    max_abs <- 255
  }

  ## create image
  image(x = 1:dim_n,
        y = 1:dim_n,
        ## image is oriented incorrectly, this fixes it
        z = t(apply(plt, 2, rev)),
        col = col,
        zlim = c(min_abs, max_abs),
        axes = FALSE,
        xlab = NA,
        ylab = NA)

  ## create plot border (dim_n rather than a hard-coded 28, so 18x18 images also work)
  rect(xleft = 0.5,
       ybottom = 0.5,
       xright = dim_n + 0.5,
       ytop = dim_n + 0.5,
       border = "black",
       lwd = 1)

  ## display prediction result
  text(x = 2,
       y = dim_n - 3,
       labels = ifelse(is.na(main_label),
                       "",
                       main_label),
       col = ifelse(color == TRUE,
                    "black",
                    "red"),
       cex = 1.5)
}
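A quick usage check (a sketch): plot one test image with its true label overlaid, the way misclassified digits are displayed later in this section.

## grayscale digit with a label drawn in the top-left corner
plot_mnist_array(plt = x_test[1, , ],
                 main_label = y_test[1])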
## train data
# initialize matrix
x_train_2 <- matrix(nrow = nrow(x_train),
                    ncol = 28*28)

## likely a faster way to do this in the future
for (i in 1:nrow(x_train)) {
  ## get each layer's matrix image, stretch to 28^2 x 1
  x_train_2[i, ] <- matrix(x_train[i, , ], 1, 28*28)
}

x_train_2 <- x_train_2 %>%
  as.data.frame()

## test data
x_test_2 <- matrix(nrow = nrow(x_test),
                   ncol = 28*28)

for (i in 1:nrow(x_test)) {
  x_test_2[i, ] <- matrix(x_test[i, , ], 1, 28*28)
}

x_test_2 <- x_test_2 %>%
  as.data.frame()

## re-scale data
x_train_2 <- x_train_2 / 256
x_test_2 <- x_test_2 / 256

## response
# x_test_2$y <- y_test
# x_train_2$y <- y_train
3.2.2 Model
3.2.2.1 train
## set training data size
# n <- nrow(x_train_2)
n <- 100

indices <- sample(x = 1:nrow(x_train_2),
                  size = n)

## init data
x_multi <- x_train_2[indices, ]
y_multi <- y_train[indices]

## drop cols with all 0s
#x_multi <- x_multi[, (colSums(x_multi) > 0)]

## for the sake of the coefficients viz, setting alpha = 0 (ridge)
init_model <- cv.glmnet(x = x_multi %>% as.matrix,
                        y = y_multi %>% factor,
                        family = "multinomial",
                        alpha = 0)
## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
multi_model <- predict(init_model,
                       x_multi %>% as.matrix,
                       s = init_model$lambda.min,
                       type = "response")

## format results
## predict() returns an (observations x classes x lambda) array; [, , 1] drops
## the lambda dimension since only a single s was requested
preds_init <- multi_model[, , 1] %>%
  as.data.frame()

preds <- apply(X = preds_init,
               MARGIN = 1,
               FUN = function(x) names(which.max(x)) %>% as.numeric)
## TRAIN confusion matrix
table(y_multi, preds)
## preds
## y_multi 0 1 2 3 4 5 6 7 8 9
## 0 11 0 0 0 0 0 0 0 1 0
## 1 0 14 0 0 0 0 0 0 0 0
## 2 0 1 4 0 0 0 0 0 0 0
## 3 0 0 0 12 0 0 0 0 0 0
## 4 0 0 0 0 11 0 0 0 0 0
## 5 0 0 0 0 0 4 0 0 0 0
## 6 0 0 0 0 0 0 11 0 0 0
## 7 0 2 0 0 0 0 0 6 0 0
## 8 0 1 0 0 0 0 0 0 13 0
## 9 0 0 0 0 1 0 0 0 0 8
## TRAIN misclassification rate
mean(!(y_multi == preds))
## [1] 0.06
3.2.2.2 test
## pre-process data: keep the same columns used to fit the model
x_multi_test <- x_test_2 %>%
  select(all_of(names(x_multi)))

## get preds
multi_model_test <- predict(init_model,
                            x_multi_test %>% as.matrix,
                            s = init_model$lambda.min,
                            type = "response")

## format results
preds_init_test <- multi_model_test[, , 1] %>%
  as.data.frame()

preds_test <- apply(X = preds_init_test,
                    MARGIN = 1,
                    FUN = function(x) names(which.max(x)) %>% as.numeric)
## TEST confusion matrix
table(y_test, preds_test)
## preds_test
## y_test 0 1 2 3 4 5 6 7 8 9
## 0 859 1 2 10 7 16 26 1 55 3
## 1 0 1102 0 5 4 1 4 0 19 0
## 2 62 102 243 226 55 68 93 11 170 2
## 3 12 46 35 693 3 5 9 23 178 6
## 4 7 36 2 0 817 13 39 6 23 39
## 5 78 94 8 119 40 102 48 16 364 23
## 6 37 19 3 28 94 2 760 0 15 0
## 7 7 91 0 2 37 1 1 614 38 237
## 8 19 74 2 41 7 20 16 4 768 23
## 9 19 41 0 8 289 8 0 129 127 388
## TEST misclassification rate
mean(!(y_test == preds_test))
## [1] 0.3654
## sort vectors so outputs are grouped by true digit
x_test_sort <- x_test[order(y_test), , ]
y_test_sort <- y_test[order(y_test)]
preds_test_sort <- preds_test[order(y_test)]

## get misclassified obs
wrong <- which(!(y_test_sort == preds_test_sort))

## plot a sample of misclassified obs
plot_wrong <- wrong[sample(x = 1:length(wrong), size = 3*8*6)] %>%
  sort()

## plot params
par(mfcol = c(8, 6))
par(mar = c(0, 0, 0, 0))

for (i in plot_wrong) {
  plot_mnist_array(plt = x_test_sort[i, , ],
                   main_label = preds_test_sort[i])
}

par(mfcol = c(1, 1))
3.2.3 model heatmaps
## get coefficients into matrices
model_coef <- coef(init_model, s = init_model$lambda.min) %>%
  lapply(as.matrix) %>%
  ## drop the intercept and restore the 28x28 image shape
  lapply(function(x) matrix(x[-1, ], nrow = 28, ncol = 28)) %>%
  ## take sigmoid activation just to help viz
  lapply(function(x) 1 / (1 + exp(-x)) - 0.5)

mapply(FUN = plot_mnist_array,
       plt = model_coef,
       main_label = names(model_coef),
       color = TRUE)
## $`0`
## NULL
##
## $`1`
## NULL
##
## $`2`
## NULL
##
## $`3`
## NULL
##
## $`4`
## NULL
##
## $`5`
## NULL
##
## $`6`
## NULL
##
## $`7`
## NULL
##
## $`8`
## NULL
##
## $`9`
## NULL
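The $`0` / NULL lines above are the return values of mapply(): plot_mnist_array() draws a plot and returns NULL for each class. Wrapping the call in invisible() should suppress that printout (a sketch):

## same heatmaps, without printing the list of NULL return values
invisible(mapply(FUN = plot_mnist_array,
                 plt = model_coef,
                 main_label = names(model_coef),
                 color = TRUE))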
3.2.4 no outside cells model
Earlier runs of the sections above revealed that, with a regularization method that does not perform variable selection (ridge, alpha = 0), odd importance is given to the outermost cells for prediction. Those cells will therefore be removed, keeping only the central 18x18 block of each image:
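The index vector used below packs a lot in. As a sketch of where it comes from: with the column-major flattening used earlier, pixel (row r, column c) sits at position (c - 1) * 28 + r, so keeping rows and columns 6 through 23 (a 5-pixel border dropped on each side) gives the same indices via, for example:

## an equivalent, more explicit construction of the 18x18 inner-block indices
inner_idx <- as.vector(outer(6:23, 6:23,
                             FUN = function(r, c) (c - 1) * 28 + r))

## should equal the rep()/seq() expression used below
# all(inner_idx == rep(seq(146, 622, 28), each = 18) + rep(0:17, times = 18))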
## set training data size
# n <- nrow(x_train_2)
n <- 100

indices <- sample(x = 1:nrow(x_train_2),
                  size = n)

## init data
x_multi <- x_train_2[indices, ]
y_multi <- y_train[indices]

## drop outer cells, keeping the central 18x18 block
x_multi <- x_multi[, rep(seq(146, 622, 28), each = 18) + rep(0:17, times = 18)]

## for the sake of the coefficients viz, setting alpha = 0
init_model <- cv.glmnet(x = x_multi %>% as.matrix,
                        y = y_multi %>% factor,
                        family = "multinomial",
                        alpha = 0)
## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
multi_model <- predict(init_model,
                       x_multi %>% as.matrix,
                       s = init_model$lambda.min,
                       type = "response")

## get coefficients into matrices
model_coef <- coef(init_model, s = init_model$lambda.min) %>%
  lapply(as.matrix) %>%
  ## drop the intercept and restore the 18x18 image shape
  lapply(function(x) matrix(x[-1, ], nrow = 18, ncol = 18)) %>%
  ## take sigmoid activation just to help viz
  lapply(function(x) 1 / (1 + exp(-x)) - 0.5)

mapply(FUN = plot_mnist_array,
       plt = model_coef,
       main_label = names(model_coef),
       color = TRUE,
       dim_n = 18)
## $`0`
## NULL
##
## $`1`
## NULL
##
## $`2`
## NULL
##
## $`3`
## NULL
##
## $`4`
## NULL
##
## $`5`
## NULL
##
## $`6`
## NULL
##
## $`7`
## NULL
##
## $`8`
## NULL
##
## $`9`
## NULL