Chapter 5 Multi-Layer NN Model
This chapter presents the final functional-programming model. It uses functions to define ‘neural networks’, perform forward propagation, and run gradient descent. A section at the end details future components that could be added.
5.1 Generate Data
For now, the data have 3 inputs that are combined to create y, plus a random error term. The setup will likely be tweaked later.
## create data:
m <- 1000
n_1_manual <- 3
n_L_manual <- 1

# initialize Xs
X <- data.frame(X1 = runif(n = m, min = -10, max = 10),
                X2 = rnorm(n = m, mean = 0, sd = 10),
                X3 = rexp(n = m, rate = 1)) %>%
  as.matrix(nrow = m,
            ncol = n_1_manual)

# get response
Y <- X[, 1] + 10 * sin(X[, 2])^2 + 10 * X[, 3] + rnorm(n = 1000)

# fix dims according to NN specs
X <- t(X)
Y <- t(Y)
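The rest of the chapter expects features in rows and observations in columns, which is why X and Y are transposed above. A quick check of the resulting dimensions (an illustrative aside, not part of the original pipeline):

## illustrative dimension check
dim(X)  # 3 1000  (inputs x observations)
dim(Y)  # 1 1000  (outputs x observations)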
5.2 Functions
5.2.1 Link Functions
## Specify Link Functions & Derivatives
get_link <- function(type = "sigmoid") {

  if (type == "identity") {
    # identity
    g <- function(x) {x}

  } else if (type == "sigmoid") {
    # sigmoid
    g <- function(x) {1 / (1 + exp(-x))}

  } else if (type == "relu") {
    # ReLU
    g <- function(x) {x * as.numeric(x > 0)}

  } else (return(NULL))

  return(g)
}

get_link_prime <- function(type = "sigmoid") {

  if (type == "identity") {
    # identity [FIX]
    g_prime <- function(x) {
      ## there's probably a better way to do this
      b <- x / x
      b[is.nan(x / x)] <- 1
      return(b)
    }

  } else if (type == "sigmoid") {
    # sigmoid
    g_prime <- function(x) {exp(-x) / (1 + exp(-x))^2}

  } else if (type == "relu") {
    # ReLU
    g_prime <- function(x) {as.numeric(x > 0)}

  } else (return(NULL))

  return(g_prime)
}
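A quick spot check that the factories return usable closures (an illustrative aside; the demo names are not part of the model, and the expected values follow directly from the definitions above):

## illustrative spot check of the link factories
g_demo <- get_link("relu")
g_demo_prime <- get_link_prime("relu")
g_demo(c(-2, 0, 3))        # 0 0 3
g_demo_prime(c(-2, 0, 3))  # 0 0 1
get_link("sigmoid")(0)     # 0.5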
5.2.2 Loss Functions
## Specify Loss Functions & Derivatives
get_loss_function <- function(type = "squared_error") {

  if (type == "squared_error") {
    loss <- function(y_hat, y) {sum((y_hat - y)^2)}

  } else if (type == "cross_entropy") {
    # negative log-likelihood (sign flipped so that smaller is better)
    loss <- function(y_hat, y) {-sum(y * log(y_hat))}

  } else (return(NULL))

  return(loss)
}

get_loss_prime <- function(type = "squared_error") {

  if (type == "squared_error") {
    loss_prime <- function(y_hat, y) {sum(2 * (y_hat - y))}

  } else if (type == "cross_entropy") {
    # placeholder -- not implemented yet
    loss_prime <- function(y_hat, y) {999}

  } else (return(NULL))

  ## note: GD_iter() below currently hard-codes the squared-error gradient,
  ## so these derivatives are not used yet
  return(loss_prime)
}
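As with the links, the loss factory can be spot-checked directly (illustrative only; the expected value follows from the squared-error definition above):

## illustrative spot check of the squared-error loss
loss_demo <- get_loss_function("squared_error")
loss_demo(y_hat = c(1, 2), y = c(0, 0))  # 1 + 4 = 5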
5.2.3 Misc Helpers
## creates a list of n empty lists
create_lists <- function(n) {
  out <- list()

  for (i in 1:n) {
    out[[i]] <- list()
  }
  return(out)
}

## friendlier diag() function
diag_D <- function(x) {

  if (length(x) == 1) {
    out <- x
  } else {
    out <- diag(as.numeric(x))
  }
  return(out)
}
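diag_D() exists because base diag() is overloaded: given a length-1 numeric k it builds a k x k identity matrix instead of a 1 x 1 matrix, which would break the per-layer D matrices whenever a layer has a single unit. An illustrative comparison:

## illustrative: why diag_D() wraps diag()
diag(3)          # 3 x 3 identity matrix
diag_D(3)        # length-1 input is passed through unchanged
diag_D(c(1, 2))  # 2 x 2 diagonal matrix with 1 and 2 on the diagonal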
generate_layer_sizes <- function(X,
                                 Y,
                                 hidden_layer_sizes) {
  return(c(nrow(X), hidden_layer_sizes, nrow(Y)))
}

initialize_NN <- function(layer_sizes,
                          activation_function = "sigmoid",
                          last_activation_function = "identity",
                          lower_bound = 0,
                          upper_bound = 1) {
  n <- layer_sizes

  ## initialize parameter matrices
  W <- list()
  b <- list()

  ## could vectorize w/ mapply()
  for (l in 2:length(n)) {
    W[[l]] <- matrix(data = runif(n = n[l - 1] * n[l],
                                  min = lower_bound,
                                  max = upper_bound),
                     nrow = n[l],
                     ncol = n[l - 1])

    b[[l]] <- matrix(data = runif(n = n[l],
                                  min = lower_bound,
                                  max = upper_bound),
                     nrow = n[l],
                     ncol = 1)
  }

  ## return
  return(list(W = W,
              b = b,
              activation_function = activation_function,
              last_activation_function = last_activation_function))
}
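To see what the initializer returns, here is an illustrative 3-4-1 example (tmp_NN is just a throwaway demo name). Note that W[[1]] and b[[1]] are left empty because the input layer carries no parameters:

## illustrative: inspect an initialized 3-4-1 network
tmp_NN <- initialize_NN(layer_sizes = c(3, 4, 1))
dim(tmp_NN$W[[2]])  # 4 3
dim(tmp_NN$W[[3]])  # 1 4
dim(tmp_NN$b[[3]])  # 1 1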
5.2.4 Forward Propagation
NN_output <- function(X,
                      NN_obj) {
  L <- length(NN_obj$W)

  ## if X is one obs, input will be a vector so dim will be null
  m <- ifelse(is.null(ncol(X)),
              1,
              ncol(X))

  g <- get_link(NN_obj$activation_function)
  g_last <- get_link(NN_obj$last_activation_function)

  a <- list()

  a[[1]] <- X

  for (l in 2:(L - 1)) {
    a[[l]] <- g(NN_obj$W[[l]] %*% a[[l - 1]] + matrix(data = rep(x = NN_obj$b[[l]],
                                                                 times = m),
                                                      ncol = m))
  }

  a[[L]] <- g_last(NN_obj$W[[L]] %*% a[[L - 1]] + matrix(data = rep(x = NN_obj$b[[L]],
                                                                    times = m),
                                                         ncol = m))
  return(a[[L]])
}
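NN_output() pushes the whole design matrix through the network at once: each layer computes g(W %*% a + b), with the bias column replicated across the m observations. An illustrative call, assuming the X and Y matrices from Section 5.1 (demo_NN is just a throwaway name):

## illustrative: predictions come back as a (1 x m) matrix
demo_NN <- initialize_NN(layer_sizes = generate_layer_sizes(X = X,
                                                            Y = Y,
                                                            hidden_layer_sizes = c(3)))
dim(NN_output(X = X, NN_obj = demo_NN))  # 1 1000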
5.2.5 Gradient Descent Iteration
GD_iter <- function(NN_obj,
                    X,
                    Y,
                    rho = 1,
                    verbose = FALSE,
                    very_verbose = FALSE) {
  L <- length(NN_obj$W)

  ## if X is one obs, input will be a vector so dim will be null
  m <- ifelse(is.null(ncol(X)),
              1,
              ncol(X))

  ## get links
  g <- get_link(NN_obj$activation_function)
  g_prime <- get_link_prime(NN_obj$activation_function)
  g_last <- get_link(NN_obj$last_activation_function)
  g_last_prime <- get_link_prime(NN_obj$last_activation_function)

  z <- create_lists(L)
  a <- create_lists(L)
  D <- create_lists(L)
  delta <- create_lists(L)
  del_W <- create_lists(L)
  del_b <- create_lists(L)

  ## gradient descent
  for (i in 1:m) {
    ## forward
    a[[1]][[i]] <- X[, i]

    for (l in 2:(L - 1)) {
      z[[l]][[i]] <- NN_obj$W[[l]] %*% a[[l - 1]][[i]] + NN_obj$b[[l]]
      a[[l]][[i]] <- g(z[[l]][[i]])
      D[[l]][[i]] <- diag_D(g_prime(z[[l]][[i]]))

      if (very_verbose == TRUE) {print(paste0("Forward: obs ", i, " - layer ", l))}
    }

    ## last layer
    z[[L]][[i]] <- NN_obj$W[[L]] %*% a[[L - 1]][[i]] + NN_obj$b[[L]]
    a[[L]][[i]] <- g_last(z[[L]][[i]])
    D[[L]][[i]] <- diag_D(g_last_prime(z[[L]][[i]]))

    ## backward
    # eventually fix to match with loss function
    delta[[L]][[i]] <- D[[L]][[i]] %*% (a[[L]][[i]] - Y[, i])

    for (l in (L - 1):2) {
      delta[[l]][[i]] <- D[[l]][[i]] %*% t(NN_obj$W[[l + 1]]) %*% delta[[l + 1]][[i]]

      if (very_verbose == TRUE) {print(paste0("Backward: obs ", i, " - layer ", l))}
    }

    for (l in 2:L) {
      del_W[[l]][[i]] <- delta[[l]][[i]] %*% t(a[[l - 1]][[i]])
      del_b[[l]][[i]] <- delta[[l]][[i]]

      if (very_verbose == TRUE) {print(paste0("del: obs ", i, " - layer ", l))}
    }

    if ((verbose == TRUE) & (i %% 100 == 0)) {print(paste("obs", i, "/", m))}
  }

  ## update parameters
  # get averages
  ## del_W is a list where each element represents a layer
  ## in each layer, there's a list representing the layer's result for that obs
  ## here we collapse the results by summing the gradients, then divide by m
  del_W_all <- lapply(X = del_W,
                      FUN = Reduce,
                      f = "+") %>%
    lapply(X = .,
           FUN = function(x) x / m)

  del_b_all <- lapply(X = del_b,
                      FUN = Reduce,
                      f = "+") %>%
    lapply(X = .,
           FUN = function(x) x / m)

  # apply gradient
  W_out <- mapply(FUN = function(A, del_A) {A - rho * del_A},
                  A = NN_obj$W,
                  del_A = del_W_all)
  b_out <- mapply(FUN = function(A, del_A) {A - rho * del_A},
                  A = NN_obj$b,
                  del_A = del_b_all)

  ## return a new NN object
  return(list(W = W_out,
              b = b_out,
              activation_function = NN_obj$activation_function,
              last_activation_function = NN_obj$last_activation_function))
}
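GD_iter() performs one full-batch update: it forward-propagates each observation, backpropagates the (currently hard-coded squared-error) gradient, averages the per-observation gradients, and returns a new NN object with the same structure. An illustrative single step, assuming the X and Y matrices from Section 5.1 (demo names only):

## illustrative: one gradient-descent step on a fresh network
demo_NN <- initialize_NN(layer_sizes = generate_layer_sizes(X = X,
                                                            Y = Y,
                                                            hidden_layer_sizes = c(3)))
demo_NN_step <- GD_iter(NN_obj = demo_NN, X = X, Y = Y, rho = 0.001)

loss_demo <- get_loss_function("squared_error")
loss_demo(y_hat = NN_output(X = X, NN_obj = demo_NN), y = Y)       # loss before the step
loss_demo(y_hat = NN_output(X = X, NN_obj = demo_NN_step), y = Y)  # usually lower after the step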
5.2.6 Perform Gradient Descent
GD_perform <- function(X,
                       Y,
                       init_NN_obj,
                       rho = 0.01,
                       loss_function = "squared_error",
                       threshold = 1,
                       max_iter = 100,
                       print_descent = FALSE) {
  ## setup
  done_decreasing <- FALSE

  objective_function <- get_loss_function(type = loss_function)

  iteration_outputs <- list()
  output_objectives <- numeric()

  iteration_input <- init_NN_obj

  iter <- 1

  initial_objective <- objective_function(y = Y,
                                          y_hat = NN_output(X = X,
                                                            NN_obj = init_NN_obj))

  if (print_descent == TRUE) {
    print(paste0("iter: ", 0, "; obj: ", round(initial_objective, 1)))
  }

  while ((!done_decreasing) & (iter < max_iter)) {
    ## get input loss
    in_objective <- objective_function(y = Y,
                                       y_hat = NN_output(X = X,
                                                         NN_obj = iteration_input))

    ## iterate
    iteration_output <- GD_iter(NN_obj = iteration_input,
                                X = X,
                                Y = Y,
                                rho = rho,
                                verbose = FALSE,
                                very_verbose = FALSE)

    ## outputs
    out_objective <- objective_function(y = Y,
                                        y_hat = NN_output(X = X,
                                                          NN_obj = iteration_output))

    iteration_input <- iteration_output
    iteration_outputs[[iter]] <- iteration_output
    output_objectives[[iter]] <- out_objective

    if (print_descent == TRUE) {
      print(paste0("iter: ", iter, "; obj: ", round(out_objective, 1)))
    }

    iter <- iter + 1

    ## evaluate
    if (abs(in_objective - out_objective) < threshold) {
      done_decreasing <- TRUE
    }
  }

  return(list(final_NN = iteration_output,
              intermediate_NN = iteration_outputs,
              output_objectives = output_objectives,
              initial_objective = initial_objective,
              params = list(rho = rho,
                            loss_function = loss_function,
                            initial_NN = init_NN_obj)))
}
5.2.7 Summary Functions
GD_plot <- function(GD_obj) {

  data.frame(x = 1:length(GD_obj$output_objectives),
             y = GD_obj$output_objectives) %>%
    ggplot(aes(x = x,
               y = y)) +
    geom_point() +
    theme_bw() +
    labs(x = "Iteration",
         y = "Loss")
}

GD_summary <- function(GD_obj,
                       print_summary = TRUE) {
  ## num iter
  num_iter <- length(GD_obj$output_objectives)

  ## loss improvement
  initial_objective <- GD_obj$initial_objective %>% round(1)
  final_objective <- last(GD_obj$output_objectives) %>% round(1)
  loss_improvement_ratio <- (final_objective / initial_objective) %>% round(4)

  if (print_summary == TRUE) {
    ## prints
    cat(paste0("Gradient Descent Summary:", "\n",
               " |", "\n",
               " | Number of Iterations: ", num_iter, "\n",
               " |", "\n",
               " | Initial Objective: ", initial_objective, "\n",
               " | Final Objective: ", final_objective, "\n",
               " | Ratio: ", loss_improvement_ratio, "\n", "\n"))

    cat(paste0("----------------------------------------", "\n",
               "Initial W:", "\n", "\n"))
    print(GD_obj$params$initial_NN$W[-1])

    cat(paste0("----------------------------------------", "\n",
               "Final W:", "\n", "\n"))
    print(GD_obj$final_NN$W[-1])

    cat(paste0("----------------------------------------", "\n",
               "Initial b:", "\n", "\n"))
    print(GD_obj$params$initial_NN$b[-1])

    cat(paste0("----------------------------------------", "\n",
               "Final b:", "\n", "\n"))
    print(GD_obj$final_NN$b[-1])
  }

  return(list(num_iter = num_iter,
              initial_objective = initial_objective,
              final_objective = final_objective,
              loss_improvement_ratio = loss_improvement_ratio))
}
5.3 Test
## initialize NN
init_NN <- initialize_NN(layer_sizes = generate_layer_sizes(X = X,
                                                            Y = Y,
                                                            hidden_layer_sizes = c(3)),
                         activation_function = "relu",
                         last_activation_function = "identity",
                         lower_bound = 0,
                         upper_bound = 1)

## train NN
GD_NN <- GD_perform(X = X,
                    Y = Y,
                    init_NN_obj = init_NN,
                    rho = 0.001,
                    loss_function = "squared_error",
                    threshold = 100,
                    max_iter = 1000,
                    print_descent = FALSE)

final_NN <- GD_NN$final_NN

## Summaries
NN_sum <- GD_summary(GD_obj = GD_NN)
## Gradient Descent Summary:
## |
## | Number of Iterations: 144
## |
## | Initial Objective: 238990.1
## | Final Objective: 16834.7
## | Ratio: 0.0704
##
## ----------------------------------------
## Initial W:
##
## [[1]]
## [,1] [,2] [,3]
## [1,] 0.3028608 0.5655690 0.95437084
## [2,] 0.5046878 0.5267906 0.02262433
## [3,] 0.2724395 0.2474305 0.40767901
##
## [[2]]
## [,1] [,2] [,3]
## [1,] 0.4370019 0.7598625 0.8453935
##
## ----------------------------------------
## Final W:
##
## [[1]]
## X1 X2 X3
## [1,] -0.001842719 0.03928530 1.9846660
## [2,] 0.695969115 0.12981398 0.7693617
## [3,] 0.187358905 -0.08034643 2.0087035
##
## [[2]]
## [,1] [,2] [,3]
## [1,] 1.699798 1.149463 2.324604
##
## ----------------------------------------
## Initial b:
##
## [[1]]
## [,1]
## [1,] 0.007521215
## [2,] 0.425427183
## [3,] 0.791751480
##
## [[2]]
## [,1]
## [1,] 0.3945866
##
## ----------------------------------------
## Final b:
##
## [[1]]
## [,1]
## [1,] 0.2857580
## [2,] 0.6093002
## [3,] 1.2551489
##
## [[2]]
## [,1]
## [1,] 0.7948542
GD_plot(GD_NN)
5.4 Next Steps
In the future:
- need some sort of divergence check / pick ‘best so far’ output (a rough sketch follows this list)
- vis for gradient descent — pick 2 vars and for every combo of those 2, plot the objective function
- vis for gradient descent — show the evolution of the var through gradient descent over iterations
- NN overall vis & perhaps animation
- multi-dimensional output (cat / 1-hot)
- different cost functions (softmax squared-error & cross-entropy)
- ‘from scratch’ from scratch: write mmult (matrix multiplication) by hand, and maybe go further
- get ‘best-case’ / perfect objective function (if data creation process known)
- stochastic gradient descent, minibatches (what gets passed down to GD_iter from GD_perform)
- regularization methods & cross-validation
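As a first pass at the ‘best so far’ idea above, a small helper could scan an existing GD_perform() result and pull out the iteration with the lowest objective. This is only a sketch; GD_best is a hypothetical name and nothing below is wired into the current model:

## hypothetical helper: pick the best-so-far network out of a GD_perform() result
GD_best <- function(GD_obj) {
  best_iter <- which.min(GD_obj$output_objectives)
  return(list(best_NN = GD_obj$intermediate_NN[[best_iter]],
              best_objective = GD_obj$output_objectives[best_iter],
              best_iter = best_iter))
}

## e.g. GD_best(GD_NN)$best_objective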