My goal is to integrate allodb – at least a simple subset of it – with bmss. This document has two sections:
In the first section I explore the equations
dataset; I realize that some parameters are missing. This helps me to ask Erika the right question: Where can I find the missing parameters? The answer is, “In the allodb_master
dataset.
In the second section I try to integrate the parameters into the equation
column.
In this section my goal is to explain what columns are missing from the current equations
dataset. I’ll show two examples, starting with a simplistic one.
library(tidyverse)
#> -- Attaching packages --------------------- tidyverse 1.2.1 --
#> v ggplot2 2.2.1 v purrr 0.2.4
#> v tibble 1.4.2 v dplyr 0.7.4
#> v tidyr 0.8.0 v stringr 1.3.0
#> v readr 1.1.1 v forcats 0.3.0
#> -- Conflicts ------------------------ tidyverse_conflicts() --
#> x dplyr::filter() masks stats::filter()
#> x dplyr::lag() masks stats::lag()
library(bmss)
library(allodb)
library(here)
#> here() starts at C:/Users/LeporeM/Dropbox/git_repos/allodb
biomass_per_indiv()
Let’s start with a little reminder of what the current code of bmss does.
This is a simplistic model of what the equations dataset might look like.
eqn <- bmss::toy_default_eqn
eqn
#> # A tibble: 6 x 5
#> site sp eqn eqn_source eqn_type
#> <chr> <chr> <chr> <chr> <chr>
#> 1 bci swars1 30 * dbh default species
#> 2 bci hybapr 40 * dbh default species
#> 3 bci aegipa 8 * dbh default site
#> 4 bci beilpe 8 * dbh default site
#> 5 xxx aaaaaa 9 * dbh default site
#> 6 xxx bbbbbb 9 * dbh default site
Now say that we have a census dataset like this one:
cns <- bmss::user_data
cns
#> # A tibble: 4 x 3
#> site sp dbh
#> <chr> <chr> <int>
#> 1 bci swars1 33
#> 2 bci hybapr 24
#> 3 bci aegipa 12
#> 4 bci beilpe 9
We can calculate biomass with our currently naive biomass_per_ind()
function.
biomass_per_ind(cns, site = "site", sp = "sp", dbh = "dbh")
#> You gave no custom equations.
#> * Using default equations.
#> # A tibble: 4 x 7
#> site sp dbh eqn eqn_source eqn_type bmss
#> <chr> <chr> <int> <chr> <chr> <chr> <dbl>
#> 1 bci swars1 33 30 * dbh default species 264.
#> 2 bci hybapr 24 40 * dbh default species 192.
#> 3 bci aegipa 12 8 * dbh default site 96.
#> 4 bci beilpe 9 8 * dbh default site 72.
biomass_per_indiv()
Let’s brake down what biomass_per_ind()
does to better understand what columns we need from the equations
dataset.
joint <- left_join(cns, eqn)
#> Joining, by = c("site", "sp")
joint
#> # A tibble: 4 x 6
#> site sp dbh eqn eqn_source eqn_type
#> <chr> <chr> <int> <chr> <chr> <chr>
#> 1 bci swars1 33 30 * dbh default species
#> 2 bci hybapr 24 40 * dbh default species
#> 3 bci aegipa 12 8 * dbh default site
#> 4 bci beilpe 9 8 * dbh default site
To make things simpler, let’s keep only the columns that matter.
joint <- select(joint, dbh, eqn)
joint
#> # A tibble: 4 x 2
#> dbh eqn
#> <int> <chr>
#> 1 33 30 * dbh
#> 2 24 40 * dbh
#> 3 12 8 * dbh
#> 4 9 8 * dbh
eqn
with the corresponding value from the column dbh
.joint <- mutate(
joint,
eqn_replaced = str_replace(eqn, "dbh", as.character(dbh))
)
joint
#> # A tibble: 4 x 3
#> dbh eqn eqn_replaced
#> <int> <chr> <chr>
#> 1 33 30 * dbh 30 * 33
#> 2 24 40 * dbh 40 * 24
#> 3 12 8 * dbh 8 * 12
#> 4 9 8 * dbh 8 * 9
eqn_replaced
to calculate biomass (bmss
).dplyr::mutate(
joint,
bmss = map_dbl(.data$eqn_replaced, ~eval(parse(text = .x), envir = joint))
)
#> # A tibble: 4 x 4
#> dbh eqn eqn_replaced bmss
#> <int> <chr> <chr> <dbl>
#> 1 33 30 * dbh 30 * 33 990.
#> 2 24 40 * dbh 40 * 24 960.
#> 3 12 8 * dbh 8 * 12 96.
#> 4 9 8 * dbh 8 * 9 72.
The equations in the equations
dataset have many parameters – not only dbh
. And we need one column for each of those parameters.
Let’s glimpse the dataset, and focus on the equation
column.
glimpse(allodb::equations)
#> Observations: 421
#> Variables: 23
#> $ equation_id <chr> NA, NA, NA, NA, NA, NA, NA, NA, N...
#> $ biomass_component <chr> "Stem and branches (live)", "Stem...
#> $ equation <chr> "a*(DBH^2)^b", "a*(DBH^2)^b", "a+...
#> $ allometry_specificity <chr> "Species", "Species", "Species", ...
#> $ development_species <chr> NA, NA, NA, "Ulmus americana", NA...
#> $ geographic_area <chr> "North Carolina, Georgia", "North...
#> $ dbh_min_cm <chr> "14.22", "29.46", "2.5", "2.5", "...
#> $ dbh_max_cm <chr> "25.91", "41.66", "40", "40", "55...
#> $ n_trees <int> 9, 9, NA, NA, NA, NA, NA, NA, NA,...
#> $ dbh_units_original <chr> "in", "in", "cm", "cm", "mm", "mm...
#> $ biomass_units_original <chr> "lb", "lb", "kg", "kg", "kg", "kg...
#> $ allometry_development_method <chr> "harvest", "harvest", "harvest", ...
#> $ model_parameters <chr> "DBH", "DBH", "DBH", "DBH", "DBH"...
#> $ regression_model <chr> "linear_multiple", "linear_multip...
#> $ other_equations_tested <chr> "yes", "yes", NA, NA, NA, NA, NA,...
#> $ log_biomass <chr> "10", "10", NA, NA, NA, NA, NA, N...
#> $ bias_corrected <chr> "yes", "yes", "no", "no", "no", "...
#> $ bias_correction_factor <chr> "included in model", "included in...
#> $ notes_fitting_model <chr> "Regression equations were develo...
#> $ original_data_availability <chr> "1", "1", NA, NA, NA, NA, NA, NA,...
#> $ notes_to_consider <chr> NA, NA, NA, NA, NA, NA, "DBA = ba...
#> $ warning <chr> NA, NA, NA, NA, NA, NA, NA, NA, N...
#> $ ref_id <chr> NA, NA, NA, NA, NA, NA, NA, NA, N...
The column equation
has many parameters, such as a
, b
and c
. To show how the code may deal with these parameters, I’ll create a small dataset with only the parameter a
.
joint2 <- tibble::tribble(
~eqn, ~a, ~dbh,
"33 * dbh * a", 5, 10,
"24 * dbh * a", 1, 15,
"12 * dbh * a", 2, 23
)
joint2
#> # A tibble: 3 x 3
#> eqn a dbh
#> <chr> <dbl> <dbl>
#> 1 33 * dbh * a 5. 10.
#> 2 24 * dbh * a 1. 15.
#> 3 12 * dbh * a 2. 23.
And I will assume that this dataset is already merged with the user data, so it already has a dbh
column – that is, here I start at the step 2 above.
eqn
with the corresponding value from the column dbh
.a
.joint2 <- mutate(
joint2,
eqn_replaced = str_replace(eqn, "dbh", as.character(dbh)),
eqn_replaced = str_replace(eqn_replaced, "a", as.character(a))
)
joint2
#> # A tibble: 3 x 4
#> eqn a dbh eqn_replaced
#> <chr> <dbl> <dbl> <chr>
#> 1 33 * dbh * a 5. 10. 33 * 10 * 5
#> 2 24 * dbh * a 1. 15. 24 * 15 * 1
#> 3 12 * dbh * a 2. 23. 12 * 23 * 2
eqn_replaced
to calculate biomass (bmss
).dplyr::mutate(
joint2,
bmss = map_dbl(.data$eqn_replaced, ~eval(parse(text = .x), envir = joint2))
)
#> # A tibble: 3 x 5
#> eqn a dbh eqn_replaced bmss
#> <chr> <dbl> <dbl> <chr> <dbl>
#> 1 33 * dbh * a 5. 10. 33 * 10 * 5 1650.
#> 2 24 * dbh * a 1. 15. 24 * 15 * 1 360.
#> 3 12 * dbh * a 2. 23. 12 * 23 * 2 552.
In this section I show that we can programatically replace the parameters a
-d
from equations$equation
. Also I identify a few parameters – other than a
-d
and DBH
– that are missing and we’ll need to somehow feed into the code for it to to its job.
Get data.
master <- readr::read_csv(
here::here("data-raw/allodb_master.csv"),
col_types = allodb::type_allodb_master()
)
# Tidy
master <- master %>%
# Add spacing for readability (http://style.tidyverse.org/syntax.html#spacing)
mutate(
equation = formatR::tidy_source(text = equation)[[1]]
) %>%
# Show most key column first
select(equation, everything())
master
#> # A tibble: 421 x 40
#> equation site family species species_code life_form wsg wsg_id
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 a * (DBH^2~ Lilly~ Fabace~ Robinia~ 901 Tree 0.6 <NA>
#> 2 a * (DBH^2~ Lilly~ Fabace~ Robinia~ 901 Tree 0.66 <NA>
#> 3 a * (DBH^b) Lilly~ Ulmace~ Ulmus a~ 972 Tree 0.46 <NA>
#> 4 a * (DBH^b) Lilly~ Ulmace~ Ulmus r~ 975 Tree 0.48 <NA>
#> 5 a + b * DB~ Lilly~ Oleace~ Fraxinu~ 541 Tree 0.51 <NA>
#> 6 a + b * DB~ Lilly~ Oleace~ Fraxinu~ 544 Tree 0.53 <NA>
#> 7 a * (DBA^b) Lilly~ Hamame~ Hamamel~ 498 Shrub 0.71 <NA>
#> 8 a * (DBH^b) Lilly~ Cupres~ Juniper~ 68 Tree 0.44 <NA>
#> 9 a * (DBH^b) Lilly~ Fagace~ Fagus g~ 531 Tree 0.56 <NA>
#> 10 a * (DBH^b) Lilly~ Fagace~ Quercus~ 802 Tree 0.6 <NA>
#> # ... with 411 more rows, and 32 more variables: wsg_specificity <chr>,
#> # a <dbl>, b <dbl>, c <dbl>, d <dbl>, dbh_min_cm <chr>,
#> # dbh_max_cm <chr>, n_trees <int>, dbh_units_original <chr>,
#> # biomass_units_original <chr>, allometry_development_method <chr>,
#> # site_dbh_unit <chr>, equation_id <chr>, equation_grouping <chr>,
#> # independent_variable <chr>, regression_model <chr>,
#> # other_equations_tested <chr>, log_biomass <chr>, bias_corrected <chr>,
#> # bias_correction_factor <chr>, notes_fitting_model <chr>,
#> # dependent_variable_biomass_component <chr>,
#> # allometry_specificity <chr>, development_species <chr>,
#> # geographic_area <chr>, biomass_equation_source <chr>, ref_id <chr>,
#> # wsg_source <chr>, ref_wsg_id <chr>, original_data_availability <chr>,
#> # notes_to_consider <chr>, warning <chr>
Explore distinct equations
master %>%
select(equation) %>%
distinct() %>%
print(n = nrow(.))
#> # A tibble: 17 x 1
#> equation
#> <chr>
#> 1 a * (DBH^2)^b
#> 2 a * (DBH^b)
#> 3 a + b * DBH + c * (DBH^d)
#> 4 a * (DBA^b)
#> 5 exp(a + b * ln(DBH))
#> 6 exp(a + b * DBH + c * (ln(DBH^d)))
#> 7 10^a + b * (log10(DBH^c))
#> 8 a + b * DBH
#> 9 a + b * BA
#> 10 exp(a + b * ln(DBA))
#> 11 exp(a + (b * ln(DBH))) * 419.814 * 1.22
#> 12 exp(a + (b * ln(DBH))) * 645.704 * 1.05
#> 13 10^a * DBH^b
#> 14 a + (b * DBH) + c * (DBH^2) + d * (DBH^3)
#> 15 NA
#> 16 exp(a + (b * (ln(pi * DBH))))
#> 17 exp(a + b * (DBH/DBH + c))
Notice NA
s. What should we do with this?
is_missing <- is.na(master$equation) | grepl("NA", master$equation)
master %>%
# Find both: missing value `NA` and the literal string "NA"
filter(is_missing)
#> # A tibble: 2 x 40
#> equation site family species species_code life_form wsg wsg_id
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 NA Wind Ri~ Unkown Unknown Un~ UNKN Tree 0.43 <NA>
#> 2 NA Yosemite Unkown Unknown Un~ UNKN <NA> <NA> <NA>
#> # ... with 32 more variables: wsg_specificity <chr>, a <dbl>, b <dbl>,
#> # c <dbl>, d <dbl>, dbh_min_cm <chr>, dbh_max_cm <chr>, n_trees <int>,
#> # dbh_units_original <chr>, biomass_units_original <chr>,
#> # allometry_development_method <chr>, site_dbh_unit <chr>,
#> # equation_id <chr>, equation_grouping <chr>,
#> # independent_variable <chr>, regression_model <chr>,
#> # other_equations_tested <chr>, log_biomass <chr>, bias_corrected <chr>,
#> # bias_correction_factor <chr>, notes_fitting_model <chr>,
#> # dependent_variable_biomass_component <chr>,
#> # allometry_specificity <chr>, development_species <chr>,
#> # geographic_area <chr>, biomass_equation_source <chr>, ref_id <chr>,
#> # wsg_source <chr>, ref_wsg_id <chr>, original_data_availability <chr>,
#> # notes_to_consider <chr>, warning <chr>
In the meantime, I keep only non missing equations.
non_missing <- master %>% filter(!is_missing)
Replacing a-d.
replaced <- non_missing %>%
mutate(
eqn_replaced = str_replace_all(equation, "a", as.character(a)),
) %>%
mutate(
eqn_replaced = str_replace_all(eqn_replaced, "b", as.character(b)),
eqn_replaced = str_replace_all(eqn_replaced, "c", as.character(c)),
eqn_replaced = str_replace_all(eqn_replaced, "d", as.character(d)),
) %>%
select(eqn_replaced)
replaced
#> # A tibble: 419 x 1
#> eqn_replaced
#> <chr>
#> 1 4.13741 * (DBH^2)^1.08876
#> 2 1.04649 * (DBH^2)^1.37539
#> 3 0.08248 * (DBH^2.468)
#> 4 0.08248 * (DBH^2.468)
#> 5 3.203 + -0.234 * DBH + 0.006 * (DBH^2)
#> 6 3.203 + -0.234 * DBH + 0.006 * (DBH^2)
#> 7 38.111 * (DBA^2.9)
#> 8 0.1632 * (DBH^2.2454)
#> 9 2.0394 * (DBH^2.5715)
#> 10 1.5647 * (DBH^2.6887)
#> # ... with 409 more rows