My goal is to integrate allodb – at least a simple subset of it – with bmss. This document has two sections:

Section 1

In this section my goal is to explain what columns are missing from the current equations dataset. I’ll show two examples, starting with a simplistic one.

library(tidyverse)
#> -- Attaching packages --------------------- tidyverse 1.2.1 --
#> v ggplot2 2.2.1     v purrr   0.2.4
#> v tibble  1.4.2     v dplyr   0.7.4
#> v tidyr   0.8.0     v stringr 1.3.0
#> v readr   1.1.1     v forcats 0.3.0
#> -- Conflicts ------------------------ tidyverse_conflicts() --
#> x dplyr::filter() masks stats::filter()
#> x dplyr::lag()    masks stats::lag()
library(bmss)
library(allodb)
library(here)
#> here() starts at C:/Users/LeporeM/Dropbox/git_repos/allodb

1. Simplistic example

Revisiting biomass_per_indiv()

Let’s start with a little reminder of what the current code of bmss does.

This is a simplistic model of what the equations dataset might look like.

eqn <- bmss::toy_default_eqn

eqn
#> # A tibble: 6 x 5
#>   site  sp     eqn      eqn_source eqn_type
#>   <chr> <chr>  <chr>    <chr>      <chr>   
#> 1 bci   swars1 30 * dbh default    species 
#> 2 bci   hybapr 40 * dbh default    species 
#> 3 bci   aegipa 8 * dbh  default    site    
#> 4 bci   beilpe 8 * dbh  default    site    
#> 5 xxx   aaaaaa 9 * dbh  default    site    
#> 6 xxx   bbbbbb 9 * dbh  default    site

Now say that we have a census dataset like this one:

cns <- bmss::user_data

cns
#> # A tibble: 4 x 3
#>   site  sp       dbh
#>   <chr> <chr>  <int>
#> 1 bci   swars1    33
#> 2 bci   hybapr    24
#> 3 bci   aegipa    12
#> 4 bci   beilpe     9

We can calculate biomass with our currently naive biomass_per_ind() function.

biomass_per_ind(cns, site = "site", sp = "sp", dbh = "dbh")
#> You gave no custom equations.
#>   * Using default equations.
#> # A tibble: 4 x 7
#>   site  sp       dbh eqn      eqn_source eqn_type  bmss
#>   <chr> <chr>  <int> <chr>    <chr>      <chr>    <dbl>
#> 1 bci   swars1    33 30 * dbh default    species   264.
#> 2 bci   hybapr    24 40 * dbh default    species   192.
#> 3 bci   aegipa    12 8 * dbh  default    site       96.
#> 4 bci   beilpe     9 8 * dbh  default    site       72.

Braking down biomass_per_indiv()

Let’s brake down what biomass_per_ind() does to better understand what columns we need from the equations dataset.

  • Step 1: Join the census data with the equations data.
joint <- left_join(cns, eqn)
#> Joining, by = c("site", "sp")

joint
#> # A tibble: 4 x 6
#>   site  sp       dbh eqn      eqn_source eqn_type
#>   <chr> <chr>  <int> <chr>    <chr>      <chr>   
#> 1 bci   swars1    33 30 * dbh default    species 
#> 2 bci   hybapr    24 40 * dbh default    species 
#> 3 bci   aegipa    12 8 * dbh  default    site    
#> 4 bci   beilpe     9 8 * dbh  default    site

To make things simpler, let’s keep only the columns that matter.

joint <- select(joint, dbh, eqn)

joint
#> # A tibble: 4 x 2
#>     dbh eqn     
#>   <int> <chr>   
#> 1    33 30 * dbh
#> 2    24 40 * dbh
#> 3    12 8 * dbh 
#> 4     9 8 * dbh
  • Step 2: Replace the string “dbh” of the column eqn with the corresponding value from the column dbh.
joint <- mutate(
  joint, 
  eqn_replaced = str_replace(eqn, "dbh", as.character(dbh))
)

joint
#> # A tibble: 4 x 3
#>     dbh eqn      eqn_replaced
#>   <int> <chr>    <chr>       
#> 1    33 30 * dbh 30 * 33     
#> 2    24 40 * dbh 40 * 24     
#> 3    12 8 * dbh  8 * 12      
#> 4     9 8 * dbh  8 * 9
  • Step 3: Evaluate the equation strings in eqn_replaced to calculate biomass (bmss).
dplyr::mutate(
  joint, 
  bmss = map_dbl(.data$eqn_replaced, ~eval(parse(text = .x), envir = joint))
)
#> # A tibble: 4 x 4
#>     dbh eqn      eqn_replaced  bmss
#>   <int> <chr>    <chr>        <dbl>
#> 1    33 30 * dbh 30 * 33       990.
#> 2    24 40 * dbh 40 * 24       960.
#> 3    12 8 * dbh  8 * 12         96.
#> 4     9 8 * dbh  8 * 9          72.

2. An example that is a bit more realistic

The equations in the equations dataset have many parameters – not only dbh. And we need one column for each of those parameters.

Let’s glimpse the dataset, and focus on the equation column.

glimpse(allodb::equations)
#> Observations: 421
#> Variables: 23
#> $ equation_id                  <chr> NA, NA, NA, NA, NA, NA, NA, NA, N...
#> $ biomass_component            <chr> "Stem and branches (live)", "Stem...
#> $ equation                     <chr> "a*(DBH^2)^b", "a*(DBH^2)^b", "a+...
#> $ allometry_specificity        <chr> "Species", "Species", "Species", ...
#> $ development_species          <chr> NA, NA, NA, "Ulmus americana", NA...
#> $ geographic_area              <chr> "North Carolina, Georgia", "North...
#> $ dbh_min_cm                   <chr> "14.22", "29.46", "2.5", "2.5", "...
#> $ dbh_max_cm                   <chr> "25.91", "41.66", "40", "40", "55...
#> $ n_trees                      <int> 9, 9, NA, NA, NA, NA, NA, NA, NA,...
#> $ dbh_units_original           <chr> "in", "in", "cm", "cm", "mm", "mm...
#> $ biomass_units_original       <chr> "lb", "lb", "kg", "kg", "kg", "kg...
#> $ allometry_development_method <chr> "harvest", "harvest", "harvest", ...
#> $ model_parameters             <chr> "DBH", "DBH", "DBH", "DBH", "DBH"...
#> $ regression_model             <chr> "linear_multiple", "linear_multip...
#> $ other_equations_tested       <chr> "yes", "yes", NA, NA, NA, NA, NA,...
#> $ log_biomass                  <chr> "10", "10", NA, NA, NA, NA, NA, N...
#> $ bias_corrected               <chr> "yes", "yes", "no", "no", "no", "...
#> $ bias_correction_factor       <chr> "included in model", "included in...
#> $ notes_fitting_model          <chr> "Regression equations were develo...
#> $ original_data_availability   <chr> "1", "1", NA, NA, NA, NA, NA, NA,...
#> $ notes_to_consider            <chr> NA, NA, NA, NA, NA, NA, "DBA = ba...
#> $ warning                      <chr> NA, NA, NA, NA, NA, NA, NA, NA, N...
#> $ ref_id                       <chr> NA, NA, NA, NA, NA, NA, NA, NA, N...

The column equation has many parameters, such as a, b and c. To show how the code may deal with these parameters, I’ll create a small dataset with only the parameter a.

joint2 <- tibble::tribble(
            ~eqn, ~a, ~dbh,
  "33 * dbh * a",  5,   10,
  "24 * dbh * a",  1,   15,
  "12 * dbh * a",  2,   23
)

joint2
#> # A tibble: 3 x 3
#>   eqn              a   dbh
#>   <chr>        <dbl> <dbl>
#> 1 33 * dbh * a    5.   10.
#> 2 24 * dbh * a    1.   15.
#> 3 12 * dbh * a    2.   23.

And I will assume that this dataset is already merged with the user data, so it already has a dbh column – that is, here I start at the step 2 above.

  • Step 2:
    • Replace the string “dbh” of the column eqn with the corresponding value from the column dbh.
    • Same with a.
joint2 <- mutate(
  joint2, 
  eqn_replaced = str_replace(eqn, "dbh", as.character(dbh)),
  eqn_replaced = str_replace(eqn_replaced, "a", as.character(a))
)

joint2
#> # A tibble: 3 x 4
#>   eqn              a   dbh eqn_replaced
#>   <chr>        <dbl> <dbl> <chr>       
#> 1 33 * dbh * a    5.   10. 33 * 10 * 5 
#> 2 24 * dbh * a    1.   15. 24 * 15 * 1 
#> 3 12 * dbh * a    2.   23. 12 * 23 * 2
  • Step 3: Evaluate the equation strings in eqn_replaced to calculate biomass (bmss).
dplyr::mutate(
  joint2, 
  bmss = map_dbl(.data$eqn_replaced, ~eval(parse(text = .x), envir = joint2))
)
#> # A tibble: 3 x 5
#>   eqn              a   dbh eqn_replaced  bmss
#>   <chr>        <dbl> <dbl> <chr>        <dbl>
#> 1 33 * dbh * a    5.   10. 33 * 10 * 5  1650.
#> 2 24 * dbh * a    1.   15. 24 * 15 * 1   360.
#> 3 12 * dbh * a    2.   23. 12 * 23 * 2   552.

Section 2

In this section I show that we can programatically replace the parameters a-d from equations$equation. Also I identify a few parameters – other than a-d and DBH – that are missing and we’ll need to somehow feed into the code for it to to its job.

Get data.

master <- readr::read_csv(
  here::here("data-raw/allodb_master.csv"),
  col_types = allodb::type_allodb_master()
)

# Tidy
master <- master %>% 
  # Add spacing for readability (http://style.tidyverse.org/syntax.html#spacing)
  mutate(
    equation = formatR::tidy_source(text = equation)[[1]]
  ) %>% 
  # Show most key column first
  select(equation, everything())
master
#> # A tibble: 421 x 40
#>    equation    site   family  species  species_code life_form wsg   wsg_id
#>    <chr>       <chr>  <chr>   <chr>    <chr>        <chr>     <chr> <chr> 
#>  1 a * (DBH^2~ Lilly~ Fabace~ Robinia~ 901          Tree      0.6   <NA>  
#>  2 a * (DBH^2~ Lilly~ Fabace~ Robinia~ 901          Tree      0.66  <NA>  
#>  3 a * (DBH^b) Lilly~ Ulmace~ Ulmus a~ 972          Tree      0.46  <NA>  
#>  4 a * (DBH^b) Lilly~ Ulmace~ Ulmus r~ 975          Tree      0.48  <NA>  
#>  5 a + b * DB~ Lilly~ Oleace~ Fraxinu~ 541          Tree      0.51  <NA>  
#>  6 a + b * DB~ Lilly~ Oleace~ Fraxinu~ 544          Tree      0.53  <NA>  
#>  7 a * (DBA^b) Lilly~ Hamame~ Hamamel~ 498          Shrub     0.71  <NA>  
#>  8 a * (DBH^b) Lilly~ Cupres~ Juniper~ 68           Tree      0.44  <NA>  
#>  9 a * (DBH^b) Lilly~ Fagace~ Fagus g~ 531          Tree      0.56  <NA>  
#> 10 a * (DBH^b) Lilly~ Fagace~ Quercus~ 802          Tree      0.6   <NA>  
#> # ... with 411 more rows, and 32 more variables: wsg_specificity <chr>,
#> #   a <dbl>, b <dbl>, c <dbl>, d <dbl>, dbh_min_cm <chr>,
#> #   dbh_max_cm <chr>, n_trees <int>, dbh_units_original <chr>,
#> #   biomass_units_original <chr>, allometry_development_method <chr>,
#> #   site_dbh_unit <chr>, equation_id <chr>, equation_grouping <chr>,
#> #   independent_variable <chr>, regression_model <chr>,
#> #   other_equations_tested <chr>, log_biomass <chr>, bias_corrected <chr>,
#> #   bias_correction_factor <chr>, notes_fitting_model <chr>,
#> #   dependent_variable_biomass_component <chr>,
#> #   allometry_specificity <chr>, development_species <chr>,
#> #   geographic_area <chr>, biomass_equation_source <chr>, ref_id <chr>,
#> #   wsg_source <chr>, ref_wsg_id <chr>, original_data_availability <chr>,
#> #   notes_to_consider <chr>, warning <chr>

Explore distinct equations

master %>%
  select(equation) %>% 
  distinct() %>% 
  print(n = nrow(.))
#> # A tibble: 17 x 1
#>    equation                                 
#>    <chr>                                    
#>  1 a * (DBH^2)^b                            
#>  2 a * (DBH^b)                              
#>  3 a + b * DBH + c * (DBH^d)                
#>  4 a * (DBA^b)                              
#>  5 exp(a + b * ln(DBH))                     
#>  6 exp(a + b * DBH + c * (ln(DBH^d)))       
#>  7 10^a + b * (log10(DBH^c))                
#>  8 a + b * DBH                              
#>  9 a + b * BA                               
#> 10 exp(a + b * ln(DBA))                     
#> 11 exp(a + (b * ln(DBH))) * 419.814 * 1.22  
#> 12 exp(a + (b * ln(DBH))) * 645.704 * 1.05  
#> 13 10^a * DBH^b                             
#> 14 a + (b * DBH) + c * (DBH^2) + d * (DBH^3)
#> 15 NA                                       
#> 16 exp(a + (b * (ln(pi * DBH))))            
#> 17 exp(a + b * (DBH/DBH + c))

Notice NAs. What should we do with this?

is_missing <- is.na(master$equation) | grepl("NA", master$equation)
master %>% 
  # Find both: missing value `NA` and the literal string "NA"
  filter(is_missing)
#> # A tibble: 2 x 40
#>   equation site     family species     species_code life_form wsg   wsg_id
#>   <chr>    <chr>    <chr>  <chr>       <chr>        <chr>     <chr> <chr> 
#> 1 NA       Wind Ri~ Unkown Unknown Un~ UNKN         Tree      0.43  <NA>  
#> 2 NA       Yosemite Unkown Unknown Un~ UNKN         <NA>      <NA>  <NA>  
#> # ... with 32 more variables: wsg_specificity <chr>, a <dbl>, b <dbl>,
#> #   c <dbl>, d <dbl>, dbh_min_cm <chr>, dbh_max_cm <chr>, n_trees <int>,
#> #   dbh_units_original <chr>, biomass_units_original <chr>,
#> #   allometry_development_method <chr>, site_dbh_unit <chr>,
#> #   equation_id <chr>, equation_grouping <chr>,
#> #   independent_variable <chr>, regression_model <chr>,
#> #   other_equations_tested <chr>, log_biomass <chr>, bias_corrected <chr>,
#> #   bias_correction_factor <chr>, notes_fitting_model <chr>,
#> #   dependent_variable_biomass_component <chr>,
#> #   allometry_specificity <chr>, development_species <chr>,
#> #   geographic_area <chr>, biomass_equation_source <chr>, ref_id <chr>,
#> #   wsg_source <chr>, ref_wsg_id <chr>, original_data_availability <chr>,
#> #   notes_to_consider <chr>, warning <chr>

In the meantime, I keep only non missing equations.

non_missing <- master %>% filter(!is_missing)

Replacing a-d.

replaced <- non_missing %>%
  mutate(
    eqn_replaced = str_replace_all(equation, "a", as.character(a)),
  ) %>% 
  mutate(
    eqn_replaced = str_replace_all(eqn_replaced, "b", as.character(b)),
    eqn_replaced = str_replace_all(eqn_replaced, "c", as.character(c)),
    eqn_replaced = str_replace_all(eqn_replaced, "d", as.character(d)),
  ) %>% 
  select(eqn_replaced)

replaced
#> # A tibble: 419 x 1
#>    eqn_replaced                          
#>    <chr>                                 
#>  1 4.13741 * (DBH^2)^1.08876             
#>  2 1.04649 * (DBH^2)^1.37539             
#>  3 0.08248 * (DBH^2.468)                 
#>  4 0.08248 * (DBH^2.468)                 
#>  5 3.203 + -0.234 * DBH + 0.006 * (DBH^2)
#>  6 3.203 + -0.234 * DBH + 0.006 * (DBH^2)
#>  7 38.111 * (DBA^2.9)                    
#>  8 0.1632 * (DBH^2.2454)                 
#>  9 2.0394 * (DBH^2.5715)                 
#> 10 1.5647 * (DBH^2.6887)                 
#> # ... with 409 more rows