Overview

This document wrangles fieldwork data collected with the software FastField. Jess Sue designed the data structure, and the job she required is done via the function FastField::xl_sheets_to_csv() of the qcr package.

FastField::xl_sheets_to_csv() does all of this:

Setup

Install FastField if it is missing.

missing_remotes <- !require(FastField)
if (missing_remotes) {
  install.packages("remotes")
}

missing_FastField <- !require(FastField)
if (missing_FastField) {
  remotes::install_github("forestgeo/FastField")
}

Load required packages.

library(FastField)

# I will referr to functions' source explicitely -- as `package::function()`
library(dplyr)
library(DT)
library(fgeo.tool)
library(fs)
library(purrr)
library(readr)
library(tibble)
library(tidyr)

Hint

Check these articles if you struggle to install, load, or use R packages:


View the contents of the input directory.

fs::dir_ls(params$input_dir)
#> input/101_2018-01-05.xlsx input/101_2018-01-06.xlsx

Work

Do the job (the output is silent.)

FastField::xl_sheets_to_csv(params$input_dir, params$output_dir)

Note

FastField::xl_sheets_to_csv() expects that the names of the spreadsheets and columns of each spreadsheet will be these ones:

one_excel_file <- fs::dir_ls(params$input_dir)[[1]]
df_list <- fgeo.tool::ls_list_spreadsheets(one_excel_file)
purrr::map(df_list, names) %>% 
  tibble::enframe(name = "spreadsheet", value = "column names") %>% 
  tidyr::unnest() %>% 
  DT::datatable()

This is rigid but makes the interface extreemely simple and thus easy to use. If you change the names of the spreadsheets or the names of the columns in each spreadsheet please let me know (maurolepore@gmail.com). These are the options: (1) To make FastField::xl_sheets_to_csv() more flexible at the cost of making it also more complex and a little harder to use; or (2) To update FastField::xl_sheets_to_csv() to hard-wire the new names.


Review

This section reviews the output. You can safely ignore the code because the implementation details are unimportant.

Confirm that the output directory contains the expected .csv files.

csv_files_in_output <- fs::dir_ls(params$output_dir, regexp = ".csv$")
csv_files_in_output
#> output/101_2018-01-05.csv output/101_2018-01-06.csv

Combine and the resulting .csv files to make it easier to explore the data.

names_of_csv_files <- sub(".csv$", "", basename(csv_files_in_output))

combined <- csv_files_in_output %>% 
  purrr::map(read_csv) %>% 
  purrr::set_names(names_of_csv_files) %>% 
  tibble::enframe(name = "csv_filename") %>% 
  tidyr::unnest()

DT::datatable(combined)

Observe that some tags have missing values.

combined %>% 
  dplyr::select(unique_stem, matches("stem|tag|dbh"), everything()) %>% 
  dplyr::filter(is.na(tag))
#> # A tibble: 2 x 27
#>   unique_stem stem_count   tag stem_tag   dbh dbh_2018 dbh_check new_stem
#>   <chr>            <int> <int>    <int> <dbl>    <dbl>     <dbl>    <int>
#> 1 NA_NA               78    NA       NA    NA       NA        NA       NA
#> 2 NA_NA               78    NA       NA    NA       NA        NA       NA
#> # ... with 19 more variables: csv_filename <chr>, submission_id <chr>,
#> #   section_id <chr>, quadrat <int>, date <chr>, team <chr>,
#> #   view_map <chr>, map <chr>, sheet <chr>, species <chr>, px <dbl>,
#> #   py <dbl>, status <chr>, codes <chr>, pom <dbl>, status_2018 <chr>,
#> #   codes_2018 <chr>, notes <chr>, data_check <chr>

Removing missing tags.

combined <- dplyr::filter(combined, !is.na(tag))

Identify stems that were skipped in this census.

sampled_before <- combined %>% 
  dplyr::filter(sheet == "orignal_stems") %>% 
  dplyr::pull(unique_stem) %>% 
  unique()
sampled_now <- combined %>% 
  dplyr::filter(sheet != "orignal_stems") %>% 
  dplyr::pull(unique_stem) %>% 
  unique()
skipped_stems <- setdiff(sampled_before, sampled_now)

combined %>% 
  dplyr::select(unique_stem, sheet, matches("stem|tag|dbh"), everything()) %>% 
  dplyr::filter(unique_stem %in% skipped_stems)
#> # A tibble: 6 x 27
#>   unique_stem sheet     stem_count   tag stem_tag   dbh dbh_2018 dbh_check
#>   <chr>       <chr>          <int> <int>    <int> <dbl>    <dbl>     <dbl>
#> 1 10004_1     orignal_…         NA 10004        1  3.90     4.10     0.200
#> 2 10004_2     orignal_…         NA 10004        2  1.80     2.00     0.200
#> 3 10004_3     orignal_…         NA 10004        3  1.60     1.80     0.200
#> 4 10004_1     orignal_…         NA 10004        1  3.90     4.10     0.200
#> 5 10004_2     orignal_…         NA 10004        2  1.80     2.00     0.200
#> 6 10004_3     orignal_…         NA 10004        3  1.60     1.80     0.200
#> # ... with 19 more variables: new_stem <int>, csv_filename <chr>,
#> #   submission_id <chr>, section_id <chr>, quadrat <int>, date <chr>,
#> #   team <chr>, view_map <chr>, map <chr>, species <chr>, px <dbl>,
#> #   py <dbl>, status <chr>, codes <chr>, pom <dbl>, status_2018 <chr>,
#> #   codes_2018 <chr>, notes <chr>, data_check <chr>

Identify duplicated stems.

duplicated_stems <- combined %>% 
  dplyr::group_by(unique_stem) %>% 
  dplyr::count(unique_stem) %>% 
  dplyr::filter(n > 1) %>% 
  dplyr::pull(unique_stem) %>% 
  unique()

combined %>% 
  dplyr::select(unique_stem, sheet, matches("stem|tag|dbh"), everything()) %>% 
  dplyr::filter(unique_stem %in% duplicated_stems) %>% 
  dplyr::arrange(unique_stem)
#> # A tibble: 12 x 27
#>    unique_stem sheet   stem_count    tag stem_tag   dbh dbh_2018 dbh_check
#>    <chr>       <chr>        <int>  <int>    <int> <dbl>    <dbl>     <dbl>
#>  1 10004_1     origna…         NA  10004        1  3.90     4.10     0.200
#>  2 10004_1     origna…         NA  10004        1  3.90     4.10     0.200
#>  3 10004_2     origna…         NA  10004        2  1.80     2.00     0.200
#>  4 10004_2     origna…         NA  10004        2  1.80     2.00     0.200
#>  5 10004_3     origna…         NA  10004        3  1.60     1.80     0.200
#>  6 10004_3     origna…         NA  10004        3  1.60     1.80     0.200
#>  7 10004_4     new_se…         NA  10004        4  1.00    NA       NA    
#>  8 10004_4     new_se…         NA  10004        4  1.00    NA       NA    
#>  9 123456_1    recrui…         NA 123456        1  1.50    NA       NA    
#> 10 123456_1    recrui…         NA 123456        1  1.50    NA       NA    
#> 11 123456_2    recrui…         NA 123456        2  1.00    NA       NA    
#> 12 123456_2    recrui…         NA 123456        2  1.00    NA       NA    
#> # ... with 19 more variables: new_stem <int>, csv_filename <chr>,
#> #   submission_id <chr>, section_id <chr>, quadrat <int>, date <chr>,
#> #   team <chr>, view_map <chr>, map <chr>, species <chr>, px <dbl>,
#> #   py <dbl>, status <chr>, codes <chr>, pom <dbl>, status_2018 <chr>,
#> #   codes_2018 <chr>, notes <chr>, data_check <chr>

Note


Future work

To do more comlex checks I need more data (e.g. to check that DBH values are reasonable).