This document outlines progress I have made on webscraping Colorado environmental data. I am undertaking this project to:

  1. Understand how precipitation (snowfall) has changed in the Colorado Rocky Mountains over the past 20-30 years.
  2. Learn how to webscrape data.
  3. Learn how to automate tasks.

I aim to webscrape data from USDA webpages. These webpages have data from the Snow Telemetry (SNOTEL) Network. In the state of Colorado, there are 115 SNOTEL sites, which automatically collect data on snowpack, precipitation, temperature, and other environmental phenomena. These SNOTEL sites are equipped with several sensors that allow for the data collection. In particular, they have a device known as a snow pillow, which measures how much water is in the snowpack by weighing the snow with a pressure transducer.

install.packages("rvest", repos = "https://cran.us.r-project.org")
install.packages("tidyverse", repos = "https://cran.us.r-project.org")
library(rvest)
library(tidyverse)

Create object to encompass html where datatable is located:

webpage2 <- read_html("https://wcc.sc.egov.usda.gov/reportGenerator/view/customWaterYearGroupByMonthReport/monthly/start_of_period/556:CO:SNTL%7Cid=%22%22%7Cname/-357,0/WTEQ::value?fitToScreen=false&sortBy=1:-1")

Extract table from the webpage:

water_data <- webpage2 %>% 
  html_nodes("table") %>%  
  html_table() %>% 
    .[[37]] 

Print the datatable:

DT::datatable(water_data,extensions = 'FixedColumns', #DT:: allows you to not install package or load its library 
           options = list(
    dom = 't',
    scrollX = TRUE,
    fixedColumns = TRUE))

Data cleaning:

h2o <- filter(water_data, Oct != "SnowWaterEquivalent(in)Start of Month Values") #eliminate rows with missing data
names(h2o)[names(h2o) == 'Water Year'] <- 'year' #changes name of variable
h2o <- gather(h2o, month, snowvol, 2:13) #convert datatable from wide to long format
h2o$snowvol <- as.numeric(h2o$snowvol) #coerce the snowvol variable into being numeric

#convert month names to numbers 
h2o<-h2o %>%
  mutate(month = recode(month,
                        Jan = 1,
                        Feb = 2,
                        Mar = 3,
                        Apr = 4,
                        May = 5,
                        Jun = 6,
                        Jul = 7,
                        Aug = 8,
                        Sep = 9,
                        Oct = 10,
                        Nov = 11,
                        Dec = 12))
h2o$month <- as.character(h2o$month) #coerce month variable into numeric value


DT::datatable(h2o, extensions = 'Scroller', options = list(
  deferRender = TRUE,
  scrollY = 200,
  scroller = TRUE
))

View data structure:

str(h2o)
## tibble [360 × 3] (S3: tbl_df/tbl/data.frame)
##  $ year   : chr [1:360] "1993" "1994" "1995" "1996" ...
##  $ month  : chr [1:360] "10" "10" "10" "10" ...
##  $ snowvol: num [1:360] 0 0 0 0 0 0 0 0 0 0 ...

Plot the data:

ggplot(h2o, aes(x = month, y = snowvol, color = year)) + 
  geom_point(size = 5) + ylab("Snow water equivalent") + xlab("Month") +
 ylim(0,10) + 
  scale_x_discrete(name ="month", limits=c("10","11","12", "1","2","3", "4", "5", "6", "7","8",
                                           "9")) + 
  theme(
    axis.line.y.left = element_line(color = "black"), #color of left axis line
    axis.title.y.left = element_text(color="black", face="bold", size=18), #color, face, and size of y axis title
    axis.text.y.left = element_text(color="black", face="bold", size =18), #color, face, and size of y axis values
    axis.text.x = element_text(color="black", face="bold", size = 18),  #color, face, and size of y axis values
    axis.ticks = element_line(size = 1), #size of tick on x and y axes
    axis.ticks.length = unit(-0.25, "cm"), #makes ticks go inside the plot 
    axis.title.x=element_text(color="black",size=18), #color, size of x axis title (not included)
    plot.title = element_text(face="bold", size=19, hjust = 0.5), 
    plot.caption = element_text(size=12.5, colour ="red"),
    plot.tag = element_text(color="black",face = "bold", size=18),
    axis.line = element_line(colour = "black"),
    legend.position = "bottom",
    legend.title = element_text(size = 14, face ="bold"),
    legend.text = element_text(size = 12, face ="bold"),
    legend.key.size = unit(0.5, "cm"), #set each tile size in legend 
    panel.border = element_rect(colour = "black", fill=NA, size=1),
    panel.grid.minor = element_blank(),
    panel.background = element_blank()) +
  guides(col = guide_legend(nrow = 3, byrow = TRUE))  #set number of legend rows/columns 

The Colorado SNOwpack TELemetry (SNOTEL) sites have unique identifying numbers. These numbers can be found here. Read in the url as an html:

sitepg <- read_html("https://wcc.sc.egov.usda.gov/nwcc/yearcount?network=sntl&state=CO&counttype=statelist")

Scrape the table from the site:

site_data <- sitepg %>% html_nodes("table") %>% 
  html_table()  %>% .[[3]]

DT::datatable(site_data,extensions = 'FixedColumns', #DT:: allows you to not install package or load its library 
           options = list(
    dom = 't',
    scrollX = TRUE,
    fixedColumns = TRUE))

Extract the site number from the original table, make site number a new column, and create a new column for the site’s corresponding url:

site_id <- select(site_data, 3)
nums <- regmatches(site_id$site_name, gregexpr("[[:digit:]]+", site_id$site_name)) #extracts number from string 
number<-as.numeric(unlist(nums)) #converts list to a vector 
number<-number [! number %in% 2] #removes extra values that were identified from the actual site names 
site_id <- data.frame(site_id, number) #combines dataframes into one     
#create new column that encompasses each row's corresponding url, which is based on the site number: 
site_id$url <- paste0("https://wcc.sc.egov.usda.gov/reportGenerator/view/customWaterYearGroupByMonthReport/monthly/start_of_period/", site_id$number, ":CO:SNTL%7Cid=%22%22%7Cname/-360,0/WTEQ::value?fitToScreen=false")

DT::datatable(site_id,extensions = 'FixedColumns', #DT:: allows you to not install package or load its library 
           options = list(
    dom = 't',
    scrollX = TRUE,
    fixedColumns = TRUE))

Automate the

for(i in 1:115){
  df <- as.vector(read_html(site_id$url[i])) #read every SNOTEL site url as a separate object called 'df' 
  assign(paste0("site_id_",site_id$number[i]),df) #rename the dataframe to begin with 'site_id_" and the corresponding site id number 
}

Print all objects in the environment:

ls()
##   [1] "df"           "h2o"          "i"            "number"       "nums"        
##   [6] "site_data"    "site_id"      "site_id_1005" "site_id_1014" "site_id_1030"
##  [11] "site_id_1031" "site_id_1032" "site_id_1033" "site_id_1040" "site_id_1041"
##  [16] "site_id_1042" "site_id_1057" "site_id_1058" "site_id_1059" "site_id_1060"
##  [21] "site_id_1061" "site_id_1100" "site_id_1101" "site_id_1102" "site_id_1120"
##  [26] "site_id_1122" "site_id_1123" "site_id_1124" "site_id_1128" "site_id_1141"
##  [31] "site_id_1160" "site_id_1161" "site_id_1185" "site_id_1186" "site_id_1187"
##  [36] "site_id_1188" "site_id_1251" "site_id_1252" "site_id_303"  "site_id_322" 
##  [41] "site_id_327"  "site_id_335"  "site_id_345"  "site_id_369"  "site_id_378" 
##  [46] "site_id_380"  "site_id_386"  "site_id_387"  "site_id_408"  "site_id_409" 
##  [51] "site_id_412"  "site_id_415"  "site_id_426"  "site_id_430"  "site_id_431" 
##  [56] "site_id_438"  "site_id_457"  "site_id_465"  "site_id_467"  "site_id_485" 
##  [61] "site_id_505"  "site_id_531"  "site_id_538"  "site_id_542"  "site_id_547" 
##  [66] "site_id_551"  "site_id_556"  "site_id_564"  "site_id_565"  "site_id_580" 
##  [71] "site_id_586"  "site_id_589"  "site_id_602"  "site_id_607"  "site_id_618" 
##  [76] "site_id_622"  "site_id_624"  "site_id_629"  "site_id_632"  "site_id_658" 
##  [81] "site_id_663"  "site_id_669"  "site_id_675"  "site_id_680"  "site_id_682" 
##  [86] "site_id_688"  "site_id_701"  "site_id_709"  "site_id_713"  "site_id_717" 
##  [91] "site_id_718"  "site_id_737"  "site_id_739"  "site_id_762"  "site_id_773" 
##  [96] "site_id_780"  "site_id_793"  "site_id_797"  "site_id_802"  "site_id_825" 
## [101] "site_id_827"  "site_id_829"  "site_id_838"  "site_id_839"  "site_id_840" 
## [106] "site_id_842"  "site_id_843"  "site_id_857"  "site_id_869"  "site_id_870" 
## [111] "site_id_874"  "site_id_904"  "site_id_905"  "site_id_913"  "site_id_914" 
## [116] "site_id_935"  "site_id_936"  "site_id_937"  "site_id_938"  "site_id_939" 
## [121] "site_id_940"  "site_id_970"  "sitepg"       "water_data"   "webpage2"

Create long-format table for one of the dataframes:

water_data_939 <- site_id_939 %>% html_nodes("table") %>% 
  html_table() %>% .[[37]] %>% 
  filter(Oct != "SnowWaterEquivalent(in)Start of Month Values") %>% gather(month, snowvol, 2:13) 
head(water_data_939)
## # A tibble: 6 × 3
##   `Water Year` month snowvol
##   <chr>        <chr> <chr>  
## 1 1999         Oct   0.0    
## 2 2000         Oct   0.0    
## 3 2001         Oct   0.0    
## 4 2002         Oct   0.0    
## 5 2003         Oct   0.0    
## 6 2004         Oct   0.0