Open and explore each file with the previous questions in mind.
Database: Collection of tables connected via some value.
Table; record (= observation = row); field (= column); key; unique identifier; entry (value).
Explore all tabs. In Database Structure:
Is data redundant?
Menu:
File > New Database.
Modify Table
library(here) library(tidyverse)
Easiest is to work directly with the .csv files.
plots <- read_csv(here("plots.csv")) species <- read_csv(here("species.csv")) surveys <- read_csv(here("surveys.csv"))
surveys ## # A tibble: 35,549 x 9 ## record_id month day year plot_id species_id sex hindfoot_length ## <int> <int> <int> <int> <int> <chr> <chr> <int> ## 1 1 7 16 1977 2 NL M 32 ## 2 2 7 16 1977 3 NL M 33 ## 3 3 7 16 1977 2 DM F 37 ## 4 4 7 16 1977 7 DM M 36 ## 5 5 7 16 1977 3 DM M 35 ## 6 6 7 16 1977 1 PF M 14 ## 7 7 7 16 1977 2 PE F NA ## 8 8 7 16 1977 1 DM M 37 ## 9 9 7 16 1977 1 DM F 34 ## 10 10 7 16 1977 6 PF F 20 ## # ... with 35,539 more rows, and 1 more variable: weight <int>
If you already have a database, you can also use it.
path <- here("data-raw/portal/portal_mammals.sqlite") db <- DBI::dbConnect(RSQLite::SQLite(), path)
Copy each table from the database.
plots_db <- tbl(db, "plots") species_db <- tbl(db, "species") surveys_db <- tbl(db, "surveys")
surveys_db ## # Source: table<surveys> [?? x 9] ## # Database: sqlite 3.22.0 ## # [C:\Users\LeporeM\Documents\Dropbox\git\carpentries\data-raw\portal\portal_mammals.sqlite] ## record_id month day year plot_id species_id sex hindfoot_length ## <int> <int> <int> <int> <int> <chr> <chr> <dbl> ## 1 1 7 16 1977 2 NL M 32 ## 2 2 7 16 1977 3 NL M 33 ## 3 3 7 16 1977 2 DM F 37 ## 4 4 7 16 1977 7 DM M 36 ## 5 5 7 16 1977 3 DM M 35 ## 6 6 7 16 1977 1 PF M 14 ## 7 7 7 16 1977 2 PE F NA ## 8 8 7 16 1977 1 DM M 37 ## 9 9 7 16 1977 1 DM F 34 ## 10 10 7 16 1977 6 PF F 20 ## # ... with more rows, and 1 more variable: weight <dbl>
SELECT <columns> FROM <table>
SELECT * FROM surveys;
SELECT year, month, day FROM surveys;
SELECT surveys.year, surveys.month, surveys.day FROM surveys;
Good
SELECT year FROM surveys;
Bad
select year from surveys;
LIMIT
and DISTINCT
SELECT * FROM surveys LIMIT 10;
SELECT DISTINCT species_id FROM surveys;
+
, -
, *
, /
, and ROUND(<what?>, <digits>)
SELECT year, month, day, weight / 1000 FROM surveys;
SELECT plot_id, species_id, sex, weight, ROUND(weight / 1000, 2) FROM surveys;
<INTEGER> / 1000
Change weight
to integer:
SELECT weight / 1000 FROM surveys
SELECT weight / 1000.00 FROM surveys
Change back
SELECT day, month, year, species_id, weight * 1000 FROM surveys;
WHERE
SELECT * FROM surveys WHERE species_id = 'DM';
SELECT * FROM surveys WHERE year >= 2000;
AND
, OR
, and IN
SELECT * FROM surveys WHERE (year >= 2000) AND (species_id = 'DM');
SELECT * FROM surveys WHERE (species_id = 'DM') OR (species_id = 'DO') OR (species_id = 'DS'); -- Same as above. BTW, I'm a comment :) SELECT * FROM surveys WHERE species_id IN ('DM', 'DO', 'DS');
SELECT day, month, year, species_id, weight / 1000 FROM surveys WHERE (plot_id = 1) AND (weight > 75);
ORDER BY <column> ASC/DESC
SELECT * FROM species ORDER BY taxa ASC;
SELECT * FROM species ORDER BY taxa DESC;
SELECT * FROM species -- ASC is implicit (default). ORDER BY genus, species;
SELECT year, species_id, weight / 1000 FROM surveys ORDER BY weight DESC;
SELECT year, month, day, species_id, ROUND(weight / 1000, 2) FROM surveys WHERE year = 1999 ORDER BY species_id;
“Good” style and comments make your code easier to understand.
SQL helps you select columns, filter and order rows
Pick specific rows using conditions with AND
, OR
and IN
.
Compute on column values with arithmetic operators.
Setup
# Use packages library(here) library(tidyverse) # Connect to the database path <- here("data-raw/portal/portal_mammals.sqlite") db <- DBI::dbConnect(RSQLite::SQLite(), path) # Use a specific table from the database surveys_db <- tbl(db, "surveys") class(surveys_db) ## [1] "tbl_dbi" "tbl_sql" "tbl_lazy" "tbl"
surveys_db ## # Source: table<surveys> [?? x 9] ## # Database: sqlite 3.22.0 ## # [C:\Users\LeporeM\Documents\Dropbox\git\carpentries\data-raw\portal\portal_mammals.sqlite] ## record_id month day year plot_id species_id sex hindfoot_length ## <int> <int> <int> <int> <int> <chr> <chr> <dbl> ## 1 1 7 16 1977 2 NL M 32 ## 2 2 7 16 1977 3 NL M 33 ## 3 3 7 16 1977 2 DM F 37 ## 4 4 7 16 1977 7 DM M 36 ## 5 5 7 16 1977 3 DM M 35 ## 6 6 7 16 1977 1 PF M 14 ## 7 7 7 16 1977 2 PE F NA ## 8 8 7 16 1977 1 DM M 37 ## 9 9 7 16 1977 1 DM F 34 ## 10 10 7 16 1977 6 PF F 20 ## # ... with more rows, and 1 more variable: weight <dbl>
show_query(surveys_db) ## <SQL> ## SELECT * ## FROM `surveys`
{sql, engine = "sql", connection = "db"}
SELECT record_id, plot_id, species_id, sex FROM surveys
record_id | plot_id | species_id | sex |
---|---|---|---|
1 | 2 | NL | M |
2 | 3 | NL | M |
3 | 2 | DM | F |
4 | 7 | DM | M |
5 | 3 | DM | M |
6 | 1 | PF | M |
7 | 2 | PE | F |
8 | 1 | DM | M |
9 | 1 | DM | F |
10 | 6 | PF | F |
How can I summarize my data by aggregating, filtering, or ordering query results?
How can I make sure column names from my queries make sense and aren’t too long?
COUNT
, MAX
, MIN
, and AVG
SELECT COUNT(*) FROM surveys;
SELECT COUNT(*), SUM(weight) FROM surveys;
-- All animals SELECT SUM(weight), AVG(weight), MIN(weight), MAX(weight) FROM surveys; -- Only weights between 5 and 10 SELECT SUM(weight), AVG(weight), MIN(weight), MAX(weight) FROM surveys WHERE (weight > 5) AND (weight < 10);
GROUP BY
SELECT species_id, COUNT(*) FROM surveys GROUP BY species_id;
Solution of 1
SELECT year, COUNT(*) FROM surveys GROUP BY year;
Solution of 2 and 3
SELECT year, species_id, COUNT(*), AVG(weight) FROM surveys GROUP BY year, species_id;
ORDER BY
a summary columnSELECT species_id, COUNT(*) FROM surveys GROUP BY species_id ORDER BY COUNT(species_id);
AS
SELECT MAX(year) AS last_surveyed_year FROM surveys; SELECT MAX(year) last_surveyed_year FROM surveys;
AS
is optional. This also works (but isn’t “good” style)
SELECT MAX(year) last_surveyed_year FROM surveys;
GROUP BY <cols> HAVING <cond>
HAVING
~ WHERE
:
<columns> WHERE <condition>
<results> HAVING <condition>
GROUP BY <cols> HAVING <cond>
SELECT species_id, COUNT(species_id) FROM surveys GROUP BY species_id -- "Smells" a bit HAVING COUNT(species_id) > 10; -- Same but nicer SELECT species_id, COUNT(species_id) AS n FROM surveys GROUP BY species_id HAVING n > 10;
http://rstd.io/code-smells (Jenny Bryan)
SELECT taxa, COUNT(*) AS n FROM species GROUP BY taxa HAVING n > 10;
CREATE VIEW viewname AS
SELECT * FROM surveys WHERE year = 2000 AND (month > 4 AND month < 10);
Store
CREATE VIEW summer_2000 AS SELECT * FROM surveys WHERE year = 2000 AND (month > 4 AND month < 10);
Reuse
SELECT * FROM summer_2000 WHERE species_id == 'PE';
NULL
values-- 45 individuals which sex is of NULL sex SELECT sex, COUNT(*) FROM summer_2000 GROUP BY sex -- As you expect, excludes NULL SELECT COUNT(*) FROM summer_2000 -- Excludes NULL WHERE sex == 'F' -- But tally includes NULL: 366 + 382 + 45 = 793 SELECT COUNT(*) FROM summer_2000
Use MIN
, MAX
, AVG
, SUM
, COUNT
, etc. to operate on aggregated data.
Use AS
to create aliases.
Use HAVING
to filter on aggregate properties (similar to WHERE
).
Use CREATE VIEW
to store a query.
surveys %>% group_by(species_id) %>% summarize(n = n()) %>% filter(n > 10) ## # A tibble: 30 x 2 ## species_id n ## <chr> <int> ## 1 AB 303 ## 2 AH 437 ## 3 BA 46 ## 4 CB 50 ## 5 CM 13 ## 6 CQ 16 ## 7 DM 10596 ## 8 DO 3027 ## 9 DS 2504 ## 10 DX 40 ## # ... with 20 more rows
count()
is so common that has a shortcut
## # A tibble: 30 x 2 ## species_id n ## <chr> <int> ## 1 AB 303 ## 2 AH 437 ## 3 BA 46 ## 4 CB 50 ## 5 CM 13 ## 6 CQ 16 ## 7 DM 10596 ## 8 DO 3027 ## 9 DS 2504 ## 10 DX 40 ## # ... with 20 more rows
JOIN <table> USING <(columns)>
SELECT * FROM surveys JOIN species ON surveys.species_id = species.species_id;
Same, when column names match.
SELECT * FROM surveys JOIN species -- Same column name in both tables USING (species_id);
SELECT species.genus, species.species, surveys.weight FROM surveys JOIN species ON surveys.species_id = species.species_id;
Same
SELECT genus, species, weight FROM surveys JOIN species USING (species_id)
<kind of> JOIN
INNER JOIN
= JOIN
: Keeps all rows that match in both tables
SELECT * FROM surveys INNER JOIN species USING (species_id);
LEFT JOIN
: Keeps all rows that match in left table
(“left” is closest to FROM
)
SELECT * FROM surveys LEFT JOIN species USING (species_id);
JOIN
ed results Left table implicit: Drives attention to surveys.weight
.
SELECT plot_type, AVG(surveys.weight) FROM plots JOIN surveys USING (plot_id) GROUP BY plot_type;
Same but buries intention.
SELECT plots.plot_type, AVG(surveys.weight) FROM surveys JOIN plots ON surveys.plot_id = plots.plot_id GROUP BY plots.plot_type;
SELECT plot_id, genus, COUNT(*) AS n FROM surveys JOIN species USING (species_id) GROUP BY genus, plot_id ORDER BY plot_id ASC, n DESC;
IFNULL(<column>, <value>)
NULL
valuesSELECT species_id, sex, IFNULL(sex, 'U') FROM surveys;
SELECT hindfoot_length, IFNULL(hindfoot_length, 30) FROM surveys;
SELECT species_id, AVG(IFNULL(hindfoot_length,30)) FROM surveys GROUP BY species_id;
NULLIF(<column>, <value>)
NULLIF
is the inverse of IFNULL
.
Returns NULL
If the first argument is equal to the second argument it returns NULL
.
Else, it returns the original value.
SELECT species_id, plot_id, NULLIF(plot_id, 7) FROM surveys;
SELECT genus, LENGTH(genus) FROM species ORDER BY LENGTH(genus) DESC
Use JOIN <table> USING <(common column)>
to combine data from two tables.
Use (INNER) JOIN
to keep rows that match on both tables; or LEFT JOIN
to keep all rows of the left table and the matching rows of the right table.
Use IFNULL(<column>, <value>)
to “fill” NULL
values; and NULLIF(<column>, <value>)
to “null out” specific values.
Use other functions e.g. LENGTH(<column>)
to operate on individual values.
plots %>% left_join(surveys) %>% group_by(plot_type) %>% mutate(mean_weight = mean(weight, na.rm = TRUE)) %>% select(plot_type, mean_weight) %>% unique() ## Joining, by = "plot_id" ## # A tibble: 5 x 2 ## # Groups: plot_type [5] ## plot_type mean_weight ## <chr> <dbl> ## 1 Spectab exclosure 51.6 ## 2 Control 48.6 ## 3 Long-term Krat Exclosure 27.2 ## 4 Rodent Exclosure 32.5 ## 5 Short-term Krat Exclosure 41.2