この資料は2020年4月24日のデータサイエンス研究会の新歓勉強会のために作成されました.初心者の方を対象としています.全体的に『Rグラフィックスクックブック』第2版を参考にしました1.
今回,用いるのはggplot2とgcookbook,そしてwooldridgeというパッケージです.インストールがまだの場合は,以下のコードを実行してください.
install.packages("ggplot2")
install.packages("gcookbook")
install.packages("wooldridge")
ggplot2は美しいグラフを簡単につくるためのパッケージです.gcookbookには『Rグラフィックスクックブック』で用いられるデータセットが入っています.wooldridgeにはIntroductory Econometricsという計量経済学の教科書のデータセットが入っています.
library()
関数を用いてパッケージをロードします.
library(ggplot2)
ここで使うデータセットはcabbage_exp
というキャベツに関する集計データです.Cultivar
が品種,Date
が栽培日,Weight
が平均重量を表しています.
mydata1 <- gcookbook::cabbage_exp
mydata1
## Cultivar Date Weight sd n se
## 1 c39 d16 3.18 0.9566144 10 0.30250803
## 2 c39 d20 2.80 0.2788867 10 0.08819171
## 3 c39 d21 2.74 0.9834181 10 0.31098410
## 4 c52 d16 2.26 0.4452215 10 0.14079141
## 5 c52 d20 3.11 0.7908505 10 0.25008887
## 6 c52 d21 1.47 0.2110819 10 0.06674995
ggplot(mydata1, aes(x = Date, y = Weight)) +
geom_col()
ggplot(mydata1, aes(x = Date, y = Weight, fill = Cultivar)) +
geom_col()
ggplot(mydata1, aes(x = Date, y = Weight, fill = Cultivar)) +
geom_col(position = "dodge")
ggplot(mydata1, aes(x = Date, y = Weight, fill = Cultivar)) +
geom_col(position = "dodge", colour = "black")
ggplot(mydata1, aes(x = Date, y = Weight, fill = Cultivar)) +
geom_col(position = "dodge", colour = "black") +
scale_fill_brewer(palette = "Pastel1")
ここで使うfaithful
というデータセットはOld Faithful Geyserという間欠泉に関するデータです.waiting
は噴出間隔の時間を表します.
mydata2 <- faithful
ggplot(mydata2, aes(x = waiting)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(mydata2, aes(x = waiting)) +
geom_histogram(fill = "lightblue")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(mydata2, aes(x = waiting)) +
geom_histogram(fill = "lightblue", col = "black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(mydata2, aes(x = waiting)) +
geom_density()
ggplot(mydata2, aes(x = waiting, y = ..density..)) +
geom_histogram(fill = "lightblue", colour = "black") +
geom_density()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
mydata3 <- ToothGrowth
ggplot(mydata3, aes(x = supp, y = len)) +
geom_boxplot()
mydata3 <- ToothGrowth
ggplot(mydata3, aes(x = factor(dose), y = len)) +
geom_boxplot()
mydata3 <- ToothGrowth
ggplot(mydata3, aes(x = interaction(supp, dose), y = len)) +
geom_boxplot()
mydata3 <- ToothGrowth
ggplot(mydata3, aes(x = interaction(supp, dose), y = len)) +
geom_boxplot() +
geom_jitter()
mydata4 <- gcookbook::heightweight[, c("sex", "ageYear", "heightIn")]
head(mydata4)
## sex ageYear heightIn
## 1 f 11.92 56.3
## 2 f 12.92 62.3
## 3 f 12.75 63.3
## 4 f 13.42 59.0
## 5 f 15.92 62.5
## 6 f 14.25 62.5
ggplot(mydata4, aes(x = ageYear, y = heightIn)) +
geom_point()
ggplot(mydata4, aes(x = ageYear, y = heightIn, colour = sex)) +
geom_point()
ggplot(mydata4, aes(x = ageYear, y = heightIn, shape = sex, colour = sex)) +
geom_point()
ggplot(mydata4, aes(x = ageYear, y = heightIn, shape = sex, colour = sex)) +
geom_point() +
scale_shape_manual(values = c(1,2))
ggplot(mydata4, aes(x = ageYear, y = heightIn, shape = sex, colour = sex)) +
geom_point() +
scale_shape_manual(values = c(1,2)) +
scale_colour_brewer(palette = "Set1")
ggplot(mydata4, aes(x = ageYear, y = heightIn, shape = sex, colour = sex)) +
geom_smooth()
ggplot(mydata4, aes(x = ageYear, y = heightIn, shape = sex, colour = sex)) +
geom_point() +
scale_shape_manual(values = c(1,2)) +
scale_colour_brewer(palette = "Set1") +
geom_smooth()
mydata <- wooldridge::wage1
head(mydata)
## wage educ exper tenure nonwhite female married numdep smsa northcen south
## 1 3.10 11 2 0 0 1 0 2 1 0 0
## 2 3.24 12 22 2 0 1 1 3 1 0 0
## 3 3.00 11 2 0 0 0 0 2 0 0 0
## 4 6.00 8 44 28 0 0 1 0 1 0 0
## 5 5.30 12 7 2 0 0 1 1 0 0 0
## 6 8.75 16 9 8 0 0 1 0 1 0 0
## west construc ndurman trcommpu trade services profserv profocc clerocc
## 1 1 0 0 0 0 0 0 0 0
## 2 1 0 0 0 0 1 0 0 0
## 3 1 0 0 0 1 0 0 0 0
## 4 1 0 0 0 0 0 0 0 1
## 5 1 0 0 0 0 0 0 0 0
## 6 1 0 0 0 0 0 1 1 0
## servocc lwage expersq tenursq
## 1 0 1.131402 4 0
## 2 1 1.175573 484 4
## 3 0 1.098612 4 0
## 4 0 1.791759 1936 784
## 5 0 1.667707 49 4
## 6 0 2.169054 81 64
ggplot(mydata, aes(x = factor(female), y = wage, fill = factor(female))) +
geom_boxplot()
ggplot(mydata, aes(x = factor(female), y = wage)) +
geom_boxplot() +
geom_jitter(aes(col = factor(female)), size = 0.5)
ggplot(mydata, aes(x = factor(female), y = wage)) +
geom_boxplot() +
geom_jitter(aes(col = factor(married)), size = 0.5)
ggplot(mydata, aes(x = factor(female), y = wage)) +
geom_boxplot() +
geom_jitter(aes(col = factor(nonwhite)), size = 0.5)
ggplot(mydata, aes(x = interaction(female, married), y = wage, col = interaction(female, married))) +
geom_boxplot() +
geom_jitter(size = 0.5) +
labs(colour = "F X M")
ggplot(mydata, aes(x = educ, y = wage)) +
geom_point() +
geom_smooth()
ggplot(mydata, aes(x = educ, y = wage, col = factor(female))) +
geom_point() +
geom_smooth()
ggplot(mydata, aes(x = tenure, y = wage, col = factor(female))) +
geom_point() +
geom_smooth()
ggplot(mydata, aes(x = exper, y = wage, col = factor(female))) +
geom_point() +
geom_smooth()
ggplot(mydata, aes(x = educ, y = wage, col = factor(nonwhite))) +
geom_point() +
geom_smooth()
ggplot(mydata, aes(x = tenure, y = wage, col = factor(nonwhite))) +
geom_point() +
geom_smooth()
ggplot(mydata, aes(x = exper, y = wage, col = factor(nonwhite))) +
geom_point() +
geom_smooth()
インタラクティブなチャートも簡単に作れます.ここではplotlyパッケージを使ってみましょう2.
install.packages("plotly")
install.packages("gapminder")
library(plotly)
library(gapminder)
mydata <- gapminder
myplot <- plot_ly(gapminder,
x = ~gdpPercap,
y = ~lifeExp,
size = ~pop,
color = ~continent,
frame = ~year,
text = ~country,
hoverinfo = "text",
type = 'scatter',
mode = 'markers')
layout(myplot,xaxis = list(type = "log"))
COVID-19.csvというデータセットをジャッグジャパン株式会社が公開しています3.これは都道府県別新型コロナウイルス感染者数のデータです.
install.packages("dplyr")
install.packages("choroplethr")
install.packages("choroplethrAdmin1")
mydf <- read.csv("COVID-19.csv")
「ディレクトリ?なんやそれ?」という場合は以下を実行.
mydf <- read.csv("https://dl.dropboxusercontent.com/s/6mztoeb6xf78g5w/COVID-19.csv")
mydf$d <- as.Date(mydf$確定日, "%m/%d/%y")
hist(mydf$d, breaks="days", freq=TRUE)
library(dplyr)
newdata <- mydf %>%
group_by(d) %>%
tally()
ggplot(newdata, aes(x = d, y = n)) +
geom_point() +
geom_line() +
scale_x_date(date_breaks = "1 day", labels = scales::date_format("%m-%d")) +
theme(axis.text.x = element_text(angle = 90, size = 5))
library("choroplethr")
library("choroplethrAdmin1")
data(df_japan_census)
newdata2 <- mydf %>%
group_by(居住都道府県) %>%
tally()
newdata3 <- newdata2[c(-1:-9, -11:-12, -18),]
newdata3$居住都道府県 <- c("mie", "kyoto", "saga", "hyogo",
"hokkaido", "chiba", "wakayama", "saitama",
"oita", "osaka", "nara", "miyagi",
"miyazaki", "toyama", "yamaguchi", "yamagata",
"yamanashi", "gifu", "okayama", "shimane",
"hiroshima", "tokushima", "ehime", "aichi",
"niigata", "tokyo", "tochigi", "okinawa",
"shiga", "kumamoto", "ishikawa", "kanagawa",
"fukui", "fukuoka", "fukushima", "akita",
"gunma", "ibaraki", "nagasaki", "nagano",
"aomori", "shizuoka", "kagawa", "kochi",
"tottori", "kagoshima")
newdata3[47, 1] <- "iwate"
newdata3[47, 2] <- 0
colnames(newdata3) <- c("region", "value")
admin1_choropleth(country.name = "japan",
df = newdata3,
title = "Novel Coronavirus Cases",
legend = "Cases",
num_colors = 8)
newdata4 <- merge(newdata3, df_japan_census, by = "region")
newdata4$ratio <- round(newdata4$value / newdata4$pop_2010 * 1000000, 3)
newdata5 <- newdata4[, c("region", "ratio")]
colnames(newdata5)[2] <- "value"
admin1_choropleth(country.name = "japan",
df = newdata5,
title = "Novel Coronavirus Cases per Capita",
legend = "Rate (Denom is 1000000)",
num_colors = 8)