はじめに

この資料は2020年4月24日のデータサイエンス研究会の新歓勉強会のために作成されました.初心者の方を対象としています.全体的に『Rグラフィックスクックブック』第2版を参考にしました1

Rの基本操作

Rの基本操作は,以下のウェブサイトを参照してください.

『Rによる原因を推論する』

必要なパッケージ

今回,用いるのはggplot2gcookbook,そしてwooldridgeというパッケージです.インストールがまだの場合は,以下のコードを実行してください.

install.packages("ggplot2")
install.packages("gcookbook")
install.packages("wooldridge")

ggplot2は美しいグラフを簡単につくるためのパッケージです.gcookbookには『Rグラフィックスクックブック』で用いられるデータセットが入っています.wooldridgeにはIntroductory Econometricsという計量経済学の教科書のデータセットが入っています.

library()関数を用いてパッケージをロードします.

library(ggplot2)

棒グラフ

ここで使うデータセットはcabbage_expというキャベツに関する集計データです.Cultivarが品種,Dateが栽培日,Weightが平均重量を表しています.

mydata1 <- gcookbook::cabbage_exp
mydata1
##   Cultivar Date Weight        sd  n         se
## 1      c39  d16   3.18 0.9566144 10 0.30250803
## 2      c39  d20   2.80 0.2788867 10 0.08819171
## 3      c39  d21   2.74 0.9834181 10 0.31098410
## 4      c52  d16   2.26 0.4452215 10 0.14079141
## 5      c52  d20   3.11 0.7908505 10 0.25008887
## 6      c52  d21   1.47 0.2110819 10 0.06674995
ggplot(mydata1, aes(x = Date, y = Weight)) +
  geom_col()

ggplot(mydata1, aes(x = Date, y = Weight, fill = Cultivar)) +
  geom_col()

ggplot(mydata1, aes(x = Date, y = Weight, fill = Cultivar)) +
  geom_col(position = "dodge")

ggplot(mydata1, aes(x = Date, y = Weight, fill = Cultivar)) +
  geom_col(position = "dodge", colour = "black")

ggplot(mydata1, aes(x = Date, y = Weight, fill = Cultivar)) +
  geom_col(position = "dodge", colour = "black") +
  scale_fill_brewer(palette = "Pastel1")

ヒストグラム

ここで使うfaithfulというデータセットはOld Faithful Geyserという間欠泉に関するデータです.waitingは噴出間隔の時間を表します.

mydata2 <- faithful
ggplot(mydata2, aes(x = waiting)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(mydata2, aes(x = waiting)) +
  geom_histogram(fill = "lightblue")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(mydata2, aes(x = waiting)) +
  geom_histogram(fill = "lightblue", col = "black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(mydata2, aes(x = waiting)) +
  geom_density()

ggplot(mydata2, aes(x = waiting, y = ..density..)) +
  geom_histogram(fill = "lightblue", colour = "black") +
  geom_density()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

箱ひげ図

mydata3 <- ToothGrowth

ggplot(mydata3, aes(x = supp, y = len)) +
  geom_boxplot()

mydata3 <- ToothGrowth

ggplot(mydata3, aes(x = factor(dose), y = len)) +
  geom_boxplot()

mydata3 <- ToothGrowth

ggplot(mydata3, aes(x = interaction(supp, dose), y = len)) +
  geom_boxplot()

mydata3 <- ToothGrowth

ggplot(mydata3, aes(x = interaction(supp, dose), y = len)) +
  geom_boxplot() +
  geom_jitter()

散布図

mydata4 <- gcookbook::heightweight[, c("sex", "ageYear", "heightIn")]

head(mydata4)
##   sex ageYear heightIn
## 1   f   11.92     56.3
## 2   f   12.92     62.3
## 3   f   12.75     63.3
## 4   f   13.42     59.0
## 5   f   15.92     62.5
## 6   f   14.25     62.5
ggplot(mydata4, aes(x = ageYear, y = heightIn)) +
  geom_point()

ggplot(mydata4, aes(x = ageYear, y = heightIn, colour = sex)) +
  geom_point()

ggplot(mydata4, aes(x = ageYear, y = heightIn, shape = sex, colour = sex)) +
  geom_point()

ggplot(mydata4, aes(x = ageYear, y = heightIn, shape = sex, colour = sex)) +
  geom_point() +
  scale_shape_manual(values = c(1,2))

ggplot(mydata4, aes(x = ageYear, y = heightIn, shape = sex, colour = sex)) +
  geom_point() +
  scale_shape_manual(values = c(1,2)) +
  scale_colour_brewer(palette = "Set1")

ggplot(mydata4, aes(x = ageYear, y = heightIn, shape = sex, colour = sex)) +
  geom_smooth()

ggplot(mydata4, aes(x = ageYear, y = heightIn, shape = sex, colour = sex)) +
  geom_point() +
  scale_shape_manual(values = c(1,2)) +
  scale_colour_brewer(palette = "Set1") +
  geom_smooth()

演習

mydata <- wooldridge::wage1

head(mydata)
##   wage educ exper tenure nonwhite female married numdep smsa northcen south
## 1 3.10   11     2      0        0      1       0      2    1        0     0
## 2 3.24   12    22      2        0      1       1      3    1        0     0
## 3 3.00   11     2      0        0      0       0      2    0        0     0
## 4 6.00    8    44     28        0      0       1      0    1        0     0
## 5 5.30   12     7      2        0      0       1      1    0        0     0
## 6 8.75   16     9      8        0      0       1      0    1        0     0
##   west construc ndurman trcommpu trade services profserv profocc clerocc
## 1    1        0       0        0     0        0        0       0       0
## 2    1        0       0        0     0        1        0       0       0
## 3    1        0       0        0     1        0        0       0       0
## 4    1        0       0        0     0        0        0       0       1
## 5    1        0       0        0     0        0        0       0       0
## 6    1        0       0        0     0        0        1       1       0
##   servocc    lwage expersq tenursq
## 1       0 1.131402       4       0
## 2       1 1.175573     484       4
## 3       0 1.098612       4       0
## 4       0 1.791759    1936     784
## 5       0 1.667707      49       4
## 6       0 2.169054      81      64
ggplot(mydata, aes(x = factor(female), y = wage, fill = factor(female))) +
  geom_boxplot()

ggplot(mydata, aes(x = factor(female), y = wage)) +
  geom_boxplot() +
  geom_jitter(aes(col = factor(female)), size = 0.5)

ggplot(mydata, aes(x = factor(female), y = wage)) +
  geom_boxplot() +
  geom_jitter(aes(col = factor(married)), size = 0.5)

ggplot(mydata, aes(x = factor(female), y = wage)) +
  geom_boxplot() +
  geom_jitter(aes(col = factor(nonwhite)), size = 0.5)

ggplot(mydata, aes(x = interaction(female, married), y = wage, col = interaction(female, married))) +
  geom_boxplot() +
  geom_jitter(size = 0.5) +
  labs(colour = "F X M")

ggplot(mydata, aes(x = educ, y = wage)) +
  geom_point() +
  geom_smooth()

ggplot(mydata, aes(x = educ, y = wage, col = factor(female))) +
  geom_point() +
  geom_smooth()

ggplot(mydata, aes(x = tenure, y = wage, col = factor(female))) +
  geom_point() +
  geom_smooth()

ggplot(mydata, aes(x = exper, y = wage, col = factor(female))) +
  geom_point() +
  geom_smooth()

ggplot(mydata, aes(x = educ, y = wage, col = factor(nonwhite))) +
  geom_point() +
  geom_smooth()

ggplot(mydata, aes(x = tenure, y = wage, col = factor(nonwhite))) +
  geom_point() +
  geom_smooth()

ggplot(mydata, aes(x = exper, y = wage, col = factor(nonwhite))) +
  geom_point() +
  geom_smooth()

おまけ1

インタラクティブなチャートも簡単に作れます.ここではplotlyパッケージを使ってみましょう2

install.packages("plotly")
install.packages("gapminder")
library(plotly)
library(gapminder)

mydata <- gapminder


myplot <- plot_ly(gapminder,
               x = ~gdpPercap, 
               y = ~lifeExp, 
               size = ~pop, 
               color = ~continent, 
               frame = ~year, 
               text = ~country, 
               hoverinfo = "text",
               type = 'scatter',
               mode = 'markers')

layout(myplot,xaxis = list(type = "log"))

おまけ2

COVID-19.csvというデータセットをジャッグジャパン株式会社が公開しています3.これは都道府県別新型コロナウイルス感染者数のデータです.

install.packages("dplyr")
install.packages("choroplethr")
install.packages("choroplethrAdmin1")
mydf <- read.csv("COVID-19.csv")

「ディレクトリ?なんやそれ?」という場合は以下を実行.

mydf <- read.csv("https://dl.dropboxusercontent.com/s/6mztoeb6xf78g5w/COVID-19.csv")
mydf$d <- as.Date(mydf$確定日, "%m/%d/%y")
hist(mydf$d, breaks="days", freq=TRUE)

library(dplyr)

newdata <- mydf %>% 
  group_by(d) %>% 
  tally()
ggplot(newdata, aes(x = d, y = n)) +
  geom_point() +
  geom_line() +
  scale_x_date(date_breaks = "1 day", labels = scales::date_format("%m-%d")) +
  theme(axis.text.x = element_text(angle = 90, size = 5))

library("choroplethr")
library("choroplethrAdmin1")

data(df_japan_census)
newdata2 <- mydf %>% 
  group_by(居住都道府県) %>% 
  tally()

newdata3 <- newdata2[c(-1:-9, -11:-12, -18),]


newdata3$居住都道府県 <- c("mie", "kyoto", "saga", "hyogo",
                     "hokkaido", "chiba", "wakayama", "saitama",
                     "oita", "osaka", "nara", "miyagi",
                     "miyazaki", "toyama", "yamaguchi", "yamagata",
                     "yamanashi", "gifu", "okayama", "shimane",
                     "hiroshima", "tokushima", "ehime", "aichi", 
                     "niigata", "tokyo", "tochigi", "okinawa",
                     "shiga", "kumamoto", "ishikawa", "kanagawa",
                     "fukui", "fukuoka", "fukushima", "akita", 
                     "gunma", "ibaraki", "nagasaki", "nagano",
                     "aomori", "shizuoka", "kagawa", "kochi", 
                     "tottori", "kagoshima")

newdata3[47, 1] <- "iwate"
newdata3[47, 2] <- 0

colnames(newdata3) <- c("region", "value")
admin1_choropleth(country.name = "japan",
                  df           = newdata3,
                  title        = "Novel Coronavirus Cases",
                  legend       = "Cases",
                  num_colors   = 8)

newdata4 <- merge(newdata3, df_japan_census, by = "region")
newdata4$ratio <- round(newdata4$value / newdata4$pop_2010 * 1000000, 3)

newdata5 <- newdata4[, c("region", "ratio")]
colnames(newdata5)[2] <- "value" 
admin1_choropleth(country.name = "japan",
                  df           = newdata5,
                  title        = "Novel Coronavirus Cases per Capita",
                  legend       = "Rate (Denom is 1000000)",
                  num_colors   = 8)


  1. 正確にはその原著R Graphics Cookbookを参考にしました.↩︎

  2. ここはI. Faber. (2018). Animating Your Data Visualizations Like a Boss Using R, towards data scienceを参照しました.↩︎

  3. データセットについてはこちらを参照↩︎