Rでグラフを作ろう

はじめに

この資料は2020年4月24日のデータサイエンス研究会の新歓勉強会のために作成されました．初心者の方を対象としています．全体的に『Rグラフィックスクックブック』第2版を参考にしました¹．

Rの基本操作

Rの基本操作は，以下のウェブサイトを参照してください．

必要なパッケージ

今回，用いるのはggplot2とgcookbook，そしてwooldridgeというパッケージです．インストールがまだの場合は，以下のコードを実行してください．

install.packages("ggplot2")
install.packages("gcookbook")
install.packages("wooldridge")

ggplot2は美しいグラフを簡単につくるためのパッケージです．gcookbookには『Rグラフィックスクックブック』で用いられるデータセットが入っています．wooldridgeにはIntroductory Econometricsという計量経済学の教科書のデータセットが入っています．

library()関数を用いてパッケージをロードします．

library(ggplot2)

棒グラフ

ここで使うデータセットはcabbage_expというキャベツに関する集計データです．Cultivarが品種，Dateが栽培日，Weightが平均重量を表しています．

mydata1 <- gcookbook::cabbage_exp
mydata1

##   Cultivar Date Weight        sd  n         se
## 1      c39  d16   3.18 0.9566144 10 0.30250803
## 2      c39  d20   2.80 0.2788867 10 0.08819171
## 3      c39  d21   2.74 0.9834181 10 0.31098410
## 4      c52  d16   2.26 0.4452215 10 0.14079141
## 5      c52  d20   3.11 0.7908505 10 0.25008887
## 6      c52  d21   1.47 0.2110819 10 0.06674995

ggplot(mydata1, aes(x = Date, y = Weight)) +
  geom_col()

ggplot(mydata1, aes(x = Date, y = Weight, fill = Cultivar)) +
  geom_col()

ggplot(mydata1, aes(x = Date, y = Weight, fill = Cultivar)) +
  geom_col(position = "dodge")

ggplot(mydata1, aes(x = Date, y = Weight, fill = Cultivar)) +
  geom_col(position = "dodge", colour = "black")

ggplot(mydata1, aes(x = Date, y = Weight, fill = Cultivar)) +
  geom_col(position = "dodge", colour = "black") +
  scale_fill_brewer(palette = "Pastel1")

ヒストグラム

ここで使うfaithfulというデータセットはOld Faithful Geyserという間欠泉に関するデータです．waitingは噴出間隔の時間を表します．

mydata2 <- faithful

ggplot(mydata2, aes(x = waiting)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(mydata2, aes(x = waiting)) +
  geom_histogram(fill = "lightblue")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(mydata2, aes(x = waiting)) +
  geom_histogram(fill = "lightblue", col = "black")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(mydata2, aes(x = waiting)) +
  geom_density()

ggplot(mydata2, aes(x = waiting, y = ..density..)) +
  geom_histogram(fill = "lightblue", colour = "black") +
  geom_density()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

箱ひげ図

mydata3 <- ToothGrowth

ggplot(mydata3, aes(x = supp, y = len)) +
  geom_boxplot()

mydata3 <- ToothGrowth

ggplot(mydata3, aes(x = factor(dose), y = len)) +
  geom_boxplot()

mydata3 <- ToothGrowth

ggplot(mydata3, aes(x = interaction(supp, dose), y = len)) +
  geom_boxplot()

mydata3 <- ToothGrowth

ggplot(mydata3, aes(x = interaction(supp, dose), y = len)) +
  geom_boxplot() +
  geom_jitter()

散布図

mydata4 <- gcookbook::heightweight[, c("sex", "ageYear", "heightIn")]

head(mydata4)

##   sex ageYear heightIn
## 1   f   11.92     56.3
## 2   f   12.92     62.3
## 3   f   12.75     63.3
## 4   f   13.42     59.0
## 5   f   15.92     62.5
## 6   f   14.25     62.5

ggplot(mydata4, aes(x = ageYear, y = heightIn)) +
  geom_point()

ggplot(mydata4, aes(x = ageYear, y = heightIn, colour = sex)) +
  geom_point()

ggplot(mydata4, aes(x = ageYear, y = heightIn, shape = sex, colour = sex)) +
  geom_point()

ggplot(mydata4, aes(x = ageYear, y = heightIn, shape = sex, colour = sex)) +
  geom_point() +
  scale_shape_manual(values = c(1,2))

ggplot(mydata4, aes(x = ageYear, y = heightIn, shape = sex, colour = sex)) +
  geom_point() +
  scale_shape_manual(values = c(1,2)) +
  scale_colour_brewer(palette = "Set1")

ggplot(mydata4, aes(x = ageYear, y = heightIn, shape = sex, colour = sex)) +
  geom_smooth()

ggplot(mydata4, aes(x = ageYear, y = heightIn, shape = sex, colour = sex)) +
  geom_point() +
  scale_shape_manual(values = c(1,2)) +
  scale_colour_brewer(palette = "Set1") +
  geom_smooth()

演習

mydata <- wooldridge::wage1

head(mydata)

##   wage educ exper tenure nonwhite female married numdep smsa northcen south
## 1 3.10   11     2      0        0      1       0      2    1        0     0
## 2 3.24   12    22      2        0      1       1      3    1        0     0
## 3 3.00   11     2      0        0      0       0      2    0        0     0
## 4 6.00    8    44     28        0      0       1      0    1        0     0
## 5 5.30   12     7      2        0      0       1      1    0        0     0
## 6 8.75   16     9      8        0      0       1      0    1        0     0
##   west construc ndurman trcommpu trade services profserv profocc clerocc
## 1    1        0       0        0     0        0        0       0       0
## 2    1        0       0        0     0        1        0       0       0
## 3    1        0       0        0     1        0        0       0       0
## 4    1        0       0        0     0        0        0       0       1
## 5    1        0       0        0     0        0        0       0       0
## 6    1        0       0        0     0        0        1       1       0
##   servocc    lwage expersq tenursq
## 1       0 1.131402       4       0
## 2       1 1.175573     484       4
## 3       0 1.098612       4       0
## 4       0 1.791759    1936     784
## 5       0 1.667707      49       4
## 6       0 2.169054      81      64

ggplot(mydata, aes(x = factor(female), y = wage, fill = factor(female))) +
  geom_boxplot()

ggplot(mydata, aes(x = factor(female), y = wage)) +
  geom_boxplot() +
  geom_jitter(aes(col = factor(female)), size = 0.5)

ggplot(mydata, aes(x = factor(female), y = wage)) +
  geom_boxplot() +
  geom_jitter(aes(col = factor(married)), size = 0.5)

ggplot(mydata, aes(x = factor(female), y = wage)) +
  geom_boxplot() +
  geom_jitter(aes(col = factor(nonwhite)), size = 0.5)

ggplot(mydata, aes(x = interaction(female, married), y = wage, col = interaction(female, married))) +
  geom_boxplot() +
  geom_jitter(size = 0.5) +
  labs(colour = "F X M")

ggplot(mydata, aes(x = educ, y = wage)) +
  geom_point() +
  geom_smooth()

ggplot(mydata, aes(x = educ, y = wage, col = factor(female))) +
  geom_point() +
  geom_smooth()

ggplot(mydata, aes(x = tenure, y = wage, col = factor(female))) +
  geom_point() +
  geom_smooth()

ggplot(mydata, aes(x = exper, y = wage, col = factor(female))) +
  geom_point() +
  geom_smooth()

ggplot(mydata, aes(x = educ, y = wage, col = factor(nonwhite))) +
  geom_point() +
  geom_smooth()

ggplot(mydata, aes(x = tenure, y = wage, col = factor(nonwhite))) +
  geom_point() +
  geom_smooth()

ggplot(mydata, aes(x = exper, y = wage, col = factor(nonwhite))) +
  geom_point() +
  geom_smooth()

おまけ1

インタラクティブなチャートも簡単に作れます．ここではplotlyパッケージを使ってみましょう²．

install.packages("plotly")
install.packages("gapminder")

library(plotly)
library(gapminder)

mydata <- gapminder


myplot <- plot_ly(gapminder,
               x = ~gdpPercap, 
               y = ~lifeExp, 
               size = ~pop, 
               color = ~continent, 
               frame = ~year, 
               text = ~country, 
               hoverinfo = "text",
               type = 'scatter',
               mode = 'markers')

layout(myplot,xaxis = list(type = "log"))

おまけ2

COVID-19.csvというデータセットをジャッグジャパン株式会社が公開しています³．これは都道府県別新型コロナウイルス感染者数のデータです．

install.packages("dplyr")
install.packages("choroplethr")
install.packages("choroplethrAdmin1")

mydf <- read.csv("COVID-19.csv")

「ディレクトリ？なんやそれ？」という場合は以下を実行．

mydf <- read.csv("https://dl.dropboxusercontent.com/s/6mztoeb6xf78g5w/COVID-19.csv")

mydf$d <- as.Date(mydf$確定日, "%m/%d/%y")
hist(mydf$d, breaks="days", freq=TRUE)

library(dplyr)

newdata <- mydf %>% 
  group_by(d) %>% 
  tally()

ggplot(newdata, aes(x = d, y = n)) +
  geom_point() +
  geom_line() +
  scale_x_date(date_breaks = "1 day", labels = scales::date_format("%m-%d")) +
  theme(axis.text.x = element_text(angle = 90, size = 5))

library("choroplethr")
library("choroplethrAdmin1")

data(df_japan_census)

newdata2 <- mydf %>% 
  group_by(居住都道府県) %>% 
  tally()

newdata3 <- newdata2[c(-1:-9, -11:-12, -18),]


newdata3$居住都道府県 <- c("mie", "kyoto", "saga", "hyogo",
                     "hokkaido", "chiba", "wakayama", "saitama",
                     "oita", "osaka", "nara", "miyagi",
                     "miyazaki", "toyama", "yamaguchi", "yamagata",
                     "yamanashi", "gifu", "okayama", "shimane",
                     "hiroshima", "tokushima", "ehime", "aichi", 
                     "niigata", "tokyo", "tochigi", "okinawa",
                     "shiga", "kumamoto", "ishikawa", "kanagawa",
                     "fukui", "fukuoka", "fukushima", "akita", 
                     "gunma", "ibaraki", "nagasaki", "nagano",
                     "aomori", "shizuoka", "kagawa", "kochi", 
                     "tottori", "kagoshima")

newdata3[47, 1] <- "iwate"
newdata3[47, 2] <- 0

colnames(newdata3) <- c("region", "value")

admin1_choropleth(country.name = "japan",
                  df           = newdata3,
                  title        = "Novel Coronavirus Cases",
                  legend       = "Cases",
                  num_colors   = 8)

newdata4 <- merge(newdata3, df_japan_census, by = "region")
newdata4$ratio <- round(newdata4$value / newdata4$pop_2010 * 1000000, 3)

newdata5 <- newdata4[, c("region", "ratio")]
colnames(newdata5)[2] <- "value"

admin1_choropleth(country.name = "japan",
                  df           = newdata5,
                  title        = "Novel Coronavirus Cases per Capita",
                  legend       = "Rate (Denom is 1000000)",
                  num_colors   = 8)

正確にはその原著R Graphics Cookbookを参考にしました．↩︎
ここはI. Faber. (2018). Animating Your Data Visualizations Like a Boss Using R, towards data scienceを参照しました．↩︎
データセットについてはこちらを参照．↩︎