library(tidyverse)
library(palmerpenguins)
Solution to dplyr tasks
1 Get started
2 Data transformation with dplyr
Find all penguins that …
- … have a bill length between 40 and 45 mm.
# Keep the penguins whose bill length lies in the range 40-45 mm.
penguins |>
  filter(between(bill_length_mm, 40, 45))
# A tibble: 77 × 8
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
<fct> <fct> <dbl> <dbl> <int> <int>
1 Adelie Torgersen 40.3 18 195 3250
2 Adelie Torgersen 42 20.2 190 4250
3 Adelie Torgersen 41.1 17.6 182 3200
4 Adelie Torgersen 42.5 20.7 197 4500
5 Adelie Biscoe 40.6 18.6 183 3550
6 Adelie Biscoe 40.5 17.9 187 3200
7 Adelie Biscoe 40.5 18.9 180 3950
8 Adelie Dream 40.9 18.9 184 3900
9 Adelie Dream 42.2 18.5 180 3550
10 Adelie Dream 40.8 18.4 195 3900
# ℹ 67 more rows
# ℹ 2 more variables: sex <fct>, year <int>
# same as (between() includes both boundaries)
# filter(penguins, bill_length_mm >= 40 & bill_length_mm <= 45)
- … for which we know the sex.
# Keep only the penguins whose sex is recorded (not NA).
penguins |>
  filter(!is.na(sex))
# A tibble: 333 × 8
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
<fct> <fct> <dbl> <dbl> <int> <int>
1 Adelie Torgersen 39.1 18.7 181 3750
2 Adelie Torgersen 39.5 17.4 186 3800
3 Adelie Torgersen 40.3 18 195 3250
4 Adelie Torgersen 36.7 19.3 193 3450
5 Adelie Torgersen 39.3 20.6 190 3650
6 Adelie Torgersen 38.9 17.8 181 3625
7 Adelie Torgersen 39.2 19.6 195 4675
8 Adelie Torgersen 41.1 17.6 182 3200
9 Adelie Torgersen 38.6 21.2 191 3800
10 Adelie Torgersen 34.6 21.1 198 4400
# ℹ 323 more rows
# ℹ 2 more variables: sex <fct>, year <int>
- … which are of the species Adelie or Gentoo
# Keep the penguins whose species is Adelie or Gentoo.
filter(penguins, species %in% c("Adelie", "Gentoo"))
# or
# filter(penguins, (species == "Adelie" | species == "Gentoo"))
# A tibble: 276 × 8
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
<fct> <fct> <dbl> <dbl> <int> <int>
1 Adelie Torgersen 39.1 18.7 181 3750
2 Adelie Torgersen 39.5 17.4 186 3800
3 Adelie Torgersen 40.3 18 195 3250
4 Adelie Torgersen NA NA NA NA
5 Adelie Torgersen 36.7 19.3 193 3450
6 Adelie Torgersen 39.3 20.6 190 3650
7 Adelie Torgersen 38.9 17.8 181 3625
8 Adelie Torgersen 39.2 19.6 195 4675
9 Adelie Torgersen 34.1 18.1 193 3475
10 Adelie Torgersen 42 20.2 190 4250
# ℹ 266 more rows
# ℹ 2 more variables: sex <fct>, year <int>
- … lived on the island Dream in the year 2007. How many of them were from each of the 3 species?
# Penguins observed on Dream island in 2007, tallied per species.
# Comma-separated conditions in filter() are combined with a logical AND.
penguins |>
  filter(island == "Dream", year == 2007) |>
  count(species)
# A tibble: 2 × 2
species n
<fct> <int>
1 Adelie 20
2 Chinstrap 26
Count …
- … the number of penguins on each island.
# Number of penguins per island.
penguins |>
  count(island)
# A tibble: 3 × 2
island n
<fct> <int>
1 Biscoe 168
2 Dream 124
3 Torgersen 52
- … the number of penguins of each species on each island.
# Number of penguins for every island/species combination.
penguins |>
  count(island, species)
# A tibble: 5 × 3
island species n
<fct> <fct> <int>
1 Biscoe Adelie 44
2 Biscoe Gentoo 124
3 Dream Adelie 56
4 Dream Chinstrap 68
5 Torgersen Adelie 52
Select …
- … only the variables species, sex and year
# Keep only the species, sex and year columns.
penguins |>
  select(species, sex, year)
# A tibble: 344 × 3
species sex year
<fct> <fct> <int>
1 Adelie male 2007
2 Adelie female 2007
3 Adelie female 2007
4 Adelie <NA> 2007
5 Adelie female 2007
6 Adelie male 2007
7 Adelie female 2007
8 Adelie male 2007
9 Adelie <NA> 2007
10 Adelie <NA> 2007
# ℹ 334 more rows
- … only columns that contain measurements in mm
# Keep the columns whose name ends in "mm" (the millimetre measurements).
penguins |>
  select(ends_with("mm"))
# A tibble: 344 × 3
bill_length_mm bill_depth_mm flipper_length_mm
<dbl> <dbl> <int>
1 39.1 18.7 181
2 39.5 17.4 186
3 40.3 18 195
4 NA NA NA
5 36.7 19.3 193
6 39.3 20.6 190
7 38.9 17.8 181
8 39.2 19.6 195
9 34.1 18.1 193
10 42 20.2 190
# ℹ 334 more rows
# or
#select(penguins, contains("_mm"))
Add a column …
- … with the ratio of bill length to bill depth
# Append a column holding bill length divided by bill depth.
penguins |>
  mutate(ratio = bill_length_mm / bill_depth_mm)
# A tibble: 344 × 9
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
<fct> <fct> <dbl> <dbl> <int> <int>
1 Adelie Torgersen 39.1 18.7 181 3750
2 Adelie Torgersen 39.5 17.4 186 3800
3 Adelie Torgersen 40.3 18 195 3250
4 Adelie Torgersen NA NA NA NA
5 Adelie Torgersen 36.7 19.3 193 3450
6 Adelie Torgersen 39.3 20.6 190 3650
7 Adelie Torgersen 38.9 17.8 181 3625
8 Adelie Torgersen 39.2 19.6 195 4675
9 Adelie Torgersen 34.1 18.1 193 3475
10 Adelie Torgersen 42 20.2 190 4250
# ℹ 334 more rows
# ℹ 3 more variables: sex <fct>, year <int>, ratio <dbl>
- … with abbreviations for the species (Adelie = A, Gentoo = G, Chinstrap = C).
# Append a one-letter abbreviation of the species
# (Adelie = A, Gentoo = G, Chinstrap = C).
mutate(penguins,
  species_short = case_when(
    species == "Adelie" ~ "A",
    species == "Gentoo" ~ "G",
    species == "Chinstrap" ~ "C"
  ))
# A tibble: 344 × 9
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
<fct> <fct> <dbl> <dbl> <int> <int>
1 Adelie Torgersen 39.1 18.7 181 3750
2 Adelie Torgersen 39.5 17.4 186 3800
3 Adelie Torgersen 40.3 18 195 3250
4 Adelie Torgersen NA NA NA NA
5 Adelie Torgersen 36.7 19.3 193 3450
6 Adelie Torgersen 39.3 20.6 190 3650
7 Adelie Torgersen 38.9 17.8 181 3625
8 Adelie Torgersen 39.2 19.6 195 4675
9 Adelie Torgersen 34.1 18.1 193 3475
10 Adelie Torgersen 42 20.2 190 4250
# ℹ 334 more rows
# ℹ 3 more variables: sex <fct>, year <int>, species_short <chr>
Calculate …
- … mean flipper length and body mass for the 3 species and male and female penguins separately
# Mean flipper length and mean body mass for every species/sex combination.
# Penguins with unknown sex form their own (NA) group.
penguins |>
  summarize(
    mean_flipper = mean(flipper_length_mm, na.rm = TRUE),
    mean_body = mean(body_mass_g, na.rm = TRUE),
    .by = c(species, sex)
  )
# A tibble: 8 × 4
species sex mean_flipper mean_body
<fct> <fct> <dbl> <dbl>
1 Adelie male 192. 4043.
2 Adelie female 188. 3369.
3 Adelie <NA> 186. 3540
4 Gentoo female 213. 4680.
5 Gentoo male 222. 5485.
6 Gentoo <NA> 216. 4588.
7 Chinstrap female 192. 3527.
8 Chinstrap male 200. 3939.
- Can you do the same but remove the penguins for which we don’t know the sex first?
# Same summary as before, but drop penguins of unknown sex first so that no
# NA group appears in the result.
penguins |>
  filter(!is.na(sex)) |>
  summarize(
    mean_flipper = mean(flipper_length_mm, na.rm = TRUE),
    mean_body = mean(body_mass_g, na.rm = TRUE),
    .by = c(species, sex)
  )
# A tibble: 6 × 4
species sex mean_flipper mean_body
<fct> <fct> <dbl> <dbl>
1 Adelie male 192. 4043.
2 Adelie female 188. 3369.
3 Gentoo female 213. 4680.
4 Gentoo male 222. 5485.
5 Chinstrap female 192. 3527.
6 Chinstrap male 200. 3939.
3 Extras
- Make a boxplot of penguin body mass with sex on the x-axis and facets for the different species. Can you remove the penguins with missing values for sex first?
# Boxplot of body mass by sex, one facet per species.
# Penguins of unknown sex are removed before plotting.
penguins |>
  filter(!is.na(sex)) |>
  ggplot(aes(x = sex, y = body_mass_g)) +
  geom_boxplot() +
  facet_wrap(~species)
- Make a scatterplot with the ratio of bill length to bill depth on the y-axis and flipper length on the x-axis. Can you distinguish the points for male and female penguins and remove penguins with unknown sex before making the plot?
# Scatterplot of the bill length / bill depth ratio against flipper length,
# with points colored by sex. Penguins of unknown sex are removed first.
penguins |>
  mutate(ratio = bill_length_mm / bill_depth_mm) |>
  filter(!is.na(sex)) |>
  ggplot(aes(x = flipper_length_mm, y = ratio, color = sex)) +
  geom_point() +
  scale_color_manual(values = c("cyan4", "darkorange")) +
  labs(
    x = "Flipper length (mm)",
    y = "Ratio bill length / bill depth (-)"
  ) +
  theme_minimal()