library(tidyverse)
Solution: Filter, select, and mutate
Get started
Filter penguins
Find all penguins that …
- … have a bill length between 40 and 45 mm.
# Keep rows where both comparisons hold: `&` is the element-wise AND, and
# `>=` / `<=` make both ends of the 40-45 mm range inclusive.
filter(penguins, bill_len >= 40 & bill_len <= 45)
# A tibble: 77 × 8
species island bill_len bill_dep flipper_len body_mass sex year
<fct> <fct> <dbl> <dbl> <int> <int> <fct> <int>
1 Adelie Torgersen 40.3 18 195 3250 female 2007
2 Adelie Torgersen 42 20.2 190 4250 <NA> 2007
3 Adelie Torgersen 41.1 17.6 182 3200 female 2007
4 Adelie Torgersen 42.5 20.7 197 4500 male 2007
5 Adelie Biscoe 40.6 18.6 183 3550 male 2007
6 Adelie Biscoe 40.5 17.9 187 3200 female 2007
7 Adelie Biscoe 40.5 18.9 180 3950 male 2007
8 Adelie Dream 40.9 18.9 184 3900 male 2007
9 Adelie Dream 42.2 18.5 180 3550 female 2007
10 Adelie Dream 40.8 18.4 195 3900 male 2007
# ℹ 67 more rows
- … are of the species Adelie or Gentoo.
# `%in%` checks each species against the whole set, so a single condition
# covers both values.
filter(penguins, species %in% c("Adelie", "Gentoo"))
# A tibble: 276 × 8
species island bill_len bill_dep flipper_len body_mass sex year
<fct> <fct> <dbl> <dbl> <int> <int> <fct> <int>
1 Adelie Torgersen 39.1 18.7 181 3750 male 2007
2 Adelie Torgersen 39.5 17.4 186 3800 female 2007
3 Adelie Torgersen 40.3 18 195 3250 female 2007
4 Adelie Torgersen NA NA NA NA <NA> 2007
5 Adelie Torgersen 36.7 19.3 193 3450 female 2007
6 Adelie Torgersen 39.3 20.6 190 3650 male 2007
7 Adelie Torgersen 38.9 17.8 181 3625 female 2007
8 Adelie Torgersen 39.2 19.6 195 4675 male 2007
9 Adelie Torgersen 34.1 18.1 193 3475 <NA> 2007
10 Adelie Torgersen 42 20.2 190 4250 <NA> 2007
# ℹ 266 more rows
# or
# filter(penguins, species == "Adelie" | species == "Gentoo")
- … lived on the island Dream in the year 2007.
# A row is kept only if both conditions are TRUE for it.
filter(penguins, island == "Dream" & year == 2007)
# A tibble: 46 × 8
species island bill_len bill_dep flipper_len body_mass sex year
<fct> <fct> <dbl> <dbl> <int> <int> <fct> <int>
1 Adelie Dream 39.5 16.7 178 3250 female 2007
2 Adelie Dream 37.2 18.1 178 3900 male 2007
3 Adelie Dream 39.5 17.8 188 3300 female 2007
4 Adelie Dream 40.9 18.9 184 3900 male 2007
5 Adelie Dream 36.4 17 195 3325 female 2007
6 Adelie Dream 39.2 21.1 196 4150 male 2007
7 Adelie Dream 38.8 20 190 3950 male 2007
8 Adelie Dream 42.2 18.5 180 3550 female 2007
9 Adelie Dream 37.6 19.3 181 3300 female 2007
10 Adelie Dream 39.8 19.1 184 4650 male 2007
# ℹ 36 more rows
Remove missing values
- Remove all penguins with missing values for sex.
# Naming `sex` limits the missing-value check to that column, so only rows
# with NA in `sex` are dropped.
drop_na(penguins, sex)
# A tibble: 333 × 8
species island bill_len bill_dep flipper_len body_mass sex year
<fct> <fct> <dbl> <dbl> <int> <int> <fct> <int>
1 Adelie Torgersen 39.1 18.7 181 3750 male 2007
2 Adelie Torgersen 39.5 17.4 186 3800 female 2007
3 Adelie Torgersen 40.3 18 195 3250 female 2007
4 Adelie Torgersen 36.7 19.3 193 3450 female 2007
5 Adelie Torgersen 39.3 20.6 190 3650 male 2007
6 Adelie Torgersen 38.9 17.8 181 3625 female 2007
7 Adelie Torgersen 39.2 19.6 195 4675 male 2007
8 Adelie Torgersen 41.1 17.6 182 3200 female 2007
9 Adelie Torgersen 38.6 21.2 191 3800 male 2007
10 Adelie Torgersen 34.6 21.1 198 4400 male 2007
# ℹ 323 more rows
Select columns
- Select only the variables species, sex, and year.
# The result contains the listed columns, in the order they are named.
select(penguins, species, sex, year)
# A tibble: 344 × 3
species sex year
<fct> <fct> <int>
1 Adelie male 2007
2 Adelie female 2007
3 Adelie female 2007
4 Adelie <NA> 2007
5 Adelie female 2007
6 Adelie male 2007
7 Adelie female 2007
8 Adelie male 2007
9 Adelie <NA> 2007
10 Adelie <NA> 2007
# ℹ 334 more rows
- Select only columns that start with "bill".
# `starts_with()` is a selection helper that picks columns by name prefix.
select(penguins, starts_with("bill"))
# A tibble: 344 × 2
bill_len bill_dep
<dbl> <dbl>
1 39.1 18.7
2 39.5 17.4
3 40.3 18
4 NA NA
5 36.7 19.3
6 39.3 20.6
7 38.9 17.8
8 39.2 19.6
9 34.1 18.1
10 42 20.2
# ℹ 334 more rows
Add new columns
- Add a column with the ratio of bill length to bill depth.
# The division is vectorised over the rows; a row with missing bill
# measurements gets NA as its ratio.
mutate(penguins, ratio = bill_len / bill_dep)
# A tibble: 344 × 9
species island bill_len bill_dep flipper_len body_mass sex year ratio
<fct> <fct> <dbl> <dbl> <int> <int> <fct> <int> <dbl>
1 Adelie Torgersen 39.1 18.7 181 3750 male 2007 2.09
2 Adelie Torgersen 39.5 17.4 186 3800 female 2007 2.27
3 Adelie Torgersen 40.3 18 195 3250 female 2007 2.24
4 Adelie Torgersen NA NA NA NA <NA> 2007 NA
5 Adelie Torgersen 36.7 19.3 193 3450 female 2007 1.90
6 Adelie Torgersen 39.3 20.6 190 3650 male 2007 1.91
7 Adelie Torgersen 38.9 17.8 181 3625 female 2007 2.19
8 Adelie Torgersen 39.2 19.6 195 4675 male 2007 2
9 Adelie Torgersen 34.1 18.1 193 3475 <NA> 2007 1.88
10 Adelie Torgersen 42 20.2 190 4250 <NA> 2007 2.08
# ℹ 334 more rows
- Add a column with abbreviations for the species (Adelie = A, Gentoo = G, Chinstrap = C).
# case_when() evaluates the conditions from top to bottom and, for each row,
# uses the value to the right of `~` for the first condition that is TRUE.
mutate(
penguins,
species_short = case_when(
species == "Adelie" ~ "A",
species == "Gentoo" ~ "G",
species == "Chinstrap" ~ "C",
# Fallback for rows that match none of the conditions above.
.default = NA
)
)
# A tibble: 344 × 9
species island bill_len bill_dep flipper_len body_mass sex year
<fct> <fct> <dbl> <dbl> <int> <int> <fct> <int>
1 Adelie Torgersen 39.1 18.7 181 3750 male 2007
2 Adelie Torgersen 39.5 17.4 186 3800 female 2007
3 Adelie Torgersen 40.3 18 195 3250 female 2007
4 Adelie Torgersen NA NA NA NA <NA> 2007
5 Adelie Torgersen 36.7 19.3 193 3450 female 2007
6 Adelie Torgersen 39.3 20.6 190 3650 male 2007
7 Adelie Torgersen 38.9 17.8 181 3625 female 2007
8 Adelie Torgersen 39.2 19.6 195 4675 male 2007
9 Adelie Torgersen 34.1 18.1 193 3475 <NA> 2007
10 Adelie Torgersen 42 20.2 190 4250 <NA> 2007
# ℹ 334 more rows
# ℹ 1 more variable: species_short <chr>
Combine with the pipe
- Use the pipe to: remove rows with missing sex, keep only Adelie penguins, and select species, sex, and body_mass.
# Each `|>` passes the result on its left as the first argument of the call
# on its right, so the steps read top to bottom in the order they run.
penguins |>
drop_na(sex) |>
filter(species == "Adelie") |>
select(species, sex, body_mass)
# A tibble: 146 × 3
species sex body_mass
<fct> <fct> <int>
1 Adelie male 3750
2 Adelie female 3800
3 Adelie female 3250
4 Adelie female 3450
5 Adelie male 3650
6 Adelie female 3625
7 Adelie male 4675
8 Adelie female 3200
9 Adelie male 3800
10 Adelie male 4400
# ℹ 136 more rows
For the fast ones
Use filter_out() to exclude penguins from Torgersen island, then select only species, island, and flipper_len.
# filter_out() drops the rows for which the condition is TRUE, so every
# Torgersen penguin is removed before the columns are selected.
penguins |>
filter_out(island == "Torgersen") |>
select(species, island, flipper_len)
# A tibble: 292 × 3
species island flipper_len
<fct> <fct> <int>
1 Adelie Biscoe 174
2 Adelie Biscoe 180
3 Adelie Biscoe 189
4 Adelie Biscoe 185
5 Adelie Biscoe 180
6 Adelie Biscoe 187
7 Adelie Biscoe 183
8 Adelie Biscoe 187
9 Adelie Biscoe 172
10 Adelie Biscoe 180
# ℹ 282 more rows
Create a size_category column with case_when based on body mass, in a pipe that also removes NAs and selects only species, body_mass, and size_category.
# Rows with missing body mass are dropped first, so every remaining row
# matches one of the three conditions below.
penguins |>
drop_na(body_mass) |>
mutate(
size_category = case_when(
# Conditions are checked in order: "medium" is only reached by rows that
# failed `body_mass < 3500`, i.e. 3500 <= body_mass < 5000.
body_mass < 3500 ~ "small",
body_mass < 5000 ~ "medium",
body_mass >= 5000 ~ "large"
)
) |>
select(species, body_mass, size_category)
# A tibble: 342 × 3
species body_mass size_category
<fct> <int> <chr>
1 Adelie 3750 medium
2 Adelie 3800 medium
3 Adelie 3250 small
4 Adelie 3450 small
5 Adelie 3650 medium
6 Adelie 3625 medium
7 Adelie 4675 medium
8 Adelie 3475 small
9 Adelie 4250 medium
10 Adelie 3300 small
# ℹ 332 more rows