Solution to dplyr tasks

1 Get started

library(tidyverse)

2 Data transformation with dplyr

Find all penguins that …

  1. … have a bill length between 40 and 45 mm.
# Keep penguins whose bill length lies in [40, 45] mm; between() includes both
# endpoints.
penguins |>
  filter(between(bill_len, 40, 45))
# A tibble: 77 × 8
   species island    bill_len bill_dep flipper_len body_mass sex     year
   <fct>   <fct>        <dbl>    <dbl>       <int>     <int> <fct>  <int>
 1 Adelie  Torgersen     40.3     18           195      3250 female  2007
 2 Adelie  Torgersen     42       20.2         190      4250 <NA>    2007
 3 Adelie  Torgersen     41.1     17.6         182      3200 female  2007
 4 Adelie  Torgersen     42.5     20.7         197      4500 male    2007
 5 Adelie  Biscoe        40.6     18.6         183      3550 male    2007
 6 Adelie  Biscoe        40.5     17.9         187      3200 female  2007
 7 Adelie  Biscoe        40.5     18.9         180      3950 male    2007
 8 Adelie  Dream         40.9     18.9         184      3900 male    2007
 9 Adelie  Dream         42.2     18.5         180      3550 female  2007
10 Adelie  Dream         40.8     18.4         195      3900 male    2007
# ℹ 67 more rows
# same as (between() includes both endpoints)
# filter(penguins, bill_len >= 40 & bill_len <= 45)
  1. … for which we know the sex.
# Drop the rows where sex is missing.
penguins |>
  filter(!is.na(sex))
# A tibble: 333 × 8
   species island    bill_len bill_dep flipper_len body_mass sex     year
   <fct>   <fct>        <dbl>    <dbl>       <int>     <int> <fct>  <int>
 1 Adelie  Torgersen     39.1     18.7         181      3750 male    2007
 2 Adelie  Torgersen     39.5     17.4         186      3800 female  2007
 3 Adelie  Torgersen     40.3     18           195      3250 female  2007
 4 Adelie  Torgersen     36.7     19.3         193      3450 female  2007
 5 Adelie  Torgersen     39.3     20.6         190      3650 male    2007
 6 Adelie  Torgersen     38.9     17.8         181      3625 female  2007
 7 Adelie  Torgersen     39.2     19.6         195      4675 male    2007
 8 Adelie  Torgersen     41.1     17.6         182      3200 female  2007
 9 Adelie  Torgersen     38.6     21.2         191      3800 male    2007
10 Adelie  Torgersen     34.6     21.1         198      4400 male    2007
# ℹ 323 more rows
  1. … which are of the species Adelie or Gentoo
# Keep rows whose species is one of the two listed values.
penguins |>
  filter(species %in% c("Adelie", "Gentoo"))
# A tibble: 276 × 8
   species island    bill_len bill_dep flipper_len body_mass sex     year
   <fct>   <fct>        <dbl>    <dbl>       <int>     <int> <fct>  <int>
 1 Adelie  Torgersen     39.1     18.7         181      3750 male    2007
 2 Adelie  Torgersen     39.5     17.4         186      3800 female  2007
 3 Adelie  Torgersen     40.3     18           195      3250 female  2007
 4 Adelie  Torgersen     NA       NA            NA        NA <NA>    2007
 5 Adelie  Torgersen     36.7     19.3         193      3450 female  2007
 6 Adelie  Torgersen     39.3     20.6         190      3650 male    2007
 7 Adelie  Torgersen     38.9     17.8         181      3625 female  2007
 8 Adelie  Torgersen     39.2     19.6         195      4675 male    2007
 9 Adelie  Torgersen     34.1     18.1         193      3475 <NA>    2007
10 Adelie  Torgersen     42       20.2         190      4250 <NA>    2007
# ℹ 266 more rows
# or
# filter(penguins, (species == "Adelie" | species == "Gentoo"))
  1. … lived on the island Dream in the year 2007. How many of them were from each of the 3 species?
# Conditions separated by commas in filter() are combined with AND.
# count() only lists species that actually occur in the filtered rows.
penguins |>
  filter(island == "Dream", year == 2007) |>
  count(species)
# A tibble: 2 × 2
  species       n
  <fct>     <int>
1 Adelie       20
2 Chinstrap    26

Count …

  1. … the number of penguins on each island.
# One row per island, with the number of penguins in column n.
penguins |>
  count(island)
# A tibble: 3 × 2
  island        n
  <fct>     <int>
1 Biscoe      168
2 Dream       124
3 Torgersen    52
  1. … the number of penguins of each species on each island.
# One row per island/species combination that occurs in the data.
penguins |>
  count(island, species)
# A tibble: 5 × 3
  island    species       n
  <fct>     <fct>     <int>
1 Biscoe    Adelie       44
2 Biscoe    Gentoo      124
3 Dream     Adelie       56
4 Dream     Chinstrap    68
5 Torgersen Adelie       52

Select …

  1. … only the variables species, sex and year
# Keep only the three named columns, in the order given.
penguins |>
  select(species, sex, year)
# A tibble: 344 × 3
   species sex     year
   <fct>   <fct>  <int>
 1 Adelie  male    2007
 2 Adelie  female  2007
 3 Adelie  female  2007
 4 Adelie  <NA>    2007
 5 Adelie  female  2007
 6 Adelie  male    2007
 7 Adelie  female  2007
 8 Adelie  male    2007
 9 Adelie  <NA>    2007
10 Adelie  <NA>    2007
# ℹ 334 more rows
  1. … only columns that contain measurements in mm
# The measurement columns are named bill_len, bill_dep and flipper_len in this
# version of the data (no "_mm" suffix), so ends_with("mm") or contains("_mm")
# would match nothing and return a tibble with 0 columns. Select them by name.
select(penguins, bill_len, bill_dep, flipper_len)
# A tibble: 344 × 3
   bill_len bill_dep flipper_len
      <dbl>    <dbl>       <int>
 1     39.1     18.7         181
 2     39.5     17.4         186
 3     40.3     18           195
 4     NA       NA            NA
 5     36.7     19.3         193
 6     39.3     20.6         190
 7     38.9     17.8         181
 8     39.2     19.6         195
 9     34.1     18.1         193
10     42       20.2         190
# ℹ 334 more rows
# or, matching on the name suffixes
# select(penguins, matches("_(len|dep)$"))

Add a column …

  1. … with the ratio of bill length to bill depth
# Append a column holding bill length divided by bill depth; rows with missing
# measurements get NA.
penguins |>
  mutate(ratio = bill_len / bill_dep)
# A tibble: 344 × 9
   species island    bill_len bill_dep flipper_len body_mass sex     year ratio
   <fct>   <fct>        <dbl>    <dbl>       <int>     <int> <fct>  <int> <dbl>
 1 Adelie  Torgersen     39.1     18.7         181      3750 male    2007  2.09
 2 Adelie  Torgersen     39.5     17.4         186      3800 female  2007  2.27
 3 Adelie  Torgersen     40.3     18           195      3250 female  2007  2.24
 4 Adelie  Torgersen     NA       NA            NA        NA <NA>    2007 NA   
 5 Adelie  Torgersen     36.7     19.3         193      3450 female  2007  1.90
 6 Adelie  Torgersen     39.3     20.6         190      3650 male    2007  1.91
 7 Adelie  Torgersen     38.9     17.8         181      3625 female  2007  2.19
 8 Adelie  Torgersen     39.2     19.6         195      4675 male    2007  2   
 9 Adelie  Torgersen     34.1     18.1         193      3475 <NA>    2007  1.88
10 Adelie  Torgersen     42       20.2         190      4250 <NA>    2007  2.08
# ℹ 334 more rows
  1. … with abbreviations for the species (Adelie = A, Gentoo = G, Chinstrap = C).
# Map each species name to its one-letter abbreviation (character column).
# case_when() returns NA for any value not covered by a condition.
penguins |>
  mutate(
    species_short = case_when(
      species == "Adelie" ~ "A",
      species == "Gentoo" ~ "G",
      species == "Chinstrap" ~ "C"
    )
  )
# A tibble: 344 × 9
   species island    bill_len bill_dep flipper_len body_mass sex     year
   <fct>   <fct>        <dbl>    <dbl>       <int>     <int> <fct>  <int>
 1 Adelie  Torgersen     39.1     18.7         181      3750 male    2007
 2 Adelie  Torgersen     39.5     17.4         186      3800 female  2007
 3 Adelie  Torgersen     40.3     18           195      3250 female  2007
 4 Adelie  Torgersen     NA       NA            NA        NA <NA>    2007
 5 Adelie  Torgersen     36.7     19.3         193      3450 female  2007
 6 Adelie  Torgersen     39.3     20.6         190      3650 male    2007
 7 Adelie  Torgersen     38.9     17.8         181      3625 female  2007
 8 Adelie  Torgersen     39.2     19.6         195      4675 male    2007
 9 Adelie  Torgersen     34.1     18.1         193      3475 <NA>    2007
10 Adelie  Torgersen     42       20.2         190      4250 <NA>    2007
# ℹ 334 more rows
# ℹ 1 more variable: species_short <chr>

Calculate …

  1. … mean flipper length and body mass for the 3 species and male and female penguins separately
# Average flipper length and body mass for every species/sex combination.
# na.rm = TRUE ignores missing measurements; .by groups for this call only,
# so the result is not a grouped tibble. Penguins with unknown sex form their
# own group.
penguins |>
  summarize(
    mean_flipper = mean(flipper_len, na.rm = TRUE),
    mean_body = mean(body_mass, na.rm = TRUE),
    .by = c(species, sex)
  )
# A tibble: 8 × 4
  species   sex    mean_flipper mean_body
  <fct>     <fct>         <dbl>     <dbl>
1 Adelie    male           192.     4043.
2 Adelie    female         188.     3369.
3 Adelie    <NA>           186.     3540 
4 Gentoo    female         213.     4680.
5 Gentoo    male           222.     5485.
6 Gentoo    <NA>           216.     4588.
7 Chinstrap female         192.     3527.
8 Chinstrap male           200.     3939.
  1. Can you do the same but remove the penguins for which we don’t know the sex first?
# Same summary as before, but rows with unknown sex are removed first, so no
# <NA> group appears in the result.
penguins |>
  filter(!is.na(sex)) |>
  summarize(
    mean_flipper = mean(flipper_len, na.rm = TRUE),
    mean_body = mean(body_mass, na.rm = TRUE),
    .by = c(species, sex)
  )
# A tibble: 6 × 4
  species   sex    mean_flipper mean_body
  <fct>     <fct>         <dbl>     <dbl>
1 Adelie    male           192.     4043.
2 Adelie    female         188.     3369.
3 Gentoo    female         213.     4680.
4 Gentoo    male           222.     5485.
5 Chinstrap female         192.     3527.
6 Chinstrap male           200.     3939.

3 For the fast ones

  1. Make a boxplot of penguin body mass with sex on the x-axis and facets for the different species. Can you remove the penguins with missing values for sex first?
# Boxplots of body mass per sex, one panel per species.
# Rows with unknown sex are removed before plotting so no NA box is drawn.
penguins |>
  filter(!is.na(sex)) |>
  ggplot(aes(x = sex, y = body_mass)) +
  geom_boxplot() +
  facet_wrap(vars(species))

  1. Make a scatterplot with the ratio of bill length to bill depth on the y axis and flipper length on the x axis. Can you distinguish the points for male and female penguins and remove penguins with unknown sex before making the plot?
# Scatterplot of the bill length / bill depth ratio against flipper length,
# coloured by sex.
penguins |>
  # The ratio is computed on the fly; it is not stored in penguins.
  mutate(ratio = bill_len / bill_dep) |>
  # Remove penguins with unknown sex so the legend has no NA entry.
  filter(!is.na(sex)) |>
  ggplot(aes(x = flipper_len, y = ratio, color = sex)) +
  geom_point() +
  # Unnamed values are assigned to the levels of sex in order.
  scale_color_manual(values = c("cyan4", "darkorange")) +
  labs(
    x = "Flipper length (mm)",
    y = "Ratio bill length / bill depth (-)"
  ) +
  theme_minimal()