Solution to dplyr tasks

1 Get started

library(tidyverse)
library(palmerpenguins)

2 Data transformation with dplyr

Find all penguins that …

  1. … have a bill length between 40 and 45 mm.
# Keep the penguins whose bill length lies between 40 and 45 mm.
penguins |>
  filter(between(bill_length_mm, 40, 45))
# A tibble: 77 × 8
   species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
   <fct>   <fct>              <dbl>         <dbl>             <int>       <int>
 1 Adelie  Torgersen           40.3          18                 195        3250
 2 Adelie  Torgersen           42            20.2               190        4250
 3 Adelie  Torgersen           41.1          17.6               182        3200
 4 Adelie  Torgersen           42.5          20.7               197        4500
 5 Adelie  Biscoe              40.6          18.6               183        3550
 6 Adelie  Biscoe              40.5          17.9               187        3200
 7 Adelie  Biscoe              40.5          18.9               180        3950
 8 Adelie  Dream               40.9          18.9               184        3900
 9 Adelie  Dream               42.2          18.5               180        3550
10 Adelie  Dream               40.8          18.4               195        3900
# ℹ 67 more rows
# ℹ 2 more variables: sex <fct>, year <int>
# same as (between() includes both boundaries)
# filter(penguins, bill_length_mm >= 40 & bill_length_mm <= 45)
  1. … for which we know the sex.
# Drop every row in which sex is missing (NA).
penguins |>
  filter(!is.na(sex))
# A tibble: 333 × 8
   species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
   <fct>   <fct>              <dbl>         <dbl>             <int>       <int>
 1 Adelie  Torgersen           39.1          18.7               181        3750
 2 Adelie  Torgersen           39.5          17.4               186        3800
 3 Adelie  Torgersen           40.3          18                 195        3250
 4 Adelie  Torgersen           36.7          19.3               193        3450
 5 Adelie  Torgersen           39.3          20.6               190        3650
 6 Adelie  Torgersen           38.9          17.8               181        3625
 7 Adelie  Torgersen           39.2          19.6               195        4675
 8 Adelie  Torgersen           41.1          17.6               182        3200
 9 Adelie  Torgersen           38.6          21.2               191        3800
10 Adelie  Torgersen           34.6          21.1               198        4400
# ℹ 323 more rows
# ℹ 2 more variables: sex <fct>, year <int>
  1. … which are of the species Adelie or Gentoo
# Keep only Adelie and Gentoo penguins; %in% tests membership in the given set.
filter(penguins, species %in% c("Adelie", "Gentoo"))
# or
# filter(penguins, (species == "Adelie" | species == "Gentoo"))
  1. … lived on the island Dream in the year 2007. How many of them were from each of the 3 species?
# Restrict to penguins from Dream island in 2007, then tally rows per species.
penguins |>
  filter(island == "Dream", year == 2007) |>
  count(species)
# A tibble: 2 × 2
  species       n
  <fct>     <int>
1 Adelie       20
2 Chinstrap    26

Count …

  1. … the number of penguins on each island.
# Number of rows (penguins) per island.
penguins |>
  count(island)
# A tibble: 3 × 2
  island        n
  <fct>     <int>
1 Biscoe      168
2 Dream       124
3 Torgersen    52
  1. … the number of penguins of each species on each island.
# Number of rows (penguins) per island / species combination.
penguins |>
  count(island, species)
# A tibble: 5 × 3
  island    species       n
  <fct>     <fct>     <int>
1 Biscoe    Adelie       44
2 Biscoe    Gentoo      124
3 Dream     Adelie       56
4 Dream     Chinstrap    68
5 Torgersen Adelie       52

Select …

  1. … only the variables species, sex and year
# Keep just the three requested columns.
penguins |>
  select(species, sex, year)
# A tibble: 344 × 3
   species sex     year
   <fct>   <fct>  <int>
 1 Adelie  male    2007
 2 Adelie  female  2007
 3 Adelie  female  2007
 4 Adelie  <NA>    2007
 5 Adelie  female  2007
 6 Adelie  male    2007
 7 Adelie  female  2007
 8 Adelie  male    2007
 9 Adelie  <NA>    2007
10 Adelie  <NA>    2007
# ℹ 334 more rows
  1. … only columns that contain measurements in mm
# Keep the columns whose names end in "mm".
penguins |>
  select(ends_with("mm"))
# A tibble: 344 × 3
   bill_length_mm bill_depth_mm flipper_length_mm
            <dbl>         <dbl>             <int>
 1           39.1          18.7               181
 2           39.5          17.4               186
 3           40.3          18                 195
 4           NA            NA                  NA
 5           36.7          19.3               193
 6           39.3          20.6               190
 7           38.9          17.8               181
 8           39.2          19.6               195
 9           34.1          18.1               193
10           42            20.2               190
# ℹ 334 more rows
# or
#select(penguins, contains("_mm"))

Add a column …

  1. … with the ratio of bill length to bill depth
# Append a column holding bill length divided by bill depth.
penguins |>
  mutate(ratio = bill_length_mm / bill_depth_mm)
# A tibble: 344 × 9
   species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
   <fct>   <fct>              <dbl>         <dbl>             <int>       <int>
 1 Adelie  Torgersen           39.1          18.7               181        3750
 2 Adelie  Torgersen           39.5          17.4               186        3800
 3 Adelie  Torgersen           40.3          18                 195        3250
 4 Adelie  Torgersen           NA            NA                  NA          NA
 5 Adelie  Torgersen           36.7          19.3               193        3450
 6 Adelie  Torgersen           39.3          20.6               190        3650
 7 Adelie  Torgersen           38.9          17.8               181        3625
 8 Adelie  Torgersen           39.2          19.6               195        4675
 9 Adelie  Torgersen           34.1          18.1               193        3475
10 Adelie  Torgersen           42            20.2               190        4250
# ℹ 334 more rows
# ℹ 3 more variables: sex <fct>, year <int>, ratio <dbl>
  1. … with abbreviations for the species (Adelie = A, Gentoo = G, Chinstrap = C).
# Append a one-letter abbreviation of the species name.
penguins |>
  mutate(
    species_short = case_when(
      species == "Adelie" ~ "A",
      species == "Gentoo" ~ "G",
      species == "Chinstrap" ~ "C"
    )
  )
# A tibble: 344 × 9
   species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
   <fct>   <fct>              <dbl>         <dbl>             <int>       <int>
 1 Adelie  Torgersen           39.1          18.7               181        3750
 2 Adelie  Torgersen           39.5          17.4               186        3800
 3 Adelie  Torgersen           40.3          18                 195        3250
 4 Adelie  Torgersen           NA            NA                  NA          NA
 5 Adelie  Torgersen           36.7          19.3               193        3450
 6 Adelie  Torgersen           39.3          20.6               190        3650
 7 Adelie  Torgersen           38.9          17.8               181        3625
 8 Adelie  Torgersen           39.2          19.6               195        4675
 9 Adelie  Torgersen           34.1          18.1               193        3475
10 Adelie  Torgersen           42            20.2               190        4250
# ℹ 334 more rows
# ℹ 3 more variables: sex <fct>, year <int>, species_short <chr>

Calculate …

  1. … mean flipper length and body mass for the 3 species and male and female penguins separately
# Mean flipper length and body mass for every species / sex combination
# (missing measurements are ignored via na.rm = TRUE).
summarize(
  penguins,
  mean_flipper = mean(flipper_length_mm, na.rm = TRUE),
  mean_body = mean(body_mass_g, na.rm = TRUE),
  .by = c(species, sex)
)
# A tibble: 8 × 4
  species   sex    mean_flipper mean_body
  <fct>     <fct>         <dbl>     <dbl>
1 Adelie    male           192.     4043.
2 Adelie    female         188.     3369.
3 Adelie    <NA>           186.     3540 
4 Gentoo    female         213.     4680.
5 Gentoo    male           222.     5485.
6 Gentoo    <NA>           216.     4588.
7 Chinstrap female         192.     3527.
8 Chinstrap male           200.     3939.
  1. Can you do the same but remove the penguins for which we don’t know the sex first?
# Same summary as before, computed only on penguins whose sex is known.
filter(penguins, !is.na(sex)) |>
  summarize(
    .by = c(species, sex),
    mean_flipper = mean(flipper_length_mm, na.rm = TRUE),
    mean_body = mean(body_mass_g, na.rm = TRUE)
  )
# A tibble: 6 × 4
  species   sex    mean_flipper mean_body
  <fct>     <fct>         <dbl>     <dbl>
1 Adelie    male           192.     4043.
2 Adelie    female         188.     3369.
3 Gentoo    female         213.     4680.
4 Gentoo    male           222.     5485.
5 Chinstrap female         192.     3527.
6 Chinstrap male           200.     3939.

3 Extras

  1. Make a boxplot of penguin body mass with sex on the x-axis and facets for the different species. Can you remove the penguins with missing values for sex first?
# Boxplots of body mass by sex, one panel per species; rows with unknown sex
# are removed before plotting.
filter(penguins, !is.na(sex)) |>
  ggplot(aes(x = sex, y = body_mass_g)) +
  geom_boxplot() +
  facet_wrap(vars(species))

  1. Make a scatterplot with the ratio of bill length to bill depth on the y-axis and flipper length on the x-axis. Can you distinguish the points of male and female penguins and remove penguins with unknown sex before making the plot?
# Scatterplot of the bill length / bill depth ratio against flipper length,
# colored by sex; penguins with unknown sex are removed before plotting.
penguins |>
  mutate(ratio = bill_length_mm / bill_depth_mm) |>
  filter(!is.na(sex)) |>
  ggplot(aes(x = flipper_length_mm, y = ratio, color = sex)) +
  geom_point() +
  # Name the colors so the mapping is explicit instead of relying on the
  # order of the factor levels.
  scale_color_manual(values = c(female = "cyan4", male = "darkorange")) +
  labs(
    x = "Flipper length (mm)",
    y = "Ratio bill length / bill depth (-)"
  ) +
  theme_minimal()