How to reshape your data in order to make the analysis easier.
Each variable is saved in its column.
Each observation is saved in its own row.
Image Credit: Hadley Wickham and Garrett Grolemund
library(tidyverse) #or library(tidyr)library(magrittr)
tidyr
packageHadley Wickham, Chief Scientist at RStudio explaining tidyr at WOMBAT organized by Monash University, Australia.
Image taken by Thiyanga S Talagala at WOMBAT Melbourne, Australia, December-2019
pivot_longer
In tidyr (2014) gather
pivot_wider
In tidyr (2014) spread
separate
unite
Main input: data frame
or tibble
.
Output: tibble
pivot_longer
pivot_longer()
Turns columns into rows.
From wide format to long format.
pivot_longer()
dengue <- tibble( dist = c("Colombo", "Gampaha", "Kalutara"), '2017' = c(20718, 10258, 34274), '2018' = c(16573, 5857, 31647), '2019' = c(8395, 3155, 10961)); dengue
# A tibble: 3 × 4 dist `2017` `2018` `2019` <chr> <dbl> <dbl> <dbl>1 Colombo 20718 16573 83952 Gampaha 10258 5857 31553 Kalutara 34274 31647 10961
dengue %>% pivot_longer(2:4, names_to="Year", values_to = "Dengue counts")
# A tibble: 9 × 3 dist Year `Dengue counts` <chr> <chr> <dbl>1 Colombo 2017 207182 Colombo 2018 165733 Colombo 2019 83954 Gampaha 2017 102585 Gampaha 2018 58576 Gampaha 2019 31557 Kalutara 2017 342748 Kalutara 2018 316479 Kalutara 2019 10961
pivot_wider
pivot_wider()
pivot_wider()
Corona <- tibble(country = rep(c("USA", "Brazil", "Russia"), each=2),status = rep(c("Death", "Recovered"), 3),count = c(99381, 451745, 22746, 149911, 3633, 118798))
Corona
# A tibble: 6 × 3 country status count <chr> <chr> <dbl>1 USA Death 993812 USA Recovered 4517453 Brazil Death 227464 Brazil Recovered 1499115 Russia Death 36336 Russia Recovered 118798
pivot_wider()
Corona
# A tibble: 6 × 3 country status count <chr> <chr> <dbl>1 USA Death 993812 USA Recovered 4517453 Brazil Death 227464 Brazil Recovered 1499115 Russia Death 36336 Russia Recovered 118798
Corona %>% pivot_wider(names_from=status, values_from=count)
# A tibble: 3 × 3 country Death Recovered <chr> <dbl> <dbl>1 USA 99381 4517452 Brazil 22746 1499113 Russia 3633 118798
corona_wide_format <- Corona %>% pivot_wider(names_from=status, values_from=count)corona_wide_format
# A tibble: 3 × 3 country Death Recovered <chr> <dbl> <dbl>1 USA 99381 4517452 Brazil 22746 1499113 Russia 3633 118798
pivot_longer
vs pivot_wider
pivot_longer
and pivot_wider
profit <- tibble(year = c(2015, 2015, 2015, 2015, 2016, 2016, 2016),quarter = c( 1, 2, 3, 4, 2, 3, 4),income = c(2, NA, 3, NA, 4, 5, 6))profit
# A tibble: 7 × 3 year quarter income <dbl> <dbl> <dbl>1 2015 1 22 2015 2 NA3 2015 3 34 2015 4 NA5 2016 2 46 2016 3 57 2016 4 6
pivot_longer
and pivot_wider
# A tibble: 7 × 3 year quarter income <dbl> <dbl> <dbl>1 2015 1 22 2015 2 NA3 2015 3 34 2015 4 NA5 2016 2 46 2016 3 57 2016 4 6
profit %>%pivot_wider(names_from = year, values_from = income)
# A tibble: 4 × 3 quarter `2015` `2016` <dbl> <dbl> <dbl>1 1 2 NA2 2 NA 43 3 3 54 4 NA 6
# A tibble: 4 × 3 quarter `2015` `2016` <dbl> <dbl> <dbl>1 1 2 NA2 2 NA 43 3 3 54 4 NA 6
profit %>%pivot_wider(names_from = year, values_from = income) %>%pivot_longer(cols = c(`2015`, `2016`),names_to = "year",values_to = "income",values_drop_na = TRUE)
# A tibble: 5 × 3 quarter year income <dbl> <chr> <dbl>1 1 2015 22 2 2016 43 3 2015 34 3 2016 55 4 2016 6
separate
separate()
Melbourne <- tibble(Date = c("10-5-2020", "11-5-2020", "12-5-2020","13-5-2020"), Tmin = c(5, 9, 9, 7), Tmax = c(18, 16, 16, 17), Rainfall= c(30, 40, 10, 5)); Melbourne
# A tibble: 4 × 4 Date Tmin Tmax Rainfall <chr> <dbl> <dbl> <dbl>1 10-5-2020 5 18 302 11-5-2020 9 16 403 12-5-2020 9 16 104 13-5-2020 7 17 5
Melbourne %>% separate(Date, into=c("day", "month", "year"), sep="-")
# A tibble: 4 × 6 day month year Tmin Tmax Rainfall <chr> <chr> <chr> <dbl> <dbl> <dbl>1 10 5 2020 5 18 302 11 5 2020 9 16 403 12 5 2020 9 16 104 13 5 2020 7 17 5
separate()
df <- data.frame(x = c(NA, "a.b", "a.d", "b.c"))df
x1 <NA>2 a.b3 a.d4 b.c
df %>% separate(x, c("Text1", "Text2"))
Text1 Text21 <NA> <NA>2 a b3 a d4 b c
separate()
tbl <- tibble(input = c("a", "a b", "a-b c", NA)); tbl
# A tibble: 4 × 1 input <chr>1 a 2 a b 3 a-b c4 <NA>
separate()
tbl <- tibble(input = c("a", "a b", "a-b c", NA)); tbl
# A tibble: 4 × 1 input <chr>1 a 2 a b 3 a-b c4 <NA>
tbl %>% separate(input, c("Input1", "Input2"))
## Warning: Expected 2 pieces. Additional pieces discarded in 1 rows [3].
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 1 rows [1].
## # A tibble: 4 × 2## Input1 Input2## <chr> <chr> ## 1 a <NA> ## 2 a b ## 3 a b ## 4 <NA> <NA>
separate()
tbl <- tibble(input = c("a", "a b", "a-b c", NA)); tbl
# A tibble: 4 × 1 input <chr>1 a 2 a b 3 a-b c4 <NA>
separate()
tbl <- tibble(input = c("a", "a b", "a-b c", NA)); tbl
# A tibble: 4 × 1 input <chr>1 a 2 a b 3 a-b c4 <NA>
tbl %>% separate(input, c("Input1", "Input2", "Input3"))
tbl %>% separate(input, c("Input1", "Input2", "Input3"))
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 2 rows [1, 2].
## # A tibble: 4 × 3## Input1 Input2 Input3## <chr> <chr> <chr> ## 1 a <NA> <NA> ## 2 a b <NA> ## 3 a b c ## 4 <NA> <NA> <NA>
unite
unite()
projects <- tibble( Country = c("USA", "USA", "AUS", "AUS"), State = c("LA", "CO", "VIC", "NSW"), Cost = c(1000, 11000, 20000,30000))projects
# A tibble: 4 × 3 Country State Cost <chr> <chr> <dbl>1 USA LA 10002 USA CO 110003 AUS VIC 200004 AUS NSW 30000
projects %>% unite("Location", c("State", "Country"))
# A tibble: 4 × 2 Location Cost <chr> <dbl>1 LA_USA 10002 CO_USA 110003 VIC_AUS 200004 NSW_AUS 30000
unite()
projects %>% unite("Location", c("State", "Country"))
# A tibble: 4 × 2 Location Cost <chr> <dbl>1 LA_USA 10002 CO_USA 110003 VIC_AUS 200004 NSW_AUS 30000
projects %>% unite("Location", c("State", "Country"), sep="-")
# A tibble: 4 × 2 Location Cost <chr> <dbl>1 LA-USA 10002 CO-USA 110003 VIC-AUS 200004 NSW-AUS 30000
separate
vs unite
Keyboard shortcuts
↑, ←, Pg Up, k | Go to previous slide |
↓, →, Pg Dn, Space, j | Go to next slide |
Home | Go to first slide |
End | Go to last slide |
Number + Return | Go to specific slide |
b / m / f | Toggle blackout / mirrored / fullscreen mode |
c | Clone slideshow |
p | Toggle presenter mode |
s | Start & Stop the presentation timer |
t | Reset the presentation timer |
?, h | Toggle this help |
Esc | Back to slideshow |