3 Data transformation
3.1 Group and sort records
Learn how to use group_by()
and arrange()
to better understand aggregated data
- How many flights are there per month?
flights %>%
group_by(month) %>%
tally()
## # Source: lazy query [?? x 2]
## # Database: postgres [rstudio_dev@localhost:/postgres]
## month n
## <dbl> <S3: integer64>
## 1 1 605765
## 2 2 569236
## 3 3 616090
## 4 4 598126
## 5 5 606293
## 6 6 608665
## 7 7 627931
## 8 8 612279
## 9 9 540908
## 10 10 556205
## # … with more rows
- Order the results by the month number by using
arrange()
flights %>%
group_by(month) %>%
tally() %>%
arrange(month)
## # Source: lazy query [?? x 2]
## # Database: postgres [rstudio_dev@localhost:/postgres]
## # Ordered by: month
## month n
## <dbl> <S3: integer64>
## 1 1 605765
## 2 2 569236
## 3 3 616090
## 4 4 598126
## 5 5 606293
## 6 6 608665
## 7 7 627931
## 8 8 612279
## 9 9 540908
## 10 10 556205
## # … with more rows
- Order the results by the number of flights, starting with the month with most flights by using
desc()
inside thearrange()
command
flights %>%
group_by(month) %>%
tally() %>%
arrange(desc(n))
## # Source: lazy query [?? x 2]
## # Database: postgres [rstudio_dev@localhost:/postgres]
## # Ordered by: desc(n)
## month n
## <dbl> <S3: integer64>
## 1 7 627931
## 2 3 616090
## 3 8 612279
## 4 6 608665
## 5 5 606293
## 6 1 605765
## 7 4 598126
## 8 2 569236
## 9 10 556205
## 10 12 544958
## # … with more rows
3.2 Answering questions with dplyr
Quick review of how to translate questions into dplyr
code
- Which are the top 4 months with the most flight activity?
flights %>%
group_by(month) %>%
tally() %>%
arrange(desc(n)) %>%
head(4)
## # Source: lazy query [?? x 2]
## # Database: postgres [rstudio_dev@localhost:/postgres]
## # Ordered by: desc(n)
## month n
## <dbl> <S3: integer64>
## 1 7 627931
## 2 3 616090
## 3 8 612279
## 4 6 608665
- What were the top 5 calendar days with most flight activity?
flights %>%
group_by(month, dayofmonth) %>%
tally() %>%
arrange(desc(n)) %>%
head(5)
## # Source: lazy query [?? x 3]
## # Database: postgres [rstudio_dev@localhost:/postgres]
## # Groups: month
## # Ordered by: desc(n)
## month dayofmonth n
## <dbl> <dbl> <S3: integer64>
## 1 7 18 21128
## 2 7 11 21125
## 3 7 25 21102
## 4 7 10 21058
## 5 7 17 21055
- Which are the top 5 carriers (airlines) with the most flights?
flights %>%
group_by(carriername) %>%
tally() %>%
arrange(desc(n)) %>%
head(5)
## # Source: lazy query [?? x 2]
## # Database: postgres [rstudio_dev@localhost:/postgres]
## # Ordered by: desc(n)
## carriername n
## <chr> <S3: intege>
## 1 Southwest Airlines Co. 1201754
## 2 American Airlines Inc. 604885
## 3 Skywest Airlines Inc. 567159
## 4 American Eagle Airlines Inc. 490693
## 5 US Airways Inc. (Merged with America West 9/05. Reporting f… 453589
- Figure the percent ratio of flights per month
flights %>%
group_by(month) %>%
tally() %>%
arrange(desc(n)) %>%
mutate(percent = n/sum(n, na.rm = TRUE))
## # Source: lazy query [?? x 3]
## # Database: postgres [rstudio_dev@localhost:/postgres]
## # Ordered by: desc(n)
## month n percent
## <dbl> <S3: integer64> <dbl>
## 1 7 627931 0.0896
## 2 3 616090 0.0879
## 3 8 612279 0.0873
## 4 6 608665 0.0868
## 5 5 606293 0.0865
## 6 1 605765 0.0864
## 7 4 598126 0.0853
## 8 2 569236 0.0812
## 9 10 556205 0.0793
## 10 12 544958 0.0777
## # … with more rows
- Figure the percent ratio of flights per carrier
flights %>%
group_by(carriername) %>%
tally() %>%
arrange(desc(n)) %>%
mutate(percent = n/sum(n, na.rm = TRUE))
## # Source: lazy query [?? x 3]
## # Database: postgres [rstudio_dev@localhost:/postgres]
## # Ordered by: desc(n)
## carriername n percent
## <chr> <S3: integ> <dbl>
## 1 Southwest Airlines Co. 1201754 0.171
## 2 American Airlines Inc. 604885 0.0863
## 3 Skywest Airlines Inc. 567159 0.0809
## 4 American Eagle Airlines Inc. 490693 0.0700
## 5 US Airways Inc. (Merged with America West 9/05. Rep… 453589 0.0647
## 6 Delta Air Lines Inc. 451931 0.0645
## 7 United Air Lines Inc. 449515 0.0641
## 8 Expressjet Airlines Inc. 374510 0.0534
## 9 Northwest Airlines Inc. 347652 0.0496
## 10 Continental Air Lines Inc. 298455 0.0426
## # … with more rows
3.3 Aggregate mulitple columns
Practice using summarise _
functions
- Use
summarise_all()
to send the same function to all fields
flights %>%
select(depdelay, arrdelay) %>%
summarise_all(mean, na.rm = TRUE)
## # Source: lazy query [?? x 2]
## # Database: postgres [rstudio_dev@localhost:/postgres]
## depdelay arrdelay
## <dbl> <dbl>
## 1 9.97 8.17
- Use
summarise_at()
to pre-select the fields that will receive the function
flights %>%
summarise_at(c("depdelay", "arrdelay"), mean, na.rm = TRUE)
## # Source: lazy query [?? x 2]
## # Database: postgres [rstudio_dev@localhost:/postgres]
## depdelay arrdelay
## <dbl> <dbl>
## 1 9.97 8.17
- Use
summarise_if()
to summarize only if the field meets a criterion
flights %>%
summarise_if(is.numeric, mean, na.rm = TRUE)
## Applying predicate on the first 100 rows
## # Source: lazy query [?? x 30]
## # Database: postgres [rstudio_dev@localhost:/postgres]
## carrierdelay originlat originlong destlat destlong flightid year month
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 15.8 36.9 -95.1 36.9 -95.1 3504864. 2008 6.38
## # … with 22 more variables: dayofmonth <dbl>, dayofweek <dbl>,
## # deptime <dbl>, crsdeptime <dbl>, arrtime <dbl>, crsarrtime <dbl>,
## # flightnum <dbl>, actualelapsedtime <dbl>, crselapsedtime <dbl>,
## # airtime <dbl>, arrdelay <dbl>, depdelay <dbl>, distance <dbl>,
## # taxiin <dbl>, taxiout <dbl>, cancelled <dbl>, diverted <dbl>,
## # weatherdelay <dbl>, nasdelay <dbl>, securitydelay <dbl>,
## # lateaircraftdelay <dbl>, score <dbl>
- Combine with
group_by()
to create more complex results
flights %>%
select(month, depdelay, arrdelay) %>%
group_by(month) %>%
summarise_all(mean, na.rm = TRUE)
## # Source: lazy query [?? x 3]
## # Database: postgres [rstudio_dev@localhost:/postgres]
## month depdelay arrdelay
## <dbl> <dbl> <dbl>
## 1 1 11.5 10.2
## 2 2 13.7 13.1
## 3 3 12.5 11.2
## 4 4 8.20 6.81
## 5 5 7.64 5.98
## 6 6 13.6 13.3
## 7 7 11.8 9.98
## 8 8 9.61 6.91
## 9 9 3.96 0.698
## 10 10 3.80 0.415
## # … with more rows
3.4 Data correlation
Calculate correlation values in database
- Correlate select numeric columns from the flights table
library(corrr)
flights_cor <- flights %>%
select(contains("delay")) %>%
correlate(use = "complete.obs")
##
## Correlation method: 'pearson'
## Missing treated using: 'complete.obs'
- Explore the
flights_cor
data
flights_cor %>%
rearrange() %>%
shave() %>%
fashion()
## rowname depdelay arrdelay carrierdelay lateaircraftdelay
## 1 depdelay
## 2 arrdelay .93
## 3 carrierdelay .55 .52
## 4 lateaircraftdelay .55 .50 -.13
## 5 nasdelay .15 .35 -.14 -.15
## 6 weatherdelay .25 .27 -.05 -.04
## 7 securitydelay .00 .00 -.01 -.01
## nasdelay weatherdelay securitydelay
## 1
## 2
## 3
## 4
## 5
## 6 -.01
## 7 -.01 -.01
3.5 View record level data
Important tips to record preview data
How many flights in July 18th were one or more hours late?
flights %>%
filter(
depdelay >= 60,
month == 7,
dayofmonth == 18
) %>%
tally()
## # Source: lazy query [?? x 1]
## # Database: postgres [rstudio_dev@localhost:/postgres]
## n
## <S3: integer64>
## 1 1239
- Use
filter()
to retrieve only the needed data, andhead()
to limit the preview even further.
flights %>%
filter(
depdelay >= 60,
month == 7,
dayofmonth == 18
) %>%
head(100)
## # Source: lazy query [?? x 44]
## # Database: postgres [rstudio_dev@localhost:/postgres]
## uniquecarrier carrierdelay carriername origin originname origincity
## <chr> <dbl> <chr> <chr> <chr> <chr>
## 1 WN 0 Southwest … ABQ Albuquerq… Albuquerq…
## 2 WN 42 Southwest … ABQ Albuquerq… Albuquerq…
## 3 WN 122 Southwest … ABQ Albuquerq… Albuquerq…
## 4 WN 0 Southwest … ABQ Albuquerq… Albuquerq…
## 5 WN 71 Southwest … ABQ Albuquerq… Albuquerq…
## 6 WN 14 Southwest … ABQ Albuquerq… Albuquerq…
## 7 WN 84 Southwest … AUS Austin-Be… Austin
## 8 WN 56 Southwest … AUS Austin-Be… Austin
## 9 WN 0 Southwest … BNA Nashville… Nashville
## 10 WN 32 Southwest … BNA Nashville… Nashville
## # … with more rows, and 38 more variables: originstate <chr>,
## # origincountry <chr>, originlat <dbl>, originlong <dbl>, dest <chr>,
## # destname <chr>, destcity <chr>, deststate <chr>, destcountry <chr>,
## # destlat <dbl>, destlong <dbl>, flightid <int>, year <dbl>,
## # month <dbl>, dayofmonth <dbl>, dayofweek <dbl>, deptime <dbl>,
## # crsdeptime <dbl>, arrtime <dbl>, crsarrtime <dbl>, flightnum <dbl>,
## # tailnum <chr>, actualelapsedtime <dbl>, crselapsedtime <dbl>,
## # airtime <dbl>, arrdelay <dbl>, depdelay <dbl>, distance <dbl>,
## # taxiin <dbl>, taxiout <dbl>, cancelled <dbl>, cancellationcode <chr>,
## # diverted <dbl>, weatherdelay <dbl>, nasdelay <dbl>,
## # securitydelay <dbl>, lateaircraftdelay <dbl>, score <int>
- Use
collect()
andView()
to preview the data in the IDE. Make sure to always limit the number of returned rows. https://github.com/tidyverse/tibble/issues/373
flights %>%
filter(
depdelay >= 60,
month == 7,
dayofmonth == 18
) %>%
collect() %>%
head(100) %>%
View("my_preview")
3.6 Case statements
See how to use the flexibility of case statements for special cases
- Use
case_when()
to bucket each month into one of four seasons
flights %>%
mutate(
season = case_when(
month >= 3 && month <= 5 ~ "Spring",
month >= 6 && month <= 8 ~ "Summer",
month >= 9 && month <= 11 ~ "Fall",
TRUE ~ "Winter"
)
) %>%
group_by(season) %>%
tally()
## # Source: lazy query [?? x 2]
## # Database: postgres [rstudio_dev@localhost:/postgres]
## season n
## <chr> <S3: integer64>
## 1 Fall 1620385
## 2 Spring 1820509
## 3 Summer 1848875
## 4 Winter 1719959
- Add a specific case for “Winter”
flights %>%
mutate(
season = case_when(
month >= 3 && month <= 5 ~ "Spring",
month >= 6 && month <= 8 ~ "Summer",
month >= 9 && month <= 11 ~ "Fall",
month == 12 | month <= 2 ~ "Winter"
)
) %>%
group_by(season) %>%
tally()
## # Source: lazy query [?? x 2]
## # Database: postgres [rstudio_dev@localhost:/postgres]
## season n
## <chr> <S3: integer64>
## 1 Fall 1620385
## 2 Spring 1820509
## 3 Summer 1848875
## 4 Winter 1719959
- Append an entry for Monday at the end of the case statement
flights %>%
mutate(
season = case_when(
month >= 3 && month <= 5 ~ "Spring",
month >= 6 && month <= 8 ~ "Summer",
month >= 9 && month <= 11 ~ "Fall",
month == 12 | month <= 2 ~ "Winter",
dayofweek == 1 ~ "Monday"
)
) %>%
group_by(season) %>%
tally()
## # Source: lazy query [?? x 2]
## # Database: postgres [rstudio_dev@localhost:/postgres]
## season n
## <chr> <S3: integer64>
## 1 Fall 1620385
## 2 Spring 1820509
## 3 Summer 1848875
## 4 Winter 1719959
- Move the “Monday” entry to the top of the case statement
flights %>%
mutate(
season = case_when(
dayofweek == 1 ~ "Monday",
month >= 3 && month <= 5 ~ "Spring",
month >= 6 && month <= 8 ~ "Summer",
month >= 9 && month <= 11 ~ "Fall",
month == 12 | month <= 2 ~ "Winter"
)
) %>%
group_by(season) %>%
tally()
## # Source: lazy query [?? x 2]
## # Database: postgres [rstudio_dev@localhost:/postgres]
## season n
## <chr> <S3: integer64>
## 1 Fall 1376740
## 2 Monday 1036201
## 3 Spring 1554210
## 4 Summer 1577629
## 5 Winter 1464948
3.7 Data enrichment
Upload a small dataset in order to combine it with the datawarehouse data
- Load the
planes
data into memory
planes <- nycflights13::planes
- Using
DBI
, copy theplanes
data to the datawarehouse as a temporary table, and load it to a variable
dbWriteTable(con, "planes", planes, temporary = TRUE)
tbl_planes <- tbl(con, "planes")
- Create a “lazy” variable that joins the flights table to the new temp table
combined <- flights %>%
left_join(tbl_planes, by = "tailnum")
- View a sample of flights of planes with more than 100 seats
combined %>%
filter(seats > 100) %>%
head()
## # Source: lazy query [?? x 52]
## # Database: postgres [rstudio_dev@localhost:/postgres]
## uniquecarrier carrierdelay carriername origin originname origincity
## <chr> <dbl> <chr> <chr> <chr> <chr>
## 1 WN NA Southwest … ABQ Albuquerq… Albuquerq…
## 2 WN NA Southwest … ABQ Albuquerq… Albuquerq…
## 3 WN 14 Southwest … ABQ Albuquerq… Albuquerq…
## 4 WN NA Southwest … ABQ Albuquerq… Albuquerq…
## 5 WN NA Southwest … ABQ Albuquerq… Albuquerq…
## 6 WN NA Southwest … ABQ Albuquerq… Albuquerq…
## # … with 46 more variables: originstate <chr>, origincountry <chr>,
## # originlat <dbl>, originlong <dbl>, dest <chr>, destname <chr>,
## # destcity <chr>, deststate <chr>, destcountry <chr>, destlat <dbl>,
## # destlong <dbl>, flightid <int>, year.x <dbl>, month <dbl>,
## # dayofmonth <dbl>, dayofweek <dbl>, deptime <dbl>, crsdeptime <dbl>,
## # arrtime <dbl>, crsarrtime <dbl>, flightnum <dbl>, tailnum <chr>,
## # actualelapsedtime <dbl>, crselapsedtime <dbl>, airtime <dbl>,
## # arrdelay <dbl>, depdelay <dbl>, distance <dbl>, taxiin <dbl>,
## # taxiout <dbl>, cancelled <dbl>, cancellationcode <chr>,
## # diverted <dbl>, weatherdelay <dbl>, nasdelay <dbl>,
## # securitydelay <dbl>, lateaircraftdelay <dbl>, score <int>,
## # year.y <int>, type <chr>, manufacturer <chr>, model <chr>,
## # engines <int>, seats <int>, speed <int>, engine <chr>
- How many flights are from McDonnel Douglas planes
combined %>%
filter(manufacturer == "MCDONNELL DOUGLAS") %>%
tally()
## # Source: lazy query [?? x 1]
## # Database: postgres [rstudio_dev@localhost:/postgres]
## n
## <S3: integer64>
## 1 137250
- See how many flights each plane McDonnel Douglas had
combined %>%
filter(manufacturer == "MCDONNELL DOUGLAS") %>%
group_by(tailnum) %>%
tally()
## # Source: lazy query [?? x 2]
## # Database: postgres [rstudio_dev@localhost:/postgres]
## tailnum n
## <chr> <S3: integer64>
## 1 N424AA 1479
## 2 N426AA 1413
## 3 N433AA 1153
## 4 N434AA 1208
## 5 N435AA 1185
## 6 N436AA 1155
## 7 N437AA 1233
## 8 N438AA 1243
## 9 N439AA 1251
## 10 N454AA 1432
## # … with more rows
- Get the total number of planes, and the average, minimum & maximum number of flights for the manufacturer
combined %>%
filter(manufacturer == "MCDONNELL DOUGLAS") %>%
group_by(tailnum) %>%
tally() %>%
summarise(planes = n(),
avg_flights = mean(n, na.rm = TRUE),
max_flights = max(n, na.rm = TRUE),
min_flights = min(n, na.rm = TRUE))
## # Source: lazy query [?? x 4]
## # Database: postgres [rstudio_dev@localhost:/postgres]
## planes avg_flights max_flights min_flights
## <S3: integer64> <dbl> <S3: integer64> <S3: integer64>
## 1 102 1346. 1850 1068
- Use
explain()
to see the query plan
combined %>%
filter(manufacturer == "MCDONNELL DOUGLAS") %>%
group_by(tailnum) %>%
tally() %>%
summarise(planes = n(),
avg_flights = mean(n, na.rm = TRUE),
max_flights = max(n, na.rm = TRUE),
min_flights = min(n, na.rm = TRUE)) %>%
explain()
## <SQL>
## SELECT COUNT(*) AS "planes", AVG("n") AS "avg_flights", MAX("n") AS "max_flights", MIN("n") AS "min_flights"
## FROM (SELECT "tailnum", COUNT(*) AS "n"
## FROM (SELECT *
## FROM (SELECT "TBL_LEFT"."uniquecarrier" AS "uniquecarrier", "TBL_LEFT"."carrierdelay" AS "carrierdelay", "TBL_LEFT"."carriername" AS "carriername", "TBL_LEFT"."origin" AS "origin", "TBL_LEFT"."originname" AS "originname", "TBL_LEFT"."origincity" AS "origincity", "TBL_LEFT"."originstate" AS "originstate", "TBL_LEFT"."origincountry" AS "origincountry", "TBL_LEFT"."originlat" AS "originlat", "TBL_LEFT"."originlong" AS "originlong", "TBL_LEFT"."dest" AS "dest", "TBL_LEFT"."destname" AS "destname", "TBL_LEFT"."destcity" AS "destcity", "TBL_LEFT"."deststate" AS "deststate", "TBL_LEFT"."destcountry" AS "destcountry", "TBL_LEFT"."destlat" AS "destlat", "TBL_LEFT"."destlong" AS "destlong", "TBL_LEFT"."flightid" AS "flightid", "TBL_LEFT"."year" AS "year.x", "TBL_LEFT"."month" AS "month", "TBL_LEFT"."dayofmonth" AS "dayofmonth", "TBL_LEFT"."dayofweek" AS "dayofweek", "TBL_LEFT"."deptime" AS "deptime", "TBL_LEFT"."crsdeptime" AS "crsdeptime", "TBL_LEFT"."arrtime" AS "arrtime", "TBL_LEFT"."crsarrtime" AS "crsarrtime", "TBL_LEFT"."flightnum" AS "flightnum", "TBL_LEFT"."tailnum" AS "tailnum", "TBL_LEFT"."actualelapsedtime" AS "actualelapsedtime", "TBL_LEFT"."crselapsedtime" AS "crselapsedtime", "TBL_LEFT"."airtime" AS "airtime", "TBL_LEFT"."arrdelay" AS "arrdelay", "TBL_LEFT"."depdelay" AS "depdelay", "TBL_LEFT"."distance" AS "distance", "TBL_LEFT"."taxiin" AS "taxiin", "TBL_LEFT"."taxiout" AS "taxiout", "TBL_LEFT"."cancelled" AS "cancelled", "TBL_LEFT"."cancellationcode" AS "cancellationcode", "TBL_LEFT"."diverted" AS "diverted", "TBL_LEFT"."weatherdelay" AS "weatherdelay", "TBL_LEFT"."nasdelay" AS "nasdelay", "TBL_LEFT"."securitydelay" AS "securitydelay", "TBL_LEFT"."lateaircraftdelay" AS "lateaircraftdelay", "TBL_LEFT"."score" AS "score", "TBL_RIGHT"."year" AS "year.y", "TBL_RIGHT"."type" AS "type", "TBL_RIGHT"."manufacturer" AS "manufacturer", "TBL_RIGHT"."model" AS "model", "TBL_RIGHT"."engines" AS "engines", "TBL_RIGHT"."seats" AS "seats", "TBL_RIGHT"."speed" AS "speed", "TBL_RIGHT"."engine" AS "engine"
## FROM datawarehouse.vflight AS "TBL_LEFT"
## LEFT JOIN "planes" AS "TBL_RIGHT"
## ON ("TBL_LEFT"."tailnum" = "TBL_RIGHT"."tailnum")
## ) "raogmwbzgh"
## WHERE ("manufacturer" = 'MCDONNELL DOUGLAS')) "ydrljxkzpa"
## GROUP BY "tailnum") "kqyzgozqmv"
##
## <PLAN>
## Aggregate (cost=295342.72..295342.73 rows=1 width=56)
## -> GroupAggregate (cost=295099.20..295242.64 rows=5004 width=14)
## Group Key: "TBL_LEFT".tailnum
## -> Sort (cost=295099.20..295130.33 rows=12454 width=6)
## Sort Key: "TBL_LEFT".tailnum
## -> Hash Join (cost=70.61..294252.06 rows=12454 width=6)
## Hash Cond: ("TBL_LEFT".tailnum = "TBL_RIGHT".tailnum)
## -> Seq Scan on flight "TBL_LEFT" (cost=0.00..267764.93 rows=7011193 width=17)
## -> Hash (cost=70.50..70.50 rows=9 width=32)
## -> Seq Scan on planes "TBL_RIGHT" (cost=0.00..70.50 rows=9 width=32)
## Filter: (manufacturer = 'MCDONNELL DOUGLAS'::text)
- Disconnect from the database
dbDisconnect(con)