4 Data Visualizations

4.1 Simple plot

Practice pushing the calculations to the database

Use collect() bring back the aggregated results into a “pass-through” variable called by_month

by_month <- flights %>%
  group_by(month) %>%
  tally() %>%
  mutate(n = as.numeric(n)) %>%
  collect() 

head(by_month)

## # A tibble: 6 x 2
##   month      n
##   <dbl>  <dbl>
## 1     1 605765
## 2     2 569236
## 3     3 616090
## 4     4 598126
## 5     5 606293
## 6     6 608665

Plot results using ggplot2

library(ggplot2)

ggplot(by_month) +
  geom_line(aes(x = month, y = n))

4.2 Plot in one code segment

Practice going from dplyr to ggplot2 without using pass-through variable, great for EDA

Using the code from the previous section, create a single piped code set which also creates the plot

flights %>%
  group_by(month) %>%
  tally() %>%
  mutate(n = as.numeric(n)) %>%
  collect() %>%
  ggplot() +                        # < Don't forget to switch to `+`
  geom_line(aes(x = month, y = n))

Change the aggregation to the average of arrdelay. Tip: Use x as the summarize variable

flights %>%
  group_by(month) %>%
  summarise(x = mean(arrdelay, na.rm = TRUE)) %>%
  mutate(x = as.numeric(x)) %>%
  collect() %>%
  ggplot() +
  geom_line(aes(x = month, y = x))

Plot the average distance. Copy the code from the previous exercise and change the variable

flights %>%
  group_by(month) %>%
  summarise(x = mean(distance, na.rm = TRUE)) %>%
  mutate(x = as.numeric(x)) %>%
  collect() %>%
  ggplot() +
  geom_line(aes(x = month, y = x))

4.3 Plot specific data segments

Combine skills from previous units to create more sophisticated plots

Start with getting the top 5 carriers

flights %>%
  group_by(uniquecarrier) %>%
  tally() %>%
  arrange(desc(n)) %>%
  head(5)

## # Source:     lazy query [?? x 2]
## # Database:   postgres [rstudio_dev@localhost:/postgres]
## # Ordered by: desc(n)
##   uniquecarrier n              
##   <chr>         <S3: integer64>
## 1 WN            1201754        
## 2 AA             604885        
## 3 OO             567159        
## 4 MQ             490693        
## 5 US             453589

Pipe the top 5 carriers to a plot

flights %>%
  group_by(uniquecarrier) %>%
  tally() %>%
  mutate(n = as.numeric(n)) %>%
  arrange(desc(n)) %>%
  head(5) %>%
  collect() %>%
  ggplot() +
    geom_col(aes(x = uniquecarrier, y = n))

Improve the plot’s look

flights %>%
  group_by(uniquecarrier) %>%
  tally() %>%
  mutate(n = as.numeric(n)) %>%
  arrange(desc(n)) %>%
  head(5) %>%
  collect() %>%
  ggplot() +      #Don't forget to switch to `+`
  geom_col(aes(x = forcats::fct_reorder(uniquecarrier, n), # Order by n
               y = n, 
               fill = n), # Add fill
           show.legend = FALSE) +  # Turn legend off
  scale_y_continuous(labels = scales::comma) + # Add commas to axis label numbers
  coord_flip() +  # Rotate cols into rows
  labs(title = "Top 5 Carriers", 
       subtitle = "Source: Datawarehouse",
       x = "Carrier Name", 
       y = "# of Flights")

4.4 Plot correlations

Use the corrr package to plot correlations

Correlate select numeric columns from the flights table

library(corrr)

flights_cor <- flights %>% 
  select(contains("delay")) %>% 
  correlate(use = "complete.obs")

## 
## Correlation method: 'pearson'
## Missing treated using: 'complete.obs'

Visualize the correlations

flights_cor %>% 
  rplot()

Visualize correlation network

flights_cor %>% 
  network_plot(min_cor = .15)

4.5 Two or more queries

Learn how to use pull() to pass a set of values to be used on a secondary query

Use pull() to get the top 5 carriers loaded in a vector

top5 <- flights %>%
  group_by(uniquecarrier) %>%
  tally() %>%
  arrange(desc(n)) %>%
  head(5) %>%
  pull(uniquecarrier)

top5

## [1] "WN" "AA" "OO" "MQ" "US"

Use %in% to pass the top5 vector to a filter

flights %>%
  filter(uniquecarrier %in% top5)

## # Source:   lazy query [?? x 31]
## # Database: postgres [rstudio_dev@localhost:/postgres]
##    flightid  year month dayofmonth dayofweek deptime crsdeptime arrtime
##       <int> <dbl> <dbl>      <dbl>     <dbl>   <dbl>      <dbl>   <dbl>
##  1  3654535  2008     7         18         5     641        645     726
##  2  3654536  2008     7         18         5    1304       1305    1341
##  3  3654537  2008     7         18         5    1938       1755    2018
##  4  3654538  2008     7         18         5    1935       1935    2127
##  5  3654539  2008     7         18         5     959       1000    1158
##  6  3654540  2008     7         18         5    1531       1530    1651
##  7  3654541  2008     7         18         5     826        830    1016
##  8  3654542  2008     7         18         5    1045       1045    1356
##  9  3654543  2008     7         18         5    1502       1405    2025
## 10  3654544  2008     7         18         5     959       1000    1001
## # … with more rows, and 23 more variables: crsarrtime <dbl>,
## #   uniquecarrier <chr>, flightnum <dbl>, tailnum <chr>,
## #   actualelapsedtime <dbl>, crselapsedtime <dbl>, airtime <dbl>,
## #   arrdelay <dbl>, depdelay <dbl>, origin <chr>, dest <chr>,
## #   distance <dbl>, taxiin <dbl>, taxiout <dbl>, cancelled <dbl>,
## #   cancellationcode <chr>, diverted <dbl>, carrierdelay <dbl>,
## #   weatherdelay <dbl>, nasdelay <dbl>, securitydelay <dbl>,
## #   lateaircraftdelay <dbl>, score <int>

Group by carrier and get the average arrival delay

flights %>%
  filter(uniquecarrier %in% top5) %>%
  group_by(uniquecarrier) %>%
  summarise(n = mean(arrdelay, na.rm = TRUE))

## # Source:   lazy query [?? x 2]
## # Database: postgres [rstudio_dev@localhost:/postgres]
##   uniquecarrier     n
##   <chr>         <dbl>
## 1 AA            12.6 
## 2 MQ             9.89
## 3 OO             6.60
## 4 US             2.85
## 5 WN             5.18

Copy the final ggplot() code from the Plot specific data segments section. Update the y labs.

flights %>%
  filter(uniquecarrier %in% top5) %>%
  group_by(uniquecarrier) %>%
  summarise(n = mean(arrdelay, na.rm = TRUE)) %>%
  # From previous section ----------------------------------------------
  collect() %>%
  ggplot() +      #Don't forget to switch to `+`
  geom_col(aes(x = forcats::fct_reorder(uniquecarrier, n), # Order by n
               y = n, 
               fill = n), # Add fill
           show.legend = FALSE) +  # Turn legend off
  coord_flip() +  # Rotate cols into rows
  labs(title = "Top 5 Carriers", 
       subtitle = "Source: Datawarehouse",
       x = "Carrier Name", 
       y = "Average Delay")

4.6 Visualize using `dbplot`

Review how to use dbplot to make it easier to plot with databases

Install and load dbplot

library(dbplot)

Create a line plot using the helper function dbplot_line()

flights %>% 
  count(month)

## # Source:   lazy query [?? x 2]
## # Database: postgres [rstudio_dev@localhost:/postgres]
##    month n              
##    <dbl> <S3: integer64>
##  1     1 605765         
##  2     2 569236         
##  3     3 616090         
##  4     4 598126         
##  5     5 606293         
##  6     6 608665         
##  7     7 627931         
##  8     8 612279         
##  9     9 540908         
## 10    10 556205         
## # … with more rows

flights %>%
  dbplot_line(month)

## Don't know how to automatically pick scale for object of type integer64. Defaulting to continuous.

## Warning: Removed 12 rows containing missing values (geom_path).

flights %>% 
  db_compute_bins(month)

## # A tibble: 12 x 2
##    month count          
##    <dbl> <S3: integer64>
##  1  8.7  540908         
##  2  6.87 627931         
##  3  3.93 598126         
##  4  4.67 606293         
##  5  1.73 569236         
##  6 10.9  523272         
##  7  2.83 616090         
##  8 11.6  544958         
##  9  5.77 608665         
## 10  9.80 556205         
## 11  1    605765         
## 12  7.97 612279

Update the plot’s labels

flights %>%
  dbplot_line(month) +
  labs(title = "Monthly flights",
       x = "Month",
       y = "Number of flights")

## Don't know how to automatically pick scale for object of type integer64. Defaulting to continuous.

## Warning: Removed 12 rows containing missing values (geom_path).

4.7 Plot a different aggregation

dbplot allows for aggregate functions, other than record count, to be used for plotting

Plot the average departure delay by day of week

flights %>%
  dbplot_bar(dayofweek, mean(depdelay, na.rm = TRUE))

Change the day numbers to day name labels

flights %>%
  dbplot_bar(dayofweek, mean(depdelay, na.rm = TRUE)) +
  scale_x_continuous(
    labels = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"),
    breaks = 1:7
  )

4.8 Create a histogram

Use the package’s function to easily create a histogram

Use the dbplot_histogram() to build the histogram

flights %>%
  dbplot_histogram(distance)

## Don't know how to automatically pick scale for object of type integer64. Defaulting to continuous.

## Warning: Removed 27 rows containing missing values (position_stack).

Adjust the binwidth to 300

flights %>%
  dbplot_histogram(distance, binwidth = 300)

## Don't know how to automatically pick scale for object of type integer64. Defaulting to continuous.

## Warning: Removed 16 rows containing missing values (position_stack).

4.9 Raster plot

Use a dbplot_raster() to visualize deptime versus depdelay

flights %>%
  dbplot_raster(deptime, arrtime)

## Warning: Removed 83 rows containing missing values (geom_raster).

Change the plot’s resolution to 500

flights %>%
  dbplot_raster(deptime, arrtime, resolution = 500)

## Warning: Removed 293 rows containing missing values (geom_raster).

4.10 Using the `calculate` functions

Use the db_compute_raster() function to get the underlying results that feed the plot

departure <- flights %>%
  db_compute_raster(deptime, arrtime)

departure

## # A tibble: 3,602 x 3
##    deptime arrtime  `n()`
##      <dbl>   <dbl>  <dbl>
##  1     NA      NA  136246
##  2   1057.   1848.     60
##  3    625.    841.  12271
##  4    697.   1392.    608
##  5    817.   1105.  13782
##  6    721.   1248.    543
##  7    649.   1488.    552
##  8   1512.   2256.     59
##  9    601.    649.   5476
## 10   1680.   2112.    309
## # … with 3,592 more rows

Plot the results “manually”

departure %>%
  filter(`n()` > 1000) %>%
  ggplot() +
  geom_raster(aes(x = deptime, y = arrtime, fill = `n()`))

## Warning: Removed 1 rows containing missing values (geom_raster).

4.11 Under the hood (II)

Review how dbplot pushes histogram and raster calculations to the database

Use the db_bin() command to see the resulting tidy eval formula

db_bin(field)

## (((max(field, na.rm = TRUE) - min(field, na.rm = TRUE))/30) * 
##     ifelse(as.integer(floor((field - min(field, na.rm = TRUE))/((max(field, 
##         na.rm = TRUE) - min(field, na.rm = TRUE))/30))) == 30, 
##         as.integer(floor((field - min(field, na.rm = TRUE))/((max(field, 
##             na.rm = TRUE) - min(field, na.rm = TRUE))/30))) - 
##             1, as.integer(floor((field - min(field, na.rm = TRUE))/((max(field, 
##             na.rm = TRUE) - min(field, na.rm = TRUE))/30))))) + 
##     min(field, na.rm = TRUE)

Use translate_sql() and simulate_odbc_postgresql() to see an example of what the resulting SQL statement looks like

translate_sql(!! db_bin(field), con = simulate_odbc_postgresql())

## <SQL> (((max(`field`) OVER () - min(`field`) OVER ()) / 30.0) * CASE WHEN (CAST(FLOOR((`field` - min(`field`) OVER ()) / ((max(`field`) OVER () - min(`field`) OVER ()) / 30.0)) AS INTEGER) = 30.0) THEN (CAST(FLOOR((`field` - min(`field`) OVER ()) / ((max(`field`) OVER () - min(`field`) OVER ()) / 30.0)) AS INTEGER) - 1.0) WHEN NOT(CAST(FLOOR((`field` - min(`field`) OVER ()) / ((max(`field`) OVER () - min(`field`) OVER ()) / 30.0)) AS INTEGER) = 30.0) THEN (CAST(FLOOR((`field` - min(`field`) OVER ()) / ((max(`field`) OVER () - min(`field`) OVER ()) / 30.0)) AS INTEGER)) END) + min(`field`) OVER ()

Disconnect from the database

dbDisconnect(con)