4 Data Visualizations
4.1 Simple plot
Practice pushing the calculations to the database
- Use
collect()
bring back the aggregated results into a “pass-through” variable calledby_month
by_month <- flights %>%
group_by(month) %>%
tally() %>%
mutate(n = as.numeric(n)) %>%
collect()
head(by_month)
## # A tibble: 6 x 2
## month n
## <dbl> <dbl>
## 1 1 605765
## 2 2 569236
## 3 3 616090
## 4 4 598126
## 5 5 606293
## 6 6 608665
- Plot results using
ggplot2
library(ggplot2)
ggplot(by_month) +
geom_line(aes(x = month, y = n))
4.2 Plot in one code segment
Practice going from dplyr
to ggplot2
without using pass-through variable, great for EDA
- Using the code from the previous section, create a single piped code set which also creates the plot
flights %>%
group_by(month) %>%
tally() %>%
mutate(n = as.numeric(n)) %>%
collect() %>%
ggplot() + # < Don't forget to switch to `+`
geom_line(aes(x = month, y = n))
- Change the aggregation to the average of
arrdelay
. Tip: Usex
as the summarize variable
flights %>%
group_by(month) %>%
summarise(x = mean(arrdelay, na.rm = TRUE)) %>%
mutate(x = as.numeric(x)) %>%
collect() %>%
ggplot() +
geom_line(aes(x = month, y = x))
- Plot the average distance. Copy the code from the previous exercise and change the variable
flights %>%
group_by(month) %>%
summarise(x = mean(distance, na.rm = TRUE)) %>%
mutate(x = as.numeric(x)) %>%
collect() %>%
ggplot() +
geom_line(aes(x = month, y = x))
4.3 Plot specific data segments
Combine skills from previous units to create more sophisticated plots
- Start with getting the top 5 carriers
flights %>%
group_by(uniquecarrier) %>%
tally() %>%
arrange(desc(n)) %>%
head(5)
## # Source: lazy query [?? x 2]
## # Database: postgres [rstudio_dev@localhost:/postgres]
## # Ordered by: desc(n)
## uniquecarrier n
## <chr> <S3: integer64>
## 1 WN 1201754
## 2 AA 604885
## 3 OO 567159
## 4 MQ 490693
## 5 US 453589
- Pipe the top 5 carriers to a plot
flights %>%
group_by(uniquecarrier) %>%
tally() %>%
mutate(n = as.numeric(n)) %>%
arrange(desc(n)) %>%
head(5) %>%
collect() %>%
ggplot() +
geom_col(aes(x = uniquecarrier, y = n))
- Improve the plot’s look
flights %>%
group_by(uniquecarrier) %>%
tally() %>%
mutate(n = as.numeric(n)) %>%
arrange(desc(n)) %>%
head(5) %>%
collect() %>%
ggplot() + #Don't forget to switch to `+`
geom_col(aes(x = forcats::fct_reorder(uniquecarrier, n), # Order by n
y = n,
fill = n), # Add fill
show.legend = FALSE) + # Turn legend off
scale_y_continuous(labels = scales::comma) + # Add commas to axis label numbers
coord_flip() + # Rotate cols into rows
labs(title = "Top 5 Carriers",
subtitle = "Source: Datawarehouse",
x = "Carrier Name",
y = "# of Flights")
4.4 Plot correlations
Use the corrr
package to plot correlations
- Correlate select numeric columns from the flights table
library(corrr)
flights_cor <- flights %>%
select(contains("delay")) %>%
correlate(use = "complete.obs")
##
## Correlation method: 'pearson'
## Missing treated using: 'complete.obs'
- Visualize the correlations
flights_cor %>%
rplot()
- Visualize correlation network
flights_cor %>%
network_plot(min_cor = .15)
4.5 Two or more queries
Learn how to use pull()
to pass a set of values to be used on a secondary query
- Use
pull()
to get the top 5 carriers loaded in a vector
top5 <- flights %>%
group_by(uniquecarrier) %>%
tally() %>%
arrange(desc(n)) %>%
head(5) %>%
pull(uniquecarrier)
top5
## [1] "WN" "AA" "OO" "MQ" "US"
- Use
%in%
to pass thetop5
vector to a filter
flights %>%
filter(uniquecarrier %in% top5)
## # Source: lazy query [?? x 31]
## # Database: postgres [rstudio_dev@localhost:/postgres]
## flightid year month dayofmonth dayofweek deptime crsdeptime arrtime
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 3654535 2008 7 18 5 641 645 726
## 2 3654536 2008 7 18 5 1304 1305 1341
## 3 3654537 2008 7 18 5 1938 1755 2018
## 4 3654538 2008 7 18 5 1935 1935 2127
## 5 3654539 2008 7 18 5 959 1000 1158
## 6 3654540 2008 7 18 5 1531 1530 1651
## 7 3654541 2008 7 18 5 826 830 1016
## 8 3654542 2008 7 18 5 1045 1045 1356
## 9 3654543 2008 7 18 5 1502 1405 2025
## 10 3654544 2008 7 18 5 959 1000 1001
## # … with more rows, and 23 more variables: crsarrtime <dbl>,
## # uniquecarrier <chr>, flightnum <dbl>, tailnum <chr>,
## # actualelapsedtime <dbl>, crselapsedtime <dbl>, airtime <dbl>,
## # arrdelay <dbl>, depdelay <dbl>, origin <chr>, dest <chr>,
## # distance <dbl>, taxiin <dbl>, taxiout <dbl>, cancelled <dbl>,
## # cancellationcode <chr>, diverted <dbl>, carrierdelay <dbl>,
## # weatherdelay <dbl>, nasdelay <dbl>, securitydelay <dbl>,
## # lateaircraftdelay <dbl>, score <int>
- Group by carrier and get the average arrival delay
flights %>%
filter(uniquecarrier %in% top5) %>%
group_by(uniquecarrier) %>%
summarise(n = mean(arrdelay, na.rm = TRUE))
## # Source: lazy query [?? x 2]
## # Database: postgres [rstudio_dev@localhost:/postgres]
## uniquecarrier n
## <chr> <dbl>
## 1 AA 12.6
## 2 MQ 9.89
## 3 OO 6.60
## 4 US 2.85
## 5 WN 5.18
- Copy the final
ggplot()
code from the Plot specific data segments section. Update they
labs.
flights %>%
filter(uniquecarrier %in% top5) %>%
group_by(uniquecarrier) %>%
summarise(n = mean(arrdelay, na.rm = TRUE)) %>%
# From previous section ----------------------------------------------
collect() %>%
ggplot() + #Don't forget to switch to `+`
geom_col(aes(x = forcats::fct_reorder(uniquecarrier, n), # Order by n
y = n,
fill = n), # Add fill
show.legend = FALSE) + # Turn legend off
coord_flip() + # Rotate cols into rows
labs(title = "Top 5 Carriers",
subtitle = "Source: Datawarehouse",
x = "Carrier Name",
y = "Average Delay")
4.6 Visualize using dbplot
Review how to use dbplot
to make it easier to plot with databases
- Install and load
dbplot
library(dbplot)
- Create a line plot using the helper function
dbplot_line()
flights %>%
count(month)
## # Source: lazy query [?? x 2]
## # Database: postgres [rstudio_dev@localhost:/postgres]
## month n
## <dbl> <S3: integer64>
## 1 1 605765
## 2 2 569236
## 3 3 616090
## 4 4 598126
## 5 5 606293
## 6 6 608665
## 7 7 627931
## 8 8 612279
## 9 9 540908
## 10 10 556205
## # … with more rows
flights %>%
dbplot_line(month)
## Don't know how to automatically pick scale for object of type integer64. Defaulting to continuous.
## Warning: Removed 12 rows containing missing values (geom_path).
flights %>%
db_compute_bins(month)
## # A tibble: 12 x 2
## month count
## <dbl> <S3: integer64>
## 1 8.7 540908
## 2 6.87 627931
## 3 3.93 598126
## 4 4.67 606293
## 5 1.73 569236
## 6 10.9 523272
## 7 2.83 616090
## 8 11.6 544958
## 9 5.77 608665
## 10 9.80 556205
## 11 1 605765
## 12 7.97 612279
- Update the plot’s labels
flights %>%
dbplot_line(month) +
labs(title = "Monthly flights",
x = "Month",
y = "Number of flights")
## Don't know how to automatically pick scale for object of type integer64. Defaulting to continuous.
## Warning: Removed 12 rows containing missing values (geom_path).
4.7 Plot a different aggregation
dbplot
allows for aggregate functions, other than record count, to be used for plotting
- Plot the average departure delay by day of week
flights %>%
dbplot_bar(dayofweek, mean(depdelay, na.rm = TRUE))
- Change the day numbers to day name labels
flights %>%
dbplot_bar(dayofweek, mean(depdelay, na.rm = TRUE)) +
scale_x_continuous(
labels = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"),
breaks = 1:7
)
4.8 Create a histogram
Use the package’s function to easily create a histogram
- Use the
dbplot_histogram()
to build the histogram
flights %>%
dbplot_histogram(distance)
## Don't know how to automatically pick scale for object of type integer64. Defaulting to continuous.
## Warning: Removed 27 rows containing missing values (position_stack).
- Adjust the
binwidth
to 300
flights %>%
dbplot_histogram(distance, binwidth = 300)
## Don't know how to automatically pick scale for object of type integer64. Defaulting to continuous.
## Warning: Removed 16 rows containing missing values (position_stack).
4.9 Raster plot
- Use a
dbplot_raster()
to visualizedeptime
versusdepdelay
flights %>%
dbplot_raster(deptime, arrtime)
## Warning: Removed 83 rows containing missing values (geom_raster).
- Change the plot’s resolution to 500
flights %>%
dbplot_raster(deptime, arrtime, resolution = 500)
## Warning: Removed 293 rows containing missing values (geom_raster).
4.10 Using the calculate
functions
- Use the
db_compute_raster()
function to get the underlying results that feed the plot
departure <- flights %>%
db_compute_raster(deptime, arrtime)
departure
## # A tibble: 3,602 x 3
## deptime arrtime `n()`
## <dbl> <dbl> <dbl>
## 1 NA NA 136246
## 2 1057. 1848. 60
## 3 625. 841. 12271
## 4 697. 1392. 608
## 5 817. 1105. 13782
## 6 721. 1248. 543
## 7 649. 1488. 552
## 8 1512. 2256. 59
## 9 601. 649. 5476
## 10 1680. 2112. 309
## # … with 3,592 more rows
- Plot the results “manually”
departure %>%
filter(`n()` > 1000) %>%
ggplot() +
geom_raster(aes(x = deptime, y = arrtime, fill = `n()`))
## Warning: Removed 1 rows containing missing values (geom_raster).
4.11 Under the hood (II)
Review how dbplot
pushes histogram and raster calculations to the database
- Use the
db_bin()
command to see the resulting tidy eval formula
db_bin(field)
## (((max(field, na.rm = TRUE) - min(field, na.rm = TRUE))/30) *
## ifelse(as.integer(floor((field - min(field, na.rm = TRUE))/((max(field,
## na.rm = TRUE) - min(field, na.rm = TRUE))/30))) == 30,
## as.integer(floor((field - min(field, na.rm = TRUE))/((max(field,
## na.rm = TRUE) - min(field, na.rm = TRUE))/30))) -
## 1, as.integer(floor((field - min(field, na.rm = TRUE))/((max(field,
## na.rm = TRUE) - min(field, na.rm = TRUE))/30))))) +
## min(field, na.rm = TRUE)
- Use
translate_sql()
andsimulate_odbc_postgresql()
to see an example of what the resulting SQL statement looks like
translate_sql(!! db_bin(field), con = simulate_odbc_postgresql())
## <SQL> (((max(`field`) OVER () - min(`field`) OVER ()) / 30.0) * CASE WHEN (CAST(FLOOR((`field` - min(`field`) OVER ()) / ((max(`field`) OVER () - min(`field`) OVER ()) / 30.0)) AS INTEGER) = 30.0) THEN (CAST(FLOOR((`field` - min(`field`) OVER ()) / ((max(`field`) OVER () - min(`field`) OVER ()) / 30.0)) AS INTEGER) - 1.0) WHEN NOT(CAST(FLOOR((`field` - min(`field`) OVER ()) / ((max(`field`) OVER () - min(`field`) OVER ()) / 30.0)) AS INTEGER) = 30.0) THEN (CAST(FLOOR((`field` - min(`field`) OVER ()) / ((max(`field`) OVER () - min(`field`) OVER ()) / 30.0)) AS INTEGER)) END) + min(`field`) OVER ()
- Disconnect from the database
dbDisconnect(con)