Analyzing SFO Passengers • sfo

The sfo_passengers dataset provides granular information about the air traffic passengers in San Francisco International Airport between July 2005 and September 2020. The following vignette demonstrate some examples of transforming the raw data into a tidy format and analyzing it.

Data prep

We will start with reviewing the data structure:

library(sfo)

data("sfo_passengers")

For simplicity, let’s use a shorter name , d, for the dataset:


d <- sfo_passengers

Next let’s review the characteristics of the dataset:


head(d)
#>   activity_period operating_airline operating_airline_iata_code
#> 1          202009   United Airlines                          UA
#> 2          202009   United Airlines                          UA
#> 3          202009   United Airlines                          UA
#> 4          202009   United Airlines                          UA
#> 5          202009   United Airlines                          UA
#> 6          202009   United Airlines                          UA
#>   published_airline published_airline_iata_code   geo_summary geo_region
#> 1   United Airlines                          UA International     Mexico
#> 2   United Airlines                          UA International     Mexico
#> 3   United Airlines                          UA International     Mexico
#> 4   United Airlines                          UA International     Mexico
#> 5   United Airlines                          UA International     Europe
#> 6   United Airlines                          UA International     Europe
#>   activity_type_code price_category_code      terminal boarding_area
#> 1           Enplaned               Other    Terminal 3             F
#> 2           Enplaned               Other    Terminal 3             E
#> 3           Enplaned               Other International             G
#> 4           Deplaned               Other International             G
#> 5           Enplaned               Other International             G
#> 6           Deplaned               Other International             G
#>   passenger_count
#> 1            6712
#> 2             396
#> 3             376
#> 4            6817
#> 5            3851
#> 6            3700

str(d)
#> 'data.frame':    22576 obs. of  12 variables:
#>  $ activity_period            : int  202009 202009 202009 202009 202009 202009 202009 202009 202009 202009 ...
#>  $ operating_airline          : chr  "United Airlines" "United Airlines" "United Airlines" "United Airlines" ...
#>  $ operating_airline_iata_code: chr  "UA" "UA" "UA" "UA" ...
#>  $ published_airline          : chr  "United Airlines" "United Airlines" "United Airlines" "United Airlines" ...
#>  $ published_airline_iata_code: chr  "UA" "UA" "UA" "UA" ...
#>  $ geo_summary                : chr  "International" "International" "International" "International" ...
#>  $ geo_region                 : chr  "Mexico" "Mexico" "Mexico" "Mexico" ...
#>  $ activity_type_code         : chr  "Enplaned" "Enplaned" "Enplaned" "Deplaned" ...
#>  $ price_category_code        : chr  "Other" "Other" "Other" "Other" ...
#>  $ terminal                   : chr  "Terminal 3" "Terminal 3" "International" "International" ...
#>  $ boarding_area              : chr  "F" "E" "G" "G" ...
#>  $ passenger_count            : int  6712 396 376 6817 3851 3700 71 83 65 45 ...

The activity_period variable provides the year-month of the observation using integer format. That is convenient if we want to filter by year, but less convenient if we want to plot the data as time series. Therefore, we will reformat the variable from YYYYMM to YYYYMMDD, setting the day of the month as 1 with the as.Date function:

d$date <- as.Date(paste(substr(d$activity_period, 1,4), substr(d$activity_period, 5,6), "01", sep ="/"))

head(d[,c("activity_period", "date")])
#>   activity_period       date
#> 1          202009 2020-09-01
#> 2          202009 2020-09-01
#> 3          202009 2020-09-01
#> 4          202009 2020-09-01
#> 5          202009 2020-09-01
#> 6          202009 2020-09-01

min(d$date)
#> [1] "2005-07-01"
max(d$date)
#> [1] "2020-09-01"

Exploratory analysis

Now, after we reviewed and reformat the data, we can start explore the data and answer some questions such as:

The monthly number of passengers at SFO
The airlines distribution
The geographic distribution of the passengers

Monthly number of passengers

There multiple ways to view and aggregate the number of monthly passengers such as the activity_type_code (arrival vs. departure), geo_summary (international vs. domestic) etc. We will start with the total number of passengers aggregation:

library(dplyr)
library(plotly)

d_total <- d %>%
  group_by(date) %>%
  summarise(total = sum(passenger_count))

head(d_total)
#> # A tibble: 6 x 2
#>   date         total
#>   <date>       <int>
#> 1 2005-07-01 3225769
#> 2 2005-08-01 3195866
#> 3 2005-09-01 2740553
#> 4 2005-10-01 2770715
#> 5 2005-11-01 2617333
#> 6 2005-12-01 2671797

plot_ly(data = d_total,
        x = ~ date,
        y = ~ total,
        type = "scatter",
        mode = "lines") %>%
  layout(title = "Monthly Air Traffic Passengers at SFO",
         yaxis = list(title = "Number of Passengers"),
         xaxis = list(title = "Source: San Francisco data portal (DataSF)"))

Likewise, we can aggregate the data by the activity type (arrival, departure, transit):


d_activity_type <- d %>%
  group_by(date, activity_type_code) %>%
  summarise(total = sum(passenger_count), .groups = "drop") %>%
  as.data.frame()

head(d_activity_type)
#>         date activity_type_code   total
#> 1 2005-07-01           Deplaned 1627329
#> 2 2005-07-01           Enplaned 1569412
#> 3 2005-07-01     Thru / Transit   29028
#> 4 2005-08-01           Deplaned 1614468
#> 5 2005-08-01           Enplaned 1553146
#> 6 2005-08-01     Thru / Transit   28252

plot_ly(data = d_activity_type,
        x = ~ date,
        y = ~ total,
        color = ~ activity_type_code,
        type = "scatter",
        mode = "lines") %>%
  layout(title = "Monthly Air Traffic Passengers at SFO by Activity Type",
         yaxis = list(title = "Number of Passengers"),
         xaxis = list(title = "Source: San Francisco data portal (DataSF)"))

In the following plot provides the distribution of passengers by the flight geo type (international vs. domestic). We will use the pivot_wider function from the tidyr package to transform the data.frame into a wide format:

library(tidyr)

d_geo <- d %>%
  group_by(date, geo_summary) %>%
  summarise(total = sum(passenger_count), .groups = "drop") %>%
  pivot_wider(names_from = geo_summary, values_from = total) %>%
  as.data.frame()

head(d_geo)
#>         date Domestic International
#> 1 2005-07-01  2425902        799867
#> 2 2005-08-01  2405904        789962
#> 3 2005-09-01  2067923        672630
#> 4 2005-10-01  2095119        675596
#> 5 2005-11-01  1998879        618454
#> 6 2005-12-01  2003840        667957

plot_ly(data = d_geo, 
        x = ~ date, 
        y = ~ Domestic, 
        name = "Domestic", 
        type = "scatter", 
        mode = "none", 
        stackgroup = "one", 
        groupnorm = "percent", fillcolor = "#4C74C9") %>%
  add_trace(y = ~ International,
            name = "International",
            fillcolor = "#50CB86") %>%
  layout(title = 'United States Personal Expenditures by Categories',
         yaxis = list(title = "Proportion of Passengers", 
                      showgrid = FALSE,
                      ticksuffix = "%"),
         xaxis = list(title = "Source: San Francisco data portal (DataSF)",
                      showgrid = FALSE))

Airlines Analysis

The dataset provides two indicator about the airline company:

operating_airline filed describe the aircraft operator name, and
published_airline filed describe the name of the airline that issues the ticket and books revenue

In most of the cases it is the same company, in other cases, such as code-sharing operator it may two different companies. Let’s review the characteristics of those variables:


length(unique(d$operating_airline))
#> [1] 94
length(unique(d$published_airline))
#> [1] 84

You can noticed that there are less published airlines compared to operating airlines. This potentially mean that some operating airlines have lines with code sharing. For simplicity, we will focus on the operating_airline variable. Let’s review the unique combination of this variable:

sort(unique(d$operating_airline))
#>  [1] "ABC Aerolineas S.A. de C.V. dba Interjet"
#>  [2] "Aer Lingus"                              
#>  [3] "Aeromexico"                              
#>  [4] "Air Berlin"                              
#>  [5] "Air Canada"                              
#>  [6] "Air Canada Jazz"                         
#>  [7] "Air China"                               
#>  [8] "Air France"                              
#>  [9] "Air India Limited"                       
#> [10] "Air Italy S.P.A"                         
#> [11] "Air New Zealand"                         
#> [12] "Air Pacific Limited dba Fiji Airways"    
#> [13] "AirTran Airways"                         
#> [14] "Alaska Airlines"                         
#> [15] "All Nippon Airways"                      
#> [16] "Allegiant Air"                           
#> [17] "American Airlines"                       
#> [18] "American Eagle Airlines"                 
#> [19] "Ameriflight"                             
#> [20] "Asiana Airlines"                         
#> [21] "ATA Airlines"                            
#> [22] "Atlantic Southeast Airlines"             
#> [23] "Atlas Air, Inc"                          
#> [24] "BelAir Airlines"                         
#> [25] "Boeing Company"                          
#> [26] "British Airways"                         
#> [27] "Cathay Pacific"                          
#> [28] "China Airlines"                          
#> [29] "China Eastern"                           
#> [30] "China Southern"                          
#> [31] "Compass Airlines"                        
#> [32] "COPA Airlines, Inc."                     
#> [33] "Delta Air Lines"                         
#> [34] "El Al Israel Airlines LTD."              
#> [35] "Emirates"                                
#> [36] "Etihad Airways"                          
#> [37] "EVA Airways"                             
#> [38] "Evergreen International Airlines"        
#> [39] "ExpressJet Airlines"                     
#> [40] "Finnair"                                 
#> [41] "French Bee"                              
#> [42] "Frontier Airlines"                       
#> [43] "Hawaiian Airlines"                       
#> [44] "Hong Kong Airlines Limited"              
#> [45] "Horizon Air"                             
#> [46] "Iberia"                                  
#> [47] "Icelandair (Inactive)"                   
#> [48] "Icelandair EHF"                          
#> [49] "Independence Air"                        
#> [50] "Japan Airlines"                          
#> [51] "Jazz Aviation"                           
#> [52] "Jet Airways"                             
#> [53] "JetBlue Airways"                         
#> [54] "KLM Royal Dutch Airlines"                
#> [55] "Korean Air Lines"                        
#> [56] "LAN Peru"                                
#> [57] "Lufthansa German Airlines"               
#> [58] "Mesa Airlines"                           
#> [59] "Mesaba Airlines"                         
#> [60] "Mexicana Airlines"                       
#> [61] "Miami Air International"                 
#> [62] "Midwest Airlines"                        
#> [63] "Northwest Airlines"                      
#> [64] "Norwegian Air Shuttle ASA"               
#> [65] "Norwegian Air UK Ltd"                    
#> [66] "Pacific Aviation"                        
#> [67] "Philippine Airlines"                     
#> [68] "Qantas Airways"                          
#> [69] "Republic Airlines"                       
#> [70] "SAS Airlines"                            
#> [71] "Servisair"                               
#> [72] "Singapore Airlines"                      
#> [73] "SkyWest Airlines"                        
#> [74] "Southwest Airlines"                      
#> [75] "Spirit Airlines"                         
#> [76] "Sun Country Airlines"                    
#> [77] "Swiss International"                     
#> [78] "Swissport USA"                           
#> [79] "TACA International Airlines, S.A."       
#> [80] "TAP Air Portugal"                        
#> [81] "Thomas Cook Airlines"                    
#> [82] "Trego Dugan Aviation"                    
#> [83] "Turkish Airlines"                        
#> [84] "United Airlines"                         
#> [85] "United Airlines - Pre 07/01/2013"        
#> [86] "US Airways"                              
#> [87] "Virgin America"                          
#> [88] "Virgin Atlantic"                         
#> [89] "Volaris Airlines"                        
#> [90] "WestJet Airlines"                        
#> [91] "World Airways"                           
#> [92] "WOW Air"                                 
#> [93] "XL Airways France"                       
#> [94] "Xtra Airways"

The main thing that immediately pop out of this list that United Airlines, the main carrier at SFO, listed twice as United Airlines and United Airlines - Pre 07/01/2013. I could not find any reference that explain the second label. If analyzing data prior to July 2013, I would relabel those cases as United Airlines. For this analysis we will focus on 2019, the most recent full year. Let’s start by filtering the data for 2019 flights:


d19 <- d %>% filter(activity_period >= 201901 & activity_period < 202001) %>%
  group_by(published_airline, geo_summary) %>%
  summarise(total = sum(passenger_count), .groups = "drop")

Before we will summarize the data by airlines, let’s see the distribution of flights by geo type (domestic vs. international):


d19 %>% group_by(geo_summary) %>%
  summarise(total_geo = sum(total)) %>%
  plot_ly(labels = ~ geo_summary,
          values = ~ total_geo) %>%
  add_pie(hole = 0.6)

Close 75% of the air passengers traffic in SFO during 2019 were domestic. That is 42M passengers out of 57.4M passengers. Let’s use tree map plot to see the distribution of passengers by airline and geo:

  plot_ly(data = d19 %>% filter(geo_summary == "Domestic"),
          labels = ~ published_airline, 
          values = ~ total,
          type = "treemap",
          parents=  ~geo_summary,
          domain = list(row=0),
          name = "Domestic",
          textinfo="label+value+percent parent") %>%
    add_trace(data = d19 %>% filter(geo_summary == "International"),
            labels = ~ published_airline, 
            values = ~ total,
            type = "treemap",
            parents=  ~geo_summary,
            domain = list(row=1),
            name = "International",
            textinfo="label+value+percent parent") %>%
    layout(title = "Passengers Distribution During 2019 by Geo",
      grid=list(columns=1, rows=2))

Air Traffic Summary

Last but not least, it would be interesting to leverage the different categorical variables to create a summary plot with sankey. We will use the sankey_ly function, a plotly wrapper that create automatically the data transformation and sankey plot with plotly. We will rank the top 20 airlines during 2019 and plot the passengers distribution by airline, travel type (domestic or international), travel geo, activity type (deplaned, enplaned and transit) and fare type (low or other):

rank <- sfo_passengers %>%
  filter(activity_period >= 201901 & activity_period <= 201912) %>%
  group_by(operating_airline) %>%
  summarise(total = sum(passenger_count), .groups = "drop") %>%
  arrange(-total) %>%
  slice_head(n = 20) %>%
  select(operating_airline)


sfo_passengers %>%
  filter(activity_period >= 201901 & activity_period <= 201912, operating_airline %in% rank$operating_airline) %>%
  group_by(operating_airline, geo_summary, geo_region, activity_type_code, 
           price_category_code, terminal) %>%
  summarise(total = sum(passenger_count), .groups = "drop") %>%
  sankey_ly(cols = c("operating_airline","geo_summary",
                      "geo_region", "activity_type_code",
                      "price_category_code"),
          num_col = "total") %>%
  layout(title = "SFO Air Traffic Passenger Dist. During 2019")