Vaccine Dataset Data Pipeline

Author

Rami Krispin

Published

March 20, 2023

Note: As of March 10, 2023, the source data is no longer updated. Therefore, this data pipeline is disabled.

This is the coronavirus R package data pipeline.

Functions

# Get the table information
#
# Builds an HTML summary of a table: a header with the name of the object
# that was passed in, the number of columns, rows and duplicated rows,
# followed by a reactable with one row per column (name, class, number of
# NAs, min, max and number of unique values).
#
# Args:
#   input: A data.frame, or any object as.data.frame() can coerce.
#
# Returns:
#   An htmltools div (class "tbl-info") holding the summary text and the
#   per-column reactable.
tbl_info <- function(input){
  
  # Capture the expression supplied by the caller to use it in the title;
  # long expressions deparse to several strings, so collapse them into one
  obj.name <- paste(base::deparse(base::substitute(input)), collapse = " ")
  
  input <- as.data.frame(input)
  
  dup <- sum(duplicated(input))
  
  # Apply min/max to a column. Returns NA for non-numeric columns and for
  # numeric columns without any non-missing value (min()/max() would
  # otherwise return Inf/-Inf and raise a warning)
  col_stat <- function(col, fun){
    if(is.numeric(col) && !all(is.na(col))){
      fun(col, na.rm = TRUE)
    } else {
      NA_real_
    }
  }
  
  # vapply() guarantees exactly one value per column. Columns with more than
  # one class (e.g. POSIXct) get their classes collapsed into a single string,
  # otherwise the vectors below would have different lengths
  df <- data.frame(cols_name = names(input),
                   cols_class = vapply(input,
                                       function(col) paste(class(col), collapse = ", "),
                                       character(1),
                                       USE.NAMES = FALSE),
                   cols_NAs = vapply(input,
                                     function(col) sum(is.na(col)),
                                     integer(1),
                                     USE.NAMES = FALSE),
                   cols_min = vapply(input,
                                     function(col) col_stat(col, min),
                                     numeric(1),
                                     USE.NAMES = FALSE),
                   cols_max = vapply(input,
                                     function(col) col_stat(col, max),
                                     numeric(1),
                                     USE.NAMES = FALSE),
                   cols_unique = vapply(input,
                                        function(col) length(unique(col)),
                                        integer(1),
                                        USE.NAMES = FALSE),
                   stringsAsFactors = FALSE)
  
  rownames(df) <- NULL
  
  # Show all the columns of the input on a single page
  out <- htmltools::div(class = "tbl-info",
                        htmltools::div(class = "tbl-info",
                                       htmltools::h4(class = "tbl-info", 
                                                     paste("Table Info - ", obj.name)),
                                       paste("Number of  columns:", ncol(input)),
                                       htmltools::br(),
                                       paste("Number of  rows:", nrow(input)),
                                       htmltools::br(),
                                       paste("Duplicated rows:", dup)
                        ),
                        reactable::reactable(df, defaultPageSize = nrow(df)))
  
  return(out)
}

Parameters

# Location of the raw global vaccine time series (CSV)
url <- "https://raw.githubusercontent.com/govex/COVID-19/master/data_tables/vaccine_data/global_data/time_series_covid19_vaccine_global.csv"

# Name of the currently checked-out git branch; it is used further down to
# select the branch-specific pipeline log file
branch <- system(command = "git rev-parse --abbrev-ref HEAD", intern = TRUE)

# Fail early with a clear message when git did not return exactly one branch
# name (e.g. when not running inside a git repository), instead of failing
# later with an obscure error when the log file path is built
if(length(branch) != 1L || is.na(branch) || !nzchar(branch)){
  stop("Could not detect the current git branch", call. = FALSE)
}

Pulling raw data

# Initialise the outputs
covid19_vaccine <- covid19_vaccine_temp <- NULL

# Pull the raw time series. A download/parsing error is reported as a message
# and leaves covid19_vaccine_temp as NULL, so the failure can be detected by
# checking for NULL
covid19_vaccine_temp <- tryCatch(
  readr::read_csv(
    file = url,
    col_types = readr::cols(
      Date = readr::col_date(format = ""),
      UID = readr::col_number(),
      Province_State = readr::col_character(),
      Country_Region = readr::col_character(),
      # Parsed as a number rather than an integer: cumulative dose counts can
      # exceed .Machine$integer.max (2147483647), in which case col_integer()
      # yields NA and a parsing issue. This also matches the type used for
      # People_at_least_one_dose.
      # NOTE(review): presumably this affects the "World" aggregate rows -
      # confirm with problems() on the previous specification
      Doses_admin = readr::col_number(),
      People_at_least_one_dose = readr::col_number()
    )
  ) |>
    as.data.frame(),
  error = function(e){
    base::message(e)
    NULL
  }
)
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)
  # Validate the raw pull before transforming it: the download must have
  # succeeded, the table must have 6 columns and at least 132000 rows, and the
  # Date column must have been parsed as a Date
  if(is.null(covid19_vaccine_temp)){
    stop("Could not pull the covid19_vaccine_temp dataset, check the error")
  } else if(nrow(covid19_vaccine_temp) < 132000 || ncol(covid19_vaccine_temp) != 6){
    stop("The dimensions of the covid19_vaccine_temp dataset are invalid")
  } else if(!inherits(covid19_vaccine_temp$Date, "Date")){
    # inherits() stays a single TRUE/FALSE even when an object has several
    # classes, unlike comparing class() with a string
    stop("The class of the Date column is invalid")
  }

  # Use lower case column names from here on
  names(covid19_vaccine_temp) <- tolower(names(covid19_vaccine_temp))

  tbl_info(covid19_vaccine_temp)

Table Info - covid19_vaccine_temp

Number of columns: 6
Number of rows: 142597
Duplicated rows: 0

Merge with GIS codes

# Load the GIS mapping objects
# NOTE(review): presumably this file provides both continent_mapping and
# gis_code_mapping, which are used below without being defined elsewhere in
# this document - confirm
load("../data_raw/gis_mapping.RData")

# Removing duplicated ISO codes for countries that are located in two
# continents. Each row of the table below is the entry to DROP, so that the
# country is kept under a single continent:
#   Azerbaijan, Armenia, Georgia, Kazakhstan, Turkey -> Asia
#   Cyprus, Russian Federation -> Europe
#   United States Minor Outlying Islands -> Oceania
continent_duplicates <- data.frame(
  iso2 = c("AZ", "AM", "CY", "GE", "KZ", "UM", "RU", "TR"),
  iso3 = c("AZE", "ARM", "CYP", "GEO", "KAZ", "UMI", "RUS", "TUR"),
  continent_name = c("Europe", "Europe", "Asia", "Europe", "Europe",
                     "North America", "Asia", "Europe"),
  stringsAsFactors = FALSE
)

# Build one key per row and drop the rows whose key is listed above.
# A logical filter is used instead of x[-which(cond), ]: when which() finds no
# match it returns integer(0), and x[-integer(0), ] selects zero rows, which
# would silently empty the whole mapping table. %in% also never returns NA.
mapping_key <- paste(continent_mapping$iso2,
                     continent_mapping$iso3,
                     continent_mapping$continent_name,
                     sep = "|")
drop_key <- paste(continent_duplicates$iso2,
                  continent_duplicates$iso3,
                  continent_duplicates$continent_name,
                  sep = "|")

continent_mapping <- continent_mapping[!(mapping_key %in% drop_key), ]


# Rows of "Taiwan*" that come without a uid get the uid 158; every other row
# keeps its uid as is
# NOTE(review): 158 is presumably the code used for Taiwan in the GIS mapping
# table - confirm
covid19_vaccine_temp$uid <- with(
  covid19_vaccine_temp,
  replace(uid, which(country_region == "Taiwan*" & is.na(uid)), 158)
)



# GIS attributes to attach by uid. The country/province name columns are
# dropped, since covid19_vaccine_temp already has columns with those names
gis_lookup <- dplyr::select(gis_code_mapping, -country_region, -province_state)

# Continent attributes, without the country name
continent_lookup <- dplyr::select(continent_mapping, -country_name)

covid19_vaccine <- covid19_vaccine_temp |>
  dplyr::left_join(gis_lookup, by = "uid") |>
  dplyr::select(-admin2) |>
  # Set the iso2 code of Namibia to the literal string "NA"
  # NOTE(review): presumably the code was read as a missing value in the
  # mapping table - confirm
  dplyr::mutate(iso2 = ifelse(country_region == "Namibia", "NA", iso2)) |>
  dplyr::left_join(continent_lookup, by = c("uid", "iso2", "iso3")) |>
  # Keep and order the columns of the final table
  dplyr::select(
    date, country_region, continent_name, continent_code, combined_key,
    doses_admin, people_at_least_one_dose, population,
    uid, iso2, iso3, code3, fips, lat, long
  )


# Count the rows with a known (FALSE) and a missing (TRUE) continent name
table(is.na(covid19_vaccine[["continent_name"]]))

 FALSE   TRUE 
138287   4310 
# List the countries/regions of the rows that have no continent name
x <- subset(covid19_vaccine, is.na(continent_name))
unique(x[["country_region"]])
[1] "World"  "Sudan"  "Kosovo" "Nauru"  "Tonga"  "Tuvalu"
# Fixing missing values - continent
# Continent code and name to assign to the countries listed below. Any other
# country/region (e.g. the "World" aggregate) is left untouched
continent_fixes <- data.frame(
  country_region = c("Nauru", "Tonga", "Tuvalu", "Sudan", "Kosovo"),
  continent_code = c("OC", "OC", "OC", "AF", "EU"),
  continent_name = c("Oceania", "Oceania", "Oceania", "Africa", "Europe"),
  stringsAsFactors = FALSE
)

# Position of each row's country in the fixes table (NA when not listed).
# match() is used instead of ifelse(country_region == ...), as a missing
# country_region would otherwise turn the existing continent values into NA
fix_index <- match(covid19_vaccine$country_region, continent_fixes$country_region)
needs_fix <- !is.na(fix_index)

covid19_vaccine$continent_code[needs_fix] <- continent_fixes$continent_code[fix_index[needs_fix]]
covid19_vaccine$continent_name[needs_fix] <- continent_fixes$continent_name[fix_index[needs_fix]]


tbl_info(covid19_vaccine)

Table Info - covid19_vaccine

Number of columns: 15
Number of rows: 142597
Duplicated rows: 0

Data validation

# Check if there is duplication in the US data: when a date shows up more
# than once among the rows with iso2 == "US" and both the "US" and the
# "US (Aggregate)" entries are present, drop the "US (Aggregate)" rows
x <- covid19_vaccine |> dplyr::filter(iso2 == "US")

# anyDuplicated() returns 0 when there are no duplicated dates. The previous
# any(table(duplicated(...))) tested the counts of the table, which are
# non-zero as soon as there is at least one US row
if(anyDuplicated(x$date) > 0 &&
   all(c("US", "US (Aggregate)") %in% x$country_region)){
  covid19_vaccine <- covid19_vaccine |>
    dplyr::filter(country_region != "US (Aggregate)")
}



  
# Load the pipeline log of the current branch (this restores the `log`
# object) and print its last six entries
load(file = sprintf("../data_pipelines/log_%s.RData", branch))
utils::tail(log, n = 6L)
                   time         dataset  nrows  last_date update success
411 2023-03-19 16:24:57 covid19_vaccine 142597 2023-03-09  FALSE    TRUE
412 2023-03-20 01:37:01     coronavirus 973836 2023-03-09  FALSE    TRUE
413 2023-03-20 01:37:11 covid19_vaccine 142597 2023-03-09  FALSE    TRUE
414 2023-03-20 08:25:07     coronavirus 973836 2023-03-09  FALSE    TRUE
415 2023-03-20 08:25:16 covid19_vaccine 142597 2023-03-09  FALSE    TRUE
416 2023-03-20 16:28:35     coronavirus 973836 2023-03-09  FALSE    TRUE
    backfile branch
411    FALSE   main
412    FALSE   main
413    FALSE   main
414    FALSE   main
415    FALSE   main
416    FALSE   main
# Most recent successful update of the vaccine dataset recorded in the log.
# which.max() returns a single row even when timestamps are tied, and zero
# rows (without a warning) when no such update has been logged yet
log_updates <- log |>
  dplyr::filter(update == TRUE & success == TRUE, dataset == "covid19_vaccine")
log_last <- log_updates[which.max(as.numeric(log_updates$time)), ]

# Refresh the stored data when rows were added since the last successful
# update, or when no previous update is logged (the comparison would otherwise
# have length zero and make if() fail)
needs_update <- nrow(log_last) == 0 ||
  nrow(covid19_vaccine) > log_last$nrows

if(needs_update){
  cat("Updating the vaccine data...")
  # Save the dataset as package data and as a CSV file
  usethis::use_data(covid19_vaccine, overwrite = TRUE, compress = "xz")
  write.csv(covid19_vaccine, "../csv/covid19_vaccine.csv", row.names = FALSE)
}

# Record this run in the branch log, whether or not the data was updated
log <- rbind(log, data.frame(time = Sys.time(),
                             dataset = "covid19_vaccine",
                             nrows = nrow(covid19_vaccine),
                             last_date = max(covid19_vaccine$date),
                             update = needs_update,
                             success = TRUE,
                             backfile = FALSE,
                             branch = branch,
                             stringsAsFactors = FALSE))
save(log, file = sprintf("../data_pipelines/log_%s.RData", branch))

if(!needs_update){
  cat("No updates available...")
}
No updates available...