# Get the table information
#
# Builds an HTML summary of a table: a header with the object name, the number
# of columns, rows and duplicated rows, followed by a reactable table with one
# row per column (class, number of NAs, min/max of numeric columns, and number
# of unique values).
#
# input - a data.frame, or any object as.data.frame() can convert
#
# Returns an htmltools div (class "tbl-info") wrapping the header and the table.
tbl_info <- function(input){
  # Capture the caller's expression before `input` is reassigned below.
  # deparse() may split a long expression into several strings, so collapse it.
  obj.name <- paste(base::deparse(base::substitute(input)), collapse = " ")
  input <- as.data.frame(input)
  dup <- sum(duplicated(input))

  # Min/max of a numeric column; NA for non-numeric or all-NA columns
  # (min()/max() would otherwise return Inf/-Inf together with a warning)
  numeric_range <- function(x, fun){
    if(is.numeric(x) && !all(is.na(x))){
      as.numeric(fun(x, na.rm = TRUE))
    } else {
      NA_real_
    }
  }

  df <- data.frame(cols_name = names(input),
                   # A column can carry several classes (e.g. POSIXct/POSIXt);
                   # collapse them so there is exactly one value per column
                   cols_class = vapply(input,
                                       function(x) paste(class(x), collapse = ", "),
                                       character(1), USE.NAMES = FALSE),
                   cols_NAs = vapply(input,
                                     function(x) sum(is.na(x)),
                                     integer(1), USE.NAMES = FALSE),
                   cols_min = vapply(input, numeric_range, numeric(1),
                                     fun = min, USE.NAMES = FALSE),
                   cols_max = vapply(input, numeric_range, numeric(1),
                                     fun = max, USE.NAMES = FALSE),
                   cols_unique = vapply(input,
                                        function(x) length(unique(x)),
                                        integer(1), USE.NAMES = FALSE),
                   stringsAsFactors = FALSE)
  rownames(df) <- NULL

  htmltools::div(class = "tbl-info",
                 htmltools::div(class = "tbl-info",
                                htmltools::h4(class = "tbl-info",
                                              paste("Table Info - ", obj.name)),
                                paste("Number of columns:", ncol(input)),
                                htmltools::br(),
                                paste("Number of rows:", nrow(input)),
                                htmltools::br(),
                                paste("Duplicated rows:", dup)
                 ),
                 # Show every column summary on a single page
                 reactable::reactable(df, defaultPageSize = nrow(df)))
}

# Vaccine Dataset Data Pipeline
Note: As of March 10, 2023, the source data are no longer being updated. Therefore, this data pipeline is disabled.
This is the coronavirus R package data pipeline.
Functions
Parameters
# Location of the raw global vaccine time series (CSV)
url <- "https://raw.githubusercontent.com/govex/COVID-19/master/data_tables/vaccine_data/global_data/time_series_covid19_vaccine_global.csv"
# Name of the git branch currently checked out; used further down to pick the
# branch-specific log file
branch <- system("git rev-parse --abbrev-ref HEAD", intern = TRUE)

# Pulling raw data
# Pull the raw vaccine time series from `url` with explicit column types.
# covid19_vaccine_temp ends up NULL when the download or the parsing fails, so
# the validation that follows can stop with a clear message.
covid19_vaccine <- NULL
covid19_vaccine_temp <- tryCatch(
  readr::read_csv(file = url,
                  col_types = readr::cols(
                    Date = readr::col_date(format = ""),
                    UID = readr::col_number(),
                    Province_State = readr::col_character(),
                    Country_Region = readr::col_character(),
                    Doses_admin = readr::col_integer(),
                    People_at_least_one_dose = readr::col_number()
                  )) |>
    as.data.frame(),
  error = function(e){
    # Log only the text of the error. Passing the condition object itself to
    # message() signals it again under its original "error" class, which an
    # enclosing error handler could pick up.
    base::message(conditionMessage(e))
    NULL
  }
)
#> Warning: One or more parsing issues, call `problems()` on your data frame for details,
#> e.g.:
#> dat <- vroom(...)
#> problems(dat)
# Validate the raw pull before any further processing: the pull must have
# succeeded, have the expected dimensions, and carry a parsed Date column.
if(is.null(covid19_vaccine_temp)){
  stop("Could not pull the covid19_vaccine_temp dataset, check the error")
} else if(nrow(covid19_vaccine_temp) < 132000 || ncol(covid19_vaccine_temp) != 6){
  # NOTE(review): 132000 looks like a lower bound taken from the size of
  # earlier pulls - confirm before changing it
  stop("The dimensions of the covid19_vaccine_temp dataset are invalid")
} else if(!inherits(covid19_vaccine_temp$Date, "Date")){
  # inherits() yields a single TRUE/FALSE even for objects with several
  # classes, unlike comparing class() with `!=`
  stop("The class of the Date column is invalid")
}
# Use lower-case column names from here on
names(covid19_vaccine_temp) <- tolower(names(covid19_vaccine_temp))
tbl_info(covid19_vaccine_temp)
#> Table Info - covid19_vaccine_temp
#> Number of columns: 6
#> Number of rows: 142597
#> Duplicated rows: 0
Merge with GIS codes
load("../data_raw/gis_mapping.RData")
# Removing duplicated iso codes for countries that located in two continents
#
# The rows are filtered with a logical mask instead of `[-which(...), ]`:
# when which() finds no match, a negative empty index selects zero rows and
# would silently empty the whole table.
continent_mapping <- local({
  # One row per continent entry to DROP (the continent that is kept is noted
  # next to each country)
  drop_rows <- data.frame(
    iso2 = c("AZ",      # Azerbaijan -> Asia
             "AM",      # Armenia -> Asia
             "CY",      # Cyprus -> Europe
             "GE",      # Georgia -> Asia
             "KZ",      # Kazakhstan -> Asia
             "UM",      # United States Minor Outlying Islands -> Oceania
             "RU",      # Russian Federation -> Europe
             "TR"),     # Turkey -> Asia
    iso3 = c("AZE", "ARM", "CYP", "GEO", "KAZ", "UMI", "RUS", "TUR"),
    continent_name = c("Europe", "Europe", "Asia", "Europe",
                       "Europe", "North America", "Asia", "Europe"),
    stringsAsFactors = FALSE)

  # Composite key over the three identifying columns; %in% never returns NA,
  # so rows with missing codes are simply kept
  row_key <- function(d) paste(d$iso2, d$iso3, d$continent_name, sep = "|")

  continent_mapping[!(row_key(continent_mapping) %in% row_key(drop_rows)), ]
})
# Taiwan rows that have no uid get the value 158 so they can be matched in the
# uid-based joins below
covid19_vaccine_temp <- covid19_vaccine_temp |>
  dplyr::mutate(uid = ifelse(country_region == "Taiwan*" & is.na(uid), 158, uid))

# Attach the GIS attributes (by uid) and then the continent (by uid/iso2/iso3),
# keeping only the columns of the final dataset in their final order
covid19_vaccine <- dplyr::left_join(
  covid19_vaccine_temp,
  dplyr::select(gis_code_mapping, -country_region, -province_state),
  by = "uid") |>
  dplyr::select(-admin2) |>
  # Namibia's iso2 code is the literal string "NA"
  # NOTE(review): presumably it was read as a missing value upstream - confirm
  dplyr::mutate(iso2 = ifelse(country_region == "Namibia", "NA", iso2)) |>
  dplyr::left_join(
    dplyr::select(continent_mapping, -country_name),
    by = c("uid", "iso2", "iso3")) |>
  dplyr::select(date, country_region, continent_name,
                continent_code, combined_key, doses_admin,
                people_at_least_one_dose, population, uid,
                iso2, iso3, code3, fips, lat, long)
# How many rows are still without a continent after the joins?
table(is.na(covid19_vaccine$continent_name))
#> FALSE TRUE
#> 138287 4310
# Which countries do those rows belong to?
x <- covid19_vaccine[is.na(covid19_vaccine$continent_name), ]
unique(x$country_region)
#> [1] "World" "Sudan" "Kosovo" "Nauru" "Tonga" "Tuvalu"
# Fixing missing values - continent
# Countries left without a continent by the mapping get one assigned by hand;
# every other row keeps the value it already has.
covid19_vaccine <- local({
  fixed <- covid19_vaccine
  region <- fixed$country_region

  is_oceania <- region == "Nauru" | region == "Tonga" | region == "Tuvalu"
  is_sudan <- region == "Sudan"
  is_kosovo <- region == "Kosovo"

  fixed$continent_code <- ifelse(is_oceania, "OC", fixed$continent_code)
  fixed$continent_name <- ifelse(is_oceania, "Oceania", fixed$continent_name)

  fixed$continent_code <- ifelse(is_sudan, "AF", fixed$continent_code)
  fixed$continent_name <- ifelse(is_sudan, "Africa", fixed$continent_name)

  fixed$continent_code <- ifelse(is_kosovo, "EU", fixed$continent_code)
  fixed$continent_name <- ifelse(is_kosovo, "Europe", fixed$continent_name)

  fixed
})
tbl_info(covid19_vaccine)
#> Table Info - covid19_vaccine
#> Number of columns: 15
#> Number of rows: 142597
#> Duplicated rows: 0
Data validation
# Check if there are duplication in the US data
# When the same date appears more than once among the US rows and both the
# "US" and "US (Aggregate)" series are present, drop the aggregate series.
x <- covid19_vaccine |> dplyr::filter(iso2 == "US")
# any(duplicated(...)) tests for actual duplicates; wrapping the result in
# table() first would test the counts, which is TRUE whenever x has any row
if(any(duplicated(x$date)) &&
   all(c("US", "US (Aggregate)") %in% unique(x$country_region))){
  covid19_vaccine <- covid19_vaccine |>
    dplyr::filter(country_region != "US (Aggregate)")
}
# Load the pipeline log kept for the current branch and preview its last rows
# (the file is expected to restore the `log` data frame used below)
load(file = sprintf("../data_pipelines/log_%s.RData", branch))
utils::tail(log)
#> time dataset nrows last_date update success
#> 411 2023-03-19 16:24:57 covid19_vaccine 142597 2023-03-09 FALSE TRUE
#> 412 2023-03-20 01:37:01 coronavirus 973836 2023-03-09 FALSE TRUE
#> 413 2023-03-20 01:37:11 covid19_vaccine 142597 2023-03-09 FALSE TRUE
#> 414 2023-03-20 08:25:07 coronavirus 973836 2023-03-09 FALSE TRUE
#> 415 2023-03-20 08:25:16 covid19_vaccine 142597 2023-03-09 FALSE TRUE
#> 416 2023-03-20 16:28:35 coronavirus 973836 2023-03-09 FALSE TRUE
#> backfile branch
#> 411 FALSE main
#> 412 FALSE main
#> 413 FALSE main
#> 414 FALSE main
#> 415 FALSE main
#> 416 FALSE main
# Most recent successful update of the vaccine dataset recorded in the log.
# arrange() + head(1) always yields at most one row, also when several log
# entries share the latest timestamp.
log_last <- log |>
  dplyr::filter(update == TRUE & success == TRUE, dataset == "covid19_vaccine") |>
  dplyr::arrange(dplyr::desc(time)) |>
  utils::head(1)

# Refresh the stored data when no earlier successful update is on record, or
# when the new pull has more rows than the last one that was saved
has_update <- nrow(log_last) == 0 || nrow(covid19_vaccine) > log_last$nrows

if(has_update){
  cat("Updating the vaccine data...")
  usethis::use_data(covid19_vaccine, overwrite = TRUE, compress = "xz")
  write.csv(covid19_vaccine, "../csv/covid19_vaccine.csv", row.names = FALSE)
}

# Every run is logged, whether or not the data was refreshed
log <- rbind(log, data.frame(time = Sys.time(),
                             dataset = "covid19_vaccine",
                             nrows = nrow(covid19_vaccine),
                             last_date = max(covid19_vaccine$date),
                             update = has_update,
                             success = TRUE,
                             backfile = FALSE,
                             branch = branch,
                             stringsAsFactors = FALSE))
save(log, file = sprintf("../data_pipelines/log_%s.RData", branch))

if(!has_update){
  cat("No updates available...")
}
#> No updates available...