# Get the table information
#
# Builds an HTML summary of a table: a header with the object name, the
# number of columns, rows and duplicated rows, followed by a reactable widget
# with one row per column (name, class, NA count, min, max, unique count).
#
# Arguments:
#   input - an object that `as.data.frame()` can coerce to a data frame.
#
# Returns:
#   An htmltools `div` tag (class "tbl-info") containing the header and the
#   reactable widget.

# Min/max of one column: NA for non-numeric columns and for numeric columns
# with no non-missing values (avoids the Inf/-Inf + warning of min()/max()).
.tbl_info_extreme <- function(col, fun) {
  if (!is.numeric(col) || all(is.na(col))) {
    return(NA_real_)
  }
  as.numeric(fun(col, na.rm = TRUE))
}

# Per-column summary of a data frame, one row per column (base R only).
.tbl_info_columns <- function(input) {
  df <- data.frame(
    cols_name = names(input),
    # A column can carry several classes (e.g. POSIXct/POSIXt); collapse them
    # so there is exactly one value per column.
    cols_class = vapply(input,
                        function(col) paste(class(col), collapse = "/"),
                        character(1), USE.NAMES = FALSE),
    cols_NAs = vapply(input, function(col) sum(is.na(col)),
                      integer(1), USE.NAMES = FALSE),
    cols_min = vapply(input, .tbl_info_extreme, numeric(1),
                      fun = min, USE.NAMES = FALSE),
    cols_max = vapply(input, .tbl_info_extreme, numeric(1),
                      fun = max, USE.NAMES = FALSE),
    cols_unique = vapply(input, function(col) length(unique(col)),
                         integer(1), USE.NAMES = FALSE),
    stringsAsFactors = FALSE
  )
  rownames(df) <- NULL
  df
}

tbl_info <- function(input){
  # Capture the caller's expression before `input` is reassigned below;
  # deparse() may return several strings for a long expression, so collapse.
  obj.name <- paste(base::deparse(base::substitute(input)), collapse = " ")

  input <- as.data.frame(input)
  dup <- sum(duplicated(input))
  col_summary <- .tbl_info_columns(input)

  htmltools::div(
    class = "tbl-info",
    htmltools::div(
      class = "tbl-info",
      htmltools::h4(class = "tbl-info",
                    paste("Table Info - ", obj.name)),
      paste("Number of columns:", ncol(input)),
      htmltools::br(),
      paste("Number of rows:", nrow(input)),
      htmltools::br(),
      paste("Duplicated rows:", dup)
    ),
    # Show every column on a single page; keep the page size >= 1 so a table
    # with no columns still renders.
    reactable::reactable(col_summary,
                         defaultPageSize = max(nrow(col_summary), 1L))
  )
}
Vaccine Dataset Data Pipeline
Note: As of March 10, 2023, the source data is no longer updated. Therefore, this data pipeline is disabled.
This is the coronavirus R package data pipeline.
Functions
Parameters
# Location of the raw global vaccine time series (CSV) read by the pipeline.
url <- "https://raw.githubusercontent.com/govex/COVID-19/master/data_tables/vaccine_data/global_data/time_series_covid19_vaccine_global.csv"

# Name of the currently checked-out git branch; used further down to build
# the path of the branch-specific log file.
branch <- system(command = "git rev-parse --abbrev-ref HEAD", intern = TRUE)
Pulling raw data
# Start from NULL so a failed download can be detected afterwards with
# is.null(covid19_vaccine_temp).
covid19_vaccine <- covid19_vaccine_temp <- NULL

# Read the raw CSV with explicit column types and coerce it to a plain
# data.frame. On error the condition is reported via message() and
# covid19_vaccine_temp stays NULL.
tryCatch(
  covid19_vaccine_temp <- readr::read_csv(
    file = url,
    col_types = readr::cols(
      Date = readr::col_date(format = ""),
      UID = readr::col_number(),
      Province_State = readr::col_character(),
      Country_Region = readr::col_character(),
      Doses_admin = readr::col_integer(),
      People_at_least_one_dose = readr::col_number()
    )
  ) |>
    as.data.frame(),
  # `e` rather than `c`, so the handler argument does not shadow base::c()
  error = function(e) base::message(e)
)
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
dat <- vroom(...)
problems(dat)
# Sanity checks on the pulled data: stop the pipeline if the download failed,
# if the table has fewer than 132000 rows or not exactly 6 columns, or if the
# Date column was not parsed as a Date.
if (is.null(covid19_vaccine_temp)) {
  stop("Could not pull the covid19_vaccine_temp dataset, check the error")
} else if (nrow(covid19_vaccine_temp) < 132000 || ncol(covid19_vaccine_temp) != 6) {
  stop("The dimensions of the covid19_vaccine_temp dataset are invalid")
} else if (!inherits(covid19_vaccine_temp$Date, "Date")) {
  # inherits() is safe for objects with more than one class, unlike
  # `class(x) != "Date"`, which yields a vector condition in that case.
  stop("The class of the Date column is invalid")
}

# Lower-case the column names; the rest of the pipeline refers to them that way
names(covid19_vaccine_temp) <- tolower(names(covid19_vaccine_temp))

tbl_info(covid19_vaccine_temp)
Table Info - covid19_vaccine_temp
Number of columns: 6
Number of rows: 142597
Duplicated rows: 0
Merge with GIS codes
load("../data_raw/gis_mapping.RData")

# Removing duplicated iso codes for countries that located in two continents.
# Each entry below identifies the continent row to DROP, so the country keeps
# a single continent:
#   Azerbaijan -> Asia, Armenia -> Asia, Cyprus -> Europe, Georgia -> Asia,
#   Kazakhstan -> Asia, United States Minor Outlying Islands -> Oceania,
#   Russian Federation -> Europe, Turkey -> Asia
continent_rows_to_drop <- data.frame(
  iso2 = c("AZ", "AM", "CY", "GE", "KZ", "UM", "RU", "TR"),
  iso3 = c("AZE", "ARM", "CYP", "GEO", "KAZ", "UMI", "RUS", "TUR"),
  continent_name = c("Europe", "Europe", "Asia", "Europe", "Europe",
                     "North America", "Asia", "Europe"),
  stringsAsFactors = FALSE
)

for (i in seq_len(nrow(continent_rows_to_drop))) {
  is_dup <- continent_mapping$iso2 == continent_rows_to_drop$iso2[i] &
    continent_mapping$iso3 == continent_rows_to_drop$iso3[i] &
    continent_mapping$continent_name == continent_rows_to_drop$continent_name[i]
  # `%in% TRUE` treats NA comparisons as "not a match", so rows with missing
  # codes are kept. Unlike `x[-which(cond), ]`, a logical mask also leaves the
  # table intact when nothing matches (negating an empty index drops all rows).
  continent_mapping <- continent_mapping[!(is_dup %in% TRUE), ]
}
# Taiwan rows can arrive without a uid; assign 158 so they can be matched in
# the join on `uid` below.
covid19_vaccine_temp$uid <- ifelse(
  covid19_vaccine_temp$country_region == "Taiwan*" &
    is.na(covid19_vaccine_temp$uid),
  158,
  covid19_vaccine_temp$uid
)

# Attach the GIS codes (by uid) and then the continent (by uid/iso2/iso3),
# keeping only the columns of the final dataset.
covid19_vaccine <- covid19_vaccine_temp |>
  dplyr::left_join(
    gis_code_mapping |>
      dplyr::select(- country_region, - province_state),
    by = c("uid")
  ) |>
  dplyr::select(-admin2) |>
  # Set Namibia's iso2 to the literal string "NA" before joining on iso2
  dplyr::mutate(iso2 = ifelse(country_region == "Namibia", "NA", iso2)) |>
  dplyr::left_join(
    continent_mapping |>
      dplyr::select(-country_name),
    by = c("uid", "iso2", "iso3")
  ) |>
  dplyr::select(date, country_region, continent_name,
                continent_code, combined_key, doses_admin,
                people_at_least_one_dose, population, uid,
                iso2, iso3, code3, fips, lat, long)
# Count the rows with (TRUE) and without (FALSE) a missing continent name
table(is.na(covid19_vaccine[["continent_name"]]))
FALSE TRUE
138287 4310
# Rows that did not get a continent from the join, and the countries involved.
# is.na() never returns NA, so the logical mask can be used directly.
x <- covid19_vaccine[is.na(covid19_vaccine$continent_name), ]
unique(x$country_region)
[1] "World" "Sudan" "Kosovo" "Nauru" "Tonga" "Tuvalu"
# Fixing missing values - continent
# `%in%` is used instead of `==` so that rows with a missing country_region
# keep their current continent values (with `==` the condition is NA and
# ifelse() would overwrite them with NA).
is_oceania <- covid19_vaccine$country_region %in% c("Nauru", "Tonga", "Tuvalu")
covid19_vaccine$continent_code <- ifelse(is_oceania, "OC", covid19_vaccine$continent_code)
covid19_vaccine$continent_name <- ifelse(is_oceania, "Oceania", covid19_vaccine$continent_name)

is_sudan <- covid19_vaccine$country_region %in% "Sudan"
covid19_vaccine$continent_code <- ifelse(is_sudan, "AF", covid19_vaccine$continent_code)
covid19_vaccine$continent_name <- ifelse(is_sudan, "Africa", covid19_vaccine$continent_name)

is_kosovo <- covid19_vaccine$country_region %in% "Kosovo"
covid19_vaccine$continent_code <- ifelse(is_kosovo, "EU", covid19_vaccine$continent_code)
covid19_vaccine$continent_name <- ifelse(is_kosovo, "Europe", covid19_vaccine$continent_name)

tbl_info(covid19_vaccine)
Table Info - covid19_vaccine
Number of columns: 15
Number of rows: 142597
Duplicated rows: 0
Data validation
# Check if there are duplication in the US data
x <- covid19_vaccine |> dplyr::filter(iso2 == "US")

# Drop the "US (Aggregate)" rows only when the US data really has repeated
# dates and both labels are present. anyDuplicated() tests for duplicates
# directly; any(table(duplicated(...))) was TRUE for any non-empty table
# because it tested the counts, not the duplicated flags.
if (anyDuplicated(x$date) > 0 &&
    all(c("US", "US (Aggregate)") %in% unique(x$country_region))) {
  covid19_vaccine <- covid19_vaccine |>
    dplyr::filter(country_region != "US (Aggregate)")
}

# Load the pipeline log for the current branch (the code below expects it to
# provide an object named `log`) and show its most recent entries.
load(sprintf("../data_pipelines/log_%s.RData", branch))
tail(log)
time dataset nrows last_date update success
411 2023-03-19 16:24:57 covid19_vaccine 142597 2023-03-09 FALSE TRUE
412 2023-03-20 01:37:01 coronavirus 973836 2023-03-09 FALSE TRUE
413 2023-03-20 01:37:11 covid19_vaccine 142597 2023-03-09 FALSE TRUE
414 2023-03-20 08:25:07 coronavirus 973836 2023-03-09 FALSE TRUE
415 2023-03-20 08:25:16 covid19_vaccine 142597 2023-03-09 FALSE TRUE
416 2023-03-20 16:28:35 coronavirus 973836 2023-03-09 FALSE TRUE
backfile branch
411 FALSE main
412 FALSE main
413 FALSE main
414 FALSE main
415 FALSE main
416 FALSE main
# Most recent successful update of the vaccine dataset recorded in the log
log_last <- log |>
  dplyr::filter(update == TRUE & success == TRUE, dataset == "covid19_vaccine")
if (nrow(log_last) > 0) {
  # Guarded so max() is never called on an empty vector
  log_last <- log_last |>
    dplyr::filter(time == max(time))
}

# An update is needed when the pulled table has more rows than the last
# recorded update. With no previous update on record the data is treated as
# new; max() keeps the condition scalar if several entries share a timestamp.
has_update <- nrow(log_last) == 0 ||
  nrow(covid19_vaccine) > max(log_last$nrows)

if (has_update) {
  cat("Updating the vaccine data...")
  usethis::use_data(covid19_vaccine, overwrite = TRUE, compress = "xz")
  write.csv(covid19_vaccine, "../csv/covid19_vaccine.csv", row.names = FALSE)
}

# Record this run in the log whether or not the data was refreshed
log <- rbind(log, data.frame(time = Sys.time(),
                             dataset = "covid19_vaccine",
                             nrows = nrow(covid19_vaccine),
                             last_date = max(covid19_vaccine$date),
                             update = has_update,
                             success = TRUE,
                             backfile = FALSE,
                             branch = branch,
                             stringsAsFactors = FALSE))
save(log, file = sprintf("../data_pipelines/log_%s.RData", branch))

if (!has_update) {
  cat("No updates available...")
}
No updates available...