# Function for parsing the JHU COVID-19 cases CSV files
parse_url <- function(url, type){
  raw <- readr::read_csv(file = url) |>
    as.data.frame()
  # Transforming the data from wide to long
  # Creating a new data frame with the metadata columns
  df <- raw[, 1:4]
  names(df) <- gsub(pattern = "/", replacement = ".", x = names(df))
  # Take the diff to convert cumulative counts into daily cases
  for(i in 5:ncol(raw)){
    raw[, i] <- as.integer(raw[, i])
    if(i == 5){
      df[[names(raw)[i]]] <- raw[, i]
    } else {
      df[[names(raw)[i]]] <- raw[, i] - raw[, i - 1]
    }
  }
  df1 <- df |>
    tidyr::pivot_longer(cols = dplyr::starts_with(c("1", "2", "3", "4", "5", "6", "7", "8", "9")),
                        names_to = "date_temp",
                        values_to = "cases_temp") |>
    dplyr::mutate(date = lubridate::mdy(date_temp)) |>
    dplyr::group_by(Province.State, Country.Region, Lat, Long, date) |>
    dplyr::summarise(cases = sum(cases_temp),
                     .groups = "drop") |>
    dplyr::ungroup() |>
    dplyr::mutate(type = type,
                  Country.Region = trimws(Country.Region),
                  Province.State = trimws(Province.State))
  if(type == "recovery"){
    # After 2021-08-04 the source stopped reporting recoveries; flag zero or
    # negative values from that point on as missing
    df1$cases <- ifelse(df1$date > as.Date("2021-08-04") & df1$cases <= 0,
                        NA, df1$cases)
  }
  return(df1)
}
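To make the transformation concrete, here is a minimal sketch (illustration only, not part of the pipeline) that feeds parse_url() an in-memory CSV in the JHU wide format; the toy values and the "Toyland" name are made up, and passing a literal string assumes readr >= 2.0, where I() marks the string as data rather than a path.
# Illustration only: a toy cumulative series 1, 3, 6 becomes daily cases 1, 2, 3
toy_csv <- I("Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20
,Toyland,10,20,1,3,6")
parse_url(url = toy_csv, type = "confirmed")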
# Get the table information
tbl_info <- function(input){
  obj.name <- base::deparse(base::substitute(input))
  input <- as.data.frame(input)
  dup <- sum(duplicated(input))
  df <- data.frame(cols_name = names(input),
                   cols_class = lapply(input, class) |> unlist(),
                   cols_NAs = lapply(names(input), function(i){
                     sum(is.na(input[, i]))
                   }) |> unlist(),
                   cols_min = lapply(names(input), function(i){
                     if(is.numeric(input[, i])){
                       min(input[, i], na.rm = TRUE)
                     } else {
                       NA
                     }
                   }) |> unlist(),
                   cols_max = lapply(names(input), function(i){
                     if(is.numeric(input[, i])){
                       max(input[, i], na.rm = TRUE)
                     } else {
                       NA
                     }
                   }) |> unlist(),
                   cols_unique = lapply(names(input), function(i){
                     length(unique(input[, i]))
                   }) |> unlist(),
                   stringsAsFactors = FALSE)
  rownames(df) <- NULL
  t <- htmltools::div(class = "tbl-info",
                      htmltools::div(class = "tbl-info",
                                     htmltools::h4(class = "tbl-info",
                                                   paste("Table Info - ", obj.name)),
                                     paste("Number of columns:", ncol(input)),
                                     htmltools::br(),
                                     paste("Number of rows:", nrow(input)),
                                     htmltools::br(),
                                     paste("Duplicated rows:", dup)
                      ),
                      reactable::reactable(df, defaultPageSize = nrow(df)))
  return(t)
}

Coronavirus Dataset Data Pipeline

Note: As of March 10, 2023, the source data are no longer being updated. Therefore, this data pipeline is disabled.

This is the coronavirus R package data pipeline.
Functions

The pipeline relies on two helper functions, defined above: parse_url() reads one of the JHU time-series CSV files and reshapes it from wide cumulative counts into long-format daily cases, and tbl_info() prints a short structural summary of a table (column classes, missing values, ranges, and duplicates).
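As a quick illustration only (not something the pipeline itself runs), tbl_info() can be pointed at any data frame, for example a built-in one:
# Illustration only: structural summary of a built-in data set
tbl_info(iris)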
Parameters
# Set success flag
s <- TRUE
msg <- NULL
# Confirmed cases
conf_url <- "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
conf_df <- NULL
# Death cases
death_url <- "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv"
death_df <- NULL
# Recovery cases
rec_url <- "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv"
rec_df <- NULL

Confirmed Cases
conf_df <- parse_url(url = conf_url, type = "confirmed")

Rows: 289 Columns: 1147
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): Province/State, Country/Region
dbl (1145): Lat, Long, 1/22/20, 1/23/20, 1/24/20, 1/25/20, 1/26/20, 1/27/20,...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
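The column-specification banner above is informational. If it is unwanted, readr::read_csv() accepts a show_col_types = FALSE argument; a hypothetical tweak to the call inside parse_url(), not applied in this pipeline, would look like:
# Hypothetical (not applied here): silence readr's column-type message
raw <- readr::read_csv(file = url, show_col_types = FALSE) |>
  as.data.frame()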
if(is.null(conf_df) || !is.data.frame(conf_df) || nrow(conf_df) == 0){
  s <- FALSE
} else {
  tbl_info(conf_df)
}

Table Info - conf_df
Number of columns: 7
Number of rows: 330327
Duplicated rows: 0
Death Cases
death_df <- parse_url(url = death_url, type = "death")

Rows: 289 Columns: 1147
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): Province/State, Country/Region
dbl (1145): Lat, Long, 1/22/20, 1/23/20, 1/24/20, 1/25/20, 1/26/20, 1/27/20,...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
if(is.null(death_df) || !is.data.frame(death_df) || nrow(death_df) == 0){
  s <- FALSE
} else {
  tbl_info(death_df)
}

Table Info - death_df
Number of columns: 7
Number of rows: 330327
Duplicated rows: 0
Recovery Cases
rec_df <- parse_url(url = rec_url, type = "recovery")

Rows: 274 Columns: 1147
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): Province/State, Country/Region
dbl (1145): Lat, Long, 1/22/20, 1/23/20, 1/24/20, 1/25/20, 1/26/20, 1/27/20,...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Set the first day on which no recovery cases were reported (2021-08-05) to zero
rec_df$cases <- ifelse(rec_df$date == as.Date("2021-08-05"), 0, rec_df$cases)
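A quick sanity check (illustration only) of how the recovery values after the 2021-08-04 reporting cut-off are distributed, i.e. the zeros set above for 2021-08-05 plus the values flagged as missing in parse_url():
# Illustration only: distribution of recovery values after the 2021-08-04 cut-off
table(rec_df$cases[rec_df$date > as.Date("2021-08-04")], useNA = "ifany")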
if(is.null(rec_df) || !is.data.frame(rec_df) || nrow(rec_df) == 0){
  s <- FALSE
} else {
  tbl_info(rec_df)
}

Table Info - rec_df
Number of columns: 7
Number of rows: 313182
Duplicated rows: 0
Append the data
coronavirus_temp <- dplyr::bind_rows(conf_df, death_df, rec_df) |>
  dplyr::select(date, province = Province.State, country = Country.Region,
                lat = Lat, long = Long, type, cases) |>
  as.data.frame()
tbl_info(coronavirus_temp)

Table Info - coronavirus_temp
Number of columns: 7
Number of rows: 973836
Duplicated rows: 0
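As a cross-check (illustration only), the appended table should hold exactly the rows of the three inputs combined:
# Illustration only: 330327 + 330327 + 313182 = 973836
nrow(conf_df) + nrow(death_df) + nrow(rec_df) == nrow(coronavirus_temp)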
Add GIS codes
load("../data_raw/gis_mapping.RData")
gis_codes_coronavirues$iso2[which(gis_codes_coronavirues$country_region == "Namibia")] <- "NA"
# Removing duplicated ISO codes for countries located in two continents
# Azerbaijan -> Asia
continent_mapping <- continent_mapping[-which(continent_mapping$iso2 == "AZ" &
                                                continent_mapping$iso3 == "AZE" &
                                                continent_mapping$continent_name == "Europe"), ]
# Armenia -> Asia
continent_mapping <- continent_mapping[-which(continent_mapping$iso2 == "AM" &
                                                continent_mapping$iso3 == "ARM" &
                                                continent_mapping$continent_name == "Europe"), ]
# Cyprus -> Europe
continent_mapping <- continent_mapping[-which(continent_mapping$iso2 == "CY" &
                                                continent_mapping$iso3 == "CYP" &
                                                continent_mapping$continent_name == "Asia"), ]
# Georgia -> Asia
continent_mapping <- continent_mapping[-which(continent_mapping$iso2 == "GE" &
                                                continent_mapping$iso3 == "GEO" &
                                                continent_mapping$continent_name == "Europe"), ]
# Kazakhstan -> Asia
continent_mapping <- continent_mapping[-which(continent_mapping$iso2 == "KZ" &
                                                continent_mapping$iso3 == "KAZ" &
                                                continent_mapping$continent_name == "Europe"), ]
# United States Minor Outlying Islands -> Oceania
continent_mapping <- continent_mapping[-which(continent_mapping$iso2 == "UM" &
                                                continent_mapping$iso3 == "UMI" &
                                                continent_mapping$continent_name == "North America"), ]
# Russian Federation -> Europe
continent_mapping <- continent_mapping[-which(continent_mapping$iso2 == "RU" &
                                                continent_mapping$iso3 == "RUS" &
                                                continent_mapping$continent_name == "Asia"), ]
# Turkey -> Asia
continent_mapping <- continent_mapping[-which(continent_mapping$iso2 == "TR" &
                                                continent_mapping$iso3 == "TUR" &
                                                continent_mapping$continent_name == "Europe"), ]
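A possible check (illustration only) that the removals above leave a single continent per ISO code pair, assuming the eight countries handled above were the only transcontinental duplicates:
# Illustration only: should return FALSE if every iso2/iso3 pair is now unique
any(duplicated(continent_mapping[, c("iso2", "iso3")]))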
coronavirus <- coronavirus_temp |>
  dplyr::left_join(gis_codes_coronavirues |>
                     dplyr::select(-lat, -long) |>
                     dplyr::select(province = province_state, country = country_region,
                                   dplyr::everything()),
                   by = c("province", "country")) |>
  dplyr::left_join(continent_mapping |>
                     dplyr::select(iso2, iso3, continent_name, continent_code),
                   by = c("iso2", "iso3")) |>
  dplyr::mutate(continent_name = ifelse(country == "Kosovo", "Europe", continent_name),
                continent_code = ifelse(country == "Kosovo", "EU", continent_code)) |>
  dplyr::select(-admin2, -fips)
tbl_info(coronavirus)

Table Info - coronavirus
Number of columns: 15
Number of rows: 973836
Duplicated rows: 0
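A possible follow-up check (illustration only): list any countries left without a continent after the joins (Kosovo is already patched explicitly above):
# Illustration only: countries with no continent match after the joins
unique(coronavirus$country[is.na(coronavirus$continent_name)])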
Split the data by year
coronavirus$year <- lubridate::year(coronavirus$date)
table(coronavirus$year)
2020 2021 2022 2023
293940 310980 310980 57936
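The yearly counts above add back up to the full table (illustration only):
# Illustration only: 293940 + 310980 + 310980 + 57936 = 973836
sum(table(coronavirus$year)) == nrow(coronavirus)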
coronavirus_2020 <- coronavirus |>
  dplyr::filter(year == 2020) |>
  dplyr::select(-year)
coronavirus_2021 <- coronavirus |>
  dplyr::filter(year == 2021) |>
  dplyr::select(-year)
coronavirus_2022 <- coronavirus |>
  dplyr::filter(year == 2022) |>
  dplyr::select(-year)
coronavirus_2023 <- coronavirus |>
  dplyr::filter(year == 2023) |>
  dplyr::select(-year)
coronavirus <- coronavirus |>
  dplyr::select(-year)

Data validation
# Checking the merge
if(nrow(coronavirus) != nrow(coronavirus_temp)){
  s <- FALSE
  msg <- c(msg, "Merge failed - the number of rows of the coronavirus table differs from coronavirus_temp")
}
# Checking the table dimensions
if(nrow(coronavirus) < 900000){
  s <- FALSE
  msg <- c(msg, "The number of rows of the coronavirus table is too small")
}
if(ncol(coronavirus) != 15){
  s <- FALSE
  msg <- c(msg, "The number of columns of the coronavirus table is invalid")
}
if(min(coronavirus$date) != as.Date("2020-01-22")){
  s <- FALSE
  msg <- c(msg, "The starting date is invalid")
}
if(nrow(coronavirus_2020) != 293940){
  s <- FALSE
  msg <- c(msg, "The number of rows of the 2020 data is not valid")
}
if(nrow(coronavirus_2021) != 310980){
  s <- FALSE
  msg <- c(msg, "The number of rows of the 2021 data is not valid")
}
if(nrow(coronavirus_2022) != 310980){
  s <- FALSE
  msg <- c(msg, "The number of rows of the 2022 data is not valid")
}

Saving the data
branch <- system(command = "git rev-parse --abbrev-ref HEAD", intern = TRUE)
load(sprintf("../data_pipelines/log_%s.RData", branch))
tail(log)

                   time         dataset  nrows  last_date update success backfile branch
410 2023-03-19 16:24:45     coronavirus 973836 2023-03-09  FALSE    TRUE    FALSE   main
411 2023-03-19 16:24:57 covid19_vaccine 142597 2023-03-09  FALSE    TRUE    FALSE   main
412 2023-03-20 01:37:01     coronavirus 973836 2023-03-09  FALSE    TRUE    FALSE   main
413 2023-03-20 01:37:11 covid19_vaccine 142597 2023-03-09  FALSE    TRUE    FALSE   main
414 2023-03-20 08:25:07     coronavirus 973836 2023-03-09  FALSE    TRUE    FALSE   main
415 2023-03-20 08:25:16 covid19_vaccine 142597 2023-03-09  FALSE    TRUE    FALSE   main
log_last <- log |>
  dplyr::filter(update == TRUE & success == TRUE, dataset == "coronavirus") |>
  dplyr::filter(time == max(time))
if(s && nrow(coronavirus) > log_last$nrows){
  log <- rbind(log, data.frame(time = Sys.time(),
                               dataset = "coronavirus",
                               nrows = nrow(coronavirus),
                               last_date = max(coronavirus$date),
                               update = TRUE,
                               success = TRUE,
                               backfile = FALSE,
                               branch = branch,
                               stringsAsFactors = FALSE))
  save(log, file = sprintf("../data_pipelines/log_%s.RData", branch))
  usethis::use_data(coronavirus, overwrite = TRUE, compress = "xz")
  # Save the most recent year
  # write.csv(coronavirus_2020, "../csv/coronavirus_2020.csv", row.names = FALSE)
  # write.csv(coronavirus_2021, "../csv/coronavirus_2021.csv", row.names = FALSE)
  # write.csv(coronavirus_2022, "../csv/coronavirus_2022.csv", row.names = FALSE)
  write.csv(coronavirus_2023, "../csv/coronavirus_2023.csv", row.names = FALSE)
} else if(s && nrow(coronavirus) == log_last$nrows){
  cat("No updates available \n")
  log <- rbind(log, data.frame(time = Sys.time(),
                               dataset = "coronavirus",
                               nrows = nrow(coronavirus),
                               last_date = max(coronavirus$date),
                               update = FALSE,
                               success = TRUE,
                               backfile = FALSE,
                               branch = branch,
                               stringsAsFactors = FALSE))
  save(log, file = sprintf("../data_pipelines/log_%s.RData", branch))
} else if(s && nrow(coronavirus) < log_last$nrows){
  cat("Something went wrong - the new object has fewer rows than the previous one \n")
  log <- rbind(log, data.frame(time = Sys.time(),
                               dataset = "coronavirus",
                               nrows = nrow(coronavirus),
                               last_date = max(coronavirus$date),
                               update = FALSE,
                               success = FALSE,
                               backfile = FALSE,
                               branch = branch,
                               stringsAsFactors = FALSE))
  save(log, file = sprintf("../data_pipelines/log_%s.RData", branch))
} else if(!s){
  cat(msg, sep = "\n")
}

No updates available 

The run ends in the "No updates available" branch because the rebuilt table has the same number of rows as the last successful update recorded in the log; with the source data frozen as of March 10, 2023, this is the expected outcome.
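For context (illustration only, assuming the released coronavirus package is installed): when an update does go through, usethis::use_data() above writes the object into the package's data/ directory, and end users then load it in the usual way:
# Illustration only: how package users access the saved dataset
library(coronavirus)
data("coronavirus")
head(coronavirus)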