Coronavirus Dataset Data Pipeline
Note: As of March 10, 2023, the source data is no longer being updated. Therefore, this data pipeline is disabled.
This is the coronavirus R package data pipeline.
Functions
# Function for parsing the JHU COVID-19 cases CSV file
parse_url <- function(url, type){
  raw <- readr::read_csv(file = url) |>
    as.data.frame()
  # Transforming the data from wide to long
  # Creating a new data frame with the four ID columns (Province/State, Country/Region, Lat, Long)
  df <- raw[, 1:4]
  names(df) <- gsub(pattern = "/", replacement = ".", x = names(df))
  # Take the diff of consecutive date columns to convert cumulative counts into daily cases
  for(i in 5:ncol(raw)){
    raw[, i] <- as.integer(raw[, i])
    if(i == 5){
      df[[names(raw)[i]]] <- raw[, i]
    } else {
      df[[names(raw)[i]]] <- raw[, i] - raw[, i - 1]
    }
  }
  df1 <- df |>
    tidyr::pivot_longer(cols = dplyr::starts_with(c("1", "2", "3", "4", "5", "6", "7", "8", "9")),
                        names_to = "date_temp",
                        values_to = "cases_temp") |>
    dplyr::mutate(date = lubridate::mdy(date_temp)) |>
    dplyr::group_by(Province.State, Country.Region, Lat, Long, date) |>
    dplyr::summarise(cases = sum(cases_temp),
                     .groups = "drop") |>
    dplyr::ungroup() |>
    dplyr::mutate(type = type,
                  Country.Region = trimws(Country.Region),
                  Province.State = trimws(Province.State))
  # Recovery cases stopped being reported after 2021-08-04; mark later zero/negative values as NA
  if(type == "recovered"){
    df1$cases <- ifelse(df1$date > as.Date("2021-08-04") & df1$cases == 0 |
                          df1$date > as.Date("2021-08-04") & df1$cases < 0, NA, df1$cases)
  }
  return(df1)
}
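The loop above converts the cumulative counts reported by JHU into daily cases by differencing consecutive date columns (the first reporting date is kept as-is). A minimal illustration of that step on a made-up cumulative vector, not part of the pipeline:

# Toy cumulative series for five days (illustrative values only)
cumulative <- c(1, 3, 3, 7, 12)
# Keep the first day as-is, then take day-over-day differences
daily <- c(cumulative[1], diff(cumulative))
daily
# returns: 1 2 0 4 5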
# Get the table information
tbl_info <- function(input){
  obj.name <- base::deparse(base::substitute(input))
  input <- as.data.frame(input)
  # Count duplicated rows
  dup <- sum(duplicated(input))
  # Per-column summary: class, number of NAs, min/max (numeric columns only), and unique values
  df <- data.frame(cols_name = names(input),
                   cols_class = lapply(input, class) |> unlist(),
                   cols_NAs = lapply(names(input), function(i){sum(is.na(input[, i]))}) |> unlist(),
                   cols_min = lapply(names(input), function(i){if(is.numeric(input[, i])){
                     min(input[, i], na.rm = TRUE)
                   } else {
                     NA
                   }}) |> unlist(),
                   cols_max = lapply(names(input), function(i){if(is.numeric(input[, i])){
                     max(input[, i], na.rm = TRUE)
                   } else {
                     NA
                   }}) |> unlist(),
                   cols_unique = lapply(names(input), function(i){length(unique(input[, i]))}) |> unlist(),
                   stringsAsFactors = FALSE)
  rownames(df) <- NULL
  # Wrap the table dimensions and the column summary in an HTML widget
  t <- htmltools::div(class = "tbl-info",
                      htmltools::div(class = "tbl-info",
                                     htmltools::h4(class = "tbl-info",
                                                   paste("Table Info - ", obj.name)),
                                     paste("Number of columns:", ncol(input)),
                                     htmltools::br(),
                                     paste("Number of rows:", nrow(input)),
                                     htmltools::br(),
                                     paste("Duplicated rows:", dup)
                      ),
                      reactable::reactable(df, defaultPageSize = nrow(df)))
  return(t)
}
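tbl_info() returns an htmltools tag object, so it renders as a short summary plus a reactable table when the report is knitted (or in the RStudio Viewer interactively). A minimal usage sketch on a built-in dataset; the call below is illustrative and not part of the pipeline:

# Summarize a small built-in data frame (illustrative only)
tbl_info(head(iris, 20))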
Parameters
# Set success flag
s <- TRUE
msg <- NULL
# Confirmed cases
conf_url <- "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
conf_df <- NULL
# Death cases
death_url <- "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv"
death_df <- NULL
# Recovery cases
rec_url <- "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv"
rec_df <- NULL
Confirmed Cases
conf_df <- parse_url(url = conf_url, type = "confirmed")
Rows: 289 Columns: 1147
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): Province/State, Country/Region
dbl (1145): Lat, Long, 1/22/20, 1/23/20, 1/24/20, 1/25/20, 1/26/20, 1/27/20,...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
if(is.null(conf_df) || !is.data.frame(conf_df) || nrow(conf_df) == 0){
  s <- FALSE
} else {
  tbl_info(conf_df)
}
Table Info - conf_df
Number of columns: 7
Number of rows: 330327
Duplicated rows: 0
Death Cases
death_df <- parse_url(url = death_url, type = "death")
Rows: 289 Columns: 1147
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): Province/State, Country/Region
dbl (1145): Lat, Long, 1/22/20, 1/23/20, 1/24/20, 1/25/20, 1/26/20, 1/27/20,...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
if(is.null(death_df) || !is.data.frame(death_df) || nrow(death_df) == 0){
  s <- FALSE
} else {
  tbl_info(death_df)
}
Table Info - death_df
Number of columns: 7
Number of rows: 330327
Duplicated rows: 0
Recovery Cases
rec_df <- parse_url(url = rec_url, type = "recovery")
Rows: 274 Columns: 1147
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): Province/State, Country/Region
dbl (1145): Lat, Long, 1/22/20, 1/23/20, 1/24/20, 1/25/20, 1/26/20, 1/27/20,...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Setting the first day with no recovery cases reported as zero
rec_df$cases <- ifelse(rec_df$date == as.Date("2021-08-05"), 0, rec_df$cases)
if(is.null(rec_df) || !is.data.frame(rec_df) || nrow(rec_df) == 0){
  s <- FALSE
} else {
  tbl_info(rec_df)
}
Table Info - rec_df
Number of columns: 7
Number of rows: 313182
Duplicated rows: 0
Append the data
coronavirus_temp <- dplyr::bind_rows(conf_df, death_df, rec_df) |>
  dplyr::select(date, province = Province.State, country = Country.Region,
                lat = Lat, long = Long, type, cases) |>
  as.data.frame()
tbl_info(coronavirus_temp)
Table Info - coronavirus_temp
Number of columns: 7
Number of rows: 973836
Duplicated rows: 0
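Since dplyr::bind_rows() stacks the three inputs, the appended table should have exactly the sum of their row counts (330327 + 330327 + 313182 = 973836). A small optional check in the same spirit as the validation section below; it is an addition for illustration, not part of the original pipeline:

# Optional sanity check: the appended table should contain all rows of the three inputs
if(nrow(coronavirus_temp) != nrow(conf_df) + nrow(death_df) + nrow(rec_df)){
  s <- FALSE
  msg <- c(msg, "Append fail - coronavirus_temp does not match the sum of the three inputs")
}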
Add GIS codes
load("../data_raw/gis_mapping.RData")
# Namibia's ISO2 code is the literal string "NA", so set it explicitly
gis_codes_coronavirues$iso2[which(gis_codes_coronavirues$country_region == "Namibia")] <- "NA"
# Removing duplicated iso codes for countries that are located in two continents
# Azerbaijan -> Asia
continent_mapping <- continent_mapping[- which(continent_mapping$iso2 == "AZ" &
                                                 continent_mapping$iso3 == "AZE" &
                                                 continent_mapping$continent_name == "Europe"),]
# Armenia -> Asia
continent_mapping <- continent_mapping[- which(continent_mapping$iso2 == "AM" &
                                                 continent_mapping$iso3 == "ARM" &
                                                 continent_mapping$continent_name == "Europe"),]
# Cyprus -> Europe
continent_mapping <- continent_mapping[- which(continent_mapping$iso2 == "CY" &
                                                 continent_mapping$iso3 == "CYP" &
                                                 continent_mapping$continent_name == "Asia"),]
# Georgia -> Asia
continent_mapping <- continent_mapping[- which(continent_mapping$iso2 == "GE" &
                                                 continent_mapping$iso3 == "GEO" &
                                                 continent_mapping$continent_name == "Europe"),]
# Kazakhstan -> Asia
continent_mapping <- continent_mapping[- which(continent_mapping$iso2 == "KZ" &
                                                 continent_mapping$iso3 == "KAZ" &
                                                 continent_mapping$continent_name == "Europe"),]
# United States Minor Outlying Islands -> Oceania
continent_mapping <- continent_mapping[- which(continent_mapping$iso2 == "UM" &
                                                 continent_mapping$iso3 == "UMI" &
                                                 continent_mapping$continent_name == "North America"),]
# Russian Federation -> Europe
continent_mapping <- continent_mapping[- which(continent_mapping$iso2 == "RU" &
                                                 continent_mapping$iso3 == "RUS" &
                                                 continent_mapping$continent_name == "Asia"),]
# Turkey -> Asia
continent_mapping <- continent_mapping[- which(continent_mapping$iso2 == "TR" &
                                                 continent_mapping$iso3 == "TUR" &
                                                 continent_mapping$continent_name == "Europe"),]
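The eight removals above all follow the same pattern, so they could equivalently be expressed as a single anti-join against a small lookup table of the unwanted (iso2, continent_name) pairs. A sketch of that alternative; the drop_continent table is illustrative and not part of the original pipeline:

# Hypothetical lookup of the continent rows to drop for dual-continent countries
drop_continent <- data.frame(
  iso2 = c("AZ", "AM", "CY", "GE", "KZ", "UM", "RU", "TR"),
  continent_name = c("Europe", "Europe", "Asia", "Europe", "Europe",
                     "North America", "Asia", "Europe"),
  stringsAsFactors = FALSE
)
# Keep only the rows of continent_mapping that do not match a drop rule
continent_mapping <- dplyr::anti_join(continent_mapping, drop_continent,
                                      by = c("iso2", "continent_name"))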
coronavirus <- coronavirus_temp |>
  dplyr::left_join(gis_codes_coronavirues |>
                     dplyr::select(-lat, -long) |>
                     dplyr::select(province = province_state, country = country_region,
                                   dplyr::everything()),
                   by = c("province", "country")) |>
  dplyr::left_join(continent_mapping |>
                     dplyr::select(iso2, iso3, continent_name, continent_code),
                   by = c("iso2", "iso3")) |>
  dplyr::mutate(continent_name = ifelse(country == "Kosovo", "Europe", continent_name),
                continent_code = ifelse(country == "Kosovo", "EU", continent_code)) |>
  dplyr::select(-admin2, -fips)
tbl_info(coronavirus)
Table Info - coronavirus
Number of columns: 15
Number of rows: 973836
Duplicated rows: 0
Split the data by year
coronavirus$year <- lubridate::year(coronavirus$date)
table(coronavirus$year)
2020 2021 2022 2023
293940 310980 310980 57936
coronavirus_2020 <- coronavirus |>
  dplyr::filter(year == 2020) |>
  dplyr::select(-year)

coronavirus_2021 <- coronavirus |>
  dplyr::filter(year == 2021) |>
  dplyr::select(-year)

coronavirus_2022 <- coronavirus |>
  dplyr::filter(year == 2022) |>
  dplyr::select(-year)
coronavirus_2023 <- coronavirus |>
  dplyr::filter(year == 2023) |>
  dplyr::select(-year)

coronavirus <- coronavirus |>
  dplyr::select(-year)
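The four yearly subsets above repeat the same filter/select pattern; an equivalent way to produce them is to split the table by year in one pass. A sketch under the assumption that a named list of yearly data frames is acceptable; the by_year object is illustrative and not part of the original pipeline:

# Run in place of the chunk above, before the helper `year` column is dropped
# Split into a named list ("2020", "2021", "2022", "2023"), removing the helper column
by_year <- lapply(split(coronavirus, coronavirus$year),
                  function(x) dplyr::select(x, -year))
# e.g. by_year[["2023"]] is equivalent to coronavirus_2023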
Data validation
# Checking merge
if(nrow(coronavirus) != nrow(coronavirus_temp)){
  s <- FALSE
  msg <- c(msg, "Merge fail - the number of rows of the coronavirus_temp is different from the coronavirus")
}
# Checking the table dimensions
if(nrow(coronavirus) < 900000){
  s <- FALSE
  msg <- c(msg, "The number of rows of the coronavirus table is too small")
}
if(ncol(coronavirus) != 15){
  s <- FALSE
  msg <- c(msg, "The number of columns of the coronavirus table is invalid")
}
if(min(coronavirus$date) != as.Date("2020-01-22")){
  s <- FALSE
  msg <- c(msg, "The starting date is invalid")
}
if(nrow(coronavirus_2020) != 293940){
  s <- FALSE
  msg <- c(msg, "The number of rows of the 2020 data is not valid")
}
if(nrow(coronavirus_2021) != 310980){
  s <- FALSE
  msg <- c(msg, "The number of rows of the 2021 data is not valid")
}
if(nrow(coronavirus_2022) != 310980){
  s <- FALSE
  msg <- c(msg, "The number of rows of the 2022 data is not valid")
}
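The checks above repeat the same flag-and-message pattern; a hypothetical refactor wraps it in a small helper so each rule is a single call. The check() function below is illustrative and not part of the original pipeline:

# Hypothetical helper: record a failure flag and message when a condition does not hold
check <- function(condition, failure_msg){
  if(!condition){
    s <<- FALSE
    msg <<- c(msg, failure_msg)
  }
  invisible(condition)
}

check(ncol(coronavirus) == 15, "The number of columns of the coronavirus table is invalid")
check(min(coronavirus$date) == as.Date("2020-01-22"), "The starting date is invalid")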
Saving the data
branch <- system(command = "git rev-parse --abbrev-ref HEAD", intern = TRUE)
load(sprintf("../data_pipelines/log_%s.RData", branch))
tail(log)
                   time         dataset  nrows  last_date update success backfile branch
410 2023-03-19 16:24:45     coronavirus 973836 2023-03-09  FALSE    TRUE    FALSE   main
411 2023-03-19 16:24:57 covid19_vaccine 142597 2023-03-09  FALSE    TRUE    FALSE   main
412 2023-03-20 01:37:01     coronavirus 973836 2023-03-09  FALSE    TRUE    FALSE   main
413 2023-03-20 01:37:11 covid19_vaccine 142597 2023-03-09  FALSE    TRUE    FALSE   main
414 2023-03-20 08:25:07     coronavirus 973836 2023-03-09  FALSE    TRUE    FALSE   main
415 2023-03-20 08:25:16 covid19_vaccine 142597 2023-03-09  FALSE    TRUE    FALSE   main
log_last <- log |>
  dplyr::filter(update == TRUE & success == TRUE, dataset == "coronavirus") |>
  dplyr::filter(time == max(time))

if(s && nrow(coronavirus) > log_last$nrows){
  log <- rbind(log, data.frame(time = Sys.time(),
                               dataset = "coronavirus",
                               nrows = nrow(coronavirus),
                               last_date = max(coronavirus$date),
                               update = TRUE,
                               success = TRUE,
                               backfile = FALSE,
                               branch = branch,
                               stringsAsFactors = FALSE))
  save(log, file = sprintf("../data_pipelines/log_%s.RData", branch))
  usethis::use_data(coronavirus, overwrite = TRUE, compress = "xz")
  # Save most recent year
  # write.csv(coronavirus_2020, "../csv/coronavirus_2020.csv", row.names = FALSE)
  # write.csv(coronavirus_2021, "../csv/coronavirus_2021.csv", row.names = FALSE)
  # write.csv(coronavirus_2022, "../csv/coronavirus_2022.csv", row.names = FALSE)
  write.csv(coronavirus_2023, "../csv/coronavirus_2023.csv", row.names = FALSE)
} else if(s && nrow(coronavirus) == log_last$nrows){
  cat("No updates available \n")
  log <- rbind(log, data.frame(time = Sys.time(),
                               dataset = "coronavirus",
                               nrows = nrow(coronavirus),
                               last_date = max(coronavirus$date),
                               update = FALSE,
                               success = TRUE,
                               backfile = FALSE,
                               branch = branch,
                               stringsAsFactors = FALSE))
  save(log, file = sprintf("../data_pipelines/log_%s.RData", branch))
} else if(s && nrow(coronavirus) < log_last$nrows){
  cat("Something went wrong, the number of rows of the new object is less than the previous object \n")
  log <- rbind(log, data.frame(time = Sys.time(),
                               dataset = "coronavirus",
                               nrows = nrow(coronavirus),
                               last_date = max(coronavirus$date),
                               update = FALSE,
                               success = FALSE,
                               backfile = FALSE,
                               branch = branch,
                               stringsAsFactors = FALSE))
  save(log, file = sprintf("../data_pipelines/log_%s.RData", branch))
} else if(!s){
  cat(msg, sep = "\n")
}
No updates available