Wooded Paths: Mapping the World of COVID-19

A few days ago I was looking at the COVID-19 data at https://www.worldometers.info/coronavirus/

The website has a table showing data on cases, deaths, recoveries and testing for almost every country in the world. You can sort the table by any of its columns, but I wanted to be able to see geographic patterns by looking at a map. So, I decided to practice my R skills by writing code to webscrape the table data and produce a world choropleth map with colors coded to per capita death rates. (Click on map for higher resolution.)

Here is the R code. I hope it is sufficiently documented to follow, and also that you can see how to modify it to make a choropleth from just about any website's table of country-level data:

# Webscrape COVID

# David Pittelli -- woodedpaths.blogspot.com

# Some code from SNHU IT 697 II

# http://selectorgadget.com/ will give CSS codes such as 'td' for table data.

# Data source: https://www.worldometers.info/coronavirus/

#install.packages('rvest')

#install.packages'rworldmap')

# Load functions************

library(rvest)

library(rworldmap)

setwd("C:/Users/David/Documents/2020 Choropleth")

rm(list=ls())

#scrape table data************

url='https://www.worldometers.info/coronavirus/'

d=read_html(url)

#extract the table data ('td' in html) ************

scraped_td = html_nodes(d, 'td')

# convert the html data to text ************

Tab = html_text(scraped_td)

# extract table head and convert to text for column names ************

# scraped_th = html_nodes(d, 'th')

# TabHead = html_text(scraped_th)

# Remove non-country leading info. Start with the "1" followed by the Country with most Deaths***

first = min(grep("^1$", Tab)) # The first "1" as whole string in Tab is probably where we want to start

# Use anchors. ^ is the start. $ is the end.

# The start position was 115, then went to 153 on or slightly before 2020 06 01 run.

last = min(grep("Total", Tab)) - 2

Tab2 = Tab[first:last]

#build matrix of data, then convert to data frame************

df=data.frame(matrix(Tab2, ncol=19,byrow=T), stringsAsFactors=FALSE)

df=df[, c(2:16)]

colnames(df) = c("Country", "TotalCases", "NewCases", "TotalDeaths", "NewDeaths", "TotalRecovered", "NewRecovered", "ActiveCases", "CriticalCases", "CPM", "DPM", "TotalTests", "TPM", "Population", "Region")

# Clear out unneeded variables************

# rm(list=setdiff(ls(), "df"))

# Fix countries with abbreviated names************

df[df$Country == "S. Korea", "Country"] = "South Korea"

df[df$Country == "UAE", "Country"] = "United Arab Emirates"

df[df$Country == "DRC", "Country"] = "Democratic Republic of the Congo"

df[df$Country == "CAR", "Country"] = "Central African Republic"

df[df$Country == "Czechia", "Country"] = "Czech Republic"

df[df$Country == "North Macedonia", "Country"] = "Macedonia"

df[df$Country == "Eswatini", "Country"] = "Swaziland"

# Add countries with missing data************

# df = rbind(df, c("Turkmenistan", rep(0, 9), NA, rep(0, 4))) # claims no cases, obviously untrue

# df = rbind(df, c("North Korea", rep(0, 15))) # claims no cases, obviously untrue

df = rbind(df, c("Antarctica", rep(0, 15)))

df = rbind(df, c("Kosovo", rep(0, 9), 66/1.845, 0, 0, 1845000, 'Europe')) # deaths / millions 7/4/20

# source: https://en.wikipedia.org/wiki/COVID-19_pandemic_in_Kosovo#Cases

# df = rbind(df, c("Kazakhstan", rep(0, 15))) # they got data by July

# df = rbind(df, c("Tajikistan", rep(0, 15))) # they got data by July

# Remove commas from text-numbers and change from chr to numeric************

for (i in 2:14) {

df[ , i] = as.numeric(gsub(",","",df[ , i]))

}

# Change NA values to 0

df[is.na(df$DPM), "DPM"] = 0

df[is.na(df$CPM), "CPM"] = 0

# Find highest DPM for country with Population > 1 million

# Only San Marino has higher DPM (as of 4/24/2020)

hmax = max(df[df$Population > 1000000 & !is.na(df$Population) & !is.na(df$DPM), "DPM"])

# Save with a file name including today's date************

today = paste(substr(date(), 5, 10), substr(date(), 21, 24))

filename = paste(today, "COVID table.csv")

write.csv(df, filename)

#join data to a map************

COVIDMap = joinCountryData2Map(df,

nameJoinColumn="Country",

joinCode="NAME",

verbose=TRUE)

# plot the map************

colorbreaks = c(0, 0.01, 0.1, 5, hmax/32, hmax/16, hmax/8, hmax/4, hmax/2, hmax/2 + 0.01, hmax/2 + 0.02, hmax)

mapCountryData(COVIDMap,

nameColumnToPlot='DPM',

catMethod = colorbreaks,

colourPalette = "heat",

oceanCol = "light sky blue",

missingCountryCol = "dimgrey",

borderCol = "black",

mapTitle = paste("COVID-19 Deaths per Million People,", today))

# Add labels, note that lat/lon is set to erroneous number to be in right place for zoomed graphic

text(x=155, y=45, label="?", col = "dark red", cex = 2)

text(x=100,y=-90, label="David Pittelli -- woodedpaths.blogspot.com", col = "black", cex = 0.8)

Wooded Paths

Friday, April 24, 2020

Mapping the World of COVID-19

No comments: