A few days ago I was looking at the COVID-19 data at https://www.worldometers.info/coronavirus/
The website has a table showing data on cases, deaths, recoveries and testing for almost every country in the world. You can sort the table by any of its columns, but I wanted to be able to see geographic patterns by looking at a map. So, I decided to practice my R skills by writing code to webscrape the table data and produce a world choropleth map with colors coded to per capita death rates. (Click on map for higher resolution.)
Here is the R code. I hope it is sufficiently documented to follow, and also that you can see how to modify it to make a choropleth from just about any website's table of country-level data:
# Webscrape COVID
# David Pittelli -- woodedpaths.blogspot.com
# Some code from SNHU IT 697 II
# http://selectorgadget.com/ will give CSS codes such as 'td' for
table data.
# Data source: https://www.worldometers.info/coronavirus/
#install.packages('rvest')
#install.packages'rworldmap')
# Load functions************
library(rvest)
library(rworldmap)
setwd("C:/Users/David/Documents/2020 Choropleth")
rm(list=ls())
#scrape table data************
url='https://www.worldometers.info/coronavirus/'
d=read_html(url)
#extract the table data ('td' in html) ************
scraped_td = html_nodes(d, 'td')
# convert the html data to text ************
Tab = html_text(scraped_td)
# extract table head and convert to text for column names
************
# scraped_th = html_nodes(d, 'th')
# TabHead = html_text(scraped_th)
# Remove non-country leading info.
Start with the "1" followed by the Country with most Deaths***
first = min(grep("^1$", Tab)) # The first "1"
as whole string in Tab is probably where we want to start
# Use anchors. ^ is the start. $ is the end.
# The start position was 115, then went to 153 on or slightly
before 2020 06 01 run.
last = min(grep("Total", Tab)) - 2
Tab2 = Tab[first:last]
#build matrix of data, then convert to data frame************
df=data.frame(matrix(Tab2, ncol=19,byrow=T),
stringsAsFactors=FALSE)
df=df[, c(2:16)]
colnames(df) = c("Country", "TotalCases",
"NewCases", "TotalDeaths", "NewDeaths", "TotalRecovered",
"NewRecovered", "ActiveCases", "CriticalCases",
"CPM", "DPM", "TotalTests", "TPM", "Population",
"Region")
# Clear out unneeded variables************
# rm(list=setdiff(ls(), "df"))
# Fix countries with abbreviated names************
df[df$Country == "S. Korea", "Country"] =
"South Korea"
df[df$Country == "UAE", "Country"] =
"United Arab Emirates"
df[df$Country == "DRC", "Country"] =
"Democratic Republic of the Congo"
df[df$Country == "CAR", "Country"] =
"Central African Republic"
df[df$Country == "Czechia", "Country"] =
"Czech Republic"
df[df$Country == "North Macedonia", "Country"]
= "Macedonia"
df[df$Country == "Eswatini", "Country"] =
"Swaziland"
# Add countries with missing data************
# df = rbind(df, c("Turkmenistan", rep(0, 9), NA, rep(0,
4))) # claims no cases, obviously untrue
# df = rbind(df, c("North Korea", rep(0, 15))) # claims
no cases, obviously untrue
df = rbind(df, c("Antarctica", rep(0, 15)))
df = rbind(df, c("Kosovo", rep(0, 9), 66/1.845, 0, 0, 1845000,
'Europe')) # deaths / millions 7/4/20
# source: https://en.wikipedia.org/wiki/COVID-19_pandemic_in_Kosovo#Cases
# df = rbind(df, c("Kazakhstan", rep(0, 15))) # they got
data by July
# df = rbind(df, c("Tajikistan", rep(0, 15))) # they got
data by July
# Remove commas from text-numbers and change from chr to numeric************
for (i in 2:14) {
df[ , i] = as.numeric(gsub(",","",df[ , i]))
}
# Change NA values to 0
df[is.na(df$DPM), "DPM"] = 0
df[is.na(df$CPM), "CPM"] = 0
# Find highest DPM for country with Population > 1 million
# Only San Marino has higher DPM (as of 4/24/2020)
hmax = max(df[df$Population > 1000000 & !is.na(df$Population)
& !is.na(df$DPM), "DPM"])
# Save with a file name including today's date************
today = paste(substr(date(), 5, 10), substr(date(), 21, 24))
filename = paste(today, "COVID table.csv")
write.csv(df, filename)
#join data to a map************
COVIDMap = joinCountryData2Map(df,
nameJoinColumn="Country",
joinCode="NAME",
verbose=TRUE)
# plot the map************
colorbreaks = c(0, 0.01, 0.1, 5, hmax/32, hmax/16, hmax/8, hmax/4,
hmax/2, hmax/2 + 0.01, hmax/2 + 0.02, hmax)
mapCountryData(COVIDMap,
nameColumnToPlot='DPM',
catMethod = colorbreaks,
colourPalette =
"heat",
oceanCol = "light
sky blue",
missingCountryCol =
"dimgrey",
borderCol =
"black",
mapTitle =
paste("COVID-19 Deaths per Million People,", today))
# Add labels, note that lat/lon is set to erroneous number to be
in right place for zoomed graphic
text(x=155, y=45, label="?", col = "dark red",
cex = 2)
text(x=100,y=-90, label="David Pittelli -- woodedpaths.blogspot.com",
col = "black", cex = 0.8)
No comments:
Post a Comment