Packages

library(coronavirus)
library(tidyverse)
library(magrittr)

Question 1: Plotting negative values

Method 1: Discontinuity Point

# Keep only the rows for Spain, then only the rows whose `type` is "confirmed".
# NOTE(review): the object is called `recovered_spain_corona`, but the filter
# selects type == "confirmed". The name is kept because later chunks use it.
spain_corona <- filter(coronavirus, country == "Spain")
recovered_spain_corona <- filter(spain_corona, type == "confirmed")
# Column-wise summary; inspect the range of `cases` for negative values.
summary(recovered_spain_corona)
      date              province           country               lat    
 Min.   :2020-01-22   Length:112         Length:112         Min.   :40  
 1st Qu.:2020-02-18   Class :character   Class :character   1st Qu.:40  
 Median :2020-03-17   Mode  :character   Mode  :character   Median :40  
 Mean   :2020-03-17                                         Mean   :40  
 3rd Qu.:2020-04-14                                         3rd Qu.:40  
 Max.   :2020-05-12                                         Max.   :40  
      long        type               cases         
 Min.   :-4   Length:112         Min.   :-10034.0  
 1st Qu.:-4   Class :character   1st Qu.:     0.0  
 Median :-4   Mode  :character   Median :   746.5  
 Mean   :-4                      Mean   :  2036.0  
 3rd Qu.:-4                      3rd Qu.:  3989.2  
 Max.   :-4                      Max.   :  9630.0  
# Recode negative case counts as missing values (NA).
recovered_spain_corona <- recovered_spain_corona %>%
  mutate(cases = replace(cases, cases < 0, NA))
# The summary of `cases` should now show no negative values and report the
# number of NA's instead.
summary(recovered_spain_corona)
      date              province           country               lat    
 Min.   :2020-01-22   Length:112         Length:112         Min.   :40  
 1st Qu.:2020-02-18   Class :character   Class :character   1st Qu.:40  
 Median :2020-03-17   Mode  :character   Mode  :character   Median :40  
 Mean   :2020-03-17                                         Mean   :40  
 3rd Qu.:2020-04-14                                         3rd Qu.:40  
 Max.   :2020-05-12                                         Max.   :40  
                                                                        
      long        type               cases     
 Min.   :-4   Length:112         Min.   :   0  
 1st Qu.:-4   Class :character   1st Qu.:   0  
 Median :-4   Mode  :character   Median : 772  
 Mean   :-4                      Mean   :2145  
 3rd Qu.:-4                      3rd Qu.:4010  
 Max.   :-4                      Max.   :9630  
                                 NA's   :1     
# Line plot of cases against date; the NA in `cases` shows up as a
# discontinuity in the line.
ggplot(recovered_spain_corona) +
  geom_line(aes(x = date, y = cases))
Figure 1: Time series plot of confirmed cases (the discontinuity marks the negative value that was recoded as missing)

Figure 1: Time series plot of confirmed cases (the discontinuity marks the negative value that was recoded as missing)

Method 2: Impute missing value

# Find the row position(s) at which `cases` is NA (base R only).
which(is.na(recovered_spain_corona[["cases"]]))
[1] 94
# Impute each missing/negative observation with the average of the
# observations immediately before and after it. The position is looked up
# instead of being hard-coded (it was row 94, averaged from rows 93 and 95),
# so the chunk keeps working when the dataset gains or loses rows.
na_idx <- which(is.na(recovered_spain_corona$cases))
# A value on the first or last row has no neighbour on one side.
stopifnot(all(na_idx > 1), all(na_idx < nrow(recovered_spain_corona)))
# NOTE: two adjacent NAs would stay NA, because each one's neighbour is NA.
recovered_spain_corona$cases[na_idx] <-
  (recovered_spain_corona$cases[na_idx - 1] +
     recovered_spain_corona$cases[na_idx + 1]) / 2
# Imputation replaces values in place; the number of observations is unchanged.
length(recovered_spain_corona$cases)
[1] 112
# Add a column `col` that holds the desired line colour for every row.
# The imputed row and the row just before it are marked "red", all others
# "black" -- presumably so that both line segments touching the imputed point
# are drawn in red (TODO confirm against the rendered figure).
# The vector is built from nrow() rather than hard-coded run lengths, so it
# always matches the number of rows in the data.
imputed_idx <- 94L  # position reported by which(is.na(...)) above
n_rows <- nrow(recovered_spain_corona)
recovered_spain_corona$col <- as.factor(
  ifelse(seq_len(n_rows) %in% c(imputed_idx - 1L, imputed_idx), "red", "black")
)
# Check the new column: expect 2 "red" rows and the rest "black".
summary(recovered_spain_corona)
      date              province           country               lat    
 Min.   :2020-01-22   Length:112         Length:112         Min.   :40  
 1st Qu.:2020-02-18   Class :character   Class :character   1st Qu.:40  
 Median :2020-03-17   Mode  :character   Mode  :character   Median :40  
 Mean   :2020-03-17                                         Mean   :40  
 3rd Qu.:2020-04-14                                         3rd Qu.:40  
 Max.   :2020-05-12                                         Max.   :40  
      long        type               cases         col     
 Min.   :-4   Length:112         Min.   :   0   black:110  
 1st Qu.:-4   Class :character   1st Qu.:   0   red  :  2  
 Median :-4   Mode  :character   Median : 828              
 Mean   :-4                      Mean   :2159              
 3rd Qu.:-4                      3rd Qu.:3989              
 Max.   :-4                      Max.   :9630              
# Single line (group = 1) coloured row by row from the `col` column;
# scale_colour_identity() uses the values in `col` directly as colours.
ggplot(
  recovered_spain_corona,
  aes(x = date, y = cases, colour = col, group = 1)
) +
  geom_line() +
  scale_colour_identity()
Figure 2: Time series plot of confirmed cases (the line segments around the imputed value are shown in red)

Figure 2: Time series plot of confirmed cases (the line segments around the imputed value are shown in red)

Question 2: Example for geom_freqpoly

Frequency polygons

# Frequency polygon of `cases`, using 9 bins.
ggplot(data = recovered_spain_corona, mapping = aes(x = cases)) +
  geom_freqpoly(bins = 9)
Figure 3: Distribution of the number of confirmed cases

Figure 3: Distribution of the number of confirmed cases

Histogram

# Histogram of `cases` with 9 bins, white bar outlines, semi-transparent fill.
ggplot(recovered_spain_corona) +
  geom_histogram(aes(x = cases), bins = 9, colour = "white", alpha = 0.5)
Figure 4: Distribution of the number of confirmed cases

Figure 4: Distribution of the number of confirmed cases

Histogram + Frequency polygon

A frequency polygon is a graph constructed by plotting a point at the midpoint of each interval (bin), at a height equal to that bin's count, and joining these points with straight lines.

# Overlay a red frequency polygon on the histogram. Both layers use 9 bins,
# so the polygon is drawn over the same binning as the bars.
ggplot(data = recovered_spain_corona, mapping = aes(x = cases)) +
  geom_histogram(bins = 9, colour = "white", alpha = 0.5) +
  geom_freqpoly(bins = 9, colour = "red")
Figure 5: Distribution of the number of confirmed cases

Figure 5: Distribution of the number of confirmed cases

Question 3: Multiple Time Series

# Keep only the rows for the US (all case types) and preview the first rows.
us_corona <- filter(coronavirus, country == "US")
head(us_corona)
        date province country     lat     long      type cases
1 2020-01-22               US 37.0902 -95.7129 confirmed     1
2 2020-01-23               US 37.0902 -95.7129 confirmed     0
3 2020-01-24               US 37.0902 -95.7129 confirmed     1
4 2020-01-25               US 37.0902 -95.7129 confirmed     0
5 2020-01-26               US 37.0902 -95.7129 confirmed     3
6 2020-01-27               US 37.0902 -95.7129 confirmed     0
# One line per case `type`, distinguished by colour.
ggplot(us_corona, aes(x = date, y = cases, colour = type)) +
  geom_line()