library(coronavirus)
library(tidyverse)
library(magrittr)
# Keep only the rows reported for Spain.
spain_corona <- filter(coronavirus, country == "Spain")
# NOTE(review): despite the "recovered_" prefix, this subset keeps the rows
# whose type is "confirmed" -- confirm which case type is intended.
recovered_spain_corona <- filter(spain_corona, type == "confirmed")
# Per-column overview; check the range of `cases` for implausible values.
summary(recovered_spain_corona)
date province country lat
Min. :2020-01-22 Length:112 Length:112 Min. :40
1st Qu.:2020-02-18 Class :character Class :character 1st Qu.:40
Median :2020-03-17 Mode :character Mode :character Median :40
Mean :2020-03-17 Mean :40
3rd Qu.:2020-04-14 3rd Qu.:40
Max. :2020-05-12 Max. :40
long type cases
Min. :-4 Length:112 Min. :-10034.0
1st Qu.:-4 Class :character 1st Qu.: 0.0
Median :-4 Mode :character Median : 746.5
Mean :-4 Mean : 2036.0
3rd Qu.:-4 3rd Qu.: 3989.2
Max. :-4 Max. : 9630.0
# Treat negative case counts as missing: recode them to NA.
recovered_spain_corona <- mutate(
  recovered_spain_corona,
  cases = replace(cases, which(cases < 0), NA)
)
# The summary should now report NA's for `cases` instead of a negative minimum.
summary(recovered_spain_corona)
date province country lat
Min. :2020-01-22 Length:112 Length:112 Min. :40
1st Qu.:2020-02-18 Class :character Class :character 1st Qu.:40
Median :2020-03-17 Mode :character Mode :character Median :40
Mean :2020-03-17 Mean :40
3rd Qu.:2020-04-14 3rd Qu.:40
Max. :2020-05-12 Max. :40
long type cases
Min. :-4 Length:112 Min. : 0
1st Qu.:-4 Class :character 1st Qu.: 0
Median :-4 Mode :character Median : 772
Mean :-4 Mean :2145
3rd Qu.:-4 3rd Qu.:4010
Max. :-4 Max. :9630
NA's :1
# Daily cases over time; the NA value shows up as a break in the line.
ggplot(data = recovered_spain_corona, mapping = aes(x = date, y = cases)) +
  geom_line()
# Find the row position(s) where `cases` is missing (base R).
which(is.na(recovered_spain_corona[["cases"]]))
[1] 94
# Impute the missing (originally negative) value in row 94 with the average of
# its two neighbours, rows 93 and 95. Row 94 is the position reported by
# which(is.na(...)) for this data.
recovered_spain_corona$cases[94] <- mean(recovered_spain_corona$cases[c(93, 95)])
# Number of observations (one `cases` value per row).
nrow(recovered_spain_corona)
[1] 112
# Add a column `col` holding the desired line colour for each row: black
# everywhere except rows 93 and 94, which are red to highlight the line
# segments on either side of the imputed value in row 94.
# The vector length comes from the data rather than a hard-coded row count.
line_colour <- rep("black", nrow(recovered_spain_corona))
line_colour[93:94] <- "red"
recovered_spain_corona$col <- as.factor(line_colour)
summary(recovered_spain_corona)
date province country lat
Min. :2020-01-22 Length:112 Length:112 Min. :40
1st Qu.:2020-02-18 Class :character Class :character 1st Qu.:40
Median :2020-03-17 Mode :character Mode :character Median :40
Mean :2020-03-17 Mean :40
3rd Qu.:2020-04-14 3rd Qu.:40
Max. :2020-05-12 Max. :40
long type cases col
Min. :-4 Length:112 Min. : 0 black:110
1st Qu.:-4 Class :character 1st Qu.: 0 red : 2
Median :-4 Mode :character Median : 828
Mean :-4 Mean :2159
3rd Qu.:-4 3rd Qu.:3989
Max. :-4 Max. :9630
# Single line (group = 1) coloured by the literal colour names stored in `col`;
# scale_colour_identity() uses those values as-is.
ggplot(
  recovered_spain_corona,
  aes(x = date, y = cases, colour = col, group = 1)
) +
  geom_line() +
  scale_colour_identity()
# Distribution of daily cases as a frequency polygon with 9 bins.
ggplot(recovered_spain_corona, aes(x = cases)) +
  geom_freqpoly(bins = 9)
# The same distribution as a histogram with 9 bins (white bar outlines,
# semi-transparent fill).
ggplot(recovered_spain_corona, aes(x = cases)) +
  geom_histogram(bins = 9, colour = "white", alpha = 0.5)
A frequency polygon is a graph constructed by joining, with straight line segments, the points placed above the midpoint of each interval (bin) at a height equal to that bin's frequency.
# Overlay the frequency polygon (red) on the histogram, both using 9 bins.
ggplot(recovered_spain_corona, aes(x = cases)) +
  geom_histogram(bins = 9, colour = "white", alpha = 0.5) +
  geom_freqpoly(bins = 9, colour = "red")
# Keep only the rows reported for the US and preview the first few.
us_corona <- filter(coronavirus, country == "US")
head(us_corona)
date province country lat long type cases
1 2020-01-22 US 37.0902 -95.7129 confirmed 1
2 2020-01-23 US 37.0902 -95.7129 confirmed 0
3 2020-01-24 US 37.0902 -95.7129 confirmed 1
4 2020-01-25 US 37.0902 -95.7129 confirmed 0
5 2020-01-26 US 37.0902 -95.7129 confirmed 3
6 2020-01-27 US 37.0902 -95.7129 confirmed 0
# Daily US cases over time, one coloured line per case type.
ggplot(us_corona, aes(x = date, y = cases, colour = type)) +
  geom_line()