library(magrittr)
library(readr)
library(cluster)
library(factoextra)
library(readxl)
library(dendextend)
library(gridExtra)
library(dplyr)
library(ggplot2)
library(arules)
library(arulesViz)
library(tidyverse)
library(scales)
The dataset have 58 variables which include the daily cases, deaths vaccinations and other information related to countries.
covid <- read_csv("https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv",
col_types = cols(iso_code = col_skip(), date = col_skip()))
dim(covid)
## [1] 108872 58
1.Renaming the variable location as Country and grouping the dataset by Country. 2.Summarizing the columns using the max function as the required variables data is cummulated for each day in the dataset.
covid <-covid %>%
rename(
Country = location,
)
covid <- covid %>%
group_by(Country) %>%
summarize_at(c("total_cases", "total_deaths","total_tests",
"total_vaccinations","population","population_density","median_age","aged_65_older","aged_70_older"
), max, na.rm = TRUE) %>% ungroup()
is.na(covid)<-sapply(covid, is.infinite)
covid
## # A tibble: 232 x 10
## Country total_cases total_deaths total_tests total_vaccinati~ population
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Afghanistan 151291 6978 NA 1767239 38928341
## 2 Africa 7117990 179913 NA 78303927 1340598113
## 3 Albania 134487 2460 737014 1280239 2877800
## 4 Algeria 184191 4654 NA 4146091 43851043
## 5 Andorra 14891 129 204740 82349 77265
## 6 Angola 43998 1063 NA 1695588 32866268
## 7 Anguilla NA NA NA 18457 15002
## 8 Antigua and~ 1372 43 16700 69903 97928
## 9 Argentina 5052884 108388 12442184 35247776 45195777
## 10 Armenia 233001 4664 1372405 194902 2963234
## # ... with 222 more rows, and 4 more variables: population_density <dbl>,
## # median_age <dbl>, aged_65_older <dbl>, aged_70_older <dbl>
1.Omitting the null values from required columns. 2.Creating a new variable Death by Population.
covid_age<-covid %>% filter(!is.na(covid$median_age) & !is.na(covid$total_deaths) & !is.na(covid$aged_65_older) & !is.na(covid$population))
covid_age$deaths_by_population <- (covid_age$total_deaths/covid_age$population)
covid_age
## # A tibble: 178 x 11
## Country total_cases total_deaths total_tests total_vaccinati~ population
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Afghanistan 151291 6978 NA 1767239 38928341
## 2 Albania 134487 2460 737014 1280239 2877800
## 3 Algeria 184191 4654 NA 4146091 43851043
## 4 Angola 43998 1063 NA 1695588 32866268
## 5 Antigua and~ 1372 43 16700 69903 97928
## 6 Argentina 5052884 108388 12442184 35247776 45195777
## 7 Armenia 233001 4664 1372405 194902 2963234
## 8 Australia 37749 947 26751476 13958045 25499881
## 9 Austria 665035 10752 69283203 10039144 9006400
## 10 Azerbaijan 357058 5095 4133796 5478042 10139175
## # ... with 168 more rows, and 5 more variables: population_density <dbl>,
## # median_age <dbl>, aged_65_older <dbl>, aged_70_older <dbl>,
## # deaths_by_population <dbl>
par(mfrow=c(1,2))
scatter.smooth(y=covid_age$deaths_by_population , x=covid_age$median_age, main="Deaths vs Median Age",xlab = "Median Age", ylab = "Death / population")
scatter.smooth(y=covid_age$deaths_by_population , x=covid_age$aged_65_older, main="Deaths vs Age >= 65" ,xlab = "Percentage of people aged over 65", ylab = "Death / population")
From the above plots it can be observed that the countries with high median ages has less deaths when compared to low median age. The reason might be, the countries with high median age has more older people than young and it is clearly explained in the second scatter plot that shows death rate increases in countries which has more people aged over 65.
par(mfrow=c(1,2))
boxplot(covid_age$median_age,main="Median Age", sub=paste("Outlier rows:", boxplot.stats(covid_age$median_age)$out))
boxplot(covid_age$aged_65_older,main="Age>=65", sub=paste("Outlier rows:", boxplot.stats(covid_age$aged_65_older)$out))
The box plot above show that there are no outliers in the data and doesnt require any outliers removal.
cor(covid_age$deaths_by_population, covid_age$median_age)
## [1] 0.5105989
cor(covid_age$deaths_by_population, covid_age$aged_65_older)
## [1] 0.5079691
The above correlation values corrobarate that the countries with high Median age and more population aged over 65 have more deaths.
Splitting the data to train and test.
index <- sort(sample(nrow(covid_age), 0.8 * nrow(covid_age)))
data_train <-covid_age[index,]
data_test <-covid_age[-index,]
linearModMA <- lm(deaths_by_population~ median_age, data = data_train)
summary(linearModMA)
##
## Call:
## lm(formula = deaths_by_population ~ median_age, data = data_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0014919 -0.0003739 -0.0001591 0.0002427 0.0052933
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.269e-04 2.327e-04 -3.123 0.00217 **
## median_age 4.855e-05 7.395e-06 6.565 9.43e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0007971 on 140 degrees of freedom
## Multiple R-squared: 0.2354, Adjusted R-squared: 0.2299
## F-statistic: 43.1 on 1 and 140 DF, p-value: 9.427e-10
Using the above model to predict the test data and comparing the predicated data with the original data
predMA <- predict(linearModMA, data_test)
actualMA <- data.frame(cbind(actuals=(data_test$deaths_by_population), predicted=predMA))
actualMA
## actuals predicted
## 1 1.792524e-04 0.0001761205
## 2 1.061320e-04 0.0006858743
## 3 2.181436e-03 0.0013024337
## 4 8.651455e-04 0.0004868276
## 5 9.320973e-06 0.0001858301
## 6 7.790358e-04 0.0005256660
## 7 8.132696e-06 0.0001275725
## 8 5.040333e-05 0.0001858301
## 9 2.029085e-05 0.0001615561
## 10 1.891432e-03 0.0009917266
## 11 3.223041e-06 0.0011519349
## 12 3.252516e-04 0.0013655460
## 13 4.238228e-04 0.0006130523
## 14 9.641631e-04 0.0013461269
## 15 3.792763e-04 0.0006616003
## 16 1.561726e-03 0.0011519349
## 17 4.990119e-05 0.0004528440
## 18 3.109308e-03 0.0013801104
## 19 5.747014e-04 0.0007586963
## 20 1.357755e-03 0.0014043844
## 21 1.165932e-03 0.0007829703
## 22 1.825182e-04 0.0003508932
## 23 4.106989e-04 0.0007586963
## 24 9.716016e-04 0.0013315625
## 25 5.295079e-05 0.0001324273
## 26 2.680572e-04 0.0004965372
## 27 1.784571e-03 0.0013606912
## 28 1.126622e-03 0.0011956281
## 29 7.103035e-05 0.0002586521
## 30 2.387267e-04 0.0008218087
## 31 1.050907e-03 0.0012733049
## 32 4.170146e-05 0.0013801104
## 33 2.551691e-04 0.0009286142
## 34 1.261870e-03 0.0013655460
## 35 1.289627e-05 0.0004042960
## 36 2.199566e-05 0.0001469917
ggplot(data = data_test) + geom_point(aes(x = median_age,y = deaths_by_population)) + geom_point(aes(x = median_age,y = actualMA$predicted), color="blue") + geom_line(aes(x = median_age,y = actualMA$predicted), color = "red") + ggtitle("Death per population vs median age ")+ylab("Deaths")+xlab("Median Age")
The above linear regression plot corroborates the evidence that the positive association between the country’s deaths and their median age.
linearModOA <- lm(deaths_by_population ~ aged_65_older , data = data_train)
summary(linearModOA)
##
## Call:
## lm(formula = deaths_by_population ~ aged_65_older, data = data_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0018881 -0.0003551 -0.0002329 0.0002692 0.0053366
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.514e-04 1.149e-04 1.317 0.19
## aged_65_older 6.869e-05 1.091e-05 6.294 3.73e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0008048 on 140 degrees of freedom
## Multiple R-squared: 0.2205, Adjusted R-squared: 0.215
## F-statistic: 39.61 on 1 and 140 DF, p-value: 3.728e-09
predOA <- predict(linearModOA, data_test)
actualOA <- data.frame(cbind(actuals=(data_test$deaths_by_population), predicted=predOA))
actualOA
## actuals predicted
## 1 1.792524e-04 0.0003286609
## 2 1.061320e-04 0.0005780111
## 3 2.181436e-03 0.0014270383
## 4 8.651455e-04 0.0004160365
## 5 9.320973e-06 0.0003742034
## 6 7.790358e-04 0.0004220814
## 7 8.132696e-06 0.0003168460
## 8 5.040333e-05 0.0003687767
## 9 2.029085e-05 0.0004024356
## 10 1.891432e-03 0.0009129510
## 11 3.223041e-06 0.0008823146
## 12 3.252516e-04 0.0011637437
## 13 4.238228e-04 0.0007196530
## 14 9.641631e-04 0.0014875556
## 15 3.792763e-04 0.0005789041
## 16 1.561726e-03 0.0011723989
## 17 4.990119e-05 0.0004810874
## 18 3.109308e-03 0.0014274505
## 19 5.747014e-04 0.0006315905
## 20 1.357755e-03 0.0015083004
## 21 1.165932e-03 0.0007362077
## 22 1.825182e-04 0.0004608921
## 23 4.106989e-04 0.0004343772
## 24 9.716016e-04 0.0014857696
## 25 5.295079e-05 0.0003682959
## 26 2.680572e-04 0.0004812935
## 27 1.784571e-03 0.0013775117
## 28 1.126622e-03 0.0011252765
## 29 7.103035e-05 0.0003556567
## 30 2.387267e-04 0.0003777066
## 31 1.050907e-03 0.0013442650
## 32 4.170146e-05 0.0011071419
## 33 2.551691e-04 0.0008430230
## 34 1.261870e-03 0.0014177650
## 35 1.289627e-05 0.0003894529
## 36 2.199566e-05 0.0003956351
ggplot(data = data_test) + geom_point(aes(x = aged_65_older,y = deaths_by_population)) + geom_point(aes(x = aged_65_older,y = actualOA$predicted), color="blue") + geom_line(aes(x = aged_65_older,y = actualOA$predicted), color = "red")+
ggtitle("Death per population vs people aged 65 or older country percentages")+ylab("Deaths")+xlab("Percentage of people")
The above linear regression plot corroborates the evidence that the positive association between the country’s deaths and 65 or older population.
The dataset used below is the combination of the protein for different categories of food (all calculated as percentage of total intake amount) , world population obesity and undernourished rate from around the data in an effort to learn how healthy eating style helps in combating Covid.
covid_pro <- read_csv("C:/Users/keert/Desktop/R programming/Protein_Supply_Quantity_Data.csv")
## Rows: 170 Columns: 32
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (3): Country, Undernourished, Unit (all except Population)
## dbl (29): Alcoholic Beverages, Animal Products, Animal fats, Aquatic Product...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
dim(covid_pro)
## [1] 170 32
covid_final <- left_join(covid,covid_pro, by="Country")
Obesity_clean<-covid_final%>% filter( !is.na(covid_final$Obesity)& !is.na(covid_final$total_deaths) &!is.na(covid_final$population))
Obesity_clean
## # A tibble: 151 x 41
## Country total_cases total_deaths total_tests total_vaccinati~ population
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Afghanistan 151291 6978 NA 1767239 38928341
## 2 Albania 134487 2460 737014 1280239 2877800
## 3 Algeria 184191 4654 NA 4146091 43851043
## 4 Angola 43998 1063 NA 1695588 32866268
## 5 Antigua and~ 1372 43 16700 69903 97928
## 6 Argentina 5052884 108388 12442184 35247776 45195777
## 7 Armenia 233001 4664 1372405 194902 2963234
## 8 Australia 37749 947 26751476 13958045 25499881
## 9 Austria 665035 10752 69283203 10039144 9006400
## 10 Azerbaijan 357058 5095 4133796 5478042 10139175
## # ... with 141 more rows, and 35 more variables: population_density <dbl>,
## # median_age <dbl>, aged_65_older <dbl>, aged_70_older <dbl>,
## # Alcoholic Beverages <dbl>, Animal Products <dbl>, Animal fats <dbl>,
## # Aquatic Products, Other <dbl>, Cereals - Excluding Beer <dbl>, Eggs <dbl>,
## # Fish, Seafood <dbl>, Fruits - Excluding Wine <dbl>, Meat <dbl>,
## # Milk - Excluding Butter <dbl>, Offals <dbl>, Oilcrops <dbl>, Pulses <dbl>,
## # Spices <dbl>, Starchy Roots <dbl>, Stimulants <dbl>, Sugar Crops <dbl>, ...
Discretizing the values in the obesity variable into “low”, “medium”, “high”, “very high” levels
Obesity_clean$Obese_level <- discretize(Obesity_clean$Obesity, method = "frequency", breaks = 4, labels = c("low", "medium", "high", "very high"))
ggplot(Obesity_clean, aes(x=Obesity, y=total_cases/population, color=Obese_level, group=Obese_level)) + geom_line()
From the above plot it can be observed that as the obesity increases the number of total cases in the population is also increased. But there is an unusual spike at the low obese population with high number of cases, which states that India is a low obese level country with unusual increase in total cases in recent times.
ggplot(Obesity_clean, aes(x=Obesity, y=total_deaths/population, color=Obese_level, group=Obese_level)) + geom_line()
from the above plot it can be observed that the unusual spike observed in the above plot does not appear in this plot, from which we can also conclude that low obese people who were contracted the infection recovered.
1.Selecting the required variables from the Protein intake dataset and omitting the Null values. 2.Converting the column names to row names to plot the graph below for the cluster analysis. 3. Rescaling the variables to manipulate certain numeric columns to a particular range.
covidcluster <- select(covid_final,Country,`Animal fats`,`Animal Products`,Eggs,Meat,`Vegetal Products`,Obesity,total_deaths,population)
covidcluster<-na.omit(covidcluster)
covidcluster<-covidcluster %>% column_to_rownames(., var = "Country")
covidcluster$deaths_by_population <- (covidcluster$total_deaths/covidcluster$population)
covidcluster<- select(covidcluster,deaths_by_population,`Animal fats`,`Animal Products`, Eggs,Meat,`Vegetal Products`,Obesity)
covidcluster$deaths_by_population<-rescale(covidcluster$deaths_by_population)
covidcluster$`Animal fats`<-rescale(covidcluster$`Animal fats`)
covidcluster$`Animal Products`<-rescale(covidcluster$`Animal Products`)
covidcluster$Eggs<-rescale(covidcluster$Eggs)
covidcluster$Meat<-rescale(covidcluster$Meat)
covidcluster$`Vegetal Products`<-rescale(covidcluster$`Vegetal Products`)
covidcluster$Obesity<-rescale(covidcluster$Obesity)
fviz_nbclust(covidcluster, kmeans, method = "silhouette")
So using the silhouette we can see that the optimal number of cluster k =2
k2 <- kmeans(covidcluster, centers = 2, nstart = 25)
fviz_cluster(k2, data = covidcluster)
The above graph shows that the group of countries cluster 1 and cluster 2 have similar similar kind of food intake, we can classify observations in k groups, based on their similarity.
covidcluster %>%
as_tibble() %>%
mutate(cluster = k2$cluster,
Country = row.names(covidcluster)) %>%
ggplot(aes(`Animal fats`,deaths_by_population, color = factor(cluster), label = Country)) +
geom_text()
It can observed that the cluster 1 has less intake of protein from animal fats and less number of deaths, whereas in cluster 2, the deaths increased with the increase in the protien intake from Animal fats.
covidcluster2 <- aggregate(covidcluster, by=list(cluster=k2$cluster), mean)
covidcluster3 <- covidcluster2 %>% pivot_longer(names_to = "Factors", values_to = "Value", c(deaths_by_population,`Animal fats`, `Animal Products`,Eggs , Meat ,`Vegetal Products`,Obesity))
ggplot(covidcluster3, aes(Factors, Value, fill = factor(cluster))) + geom_col(position = "dodge")+ggtitle("Death per population vs Protien from various sources ")
The clustered centers plot of the protiens intake from various sources and the deaths and observed as the proteins intake through vegetal Products increased, the number of deaths decreased. But for all other sources like animal fats, animal products, eggs and Meat their consumption increased the deaths.
covid_ar<-covid_final
Discretizing the values of the required variables in dataset into “low”, “moderate”, “high”
covid_ar$total_deaths.bracket <- discretize(covid_ar$total_deaths, method = "frequency", breaks = 2, labels = c("low", "high"))
covid_ar$Animalfats.consumption <- discretize(covid_ar$`Animal fats`, method = "frequency", breaks = 3, labels = c("low", "moderate", "high"))
covid_ar$AnimalProducts.consumption <- discretize(covid_ar$`Animal Products`, method = "frequency", breaks = 3, labels = c("low", "moderate", "high"))
covid_ar$Eggs.consumption <- discretize(covid_ar$Eggs, method = "frequency", breaks = 3, labels = c("low", "moderate", "high"))
covid_ar$Meat.consumption <- discretize(covid_ar$Meat, method = "frequency", breaks = 3, labels = c("low", "moderate", "high"))
covid_ar$VegetalProducts.consumption <- discretize(covid_ar$`Vegetal Products`, method = "frequency", breaks = 3, labels = c("low", "moderate", "high"))
covid_ar$Obesity.level <- discretize(covid_ar$Obesity, method = "frequency", breaks = 3, labels = c("low", "moderate", "high"))
covid_ar$CerealsExcludingBeer.consumption <- discretize(covid_ar$`Cereals - Excluding Beer`, method = "frequency", breaks = 3, labels = c("low", "moderate", "high"))
covid_ar$FishSeafood.consumption <- discretize(covid_ar$`Fish, Seafood`, method = "frequency", breaks = 3, labels = c("low", "moderate", "high"))
covid_ar$FruitsExcludingWine.consumption <- discretize(covid_ar$`Fruits - Excluding Wine`, method = "frequency", breaks = 3, labels = c("low", "moderate", "high"))
covid_ar$MilkExcludingButter.consumption <- discretize(covid_ar$`Milk - Excluding Butter`, method = "frequency", breaks = 3, labels = c("low", "moderate", "high"))
Convert the data into transactional data
covid_ar_tr <- as(covid_ar[, sapply(covid_ar, is.factor)], "transactions")
str(covid_ar_tr)
## Formal class 'transactions' [package "arules"] with 3 slots
## ..@ data :Formal class 'ngCMatrix' [package "Matrix"] with 5 slots
## .. .. ..@ i : int [1:1772] 1 2 5 8 11 16 17 22 23 27 ...
## .. .. ..@ p : int [1:233] 0 11 12 23 34 35 46 46 57 68 ...
## .. .. ..@ Dim : int [1:2] 32 232
## .. .. ..@ Dimnames:List of 2
## .. .. .. ..$ : NULL
## .. .. .. ..$ : NULL
## .. .. ..@ factors : list()
## ..@ itemInfo :'data.frame': 32 obs. of 3 variables:
## .. ..$ labels : chr [1:32] "total_deaths.bracket=low" "total_deaths.bracket=high" "Animalfats.consumption=low" "Animalfats.consumption=moderate" ...
## .. ..$ variables: Factor w/ 11 levels "Animalfats.consumption",..: 10 10 1 1 1 2 2 2 4 4 ...
## .. ..$ levels : Factor w/ 3 levels "high","low","moderate": 2 1 2 3 1 2 3 1 2 3 ...
## ..@ itemsetInfo:'data.frame': 232 obs. of 1 variable:
## .. ..$ transactionID: chr [1:232] "1" "2" "3" "4" ...
covid_ar_tr
## transactions in sparse format with
## 232 transactions (rows) and
## 32 items (columns)
rules <- apriori(data = covid_ar_tr, parameter = list(support = 0.05, confidence = 1), appearance = list (rhs=c("total_deaths.bracket=low")) )
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 1 0.1 1 none FALSE TRUE 5 0.05 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 11
##
## set item appearances ...[1 item(s)] done [0.00s].
## set transactions ...[32 item(s), 232 transaction(s)] done [0.00s].
## sorting and recoding items ... [32 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 7 8 done [0.00s].
## writing ... [0 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
inspect(head(sort(rules, by="support"), 5))
From the results we can say that low consumption of eggs, milk excluding butter ,animal products and low obesity levels resulted in less number of deaths. On the other hand, high consumption of Vegetal products and fruits excluding wine also resulted in the less number of deaths.