# Clear workspace
rm(list = ls())
# Load required libraries
library(readxl)
library(dplyr)
library(tidyr)
library(stringr)
library(ggplot2)
# GitHub raw file URL
url <- "https://github.com/DACSS-601-SHIVAM/LAST-HW-EX2/raw/refs/heads/main/HDR25_Statistical_Annex_HDI_Table.xlsx"
# Create temporary file
temp_file <- tempfile(fileext = ".xlsx")
# Download Excel file
download.file(url, temp_file, mode = "wb")
trying URL 'https://github.com/DACSS-601-SHIVAM/LAST-HW-EX2/raw/refs/heads/main/HDR25_Statistical_Annex_HDI_Table.xlsx'
Content type 'application/octet-stream' length 44086 bytes (43 KB)
downloaded 43 KB
# Read Excel file (skip metadata rows)
raw_data <- read_excel(temp_file, skip = 4)
# Inspect structure
str(raw_data)
tibble [274 × 15] (S3: tbl_df/tbl/data.frame)
$ ...1 : chr [1:274] "HDI rank" NA NA "1" ...
$ ...2 : chr [1:274] "Country" NA "Very high human development" "Iceland" ...
$ Human Development Index (HDI) : chr [1:274] "Value" "2023" NA "0.97199999999999998" ...
$ ...4 : logi [1:274] NA NA NA NA NA NA ...
$ Life expectancy at birth : chr [1:274] "(years)" "2023" NA "82.691000000000003" ...
$ ...6 : chr [1:274] NA NA NA NA ...
$ Expected years of schooling : chr [1:274] "(years)" "2023" NA "18.850589750000001" ...
$ ...8 : chr [1:274] NA "a" NA "c" ...
$ Mean years of schooling : chr [1:274] "(years)" "2023" NA "13.908926279999999" ...
$ ...10 : chr [1:274] NA "a" NA "d" ...
$ Gross national income (GNI) per capita: chr [1:274] "(2021 PPP $)" "2023" NA "69116.937359999996" ...
$ ...12 : chr [1:274] NA NA NA NA ...
$ GNI per capita rank minus HDI rank : chr [1:274] NA "2023" NA "12" ...
$ ...14 : chr [1:274] NA "b" NA NA ...
$ HDI rank : chr [1:274] NA "2022" NA "3" ...
hdi_clean <- raw_data %>%
# Select only the relevant columns by position
select(
country = 2, # Country name
life_expectancy = 5, # Life expectancy at birth
expected_schooling = 7, # Expected years of schooling
mean_schooling = 9, # Mean years of schooling
gni_per_capita = 11 # Gross national income per capita
) %>%
# Remove rows with missing values
drop_na() %>%
# Remove the first row containing units rather than data
slice(-1) %>%
# Convert character columns to numeric values
mutate(across(
c(life_expectancy, expected_schooling, mean_schooling, gni_per_capita),
as.numeric
)) %>%
# Remove any rows that became NA after numeric conversion
drop_na()
# Check structure of cleaned dataset
str(hdi_clean)
tibble [208 × 5] (S3: tbl_df/tbl/data.frame)
$ country : chr [1:208] "Iceland" "Norway" "Switzerland" "Denmark" ...
$ life_expectancy : num [1:208] 82.7 83.3 84 81.9 81.4 ...
$ expected_schooling: num [1:208] 18.9 18.8 16.7 18.7 17.3 ...
$ mean_schooling : num [1:208] 13.9 13.1 13.9 13 14.3 ...
$ gni_per_capita : num [1:208] 69117 112710 81949 76008 64053 ...
# Summary statistics to confirm successful cleaning
summary(hdi_clean)
country life_expectancy expected_schooling
Length:208 Min. :54.46 Min. : 5.635
Class :character 1st Qu.:67.58 1st Qu.:11.509
Mode :character Median :73.40 Median :13.320
Mean :73.07 Mean :13.549
3rd Qu.:78.10 3rd Qu.:15.656
Max. :85.71 Max. :20.846
mean_schooling gni_per_capita
Min. : 1.412 Min. : 688.3
1st Qu.: 6.788 1st Qu.: 5863.6
Median : 9.564 Median : 15933.8
Mean : 9.107 Mean : 24197.0
3rd Qu.:11.617 3rd Qu.: 35844.6
Max. :14.296 Max. :166811.7
hdi_means <- hdi_clean %>%
# Calculate mean values for each variable
summarise(
Mean_Life_Expectancy = mean(life_expectancy),
Mean_Expected_Years_Schooling = mean(expected_schooling),
Mean_Mean_Years_Schooling = mean(mean_schooling),
Mean_GNI_per_Capita = mean(gni_per_capita)
)
# Display the computed means
hdi_means
ggplot(hdi_clean, aes(x = life_expectancy)) +
# Create histogram with 30 bins
geom_histogram(bins = 30, fill = "steelblue", color = "white") +
# Add descriptive labels
labs(
title = "Distribution of Life Expectancy at Birth",
x = "Life Expectancy (Years)",
y = "Number of Countries"
)

hdi_clean %>%
# Convert data from wide to long format for plotting
pivot_longer(
cols = c(expected_schooling, mean_schooling),
names_to = "schooling_type",
values_to = "years"
) %>%
# Create boxplot comparing schooling measures
ggplot(aes(x = schooling_type, y = years, fill = schooling_type)) +
geom_boxplot() +
# Add plot labels
labs(
title = "Expected vs Mean Years of Schooling",
x = "",
y = "Years"
) +
# Remove legend since labels are self-explanatory
theme(legend.position = "none")

write.csv(hdi_clean, "hdi_cleaned.csv", row.names = FALSE)
write.csv(hdi_means, "hdi_means.csv", row.names = FALSE)
LS0tDQp0aXRsZTogIlIgTm90ZWJvb2siDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KDQpgYGB7cn0NCiMgQ2xlYXIgd29ya3NwYWNlDQpybShsaXN0ID0gbHMoKSkNCg0KIyBMb2FkIHJlcXVpcmVkIGxpYnJhcmllcw0KbGlicmFyeShyZWFkeGwpDQpsaWJyYXJ5KGRwbHlyKQ0KbGlicmFyeSh0aWR5cikNCmxpYnJhcnkoc3RyaW5ncikNCmxpYnJhcnkoZ2dwbG90MikNCg0KYGBgDQoNCmBgYHtyfQ0KIyBHaXRIdWIgcmF3IGZpbGUgVVJMDQp1cmwgPC0gImh0dHBzOi8vZ2l0aHViLmNvbS9EQUNTUy02MDEtU0hJVkFNL0xBU1QtSFctRVgyL3Jhdy9yZWZzL2hlYWRzL21haW4vSERSMjVfU3RhdGlzdGljYWxfQW5uZXhfSERJX1RhYmxlLnhsc3giDQoNCiMgQ3JlYXRlIHRlbXBvcmFyeSBmaWxlDQp0ZW1wX2ZpbGUgPC0gdGVtcGZpbGUoZmlsZWV4dCA9ICIueGxzeCIpDQoNCiMgRG93bmxvYWQgRXhjZWwgZmlsZQ0KZG93bmxvYWQuZmlsZSh1cmwsIHRlbXBfZmlsZSwgbW9kZSA9ICJ3YiIpDQoNCiMgUmVhZCBFeGNlbCBmaWxlIChza2lwIG1ldGFkYXRhIHJvd3MpDQpyYXdfZGF0YSA8LSByZWFkX2V4Y2VsKHRlbXBfZmlsZSwgc2tpcCA9IDQpDQoNCiMgSW5zcGVjdCBzdHJ1Y3R1cmUNCnN0cihyYXdfZGF0YSkNCg0KYGBgDQoNCmBgYHtyfQ0KaGRpX2NsZWFuIDwtIHJhd19kYXRhICU+JQ0KICANCiAgIyBTZWxlY3Qgb25seSB0aGUgcmVsZXZhbnQgY29sdW1ucyBieSBwb3NpdGlvbg0KICBzZWxlY3QoDQogICAgY291bnRyeSA9IDIsICAgICAgICAgICAgICAjIENvdW50cnkgbmFtZQ0KICAgIGxpZmVfZXhwZWN0YW5jeSA9IDUsICAgICAgIyBMaWZlIGV4cGVjdGFuY3kgYXQgYmlydGgNCiAgICBleHBlY3RlZF9zY2hvb2xpbmcgPSA3LCAgICMgRXhwZWN0ZWQgeWVhcnMgb2Ygc2Nob29saW5nDQogICAgbWVhbl9zY2hvb2xpbmcgPSA5LCAgICAgICAjIE1lYW4geWVhcnMgb2Ygc2Nob29saW5nDQogICAgZ25pX3Blcl9jYXBpdGEgPSAxMSAgICAgICAjIEdyb3NzIG5hdGlvbmFsIGluY29tZSBwZXIgY2FwaXRhDQogICkgJT4lDQogIA0KICAjIFJlbW92ZSByb3dzIHdpdGggbWlzc2luZyB2YWx1ZXMNCiAgZHJvcF9uYSgpICU+JQ0KICANCiAgIyBSZW1vdmUgdGhlIGZpcnN0IHJvdyBjb250YWluaW5nIHVuaXRzIHJhdGhlciB0aGFuIGRhdGENCiAgc2xpY2UoLTEpICU+JQ0KICANCiAgIyBDb252ZXJ0IGNoYXJhY3RlciBjb2x1bW5zIHRvIG51bWVyaWMgdmFsdWVzDQogIG11dGF0ZShhY3Jvc3MoDQogICAgYyhsaWZlX2V4cGVjdGFuY3ksIGV4cGVjdGVkX3NjaG9vbGluZywgbWVhbl9zY2hvb2xpbmcsIGduaV9wZXJfY2FwaXRhKSwNCiAgICBhcy5udW1lcmljDQogICkpICU+JQ0KICANCiAgIyBSZW1vdmUgYW55IHJvd3MgdGhhdCBiZWNhbWUgTkEgYWZ0ZXIgbnVtZXJpYyBjb252ZXJzaW9uDQogIGRyb3BfbmEoKQ0KDQojIENoZWNrIHN0cnVjdHVyZSBvZiBjbGVhbmVkIGRhdGFzZXQNCnN0cihoZGlfY2xlYW4pDQoNCiMgU3VtbWFyeSBzdGF0aXN0aWNzIHRvIGNvbmZpcm0gc3VjY2Vzc2Z1bCBjbGVhbmluZw0Kc3VtbWFyeShoZGlfY2xlYW4pDQoNCmBgYA0KDQoNCmBgYHtyfQ0KaGRpX21lYW5zIDwtIGhkaV9jbGVhbiAlPiUNCiAgDQogICMgQ2FsY3VsYXRlIG1lYW4gdmFsdWVzIGZvciBlYWNoIHZhcmlhYmxlDQogIHN1bW1hcmlzZSgNCiAgICBNZWFuX0xpZmVfRXhwZWN0YW5jeSA9IG1lYW4obGlmZV9leHBlY3RhbmN5KSwNCiAgICBNZWFuX0V4cGVjdGVkX1llYXJzX1NjaG9vbGluZyA9IG1lYW4oZXhwZWN0ZWRfc2Nob29saW5nKSwNCiAgICBNZWFuX01lYW5fWWVhcnNfU2Nob29saW5nID0gbWVhbihtZWFuX3NjaG9vbGluZyksDQogICAgTWVhbl9HTklfcGVyX0NhcGl0YSA9IG1lYW4oZ25pX3Blcl9jYXBpdGEpDQogICkNCg0KIyBEaXNwbGF5IHRoZSBjb21wdXRlZCBtZWFucw0KaGRpX21lYW5zDQpgYGANCg0KYGBge3J9DQpnZ3Bsb3QoaGRpX2NsZWFuLCBhZXMoeCA9IGxpZmVfZXhwZWN0YW5jeSkpICsNCiAgDQogICMgQ3JlYXRlIGhpc3RvZ3JhbSB3aXRoIDMwIGJpbnMNCiAgZ2VvbV9oaXN0b2dyYW0oYmlucyA9IDMwLCBmaWxsID0gInN0ZWVsYmx1ZSIsIGNvbG9yID0gIndoaXRlIikgKw0KICANCiAgIyBBZGQgZGVzY3JpcHRpdmUgbGFiZWxzDQogIGxhYnMoDQogICAgdGl0bGUgPSAiRGlzdHJpYnV0aW9uIG9mIExpZmUgRXhwZWN0YW5jeSBhdCBCaXJ0aCIsDQogICAgeCA9ICJMaWZlIEV4cGVjdGFuY3kgKFllYXJzKSIsDQogICAgeSA9ICJOdW1iZXIgb2YgQ291bnRyaWVzIg0KICApDQpgYGANCg0KYGBge3J9DQpoZGlfY2xlYW4gJT4lDQogIA0KICAjIENvbnZlcnQgZGF0YSBmcm9tIHdpZGUgdG8gbG9uZyBmb3JtYXQgZm9yIHBsb3R0aW5nDQogIHBpdm90X2xvbmdlcigNCiAgICBjb2xzID0gYyhleHBlY3RlZF9zY2hvb2xpbmcsIG1lYW5fc2Nob29saW5nKSwNCiAgICBuYW1lc190byA9ICJzY2hvb2xpbmdfdHlwZSIsDQogICAgdmFsdWVzX3RvID0gInllYXJzIg0KICApICU+JQ0KICANCiAgIyBDcmVhdGUgYm94cGxvdCBjb21wYXJpbmcgc2Nob29saW5nIG1lYXN1cmVzDQogIGdncGxvdChhZXMoeCA9IHNjaG9vbGluZ190eXBlLCB5ID0geWVhcnMsIGZpbGwgPSBzY2hvb2xpbmdfdHlwZSkpICsNCiAgZ2VvbV9ib3hwbG90KCkgKw0KICANCiAgIyBBZGQgcGxvdCBsYWJlbHMNCiAgbGFicygNCiAgICB0aXRsZSA9ICJFeHBlY3RlZCB2cyBNZWFuIFllYXJzIG9mIFNjaG9vbGluZyIsDQogICAgeCA9ICIiLA0KICAgIHkgPSAiWWVhcnMiDQogICkgKw0KICANCiAgIyBSZW1vdmUgbGVnZW5kIHNpbmNlIGxhYmVscyBhcmUgc2VsZi1leHBsYW5hdG9yeQ0KICB0aGVtZShsZWdlbmQucG9zaXRpb24gPSAibm9uZSIpDQpgYGANCg0KYGBge3J9DQp3cml0ZS5jc3YoaGRpX2NsZWFuLCAiaGRpX2NsZWFuZWQuY3N2Iiwgcm93Lm5hbWVzID0gRkFMU0UpDQp3cml0ZS5jc3YoaGRpX21lYW5zLCAiaGRpX21lYW5zLmNzdiIsIHJvdy5uYW1lcyA9IEZBTFNFKQ0KYGBgDQoNCg==