# Clear workspace
rm(list = ls())

# Load required libraries
library(readxl)
library(dplyr)
library(tidyr)
library(stringr)
library(ggplot2)
# GitHub raw file URL
url <- "https://github.com/DACSS-601-SHIVAM/LAST-HW-EX2/raw/refs/heads/main/HDR25_Statistical_Annex_HDI_Table.xlsx"

# Create temporary file
temp_file <- tempfile(fileext = ".xlsx")

# Download Excel file
download.file(url, temp_file, mode = "wb")
trying URL 'https://github.com/DACSS-601-SHIVAM/LAST-HW-EX2/raw/refs/heads/main/HDR25_Statistical_Annex_HDI_Table.xlsx'
Content type 'application/octet-stream' length 44086 bytes (43 KB)
downloaded 43 KB
# Read Excel file (skip metadata rows)
raw_data <- read_excel(temp_file, skip = 4)

# Inspect structure
str(raw_data)
tibble [274 × 15] (S3: tbl_df/tbl/data.frame)
 $ ...1                                  : chr [1:274] "HDI rank" NA NA "1" ...
 $ ...2                                  : chr [1:274] "Country" NA "Very high human development" "Iceland" ...
 $ Human Development Index (HDI)         : chr [1:274] "Value" "2023" NA "0.97199999999999998" ...
 $ ...4                                  : logi [1:274] NA NA NA NA NA NA ...
 $ Life expectancy at birth              : chr [1:274] "(years)" "2023" NA "82.691000000000003" ...
 $ ...6                                  : chr [1:274] NA NA NA NA ...
 $ Expected years of schooling           : chr [1:274] "(years)" "2023" NA "18.850589750000001" ...
 $ ...8                                  : chr [1:274] NA "a" NA "c" ...
 $ Mean years of schooling               : chr [1:274] "(years)" "2023" NA "13.908926279999999" ...
 $ ...10                                 : chr [1:274] NA "a" NA "d" ...
 $ Gross national income (GNI) per capita: chr [1:274] "(2021 PPP $)" "2023" NA "69116.937359999996" ...
 $ ...12                                 : chr [1:274] NA NA NA NA ...
 $ GNI per capita rank minus HDI rank    : chr [1:274] NA "2023" NA "12" ...
 $ ...14                                 : chr [1:274] NA "b" NA NA ...
 $ HDI rank                              : chr [1:274] NA "2022" NA "3" ...
hdi_clean <- raw_data %>%
  
  # Select only the relevant columns by position
  select(
    country = 2,              # Country name
    life_expectancy = 5,      # Life expectancy at birth
    expected_schooling = 7,   # Expected years of schooling
    mean_schooling = 9,       # Mean years of schooling
    gni_per_capita = 11       # Gross national income per capita
  ) %>%
  
  # Remove rows with missing values
  drop_na() %>%
  
  # Remove the first row containing units rather than data
  slice(-1) %>%
  
  # Convert character columns to numeric values
  mutate(across(
    c(life_expectancy, expected_schooling, mean_schooling, gni_per_capita),
    as.numeric
  )) %>%
  
  # Remove any rows that became NA after numeric conversion
  drop_na()

# Check structure of cleaned dataset
str(hdi_clean)
tibble [208 × 5] (S3: tbl_df/tbl/data.frame)
 $ country           : chr [1:208] "Iceland" "Norway" "Switzerland" "Denmark" ...
 $ life_expectancy   : num [1:208] 82.7 83.3 84 81.9 81.4 ...
 $ expected_schooling: num [1:208] 18.9 18.8 16.7 18.7 17.3 ...
 $ mean_schooling    : num [1:208] 13.9 13.1 13.9 13 14.3 ...
 $ gni_per_capita    : num [1:208] 69117 112710 81949 76008 64053 ...
# Summary statistics to confirm successful cleaning
summary(hdi_clean)
   country          life_expectancy expected_schooling
 Length:208         Min.   :54.46   Min.   : 5.635    
 Class :character   1st Qu.:67.58   1st Qu.:11.509    
 Mode  :character   Median :73.40   Median :13.320    
                    Mean   :73.07   Mean   :13.549    
                    3rd Qu.:78.10   3rd Qu.:15.656    
                    Max.   :85.71   Max.   :20.846    
 mean_schooling   gni_per_capita    
 Min.   : 1.412   Min.   :   688.3  
 1st Qu.: 6.788   1st Qu.:  5863.6  
 Median : 9.564   Median : 15933.8  
 Mean   : 9.107   Mean   : 24197.0  
 3rd Qu.:11.617   3rd Qu.: 35844.6  
 Max.   :14.296   Max.   :166811.7  
hdi_means <- hdi_clean %>%
  
  # Calculate mean values for each variable
  summarise(
    Mean_Life_Expectancy = mean(life_expectancy),
    Mean_Expected_Years_Schooling = mean(expected_schooling),
    Mean_Mean_Years_Schooling = mean(mean_schooling),
    Mean_GNI_per_Capita = mean(gni_per_capita)
  )

# Display the computed means
hdi_means
ggplot(hdi_clean, aes(x = life_expectancy)) +
  
  # Create histogram with 30 bins
  geom_histogram(bins = 30, fill = "steelblue", color = "white") +
  
  # Add descriptive labels
  labs(
    title = "Distribution of Life Expectancy at Birth",
    x = "Life Expectancy (Years)",
    y = "Number of Countries"
  )

hdi_clean %>%
  
  # Convert data from wide to long format for plotting
  pivot_longer(
    cols = c(expected_schooling, mean_schooling),
    names_to = "schooling_type",
    values_to = "years"
  ) %>%
  
  # Create boxplot comparing schooling measures
  ggplot(aes(x = schooling_type, y = years, fill = schooling_type)) +
  geom_boxplot() +
  
  # Add plot labels
  labs(
    title = "Expected vs Mean Years of Schooling",
    x = "",
    y = "Years"
  ) +
  
  # Remove legend since labels are self-explanatory
  theme(legend.position = "none")

write.csv(hdi_clean, "hdi_cleaned.csv", row.names = FALSE)
write.csv(hdi_means, "hdi_means.csv", row.names = FALSE)
LS0tDQp0aXRsZTogIlIgTm90ZWJvb2siDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KDQpgYGB7cn0NCiMgQ2xlYXIgd29ya3NwYWNlDQpybShsaXN0ID0gbHMoKSkNCg0KIyBMb2FkIHJlcXVpcmVkIGxpYnJhcmllcw0KbGlicmFyeShyZWFkeGwpDQpsaWJyYXJ5KGRwbHlyKQ0KbGlicmFyeSh0aWR5cikNCmxpYnJhcnkoc3RyaW5ncikNCmxpYnJhcnkoZ2dwbG90MikNCg0KYGBgDQoNCmBgYHtyfQ0KIyBHaXRIdWIgcmF3IGZpbGUgVVJMDQp1cmwgPC0gImh0dHBzOi8vZ2l0aHViLmNvbS9EQUNTUy02MDEtU0hJVkFNL0xBU1QtSFctRVgyL3Jhdy9yZWZzL2hlYWRzL21haW4vSERSMjVfU3RhdGlzdGljYWxfQW5uZXhfSERJX1RhYmxlLnhsc3giDQoNCiMgQ3JlYXRlIHRlbXBvcmFyeSBmaWxlDQp0ZW1wX2ZpbGUgPC0gdGVtcGZpbGUoZmlsZWV4dCA9ICIueGxzeCIpDQoNCiMgRG93bmxvYWQgRXhjZWwgZmlsZQ0KZG93bmxvYWQuZmlsZSh1cmwsIHRlbXBfZmlsZSwgbW9kZSA9ICJ3YiIpDQoNCiMgUmVhZCBFeGNlbCBmaWxlIChza2lwIG1ldGFkYXRhIHJvd3MpDQpyYXdfZGF0YSA8LSByZWFkX2V4Y2VsKHRlbXBfZmlsZSwgc2tpcCA9IDQpDQoNCiMgSW5zcGVjdCBzdHJ1Y3R1cmUNCnN0cihyYXdfZGF0YSkNCg0KYGBgDQoNCmBgYHtyfQ0KaGRpX2NsZWFuIDwtIHJhd19kYXRhICU+JQ0KICANCiAgIyBTZWxlY3Qgb25seSB0aGUgcmVsZXZhbnQgY29sdW1ucyBieSBwb3NpdGlvbg0KICBzZWxlY3QoDQogICAgY291bnRyeSA9IDIsICAgICAgICAgICAgICAjIENvdW50cnkgbmFtZQ0KICAgIGxpZmVfZXhwZWN0YW5jeSA9IDUsICAgICAgIyBMaWZlIGV4cGVjdGFuY3kgYXQgYmlydGgNCiAgICBleHBlY3RlZF9zY2hvb2xpbmcgPSA3LCAgICMgRXhwZWN0ZWQgeWVhcnMgb2Ygc2Nob29saW5nDQogICAgbWVhbl9zY2hvb2xpbmcgPSA5LCAgICAgICAjIE1lYW4geWVhcnMgb2Ygc2Nob29saW5nDQogICAgZ25pX3Blcl9jYXBpdGEgPSAxMSAgICAgICAjIEdyb3NzIG5hdGlvbmFsIGluY29tZSBwZXIgY2FwaXRhDQogICkgJT4lDQogIA0KICAjIFJlbW92ZSByb3dzIHdpdGggbWlzc2luZyB2YWx1ZXMNCiAgZHJvcF9uYSgpICU+JQ0KICANCiAgIyBSZW1vdmUgdGhlIGZpcnN0IHJvdyBjb250YWluaW5nIHVuaXRzIHJhdGhlciB0aGFuIGRhdGENCiAgc2xpY2UoLTEpICU+JQ0KICANCiAgIyBDb252ZXJ0IGNoYXJhY3RlciBjb2x1bW5zIHRvIG51bWVyaWMgdmFsdWVzDQogIG11dGF0ZShhY3Jvc3MoDQogICAgYyhsaWZlX2V4cGVjdGFuY3ksIGV4cGVjdGVkX3NjaG9vbGluZywgbWVhbl9zY2hvb2xpbmcsIGduaV9wZXJfY2FwaXRhKSwNCiAgICBhcy5udW1lcmljDQogICkpICU+JQ0KICANCiAgIyBSZW1vdmUgYW55IHJvd3MgdGhhdCBiZWNhbWUgTkEgYWZ0ZXIgbnVtZXJpYyBjb252ZXJzaW9uDQogIGRyb3BfbmEoKQ0KDQojIENoZWNrIHN0cnVjdHVyZSBvZiBjbGVhbmVkIGRhdGFzZXQNCnN0cihoZGlfY2xlYW4pDQoNCiMgU3VtbWFyeSBzdGF0aXN0aWNzIHRvIGNvbmZpcm0gc3VjY2Vzc2Z1bCBjbGVhbmluZw0Kc3VtbWFyeShoZGlfY2xlYW4pDQoNCmBgYA0KDQoNCmBgYHtyfQ0KaGRpX21lYW5zIDwtIGhkaV9jbGVhbiAlPiUNCiAgDQogICMgQ2FsY3VsYXRlIG1lYW4gdmFsdWVzIGZvciBlYWNoIHZhcmlhYmxlDQogIHN1bW1hcmlzZSgNCiAgICBNZWFuX0xpZmVfRXhwZWN0YW5jeSA9IG1lYW4obGlmZV9leHBlY3RhbmN5KSwNCiAgICBNZWFuX0V4cGVjdGVkX1llYXJzX1NjaG9vbGluZyA9IG1lYW4oZXhwZWN0ZWRfc2Nob29saW5nKSwNCiAgICBNZWFuX01lYW5fWWVhcnNfU2Nob29saW5nID0gbWVhbihtZWFuX3NjaG9vbGluZyksDQogICAgTWVhbl9HTklfcGVyX0NhcGl0YSA9IG1lYW4oZ25pX3Blcl9jYXBpdGEpDQogICkNCg0KIyBEaXNwbGF5IHRoZSBjb21wdXRlZCBtZWFucw0KaGRpX21lYW5zDQpgYGANCg0KYGBge3J9DQpnZ3Bsb3QoaGRpX2NsZWFuLCBhZXMoeCA9IGxpZmVfZXhwZWN0YW5jeSkpICsNCiAgDQogICMgQ3JlYXRlIGhpc3RvZ3JhbSB3aXRoIDMwIGJpbnMNCiAgZ2VvbV9oaXN0b2dyYW0oYmlucyA9IDMwLCBmaWxsID0gInN0ZWVsYmx1ZSIsIGNvbG9yID0gIndoaXRlIikgKw0KICANCiAgIyBBZGQgZGVzY3JpcHRpdmUgbGFiZWxzDQogIGxhYnMoDQogICAgdGl0bGUgPSAiRGlzdHJpYnV0aW9uIG9mIExpZmUgRXhwZWN0YW5jeSBhdCBCaXJ0aCIsDQogICAgeCA9ICJMaWZlIEV4cGVjdGFuY3kgKFllYXJzKSIsDQogICAgeSA9ICJOdW1iZXIgb2YgQ291bnRyaWVzIg0KICApDQpgYGANCg0KYGBge3J9DQpoZGlfY2xlYW4gJT4lDQogIA0KICAjIENvbnZlcnQgZGF0YSBmcm9tIHdpZGUgdG8gbG9uZyBmb3JtYXQgZm9yIHBsb3R0aW5nDQogIHBpdm90X2xvbmdlcigNCiAgICBjb2xzID0gYyhleHBlY3RlZF9zY2hvb2xpbmcsIG1lYW5fc2Nob29saW5nKSwNCiAgICBuYW1lc190byA9ICJzY2hvb2xpbmdfdHlwZSIsDQogICAgdmFsdWVzX3RvID0gInllYXJzIg0KICApICU+JQ0KICANCiAgIyBDcmVhdGUgYm94cGxvdCBjb21wYXJpbmcgc2Nob29saW5nIG1lYXN1cmVzDQogIGdncGxvdChhZXMoeCA9IHNjaG9vbGluZ190eXBlLCB5ID0geWVhcnMsIGZpbGwgPSBzY2hvb2xpbmdfdHlwZSkpICsNCiAgZ2VvbV9ib3hwbG90KCkgKw0KICANCiAgIyBBZGQgcGxvdCBsYWJlbHMNCiAgbGFicygNCiAgICB0aXRsZSA9ICJFeHBlY3RlZCB2cyBNZWFuIFllYXJzIG9mIFNjaG9vbGluZyIsDQogICAgeCA9ICIiLA0KICAgIHkgPSAiWWVhcnMiDQogICkgKw0KICANCiAgIyBSZW1vdmUgbGVnZW5kIHNpbmNlIGxhYmVscyBhcmUgc2VsZi1leHBsYW5hdG9yeQ0KICB0aGVtZShsZWdlbmQucG9zaXRpb24gPSAibm9uZSIpDQpgYGANCg0KYGBge3J9DQp3cml0ZS5jc3YoaGRpX2NsZWFuLCAiaGRpX2NsZWFuZWQuY3N2Iiwgcm93Lm5hbWVzID0gRkFMU0UpDQp3cml0ZS5jc3YoaGRpX21lYW5zLCAiaGRpX21lYW5zLmNzdiIsIHJvdy5uYW1lcyA9IEZBTFNFKQ0KYGBgDQoNCg==