568CCourseProj

DATA SCIENCE QUESTION

The cost of a college education is skyrocketing. It’s no surprise that student loans now make up the largest chunk of U.S. non-housing debt. If we know how much to save for college education in the future, we will have the flexibility of making financial adjustments without compromising on the quality of life and make more informed financial decisions. With this motto, I would like to answer the following questions through modelling:  What would be the tuition fees for any 4 Year public university program in the future?  Which factor influences the education cost the most – Debt_GDP or Consumer Price Index?

INPUT AND OUTPUT VARIABLES

 Output variable: Average Tuition and Fees and Room and Board (Enrollment-Weighted) in Current Dollars from 1986-2019 for any 4 Year Public University degree.  Input variables: • Consumer price index (CPI): The Consumer Price Index (CPI) is a measure that examines the average of prices of a basket of consumer goods and services, such as transportation, food, and medical care. It is calculated by taking price changes for each item in the predetermined basket of goods and averaging them. Changes in the CPI are used to assess price changes associated with the cost of living; the CPI is one of the most frequently used statistics for identifying periods of inflation or deflation. • Debt-to-GDP ratio: A metric comparing a country’s public debt to its gross domestic product (GDP). By comparing what a country owes with what it produces, the debt-to-GDP ratio reliably indicates that particular country’s ability to pay back its debts. Often expressed as a percentage, this ratio can also be interpreted as the number of Years needed to pay back debt, if GDP is dedicated entirely to debt repayment.

DATA SOURCE

Bureau of Labor Statistics (Consumer Price Index, All Urban Consumers).

CLEANING DATA AND FEATURE ENGINEERING

 Read the education costs data set, clean the data by converting literal NA’s and hyphens to 0’s, and scrapping off the irrelevant texts.  Combine tuition fees and board room fees into total costs for public2Year, public4Year and privatenonprofit4Year courses.  Convert all the costs into numeric values  Get the distribution of costs using histogram and bar plot  Get the log –transformed distribution of costs using histogram  Calculate the mean and median costs of public2Year, public4Year and private4Year  Read the inflation and Debt_GDP datasets  Merge them with education costs dataset for the Years 1986-2019  Rename the columns appropriately.

#1) Read the worksheet number 5, skip 1st 11 rows
data <- readxl::read_xlsx('college_pricing.xlsx',sheet = 5, skip=11)

#2) Remove the last 3 lines from the dataset which are simplyextra texts
data <- head(data,-3)

#3)look for NA's
for(i in 1:dim(data)[2])
{
  print(paste(names(data)[i], length(which(is.na(data[,i])))))
}

## [1] "In Current Dollars 0"
## [1] "1986 0"
## [1] "1987 0"
## [1] "1988 0"
## [1] "1989 0"
## [1] "1990 0"
## [1] "1991 0"
## [1] "1992 0"
## [1] "1993 0"
## [1] "1994 0"
## [1] "1995 0"
## [1] "1996 0"
## [1] "1997 0"
## [1] "1998 0"
## [1] "1999 0"
## [1] "2000 0"
## [1] "2001 0"
## [1] "2002 0"
## [1] "2003 0"
## [1] "2004 0"
## [1] "2005 0"
## [1] "2006 0"
## [1] "2007 0"
## [1] "2008 0"
## [1] "2009 0"
## [1] "2010 0"
## [1] "2011 0"
## [1] "2012 0"
## [1] "2013 0"
## [1] "2014 0"
## [1] "2015 0"
## [1] "2016 0"
## [1] "2017 0"
## [1] "2018 0"
## [1] "2019 0"
## [1] "10-Year $ Change 0"
## [1] "10-Year % Change 0"

#4) Get each of row's data and remove 1st column as these are row headers
data_row1 <- data[1,-1]
data_row2 <- data[2,-1]
data_row3 <- data[3,-1]
data_row8 <- data[8,-1]
data_row9 <- data[9,-1]

#5) Get the length of the dataset from above step
length(data_row1)

## [1] 36

#6) Remove the last 2 values from the obtained row vectors in step 4 and convert all the values into #numeric values
Tuition_Public_Two_Year <- as.numeric(data_row1[c(-35,-36)])
Tuition_Public_Four_Year <- as.numeric(data_row2[c(-35,-36)])
Tuition_Private_Nonprofit_Four_Year <- as.numeric(data_row3[c(-35,-36)])
Tuition_Room_Board_Public_Four_Year <- as.numeric(data_row8[c(-35,-36)])
Tuition_Room_Board_Private_Nonprofit_Four_Year <- as.numeric(data_row9[c(-35,-36)])

#7) All entries from the row vectors in step 4 are converted to numeric
Tuition_Public_Two_Year_all <- as.numeric(data_row1)

#8) transpose the data df in order to get the row names or Years using rownames
data_transpose <- t(data)

#9) Remove the 1st and last 2 values from rownames(data_transpose) to get only the Years
Year <- rownames(data_transpose)[c(-1,-36)][-35]

#10) Look for values '—' and convert them into 0. Format row 4 and row 7 as numeric 0s
data[data == '—'] <- 0
data_row7 <- data[7,-1]
data_row4 <- data[4,-1]
Room_Board_Public_Two_Year <- as.numeric(data_row4)
Tuition_Room_Board_Public_Two_Year <- as.numeric(data_row7)

#11)Update the 7th row count as sum of  row 1 and row 4
Tuition_Room_Board_Public_Two_Year <- Tuition_Public_Two_Year_all + Room_Board_Public_Two_Year
Tuition_Room_Board_Public_Two_Year <- as.numeric(Tuition_Room_Board_Public_Two_Year[c(-35,-36)])
data[7,-1] <- Tuition_Public_Two_Year_all + Room_Board_Public_Two_Year

#12a) Plot the barplots for the datasets in step 6  where x axis= Years
barplot(Tuition_Room_Board_Public_Four_Year, names.arg=Year, xlab='Tuition and Fees and Room and Board-Public Four-Year')

#12b) Histogram
hist(Tuition_Room_Board_Public_Four_Year)

#12c) log10 tranformation to get a more normal shape
hist(log10(Tuition_Room_Board_Public_Four_Year))

# Get 10 Year change of 4-year cost
Tuition_Room_Board_Public_Four_Year[34] - Tuition_Room_Board_Public_Four_Year[1]

## [1] 16520

#Get the statistics of mean, median, sd, var, min and max values

#Read the inflation dataset
inf <- read.csv("CPI.csv",stringsAsFactors = FALSE)

# Read the Debt to GDP data
GDP <- read.csv("Debt_to_GDP.csv",stringsAsFactors = FALSE)
colnames(GDP)[colnames(GDP) == "Government.Debt.as...of.GDP"] <- "Debt/GDP"

#  Combine data for regression analysis
comdata <- data.frame(Year,Tuition_Room_Board_Public_Two_Year,Tuition_Room_Board_Public_Four_Year,Tuition_Room_Board_Private_Nonprofit_Four_Year)
alldata <- data.frame(comdata,inf$CPI,GDP$"Debt/GDP")
colnames(alldata) <- c("Year", "Public2Cost","Public4Cost","Private4Cost","CPI","Debt_GDP")

# Changing the Year to numeric value
alldata$Year <- as.numeric(as.character(alldata$Year))

# Get the statistics of 4-year cost, CPI and GDP
summary(alldata$Public4Cost)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    3860    6372    9535   10874   15685   20380

summary(alldata$CPI)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   109.5   149.4   182.0   185.0   224.4   256.6

summary(alldata$Debt_GDP)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   47.58   58.28   63.59   72.89   96.40  105.46

hist(alldata$Public4Cost)

hist(alldata$CPI)

hist(alldata$Debt_GDP)

#Split the public4Year tuition fees dataset into training and test datasets

#19)Training and test datasets
train_data <- alldata[1:24,]
test_data <- alldata[-c(1:24),]

FITTING MODEL

 Identify the predictors and data type of the predictors

predictor_names <- c("CPI","Debt_GDP","Year")


for(pname in predictor_names)
{
  print(typeof(train_data[,pname]))
}

## [1] "double"
## [1] "double"
## [1] "double"

train_data1 <- as.data.frame(cbind(train_data$Year,train_data$CPI,train_data$Debt_GDP))
colnames(train_data1) <- c("Year", "CPI","Debt_GDP")
cormat <- round(cor(train_data1, use="pairwise.complete.obs"),2)
head(cormat)

##          Year  CPI Debt_GDP
## Year     1.00 1.00     0.65
## CPI      1.00 1.00     0.69
## Debt_GDP 0.65 0.69     1.00

upper<-cormat
upper[upper.tri(cormat)]<-""
upper<-as.data.frame(upper)
upper

##          Year  CPI Debt_GDP
## Year        1              
## CPI         1    1         
## Debt_GDP 0.65 0.69        1

get_lower_tri<-function(cormat){
    cormat[upper.tri(cormat)] <- NA
    return(cormat)
  }
  # Get upper triangle of the correlation matrix
  get_upper_tri <- function(cormat){
    cormat[lower.tri(cormat)]<- NA
    return(cormat)
  }

reorder_cormat <- function(cormat){
# Use correlation between variables as distance
dd <- as.dist((1-cormat)/2)
hc <- hclust(dd)
cormat <-cormat[hc$order, hc$order]
}

upper_tri <- get_upper_tri(cormat)
lower_tri <- get_lower_tri(cormat)

cormat <- reorder_cormat(cormat)
upper_tri <- get_upper_tri(cormat)
# Melt the correlation matrix
melted_cormat <- melt(upper_tri, na.rm = TRUE)
# Create a ggheatmap
ggheatmap <- ggplot(melted_cormat, aes(X2, X1, fill = value))+
 geom_tile(color = "white")+
 scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
   midpoint = 0, limit = c(-1,1), space = "Lab", 
    name="Pearson\nCorrelation") +
  theme_minimal()+ # minimal theme
 theme(axis.text.x = element_text(angle = 45, vjust = 1, 
    size = 12, hjust = 1))+
 coord_fixed()
# Print the heatmap


ggheatmap <- ggheatmap + 
geom_text(aes(X2, X1, label = value), color = "black", size = 4) +
theme(
  axis.title.x = element_blank(),
  axis.title.y = element_blank(),
  panel.grid.major = element_blank(),
  panel.border = element_blank(),
  panel.background = element_blank(),
  axis.ticks = element_blank(),
  legend.justification = c(1, 3),
  legend.position = c(-0.5, 0.7),
  legend.direction = "horizontal")+
  guides(fill = guide_colorbar(barwidth = 7, barheight = 1,
                title.position = "left", title.hjust = .8))

print(ggheatmap)

## Warning: Removed 3 rows containing missing values (geom_text).

Removed ‘year’ pedictor as year was highly correlated with CPI

LINEAR REGRESSION

Linear regression model results:  Null hypothesis test shows that there exists a linear association between Public4cost and Debt_GDP  Null hypothesis test shows that there exists a linear association between Public4cost and CPI  Null hypothesis test is statistically significant, therefore there exists a linear association between Public4cost, CPI and Debt_GDP in tandem  The coefficient of CPI changes when Debt_GDP is added to the model. This implies Debt_GDP and CPI are correlated. Pearson correlation = 0.7029612 which implies positive linear association between Debt_GDP and CPI.  CPI is strongly correlated with Public4Cost than Debt_GDP

Verify the association between price and predictor variables

# Public 4 Year cost and Debt GDP
plot(Public4Cost ~ Debt_GDP, data=train_data, col="darkred")
abline(lm(Public4Cost ~ Debt_GDP, data=train_data))

#summary(lm(Public4Cost ~ Debt_GDP, data=train_data))

# TEST FOR NULL HYPOTHESIS
# E(FEV) = alpha + beta * Debt_GDP
# H0 : beta = 0 
n=nrow(train_data) 
Y=train_data$Public4Cost
X=cbind(rep(1,n),train_data$Debt_GDP) 
p=ncol(X) 
beta=solve(t(X) %*% X) %*% t(X) %*% Y 
muhat= X %*% beta 
sigmasq=sum((Y - muhat)^2)/(n-p) 
covbeta=sigmasq * solve(t(X) %*% X) 
sebeta=sqrt(diag(covbeta))
data.frame(beta,sebeta)

##         beta     sebeta
## 1 -7811.7348 3425.15314
## 2   259.2587   55.80052

z=beta[2]/sebeta[2] 
p=2*pt(z,df=n-2,lower.tail=FALSE) 
data.frame(z,p)

##          z            p
## 1 4.646171 0.0001245065

#Conclusion: The null hypothesis test is statistically significant, therefore there exists a linear association between Public4Cost and Debt_GDP

# Public 4 Year cost and CPI
plot(Public4Cost ~ CPI, data=train_data, col="darkred")
abline(lm(Public4Cost ~ CPI, data=train_data))

#summary(lm(Public4Cost ~ CPI, data=train_data))

# TEST FOR NULL HYPOTHESIS
# E(FEV) = alpha + beta * CPI
# H0 : beta = 0 
n=nrow(train_data) 
Y=train_data$Public4Cost
X=cbind(rep(1,n),train_data$CPI) 
p=ncol(X) 
beta=solve(t(X) %*% X) %*% t(X) %*% Y 
muhat= X %*% beta 
sigmasq=sum((Y - muhat)^2)/(n-p) 
covbeta=sigmasq * solve(t(X) %*% X) 
sebeta=sqrt(diag(covbeta))
data.frame(beta,sebeta)

##          beta     sebeta
## 1 -7375.68519 599.942091
## 2    94.06396   3.612348

z=beta[2]/sebeta[2] 
p=2*pt(z,df=n-2,lower.tail=FALSE) 
data.frame(z,p)

##          z            p
## 1 26.03956 5.038646e-18

#Conclusion: The null hypothesis test is statistically significant, therefore there exists a linear association between Public4Cost and CPI

# Public 4 Year cost and Year
plot(Public4Cost ~ Year, data=train_data, col="darkred")
abline(lm(Public4Cost ~ Year, data=train_data))

#summary(lm(Public4Cost ~ Year, data=train_data))

# TEST FOR NULL HYPOTHESIS
# E(FEV) = alpha + beta * Year
# H0 : beta = 0 
n=nrow(train_data) 
Y=train_data$Public4Cost
X=cbind(rep(1,n),train_data$Year) 
p=ncol(X) 
beta=solve(t(X) %*% X) %*% t(X) %*% Y 
muhat= X %*% beta 
sigmasq=sum((Y - muhat)^2)/(n-p) 
covbeta=sigmasq * solve(t(X) %*% X) 
sebeta=sqrt(diag(covbeta))
data.frame(beta,sebeta)

##        beta     sebeta
## 1 -850367.8 37781.7467
## 2     429.7    18.9144

z=beta[2]/sebeta[2] 
p=2*pt(z,df=n-2,lower.tail=FALSE) 
data.frame(z,p)

##          z           p
## 1 22.71814 9.14378e-17

#Conclusion: The null hypothesis test is statistically significant, therefore there exists a linear association between Public4Cost and Year

Check for correation between public4cost and predictor variables

# CHECK FOR POSSIBLE CORRELATION BETWEEN OUTPUT AND PREDICTOR VARIABLES
cor.test(train_data$Public4Cost,train_data$Debt_GDP, method = "pearson")

## 
##  Pearson's product-moment correlation
## 
## data:  train_data$Public4Cost and train_data$Debt_GDP
## t = 4.6462, df = 22, p-value = 0.0001245
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4194203 0.8623363
## sample estimates:
##      cor 
## 0.703748

cor.test(train_data$Public4Cost,train_data$CPI, method = "pearson")

## 
##  Pearson's product-moment correlation
## 
## data:  train_data$Public4Cost and train_data$CPI
## t = 26.04, df = 22, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9631379 0.9932361
## sample estimates:
##       cor 
## 0.9841616

cor.test(train_data$Public4Cost,train_data$Year, method = "pearson")

## 
##  Pearson's product-moment correlation
## 
## data:  train_data$Public4Cost and train_data$Year
## t = 22.718, df = 22, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9520820 0.9911668
## sample estimates:
##       cor 
## 0.9793449

Check for correation between predictor variables

# CHECK FOR POSSIBLE CORRELATION BETWEEN THE PREDICTOR VARIABLES
cor.test(train_data$CPI,train_data$Debt_GDP, method = "pearson")

## 
##  Pearson's product-moment correlation
## 
## data:  train_data$CPI and train_data$Debt_GDP
## t = 4.4457, df = 22, p-value = 0.0002031
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3938089 0.8542573
## sample estimates:
##       cor 
## 0.6879215

cor.test(train_data$CPI,train_data$Year, method = "pearson")

## 
##  Pearson's product-moment correlation
## 
## data:  train_data$CPI and train_data$Year
## t = 54.168, df = 22, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9912526 0.9984135
## sample estimates:
##      cor 
## 0.996272

cor.test(train_data$Year,train_data$Debt_GDP, method = "pearson")

## 
##  Pearson's product-moment correlation
## 
## data:  train_data$Year and train_data$Debt_GDP
## t = 4.022, df = 22, p-value = 0.0005716
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3357032 0.8350646
## sample estimates:
##       cor 
## 0.6509476

summary(lm(Public4Cost ~ CPI, data=train_data))

## 
## Call:
## lm(formula = Public4Cost ~ CPI, data = train_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -828.57 -452.81  -73.04  256.99 1384.31 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -7375.685    599.942  -12.29 2.49e-11 ***
## CPI            94.064      3.612   26.04  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 562.4 on 22 degrees of freedom
## Multiple R-squared:  0.9686, Adjusted R-squared:  0.9671 
## F-statistic: 678.1 on 1 and 22 DF,  p-value: < 2.2e-16

summary(lm(Public4Cost ~ Debt_GDP, data=train_data))

## 
## Call:
## lm(formula = Public4Cost ~ Debt_GDP, data = train_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2870.4 -2051.1  -538.2  1955.5  3811.4 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -7811.7     3425.2  -2.281 0.032611 *  
## Debt_GDP       259.3       55.8   4.646 0.000125 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2254 on 22 degrees of freedom
## Multiple R-squared:  0.4953, Adjusted R-squared:  0.4723 
## F-statistic: 21.59 on 1 and 22 DF,  p-value: 0.0001245

summary(lm(Public4Cost ~ CPI + Debt_GDP, data=train_data))

## 
## Call:
## lm(formula = Public4Cost ~ CPI + Debt_GDP, data = train_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -689.5 -440.7 -141.9  327.0 1074.3 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -7968.704    855.736  -9.312 6.65e-09 ***
## CPI            90.729      4.983  18.207 2.43e-14 ***
## Debt_GDP       18.688     19.207   0.973    0.342    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 563 on 21 degrees of freedom
## Multiple R-squared:  0.9699, Adjusted R-squared:  0.9671 
## F-statistic: 338.7 on 2 and 21 DF,  p-value: < 2.2e-16

#The coeﬃcient of Debt_GDP decreases when CPI is added to the model 2 because CPI and Debt_GDP are correlated in the sample. The estimate of coefficient of Debt_GDP when CPI predictor variable is NOT added only depends on the predictor variable Debt_GDP WHEREAS estimate of coefficient of Debt_GDP when CPI predictor variable is added not only depends on the predictor variable Debt_GDP, but also depends on CPI predictor variable. In eﬀect we have “controlled” CPI (or adjusted for CPI). We say that CPI is “confounding” the association between Public4cost and Debt_GDP. By adjusting for CPI, we remove its confounding eﬀect. 

#Also, R^2 increases with addition of predictors because variability increases.

Interactions effect between predictors

summary(lm(Public4Cost ~ Debt_GDP * CPI, data=train_data))

## 
## Call:
## lm(formula = Public4Cost ~ Debt_GDP * CPI, data = train_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -652.22 -229.41   57.76  162.09  522.71 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5622.6850  2315.1048   2.429   0.0247 *  
## Debt_GDP     -218.9689    41.1441  -5.322 3.29e-05 ***
## CPI            17.4671    12.5321   1.394   0.1787    
## Debt_GDP:CPI    1.2686     0.2105   6.026 6.84e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 343.8 on 20 degrees of freedom
## Multiple R-squared:  0.9893, Adjusted R-squared:  0.9877 
## F-statistic: 617.6 on 3 and 20 DF,  p-value: < 2.2e-16

# E(Cost) = alpha + beta1 * CPI + beta2 * Debt_GDP + beta3 * CPI * Debt_GDP
mr01 = lm(Public4Cost ~ CPI + Debt_GDP + Debt_GDP * CPI, data=train_data)
summary(mr01)

## 
## Call:
## lm(formula = Public4Cost ~ CPI + Debt_GDP + Debt_GDP * CPI, data = train_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -652.22 -229.41   57.76  162.09  522.71 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5622.6850  2315.1048   2.429   0.0247 *  
## CPI            17.4671    12.5321   1.394   0.1787    
## Debt_GDP     -218.9689    41.1441  -5.322 3.29e-05 ***
## CPI:Debt_GDP    1.2686     0.2105   6.026 6.84e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 343.8 on 20 degrees of freedom
## Multiple R-squared:  0.9893, Adjusted R-squared:  0.9877 
## F-statistic: 617.6 on 3 and 20 DF,  p-value: < 2.2e-16

#—————————————- Checking the assumptions for Linear regression:  Normality of error distribution or large sample size  Constant variance of errors  Linearity  Independence

## check out fitted values against the originals
#mr01
plot(train_data$Public4Cost, mr01$fitted.values, xlim = c(2500,20900), ylim = c(2500,20900))
abline(a=0, b=1)

hist(mr01$fitted.values,col="lightblue")

#Check for normality using residuals - Data looks skewed
hist(mr01$residuals,col="lightblue")

qqnorm(mr01$residuals,col="darkred")
qqline(mr01$residuals)

# Check for constant-variance 
scatter.smooth(mr01$fitted.values, mr01$residuals,cex=0.5,col="darkred")

## MSE and RMSE
mean((mr01$fitted.values - train_data$Public4Cost)^2)

## [1] 98510.82

sqrt(mean((mr01$fitted.values - train_data$Public4Cost)^2))

## [1] 313.8643

Linear regression did not meet the normality assumptions. Also, the sample size is very small.

#Using log-tranformation of the response variable as residuals are not normal
mr02 <- (lm(log(Public4Cost) ~ CPI+Debt_GDP+CPI*Debt_GDP, data=train_data))

#——————————————–

## check out fitted values against the originals
#mr02
plot(log(train_data$Public4Cost), mr02$fitted.values, xlim = c(8,10), ylim = c(8,10))
abline(a=0, b=1)

# normality of errors
hist(mr02$fitted.values,col="lightblue")

#Check for normality using residuals - Data looks slightly normal
hist(mr02$residuals,col="lightblue")

qqnorm(mr02$residuals,col="darkred")
qqline(mr02$residuals)

# Check for constant-variance 
scatter.smooth(exp(mr02$fitted.values), exp(mr02$residuals),cex=0.5,col="darkred")

## MSE and RMSE
mean((exp(mr02$fitted.values) - train_data$Public4Cost)^2)

## [1] 92019.36

sqrt(mean((exp(mr02$fitted.values) - train_data$Public4Cost)^2))

## [1] 303.3469

GLM MODEL

##Permutation Test for small sample before GLM
glm01 <- glm(Public4Cost ~ CPI + Debt_GDP + Debt_GDP * CPI,family = poisson,data =train_data)

set.seed(0)
beta=rep(NA,10000)
for(i in 1:10000){
  x1=sample(train_data$CPI,size=length(train_data$Public4Cost),replace=FALSE)
  x = x1+train_data$Debt_GDP+x1 * train_data$Debt_GDP
  beta[i]=glm(Public4Cost ~ x, data=data.frame(Public4Cost=train_data$Public4Cost,x), family=poisson)$coef[2]
}

mean(abs(beta) > abs(glm01$coef[2]))

## [1] 0

hist(beta)
abline(v=glm01$coef[2],lty=2,col=2,lwd=2)

Permutation test p-value is statistically significant, so the sample size of 24 is good enough for doing GLM

D=summary(glm01)$deviance
df=summary(glm01)$df.residual
data.frame(D,df,dispersion=D/df)

##          D df dispersion
## 1 181.5282 20   9.076408

glmsum = summary(glm01,dispersion=D/df)
glmsum$coef

##                   Estimate   Std. Error   z value      Pr(>|z|)
## (Intercept)   6.612379e+00 2.798489e-01 23.628390 1.969005e-123
## CPI           1.375762e-02 1.401228e-03  9.818261  9.395011e-23
## Debt_GDP      6.018621e-03 4.859138e-03  1.238619  2.154866e-01
## CPI:Debt_GDP -3.085011e-05 2.347741e-05 -1.314034  1.888347e-01

## review the model summary
summary(glm01,dispersion=D/df)

## 
## Call:
## glm(formula = Public4Cost ~ CPI + Debt_GDP + Debt_GDP * CPI, 
##     family = poisson, data = train_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -8.4761  -1.0160  -0.1702   0.7496   5.2770  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   6.612e+00  2.798e-01  23.628   <2e-16 ***
## CPI           1.376e-02  1.401e-03   9.818   <2e-16 ***
## Debt_GDP      6.019e-03  4.859e-03   1.239    0.215    
## CPI:Debt_GDP -3.085e-05  2.348e-05  -1.314    0.189    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 9.076408)
## 
##     Null deviance: 27181.71  on 23  degrees of freedom
## Residual deviance:   181.53  on 20  degrees of freedom
## AIC: 447.46
## 
## Number of Fisher Scoring iterations: 3

hist(glm01$fitted.values,col="lightblue")

# Conclusion: p < 0.05, therefore we reject null hypothesis and conclude that not all coefficient estimates are 0. There is association between cost and predictor variables in tandem

## check out fitted values against the originals
plot(train_data$Public4Cost, glm01$fitted.values, xlim = c(2500,20900), ylim = c(2500,20900),col="darkred")
abline(a=0, b=1)

## MSE and RMSE
mean((glm01$fitted.values - train_data$Public4Cost)^2)

## [1] 89654.2

sqrt(mean((glm01$fitted.values - train_data$Public4Cost)^2))

## [1] 299.4231

# Pearson and deviance residual plots
scatter.smooth(glm01$linear.predictors,residuals(glm01,type="deviance"),cex=0.5,col="darkred")

scatter.smooth(glm01$linear.predictors,residuals(glm01,type="pearson"),cex=0.5,col="darkred")

######################### ## RANDOM FOREST MODEL #########################

## first, let's run a RF using a set of "standard" inputs.  Just to get a baseline
## set your seed so your model is reproducible
rf01 <- randomForest(Public4Cost ~ CPI + Debt_GDP + Debt_GDP * CPI,
                      data = train_data, 
                      mtry = 2, ## roughly the standard pick for regression models
                      nodesize = 5, ## default for regression
                      maxnodes = NULL, ## again, default
                      ntree = 5000) ## 5 at first, to make sure it runs OK. Then enlarge

## review the model summary
hist(rf01$predicted,col="lightblue")

## check out fitted values against the originals
plot(train_data$Public4Cost, rf01$predicted, xlim = c(2500,20900), ylim = c(2500,20900),col="darkred")
abline(a=0, b=1)

## MSE and RMSE
mean((rf01$predicted - train_data$Public4Cost)^2)

## [1] 242561.4

sqrt(mean((rf01$predicted - train_data$Public4Cost)^2))

## [1] 492.5052

EVALUATE MODELS

## rf01 and glm01
# Infer predictors - Infer how Debt_GDP and CPI predicts college 4 Year degree fees
pred_glm <- predict(glm01, newdata=test_data)
pred_rf <- predict(rf01, newdata=test_data)
pred_glm_exp <- exp(pred_glm)

hist(pred_rf)

hist(pred_glm_exp)

## RMSE
sqrt(mean(pred_glm_exp - test_data$Public4Cost)^2)

## [1] 636.6545

sqrt(mean(pred_rf - test_data$Public4Cost)^2)

## [1] 4740.12

plot(test_data$Public4Cost, pred_glm_exp,xlim = c(0, 30000), ylim = c(0, 30000))
abline(a=0,b=1,col="darkred")

plot(test_data$Public4Cost, pred_rf,xlim = c(0, 30000), ylim = c(0, 30000))
abline(a=0, b=1,col="darkred")

Comparing the 2 models – rf and GLM

#--------Another approach--------------------------
#  Getting the parallel minimum between # 2 vectors consisting of ratio of predictedvalues to 
# actual values and 1
probs_glm <- pmin((pred_glm_exp / test_data$Public4Cost), 1)
glm_accuracy = mean(probs_glm) * 100
glm_accuracy

## [1] 96.06059

probs_rf <- pmin((pred_rf / test_data$Public4Cost), 1)
rf_accuracy = mean(probs_rf) * 100
rf_accuracy

## [1] 74.12801

par(xpd=TRUE)
plot(test_data$Public4Cost, probs_glm , type = "b", xlab = "Education Costs", ylab = "Prediction Score",xlim = c(15100, 20500),ylim=c(0.5,1))
lines(test_data$Public4Cost, probs_rf , type = "b", col = "red",xlim = c(15100, 20500))
legend("bottomleft",c("GLM","RF"),lty=1:2,col=1:2,cex=0.65)
text(20000,0.92,"Accuracy: 96%",col="darkblue",cex=0.8)
text(20000,0.7,"Accuracy: 74%",col="darkblue",cex=0.8)

UTILITY FUNCTION

Let consider this scenario where based on the actual tuition fees, arbitrary bounds are chosen for utility calculation.

If the education saving is greater than 20900 and less than 23100, we don’t lose anything and we are safe. If we save less than 20900, we end up borrowing more in the future as predicted value is higher than this amount. Also, if we are saving more than the predicted amount, we compromise on the quality of life today by cancelling some family trips. In both the cases where I am not saving within the predicted range, let’s say I lose 10% of the value.

The below piece of code calculates the predicted tuition costs and the utilities based on the chosen bounds for all the 4 models.

## need to predict the tuition costs given 'Debt_GDP', 'Year' and Inflation index 'CPI' data. Predict tuition costs using test data and apply a utility function to it.

  values_glm = exp(predict(glm01, newdata=test_data))
  utilities_glm = ifelse(values_glm > 20900 && values_glm < 23100, values_glm * 1.0, values_glm * 0.9)

  values_rf = predict(rf01, newdata=test_data)
  utilities_rf = ifelse(values_glm > 20900 && values_glm < 23100, values_rf * 1.0, values_rf * 0.9)
  
  # Put it in a data.frame
D_plot_glm = data.frame(values_glm, utilities_glm)
D_plot_rf = data.frame(values_rf, utilities_rf)

Comparison of Utility function for different models

tdata = data.frame(CPI=test_data$CPI,Debt_GDP=test_data$Debt_GDP,Year=test_data$Year)
df1 <- tdata %>% rowwise %>% do(W = as_tibble(.)) %>% ungroup

predicted_utility_glm = function(CPI,Debt_GDP,Year) {
  # Posterior predictive samples (values)
  values_glm = exp(predict(glm01, newdata=data.frame(CPI,Debt_GDP,Year), summary=FALSE))
  
  # Associated utilities. Posterior predictive utilities
  utilities_glm = ifelse(values_glm > 20900 && values_glm < 23100, values_glm * 1.0, values_glm * 0.9)
  utilities_glm  # Return it
}

predicted_utility_rf = function(CPI,Debt_GDP,Year) {
  # Posterior predictive samples (values)
  values_rf = predict(rf01, newdata=data.frame(CPI,Debt_GDP,Year), summary=FALSE)
  
  # Associated utilities. Posterior predictive utilities?
  utilities_rf = ifelse(values_rf  > 20900 && values_rf < 23100, values_rf * 1.0, values_rf * 0.9)
  utilities_rf  # Return it
}

D_plot_rf = tdata %>%  
  mutate(rf_utility = predicted_utility_rf(tdata$CPI,tdata$Debt_GDP,tdata$Year)) %>% unnest

## Warning: `cols` is now required.
## Please use `cols = c()`

D_plot_glm = tdata %>%  
  mutate(glm_utility = predicted_utility_glm(tdata$CPI,tdata$Debt_GDP,tdata$Year)) %>% unnest

## Warning: `cols` is now required.
## Please use `cols = c()`

#D_plot_all_one <- merge(D_plot_glm, D_plot_rf, by=c("CPI","Debt_GDP","Year"))
D_plot_all <- merge(D_plot_glm, D_plot_rf, by=c("CPI","Debt_GDP","Year"))
D_plot_l <- melt(D_plot_all, id.vars = c("CPI","Debt_GDP","Year"))
p <- ggplot(data = D_plot_l, aes(x = Debt_GDP, y = value, group = variable, fill = variable))
p <- p + geom_bar(stat = "identity", width = 7, position = "dodge")
p <- p + facet_grid(. ~ CPI+Year)
p <- p + theme_bw()
p <- p + theme(axis.text.x = element_text(angle = 90))
p

glm model zoomed in

# Plot it
ggplot(D_plot_glm) +
  # Histograms of posterior predictive 
  geom_histogram(aes(x=utilities_glm), bins=5, fill='yellow') + 
  geom_histogram(aes(x=values_glm), bins=5, col='blue', alpha=0.3) +
  
  # Lines for eted utility
  geom_vline(aes(xintercept=mean(utilities_glm), color='Expected Utility'), lwd=3, show.legend=TRUE) + 
  geom_vline(aes(xintercept=mean(values_glm), color='Expected Saving'), lwd=3) + 
  geom_vline(aes(xintercept=10000, color='Random Saving'), lwd=3) +
  
  # Styling
  labs(x = 'Savings (value)', y='', title='Expected Saving and Expected Utility') +
  scale_x_continuous(breaks=seq(10000, 25000, by=1000)) + 
  theme_bw(13) + 
  theme(axis.ticks.y = element_blank(), axis.title.y = element_blank(), axis.text.y = element_blank()) +

 # Legend
  scale_color_manual(name = "", values = c('Expected Saving'='blue', 'Expected Utility' = 'red', 'Random Saving'='green'))

save.image("FinalModelEval.RData")

load("FinalModelEval.RData")