The cost of a college education is skyrocketing. It’s no surprise that student loans now make up the largest chunk of U.S. non-housing debt. If we know how much to save for college education in the future, we will have the flexibility of making financial adjustments without compromising on the quality of life and make more informed financial decisions. With this motto, I would like to answer the following questions through modelling: What would be the tuition fees for any 4 Year public university program in the future? Which factor influences the education cost the most – Debt_GDP or Consumer Price Index?
Output variable: Average Tuition and Fees and Room and Board (Enrollment-Weighted) in Current Dollars from 1986-2019 for any 4 Year Public University degree. Input variables: • Consumer price index (CPI): The Consumer Price Index (CPI) is a measure that examines the average of prices of a basket of consumer goods and services, such as transportation, food, and medical care. It is calculated by taking price changes for each item in the predetermined basket of goods and averaging them. Changes in the CPI are used to assess price changes associated with the cost of living; the CPI is one of the most frequently used statistics for identifying periods of inflation or deflation. • Debt-to-GDP ratio: A metric comparing a country’s public debt to its gross domestic product (GDP). By comparing what a country owes with what it produces, the debt-to-GDP ratio reliably indicates that particular country’s ability to pay back its debts. Often expressed as a percentage, this ratio can also be interpreted as the number of Years needed to pay back debt, if GDP is dedicated entirely to debt repayment.
Bureau of Labor Statistics (Consumer Price Index, All Urban Consumers).
Read the education costs data set, clean the data by converting literal NA’s and hyphens to 0’s, and scrapping off the irrelevant texts. Combine tuition fees and board room fees into total costs for public2Year, public4Year and privatenonprofit4Year courses. Convert all the costs into numeric values Get the distribution of costs using histogram and bar plot Get the log –transformed distribution of costs using histogram Calculate the mean and median costs of public2Year, public4Year and private4Year Read the inflation and Debt_GDP datasets Merge them with education costs dataset for the Years 1986-2019 Rename the columns appropriately.
#1) Read the worksheet number 5, skip 1st 11 rows
data <- readxl::read_xlsx('college_pricing.xlsx',sheet = 5, skip=11)
#2) Remove the last 3 lines from the dataset which are simplyextra texts
data <- head(data,-3)
#3)look for NA's
for(i in 1:dim(data)[2])
{
print(paste(names(data)[i], length(which(is.na(data[,i])))))
}
## [1] "In Current Dollars 0"
## [1] "1986 0"
## [1] "1987 0"
## [1] "1988 0"
## [1] "1989 0"
## [1] "1990 0"
## [1] "1991 0"
## [1] "1992 0"
## [1] "1993 0"
## [1] "1994 0"
## [1] "1995 0"
## [1] "1996 0"
## [1] "1997 0"
## [1] "1998 0"
## [1] "1999 0"
## [1] "2000 0"
## [1] "2001 0"
## [1] "2002 0"
## [1] "2003 0"
## [1] "2004 0"
## [1] "2005 0"
## [1] "2006 0"
## [1] "2007 0"
## [1] "2008 0"
## [1] "2009 0"
## [1] "2010 0"
## [1] "2011 0"
## [1] "2012 0"
## [1] "2013 0"
## [1] "2014 0"
## [1] "2015 0"
## [1] "2016 0"
## [1] "2017 0"
## [1] "2018 0"
## [1] "2019 0"
## [1] "10-Year $ Change 0"
## [1] "10-Year % Change 0"
#4) Get each of row's data and remove 1st column as these are row headers
data_row1 <- data[1,-1]
data_row2 <- data[2,-1]
data_row3 <- data[3,-1]
data_row8 <- data[8,-1]
data_row9 <- data[9,-1]
#5) Get the length of the dataset from above step
length(data_row1)
## [1] 36
#6) Remove the last 2 values from the obtained row vectors in step 4 and convert all the values into #numeric values
Tuition_Public_Two_Year <- as.numeric(data_row1[c(-35,-36)])
Tuition_Public_Four_Year <- as.numeric(data_row2[c(-35,-36)])
Tuition_Private_Nonprofit_Four_Year <- as.numeric(data_row3[c(-35,-36)])
Tuition_Room_Board_Public_Four_Year <- as.numeric(data_row8[c(-35,-36)])
Tuition_Room_Board_Private_Nonprofit_Four_Year <- as.numeric(data_row9[c(-35,-36)])
#7) All entries from the row vectors in step 4 are converted to numeric
Tuition_Public_Two_Year_all <- as.numeric(data_row1)
#8) transpose the data df in order to get the row names or Years using rownames
data_transpose <- t(data)
#9) Remove the 1st and last 2 values from rownames(data_transpose) to get only the Years
Year <- rownames(data_transpose)[c(-1,-36)][-35]
#10) Look for values '—' and convert them into 0. Format row 4 and row 7 as numeric 0s
data[data == '—'] <- 0
data_row7 <- data[7,-1]
data_row4 <- data[4,-1]
Room_Board_Public_Two_Year <- as.numeric(data_row4)
Tuition_Room_Board_Public_Two_Year <- as.numeric(data_row7)
#11)Update the 7th row count as sum of row 1 and row 4
Tuition_Room_Board_Public_Two_Year <- Tuition_Public_Two_Year_all + Room_Board_Public_Two_Year
Tuition_Room_Board_Public_Two_Year <- as.numeric(Tuition_Room_Board_Public_Two_Year[c(-35,-36)])
data[7,-1] <- Tuition_Public_Two_Year_all + Room_Board_Public_Two_Year
#12a) Plot the barplots for the datasets in step 6 where x axis= Years
barplot(Tuition_Room_Board_Public_Four_Year, names.arg=Year, xlab='Tuition and Fees and Room and Board-Public Four-Year')
#12b) Histogram
hist(Tuition_Room_Board_Public_Four_Year)
#12c) log10 tranformation to get a more normal shape
hist(log10(Tuition_Room_Board_Public_Four_Year))
# Get 10 Year change of 4-year cost
Tuition_Room_Board_Public_Four_Year[34] - Tuition_Room_Board_Public_Four_Year[1]
## [1] 16520
#Get the statistics of mean, median, sd, var, min and max values
#Read the inflation dataset
inf <- read.csv("CPI.csv",stringsAsFactors = FALSE)
# Read the Debt to GDP data
GDP <- read.csv("Debt_to_GDP.csv",stringsAsFactors = FALSE)
colnames(GDP)[colnames(GDP) == "Government.Debt.as...of.GDP"] <- "Debt/GDP"
# Combine data for regression analysis
comdata <- data.frame(Year,Tuition_Room_Board_Public_Two_Year,Tuition_Room_Board_Public_Four_Year,Tuition_Room_Board_Private_Nonprofit_Four_Year)
alldata <- data.frame(comdata,inf$CPI,GDP$"Debt/GDP")
colnames(alldata) <- c("Year", "Public2Cost","Public4Cost","Private4Cost","CPI","Debt_GDP")
# Changing the Year to numeric value
alldata$Year <- as.numeric(as.character(alldata$Year))
# Get the statistics of 4-year cost, CPI and GDP
summary(alldata$Public4Cost)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3860 6372 9535 10874 15685 20380
summary(alldata$CPI)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 109.5 149.4 182.0 185.0 224.4 256.6
summary(alldata$Debt_GDP)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 47.58 58.28 63.59 72.89 96.40 105.46
hist(alldata$Public4Cost)
hist(alldata$CPI)
hist(alldata$Debt_GDP)
#Split the public4Year tuition fees dataset into training and test datasets
#19)Training and test datasets
train_data <- alldata[1:24,]
test_data <- alldata[-c(1:24),]
Identify the predictors and data type of the predictors
predictor_names <- c("CPI","Debt_GDP","Year")
for(pname in predictor_names)
{
print(typeof(train_data[,pname]))
}
## [1] "double"
## [1] "double"
## [1] "double"
train_data1 <- as.data.frame(cbind(train_data$Year,train_data$CPI,train_data$Debt_GDP))
colnames(train_data1) <- c("Year", "CPI","Debt_GDP")
cormat <- round(cor(train_data1, use="pairwise.complete.obs"),2)
head(cormat)
## Year CPI Debt_GDP
## Year 1.00 1.00 0.65
## CPI 1.00 1.00 0.69
## Debt_GDP 0.65 0.69 1.00
upper<-cormat
upper[upper.tri(cormat)]<-""
upper<-as.data.frame(upper)
upper
## Year CPI Debt_GDP
## Year 1
## CPI 1 1
## Debt_GDP 0.65 0.69 1
get_lower_tri<-function(cormat){
cormat[upper.tri(cormat)] <- NA
return(cormat)
}
# Get upper triangle of the correlation matrix
get_upper_tri <- function(cormat){
cormat[lower.tri(cormat)]<- NA
return(cormat)
}
reorder_cormat <- function(cormat){
# Use correlation between variables as distance
dd <- as.dist((1-cormat)/2)
hc <- hclust(dd)
cormat <-cormat[hc$order, hc$order]
}
upper_tri <- get_upper_tri(cormat)
lower_tri <- get_lower_tri(cormat)
cormat <- reorder_cormat(cormat)
upper_tri <- get_upper_tri(cormat)
# Melt the correlation matrix
melted_cormat <- melt(upper_tri, na.rm = TRUE)
# Create a ggheatmap
ggheatmap <- ggplot(melted_cormat, aes(X2, X1, fill = value))+
geom_tile(color = "white")+
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Pearson\nCorrelation") +
theme_minimal()+ # minimal theme
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 12, hjust = 1))+
coord_fixed()
# Print the heatmap
ggheatmap <- ggheatmap +
geom_text(aes(X2, X1, label = value), color = "black", size = 4) +
theme(
axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.grid.major = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.ticks = element_blank(),
legend.justification = c(1, 3),
legend.position = c(-0.5, 0.7),
legend.direction = "horizontal")+
guides(fill = guide_colorbar(barwidth = 7, barheight = 1,
title.position = "left", title.hjust = .8))
print(ggheatmap)
## Warning: Removed 3 rows containing missing values (geom_text).
Removed ‘year’ pedictor as year was highly correlated with CPI
Linear regression model results: Null hypothesis test shows that there exists a linear association between Public4cost and Debt_GDP Null hypothesis test shows that there exists a linear association between Public4cost and CPI Null hypothesis test is statistically significant, therefore there exists a linear association between Public4cost, CPI and Debt_GDP in tandem The coefficient of CPI changes when Debt_GDP is added to the model. This implies Debt_GDP and CPI are correlated. Pearson correlation = 0.7029612 which implies positive linear association between Debt_GDP and CPI. CPI is strongly correlated with Public4Cost than Debt_GDP
Verify the association between price and predictor variables
# Public 4 Year cost and Debt GDP
plot(Public4Cost ~ Debt_GDP, data=train_data, col="darkred")
abline(lm(Public4Cost ~ Debt_GDP, data=train_data))
#summary(lm(Public4Cost ~ Debt_GDP, data=train_data))
# TEST FOR NULL HYPOTHESIS
# E(FEV) = alpha + beta * Debt_GDP
# H0 : beta = 0
n=nrow(train_data)
Y=train_data$Public4Cost
X=cbind(rep(1,n),train_data$Debt_GDP)
p=ncol(X)
beta=solve(t(X) %*% X) %*% t(X) %*% Y
muhat= X %*% beta
sigmasq=sum((Y - muhat)^2)/(n-p)
covbeta=sigmasq * solve(t(X) %*% X)
sebeta=sqrt(diag(covbeta))
data.frame(beta,sebeta)
## beta sebeta
## 1 -7811.7348 3425.15314
## 2 259.2587 55.80052
z=beta[2]/sebeta[2]
p=2*pt(z,df=n-2,lower.tail=FALSE)
data.frame(z,p)
## z p
## 1 4.646171 0.0001245065
#Conclusion: The null hypothesis test is statistically significant, therefore there exists a linear association between Public4Cost and Debt_GDP
# Public 4 Year cost and CPI
plot(Public4Cost ~ CPI, data=train_data, col="darkred")
abline(lm(Public4Cost ~ CPI, data=train_data))
#summary(lm(Public4Cost ~ CPI, data=train_data))
# TEST FOR NULL HYPOTHESIS
# E(FEV) = alpha + beta * CPI
# H0 : beta = 0
n=nrow(train_data)
Y=train_data$Public4Cost
X=cbind(rep(1,n),train_data$CPI)
p=ncol(X)
beta=solve(t(X) %*% X) %*% t(X) %*% Y
muhat= X %*% beta
sigmasq=sum((Y - muhat)^2)/(n-p)
covbeta=sigmasq * solve(t(X) %*% X)
sebeta=sqrt(diag(covbeta))
data.frame(beta,sebeta)
## beta sebeta
## 1 -7375.68519 599.942091
## 2 94.06396 3.612348
z=beta[2]/sebeta[2]
p=2*pt(z,df=n-2,lower.tail=FALSE)
data.frame(z,p)
## z p
## 1 26.03956 5.038646e-18
#Conclusion: The null hypothesis test is statistically significant, therefore there exists a linear association between Public4Cost and CPI
# Public 4 Year cost and Year
plot(Public4Cost ~ Year, data=train_data, col="darkred")
abline(lm(Public4Cost ~ Year, data=train_data))
#summary(lm(Public4Cost ~ Year, data=train_data))
# TEST FOR NULL HYPOTHESIS
# E(FEV) = alpha + beta * Year
# H0 : beta = 0
n=nrow(train_data)
Y=train_data$Public4Cost
X=cbind(rep(1,n),train_data$Year)
p=ncol(X)
beta=solve(t(X) %*% X) %*% t(X) %*% Y
muhat= X %*% beta
sigmasq=sum((Y - muhat)^2)/(n-p)
covbeta=sigmasq * solve(t(X) %*% X)
sebeta=sqrt(diag(covbeta))
data.frame(beta,sebeta)
## beta sebeta
## 1 -850367.8 37781.7467
## 2 429.7 18.9144
z=beta[2]/sebeta[2]
p=2*pt(z,df=n-2,lower.tail=FALSE)
data.frame(z,p)
## z p
## 1 22.71814 9.14378e-17
#Conclusion: The null hypothesis test is statistically significant, therefore there exists a linear association between Public4Cost and Year
Check for correation between public4cost and predictor variables
# CHECK FOR POSSIBLE CORRELATION BETWEEN OUTPUT AND PREDICTOR VARIABLES
cor.test(train_data$Public4Cost,train_data$Debt_GDP, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: train_data$Public4Cost and train_data$Debt_GDP
## t = 4.6462, df = 22, p-value = 0.0001245
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4194203 0.8623363
## sample estimates:
## cor
## 0.703748
cor.test(train_data$Public4Cost,train_data$CPI, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: train_data$Public4Cost and train_data$CPI
## t = 26.04, df = 22, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9631379 0.9932361
## sample estimates:
## cor
## 0.9841616
cor.test(train_data$Public4Cost,train_data$Year, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: train_data$Public4Cost and train_data$Year
## t = 22.718, df = 22, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9520820 0.9911668
## sample estimates:
## cor
## 0.9793449
Check for correation between predictor variables
# CHECK FOR POSSIBLE CORRELATION BETWEEN THE PREDICTOR VARIABLES
cor.test(train_data$CPI,train_data$Debt_GDP, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: train_data$CPI and train_data$Debt_GDP
## t = 4.4457, df = 22, p-value = 0.0002031
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3938089 0.8542573
## sample estimates:
## cor
## 0.6879215
cor.test(train_data$CPI,train_data$Year, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: train_data$CPI and train_data$Year
## t = 54.168, df = 22, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9912526 0.9984135
## sample estimates:
## cor
## 0.996272
cor.test(train_data$Year,train_data$Debt_GDP, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: train_data$Year and train_data$Debt_GDP
## t = 4.022, df = 22, p-value = 0.0005716
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3357032 0.8350646
## sample estimates:
## cor
## 0.6509476
summary(lm(Public4Cost ~ CPI, data=train_data))
##
## Call:
## lm(formula = Public4Cost ~ CPI, data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -828.57 -452.81 -73.04 256.99 1384.31
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7375.685 599.942 -12.29 2.49e-11 ***
## CPI 94.064 3.612 26.04 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 562.4 on 22 degrees of freedom
## Multiple R-squared: 0.9686, Adjusted R-squared: 0.9671
## F-statistic: 678.1 on 1 and 22 DF, p-value: < 2.2e-16
summary(lm(Public4Cost ~ Debt_GDP, data=train_data))
##
## Call:
## lm(formula = Public4Cost ~ Debt_GDP, data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2870.4 -2051.1 -538.2 1955.5 3811.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7811.7 3425.2 -2.281 0.032611 *
## Debt_GDP 259.3 55.8 4.646 0.000125 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2254 on 22 degrees of freedom
## Multiple R-squared: 0.4953, Adjusted R-squared: 0.4723
## F-statistic: 21.59 on 1 and 22 DF, p-value: 0.0001245
summary(lm(Public4Cost ~ CPI + Debt_GDP, data=train_data))
##
## Call:
## lm(formula = Public4Cost ~ CPI + Debt_GDP, data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -689.5 -440.7 -141.9 327.0 1074.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7968.704 855.736 -9.312 6.65e-09 ***
## CPI 90.729 4.983 18.207 2.43e-14 ***
## Debt_GDP 18.688 19.207 0.973 0.342
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 563 on 21 degrees of freedom
## Multiple R-squared: 0.9699, Adjusted R-squared: 0.9671
## F-statistic: 338.7 on 2 and 21 DF, p-value: < 2.2e-16
#The coefficient of Debt_GDP decreases when CPI is added to the model 2 because CPI and Debt_GDP are correlated in the sample. The estimate of coefficient of Debt_GDP when CPI predictor variable is NOT added only depends on the predictor variable Debt_GDP WHEREAS estimate of coefficient of Debt_GDP when CPI predictor variable is added not only depends on the predictor variable Debt_GDP, but also depends on CPI predictor variable. In effect we have “controlled” CPI (or adjusted for CPI). We say that CPI is “confounding” the association between Public4cost and Debt_GDP. By adjusting for CPI, we remove its confounding effect.
#Also, R^2 increases with addition of predictors because variability increases.
Interactions effect between predictors
summary(lm(Public4Cost ~ Debt_GDP * CPI, data=train_data))
##
## Call:
## lm(formula = Public4Cost ~ Debt_GDP * CPI, data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -652.22 -229.41 57.76 162.09 522.71
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5622.6850 2315.1048 2.429 0.0247 *
## Debt_GDP -218.9689 41.1441 -5.322 3.29e-05 ***
## CPI 17.4671 12.5321 1.394 0.1787
## Debt_GDP:CPI 1.2686 0.2105 6.026 6.84e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 343.8 on 20 degrees of freedom
## Multiple R-squared: 0.9893, Adjusted R-squared: 0.9877
## F-statistic: 617.6 on 3 and 20 DF, p-value: < 2.2e-16
# E(Cost) = alpha + beta1 * CPI + beta2 * Debt_GDP + beta3 * CPI * Debt_GDP
mr01 = lm(Public4Cost ~ CPI + Debt_GDP + Debt_GDP * CPI, data=train_data)
summary(mr01)
##
## Call:
## lm(formula = Public4Cost ~ CPI + Debt_GDP + Debt_GDP * CPI, data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -652.22 -229.41 57.76 162.09 522.71
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5622.6850 2315.1048 2.429 0.0247 *
## CPI 17.4671 12.5321 1.394 0.1787
## Debt_GDP -218.9689 41.1441 -5.322 3.29e-05 ***
## CPI:Debt_GDP 1.2686 0.2105 6.026 6.84e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 343.8 on 20 degrees of freedom
## Multiple R-squared: 0.9893, Adjusted R-squared: 0.9877
## F-statistic: 617.6 on 3 and 20 DF, p-value: < 2.2e-16
#—————————————- Checking the assumptions for Linear regression: Normality of error distribution or large sample size Constant variance of errors Linearity Independence
## check out fitted values against the originals
#mr01
plot(train_data$Public4Cost, mr01$fitted.values, xlim = c(2500,20900), ylim = c(2500,20900))
abline(a=0, b=1)
hist(mr01$fitted.values,col="lightblue")
#Check for normality using residuals - Data looks skewed
hist(mr01$residuals,col="lightblue")
qqnorm(mr01$residuals,col="darkred")
qqline(mr01$residuals)
# Check for constant-variance
scatter.smooth(mr01$fitted.values, mr01$residuals,cex=0.5,col="darkred")
## MSE and RMSE
mean((mr01$fitted.values - train_data$Public4Cost)^2)
## [1] 98510.82
sqrt(mean((mr01$fitted.values - train_data$Public4Cost)^2))
## [1] 313.8643
Linear regression did not meet the normality assumptions. Also, the sample size is very small.
#Using log-tranformation of the response variable as residuals are not normal
mr02 <- (lm(log(Public4Cost) ~ CPI+Debt_GDP+CPI*Debt_GDP, data=train_data))
#——————————————–
## check out fitted values against the originals
#mr02
plot(log(train_data$Public4Cost), mr02$fitted.values, xlim = c(8,10), ylim = c(8,10))
abline(a=0, b=1)
# normality of errors
hist(mr02$fitted.values,col="lightblue")
#Check for normality using residuals - Data looks slightly normal
hist(mr02$residuals,col="lightblue")
qqnorm(mr02$residuals,col="darkred")
qqline(mr02$residuals)
# Check for constant-variance
scatter.smooth(exp(mr02$fitted.values), exp(mr02$residuals),cex=0.5,col="darkred")
## MSE and RMSE
mean((exp(mr02$fitted.values) - train_data$Public4Cost)^2)
## [1] 92019.36
sqrt(mean((exp(mr02$fitted.values) - train_data$Public4Cost)^2))
## [1] 303.3469
##Permutation Test for small sample before GLM
glm01 <- glm(Public4Cost ~ CPI + Debt_GDP + Debt_GDP * CPI,family = poisson,data =train_data)
set.seed(0)
beta=rep(NA,10000)
for(i in 1:10000){
x1=sample(train_data$CPI,size=length(train_data$Public4Cost),replace=FALSE)
x = x1+train_data$Debt_GDP+x1 * train_data$Debt_GDP
beta[i]=glm(Public4Cost ~ x, data=data.frame(Public4Cost=train_data$Public4Cost,x), family=poisson)$coef[2]
}
mean(abs(beta) > abs(glm01$coef[2]))
## [1] 0
hist(beta)
abline(v=glm01$coef[2],lty=2,col=2,lwd=2)
Permutation test p-value is statistically significant, so the sample size of 24 is good enough for doing GLM
D=summary(glm01)$deviance
df=summary(glm01)$df.residual
data.frame(D,df,dispersion=D/df)
## D df dispersion
## 1 181.5282 20 9.076408
glmsum = summary(glm01,dispersion=D/df)
glmsum$coef
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 6.612379e+00 2.798489e-01 23.628390 1.969005e-123
## CPI 1.375762e-02 1.401228e-03 9.818261 9.395011e-23
## Debt_GDP 6.018621e-03 4.859138e-03 1.238619 2.154866e-01
## CPI:Debt_GDP -3.085011e-05 2.347741e-05 -1.314034 1.888347e-01
## review the model summary
summary(glm01,dispersion=D/df)
##
## Call:
## glm(formula = Public4Cost ~ CPI + Debt_GDP + Debt_GDP * CPI,
## family = poisson, data = train_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -8.4761 -1.0160 -0.1702 0.7496 5.2770
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 6.612e+00 2.798e-01 23.628 <2e-16 ***
## CPI 1.376e-02 1.401e-03 9.818 <2e-16 ***
## Debt_GDP 6.019e-03 4.859e-03 1.239 0.215
## CPI:Debt_GDP -3.085e-05 2.348e-05 -1.314 0.189
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 9.076408)
##
## Null deviance: 27181.71 on 23 degrees of freedom
## Residual deviance: 181.53 on 20 degrees of freedom
## AIC: 447.46
##
## Number of Fisher Scoring iterations: 3
hist(glm01$fitted.values,col="lightblue")
# Conclusion: p < 0.05, therefore we reject null hypothesis and conclude that not all coefficient estimates are 0. There is association between cost and predictor variables in tandem
## check out fitted values against the originals
plot(train_data$Public4Cost, glm01$fitted.values, xlim = c(2500,20900), ylim = c(2500,20900),col="darkred")
abline(a=0, b=1)
## MSE and RMSE
mean((glm01$fitted.values - train_data$Public4Cost)^2)
## [1] 89654.2
sqrt(mean((glm01$fitted.values - train_data$Public4Cost)^2))
## [1] 299.4231
# Pearson and deviance residual plots
scatter.smooth(glm01$linear.predictors,residuals(glm01,type="deviance"),cex=0.5,col="darkred")
scatter.smooth(glm01$linear.predictors,residuals(glm01,type="pearson"),cex=0.5,col="darkred")
######################### ## RANDOM FOREST MODEL #########################
## first, let's run a RF using a set of "standard" inputs. Just to get a baseline
## set your seed so your model is reproducible
rf01 <- randomForest(Public4Cost ~ CPI + Debt_GDP + Debt_GDP * CPI,
data = train_data,
mtry = 2, ## roughly the standard pick for regression models
nodesize = 5, ## default for regression
maxnodes = NULL, ## again, default
ntree = 5000) ## 5 at first, to make sure it runs OK. Then enlarge
## review the model summary
hist(rf01$predicted,col="lightblue")
## check out fitted values against the originals
plot(train_data$Public4Cost, rf01$predicted, xlim = c(2500,20900), ylim = c(2500,20900),col="darkred")
abline(a=0, b=1)
## MSE and RMSE
mean((rf01$predicted - train_data$Public4Cost)^2)
## [1] 242561.4
sqrt(mean((rf01$predicted - train_data$Public4Cost)^2))
## [1] 492.5052
## rf01 and glm01
# Infer predictors - Infer how Debt_GDP and CPI predicts college 4 Year degree fees
pred_glm <- predict(glm01, newdata=test_data)
pred_rf <- predict(rf01, newdata=test_data)
pred_glm_exp <- exp(pred_glm)
hist(pred_rf)
hist(pred_glm_exp)
## RMSE
sqrt(mean(pred_glm_exp - test_data$Public4Cost)^2)
## [1] 636.6545
sqrt(mean(pred_rf - test_data$Public4Cost)^2)
## [1] 4740.12
plot(test_data$Public4Cost, pred_glm_exp,xlim = c(0, 30000), ylim = c(0, 30000))
abline(a=0,b=1,col="darkred")
plot(test_data$Public4Cost, pred_rf,xlim = c(0, 30000), ylim = c(0, 30000))
abline(a=0, b=1,col="darkred")
Comparing the 2 models – rf and GLM
#--------Another approach--------------------------
# Getting the parallel minimum between # 2 vectors consisting of ratio of predictedvalues to
# actual values and 1
probs_glm <- pmin((pred_glm_exp / test_data$Public4Cost), 1)
glm_accuracy = mean(probs_glm) * 100
glm_accuracy
## [1] 96.06059
probs_rf <- pmin((pred_rf / test_data$Public4Cost), 1)
rf_accuracy = mean(probs_rf) * 100
rf_accuracy
## [1] 74.12801
par(xpd=TRUE)
plot(test_data$Public4Cost, probs_glm , type = "b", xlab = "Education Costs", ylab = "Prediction Score",xlim = c(15100, 20500),ylim=c(0.5,1))
lines(test_data$Public4Cost, probs_rf , type = "b", col = "red",xlim = c(15100, 20500))
legend("bottomleft",c("GLM","RF"),lty=1:2,col=1:2,cex=0.65)
text(20000,0.92,"Accuracy: 96%",col="darkblue",cex=0.8)
text(20000,0.7,"Accuracy: 74%",col="darkblue",cex=0.8)
Let consider this scenario where based on the actual tuition fees, arbitrary bounds are chosen for utility calculation.
If the education saving is greater than 20900 and less than 23100, we don’t lose anything and we are safe. If we save less than 20900, we end up borrowing more in the future as predicted value is higher than this amount. Also, if we are saving more than the predicted amount, we compromise on the quality of life today by cancelling some family trips. In both the cases where I am not saving within the predicted range, let’s say I lose 10% of the value.
The below piece of code calculates the predicted tuition costs and the utilities based on the chosen bounds for all the 4 models.
## need to predict the tuition costs given 'Debt_GDP', 'Year' and Inflation index 'CPI' data. Predict tuition costs using test data and apply a utility function to it.
values_glm = exp(predict(glm01, newdata=test_data))
utilities_glm = ifelse(values_glm > 20900 && values_glm < 23100, values_glm * 1.0, values_glm * 0.9)
values_rf = predict(rf01, newdata=test_data)
utilities_rf = ifelse(values_glm > 20900 && values_glm < 23100, values_rf * 1.0, values_rf * 0.9)
# Put it in a data.frame
D_plot_glm = data.frame(values_glm, utilities_glm)
D_plot_rf = data.frame(values_rf, utilities_rf)
Comparison of Utility function for different models
tdata = data.frame(CPI=test_data$CPI,Debt_GDP=test_data$Debt_GDP,Year=test_data$Year)
df1 <- tdata %>% rowwise %>% do(W = as_tibble(.)) %>% ungroup
predicted_utility_glm = function(CPI,Debt_GDP,Year) {
# Posterior predictive samples (values)
values_glm = exp(predict(glm01, newdata=data.frame(CPI,Debt_GDP,Year), summary=FALSE))
# Associated utilities. Posterior predictive utilities
utilities_glm = ifelse(values_glm > 20900 && values_glm < 23100, values_glm * 1.0, values_glm * 0.9)
utilities_glm # Return it
}
predicted_utility_rf = function(CPI,Debt_GDP,Year) {
# Posterior predictive samples (values)
values_rf = predict(rf01, newdata=data.frame(CPI,Debt_GDP,Year), summary=FALSE)
# Associated utilities. Posterior predictive utilities?
utilities_rf = ifelse(values_rf > 20900 && values_rf < 23100, values_rf * 1.0, values_rf * 0.9)
utilities_rf # Return it
}
D_plot_rf = tdata %>%
mutate(rf_utility = predicted_utility_rf(tdata$CPI,tdata$Debt_GDP,tdata$Year)) %>% unnest
## Warning: `cols` is now required.
## Please use `cols = c()`
D_plot_glm = tdata %>%
mutate(glm_utility = predicted_utility_glm(tdata$CPI,tdata$Debt_GDP,tdata$Year)) %>% unnest
## Warning: `cols` is now required.
## Please use `cols = c()`
#D_plot_all_one <- merge(D_plot_glm, D_plot_rf, by=c("CPI","Debt_GDP","Year"))
D_plot_all <- merge(D_plot_glm, D_plot_rf, by=c("CPI","Debt_GDP","Year"))
D_plot_l <- melt(D_plot_all, id.vars = c("CPI","Debt_GDP","Year"))
p <- ggplot(data = D_plot_l, aes(x = Debt_GDP, y = value, group = variable, fill = variable))
p <- p + geom_bar(stat = "identity", width = 7, position = "dodge")
p <- p + facet_grid(. ~ CPI+Year)
p <- p + theme_bw()
p <- p + theme(axis.text.x = element_text(angle = 90))
p
glm model zoomed in
# Plot it
ggplot(D_plot_glm) +
# Histograms of posterior predictive
geom_histogram(aes(x=utilities_glm), bins=5, fill='yellow') +
geom_histogram(aes(x=values_glm), bins=5, col='blue', alpha=0.3) +
# Lines for eted utility
geom_vline(aes(xintercept=mean(utilities_glm), color='Expected Utility'), lwd=3, show.legend=TRUE) +
geom_vline(aes(xintercept=mean(values_glm), color='Expected Saving'), lwd=3) +
geom_vline(aes(xintercept=10000, color='Random Saving'), lwd=3) +
# Styling
labs(x = 'Savings (value)', y='', title='Expected Saving and Expected Utility') +
scale_x_continuous(breaks=seq(10000, 25000, by=1000)) +
theme_bw(13) +
theme(axis.ticks.y = element_blank(), axis.title.y = element_blank(), axis.text.y = element_blank()) +
# Legend
scale_color_manual(name = "", values = c('Expected Saving'='blue', 'Expected Utility' = 'red', 'Random Saving'='green'))
save.image("FinalModelEval.RData")
load("FinalModelEval.RData")