CLEANING seadata AND FEATURE ENGINEERING

# Read the Seattle_listsing.csv
sdata <- read.csv("Seattle_listings.csv",stringsAsFactors = FALSE)

# Select only the 10 variables we are interested in
seadata  <- sdata[,c("price","reviews_per_month","number_of_reviews","review_scores_rating","beds","bathrooms","bedrooms","property_type","room_type","square_feet")]

# Covert the room type and property type explicitly to factors
seadata$room_type = factor(sdata$room_type)
seadata$property_type = factor(sdata$property_type)

# Remove the dollar sign and any commas and convert them to numeric values
#grep("\\$",seadata$price,value=TRUE)
seadata$price <- as.numeric(gsub('\\$|,', '', seadata$price))

# Look for NA's in all the columns - get the numbers of entries for each column that are NA's
for(i in 1:dim(seadata)[2])
{
  print(paste(names(seadata)[i], length(which(is.na(seadata[,i])))))
}
## [1] "price 0"
## [1] "reviews_per_month 1261"
## [1] "number_of_reviews 0"
## [1] "review_scores_rating 1320"
## [1] "beds 3"
## [1] "bathrooms 2"
## [1] "bedrooms 7"
## [1] "property_type 0"
## [1] "room_type 0"
## [1] "square_feet 8620"
# As square feet column has almost 95 percent of data as N/A's, drop the column as we cannot use that column
drops <- c("square_feet")
seadata <- seadata[ ,!(names(seadata) %in% drops)]

# Look for NA's in all the columns - get the numbers of entries for each column that are NA's
for(i in 1:dim(seadata)[2])
{
  print(paste(names(seadata)[i], length(which(is.na(seadata[,i])))))
}
## [1] "price 0"
## [1] "reviews_per_month 1261"
## [1] "number_of_reviews 0"
## [1] "review_scores_rating 1320"
## [1] "beds 3"
## [1] "bathrooms 2"
## [1] "bedrooms 7"
## [1] "property_type 0"
## [1] "room_type 0"
# Drop records that have reviews_per_month,bedrooms and review_score_rating as N/As
# Also remove the records having price as 9999 and 0
seadata <- seadata[!is.na(seadata$reviews_per_month),]
seadata <- seadata[!is.na(seadata$review_scores_rating),]
seadata <- seadata[!is.na(seadata$bedrooms),]
seadata <- seadata[seadata$price != 9999 & seadata$price != 0,]

# Look for NA's
for(i in 1:dim(seadata)[2])
{
  print(paste(names(seadata)[i], length(which(is.na(seadata[,i])))))
}
## [1] "price 0"
## [1] "reviews_per_month 0"
## [1] "number_of_reviews 0"
## [1] "review_scores_rating 0"
## [1] "beds 0"
## [1] "bathrooms 0"
## [1] "bedrooms 0"
## [1] "property_type 0"
## [1] "room_type 0"
# Capping beds, bathrooms, bedrooms based on99th percentile
x <- seadata$bedrooms
quantiles <- quantile(x, .99)
x[x > quantiles] <- quantiles
seadata$bedrooms <- x

x <- seadata$bathrooms
quantiles <- quantile(x, .99)
x[x > quantiles] <- quantiles
seadata$bathrooms <- x

x <- seadata$beds
quantiles <- quantile(x, .99)
x[x > quantiles] <- quantiles
seadata$beds <- x

nrow(seadata)
## [1] 7697
# Histograms of numeric columns to analyse the distribution
# All the individual distributions appear to be skewed
hist(seadata$review_scores_rating)

hist(seadata$reviews_per_month)

hist(seadata$number_of_reviews)

hist(seadata$beds)

hist(seadata$bathrooms)

hist(seadata$bedrooms)

hist(seadata$price)

predictor_names <-c("price","review_scores_rating","reviews_per_month","number_of_reviews","beds","bathrooms","bedrooms","property_type","room_type")

for(pname in predictor_names)
{
  print(typeof(seadata[,pname]))
}
## [1] "double"
## [1] "integer"
## [1] "double"
## [1] "integer"
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "integer"
## [1] "integer"

Verify the association between price and predictor variables

# price and review score rating
plot(price ~ review_scores_rating, data=seadata, col="lightblue")
abline(lm(price ~ review_scores_rating, data=seadata)) 

summary(lm(price ~ review_scores_rating, data=seadata))
## 
## Call:
## lm(formula = price ~ review_scores_rating, data = seadata)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -329.3  -80.5  -45.0   20.0 4835.3 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          396.5848    25.0463  15.834   <2e-16 ***
## review_scores_rating  -2.4663     0.2627  -9.387   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 172.4 on 7695 degrees of freedom
## Multiple R-squared:  0.01132,    Adjusted R-squared:  0.01119 
## F-statistic: 88.11 on 1 and 7695 DF,  p-value: < 2.2e-16
# TEST FOR NULL HYPOTHESIS
# E(price) = alpha + beta * review_score_rating
# H0 : beta = 0 
n=nrow(seadata) 
Y=seadata$price
X=cbind(rep(1,n),seadata$review_scores_rating) 
p=ncol(X) 
beta=solve(t(X) %*% X) %*% t(X) %*% Y 
muhat= X %*% beta 
sigmasq=sum((Y - muhat)^2)/(n-p) 
covbeta=sigmasq * solve(t(X) %*% X) 
sebeta=sqrt(diag(covbeta))
data.frame(beta,sebeta)
##         beta     sebeta
## 1 396.584770 25.0463296
## 2  -2.466339  0.2627437
z=beta[2]/sebeta[2] 
p=2*pt(-abs(z),df=n-2) 
data.frame(z,p) 
##           z            p
## 1 -9.386862 7.984815e-21
#Conclusion: The null hypothesis test is statistically significant, therefore there exists a linear association between price and review_score_rating

#----------------------
# price and reviews_per_month
plot(price ~ reviews_per_month, data=seadata, col="lightblue")
abline(lm(price ~ reviews_per_month, data=seadata)) 

summary(lm(price ~ reviews_per_month, data=seadata))
## 
## Call:
## lm(formula = price ~ reviews_per_month, data = seadata)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -183.5  -84.0  -37.8   23.8 4815.9 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       195.6069     2.8004   69.85   <2e-16 ***
## reviews_per_month -14.3334     0.8657  -16.56   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 170.4 on 7695 degrees of freedom
## Multiple R-squared:  0.0344, Adjusted R-squared:  0.03427 
## F-statistic: 274.1 on 1 and 7695 DF,  p-value: < 2.2e-16
# E(price) = alpha + beta * reviews_per_month
# H0 : beta = 0 
n=nrow(seadata) 
Y=seadata$price
X=cbind(rep(1,n),seadata$reviews_per_month) 
p=ncol(X) 
beta=solve(t(X) %*% X) %*% t(X) %*% Y 
muhat= X %*% beta 
sigmasq=sum((Y - muhat)^2)/(n-p) 
covbeta=sigmasq * solve(t(X) %*% X) 
sebeta=sqrt(diag(covbeta))
data.frame(beta,sebeta)
##        beta    sebeta
## 1 195.60693 2.8003704
## 2 -14.33336 0.8657254
z=beta[2]/sebeta[2] 
# Significace calculated based on t-test
p=2*pt(-abs(z),df=n-2) 
data.frame(z,p) 
##           z            p
## 1 -16.55648 1.588257e-60
#Conclusion: The null hypothesis test is statistically significant, therefore there exists a linear association between price and reviews_per_month

#------------------------------

# price and number_of_reviews
plot(price ~ number_of_reviews, data=seadata, col="lightblue")
abline(lm(price ~ number_of_reviews, data=seadata))

summary(lm(price ~ number_of_reviews, data=seadata))
## 
## Call:
## lm(formula = price ~ number_of_reviews, data = seadata)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -170.3  -82.6  -41.2   20.7 4829.1 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       181.27156    2.43971   74.30   <2e-16 ***
## number_of_reviews  -0.32318    0.02474  -13.06   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 171.5 on 7695 degrees of freedom
## Multiple R-squared:  0.02169,    Adjusted R-squared:  0.02156 
## F-statistic: 170.6 on 1 and 7695 DF,  p-value: < 2.2e-16
# E(price) = alpha + beta * number_of_reviews
# H0 : beta = 0 
n=nrow(seadata) 
Y=seadata$price
X=cbind(rep(1,n),seadata$number_of_reviews) 
p=ncol(X) 
beta=solve(t(X) %*% X) %*% t(X) %*% Y 
muhat= X %*% beta 
sigmasq=sum((Y - muhat)^2)/(n-p) 
covbeta=sigmasq * solve(t(X) %*% X) 
sebeta=sqrt(diag(covbeta))
data.frame(beta,sebeta)
##         beta     sebeta
## 1 181.271561 2.43971315
## 2  -0.323177 0.02474146
z=beta[2]/sebeta[2] 
p=2*pt(-abs(z),df=n-2) 
data.frame(z,p) 
##           z            p
## 1 -13.06217 1.390693e-38
#Conclusion: The null hypothesis test is statistically significant, therefore there exists a linear association between price and number_of_reviews

#------------------------------
# price and beds
plot(price ~ beds, data=seadata, col="lightblue")
abline(lm(price ~ beds, data=seadata))

summary(lm(price ~ beds, data=seadata))
## 
## Call:
## lm(formula = price ~ beds, data = seadata)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -371.4  -68.3  -33.3   16.7 4876.7 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   79.400      3.319   23.92   <2e-16 ***
## beds          43.854      1.453   30.19   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 164 on 7695 degrees of freedom
## Multiple R-squared:  0.1059, Adjusted R-squared:  0.1058 
## F-statistic: 911.3 on 1 and 7695 DF,  p-value: < 2.2e-16
# E(price) = alpha + beta * beds
# H0 : beta = 0 
n=nrow(seadata) 
Y=seadata$price
X=cbind(rep(1,n),seadata$beds) 
p=ncol(X) 
beta=solve(t(X) %*% X) %*% t(X) %*% Y 
muhat= X %*% beta 
sigmasq=sum((Y - muhat)^2)/(n-p) 
covbeta=sigmasq * solve(t(X) %*% X) 
sebeta=sqrt(diag(covbeta))
data.frame(beta,sebeta)
##       beta   sebeta
## 1 79.40048 3.319052
## 2 43.85409 1.452734
z=beta[2]/sebeta[2] 
p=2*pt(-abs(z),df=n-2) 
data.frame(z,p) 
##          z             p
## 1 30.18728 2.707849e-189
#Conclusion: The null hypothesis test is statistically significant, therefore there exists a linear association between price and beds

#------------------------------

# price and bedrooms
plot(price ~ bedrooms, data=seadata, col="lightblue")
abline(lm(price ~ bedrooms, data=seadata))

summary(lm(price ~ bedrooms, data=seadata))
## 
## Call:
## lm(formula = price ~ bedrooms, data = seadata)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -325.8  -73.7  -36.7   17.4 4859.3 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   83.619      3.192   26.20   <2e-16 ***
## bedrooms      57.041      1.879   30.36   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 163.9 on 7695 degrees of freedom
## Multiple R-squared:  0.107,  Adjusted R-squared:  0.1068 
## F-statistic: 921.7 on 1 and 7695 DF,  p-value: < 2.2e-16
# E(price) = alpha + beta * bedrooms
# H0 : beta = 0 
n=nrow(seadata) 
Y=seadata$price
X=cbind(rep(1,n),seadata$bedrooms) 
p=ncol(X) 
beta=solve(t(X) %*% X) %*% t(X) %*% Y 
muhat= X %*% beta 
sigmasq=sum((Y - muhat)^2)/(n-p) 
covbeta=sigmasq * solve(t(X) %*% X) 
sebeta=sqrt(diag(covbeta))
data.frame(beta,sebeta)
##       beta   sebeta
## 1 83.61866 3.191925
## 2 57.04113 1.878884
z=beta[2]/sebeta[2] 
p=2*pt(-abs(z),df=n-2) 
data.frame(z,p) 
##          z             p
## 1 30.35904 2.585439e-191
#Conclusion: The null hypothesis test is statistically significant, therefore there exists a linear association between price and bedrooms

#------------------------------

# price and bathrooms
plot(price ~ bathrooms, data=seadata, col="lightblue")
abline(lm(price ~ bathrooms, data=seadata))

summary(lm(price ~ bathrooms, data=seadata))
## 
## Call:
## lm(formula = price ~ bathrooms, data = seadata)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -310.3  -65.4  -38.4   20.5 4861.5 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   58.516      4.483   13.05   <2e-16 ***
## bathrooms     79.952      3.132   25.53   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 166.5 on 7695 degrees of freedom
## Multiple R-squared:  0.07808,    Adjusted R-squared:  0.07796 
## F-statistic: 651.7 on 1 and 7695 DF,  p-value: < 2.2e-16
# E(price) = alpha + beta * bathrooms
# H0 : beta = 0 
n=nrow(seadata) 
Y=seadata$price
X=cbind(rep(1,n),seadata$bathrooms) 
p=ncol(X) 
beta=solve(t(X) %*% X) %*% t(X) %*% Y 
muhat= X %*% beta 
sigmasq=sum((Y - muhat)^2)/(n-p) 
covbeta=sigmasq * solve(t(X) %*% X) 
sebeta=sqrt(diag(covbeta))
data.frame(beta,sebeta)
##       beta   sebeta
## 1 58.51591 4.483052
## 2 79.95192 3.131879
z=beta[2]/sebeta[2] 
#p=2*pt(z,df=n-2,lower.tail=FALSE) 
p=2*pt(-abs(z),df=n-2) 
data.frame(z,p) 
##          z             p
## 1 25.52842 4.691466e-138
#Conclusion: The null hypothesis test is statistically significant, therefore there exists a linear association between price and bathrooms

#------------------------------

# price and room_type
plot(price ~ room_type, data=seadata, col="lightblue")
abline(lm(price ~ factor(room_type), data=seadata))
## Warning in abline(lm(price ~ factor(room_type), data = seadata)): only
## using the first two of 4 regression coefficients

summary(aov(price ~ factor(room_type), data=seadata))
##                     Df    Sum Sq Mean Sq F value Pr(>F)    
## factor(room_type)    3  18987575 6329192   229.3 <2e-16 ***
## Residuals         7693 212348694   27603                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# F-test using anova
seadata$I_home = as.numeric(seadata$room_type== 'Entire home/apt')
seadata$I_private = as.numeric(seadata$room_type== 'Private room')
seadata$I_shared = as.numeric(seadata$room_type== 'Shared room')
full = (lm(price ~ I_home  + I_private + I_shared, data=seadata))
reduced = (lm(price ~ 1, data=seadata))
anova(reduced,full)
## Analysis of Variance Table
## 
## Model 1: price ~ 1
## Model 2: price ~ I_home + I_private + I_shared
##   Res.Df       RSS Df Sum of Sq      F    Pr(>F)    
## 1   7696 231336270                                  
## 2   7693 212348694  3  18987575 229.29 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Conclusion: The null hypothesis test is statistically significant, therefore there exists a linear association between price and room_type

#------------------------------

# price and property_type
plot(price ~ property_type, data=seadata, col="lightblue")
abline(lm(price ~ factor(property_type), data=seadata))
## Warning in abline(lm(price ~ factor(property_type), data = seadata)): only
## using the first two of 26 regression coefficients

summary(aov(price ~ factor(property_type), data=seadata))
##                         Df    Sum Sq Mean Sq F value Pr(>F)    
## factor(property_type)   25   8772988  350920    12.1 <2e-16 ***
## Residuals             7671 222563281   29014                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Conclusion: The null hypothesis test is statistically significant, therefore there exists a linear association between price and property_type
# CHECK FOR POSSIBLE CORRELATION BETWEEN OUTPUT AND PREDICTOR VARIABLES
cor.test(seadata$price,seadata$number_of_reviews, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  seadata$price and seadata$number_of_reviews
## t = -13.062, df = 7695, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1690662 -0.1253533
## sample estimates:
##        cor 
## -0.1472817
cor.test(seadata$price,seadata$review_scores_rating, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  seadata$price and seadata$review_scores_rating
## t = -9.3869, df = 7695, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.12843608 -0.08425997
## sample estimates:
##        cor 
## -0.1064005
cor.test(seadata$price,seadata$reviews_per_month, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  seadata$price and seadata$reviews_per_month
## t = -16.556, df = 7695, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2069488 -0.1638032
## sample estimates:
##        cor 
## -0.1854654
cor.test(seadata$price,seadata$bedrooms, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  seadata$price and seadata$bedrooms
## t = 30.359, df = 7695, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3069551 0.3468596
## sample estimates:
##       cor 
## 0.3270531
cor.test(seadata$price,seadata$bathrooms, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  seadata$price and seadata$bathrooms
## t = 25.528, df = 7695, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2586999 0.2998946
## sample estimates:
##       cor 
## 0.2794258
cor.test(seadata$price,seadata$beds, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  seadata$price and seadata$beds
## t = 30.187, df = 7695, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3052776 0.3452303
## sample estimates:
##       cor 
## 0.3253992
#Interactions between predictors
summary(lm(price ~ review_scores_rating * reviews_per_month, data=seadata))
## 
## Call:
## lm(formula = price ~ review_scores_rating * reviews_per_month, 
##     data = seadata)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -354.4  -81.8  -36.5   22.4 4815.5 
## 
## Coefficients:
##                                        Estimate Std. Error t value
## (Intercept)                            443.7847    29.8128  14.886
## review_scores_rating                    -2.6300     0.3143  -8.367
## reviews_per_month                      -84.5741    17.8577  -4.736
## review_scores_rating:reviews_per_month   0.7397     0.1856   3.986
##                                        Pr(>|t|)    
## (Intercept)                             < 2e-16 ***
## review_scores_rating                    < 2e-16 ***
## reviews_per_month                      2.22e-06 ***
## review_scores_rating:reviews_per_month 6.79e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 169.6 on 7693 degrees of freedom
## Multiple R-squared:  0.04319,    Adjusted R-squared:  0.04282 
## F-statistic: 115.7 on 3 and 7693 DF,  p-value: < 2.2e-16
# p < 0.05, therefore there is strong evidence of interaction between review_scores_rating and reviews_per_month

summary(lm(price ~ reviews_per_month * number_of_reviews, data=seadata))
## 
## Call:
## lm(formula = price ~ reviews_per_month * number_of_reviews, data = seadata)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -228.3  -82.7  -33.9   21.7 4817.1 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)
## (Intercept)                         207.33444    3.18985  64.998  < 2e-16
## reviews_per_month                   -14.98059    1.19280 -12.559  < 2e-16
## number_of_reviews                    -0.44092    0.05601  -7.873 3.94e-15
## reviews_per_month:number_of_reviews   0.06338    0.00888   7.138 1.04e-12
##                                        
## (Intercept)                         ***
## reviews_per_month                   ***
## number_of_reviews                   ***
## reviews_per_month:number_of_reviews ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 169.7 on 7693 degrees of freedom
## Multiple R-squared:  0.04227,    Adjusted R-squared:  0.0419 
## F-statistic: 113.2 on 3 and 7693 DF,  p-value: < 2.2e-16
# p < 0.05, therefore there is strong evidence of interaction between number_of_reviews and reviews_per_month

summary(lm(price ~ review_scores_rating * number_of_reviews, data=seadata))
## 
## Call:
## lm(formula = price ~ review_scores_rating * number_of_reviews, 
##     data = seadata)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -331.9  -81.5  -39.8   22.7 4828.7 
## 
## Coefficients:
##                                          Estimate Std. Error t value
## (Intercept)                            397.449345  26.245037  15.144
## review_scores_rating                    -2.287949   0.276576  -8.272
## number_of_reviews                       -2.183898   0.716406  -3.048
## review_scores_rating:number_of_reviews   0.019536   0.007416   2.634
##                                        Pr(>|t|)    
## (Intercept)                             < 2e-16 ***
## review_scores_rating                    < 2e-16 ***
## number_of_reviews                       0.00231 ** 
## review_scores_rating:number_of_reviews  0.00845 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 170.8 on 7693 degrees of freedom
## Multiple R-squared:  0.03032,    Adjusted R-squared:  0.02994 
## F-statistic: 80.18 on 3 and 7693 DF,  p-value: < 2.2e-16
# p < 0.05, therefore there is strong evidence of interaction between number_of_reviews and reviews_per_month

summary(aov(price ~ beds*factor(property_type), data=seadata))
##                              Df    Sum Sq  Mean Sq F value   Pr(>F)    
## beds                          1  24494959 24494959 985.518  < 2e-16 ***
## factor(property_type)        25  14503364   580135  23.341  < 2e-16 ***
## beds:factor(property_type)   20   2197903   109895   4.421 1.64e-10 ***
## Residuals                  7650 190140043    24855                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p < 0.05, therefore there is strong evidence of interaction between beds and property_type

summary(aov(price ~ bedrooms*factor(property_type), data=seadata))
##                                  Df    Sum Sq  Mean Sq  F value Pr(>F)    
## bedrooms                          1  24744597 24744597 1021.685 <2e-16 ***
## factor(property_type)            25  18070135   722805   29.844 <2e-16 ***
## bedrooms:factor(property_type)   18   3194649   177481    7.328 <2e-16 ***
## Residuals                      7652 185326888    24219                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p < 0.05, therefore there is strong evidence of interaction between bedrooms and property_type

summary(aov(price ~ bathrooms*factor(property_type), data=seadata))
##                                   Df    Sum Sq  Mean Sq F value Pr(>F)    
## bathrooms                          1  18062456 18062456  706.86 <2e-16 ***
## factor(property_type)             25  14603117   584125   22.86 <2e-16 ***
## bathrooms:factor(property_type)   18   3137098   174283    6.82 <2e-16 ***
## Residuals                       7652 195533598    25553                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p < 0.05, therefore there is strong evidence of interaction between bathrooms and property_type

summary(aov(price ~ bathrooms*factor(room_type), data=seadata))
##                               Df    Sum Sq  Mean Sq F value Pr(>F)    
## bathrooms                      1  18062456 18062456  739.16 <2e-16 ***
## factor(room_type)              3  19901106  6633702  271.47 <2e-16 ***
## bathrooms:factor(room_type)    3   5480877  1826959   74.76 <2e-16 ***
## Residuals                   7689 187891830    24436                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p < 0.05, therefore there is strong evidence of interaction between bathrooms and room_type

summary(aov(price ~ bedrooms*factor(room_type), data=seadata))
##                              Df    Sum Sq  Mean Sq F value Pr(>F)    
## bedrooms                      1  24744597 24744597 983.023 <2e-16 ***
## factor(room_type)             3  12794900  4264967 169.433 <2e-16 ***
## bedrooms:factor(room_type)    2    224504   112252   4.459 0.0116 *  
## Residuals                  7690 193572269    25172                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p < 0.05, therefore there is strong evidence of interaction between bedrooms and room_type

summary(aov(price ~ beds*factor(room_type), data=seadata))
##                          Df    Sum Sq  Mean Sq F value   Pr(>F)    
## beds                      1  24494959 24494959  969.62  < 2e-16 ***
## factor(room_type)         3  10844244  3614748  143.09  < 2e-16 ***
## beds:factor(room_type)    3   1753578   584526   23.14 6.63e-15 ***
## Residuals              7689 194243488    25263                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p < 0.05, therefore there is strong evidence of interaction between beds and room_type

summary(aov(price ~ reviews_per_month*factor(room_type), data=seadata))
##                                       Df    Sum Sq Mean Sq F value
## reviews_per_month                      1   7957368 7957368  303.30
## factor(room_type)                      3  20160989 6720330  256.15
## reviews_per_month:factor(room_type)    3   1489864  496621   18.93
## Residuals                           7689 201728049   26236        
##                                       Pr(>F)    
## reviews_per_month                    < 2e-16 ***
## factor(room_type)                    < 2e-16 ***
## reviews_per_month:factor(room_type) 3.16e-12 ***
## Residuals                                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p < 0.05, therefore there is strong evidence of interaction between reviews_per_month and room_type

summary(aov(price ~ number_of_reviews*factor(room_type), data=seadata))
##                                       Df    Sum Sq Mean Sq F value
## number_of_reviews                      1   5018122 5018122  187.05
## factor(room_type)                      3  18768175 6256058  233.19
## number_of_reviews:factor(room_type)    3   1267214  422405   15.74
## Residuals                           7689 206282758   26828        
##                                       Pr(>F)    
## number_of_reviews                    < 2e-16 ***
## factor(room_type)                    < 2e-16 ***
## number_of_reviews:factor(room_type) 3.32e-10 ***
## Residuals                                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p < 0.05, therefore there is strong evidence of interaction between number_of_reviews and room_type

summary(aov(price ~ review_scores_rating*factor(room_type), data=seadata))
##                                          Df    Sum Sq Mean Sq F value
## review_scores_rating                      1   2618975 2618975   96.91
## factor(room_type)                         3  19369803 6456601  238.91
## review_scores_rating:factor(room_type)    3   1553888  517963   19.17
## Residuals                              7689 207793604   27025        
##                                          Pr(>F)    
## review_scores_rating                    < 2e-16 ***
## factor(room_type)                       < 2e-16 ***
## review_scores_rating:factor(room_type) 2.23e-12 ***
## Residuals                                          
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p < 0.05, therefore there is strong evidence of interaction between review_score_rating and room_type

summary(aov(price ~ reviews_per_month*factor(property_type), data=seadata))
##                                           Df    Sum Sq Mean Sq F value
## reviews_per_month                          1   7957368 7957368 283.619
## factor(property_type)                     25   6676429  267057   9.519
## reviews_per_month:factor(property_type)   23   2154195   93661   3.338
## Residuals                               7647 214548278   28057        
##                                           Pr(>F)    
## reviews_per_month                        < 2e-16 ***
## factor(property_type)                    < 2e-16 ***
## reviews_per_month:factor(property_type) 1.16e-07 ***
## Residuals                                           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p < 0.05, therefore there is strong evidence of interaction between reviews_per_month and property_type

summary(aov(price ~ number_of_reviews*factor(property_type), data=seadata))
##                                           Df    Sum Sq Mean Sq F value
## number_of_reviews                          1   5018122 5018122 176.923
## factor(property_type)                     25   7478598  299144  10.547
## number_of_reviews:factor(property_type)   23   1945017   84566   2.982
## Residuals                               7647 216894532   28363        
##                                           Pr(>F)    
## number_of_reviews                        < 2e-16 ***
## factor(property_type)                    < 2e-16 ***
## number_of_reviews:factor(property_type) 2.17e-06 ***
## Residuals                                           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p < 0.05, therefore there is strong evidence of interaction between number_of_reviews and property_type

summary(aov(price ~ review_scores_rating*factor(property_type), data=seadata))
##                                              Df    Sum Sq Mean Sq F value
## review_scores_rating                          1   2618975 2618975  92.711
## factor(property_type)                        25   7425216  297009  10.514
## review_scores_rating:factor(property_type)   23   5272832  229254   8.115
## Residuals                                  7647 216019247   28249        
##                                            Pr(>F)    
## review_scores_rating                       <2e-16 ***
## factor(property_type)                      <2e-16 ***
## review_scores_rating:factor(property_type) <2e-16 ***
## Residuals                                            
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p < 0.05, therefore there is strong evidence of interaction between review_scores_rating and property_type

summary(aov(price ~ reviews_per_month*beds, data=seadata))
##                          Df    Sum Sq  Mean Sq F value   Pr(>F)    
## reviews_per_month         1   7957368  7957368  308.15  < 2e-16 ***
## beds                      1  23598436 23598436  913.85  < 2e-16 ***
## reviews_per_month:beds    1   1123633  1123633   43.51 4.49e-11 ***
## Residuals              7693 198656833    25823                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p < 0.05, therefore there is strong evidence of interaction between beds and reviews_per_month

summary(aov(price ~ reviews_per_month*bedrooms, data=seadata))
##                              Df    Sum Sq  Mean Sq F value   Pr(>F)    
## reviews_per_month             1   7957368  7957368  304.95  < 2e-16 ***
## bedrooms                      1  21449766 21449766  822.01  < 2e-16 ***
## reviews_per_month:bedrooms    1   1186626  1186626   45.48 1.66e-11 ***
## Residuals                  7693 200742510    26094                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p < 0.05, therefore there is strong evidence of interaction between bedrooms and reviews_per_month

summary(aov(price ~ reviews_per_month*bathrooms, data=seadata))
##                               Df    Sum Sq  Mean Sq F value Pr(>F)    
## reviews_per_month              1   7957368  7957368 294.347 <2e-16 ***
## bathrooms                      1  15265018 15265018 564.660 <2e-16 ***
## reviews_per_month:bathrooms    1    141319   141319   5.227 0.0223 *  
## Residuals                   7693 207972564    27034                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p < 0.05, therefore there is strong evidence of interaction between bathrooms and reviews_per_month

summary(aov(price ~ number_of_reviews*bedrooms, data=seadata))
##                              Df    Sum Sq  Mean Sq F value   Pr(>F)    
## number_of_reviews             1   5018122  5018122  189.76  < 2e-16 ***
## bedrooms                      1  22469187 22469187  849.67  < 2e-16 ***
## number_of_reviews:bedrooms    1    411585   411585   15.56 8.05e-05 ***
## Residuals                  7693 203437375    26444                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p < 0.05, therefore there is strong evidence of interaction between bedrooms and number_of_reviews

summary(aov(price ~ number_of_reviews*beds, data=seadata))
##                          Df    Sum Sq  Mean Sq F value   Pr(>F)    
## number_of_reviews         1   5018122  5018122   190.2  < 2e-16 ***
## beds                      1  23045612 23045612   873.5  < 2e-16 ***
## number_of_reviews:beds    1    316601   316601    12.0 0.000535 ***
## Residuals              7693 202955934    26382                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p < 0.05, therefore there is strong evidence of interaction between beds and number_of_reviews

summary(aov(price ~ number_of_reviews*bathrooms, data=seadata))
##                               Df    Sum Sq  Mean Sq F value Pr(>F)    
## number_of_reviews              1   5018122  5018122 183.760 <2e-16 ***
## bathrooms                      1  16178157 16178157 592.433 <2e-16 ***
## number_of_reviews:bathrooms    1     59718    59718   2.187  0.139    
## Residuals                   7693 210080272    27308                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p < 0.05, therefore there is strong evidence of interaction between bathrooms and number_of_reviews

summary(aov(price ~ review_scores_rating*bathrooms, data=seadata))
##                                  Df    Sum Sq  Mean Sq F value   Pr(>F)
## review_scores_rating              1   2618975  2618975   96.55  < 2e-16
## bathrooms                         1  18290922 18290922  674.27  < 2e-16
## review_scores_rating:bathrooms    1   1738302  1738302   64.08 1.37e-15
## Residuals                      7693 208688071    27127                 
##                                   
## review_scores_rating           ***
## bathrooms                      ***
## review_scores_rating:bathrooms ***
## Residuals                         
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p < 0.05, therefore there is strong evidence of interaction between bathrooms and review_scores_rating

summary(aov(price ~ review_scores_rating*beds, data=seadata))
##                             Df    Sum Sq  Mean Sq F value   Pr(>F)    
## review_scores_rating         1   2618975  2618975   99.05  < 2e-16 ***
## beds                         1  24989683 24989683  945.09  < 2e-16 ***
## review_scores_rating:beds    1    312299   312299   11.81 0.000592 ***
## Residuals                 7693 203415314    26442                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p < 0.05, therefore there is strong evidence of interaction between beds and review_scores_rating

summary(aov(price ~ review_scores_rating*bedrooms, data=seadata))
##                                 Df    Sum Sq  Mean Sq F value   Pr(>F)    
## review_scores_rating             1   2618975  2618975   99.66  < 2e-16 ***
## bedrooms                         1  26171631 26171631  995.87  < 2e-16 ***
## review_scores_rating:bedrooms    1    371456   371456   14.13 0.000171 ***
## Residuals                     7693 202174208    26280                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p < 0.05, therefore there is strong evidence of interaction between review_scores_rating and bedrooms

Linear regression

mr01 <-(lm(price ~ (review_scores_rating + reviews_per_month + number_of_reviews + beds + bathrooms + bedrooms + factor(property_type) + factor(room_type))^2, data=seadata))
# plot  of fitted values and summary
#summary(mr01)
hist(mr01$fitted.values)

#Fitted vs originals
plot(seadata$price, mr01$fitted.values,col="lightblue")
abline(a=0, b=1)

## MSE and RMSE
mean((mr01$fitted.values - seadata$price)^2)
## [1] 20362.3
sqrt(mean((mr01$fitted.values - seadata$price)^2))
## [1] 142.6965
#------------------------------------

# F-test using anova
full = mr01
reduced = (lm(price ~ 1, data=seadata))
anova(reduced,full)
## Analysis of Variance Table
## 
## Model 1: price ~ 1
## Model 2: price ~ (review_scores_rating + reviews_per_month + number_of_reviews + 
##     beds + bathrooms + bedrooms + factor(property_type) + factor(room_type))^2
##   Res.Df       RSS  Df Sum of Sq      F    Pr(>F)    
## 1   7696 231336270                                   
## 2   7492 156728659 204  74607611 17.483 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Conclusion: The null hypothesis test is statistically significant, therefore not all regression coefficients are 0.

#—————————————————-

Checking the assumptions for linear regression:  Normality of error distribution or large sample size  Constant variance of errors  Linearity  Independence

#Check for normality using residuals
hist(mr01$residuals)

qqnorm(mr01$residuals)
qqline(mr01$residuals)

# Checking the constant variance assumption of Linear regression
scatter.smooth(mr01$fitted.values, mr01$residuals,cex=0.5,col="lightblue")

# LINEAR REGRESSION ASSUMPTIONS NOT MET

Log tranformed price linear model

#lm function throws error
mr02 <- lm(log(price) ~ (review_scores_rating + reviews_per_month + number_of_reviews + beds + bathrooms + bedrooms + factor(property_type) + factor(room_type))^2, data=seadata)
# plot  of fitted values and summary
#summary(mr02)
hist(mr02$fitted.values)

#plot(mr01)

#Fitted vs originals
plot(seadata$price, exp(mr02$fitted.values),col="lightblue")
abline(a=0, b=1)

## MSE and RMSE
mean((exp(mr02$fitted.values) - seadata$price)^2)
## [1] 21456.52
sqrt(mean((exp(mr02$fitted.values) - seadata$price)^2))
## [1] 146.4804
#Check for normality using residuals
hist(mr02$residuals)

qqnorm(mr02$residuals)
qqline(mr02$residuals)

# Checking the constant variance assumption of Linear regression
scatter.smooth(mr02$fitted.values, mr02$residuals,cex=0.5,col='lightblue')

# Logtrsformed price model meets th assumptions of linear regression

LOG-LINEAR REGRESSION

# price and review score rating
glm01=glm(price ~ review_scores_rating, data=seadata,family=poisson)
D=summary(glm01)$deviance
df=summary(glm01)$df.residual
data.frame(D,df,dispersion=D/df)
##          D   df dispersion
## 1 808238.2 7695   105.0342
glmsum = summary(glm01,dispersion=D/df)
glmsum$coef
##                         Estimate   Std. Error   z value     Pr(>|z|)
## (Intercept)           6.23923668 0.0928332946  67.20904 0.000000e+00
## review_scores_rating -0.01215559 0.0009808655 -12.39272 2.861862e-35
#plotting the fitted model
plot(seadata$review_scores_rating,seadata$price,cex=0.5,col="lightblue")
x=min(seadata$review_scores_rating):max(seadata$review_scores_rating)
y=exp(glmsum$coef[1]+glmsum$coef[2]*x)
lines(x,y,col=4)

#Coefficient estimate of review score rating = -0.01215559 which is the average difference in the log of mean price per unit difference in the review score rating.
exp(glmsum$coef[2]) 
## [1] 0.987918
# exp(review score rating estimate) = 0.987918 is the ratio of mean prices for 2 houses differing by a unit review_score_rating.

# As the assumed mean-variance relationship might not be valid for log-linear regression, use robust se. Price vs review_scores_rating exhibits heteroscedasticity.
v <- vcovHC(glm01,dispersion=D/df)
robust.se <- sqrt(diag(v))
fit=round(cbind(glmsum$coef,robust.se),4)
fit
##                      Estimate Std. Error  z value Pr(>|z|) robust.se
## (Intercept)            6.2392     0.0928  67.2090        0    0.1700
## review_scores_rating  -0.0122     0.0010 -12.3927        0    0.0018
z = (fit[,1][2]/fit[,5][2])
p_value = 2 * pnorm(-abs(z)) 
p_value
## review_scores_rating 
##         1.220384e-11
#Conclusion: The null hypothesis test is statistically significant, therefore there exists a linear association between price and review_score_rating

#----------------------
# price and reviews_per_month
glm01=glm(price ~ reviews_per_month, data=seadata,family=poisson)
D=summary(glm01)$deviance
df=summary(glm01)$df.residual
data.frame(D,df,dispersion=D/df)
##          D   df dispersion
## 1 768457.9 7695   99.86457
glmsum = summary(glm01,dispersion=D/df)
glmsum$coef
##                     Estimate  Std. Error   z value      Pr(>|z|)
## (Intercept)        5.3015947 0.012451283 425.78704  0.000000e+00
## reviews_per_month -0.1014198 0.004593682 -22.07812 5.130396e-108
#plotting the fitted model
plot(seadata$reviews_per_month,seadata$price,cex=0.5,col="lightblue")
x=min(seadata$reviews_per_month):max(seadata$reviews_per_month)
y=exp(glmsum$coef[1]+glmsum$coef[2]*x)
lines(x,y,col=4)

#Coefficient estimate of reviews_per_month = -0.1014 which is the average difference in the log of mean price per unit difference in the reviews_per_month.
exp(glmsum$coef[2]) 
## [1] 0.9035536
# exp(reviews_per_month estimate) = 0.9035536 is the ratio of mean prices for 2 houses differing by a unit reviews_per_month (or) mean price decreases by approx 10% for each difference of unit reviews_per_month

# As the assumed mean-variance relationship might not be valid for log-linear regression, use robust se. 
v <- vcovHC(glm01,dispersion=D/df)
robust.se <- sqrt(diag(v))
fit=round(cbind(glmsum$coef,robust.se),4)
fit
##                   Estimate Std. Error  z value Pr(>|z|) robust.se
## (Intercept)         5.3016     0.0125 425.7870        0    0.0189
## reviews_per_month  -0.1014     0.0046 -22.0781        0    0.0049
z = (fit[,1][2]/fit[,5][2])
p_value = 2 * pnorm(-abs(z)) 
p_value
## reviews_per_month 
##      3.932789e-95
#Conclusion: The null hypothesis test is statistically significant, therefore there exists a linear association between price and reviews_per_month

#------------------------------

# price and number_of_reviews
glm01=glm(price ~ number_of_reviews, data=seadata,family=poisson)
D=summary(glm01)$deviance
df=summary(glm01)$df.residual
data.frame(D,df,dispersion=D/df)
##          D   df dispersion
## 1 786358.7 7695   102.1909
glmsum = summary(glm01,dispersion=D/df)
glmsum$coef
##                       Estimate   Std. Error   z value     Pr(>|z|)
## (Intercept)        5.218975280 0.0112378131 464.41200 0.000000e+00
## number_of_reviews -0.002486994 0.0001431522 -17.37308 1.319573e-67
#plotting the fitted model
plot(seadata$number_of_reviews,seadata$price,cex=0.5,col="lightblue")
x=min(seadata$number_of_reviews):max(seadata$number_of_reviews)
y=exp(glmsum$coef[1]+glmsum$coef[2]*x)
lines(x,y,col=4)

#Coefficient estimate of number_of_reviews = -0.002486994 which is the average difference in the log of mean price per unit difference in the number_of_reviews.
exp(glmsum$coef[2]) 
## [1] 0.9975161
# exp(number_of_reviews estimate) = 0.9975161 is the ratio of mean prices for 2 houses differing by 1 number_of_reviews (or)  mean price decreases by approx 1% for each difference of unit number_of_reviews

# As the assumed mean-variance relationship might not be valid for log-linear regression, use robust se. 
v <- vcovHC(glm01,dispersion=D/df)
robust.se <- sqrt(diag(v))
fit=round(cbind(glmsum$coef,robust.se),4)
fit
##                   Estimate Std. Error  z value Pr(>|z|) robust.se
## (Intercept)         5.2190     0.0112 464.4120        0    0.0161
## number_of_reviews  -0.0025     0.0001 -17.3731        0    0.0001
z = (fit[,1][2]/fit[,5][2])
p_value = 2 * pnorm(-abs(z)) 
p_value
## number_of_reviews 
##     6.113393e-138
#Conclusion: The null hypothesis test is statistically significant, therefore there exists a linear association between price and number_of_reviews

#------------------------------

# price and beds
glm01=glm(price ~ beds, data=seadata,family=poisson)
D=summary(glm01)$deviance
df=summary(glm01)$df.residual
data.frame(D,df,dispersion=D/df)
##        D   df dispersion
## 1 694906 7695   90.30617
glmsum = summary(glm01,dispersion=D/df)
glmsum$coef
##              Estimate  Std. Error   z value Pr(>|z|)
## (Intercept) 4.6467001 0.014866070 312.57084        0
## beds        0.2111262 0.005220622  40.44082        0
#plotting the fitted model
plot(seadata$beds,seadata$price,cex=0.5,col="lightblue")
x=min(seadata$beds):max(seadata$beds)
y=exp(glmsum$coef[1]+glmsum$coef[2]*x)
lines(x,y,col=4)

#Coefficient estimate of beds = 0.2111262 which is the average difference in the log of mean price per unit difference in the beds.
exp(glmsum$coef[2]) 
## [1] 1.235068
# exp(beds estimate) = 1.235068 is the ratio of mean prices for 2 houses differing by 1 beds (or) mean price increases by 23% for each difference of 1 bed

# As the assumed mean-variance relationship might not be valid for log-linear regression, use robust se. 
v <- vcovHC(glm01,dispersion=D/df)
robust.se <- sqrt(diag(v))
fit=round(cbind(glmsum$coef,robust.se),4)
fit
##             Estimate Std. Error  z value Pr(>|z|) robust.se
## (Intercept)   4.6467     0.0149 312.5708        0    0.0200
## beds          0.2111     0.0052  40.4408        0    0.0067
z = (fit[,1][2]/fit[,5][2])
p_value = 2 * pnorm(-abs(z)) 
p_value
##          beds 
## 6.864939e-218
#Conclusion: The null hypothesis test is not statistically significant, therefore there is no linear association between price and beds

#------------------------------

# price and bedrooms
glm01=glm(price ~ bedrooms, data=seadata,family=poisson)
D=summary(glm01)$deviance
df=summary(glm01)$df.residual
data.frame(D,df,dispersion=D/df)
##          D   df dispersion
## 1 687540.9 7695   89.34904
glmsum = summary(glm01,dispersion=D/df)
glmsum$coef
##              Estimate  Std. Error   z value Pr(>|z|)
## (Intercept) 4.6374116 0.014931909 310.57057        0
## bedrooms    0.2929199 0.007132322  41.06936        0
#plotting the fitted model
plot(seadata$bedrooms,seadata$price,cex=0.5,col="lightblue")
x=min(seadata$bedrooms):max(seadata$bedrooms)
y=exp(glmsum$coef[1]+glmsum$coef[2]*x)
lines(x,y,col=4)

#Coefficient estimate of bedrooms = 0.2929199  which is the average difference in the log of mean price per unit difference in the bedrooms.
exp(glmsum$coef[2]) 
## [1] 1.340335
# exp(bedrooms estimate) = 1.340335 is the ratio of mean prices for 2 houses differing by 1 bedroom (or) mean price increases by 34% for each difference of 1 bedroom

# As the assumed mean-variance relationship might not be valid for log-linear regression, use robust se. Data also exhibits Heteroscedasticity.
v <- vcovHC(glm01,dispersion=D/df)
robust.se <- sqrt(diag(v))
fit=round(cbind(glmsum$coef,robust.se),4)
fit
##             Estimate Std. Error  z value Pr(>|z|) robust.se
## (Intercept)   4.6374     0.0149 310.5706        0    0.0213
## bedrooms      0.2929     0.0071  41.0694        0    0.0089
z = (fit[,1][2]/fit[,5][2])
p_value = 2 * pnorm(-abs(z)) 
p_value
##     bedrooms 
## 1.57527e-237
#Conclusion: The null hypothesis test is statistically significant, therefore there exists a linear association between price and bedrooms

#------------------------------

# price and bathrooms
glm01=glm(price ~ bathrooms, data=seadata,family=poisson)
D=summary(glm01)$deviance
df=summary(glm01)$df.residual
data.frame(D,df,dispersion=D/df)
##          D   df dispersion
## 1 727940.7 7695   94.59918
glmsum = summary(glm01,dispersion=D/df)
glmsum$coef
##              Estimate Std. Error   z value      Pr(>|z|)
## (Intercept) 4.5536227 0.01904143 239.14286  0.000000e+00
## bathrooms   0.3876911 0.01145963  33.83102 6.901393e-251
#plotting the fitted model
plot(seadata$bathrooms,seadata$price,cex=0.5,col="lightblue")
x=min(seadata$bathrooms):max(seadata$bathrooms)
y=exp(glmsum$coef[1]+glmsum$coef[2]*x)
lines(x,y,col=4)

#Coefficient estimate of bathrooms = 0.3876911 which is the average difference in the log of mean price per unit difference in the bathrooms.
exp(glmsum$coef[2]) 
## [1] 1.473575
# exp(bathrooms estimate) = 1.473575 is the ratio of mean prices for 2 houses differing by 1 bathrooms (or) mean price increases by 47% for each difference of 1 bathroom

# As the assumed mean-variance relationship might not be valid for log-linear regression, use robust se. 
v <- vcovHC(glm01,dispersion=D/df)
robust.se <- sqrt(diag(v))
fit=round(cbind(glmsum$coef,robust.se),4)
fit
##             Estimate Std. Error  z value Pr(>|z|) robust.se
## (Intercept)   4.5536     0.0190 239.1429        0    0.0260
## bathrooms     0.3877     0.0115  33.8310        0    0.0159
z = (fit[,1][2]/fit[,5][2])
p_value = 2 * pnorm(-abs(z)) 
p_value
##     bathrooms 
## 2.550089e-131
#Conclusion: The null hypothesis test is statistically significant, therefore there exists a linear association between price and bathrooms

#------------------------------

# price and factor(property_type)
glm01=glm(price ~ factor(property_type), data=seadata,family=poisson)
D=summary(glm01)$deviance
df=summary(glm01)$df.residual
data.frame(D,df,dispersion=D/df)
##          D   df dispersion
## 1 767295.8 7671   100.0255
glmsum = summary(glm01,dispersion=D/df)
#glmsum$coef

#Coefficient estimate of property_type Bed and breakfast = -0.69524794 which is the average difference in the log of mean price comparing property types.
exp(glmsum$coef[2]) 
## [1] 0.4989507
# exp(property type Bed and breakfast estimate) = 0.4989507 which implies approx 50% lower price compared to the reference group.  

# LRT test
fit.full=glm01
fit.null=glm(price ~ 1, data=seadata,family=poisson)
model=anova(fit.null,fit.full,dispersion=D/df)
model
## Analysis of Deviance Table
## 
## Model 1: price ~ 1
## Model 2: price ~ factor(property_type)
##   Resid. Df Resid. Dev Df Deviance
## 1      7696     822107            
## 2      7671     767296 25    54811
#1-pchisq(deviance,df from above model)
pvalue=1-pchisq(model[,4][2],model[,3][2])
pvalue
## [1] 0
#Conclusion: The null hypothesis test is statistically significant, therefore there exists a linear association between price and property_type

#------------------------------

# price and factor(room_type)
glm01=glm(price ~ factor(room_type), data=seadata,family=poisson)
D=summary(glm01)$deviance
df=summary(glm01)$df.residual
data.frame(D,df,dispersion=D/df)
##          D   df dispersion
## 1 683249.9 7693   88.81449
glmsum = summary(glm01,dispersion=D/df)
glmsum$coef
##                                 Estimate  Std. Error    z value
## (Intercept)                    5.2308842 0.009019876 579.928622
## factor(room_type)Hotel room    0.3746429 0.055980551   6.692377
## factor(room_type)Private room -0.9033623 0.028365423 -31.847305
## factor(room_type)Shared room  -1.5949188 0.134495957 -11.858489
##                                    Pr(>|z|)
## (Intercept)                    0.000000e+00
## factor(room_type)Hotel room    2.195745e-11
## factor(room_type)Private room 1.434377e-222
## factor(room_type)Shared room   1.944531e-32
#Coefficient estimate of room_type Hotel room = 0.3746429 which is the average difference in the log of mean price comparing room types.
exp(glmsum$coef[2]) 
## [1] 1.454472
# exp(room_type Hotel room estimate) = 1.45447 which implies 45% higher price compared to the reference group. 

# LRT test
fit.full=glm01
fit.null=glm(price ~ 1, data=seadata,family=poisson)
model=anova(fit.null,fit.full,dispersion=D/df)
model
## Analysis of Deviance Table
## 
## Model 1: price ~ 1
## Model 2: price ~ factor(room_type)
##   Resid. Df Resid. Dev Df Deviance
## 1      7696     822107            
## 2      7693     683250  3   138857
pvalue=1-pchisq(model[,4][2],model[,3][2])
pvalue
## [1] 0
#Conclusion: The null hypothesis test is statistically significant, therefore there exists a linear association between price and room_type

GLM - Interactions between predictors

#Interactions between predictors
glm01 = glm(price ~ review_scores_rating * reviews_per_month, data=seadata,family=poisson)
D=summary(glm01)$deviance
df=summary(glm01)$df.residual
data.frame(D,df,dispersion=D/df)
##          D   df dispersion
## 1 759561.5 7693    98.7341
glmsum = summary(glm01,dispersion=D/df)

v <- vcovHC(glm01,dispersion=D/df)
robust.se <- sqrt(diag(v))
fit=round(cbind(glmsum$coef,robust.se),4)
fit
##                                        Estimate Std. Error z value
## (Intercept)                              6.3654     0.1075 59.2244
## review_scores_rating                    -0.0113     0.0011 -9.9323
## reviews_per_month                       -0.4509     0.0916 -4.9211
## review_scores_rating:reviews_per_month   0.0037     0.0010  3.8827
##                                        Pr(>|z|) robust.se
## (Intercept)                               0e+00    0.1955
## review_scores_rating                      0e+00    0.0020
## reviews_per_month                         0e+00    0.1288
## review_scores_rating:reviews_per_month    1e-04    0.0013
z = (fit[,1][4]/fit[,5][4])
p_value = 2 * pnorm(-abs(z)) 
p_value
## review_scores_rating:reviews_per_month 
##                            0.004425081
# p < 0.05, therefore there is evidence of interaction between review_scores_rating and reviews_per_month

#----------------------------------
glm01 = glm(price ~ reviews_per_month * number_of_reviews, data=seadata,family=poisson)
D=summary(glm01)$deviance
df=summary(glm01)$df.residual
data.frame(D,df,dispersion=D/df)
##          D   df dispersion
## 1 759131.6 7693   98.67823
glmsum = summary(glm01,dispersion=D/df)

v <- vcovHC(glm01,dispersion=D/df)
robust.se <- sqrt(diag(v))
fit=round(cbind(glmsum$coef,robust.se),4)
fit
##                                     Estimate Std. Error  z value Pr(>|z|)
## (Intercept)                           5.3571     0.0136 394.8189        0
## reviews_per_month                    -0.0974     0.0061 -15.9231        0
## number_of_reviews                    -0.0027     0.0003  -9.8068        0
## reviews_per_month:number_of_reviews   0.0004     0.0000   9.1453        0
##                                     robust.se
## (Intercept)                            0.0208
## reviews_per_month                      0.0059
## number_of_reviews                      0.0003
## reviews_per_month:number_of_reviews    0.0000
z = (fit[,1][4]/fit[,5][4])
p_value = 2 * pnorm(-abs(z)) 
p_value
## reviews_per_month:number_of_reviews 
##                                   0
# p < 0.05, therefore there is strong evidence of interaction between number_of_reviews and reviews_per_month

#------------------------------------------

glm01 = glm(price ~ review_scores_rating * number_of_reviews, data=seadata,family=poisson)
D=summary(glm01)$deviance
df=summary(glm01)$df.residual
data.frame(D,df,dispersion=D/df)
##          D   df dispersion
## 1 777218.9 7693   101.0294
glmsum = summary(glm01,dispersion=D/df)

v <- vcovHC(glm01,dispersion=D/df)
robust.se <- sqrt(diag(v))
fit=round(cbind(glmsum$coef,robust.se),4)
fit
##                                        Estimate Std. Error  z value
## (Intercept)                              6.1794      0.095  65.0722
## review_scores_rating                    -0.0102      0.001 -10.1328
## number_of_reviews                       -0.0131      0.004  -3.2375
## review_scores_rating:number_of_reviews   0.0001      0.000   2.6701
##                                        Pr(>|z|) robust.se
## (Intercept)                              0.0000    0.1758
## review_scores_rating                     0.0000    0.0018
## number_of_reviews                        0.0012    0.0039
## review_scores_rating:number_of_reviews   0.0076    0.0000
z = (fit[,1][4]/fit[,5][4])
p_value = 2 * pnorm(-abs(z)) 
p_value
## review_scores_rating:number_of_reviews 
##                                      0
# p < 0.05, therefore there is strong evidence of interaction between number_of_reviews and reviews_per_month

#-------------------------------------

glm01 = glm(price ~ beds*factor(property_type), data=seadata,family=poisson)
fit.full=glm01
fit.null=glm(price ~ 1, data=seadata,family=poisson)
model=anova(fit.null,fit.full,dispersion=D/df)
model
## Analysis of Deviance Table
## 
## Model 1: price ~ 1
## Model 2: price ~ beds * factor(property_type)
##   Resid. Df Resid. Dev Df Deviance
## 1      7696     822107            
## 2      7650     596291 46   225815
pvalue=1-pchisq(model[,4][2],model[,3][2])
pvalue
## [1] 0
# p < 0.05, therefore there is strong evidence of interaction between beds and property_type

#-------------------------------------

glm01=glm(price ~ bathrooms*factor(property_type), data=seadata,family=poisson)
fit.full=glm01
fit.null=glm(price ~ 1, data=seadata,family=poisson)
model=anova(fit.null,fit.full,dispersion=D/df)
model
## Analysis of Deviance Table
## 
## Model 1: price ~ 1
## Model 2: price ~ bathrooms * factor(property_type)
##   Resid. Df Resid. Dev Df Deviance
## 1      7696     822107            
## 2      7652     628134 44   193973
pvalue=1-pchisq(model[,4][2],model[,3][2])
pvalue
## [1] 0
# p < 0.05, therefore there is strong evidence of interaction between bathrooms and property_type

#------------------------------

glm01=glm(price ~ bedrooms*factor(property_type), data=seadata,family=poisson)
fit.full=glm01
fit.null=glm(price ~ 1, data=seadata,family=poisson)
model=anova(fit.null,fit.full,dispersion=D/df)
model
## Analysis of Deviance Table
## 
## Model 1: price ~ 1
## Model 2: price ~ bedrooms * factor(property_type)
##   Resid. Df Resid. Dev Df Deviance
## 1      7696     822107            
## 2      7652     560974 44   261132
pvalue=1-pchisq(model[,4][2],model[,3][2])
pvalue
## [1] 0
# p < 0.05, therefore there is strong evidence of interaction between bedrooms and property_type

#----------------------------------------------

glm01=glm(price ~ bathrooms*factor(room_type), data=seadata,family=poisson)
fit.full=glm01
fit.null=glm(price ~ 1, data=seadata,family=poisson)
model=anova(fit.null,fit.full,dispersion=D/df)
model
## Analysis of Deviance Table
## 
## Model 1: price ~ 1
## Model 2: price ~ bathrooms * factor(room_type)
##   Resid. Df Resid. Dev Df Deviance
## 1      7696     822107            
## 2      7689     574073  7   248033
pvalue=1-pchisq(model[,4][2],model[,3][2])
pvalue
## [1] 0
# p < 0.05, therefore there is strong evidence of interaction between bathrooms and room_type

#----------------------------------------------

glm01=glm(price ~ bedrooms*factor(room_type), data=seadata,family=poisson)
fit.full=glm01
fit.null=glm(price ~ 1, data=seadata,family=poisson)
model=anova(fit.null,fit.full,dispersion=D/df)
model
## Analysis of Deviance Table
## 
## Model 1: price ~ 1
## Model 2: price ~ bedrooms * factor(room_type)
##   Resid. Df Resid. Dev Df Deviance
## 1      7696     822107            
## 2      7690     590519  6   231587
pvalue=1-pchisq(model[,4][2],model[,3][2])
pvalue
## [1] 0
# p < 0.05, therefore there is strong evidence of interaction between bedrooms and room_type

#--------------------------------------------

glm01=glm(price ~ beds*factor(room_type), data=seadata,family=poisson)
fit.full=glm01
fit.null=glm(price ~ 1, data=seadata,family=poisson)
model=anova(fit.null,fit.full,dispersion=D/df)
model
## Analysis of Deviance Table
## 
## Model 1: price ~ 1
## Model 2: price ~ beds * factor(room_type)
##   Resid. Df Resid. Dev Df Deviance
## 1      7696     822107            
## 2      7689     597097  7   225010
pvalue=1-pchisq(model[,4][2],model[,3][2])
pvalue
## [1] 0
# p < 0.05, therefore there is strong evidence of interaction between beds and room_type. 

#-------------------------------------------------

glm01=glm(price ~ reviews_per_month*factor(room_type), data=seadata,family=poisson)
fit.full=glm01
fit.null=glm(price ~ 1, data=seadata,family=poisson)
model=anova(fit.null,fit.full,dispersion=D/df)
model
## Analysis of Deviance Table
## 
## Model 1: price ~ 1
## Model 2: price ~ reviews_per_month * factor(room_type)
##   Resid. Df Resid. Dev Df Deviance
## 1      7696     822107            
## 2      7689     620184  7   201923
pvalue=1-pchisq(model[,4][2],model[,3][2])
pvalue
## [1] 0
# p < 0.05, therefore there is strong evidence of interaction between reviews_per_month and room_type

#-----------------------------------

glm01=glm(price ~ number_of_reviews*factor(room_type), data=seadata,family=poisson)
fit.full=glm01
fit.null=glm(price ~ 1, data=seadata,family=poisson)
model=anova(fit.null,fit.full,dispersion=D/df)
model
## Analysis of Deviance Table
## 
## Model 1: price ~ 1
## Model 2: price ~ number_of_reviews * factor(room_type)
##   Resid. Df Resid. Dev Df Deviance
## 1      7696     822107            
## 2      7689     645509  7   176598
pvalue=1-pchisq(model[,4][2],model[,3][2])
pvalue
## [1] 0
# p < 0.05, therefore there is strong evidence of interaction between number_of_reviews and room_type

#----------------------------------------

glm01=glm(price ~ review_scores_rating*factor(room_type), data=seadata,family=poisson)
fit.full=glm01
fit.null=glm(price ~ 1, data=seadata,family=poisson)
model=anova(fit.null,fit.full,dispersion=D/df)
model
## Analysis of Deviance Table
## 
## Model 1: price ~ 1
## Model 2: price ~ review_scores_rating * factor(room_type)
##   Resid. Df Resid. Dev Df Deviance
## 1      7696     822107            
## 2      7689     662911  7   159195
pvalue=1-pchisq(model[,4][2],model[,3][2])
pvalue
## [1] 0
# p < 0.05, therefore there is strong evidence of interaction between review_score_rating and room_type

#-----------------------------------

glm01=glm(price ~ reviews_per_month*factor(property_type), data=seadata,family=poisson)
fit.full=glm01
fit.null=glm(price ~ 1, data=seadata,family=poisson)
model=anova(fit.null,fit.full,dispersion=D/df)
model
## Analysis of Deviance Table
## 
## Model 1: price ~ 1
## Model 2: price ~ reviews_per_month * factor(property_type)
##   Resid. Df Resid. Dev Df Deviance
## 1      7696     822107            
## 2      7647     719061 49   103045
pvalue=1-pchisq(model[,4][2],model[,3][2])
pvalue
## [1] 0
# p < 0.05, therefore there is strong evidence of interaction between reviews_per_month and property_type

#----------------------------------

glm01=glm(price ~ number_of_reviews*factor(property_type), data=seadata,family=poisson)
fit.full=glm01
fit.null=glm(price ~ 1, data=seadata,family=poisson)
model=anova(fit.null,fit.full,dispersion=D/df)
model
## Analysis of Deviance Table
## 
## Model 1: price ~ 1
## Model 2: price ~ number_of_reviews * factor(property_type)
##   Resid. Df Resid. Dev Df Deviance
## 1      7696     822107            
## 2      7647     731656 49    90450
pvalue=1-pchisq(model[,4][2],model[,3][2])
pvalue
## [1] 0
# p < 0.05, therefore there is strong evidence of interaction between number_of_reviews and property_type

#--------------------------------------

glm01=glm(price ~ review_scores_rating*factor(property_type), data=seadata,family=poisson)
fit.full=glm01
fit.null=glm(price ~ 1, data=seadata,family=poisson)
model=anova(fit.null,fit.full,dispersion=D/df)
model
## Analysis of Deviance Table
## 
## Model 1: price ~ 1
## Model 2: price ~ review_scores_rating * factor(property_type)
##   Resid. Df Resid. Dev Df Deviance
## 1      7696     822107            
## 2      7647     736367 49    85740
pvalue=1-pchisq(model[,4][2],model[,3][2])
pvalue
## [1] 0
# p < 0.05, therefore there is strong evidence of interaction between review_scores_rating and property_type

#----------------------------------------

glm01=glm(price ~ reviews_per_month*beds, data=seadata,family=poisson)
D=summary(glm01)$deviance
df=summary(glm01)$df.residual
data.frame(D,df,dispersion=D/df)
##          D   df dispersion
## 1 646284.3 7693    84.0094
glmsum = summary(glm01,dispersion=D/df)

v <- vcovHC(glm01,dispersion=D/df)
robust.se <- sqrt(diag(v))
fit=round(cbind(glmsum$coef,robust.se),4)
fit
##                        Estimate Std. Error  z value Pr(>|z|) robust.se
## (Intercept)              4.8418     0.0203 238.2644   0.0000    0.0317
## reviews_per_month       -0.0870     0.0076 -11.4374   0.0000    0.0084
## beds                     0.2151     0.0073  29.3856   0.0000    0.0105
## reviews_per_month:beds  -0.0048     0.0030  -1.6326   0.1025    0.0034
z = (fit[,1][4]/fit[,5][4])
p_value = 2 * pnorm(-abs(z)) 
p_value
## reviews_per_month:beds 
##              0.1580193
# p > 0.05, therefore there is no evidence of interaction between beds and reviews_per_month

#----------------------------------------------

glm01=glm(price ~ reviews_per_month*bedrooms, data=seadata,family=poisson)
D=summary(glm01)$deviance
df=summary(glm01)$df.residual
data.frame(D,df,dispersion=D/df)
##          D   df dispersion
## 1 652481.4 7693   84.81495
glmsum = summary(glm01,dispersion=D/df)

v <- vcovHC(glm01,dispersion=D/df)
robust.se <- sqrt(diag(v))
fit=round(cbind(glmsum$coef,robust.se),4)
fit
##                            Estimate Std. Error  z value Pr(>|z|) robust.se
## (Intercept)                  4.8225     0.0207 232.9086   0.0000    0.0355
## reviews_per_month           -0.0742     0.0071 -10.3846   0.0000    0.0086
## bedrooms                     0.2850     0.0097  29.2450   0.0000    0.0142
## reviews_per_month:bedrooms  -0.0060     0.0038  -1.5541   0.1202    0.0042
z = (fit[,1][4]/fit[,5][4])
p_value = 2 * pnorm(-abs(z)) 
p_value
## reviews_per_month:bedrooms 
##                  0.1531275
# p > 0.05, therefore there is no evidence of interaction between bedrooms and reviews_per_month

#----------------------------------

glm01=glm(price ~ reviews_per_month*bathrooms, data=seadata,family=poisson)
D=summary(glm01)$deviance
df=summary(glm01)$df.residual
data.frame(D,df,dispersion=D/df)
##        D   df dispersion
## 1 691116 7693   89.83699
glmsum = summary(glm01,dispersion=D/df)

v <- vcovHC(glm01,dispersion=D/df)
robust.se <- sqrt(diag(v))
fit=round(cbind(glmsum$coef,robust.se),4)
fit
##                             Estimate Std. Error  z value Pr(>|z|)
## (Intercept)                   4.8078     0.0259 185.3128   0.0000
## reviews_per_month            -0.1015     0.0099 -10.2393   0.0000
## bathrooms                     0.3334     0.0153  21.8181   0.0000
## reviews_per_month:bathrooms   0.0124     0.0066   1.8681   0.0617
##                             robust.se
## (Intercept)                    0.0414
## reviews_per_month              0.0113
## bathrooms                      0.0238
## reviews_per_month:bathrooms    0.0079
z = (fit[,1][4]/fit[,5][4])
p_value = 2 * pnorm(-abs(z)) 
p_value
## reviews_per_month:bathrooms 
##                   0.1165035
# p > 0.05, therefore there is no evidence of interaction between bathrooms and reviews_per_month

#----------------------------------------------

glm01=glm(price ~ number_of_reviews*bedrooms, data=seadata,family=poisson)
D=summary(glm01)$deviance
df=summary(glm01)$df.residual
data.frame(D,df,dispersion=D/df)
##          D   df dispersion
## 1 665892.6 7693   86.55825
glmsum = summary(glm01,dispersion=D/df)

v <- vcovHC(glm01,dispersion=D/df)
robust.se <- sqrt(diag(v))
fit=round(cbind(glmsum$coef,robust.se),4)
fit
##                            Estimate Std. Error  z value Pr(>|z|) robust.se
## (Intercept)                  4.7636     0.0184 258.4826   0.0000    0.0297
## number_of_reviews           -0.0020     0.0002  -8.9446   0.0000    0.0002
## bedrooms                     0.2780     0.0089  31.4063   0.0000    0.0124
## number_of_reviews:bedrooms   0.0000     0.0001   0.1387   0.8897    0.0001
z = (fit[,1][4]/fit[,5][4])
p_value = 2 * pnorm(-abs(z)) 
p_value
## number_of_reviews:bedrooms 
##                          1
# p > 0.05, therefore there is no strong evidence of interaction between bedrooms and number_of_reviews

#-------------------------------

glm01=glm(price ~ number_of_reviews*beds, data=seadata,family=poisson)
D=summary(glm01)$deviance
df=summary(glm01)$df.residual
data.frame(D,df,dispersion=D/df)
##          D   df dispersion
## 1 667795.4 7693   86.80559
glmsum = summary(glm01,dispersion=D/df)

v <- vcovHC(glm01,dispersion=D/df)
robust.se <- sqrt(diag(v))
fit=round(cbind(glmsum$coef,robust.se),4)
fit
##                        Estimate Std. Error  z value Pr(>|z|) robust.se
## (Intercept)              4.7847     0.0181 264.3671   0.0000    0.0269
## number_of_reviews       -0.0024     0.0002 -10.3175   0.0000    0.0002
## beds                     0.2002     0.0064  31.1764   0.0000    0.0091
## number_of_reviews:beds   0.0001     0.0001   0.9291   0.3528    0.0001
z = (fit[,1][4]/fit[,5][4])
p_value = 2 * pnorm(-abs(z)) 
p_value
## number_of_reviews:beds 
##              0.3173105
# p > 0.05, therefore there is no strong evidence of interaction between beds and number_of_reviews

#-------------------------------------------

glm01=glm(price ~ number_of_reviews*bathrooms, data=seadata,family=poisson)
D=summary(glm01)$deviance
df=summary(glm01)$df.residual
data.frame(D,df,dispersion=D/df)
##          D   df dispersion
## 1 704008.9 7693   91.51292
glmsum = summary(glm01,dispersion=D/df)

v <- vcovHC(glm01,dispersion=D/df)
robust.se <- sqrt(diag(v))
fit=round(cbind(glmsum$coef,robust.se),4)
fit
##                             Estimate Std. Error  z value Pr(>|z|)
## (Intercept)                   4.7143     0.0233 202.1202   0.0000
## number_of_reviews            -0.0026     0.0003  -8.2908   0.0000
## bathrooms                     0.3490     0.0138  25.2563   0.0000
## number_of_reviews:bathrooms   0.0004     0.0002   1.8706   0.0614
##                             robust.se
## (Intercept)                    0.0352
## number_of_reviews              0.0003
## bathrooms                      0.0207
## number_of_reviews:bathrooms    0.0002
z = (fit[,1][4]/fit[,5][4])
p_value = 2 * pnorm(-abs(z)) 
p_value
## number_of_reviews:bathrooms 
##                  0.04550026
# p < 0.05, therefore there is STRONG evidence of interaction between bathrooms and number_of_reviews

#----------------------------------------------------

glm01=glm(price ~ review_scores_rating*bathrooms, data=seadata,family=poisson)
D=summary(glm01)$deviance
df=summary(glm01)$df.residual
data.frame(D,df,dispersion=D/df)
##          D   df dispersion
## 1 703343.8 7693   91.42646
glmsum = summary(glm01,dispersion=D/df)

v <- vcovHC(glm01,dispersion=D/df)
robust.se <- sqrt(diag(v))
fit=round(cbind(glmsum$coef,robust.se),4)
fit
##                                Estimate Std. Error  z value Pr(>|z|)
## (Intercept)                      7.6635     0.2087  36.7239        0
## review_scores_rating            -0.0329     0.0022 -14.9878        0
## bathrooms                       -1.0711     0.1564  -6.8490        0
## review_scores_rating:bathrooms   0.0154     0.0016   9.4194        0
##                                robust.se
## (Intercept)                       0.3912
## review_scores_rating              0.0041
## bathrooms                         0.2979
## review_scores_rating:bathrooms    0.0031
z = (fit[,1][4]/fit[,5][4])
p_value = 2 * pnorm(-abs(z)) 
p_value
## review_scores_rating:bathrooms 
##                   6.773702e-07
# p < 0.05, therefore there is strong evidence of interaction between bathrooms and review_scores_rating

#------------------------------------------

glm01=glm(price ~ review_scores_rating*beds, data=seadata,family=poisson)
D=summary(glm01)$deviance
df=summary(glm01)$df.residual
data.frame(D,df,dispersion=D/df)
##          D   df dispersion
## 1 674952.7 7693   87.73596
glmsum = summary(glm01,dispersion=D/df)

v <- vcovHC(glm01,dispersion=D/df)
robust.se <- sqrt(diag(v))
fit=round(cbind(glmsum$coef,robust.se),4)
fit
##                           Estimate Std. Error  z value Pr(>|z|) robust.se
## (Intercept)                 6.6556     0.1437  46.3091   0.0000    0.2507
## review_scores_rating       -0.0213     0.0015 -14.0102   0.0000    0.0026
## beds                       -0.1797     0.0653  -2.7531   0.0059    0.1157
## review_scores_rating:beds   0.0042     0.0007   6.0578   0.0000    0.0012
z = (fit[,1][4]/fit[,5][4])
p_value = 2 * pnorm(-abs(z)) 
p_value
## review_scores_rating:beds 
##              0.0004652582
# p < 0.05, therefore there is strong evidence of interaction between beds and review_scores_rating

#-------------------------------------------------

glm01=glm(price ~ review_scores_rating*bedrooms, data=seadata,family=poisson)
D=summary(glm01)$deviance
df=summary(glm01)$df.residual
data.frame(D,df,dispersion=D/df)
##          D   df dispersion
## 1 662655.9 7693   86.13751
glmsum = summary(glm01,dispersion=D/df)

v <- vcovHC(glm01,dispersion=D/df)
robust.se <- sqrt(diag(v))
fit=round(cbind(glmsum$coef,robust.se),4)
fit
##                               Estimate Std. Error  z value Pr(>|z|)
## (Intercept)                     6.8469     0.1528  44.8085   0.0000
## review_scores_rating           -0.0235     0.0016 -14.5600   0.0000
## bedrooms                       -0.3071     0.1063  -2.8873   0.0039
## review_scores_rating:bedrooms   0.0064     0.0011   5.7597   0.0000
##                               robust.se
## (Intercept)                      0.2850
## review_scores_rating             0.0030
## bedrooms                         0.1775
## review_scores_rating:bedrooms    0.0018
z = (fit[,1][4]/fit[,5][4])
p_value = 2 * pnorm(-abs(z)) 
p_value
## review_scores_rating:bedrooms 
##                  0.0003771812
# p < 0.05, therefore there is strong evidence of interaction between review_scores_rating and bedrooms
glm02 <-glm(price ~ (review_scores_rating + reviews_per_month + number_of_reviews + beds + bathrooms + bedrooms + factor(property_type) + factor(room_type))^2, data=seadata,family=poisson)
# plot  of fitted values and summary
D=summary(glm02)$deviance
df=summary(glm02)$df.residual
data.frame(D,df,dispersion=D/df)
##          D   df dispersion
## 1 427064.7 7492   57.00276
glmsum = summary(glm02,dispersion=D/df)
#glmsum$coef

hist(glm02$fitted.values)

#Fitted vs originals
plot(glm02$fitted.values,seadata$price, col="lightblue")
abline(a=0, b=1)

scatter.smooth(glm02$fitted.values, glm02$residuals,cex=0.5,col="lightblue")

## MSE and RMSE
mean((glm02$fitted.values - seadata$price)^2)
## [1] 20012.79
sqrt(mean((glm02$fitted.values - seadata$price)^2))
## [1] 141.4666
#------------------------------------

# Overall LRT test
fit.full=glm02
fit.null=glm(price ~ 1, data=seadata,family=poisson)
model=anova(fit.null,fit.full,dispersion=D/df)
model
## Analysis of Deviance Table
## 
## Model 1: price ~ 1
## Model 2: price ~ (review_scores_rating + reviews_per_month + number_of_reviews + 
##     beds + bathrooms + bedrooms + factor(property_type) + factor(room_type))^2
##   Resid. Df Resid. Dev  Df Deviance
## 1      7696     822107             
## 2      7492     427065 204   395042
pvalue=1-pchisq(model[,4][2],model[,3][2])
pvalue
## [1] 0
#Conclusion: The null hypothesis test is statistically significant, therefore not all regression coefficients are 0.
# No normality assumption for log-linear regression/poisson model

# Poisson regression an assumption is that the variance is proportional to the mean (hence not constant), so we should expect the residuals to display non-constant variance.

#Residuals versus linear predictors: The smooth curve is linear
r=residuals(glm02,type="pearson")
scatter.smooth(glm02$linear.predictors,r,cex=0.5,col="lightblue")

#Absolute residuals versus linear predictors: There is no mean-variance relationship for the residuals and therefore the assumption of variance being proportional to mean is true.
scatter.smooth(glm02$linear.predictors,abs(r),cex=0.5,col="lightblue")

# Pearson and deviance residual plots
scatter.smooth(glm02$linear.predictors,residuals(glm02,type="deviance"),cex=0.5,col="lightblue")

scatter.smooth(glm02$linear.predictors,residuals(glm02,type="pearson"),cex=0.5,col="lightblue")

# Overall summary: The model assumptions are satisfied very well for the Poisson regression model