# Read the Seattle_listsing.csv
sdata <- read.csv("Seattle_listings.csv",stringsAsFactors = FALSE)
# Select only the 10 variables we are interested in
seadata <- sdata[,c("price","reviews_per_month","number_of_reviews","review_scores_rating","beds","bathrooms","bedrooms","property_type","room_type","square_feet")]
# Covert the room type and property type explicitly to factors
seadata$room_type = factor(sdata$room_type)
seadata$property_type = factor(sdata$property_type)
# Remove the dollar sign and any commas and convert them to numeric values
#grep("\\$",seadata$price,value=TRUE)
seadata$price <- as.numeric(gsub('\\$|,', '', seadata$price))
# Look for NA's in all the columns - get the numbers of entries for each column that are NA's
for(i in 1:dim(seadata)[2])
{
print(paste(names(seadata)[i], length(which(is.na(seadata[,i])))))
}
## [1] "price 0"
## [1] "reviews_per_month 1261"
## [1] "number_of_reviews 0"
## [1] "review_scores_rating 1320"
## [1] "beds 3"
## [1] "bathrooms 2"
## [1] "bedrooms 7"
## [1] "property_type 0"
## [1] "room_type 0"
## [1] "square_feet 8620"
# As square feet column has almost 95 percent of data as N/A's, drop the column as we cannot use that column
drops <- c("square_feet")
seadata <- seadata[ ,!(names(seadata) %in% drops)]
# Look for NA's in all the columns - get the numbers of entries for each column that are NA's
for(i in 1:dim(seadata)[2])
{
print(paste(names(seadata)[i], length(which(is.na(seadata[,i])))))
}
## [1] "price 0"
## [1] "reviews_per_month 1261"
## [1] "number_of_reviews 0"
## [1] "review_scores_rating 1320"
## [1] "beds 3"
## [1] "bathrooms 2"
## [1] "bedrooms 7"
## [1] "property_type 0"
## [1] "room_type 0"
# Drop records that have reviews_per_month,bedrooms and review_score_rating as N/As
# Also remove the records having price as 9999 and 0
seadata <- seadata[!is.na(seadata$reviews_per_month),]
seadata <- seadata[!is.na(seadata$review_scores_rating),]
seadata <- seadata[!is.na(seadata$bedrooms),]
seadata <- seadata[seadata$price != 9999 & seadata$price != 0,]
# Look for NA's
for(i in 1:dim(seadata)[2])
{
print(paste(names(seadata)[i], length(which(is.na(seadata[,i])))))
}
## [1] "price 0"
## [1] "reviews_per_month 0"
## [1] "number_of_reviews 0"
## [1] "review_scores_rating 0"
## [1] "beds 0"
## [1] "bathrooms 0"
## [1] "bedrooms 0"
## [1] "property_type 0"
## [1] "room_type 0"
# Capping beds, bathrooms, bedrooms based on99th percentile
x <- seadata$bedrooms
quantiles <- quantile(x, .99)
x[x > quantiles] <- quantiles
seadata$bedrooms <- x
x <- seadata$bathrooms
quantiles <- quantile(x, .99)
x[x > quantiles] <- quantiles
seadata$bathrooms <- x
x <- seadata$beds
quantiles <- quantile(x, .99)
x[x > quantiles] <- quantiles
seadata$beds <- x
nrow(seadata)
## [1] 7697
# Histograms of numeric columns to analyse the distribution
# All the individual distributions appear to be skewed
hist(seadata$review_scores_rating)
hist(seadata$reviews_per_month)
hist(seadata$number_of_reviews)
hist(seadata$beds)
hist(seadata$bathrooms)
hist(seadata$bedrooms)
hist(seadata$price)
predictor_names <-c("price","review_scores_rating","reviews_per_month","number_of_reviews","beds","bathrooms","bedrooms","property_type","room_type")
for(pname in predictor_names)
{
print(typeof(seadata[,pname]))
}
## [1] "double"
## [1] "integer"
## [1] "double"
## [1] "integer"
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "integer"
## [1] "integer"
# price and review score rating
plot(price ~ review_scores_rating, data=seadata, col="lightblue")
abline(lm(price ~ review_scores_rating, data=seadata))
summary(lm(price ~ review_scores_rating, data=seadata))
##
## Call:
## lm(formula = price ~ review_scores_rating, data = seadata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -329.3 -80.5 -45.0 20.0 4835.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 396.5848 25.0463 15.834 <2e-16 ***
## review_scores_rating -2.4663 0.2627 -9.387 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 172.4 on 7695 degrees of freedom
## Multiple R-squared: 0.01132, Adjusted R-squared: 0.01119
## F-statistic: 88.11 on 1 and 7695 DF, p-value: < 2.2e-16
# TEST FOR NULL HYPOTHESIS
# E(price) = alpha + beta * review_score_rating
# H0 : beta = 0
n=nrow(seadata)
Y=seadata$price
X=cbind(rep(1,n),seadata$review_scores_rating)
p=ncol(X)
beta=solve(t(X) %*% X) %*% t(X) %*% Y
muhat= X %*% beta
sigmasq=sum((Y - muhat)^2)/(n-p)
covbeta=sigmasq * solve(t(X) %*% X)
sebeta=sqrt(diag(covbeta))
data.frame(beta,sebeta)
## beta sebeta
## 1 396.584770 25.0463296
## 2 -2.466339 0.2627437
z=beta[2]/sebeta[2]
p=2*pt(-abs(z),df=n-2)
data.frame(z,p)
## z p
## 1 -9.386862 7.984815e-21
#Conclusion: The null hypothesis test is statistically significant, therefore there exists a linear association between price and review_score_rating
#----------------------
# price and reviews_per_month
plot(price ~ reviews_per_month, data=seadata, col="lightblue")
abline(lm(price ~ reviews_per_month, data=seadata))
summary(lm(price ~ reviews_per_month, data=seadata))
##
## Call:
## lm(formula = price ~ reviews_per_month, data = seadata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -183.5 -84.0 -37.8 23.8 4815.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 195.6069 2.8004 69.85 <2e-16 ***
## reviews_per_month -14.3334 0.8657 -16.56 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 170.4 on 7695 degrees of freedom
## Multiple R-squared: 0.0344, Adjusted R-squared: 0.03427
## F-statistic: 274.1 on 1 and 7695 DF, p-value: < 2.2e-16
# E(price) = alpha + beta * reviews_per_month
# H0 : beta = 0
n=nrow(seadata)
Y=seadata$price
X=cbind(rep(1,n),seadata$reviews_per_month)
p=ncol(X)
beta=solve(t(X) %*% X) %*% t(X) %*% Y
muhat= X %*% beta
sigmasq=sum((Y - muhat)^2)/(n-p)
covbeta=sigmasq * solve(t(X) %*% X)
sebeta=sqrt(diag(covbeta))
data.frame(beta,sebeta)
## beta sebeta
## 1 195.60693 2.8003704
## 2 -14.33336 0.8657254
z=beta[2]/sebeta[2]
# Significace calculated based on t-test
p=2*pt(-abs(z),df=n-2)
data.frame(z,p)
## z p
## 1 -16.55648 1.588257e-60
#Conclusion: The null hypothesis test is statistically significant, therefore there exists a linear association between price and reviews_per_month
#------------------------------
# price and number_of_reviews
plot(price ~ number_of_reviews, data=seadata, col="lightblue")
abline(lm(price ~ number_of_reviews, data=seadata))
summary(lm(price ~ number_of_reviews, data=seadata))
##
## Call:
## lm(formula = price ~ number_of_reviews, data = seadata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -170.3 -82.6 -41.2 20.7 4829.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 181.27156 2.43971 74.30 <2e-16 ***
## number_of_reviews -0.32318 0.02474 -13.06 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 171.5 on 7695 degrees of freedom
## Multiple R-squared: 0.02169, Adjusted R-squared: 0.02156
## F-statistic: 170.6 on 1 and 7695 DF, p-value: < 2.2e-16
# E(price) = alpha + beta * number_of_reviews
# H0 : beta = 0
n=nrow(seadata)
Y=seadata$price
X=cbind(rep(1,n),seadata$number_of_reviews)
p=ncol(X)
beta=solve(t(X) %*% X) %*% t(X) %*% Y
muhat= X %*% beta
sigmasq=sum((Y - muhat)^2)/(n-p)
covbeta=sigmasq * solve(t(X) %*% X)
sebeta=sqrt(diag(covbeta))
data.frame(beta,sebeta)
## beta sebeta
## 1 181.271561 2.43971315
## 2 -0.323177 0.02474146
z=beta[2]/sebeta[2]
p=2*pt(-abs(z),df=n-2)
data.frame(z,p)
## z p
## 1 -13.06217 1.390693e-38
#Conclusion: The null hypothesis test is statistically significant, therefore there exists a linear association between price and number_of_reviews
#------------------------------
# price and beds
plot(price ~ beds, data=seadata, col="lightblue")
abline(lm(price ~ beds, data=seadata))
summary(lm(price ~ beds, data=seadata))
##
## Call:
## lm(formula = price ~ beds, data = seadata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -371.4 -68.3 -33.3 16.7 4876.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 79.400 3.319 23.92 <2e-16 ***
## beds 43.854 1.453 30.19 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 164 on 7695 degrees of freedom
## Multiple R-squared: 0.1059, Adjusted R-squared: 0.1058
## F-statistic: 911.3 on 1 and 7695 DF, p-value: < 2.2e-16
# E(price) = alpha + beta * beds
# H0 : beta = 0
n=nrow(seadata)
Y=seadata$price
X=cbind(rep(1,n),seadata$beds)
p=ncol(X)
beta=solve(t(X) %*% X) %*% t(X) %*% Y
muhat= X %*% beta
sigmasq=sum((Y - muhat)^2)/(n-p)
covbeta=sigmasq * solve(t(X) %*% X)
sebeta=sqrt(diag(covbeta))
data.frame(beta,sebeta)
## beta sebeta
## 1 79.40048 3.319052
## 2 43.85409 1.452734
z=beta[2]/sebeta[2]
p=2*pt(-abs(z),df=n-2)
data.frame(z,p)
## z p
## 1 30.18728 2.707849e-189
#Conclusion: The null hypothesis test is statistically significant, therefore there exists a linear association between price and beds
#------------------------------
# price and bedrooms
plot(price ~ bedrooms, data=seadata, col="lightblue")
abline(lm(price ~ bedrooms, data=seadata))
summary(lm(price ~ bedrooms, data=seadata))
##
## Call:
## lm(formula = price ~ bedrooms, data = seadata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -325.8 -73.7 -36.7 17.4 4859.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 83.619 3.192 26.20 <2e-16 ***
## bedrooms 57.041 1.879 30.36 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 163.9 on 7695 degrees of freedom
## Multiple R-squared: 0.107, Adjusted R-squared: 0.1068
## F-statistic: 921.7 on 1 and 7695 DF, p-value: < 2.2e-16
# E(price) = alpha + beta * bedrooms
# H0 : beta = 0
n=nrow(seadata)
Y=seadata$price
X=cbind(rep(1,n),seadata$bedrooms)
p=ncol(X)
beta=solve(t(X) %*% X) %*% t(X) %*% Y
muhat= X %*% beta
sigmasq=sum((Y - muhat)^2)/(n-p)
covbeta=sigmasq * solve(t(X) %*% X)
sebeta=sqrt(diag(covbeta))
data.frame(beta,sebeta)
## beta sebeta
## 1 83.61866 3.191925
## 2 57.04113 1.878884
z=beta[2]/sebeta[2]
p=2*pt(-abs(z),df=n-2)
data.frame(z,p)
## z p
## 1 30.35904 2.585439e-191
#Conclusion: The null hypothesis test is statistically significant, therefore there exists a linear association between price and bedrooms
#------------------------------
# price and bathrooms
plot(price ~ bathrooms, data=seadata, col="lightblue")
abline(lm(price ~ bathrooms, data=seadata))
summary(lm(price ~ bathrooms, data=seadata))
##
## Call:
## lm(formula = price ~ bathrooms, data = seadata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -310.3 -65.4 -38.4 20.5 4861.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 58.516 4.483 13.05 <2e-16 ***
## bathrooms 79.952 3.132 25.53 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 166.5 on 7695 degrees of freedom
## Multiple R-squared: 0.07808, Adjusted R-squared: 0.07796
## F-statistic: 651.7 on 1 and 7695 DF, p-value: < 2.2e-16
# E(price) = alpha + beta * bathrooms
# H0 : beta = 0
n=nrow(seadata)
Y=seadata$price
X=cbind(rep(1,n),seadata$bathrooms)
p=ncol(X)
beta=solve(t(X) %*% X) %*% t(X) %*% Y
muhat= X %*% beta
sigmasq=sum((Y - muhat)^2)/(n-p)
covbeta=sigmasq * solve(t(X) %*% X)
sebeta=sqrt(diag(covbeta))
data.frame(beta,sebeta)
## beta sebeta
## 1 58.51591 4.483052
## 2 79.95192 3.131879
z=beta[2]/sebeta[2]
#p=2*pt(z,df=n-2,lower.tail=FALSE)
p=2*pt(-abs(z),df=n-2)
data.frame(z,p)
## z p
## 1 25.52842 4.691466e-138
#Conclusion: The null hypothesis test is statistically significant, therefore there exists a linear association between price and bathrooms
#------------------------------
# price and room_type
plot(price ~ room_type, data=seadata, col="lightblue")
abline(lm(price ~ factor(room_type), data=seadata))
## Warning in abline(lm(price ~ factor(room_type), data = seadata)): only
## using the first two of 4 regression coefficients
summary(aov(price ~ factor(room_type), data=seadata))
## Df Sum Sq Mean Sq F value Pr(>F)
## factor(room_type) 3 18987575 6329192 229.3 <2e-16 ***
## Residuals 7693 212348694 27603
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# F-test using anova
seadata$I_home = as.numeric(seadata$room_type== 'Entire home/apt')
seadata$I_private = as.numeric(seadata$room_type== 'Private room')
seadata$I_shared = as.numeric(seadata$room_type== 'Shared room')
full = (lm(price ~ I_home + I_private + I_shared, data=seadata))
reduced = (lm(price ~ 1, data=seadata))
anova(reduced,full)
## Analysis of Variance Table
##
## Model 1: price ~ 1
## Model 2: price ~ I_home + I_private + I_shared
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 7696 231336270
## 2 7693 212348694 3 18987575 229.29 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Conclusion: The null hypothesis test is statistically significant, therefore there exists a linear association between price and room_type
#------------------------------
# price and property_type
plot(price ~ property_type, data=seadata, col="lightblue")
abline(lm(price ~ factor(property_type), data=seadata))
## Warning in abline(lm(price ~ factor(property_type), data = seadata)): only
## using the first two of 26 regression coefficients
summary(aov(price ~ factor(property_type), data=seadata))
## Df Sum Sq Mean Sq F value Pr(>F)
## factor(property_type) 25 8772988 350920 12.1 <2e-16 ***
## Residuals 7671 222563281 29014
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Conclusion: The null hypothesis test is statistically significant, therefore there exists a linear association between price and property_type
# CHECK FOR POSSIBLE CORRELATION BETWEEN OUTPUT AND PREDICTOR VARIABLES
cor.test(seadata$price,seadata$number_of_reviews, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: seadata$price and seadata$number_of_reviews
## t = -13.062, df = 7695, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1690662 -0.1253533
## sample estimates:
## cor
## -0.1472817
cor.test(seadata$price,seadata$review_scores_rating, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: seadata$price and seadata$review_scores_rating
## t = -9.3869, df = 7695, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.12843608 -0.08425997
## sample estimates:
## cor
## -0.1064005
cor.test(seadata$price,seadata$reviews_per_month, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: seadata$price and seadata$reviews_per_month
## t = -16.556, df = 7695, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.2069488 -0.1638032
## sample estimates:
## cor
## -0.1854654
cor.test(seadata$price,seadata$bedrooms, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: seadata$price and seadata$bedrooms
## t = 30.359, df = 7695, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3069551 0.3468596
## sample estimates:
## cor
## 0.3270531
cor.test(seadata$price,seadata$bathrooms, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: seadata$price and seadata$bathrooms
## t = 25.528, df = 7695, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2586999 0.2998946
## sample estimates:
## cor
## 0.2794258
cor.test(seadata$price,seadata$beds, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: seadata$price and seadata$beds
## t = 30.187, df = 7695, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3052776 0.3452303
## sample estimates:
## cor
## 0.3253992
#Interactions between predictors
summary(lm(price ~ review_scores_rating * reviews_per_month, data=seadata))
##
## Call:
## lm(formula = price ~ review_scores_rating * reviews_per_month,
## data = seadata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -354.4 -81.8 -36.5 22.4 4815.5
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 443.7847 29.8128 14.886
## review_scores_rating -2.6300 0.3143 -8.367
## reviews_per_month -84.5741 17.8577 -4.736
## review_scores_rating:reviews_per_month 0.7397 0.1856 3.986
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## review_scores_rating < 2e-16 ***
## reviews_per_month 2.22e-06 ***
## review_scores_rating:reviews_per_month 6.79e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 169.6 on 7693 degrees of freedom
## Multiple R-squared: 0.04319, Adjusted R-squared: 0.04282
## F-statistic: 115.7 on 3 and 7693 DF, p-value: < 2.2e-16
# p < 0.05, therefore there is strong evidence of interaction between review_scores_rating and reviews_per_month
summary(lm(price ~ reviews_per_month * number_of_reviews, data=seadata))
##
## Call:
## lm(formula = price ~ reviews_per_month * number_of_reviews, data = seadata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -228.3 -82.7 -33.9 21.7 4817.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 207.33444 3.18985 64.998 < 2e-16
## reviews_per_month -14.98059 1.19280 -12.559 < 2e-16
## number_of_reviews -0.44092 0.05601 -7.873 3.94e-15
## reviews_per_month:number_of_reviews 0.06338 0.00888 7.138 1.04e-12
##
## (Intercept) ***
## reviews_per_month ***
## number_of_reviews ***
## reviews_per_month:number_of_reviews ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 169.7 on 7693 degrees of freedom
## Multiple R-squared: 0.04227, Adjusted R-squared: 0.0419
## F-statistic: 113.2 on 3 and 7693 DF, p-value: < 2.2e-16
# p < 0.05, therefore there is strong evidence of interaction between number_of_reviews and reviews_per_month
summary(lm(price ~ review_scores_rating * number_of_reviews, data=seadata))
##
## Call:
## lm(formula = price ~ review_scores_rating * number_of_reviews,
## data = seadata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -331.9 -81.5 -39.8 22.7 4828.7
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 397.449345 26.245037 15.144
## review_scores_rating -2.287949 0.276576 -8.272
## number_of_reviews -2.183898 0.716406 -3.048
## review_scores_rating:number_of_reviews 0.019536 0.007416 2.634
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## review_scores_rating < 2e-16 ***
## number_of_reviews 0.00231 **
## review_scores_rating:number_of_reviews 0.00845 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 170.8 on 7693 degrees of freedom
## Multiple R-squared: 0.03032, Adjusted R-squared: 0.02994
## F-statistic: 80.18 on 3 and 7693 DF, p-value: < 2.2e-16
# p < 0.05, therefore there is strong evidence of interaction between number_of_reviews and reviews_per_month
summary(aov(price ~ beds*factor(property_type), data=seadata))
## Df Sum Sq Mean Sq F value Pr(>F)
## beds 1 24494959 24494959 985.518 < 2e-16 ***
## factor(property_type) 25 14503364 580135 23.341 < 2e-16 ***
## beds:factor(property_type) 20 2197903 109895 4.421 1.64e-10 ***
## Residuals 7650 190140043 24855
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p < 0.05, therefore there is strong evidence of interaction between beds and property_type
summary(aov(price ~ bedrooms*factor(property_type), data=seadata))
## Df Sum Sq Mean Sq F value Pr(>F)
## bedrooms 1 24744597 24744597 1021.685 <2e-16 ***
## factor(property_type) 25 18070135 722805 29.844 <2e-16 ***
## bedrooms:factor(property_type) 18 3194649 177481 7.328 <2e-16 ***
## Residuals 7652 185326888 24219
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p < 0.05, therefore there is strong evidence of interaction between bedrooms and property_type
summary(aov(price ~ bathrooms*factor(property_type), data=seadata))
## Df Sum Sq Mean Sq F value Pr(>F)
## bathrooms 1 18062456 18062456 706.86 <2e-16 ***
## factor(property_type) 25 14603117 584125 22.86 <2e-16 ***
## bathrooms:factor(property_type) 18 3137098 174283 6.82 <2e-16 ***
## Residuals 7652 195533598 25553
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p < 0.05, therefore there is strong evidence of interaction between bathrooms and property_type
summary(aov(price ~ bathrooms*factor(room_type), data=seadata))
## Df Sum Sq Mean Sq F value Pr(>F)
## bathrooms 1 18062456 18062456 739.16 <2e-16 ***
## factor(room_type) 3 19901106 6633702 271.47 <2e-16 ***
## bathrooms:factor(room_type) 3 5480877 1826959 74.76 <2e-16 ***
## Residuals 7689 187891830 24436
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p < 0.05, therefore there is strong evidence of interaction between bathrooms and room_type
summary(aov(price ~ bedrooms*factor(room_type), data=seadata))
## Df Sum Sq Mean Sq F value Pr(>F)
## bedrooms 1 24744597 24744597 983.023 <2e-16 ***
## factor(room_type) 3 12794900 4264967 169.433 <2e-16 ***
## bedrooms:factor(room_type) 2 224504 112252 4.459 0.0116 *
## Residuals 7690 193572269 25172
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p < 0.05, therefore there is strong evidence of interaction between bedrooms and room_type
summary(aov(price ~ beds*factor(room_type), data=seadata))
## Df Sum Sq Mean Sq F value Pr(>F)
## beds 1 24494959 24494959 969.62 < 2e-16 ***
## factor(room_type) 3 10844244 3614748 143.09 < 2e-16 ***
## beds:factor(room_type) 3 1753578 584526 23.14 6.63e-15 ***
## Residuals 7689 194243488 25263
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p < 0.05, therefore there is strong evidence of interaction between beds and room_type
summary(aov(price ~ reviews_per_month*factor(room_type), data=seadata))
## Df Sum Sq Mean Sq F value
## reviews_per_month 1 7957368 7957368 303.30
## factor(room_type) 3 20160989 6720330 256.15
## reviews_per_month:factor(room_type) 3 1489864 496621 18.93
## Residuals 7689 201728049 26236
## Pr(>F)
## reviews_per_month < 2e-16 ***
## factor(room_type) < 2e-16 ***
## reviews_per_month:factor(room_type) 3.16e-12 ***
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p < 0.05, therefore there is strong evidence of interaction between reviews_per_month and room_type
summary(aov(price ~ number_of_reviews*factor(room_type), data=seadata))
## Df Sum Sq Mean Sq F value
## number_of_reviews 1 5018122 5018122 187.05
## factor(room_type) 3 18768175 6256058 233.19
## number_of_reviews:factor(room_type) 3 1267214 422405 15.74
## Residuals 7689 206282758 26828
## Pr(>F)
## number_of_reviews < 2e-16 ***
## factor(room_type) < 2e-16 ***
## number_of_reviews:factor(room_type) 3.32e-10 ***
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p < 0.05, therefore there is strong evidence of interaction between number_of_reviews and room_type
summary(aov(price ~ review_scores_rating*factor(room_type), data=seadata))
## Df Sum Sq Mean Sq F value
## review_scores_rating 1 2618975 2618975 96.91
## factor(room_type) 3 19369803 6456601 238.91
## review_scores_rating:factor(room_type) 3 1553888 517963 19.17
## Residuals 7689 207793604 27025
## Pr(>F)
## review_scores_rating < 2e-16 ***
## factor(room_type) < 2e-16 ***
## review_scores_rating:factor(room_type) 2.23e-12 ***
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p < 0.05, therefore there is strong evidence of interaction between review_score_rating and room_type
summary(aov(price ~ reviews_per_month*factor(property_type), data=seadata))
## Df Sum Sq Mean Sq F value
## reviews_per_month 1 7957368 7957368 283.619
## factor(property_type) 25 6676429 267057 9.519
## reviews_per_month:factor(property_type) 23 2154195 93661 3.338
## Residuals 7647 214548278 28057
## Pr(>F)
## reviews_per_month < 2e-16 ***
## factor(property_type) < 2e-16 ***
## reviews_per_month:factor(property_type) 1.16e-07 ***
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p < 0.05, therefore there is strong evidence of interaction between reviews_per_month and property_type
summary(aov(price ~ number_of_reviews*factor(property_type), data=seadata))
## Df Sum Sq Mean Sq F value
## number_of_reviews 1 5018122 5018122 176.923
## factor(property_type) 25 7478598 299144 10.547
## number_of_reviews:factor(property_type) 23 1945017 84566 2.982
## Residuals 7647 216894532 28363
## Pr(>F)
## number_of_reviews < 2e-16 ***
## factor(property_type) < 2e-16 ***
## number_of_reviews:factor(property_type) 2.17e-06 ***
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p < 0.05, therefore there is strong evidence of interaction between number_of_reviews and property_type
summary(aov(price ~ review_scores_rating*factor(property_type), data=seadata))
## Df Sum Sq Mean Sq F value
## review_scores_rating 1 2618975 2618975 92.711
## factor(property_type) 25 7425216 297009 10.514
## review_scores_rating:factor(property_type) 23 5272832 229254 8.115
## Residuals 7647 216019247 28249
## Pr(>F)
## review_scores_rating <2e-16 ***
## factor(property_type) <2e-16 ***
## review_scores_rating:factor(property_type) <2e-16 ***
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p < 0.05, therefore there is strong evidence of interaction between review_scores_rating and property_type
summary(aov(price ~ reviews_per_month*beds, data=seadata))
## Df Sum Sq Mean Sq F value Pr(>F)
## reviews_per_month 1 7957368 7957368 308.15 < 2e-16 ***
## beds 1 23598436 23598436 913.85 < 2e-16 ***
## reviews_per_month:beds 1 1123633 1123633 43.51 4.49e-11 ***
## Residuals 7693 198656833 25823
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p < 0.05, therefore there is strong evidence of interaction between beds and reviews_per_month
summary(aov(price ~ reviews_per_month*bedrooms, data=seadata))
## Df Sum Sq Mean Sq F value Pr(>F)
## reviews_per_month 1 7957368 7957368 304.95 < 2e-16 ***
## bedrooms 1 21449766 21449766 822.01 < 2e-16 ***
## reviews_per_month:bedrooms 1 1186626 1186626 45.48 1.66e-11 ***
## Residuals 7693 200742510 26094
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p < 0.05, therefore there is strong evidence of interaction between bedrooms and reviews_per_month
summary(aov(price ~ reviews_per_month*bathrooms, data=seadata))
## Df Sum Sq Mean Sq F value Pr(>F)
## reviews_per_month 1 7957368 7957368 294.347 <2e-16 ***
## bathrooms 1 15265018 15265018 564.660 <2e-16 ***
## reviews_per_month:bathrooms 1 141319 141319 5.227 0.0223 *
## Residuals 7693 207972564 27034
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p < 0.05, therefore there is strong evidence of interaction between bathrooms and reviews_per_month
summary(aov(price ~ number_of_reviews*bedrooms, data=seadata))
## Df Sum Sq Mean Sq F value Pr(>F)
## number_of_reviews 1 5018122 5018122 189.76 < 2e-16 ***
## bedrooms 1 22469187 22469187 849.67 < 2e-16 ***
## number_of_reviews:bedrooms 1 411585 411585 15.56 8.05e-05 ***
## Residuals 7693 203437375 26444
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p < 0.05, therefore there is strong evidence of interaction between bedrooms and number_of_reviews
summary(aov(price ~ number_of_reviews*beds, data=seadata))
## Df Sum Sq Mean Sq F value Pr(>F)
## number_of_reviews 1 5018122 5018122 190.2 < 2e-16 ***
## beds 1 23045612 23045612 873.5 < 2e-16 ***
## number_of_reviews:beds 1 316601 316601 12.0 0.000535 ***
## Residuals 7693 202955934 26382
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p < 0.05, therefore there is strong evidence of interaction between beds and number_of_reviews
summary(aov(price ~ number_of_reviews*bathrooms, data=seadata))
## Df Sum Sq Mean Sq F value Pr(>F)
## number_of_reviews 1 5018122 5018122 183.760 <2e-16 ***
## bathrooms 1 16178157 16178157 592.433 <2e-16 ***
## number_of_reviews:bathrooms 1 59718 59718 2.187 0.139
## Residuals 7693 210080272 27308
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p < 0.05, therefore there is strong evidence of interaction between bathrooms and number_of_reviews
summary(aov(price ~ review_scores_rating*bathrooms, data=seadata))
## Df Sum Sq Mean Sq F value Pr(>F)
## review_scores_rating 1 2618975 2618975 96.55 < 2e-16
## bathrooms 1 18290922 18290922 674.27 < 2e-16
## review_scores_rating:bathrooms 1 1738302 1738302 64.08 1.37e-15
## Residuals 7693 208688071 27127
##
## review_scores_rating ***
## bathrooms ***
## review_scores_rating:bathrooms ***
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p < 0.05, therefore there is strong evidence of interaction between bathrooms and review_scores_rating
summary(aov(price ~ review_scores_rating*beds, data=seadata))
## Df Sum Sq Mean Sq F value Pr(>F)
## review_scores_rating 1 2618975 2618975 99.05 < 2e-16 ***
## beds 1 24989683 24989683 945.09 < 2e-16 ***
## review_scores_rating:beds 1 312299 312299 11.81 0.000592 ***
## Residuals 7693 203415314 26442
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p < 0.05, therefore there is strong evidence of interaction between beds and review_scores_rating
summary(aov(price ~ review_scores_rating*bedrooms, data=seadata))
## Df Sum Sq Mean Sq F value Pr(>F)
## review_scores_rating 1 2618975 2618975 99.66 < 2e-16 ***
## bedrooms 1 26171631 26171631 995.87 < 2e-16 ***
## review_scores_rating:bedrooms 1 371456 371456 14.13 0.000171 ***
## Residuals 7693 202174208 26280
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p < 0.05, therefore there is strong evidence of interaction between review_scores_rating and bedrooms
mr01 <-(lm(price ~ (review_scores_rating + reviews_per_month + number_of_reviews + beds + bathrooms + bedrooms + factor(property_type) + factor(room_type))^2, data=seadata))
# plot of fitted values and summary
#summary(mr01)
hist(mr01$fitted.values)
#Fitted vs originals
plot(seadata$price, mr01$fitted.values,col="lightblue")
abline(a=0, b=1)
## MSE and RMSE
mean((mr01$fitted.values - seadata$price)^2)
## [1] 20362.3
sqrt(mean((mr01$fitted.values - seadata$price)^2))
## [1] 142.6965
#------------------------------------
# F-test using anova
full = mr01
reduced = (lm(price ~ 1, data=seadata))
anova(reduced,full)
## Analysis of Variance Table
##
## Model 1: price ~ 1
## Model 2: price ~ (review_scores_rating + reviews_per_month + number_of_reviews +
## beds + bathrooms + bedrooms + factor(property_type) + factor(room_type))^2
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 7696 231336270
## 2 7492 156728659 204 74607611 17.483 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Conclusion: The null hypothesis test is statistically significant, therefore not all regression coefficients are 0.
#—————————————————-
Checking the assumptions for linear regression: Normality of error distribution or large sample size Constant variance of errors Linearity Independence
#Check for normality using residuals
hist(mr01$residuals)
qqnorm(mr01$residuals)
qqline(mr01$residuals)
# Checking the constant variance assumption of Linear regression
scatter.smooth(mr01$fitted.values, mr01$residuals,cex=0.5,col="lightblue")
# LINEAR REGRESSION ASSUMPTIONS NOT MET
#lm function throws error
mr02 <- lm(log(price) ~ (review_scores_rating + reviews_per_month + number_of_reviews + beds + bathrooms + bedrooms + factor(property_type) + factor(room_type))^2, data=seadata)
# plot of fitted values and summary
#summary(mr02)
hist(mr02$fitted.values)
#plot(mr01)
#Fitted vs originals
plot(seadata$price, exp(mr02$fitted.values),col="lightblue")
abline(a=0, b=1)
## MSE and RMSE
mean((exp(mr02$fitted.values) - seadata$price)^2)
## [1] 21456.52
sqrt(mean((exp(mr02$fitted.values) - seadata$price)^2))
## [1] 146.4804
#Check for normality using residuals
hist(mr02$residuals)
qqnorm(mr02$residuals)
qqline(mr02$residuals)
# Checking the constant variance assumption of Linear regression
scatter.smooth(mr02$fitted.values, mr02$residuals,cex=0.5,col='lightblue')
# Logtrsformed price model meets th assumptions of linear regression
# price and review score rating
glm01=glm(price ~ review_scores_rating, data=seadata,family=poisson)
D=summary(glm01)$deviance
df=summary(glm01)$df.residual
data.frame(D,df,dispersion=D/df)
## D df dispersion
## 1 808238.2 7695 105.0342
glmsum = summary(glm01,dispersion=D/df)
glmsum$coef
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 6.23923668 0.0928332946 67.20904 0.000000e+00
## review_scores_rating -0.01215559 0.0009808655 -12.39272 2.861862e-35
#plotting the fitted model
plot(seadata$review_scores_rating,seadata$price,cex=0.5,col="lightblue")
x=min(seadata$review_scores_rating):max(seadata$review_scores_rating)
y=exp(glmsum$coef[1]+glmsum$coef[2]*x)
lines(x,y,col=4)
#Coefficient estimate of review score rating = -0.01215559 which is the average difference in the log of mean price per unit difference in the review score rating.
exp(glmsum$coef[2])
## [1] 0.987918
# exp(review score rating estimate) = 0.987918 is the ratio of mean prices for 2 houses differing by a unit review_score_rating.
# As the assumed mean-variance relationship might not be valid for log-linear regression, use robust se. Price vs review_scores_rating exhibits heteroscedasticity.
v <- vcovHC(glm01,dispersion=D/df)
robust.se <- sqrt(diag(v))
fit=round(cbind(glmsum$coef,robust.se),4)
fit
## Estimate Std. Error z value Pr(>|z|) robust.se
## (Intercept) 6.2392 0.0928 67.2090 0 0.1700
## review_scores_rating -0.0122 0.0010 -12.3927 0 0.0018
z = (fit[,1][2]/fit[,5][2])
p_value = 2 * pnorm(-abs(z))
p_value
## review_scores_rating
## 1.220384e-11
#Conclusion: The null hypothesis test is statistically significant, therefore there exists a linear association between price and review_score_rating
#----------------------
# price and reviews_per_month
glm01=glm(price ~ reviews_per_month, data=seadata,family=poisson)
D=summary(glm01)$deviance
df=summary(glm01)$df.residual
data.frame(D,df,dispersion=D/df)
## D df dispersion
## 1 768457.9 7695 99.86457
glmsum = summary(glm01,dispersion=D/df)
glmsum$coef
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.3015947 0.012451283 425.78704 0.000000e+00
## reviews_per_month -0.1014198 0.004593682 -22.07812 5.130396e-108
#plotting the fitted model
plot(seadata$reviews_per_month,seadata$price,cex=0.5,col="lightblue")
x=min(seadata$reviews_per_month):max(seadata$reviews_per_month)
y=exp(glmsum$coef[1]+glmsum$coef[2]*x)
lines(x,y,col=4)
#Coefficient estimate of reviews_per_month = -0.1014 which is the average difference in the log of mean price per unit difference in the reviews_per_month.
exp(glmsum$coef[2])
## [1] 0.9035536
# exp(reviews_per_month estimate) = 0.9035536 is the ratio of mean prices for 2 houses differing by a unit reviews_per_month (or) mean price decreases by approx 10% for each difference of unit reviews_per_month
# As the assumed mean-variance relationship might not be valid for log-linear regression, use robust se.
v <- vcovHC(glm01,dispersion=D/df)
robust.se <- sqrt(diag(v))
fit=round(cbind(glmsum$coef,robust.se),4)
fit
## Estimate Std. Error z value Pr(>|z|) robust.se
## (Intercept) 5.3016 0.0125 425.7870 0 0.0189
## reviews_per_month -0.1014 0.0046 -22.0781 0 0.0049
z = (fit[,1][2]/fit[,5][2])
p_value = 2 * pnorm(-abs(z))
p_value
## reviews_per_month
## 3.932789e-95
#Conclusion: The null hypothesis test is statistically significant, therefore there exists a linear association between price and reviews_per_month
#------------------------------
# price and number_of_reviews
glm01=glm(price ~ number_of_reviews, data=seadata,family=poisson)
D=summary(glm01)$deviance
df=summary(glm01)$df.residual
data.frame(D,df,dispersion=D/df)
## D df dispersion
## 1 786358.7 7695 102.1909
glmsum = summary(glm01,dispersion=D/df)
glmsum$coef
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.218975280 0.0112378131 464.41200 0.000000e+00
## number_of_reviews -0.002486994 0.0001431522 -17.37308 1.319573e-67
#plotting the fitted model
plot(seadata$number_of_reviews,seadata$price,cex=0.5,col="lightblue")
x=min(seadata$number_of_reviews):max(seadata$number_of_reviews)
y=exp(glmsum$coef[1]+glmsum$coef[2]*x)
lines(x,y,col=4)
#Coefficient estimate of number_of_reviews = -0.002486994 which is the average difference in the log of mean price per unit difference in the number_of_reviews.
exp(glmsum$coef[2])
## [1] 0.9975161
# exp(number_of_reviews estimate) = 0.9975161 is the ratio of mean prices for 2 houses differing by 1 number_of_reviews (or) mean price decreases by approx 1% for each difference of unit number_of_reviews
# As the assumed mean-variance relationship might not be valid for log-linear regression, use robust se.
v <- vcovHC(glm01,dispersion=D/df)
robust.se <- sqrt(diag(v))
fit=round(cbind(glmsum$coef,robust.se),4)
fit
## Estimate Std. Error z value Pr(>|z|) robust.se
## (Intercept) 5.2190 0.0112 464.4120 0 0.0161
## number_of_reviews -0.0025 0.0001 -17.3731 0 0.0001
z = (fit[,1][2]/fit[,5][2])
p_value = 2 * pnorm(-abs(z))
p_value
## number_of_reviews
## 6.113393e-138
#Conclusion: The null hypothesis test is statistically significant, therefore there exists a linear association between price and number_of_reviews
#------------------------------
# price and beds
glm01=glm(price ~ beds, data=seadata,family=poisson)
D=summary(glm01)$deviance
df=summary(glm01)$df.residual
data.frame(D,df,dispersion=D/df)
## D df dispersion
## 1 694906 7695 90.30617
glmsum = summary(glm01,dispersion=D/df)
glmsum$coef
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.6467001 0.014866070 312.57084 0
## beds 0.2111262 0.005220622 40.44082 0
#plotting the fitted model
plot(seadata$beds,seadata$price,cex=0.5,col="lightblue")
x=min(seadata$beds):max(seadata$beds)
y=exp(glmsum$coef[1]+glmsum$coef[2]*x)
lines(x,y,col=4)
#Coefficient estimate of beds = 0.2111262 which is the average difference in the log of mean price per unit difference in the beds.
exp(glmsum$coef[2])
## [1] 1.235068
# exp(beds estimate) = 1.235068 is the ratio of mean prices for 2 houses differing by 1 beds (or) mean price increases by 23% for each difference of 1 bed
# As the assumed mean-variance relationship might not be valid for log-linear regression, use robust se.
v <- vcovHC(glm01,dispersion=D/df)
robust.se <- sqrt(diag(v))
fit=round(cbind(glmsum$coef,robust.se),4)
fit
## Estimate Std. Error z value Pr(>|z|) robust.se
## (Intercept) 4.6467 0.0149 312.5708 0 0.0200
## beds 0.2111 0.0052 40.4408 0 0.0067
z = (fit[,1][2]/fit[,5][2])
p_value = 2 * pnorm(-abs(z))
p_value
## beds
## 6.864939e-218
#Conclusion: The null hypothesis test is not statistically significant, therefore there is no linear association between price and beds
#------------------------------
# price and bedrooms
glm01=glm(price ~ bedrooms, data=seadata,family=poisson)
D=summary(glm01)$deviance
df=summary(glm01)$df.residual
data.frame(D,df,dispersion=D/df)
## D df dispersion
## 1 687540.9 7695 89.34904
glmsum = summary(glm01,dispersion=D/df)
glmsum$coef
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.6374116 0.014931909 310.57057 0
## bedrooms 0.2929199 0.007132322 41.06936 0
#plotting the fitted model
plot(seadata$bedrooms,seadata$price,cex=0.5,col="lightblue")
x=min(seadata$bedrooms):max(seadata$bedrooms)
y=exp(glmsum$coef[1]+glmsum$coef[2]*x)
lines(x,y,col=4)
#Coefficient estimate of bedrooms = 0.2929199 which is the average difference in the log of mean price per unit difference in the bedrooms.
exp(glmsum$coef[2])
## [1] 1.340335
# exp(bedrooms estimate) = 1.340335 is the ratio of mean prices for 2 houses differing by 1 bedroom (or) mean price increases by 34% for each difference of 1 bedroom
# As the assumed mean-variance relationship might not be valid for log-linear regression, use robust se. Data also exhibits Heteroscedasticity.
v <- vcovHC(glm01,dispersion=D/df)
robust.se <- sqrt(diag(v))
fit=round(cbind(glmsum$coef,robust.se),4)
fit
## Estimate Std. Error z value Pr(>|z|) robust.se
## (Intercept) 4.6374 0.0149 310.5706 0 0.0213
## bedrooms 0.2929 0.0071 41.0694 0 0.0089
z = (fit[,1][2]/fit[,5][2])
p_value = 2 * pnorm(-abs(z))
p_value
## bedrooms
## 1.57527e-237
#Conclusion: The null hypothesis test is statistically significant, therefore there exists a linear association between price and bedrooms
#------------------------------
# price and bathrooms
glm01=glm(price ~ bathrooms, data=seadata,family=poisson)
D=summary(glm01)$deviance
df=summary(glm01)$df.residual
data.frame(D,df,dispersion=D/df)
## D df dispersion
## 1 727940.7 7695 94.59918
glmsum = summary(glm01,dispersion=D/df)
glmsum$coef
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.5536227 0.01904143 239.14286 0.000000e+00
## bathrooms 0.3876911 0.01145963 33.83102 6.901393e-251
#plotting the fitted model
plot(seadata$bathrooms,seadata$price,cex=0.5,col="lightblue")
x=min(seadata$bathrooms):max(seadata$bathrooms)
y=exp(glmsum$coef[1]+glmsum$coef[2]*x)
lines(x,y,col=4)
#Coefficient estimate of bathrooms = 0.3876911 which is the average difference in the log of mean price per unit difference in the bathrooms.
exp(glmsum$coef[2])
## [1] 1.473575
# exp(bathrooms estimate) = 1.473575 is the ratio of mean prices for 2 houses differing by 1 bathrooms (or) mean price increases by 47% for each difference of 1 bathroom
# As the assumed mean-variance relationship might not be valid for log-linear regression, use robust se.
v <- vcovHC(glm01,dispersion=D/df)
robust.se <- sqrt(diag(v))
fit=round(cbind(glmsum$coef,robust.se),4)
fit
## Estimate Std. Error z value Pr(>|z|) robust.se
## (Intercept) 4.5536 0.0190 239.1429 0 0.0260
## bathrooms 0.3877 0.0115 33.8310 0 0.0159
z = (fit[,1][2]/fit[,5][2])
p_value = 2 * pnorm(-abs(z))
p_value
## bathrooms
## 2.550089e-131
#Conclusion: The null hypothesis test is statistically significant, therefore there exists a linear association between price and bathrooms
#------------------------------
# price and factor(property_type)
glm01=glm(price ~ factor(property_type), data=seadata,family=poisson)
D=summary(glm01)$deviance
df=summary(glm01)$df.residual
data.frame(D,df,dispersion=D/df)
## D df dispersion
## 1 767295.8 7671 100.0255
glmsum = summary(glm01,dispersion=D/df)
#glmsum$coef
#Coefficient estimate of property_type Bed and breakfast = -0.69524794 which is the average difference in the log of mean price comparing property types.
exp(glmsum$coef[2])
## [1] 0.4989507
# exp(property type Bed and breakfast estimate) = 0.4989507 which implies approx 50% lower price compared to the reference group.
# LRT test
fit.full=glm01
fit.null=glm(price ~ 1, data=seadata,family=poisson)
model=anova(fit.null,fit.full,dispersion=D/df)
model
## Analysis of Deviance Table
##
## Model 1: price ~ 1
## Model 2: price ~ factor(property_type)
## Resid. Df Resid. Dev Df Deviance
## 1 7696 822107
## 2 7671 767296 25 54811
#1-pchisq(deviance,df from above model)
pvalue=1-pchisq(model[,4][2],model[,3][2])
pvalue
## [1] 0
#Conclusion: The null hypothesis test is statistically significant, therefore there exists a linear association between price and property_type
#------------------------------
# price and factor(room_type)
glm01=glm(price ~ factor(room_type), data=seadata,family=poisson)
D=summary(glm01)$deviance
df=summary(glm01)$df.residual
data.frame(D,df,dispersion=D/df)
## D df dispersion
## 1 683249.9 7693 88.81449
glmsum = summary(glm01,dispersion=D/df)
glmsum$coef
## Estimate Std. Error z value
## (Intercept) 5.2308842 0.009019876 579.928622
## factor(room_type)Hotel room 0.3746429 0.055980551 6.692377
## factor(room_type)Private room -0.9033623 0.028365423 -31.847305
## factor(room_type)Shared room -1.5949188 0.134495957 -11.858489
## Pr(>|z|)
## (Intercept) 0.000000e+00
## factor(room_type)Hotel room 2.195745e-11
## factor(room_type)Private room 1.434377e-222
## factor(room_type)Shared room 1.944531e-32
#Coefficient estimate of room_type Hotel room = 0.3746429 which is the average difference in the log of mean price comparing room types.
exp(glmsum$coef[2])
## [1] 1.454472
# exp(room_type Hotel room estimate) = 1.45447 which implies 45% higher price compared to the reference group.
# LRT test
fit.full=glm01
fit.null=glm(price ~ 1, data=seadata,family=poisson)
model=anova(fit.null,fit.full,dispersion=D/df)
model
## Analysis of Deviance Table
##
## Model 1: price ~ 1
## Model 2: price ~ factor(room_type)
## Resid. Df Resid. Dev Df Deviance
## 1 7696 822107
## 2 7693 683250 3 138857
pvalue=1-pchisq(model[,4][2],model[,3][2])
pvalue
## [1] 0
#Conclusion: The null hypothesis test is statistically significant, therefore there exists a linear association between price and room_type
#Interactions between predictors
glm01 = glm(price ~ review_scores_rating * reviews_per_month, data=seadata,family=poisson)
D=summary(glm01)$deviance
df=summary(glm01)$df.residual
data.frame(D,df,dispersion=D/df)
## D df dispersion
## 1 759561.5 7693 98.7341
glmsum = summary(glm01,dispersion=D/df)
v <- vcovHC(glm01,dispersion=D/df)
robust.se <- sqrt(diag(v))
fit=round(cbind(glmsum$coef,robust.se),4)
fit
## Estimate Std. Error z value
## (Intercept) 6.3654 0.1075 59.2244
## review_scores_rating -0.0113 0.0011 -9.9323
## reviews_per_month -0.4509 0.0916 -4.9211
## review_scores_rating:reviews_per_month 0.0037 0.0010 3.8827
## Pr(>|z|) robust.se
## (Intercept) 0e+00 0.1955
## review_scores_rating 0e+00 0.0020
## reviews_per_month 0e+00 0.1288
## review_scores_rating:reviews_per_month 1e-04 0.0013
z = (fit[,1][4]/fit[,5][4])
p_value = 2 * pnorm(-abs(z))
p_value
## review_scores_rating:reviews_per_month
## 0.004425081
# p < 0.05, therefore there is evidence of interaction between review_scores_rating and reviews_per_month
#----------------------------------
glm01 = glm(price ~ reviews_per_month * number_of_reviews, data=seadata,family=poisson)
D=summary(glm01)$deviance
df=summary(glm01)$df.residual
data.frame(D,df,dispersion=D/df)
## D df dispersion
## 1 759131.6 7693 98.67823
glmsum = summary(glm01,dispersion=D/df)
v <- vcovHC(glm01,dispersion=D/df)
robust.se <- sqrt(diag(v))
fit=round(cbind(glmsum$coef,robust.se),4)
fit
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.3571 0.0136 394.8189 0
## reviews_per_month -0.0974 0.0061 -15.9231 0
## number_of_reviews -0.0027 0.0003 -9.8068 0
## reviews_per_month:number_of_reviews 0.0004 0.0000 9.1453 0
## robust.se
## (Intercept) 0.0208
## reviews_per_month 0.0059
## number_of_reviews 0.0003
## reviews_per_month:number_of_reviews 0.0000
z = (fit[,1][4]/fit[,5][4])
p_value = 2 * pnorm(-abs(z))
p_value
## reviews_per_month:number_of_reviews
## 0
# p < 0.05, therefore there is strong evidence of interaction between number_of_reviews and reviews_per_month
#------------------------------------------
glm01 = glm(price ~ review_scores_rating * number_of_reviews, data=seadata,family=poisson)
D=summary(glm01)$deviance
df=summary(glm01)$df.residual
data.frame(D,df,dispersion=D/df)
## D df dispersion
## 1 777218.9 7693 101.0294
glmsum = summary(glm01,dispersion=D/df)
v <- vcovHC(glm01,dispersion=D/df)
robust.se <- sqrt(diag(v))
fit=round(cbind(glmsum$coef,robust.se),4)
fit
## Estimate Std. Error z value
## (Intercept) 6.1794 0.095 65.0722
## review_scores_rating -0.0102 0.001 -10.1328
## number_of_reviews -0.0131 0.004 -3.2375
## review_scores_rating:number_of_reviews 0.0001 0.000 2.6701
## Pr(>|z|) robust.se
## (Intercept) 0.0000 0.1758
## review_scores_rating 0.0000 0.0018
## number_of_reviews 0.0012 0.0039
## review_scores_rating:number_of_reviews 0.0076 0.0000
z = (fit[,1][4]/fit[,5][4])
p_value = 2 * pnorm(-abs(z))
p_value
## review_scores_rating:number_of_reviews
## 0
# p < 0.05, therefore there is strong evidence of interaction between number_of_reviews and reviews_per_month
#-------------------------------------
glm01 = glm(price ~ beds*factor(property_type), data=seadata,family=poisson)
fit.full=glm01
fit.null=glm(price ~ 1, data=seadata,family=poisson)
model=anova(fit.null,fit.full,dispersion=D/df)
model
## Analysis of Deviance Table
##
## Model 1: price ~ 1
## Model 2: price ~ beds * factor(property_type)
## Resid. Df Resid. Dev Df Deviance
## 1 7696 822107
## 2 7650 596291 46 225815
pvalue=1-pchisq(model[,4][2],model[,3][2])
pvalue
## [1] 0
# p < 0.05, therefore there is strong evidence of interaction between beds and property_type
#-------------------------------------
glm01=glm(price ~ bathrooms*factor(property_type), data=seadata,family=poisson)
fit.full=glm01
fit.null=glm(price ~ 1, data=seadata,family=poisson)
model=anova(fit.null,fit.full,dispersion=D/df)
model
## Analysis of Deviance Table
##
## Model 1: price ~ 1
## Model 2: price ~ bathrooms * factor(property_type)
## Resid. Df Resid. Dev Df Deviance
## 1 7696 822107
## 2 7652 628134 44 193973
pvalue=1-pchisq(model[,4][2],model[,3][2])
pvalue
## [1] 0
# p < 0.05, therefore there is strong evidence of interaction between bathrooms and property_type
#------------------------------
glm01=glm(price ~ bedrooms*factor(property_type), data=seadata,family=poisson)
fit.full=glm01
fit.null=glm(price ~ 1, data=seadata,family=poisson)
model=anova(fit.null,fit.full,dispersion=D/df)
model
## Analysis of Deviance Table
##
## Model 1: price ~ 1
## Model 2: price ~ bedrooms * factor(property_type)
## Resid. Df Resid. Dev Df Deviance
## 1 7696 822107
## 2 7652 560974 44 261132
pvalue=1-pchisq(model[,4][2],model[,3][2])
pvalue
## [1] 0
# p < 0.05, therefore there is strong evidence of interaction between bedrooms and property_type
#----------------------------------------------
glm01=glm(price ~ bathrooms*factor(room_type), data=seadata,family=poisson)
fit.full=glm01
fit.null=glm(price ~ 1, data=seadata,family=poisson)
model=anova(fit.null,fit.full,dispersion=D/df)
model
## Analysis of Deviance Table
##
## Model 1: price ~ 1
## Model 2: price ~ bathrooms * factor(room_type)
## Resid. Df Resid. Dev Df Deviance
## 1 7696 822107
## 2 7689 574073 7 248033
pvalue=1-pchisq(model[,4][2],model[,3][2])
pvalue
## [1] 0
# p < 0.05, therefore there is strong evidence of interaction between bathrooms and room_type
#----------------------------------------------
glm01=glm(price ~ bedrooms*factor(room_type), data=seadata,family=poisson)
fit.full=glm01
fit.null=glm(price ~ 1, data=seadata,family=poisson)
model=anova(fit.null,fit.full,dispersion=D/df)
model
## Analysis of Deviance Table
##
## Model 1: price ~ 1
## Model 2: price ~ bedrooms * factor(room_type)
## Resid. Df Resid. Dev Df Deviance
## 1 7696 822107
## 2 7690 590519 6 231587
pvalue=1-pchisq(model[,4][2],model[,3][2])
pvalue
## [1] 0
# p < 0.05, therefore there is strong evidence of interaction between bedrooms and room_type
#--------------------------------------------
glm01=glm(price ~ beds*factor(room_type), data=seadata,family=poisson)
fit.full=glm01
fit.null=glm(price ~ 1, data=seadata,family=poisson)
model=anova(fit.null,fit.full,dispersion=D/df)
model
## Analysis of Deviance Table
##
## Model 1: price ~ 1
## Model 2: price ~ beds * factor(room_type)
## Resid. Df Resid. Dev Df Deviance
## 1 7696 822107
## 2 7689 597097 7 225010
pvalue=1-pchisq(model[,4][2],model[,3][2])
pvalue
## [1] 0
# p < 0.05, therefore there is strong evidence of interaction between beds and room_type.
#-------------------------------------------------
glm01=glm(price ~ reviews_per_month*factor(room_type), data=seadata,family=poisson)
fit.full=glm01
fit.null=glm(price ~ 1, data=seadata,family=poisson)
model=anova(fit.null,fit.full,dispersion=D/df)
model
## Analysis of Deviance Table
##
## Model 1: price ~ 1
## Model 2: price ~ reviews_per_month * factor(room_type)
## Resid. Df Resid. Dev Df Deviance
## 1 7696 822107
## 2 7689 620184 7 201923
pvalue=1-pchisq(model[,4][2],model[,3][2])
pvalue
## [1] 0
# p < 0.05, therefore there is strong evidence of interaction between reviews_per_month and room_type
#-----------------------------------
glm01=glm(price ~ number_of_reviews*factor(room_type), data=seadata,family=poisson)
fit.full=glm01
fit.null=glm(price ~ 1, data=seadata,family=poisson)
model=anova(fit.null,fit.full,dispersion=D/df)
model
## Analysis of Deviance Table
##
## Model 1: price ~ 1
## Model 2: price ~ number_of_reviews * factor(room_type)
## Resid. Df Resid. Dev Df Deviance
## 1 7696 822107
## 2 7689 645509 7 176598
pvalue=1-pchisq(model[,4][2],model[,3][2])
pvalue
## [1] 0
# p < 0.05, therefore there is strong evidence of interaction between number_of_reviews and room_type
#----------------------------------------
glm01=glm(price ~ review_scores_rating*factor(room_type), data=seadata,family=poisson)
fit.full=glm01
fit.null=glm(price ~ 1, data=seadata,family=poisson)
model=anova(fit.null,fit.full,dispersion=D/df)
model
## Analysis of Deviance Table
##
## Model 1: price ~ 1
## Model 2: price ~ review_scores_rating * factor(room_type)
## Resid. Df Resid. Dev Df Deviance
## 1 7696 822107
## 2 7689 662911 7 159195
pvalue=1-pchisq(model[,4][2],model[,3][2])
pvalue
## [1] 0
# p < 0.05, therefore there is strong evidence of interaction between review_score_rating and room_type
#-----------------------------------
glm01=glm(price ~ reviews_per_month*factor(property_type), data=seadata,family=poisson)
fit.full=glm01
fit.null=glm(price ~ 1, data=seadata,family=poisson)
model=anova(fit.null,fit.full,dispersion=D/df)
model
## Analysis of Deviance Table
##
## Model 1: price ~ 1
## Model 2: price ~ reviews_per_month * factor(property_type)
## Resid. Df Resid. Dev Df Deviance
## 1 7696 822107
## 2 7647 719061 49 103045
pvalue=1-pchisq(model[,4][2],model[,3][2])
pvalue
## [1] 0
# p < 0.05, therefore there is strong evidence of interaction between reviews_per_month and property_type
#----------------------------------
glm01=glm(price ~ number_of_reviews*factor(property_type), data=seadata,family=poisson)
fit.full=glm01
fit.null=glm(price ~ 1, data=seadata,family=poisson)
model=anova(fit.null,fit.full,dispersion=D/df)
model
## Analysis of Deviance Table
##
## Model 1: price ~ 1
## Model 2: price ~ number_of_reviews * factor(property_type)
## Resid. Df Resid. Dev Df Deviance
## 1 7696 822107
## 2 7647 731656 49 90450
pvalue=1-pchisq(model[,4][2],model[,3][2])
pvalue
## [1] 0
# p < 0.05, therefore there is strong evidence of interaction between number_of_reviews and property_type
#--------------------------------------
glm01=glm(price ~ review_scores_rating*factor(property_type), data=seadata,family=poisson)
fit.full=glm01
fit.null=glm(price ~ 1, data=seadata,family=poisson)
model=anova(fit.null,fit.full,dispersion=D/df)
model
## Analysis of Deviance Table
##
## Model 1: price ~ 1
## Model 2: price ~ review_scores_rating * factor(property_type)
## Resid. Df Resid. Dev Df Deviance
## 1 7696 822107
## 2 7647 736367 49 85740
pvalue=1-pchisq(model[,4][2],model[,3][2])
pvalue
## [1] 0
# p < 0.05, therefore there is strong evidence of interaction between review_scores_rating and property_type
#----------------------------------------
glm01=glm(price ~ reviews_per_month*beds, data=seadata,family=poisson)
D=summary(glm01)$deviance
df=summary(glm01)$df.residual
data.frame(D,df,dispersion=D/df)
## D df dispersion
## 1 646284.3 7693 84.0094
glmsum = summary(glm01,dispersion=D/df)
v <- vcovHC(glm01,dispersion=D/df)
robust.se <- sqrt(diag(v))
fit=round(cbind(glmsum$coef,robust.se),4)
fit
## Estimate Std. Error z value Pr(>|z|) robust.se
## (Intercept) 4.8418 0.0203 238.2644 0.0000 0.0317
## reviews_per_month -0.0870 0.0076 -11.4374 0.0000 0.0084
## beds 0.2151 0.0073 29.3856 0.0000 0.0105
## reviews_per_month:beds -0.0048 0.0030 -1.6326 0.1025 0.0034
z = (fit[,1][4]/fit[,5][4])
p_value = 2 * pnorm(-abs(z))
p_value
## reviews_per_month:beds
## 0.1580193
# p > 0.05, therefore there is no evidence of interaction between beds and reviews_per_month
#----------------------------------------------
glm01=glm(price ~ reviews_per_month*bedrooms, data=seadata,family=poisson)
D=summary(glm01)$deviance
df=summary(glm01)$df.residual
data.frame(D,df,dispersion=D/df)
## D df dispersion
## 1 652481.4 7693 84.81495
glmsum = summary(glm01,dispersion=D/df)
v <- vcovHC(glm01,dispersion=D/df)
robust.se <- sqrt(diag(v))
fit=round(cbind(glmsum$coef,robust.se),4)
fit
## Estimate Std. Error z value Pr(>|z|) robust.se
## (Intercept) 4.8225 0.0207 232.9086 0.0000 0.0355
## reviews_per_month -0.0742 0.0071 -10.3846 0.0000 0.0086
## bedrooms 0.2850 0.0097 29.2450 0.0000 0.0142
## reviews_per_month:bedrooms -0.0060 0.0038 -1.5541 0.1202 0.0042
z = (fit[,1][4]/fit[,5][4])
p_value = 2 * pnorm(-abs(z))
p_value
## reviews_per_month:bedrooms
## 0.1531275
# p > 0.05, therefore there is no evidence of interaction between bedrooms and reviews_per_month
#----------------------------------
glm01=glm(price ~ reviews_per_month*bathrooms, data=seadata,family=poisson)
D=summary(glm01)$deviance
df=summary(glm01)$df.residual
data.frame(D,df,dispersion=D/df)
## D df dispersion
## 1 691116 7693 89.83699
glmsum = summary(glm01,dispersion=D/df)
v <- vcovHC(glm01,dispersion=D/df)
robust.se <- sqrt(diag(v))
fit=round(cbind(glmsum$coef,robust.se),4)
fit
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.8078 0.0259 185.3128 0.0000
## reviews_per_month -0.1015 0.0099 -10.2393 0.0000
## bathrooms 0.3334 0.0153 21.8181 0.0000
## reviews_per_month:bathrooms 0.0124 0.0066 1.8681 0.0617
## robust.se
## (Intercept) 0.0414
## reviews_per_month 0.0113
## bathrooms 0.0238
## reviews_per_month:bathrooms 0.0079
z = (fit[,1][4]/fit[,5][4])
p_value = 2 * pnorm(-abs(z))
p_value
## reviews_per_month:bathrooms
## 0.1165035
# p > 0.05, therefore there is no evidence of interaction between bathrooms and reviews_per_month
#----------------------------------------------
glm01=glm(price ~ number_of_reviews*bedrooms, data=seadata,family=poisson)
D=summary(glm01)$deviance
df=summary(glm01)$df.residual
data.frame(D,df,dispersion=D/df)
## D df dispersion
## 1 665892.6 7693 86.55825
glmsum = summary(glm01,dispersion=D/df)
v <- vcovHC(glm01,dispersion=D/df)
robust.se <- sqrt(diag(v))
fit=round(cbind(glmsum$coef,robust.se),4)
fit
## Estimate Std. Error z value Pr(>|z|) robust.se
## (Intercept) 4.7636 0.0184 258.4826 0.0000 0.0297
## number_of_reviews -0.0020 0.0002 -8.9446 0.0000 0.0002
## bedrooms 0.2780 0.0089 31.4063 0.0000 0.0124
## number_of_reviews:bedrooms 0.0000 0.0001 0.1387 0.8897 0.0001
z = (fit[,1][4]/fit[,5][4])
p_value = 2 * pnorm(-abs(z))
p_value
## number_of_reviews:bedrooms
## 1
# p > 0.05, therefore there is no strong evidence of interaction between bedrooms and number_of_reviews
#-------------------------------
glm01=glm(price ~ number_of_reviews*beds, data=seadata,family=poisson)
D=summary(glm01)$deviance
df=summary(glm01)$df.residual
data.frame(D,df,dispersion=D/df)
## D df dispersion
## 1 667795.4 7693 86.80559
glmsum = summary(glm01,dispersion=D/df)
v <- vcovHC(glm01,dispersion=D/df)
robust.se <- sqrt(diag(v))
fit=round(cbind(glmsum$coef,robust.se),4)
fit
## Estimate Std. Error z value Pr(>|z|) robust.se
## (Intercept) 4.7847 0.0181 264.3671 0.0000 0.0269
## number_of_reviews -0.0024 0.0002 -10.3175 0.0000 0.0002
## beds 0.2002 0.0064 31.1764 0.0000 0.0091
## number_of_reviews:beds 0.0001 0.0001 0.9291 0.3528 0.0001
z = (fit[,1][4]/fit[,5][4])
p_value = 2 * pnorm(-abs(z))
p_value
## number_of_reviews:beds
## 0.3173105
# p > 0.05, therefore there is no strong evidence of interaction between beds and number_of_reviews
#-------------------------------------------
glm01=glm(price ~ number_of_reviews*bathrooms, data=seadata,family=poisson)
D=summary(glm01)$deviance
df=summary(glm01)$df.residual
data.frame(D,df,dispersion=D/df)
## D df dispersion
## 1 704008.9 7693 91.51292
glmsum = summary(glm01,dispersion=D/df)
v <- vcovHC(glm01,dispersion=D/df)
robust.se <- sqrt(diag(v))
fit=round(cbind(glmsum$coef,robust.se),4)
fit
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.7143 0.0233 202.1202 0.0000
## number_of_reviews -0.0026 0.0003 -8.2908 0.0000
## bathrooms 0.3490 0.0138 25.2563 0.0000
## number_of_reviews:bathrooms 0.0004 0.0002 1.8706 0.0614
## robust.se
## (Intercept) 0.0352
## number_of_reviews 0.0003
## bathrooms 0.0207
## number_of_reviews:bathrooms 0.0002
z = (fit[,1][4]/fit[,5][4])
p_value = 2 * pnorm(-abs(z))
p_value
## number_of_reviews:bathrooms
## 0.04550026
# p < 0.05, therefore there is STRONG evidence of interaction between bathrooms and number_of_reviews
#----------------------------------------------------
glm01=glm(price ~ review_scores_rating*bathrooms, data=seadata,family=poisson)
D=summary(glm01)$deviance
df=summary(glm01)$df.residual
data.frame(D,df,dispersion=D/df)
## D df dispersion
## 1 703343.8 7693 91.42646
glmsum = summary(glm01,dispersion=D/df)
v <- vcovHC(glm01,dispersion=D/df)
robust.se <- sqrt(diag(v))
fit=round(cbind(glmsum$coef,robust.se),4)
fit
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 7.6635 0.2087 36.7239 0
## review_scores_rating -0.0329 0.0022 -14.9878 0
## bathrooms -1.0711 0.1564 -6.8490 0
## review_scores_rating:bathrooms 0.0154 0.0016 9.4194 0
## robust.se
## (Intercept) 0.3912
## review_scores_rating 0.0041
## bathrooms 0.2979
## review_scores_rating:bathrooms 0.0031
z = (fit[,1][4]/fit[,5][4])
p_value = 2 * pnorm(-abs(z))
p_value
## review_scores_rating:bathrooms
## 6.773702e-07
# p < 0.05, therefore there is strong evidence of interaction between bathrooms and review_scores_rating
#------------------------------------------
glm01=glm(price ~ review_scores_rating*beds, data=seadata,family=poisson)
D=summary(glm01)$deviance
df=summary(glm01)$df.residual
data.frame(D,df,dispersion=D/df)
## D df dispersion
## 1 674952.7 7693 87.73596
glmsum = summary(glm01,dispersion=D/df)
v <- vcovHC(glm01,dispersion=D/df)
robust.se <- sqrt(diag(v))
fit=round(cbind(glmsum$coef,robust.se),4)
fit
## Estimate Std. Error z value Pr(>|z|) robust.se
## (Intercept) 6.6556 0.1437 46.3091 0.0000 0.2507
## review_scores_rating -0.0213 0.0015 -14.0102 0.0000 0.0026
## beds -0.1797 0.0653 -2.7531 0.0059 0.1157
## review_scores_rating:beds 0.0042 0.0007 6.0578 0.0000 0.0012
z = (fit[,1][4]/fit[,5][4])
p_value = 2 * pnorm(-abs(z))
p_value
## review_scores_rating:beds
## 0.0004652582
# p < 0.05, therefore there is strong evidence of interaction between beds and review_scores_rating
#-------------------------------------------------
glm01=glm(price ~ review_scores_rating*bedrooms, data=seadata,family=poisson)
D=summary(glm01)$deviance
df=summary(glm01)$df.residual
data.frame(D,df,dispersion=D/df)
## D df dispersion
## 1 662655.9 7693 86.13751
glmsum = summary(glm01,dispersion=D/df)
v <- vcovHC(glm01,dispersion=D/df)
robust.se <- sqrt(diag(v))
fit=round(cbind(glmsum$coef,robust.se),4)
fit
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 6.8469 0.1528 44.8085 0.0000
## review_scores_rating -0.0235 0.0016 -14.5600 0.0000
## bedrooms -0.3071 0.1063 -2.8873 0.0039
## review_scores_rating:bedrooms 0.0064 0.0011 5.7597 0.0000
## robust.se
## (Intercept) 0.2850
## review_scores_rating 0.0030
## bedrooms 0.1775
## review_scores_rating:bedrooms 0.0018
z = (fit[,1][4]/fit[,5][4])
p_value = 2 * pnorm(-abs(z))
p_value
## review_scores_rating:bedrooms
## 0.0003771812
# p < 0.05, therefore there is strong evidence of interaction between review_scores_rating and bedrooms
glm02 <-glm(price ~ (review_scores_rating + reviews_per_month + number_of_reviews + beds + bathrooms + bedrooms + factor(property_type) + factor(room_type))^2, data=seadata,family=poisson)
# plot of fitted values and summary
D=summary(glm02)$deviance
df=summary(glm02)$df.residual
data.frame(D,df,dispersion=D/df)
## D df dispersion
## 1 427064.7 7492 57.00276
glmsum = summary(glm02,dispersion=D/df)
#glmsum$coef
hist(glm02$fitted.values)
#Fitted vs originals
plot(glm02$fitted.values,seadata$price, col="lightblue")
abline(a=0, b=1)
scatter.smooth(glm02$fitted.values, glm02$residuals,cex=0.5,col="lightblue")
## MSE and RMSE
mean((glm02$fitted.values - seadata$price)^2)
## [1] 20012.79
sqrt(mean((glm02$fitted.values - seadata$price)^2))
## [1] 141.4666
#------------------------------------
# Overall LRT test
fit.full=glm02
fit.null=glm(price ~ 1, data=seadata,family=poisson)
model=anova(fit.null,fit.full,dispersion=D/df)
model
## Analysis of Deviance Table
##
## Model 1: price ~ 1
## Model 2: price ~ (review_scores_rating + reviews_per_month + number_of_reviews +
## beds + bathrooms + bedrooms + factor(property_type) + factor(room_type))^2
## Resid. Df Resid. Dev Df Deviance
## 1 7696 822107
## 2 7492 427065 204 395042
pvalue=1-pchisq(model[,4][2],model[,3][2])
pvalue
## [1] 0
#Conclusion: The null hypothesis test is statistically significant, therefore not all regression coefficients are 0.
# No normality assumption for log-linear regression/poisson model
# Poisson regression an assumption is that the variance is proportional to the mean (hence not constant), so we should expect the residuals to display non-constant variance.
#Residuals versus linear predictors: The smooth curve is linear
r=residuals(glm02,type="pearson")
scatter.smooth(glm02$linear.predictors,r,cex=0.5,col="lightblue")
#Absolute residuals versus linear predictors: There is no mean-variance relationship for the residuals and therefore the assumption of variance being proportional to mean is true.
scatter.smooth(glm02$linear.predictors,abs(r),cex=0.5,col="lightblue")
# Pearson and deviance residual plots
scatter.smooth(glm02$linear.predictors,residuals(glm02,type="deviance"),cex=0.5,col="lightblue")
scatter.smooth(glm02$linear.predictors,residuals(glm02,type="pearson"),cex=0.5,col="lightblue")
# Overall summary: The model assumptions are satisfied very well for the Poisson regression model