library(readxl)
CSDATA <- read_excel("CSDATA.xlsx")
dim(CSDATA)
#Fitting the linear regression on all the independent variables
lm.fit = lm(Y ~ ., data = CSDATA)
# Q1.
#VIF for each of the independent variables
library(car)
vif(lm.fit)
# We can see that the variable X2 has the highest VIF, which indicates the presence of multicollinearity.
# Hence, we'll remove X2 from the model.
lm.fit_1 = lm(Y~.-X2, data = CSDATA)
vif(lm.fit_1)
# Multicollinearity is no longer a concern, since no remaining variable has a VIF greater than 5.
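# For reference, vif() can be reproduced by hand: the VIF of a predictor is
# 1 / (1 - R^2) from regressing it on the other predictors. A minimal sketch
# for X1, assuming the remaining predictors are X3, X4 and X5:
r2_x1 <- summary(lm(X1 ~ X3 + X4 + X5, data = CSDATA))$r.squared
1 / (1 - r2_x1)  # should match vif(lm.fit_1)["X1"]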
# Q2.
summary(lm.fit_1)
# The p-value for the variable X3 is 0.273798 > alpha = 0.05, so we fail to reject the null hypothesis
# that its coefficient is zero, i.e., X3 is insignificant.
# Therefore, we'll remove X3 from the model.
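# The same p-value can be extracted programmatically (a sketch; it assumes the
# coefficient row in the summary table is named "X3"):
coef(summary(lm.fit_1))["X3", "Pr(>|t|)"]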
# After removing X3, the fitted model is:
lm.fit_2 = lm(Y~.-X2-X3, data = CSDATA)
summary(lm.fit_2)
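# Quick programmatic check (a sketch) that every remaining coefficient is
# significant at alpha = 0.05:
coef(summary(lm.fit_2))[, "Pr(>|t|)"] < 0.05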
# We now have a model with no multicollinearity and only significant variables,
# so only X1, X4 and X5 are included in the model.
# The fitted equation is
# Y = 8.961389 + 0.250103 X1 - 0.900124 X4 - 1.208128 X5
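# Sanity check on the fitted equation (a sketch; the values X1 = 50, X4 = 2,
# X5 = 1 are hypothetical, and CSDATA is assumed to contain only Y and X1-X5):
new_obs <- data.frame(X1 = 50, X4 = 2, X5 = 1)
predict(lm.fit_2, newdata = new_obs)
8.961389 + 0.250103*50 - 0.900124*2 - 1.208128*1  # same value by hand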
# Q3.
# Residuals from the model obtained above
resi = lm.fit_2$residuals
# Partial Residual Analysis on the X1 variable.
lm.fit_3 = lm(resi~X1, data = CSDATA)
summary(lm.fit_3)
# The p-value for the slope on X1 is 1 > alpha = 0.05, so we do not have enough evidence to reject the null hypothesis that the slope is zero, i.e., no additional (e.g., squared) term in X1 is needed.
# Partial Residual Analysis on the X4 variable.
lm.fit_4 = lm(resi~X4, data = CSDATA)
summary(lm.fit_4)
# Likewise, the p-value here is 1 > alpha = 0.05, so we do not have enough evidence to reject the null hypothesis, i.e., no additional term in X4 is needed.
# Partial Residual Analysis on the X5 variable.
lm.fit_5 = lm(resi~X5, data = CSDATA)
summary(lm.fit_5)
# The same holds for X5: the p-value is 1 > alpha = 0.05, so no additional term in X5 is needed.
# From the partial residual analyses, we see that no relationship remains between the residuals and the three independent variables.
# So the final model has independent variables X1, X4 and X5. A direct check of the squared-term question follows below.
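# An equivalent direct check (a sketch, not part of the original analysis):
# add the squared term to the model and inspect its t-test, e.g. for X1.
lm.fit_sq <- lm(Y ~ X1 + X4 + X5 + I(X1^2), data = CSDATA)
summary(lm.fit_sq)
# An insignificant coefficient on I(X1^2) supports the conclusion above.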
# Q4.
# Final fit
lmfit = lm(Y ~ X1 + X4 + X5, data = CSDATA)
summary(lmfit)
# a.
# The R-squared value is 0.9607, which means 96.07% of the variation in Y (consumption expenditure) is explained by the multiple linear regression.
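# The reported R-squared can be recovered by hand as 1 - SSE/SST (a sketch):
sse <- sum(residuals(lmfit)^2)
sst <- sum((CSDATA$Y - mean(CSDATA$Y))^2)
1 - sse / sst  # should match the Multiple R-squared, 0.9607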
# b.
# For X1: for every $1 increase in income, consumption expenditure increases by $0.250103, holding the other variables fixed.
# For X4: for every 1-year increase in the age of dependents, consumption expenditure decreases by $0.900124, holding the other variables fixed.
# For X5: if the individual has a college education, consumption expenditure is $1.208128 lower than for an individual without a college education, holding the other variables fixed. See the illustration below.
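# Illustration of the X5 interpretation (a sketch; X1 = 50 and X4 = 2 are
# hypothetical values): two otherwise-identical individuals differ in
# predicted expenditure by exactly the X5 coefficient.
d <- data.frame(X1 = c(50, 50), X4 = c(2, 2), X5 = c(0, 1))
diff(predict(lmfit, newdata = d))  # -1.208128: college lowers expenditure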
# c.
# H0: the coefficient on X1 equals 0.25
# Ha: the coefficient on X1 is greater than 0.25 (right-tailed test)
# To test this, we write a function that computes the one-sided p-value for a hypothesized coefficient value.
ttest <- function(reg, coefnum, val){
  co <- coef(summary(reg))                           # coefficient table: estimates, SEs, t values, p-values
  tstat <- (co[coefnum, 1] - val) / co[coefnum, 2]   # t = (estimate - hypothesized value) / standard error
  pt(tstat, reg$df.residual, lower.tail = FALSE)     # right-tailed p-value
}
ttest(lmfit, 2, 0.25)  # coefficient 2 is X1 (the row after the intercept)
# The p-value for the test is 0.4825303 > alpha = 0.05,
# so we do not have enough evidence to reject the null hypothesis.
# Note that failing to reject does not prove H0 is true;
# the data are simply consistent with the coefficient on X1 being 0.25.
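# Cross-checks (a sketch): the hand-computed t-statistic behind ttest(), and
# a 95% confidence interval for the X1 coefficient, which should contain 0.25.
(coef(lmfit)["X1"] - 0.25) / coef(summary(lmfit))["X1", "Std. Error"]
confint(lmfit, "X1", level = 0.95)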