library(readxl)

# Read the data set from the Excel workbook in the working directory.
# The path must be wrapped in plain ASCII double quotes: typographic
# (curly) quotes are not string delimiters in R and cause a parse error.
CSDATA <- read_excel("CSDATA.xlsx")

# Number of rows (observations) and columns (variables) in the data.
dim(CSDATA)

# Baseline model: linear regression of Y on every other column of CSDATA.
lm.fit <- lm(Y ~ ., data = CSDATA)

# Q1.

# Variance inflation factor (VIF) of each predictor in the full model.
library(car)
vif(lm.fit)

# X2 has the highest VIF, which indicates multicollinearity,
# so the model is refitted without X2.
lm.fit_1 <- lm(Y ~ . - X2, data = CSDATA)

# Re-check the VIFs of the remaining predictors.
vif(lm.fit_1)

# Multicollinearity is removed from the model as we can see that no variable has VIF greater than 5

#Q2.

# Coefficient table of the model without X2.
summary(lm.fit_1)

# The p-value of X3 is 0.273798 > alpha = 0.05, so we fail to reject the
# null hypothesis that its coefficient is zero, i.e. X3 is not
# statistically significant. X3 is therefore removed from the model too.

# Model refitted without X2 and X3.
lm.fit_2 <- lm(Y ~ . - X2 - X3, data = CSDATA)
summary(lm.fit_2)

# Now, we obtained the model with no multicollinearity and all significant variables

# So only variables X1, X4 and X5 are included in the model

# And fitted equation is

# Y_hat = 8.961389 + 0.250103 X1 - 0.900124 X4 - 1.208128 X5

# Q 3.

# Residuals of the reduced model lm.fit_2.
resi <- residuals(lm.fit_2)

# NOTE(review): least-squares residuals are orthogonal to every regressor
# that is already in the fitted model. If X1, X4 and X5 are all regressors
# of lm.fit_2 (TODO confirm against the columns of CSDATA), a slope of 0
# and a p-value of 1 in the regressions below are expected by construction,
# so they cannot reveal a missing squared term. Consider testing I(X1^2)
# etc. directly in the model, or inspecting car::crPlots(lm.fit_2).

# Residual analysis for X1: regress the residuals on X1.
lm.fit_3 <- lm(resi ~ X1, data = CSDATA)
summary(lm.fit_3)
# The p-value of this regression is 1 > alpha = 0.05, so we do not have
# enough evidence to reject the null hypothesis; no squared term is added.

# Residual analysis for X4: regress the residuals on X4.
lm.fit_4 <- lm(resi ~ X4, data = CSDATA)
summary(lm.fit_4)
# The p-value of this regression is 1 > alpha = 0.05, so we do not have
# enough evidence to reject the null hypothesis; no squared term is added.

# Residual analysis for X5: regress the residuals on X5.
lm.fit_5 <- lm(resi ~ X5, data = CSDATA)
summary(lm.fit_5)
# The p-value of this regression is 1 > alpha = 0.05, so we do not have
# enough evidence to reject the null hypothesis; no squared term is added.

# For the model obtained in Q2, we see no linear relationship between the residuals and the three independent variables.

# So final model has independent variables X1, X4 and X5

# Q 4.

# Final model: Y regressed on X1, X4 and X5 only.
lmfit <- lm(Y ~ X1 + X4 + X5, data = CSDATA)

# Coefficient estimates, standard errors, p-values and R-squared.
summary(lmfit)

# a.

# The R-squared value is 0.9607, i.e. 96.07% of the variation in Y is explained by the multiple linear regression.

# b.

# For X1 : For every $1 increase in the income there is $0.250103 increase in the consumption expenditure.

# For X4 : For every 1 year increase in the age of dependents there is $0.900124 decrease in the consumption expenditure.

# For X5 : If the individual has a college education, the consumption expenditure is $1.208128 less than for an individual without a college education.

# c.

# Ho : coefficient corresponding to X1 is 0.25

# Ha :coefficient corresponding to X1 more than 0.25 (right tailed test)

# So for this we have to write a function to see that if we reject the above stated hypothesis

# t-test of a single regression coefficient against a hypothesised value.
#
# Tests H0: beta = val using the estimate and standard error taken from
# coef(summary(reg)) and the residual degrees of freedom of the model.
#
# Arguments:
#   reg         fitted model, e.g. the result of lm().
#   coefnum     row of the coefficient table to test, given as a position
#               or as a coefficient name.
#   val         value of the coefficient under the null hypothesis.
#   alternative direction of the alternative hypothesis: "greater"
#               (right-tailed, the default), "less" (left-tailed) or
#               "two.sided".
#
# Returns the p-value of the test.
ttest <- function(reg, coefnum, val,
                  alternative = c("greater", "less", "two.sided")) {
  alternative <- match.arg(alternative)

  # Columns 1 and 2 of the coefficient table hold the estimate and its
  # standard error.
  co <- coef(summary(reg))
  tstat <- (co[coefnum, 1] - val) / co[coefnum, 2]
  df <- reg$df.residual

  switch(alternative,
    greater = pt(tstat, df, lower.tail = FALSE),
    less = pt(tstat, df, lower.tail = TRUE),
    two.sided = 2 * pt(abs(tstat), df, lower.tail = FALSE)
  )
}

# Right-tailed test of H0: coefficient of X1 (row 2 of the table) is 0.25.
ttest(lmfit, coefnum = 2, val = 0.25)

# So here as we can see that p value corresponding to the test is 0.4825303 > alpha = 0.05

# which implies that here we do not have enough evidence to reject null hypothesis

# i.e., the data are consistent with the coefficient of X1 being 0.25

# (failing to reject the null hypothesis does not prove that it is true)