# R Programming Task using the Data Analytics Approach

# Compactly display the structure of creditDF
str(creditDF)

# Q1)
# Exploratory Data Analysis

# Count the missing (NA) entries in a single column.
# NOTE: this name shadows base R's missing() in the global environment.
missing <- function(col) {
  na_flags <- is.na(col)
  sum(na_flags)
}
# Count the missing values in every column. vapply() iterates over the columns
# of the data frame directly, whereas apply() would first coerce the whole
# data frame to a matrix.
vapply(creditDF, missing, integer(1))

# As we can see, there are no missing values in the data.
# Next we look at the distribution of the dependent variable, Balance:

summary(creditDF$Balance)

hist(creditDF$Balance)

boxplot(creditDF$Balance)

# As we can see, the distribution of the response variable is highly skewed.

# Next we look at the relationship between Balance and all the other variables.

# First, most of the values are near zero, so we count how many are exactly zero.
# sum() over the logical comparison counts the TRUE entries; na.rm = TRUE keeps
# any NA entries from being counted or turning the result into NA.
sum(creditDF$Balance == 0, na.rm = TRUE)

# As we can see, 85 of the Balance values are zero,
# so the data is highly zero-inflated.

# Scatterplot matrix of every pair of variables
pairs(creditDF)

# The last row (or last column) of the plot shows the relationship between Balance and the other independent variables.
# Many variables appear to be strongly linearly related to Balance, so applying multiple linear regression is a reasonable idea.
# To get an idea about the continuous variables, we compute the correlations between them.

cor(creditDF[, c(1:5, 11)])

# The last row shows that Income is moderately correlated with Balance, while Limit and Rating are highly correlated with Balance.
# One thing to take care of: the covariates Limit and Rating are highly correlated with each other, and so are Income and Limit.
# We need to handle this, since it indicates a multicollinearity problem.
# Now we look at the distribution of the categorical variables

table(creditDF$Education)
# Most of the observations are "some college", followed by "some HS"

table(creditDF$Gender)
# The gender distribution is almost balanced

table(creditDF$Student)
# Most of the credit card holders are non-students

table(creditDF$Married)

table(creditDF$Area)

# Credit card holders are mostly from urban areas, followed by suburban areas, with the fewest in rural areas
# Q2

# Train/test split of the data
set.seed(1)

# Proportion of the data to use for training
prop <- 0.8

# Row indices of the training observations, drawn at random from the full data.
# seq_len() behaves correctly even for a zero-row data frame, and floor()
# makes the sample size an explicit whole number instead of relying on
# implicit truncation.
train_index <- sample(
  seq_len(nrow(creditDF)),
  size = floor(nrow(creditDF) * prop)
)

# Training data
train_df <- creditDF[train_index, ]

# Test data: every row that was not drawn for training
test_df <- creditDF[-train_index, ]

# Q3)

# Fit a multiple linear regression of Balance on all other variables

model_1 <- lm(Balance~., data = train_df )
summary(model_1)
library(car)
vif(model_1)

# As we can see, the model suffers from severe multicollinearity.
# The VIF (variance inflation factor) for the variables Limit and Rating is very high; a VIF above 10 is commonly taken to indicate severe multicollinearity.
# So we need to remove one of them from the model to resolve this issue.

# We remove the variable Limit from the model and then refit it.

model_2 <- lm(Balance~.-Limit, data = train_df )
summary(model_2)
vif(model_2)
# Now we can see that there is no multicollinearity in the model.

# Next we apply a model-selection technique to choose variables, and then pick
# the subset size whose model has the minimum test error.
library(leaps)

# Stepwise (sequential replacement) selection of the best variables
reg_fit_best <- regsubsets(
  Balance ~ . - Limit,
  data = train_df,
  method = "seqrep"
)

# Design matrix X built from the test data
test.mat <- model.matrix(Balance ~ . - Limit, data = test_df)

# Compute the test MSE of the selected model for each subset size, so that the
# variables of the best-performing size can be used for the final model.
# regsubsets() was called with its default nvmax, so sizes 1 to 8 are evaluated.
max_size <- 8
val.errors <- numeric(max_size)
for (i in seq_len(max_size)) {
  coefi <- coef(reg_fit_best, id = i)
  # drop = FALSE keeps the selected columns as a matrix for the product
  pred <- test.mat[, names(coefi), drop = FALSE] %*% coefi
  val.errors[i] <- mean((test_df$Balance - pred)^2)
}
# Test MSE for every subset size
val.errors
# Number of variables in the model with the minimum test MSE
best_size <- which.min(val.errors)
best_size
# NOTE(review): the size is chosen using the test set itself, so the minimum
# test MSE above is an optimistic estimate of the final model's error.

# Coefficients of the model with the minimum test MSE
coef(reg_fit_best, id = best_size)
# In the run recorded here, the 4-variable model had the minimum test MSE,
# so the best model selected is:

# Balance = -596.604124 - 7.923768 Income + 4.034338 Rating + 210.295017 EducationHS + 408.559069 StudentYes