Analysing & predicting High School student performance

Source: Unsplash.

The Problem

# Target variable: the mean of the two final grades — G3 from the maths
# dataset and G3 from the Portuguese dataset (suffixed .x/.y after merging).
average = (G3.x + G3.y) / 2

Preparing the Data

# Merge the maths (mgrades.df) and Portuguese (pgrades.df) datasets on the
# shared student-identifying attributes; the per-subject grade columns
# (G1, G2, G3) get .x/.y suffixes in the merged frame.
combined.df <- merge(
  mgrades.df, pgrades.df,
  by = c("school", "sex", "age", "address", "famsize", "higher", "Pstatus",
         "Medu", "Fedu", "Mjob", "Fjob", "reason", "guardian", "traveltime",
         "studytime", "schoolsup", "famsup", "paid", "activities", "nursery",
         "internet", "romantic", "famrel", "freetime", "goout", "Dalc",
         "Walc", "health")
)
# Target variable: mean of the two final grades (G3.x = maths, G3.y = Portuguese).
# Fixed: the original computed `average` BEFORE the merge, on bare names
# (G3.x, G3.y) that only exist as columns of combined.df after merging.
combined.df$average <- (combined.df$G3.x + combined.df$G3.y) / 2
write.csv(combined.df, file = "combined.csv")
# Preparing the data — recode the yes/no school-support flag as 1/0 so it
# can be used as a numeric predictor.
combined.df$schoolsup <- ifelse(combined.df$schoolsup == "yes", 1, 0)

Understanding the data

Correlation

# Pull the target out for convenience.
# NOTE(review): this vector appears unused below — the model formulas
# resolve `average` inside their `data` argument; confirm before removing.
average <- combined.df$average
# Correlation matrix over the numeric attribute columns, selected by NAME
# rather than the original hard-coded positions 14:39, consistent with the
# name-based selection used for the target correlations below.
startCol <- which(names(combined.df) == "traveltime")
endCol <- which(names(combined.df) == "G3.y")
res <- cor(combined.df[, startCol:endCol], use = "complete.obs")
col <- colorRampPalette(c("blue", "white", "red"))(20)
heatmap(x = res, col = col, symm = TRUE)
# Ten attributes most strongly NEGATIVELY correlated with the average grade:
# row 1 of the correlation matrix is `average` vs everything, and
# decreasing = FALSE puts the most negative coefficients first.
targetCol <- which(names(combined.df) == "average")
sort(cor(combined.df[, c(targetCol, startCol:endCol)],
         use = "complete.obs")[1, ],
     decreasing = FALSE)[1:10]

R graphs

# Mean of selected behavioural attributes, split by sex and by school.
# NOTE(review): this assumes romantic/internet are numeric — only schoolsup
# is recoded from yes/no above; mean() on "yes"/"no" strings will fail.
# Confirm the other flags are recoded elsewhere.
aggregate(cbind(goout, romantic, Walc, studytime, internet) ~ sex,
          data = combined.df, FUN = mean)
aggregate(cbind(goout, romantic, Walc, studytime, internet) ~ school,
          data = combined.df, FUN = mean)
# Stacked bars of `average` by parent's job, coloured by guardian.
# NOTE(review): stat = "identity" SUMS the per-student averages within each
# bar (hence the 0-1000 axis); use stat = "summary", fun = mean to show the
# mean grade per group instead.
ggplot(combined.df, aes(x = Mjob, y = average, fill = guardian)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = c("#4682B4", "#87CEEB", "#6495ED")) +
  theme_minimal() +
  ylim(0, 1000)
ggplot(combined.df, aes(x = Fjob, y = average, fill = guardian)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = c("#4682B4", "#87CEEB", "#6495ED")) +
  theme_minimal() +
  ylim(0, 1000)
library(rpart)
library(rpart.plot)
# Regression tree for the average grade from demographic/behavioural features.
rt <- rpart(average ~ address + Fedu + studytime + schoolsup + famsup +
              higher + internet + romantic + goout,
            data = combined.df.training)
# Plotting the tree (fixed: this comment was fused onto the line above)
prp(rt, type = 1, extra = 1)

Tableau Graphs

Generating and Testing Prediction Models

Lasso Regularisation with an optimised lambda

# Predictor range for the model matrix, selected by column name.
startCol <- which(names(combined.df) == "traveltime")
endCol <- which(names(combined.df) == "health")
xknown <- as.matrix(combined.df.training[, startCol:endCol])
yknown <- combined.df.training$average

# Basic lasso regularisation analysis
# (fixed: library("glmnet") was fused onto the comment line and never ran)
library("glmnet")
library("Matrix")
library("foreach")
lm.lasso <- glmnet(xknown, yknown, family = "gaussian")
plot(lm.lasso, xvar = "lambda", label = TRUE)
coef(lm.lasso, s = exp(0))

# Cross-validation (5-fold) to choose an optimised lambda for the lasso model
set.seed(101)
lm.lasso.cv <- cv.glmnet(xknown, yknown, nfolds = 5, family = "gaussian")
# The optimised lambda and its log
# (fixed: lambda.min is a value, not a function — the original tried to
# call lm.lasso.cv$lambda.min(...))
lm.lasso.cv$lambda.min
minLogLambda <- log(lm.lasso.cv$lambda.min)

# Coefficient paths with the optimised lambda marked
plot(lm.lasso, xvar = "lambda", label = TRUE)
abline(v = log(lm.lasso.cv$lambda.min))
# Coefficients of the regularised linear regression at the optimal lambda
# (fixed: this call was fused onto its comment line)
coef(lm.lasso.cv, s = "lambda.min")

# Generalisation error of the lasso model on the hold-out set
xtest <- as.matrix(combined.df.test[, startCol:endCol])
lm.lasso.cv.pred <- predict(lm.lasso.cv, newx = xtest, s = "lambda.min")
# Calculate RMSE
# (fixed: the original called predict() on the prediction vector itself —
# the hold-out predictions are already in lm.lasso.cv.pred)
grades.lasso.pred <- as.vector(lm.lasso.cv.pred)
rmse <- sqrt(mean((combined.df.test$average - grades.lasso.pred)^2))

# MAE across a grid of log-lambda values
# (fixed: the sapply was missing a closing parenthesis and three statements
# were fused onto one line; mae() is defined inline since no package
# providing it is loaded in the visible script)
mae <- function(pred, actual) mean(abs(actual - pred))
errorvals <- sapply(-6:2, function(loglambda) {
  mae(predict(lm.lasso.cv, newx = xtest, s = exp(loglambda)),
      combined.df.test$average)
})
plot(-6:2, errorvals, xlab = "log lambda", ylab = "error (mae)", type = "o")
abline(v = log(lm.lasso.cv$lambda.min))

rpart (Regression Tree)

library(rpart)
library(rpart.plot)
# Regression tree for the average grade from demographic/behavioural features.
rt <- rpart(average ~ address + Fedu + studytime + schoolsup + famsup +
              higher + internet + romantic + goout,
            data = combined.df.training)
# Plotting the tree (fixed: this comment was fused onto the line above)
prp(rt, type = 1, extra = 1)
# Calculate RMSE on the hold-out set
grades.rpart.pred <- predict(rt, newdata = combined.df.test)
# (fixed: the original was missing a closing parenthesis here)
rmse <- sqrt(mean((combined.df.test$average - grades.rpart.pred)^2))

Random Forest

# Random Forest:
library(randomForest)
# Fit a 100-tree forest predicting `average` from every other column of
# combined.df.boost, keeping variable-importance measures.
# NOTE(review): if combined.df.boost still contains the raw G3.x/G3.y grade
# columns, `average ~ .` leaks the target (average is derived from them) —
# confirm those columns were dropped when combined.df.boost was built.
# NOTE(review): no set.seed() before the fit, so results are not reproducible.
grades.forest <- randomForest(average ~ . , data =combined.df.boost, importance=TRUE, ntree=100)
#Calculate RMSE: hold-out predictions vs the true average grades.
grades.forest.pred = predict(grades.forest,newdata=combined.df.test)
rmse = sqrt(mean((combined.df.test$average-grades.forest.pred)^2))
#Plotting the forest error-vs-trees curve, then a prediction comparison:
# actual values as black dots, forest predictions as blue crosses.
plot(grades.forest)
plot(combined.df.test$average,pch=16)
points(grades.forest.pred, col = "blue", pch=4)

Ensemble Model

# ENSEMBLE MODEL (rpart & random forest): simple average of the two models'
# hold-out predictions.
ensemble_pred <- (grades.rpart.pred + grades.forest.pred) / 2
# Calculate the ensemble RMSE
# (fixed: the original fused two statements onto one line and measured the
# error against an undefined `pred` instead of the true hold-out averages)
rmse <- sqrt(mean((combined.df.test$average - ensemble_pred)^2))

Problem Conclusions and Recommendations

MSci Management Science with Artificial Intelligence student at UCL, currently in my final year, completing a master's concentration in Business Analytics.