# The Problem

`average = (G3.x + G3.y) / 2`

# Preparing the Data

The target variable is defined as the mean of the two final grades: `average = (G3.x + G3.y) / 2`.
```r
# Merge both datasets into one, joining on every shared student attribute
combined.df <- merge(
  mgrades.df, pgrades.df,
  by = c(
    "school", "sex", "age", "address", "famsize", "higher", "Pstatus",
    "Medu", "Fedu", "Mjob", "Fjob", "reason", "guardian", "traveltime",
    "studytime", "schoolsup", "famsup", "paid", "activities", "nursery",
    "internet", "romantic", "famrel", "freetime", "goout", "Dalc", "Walc",
    "health"
  )
)

# Persist the merged data set for later reuse
write.csv(combined.df, file = "combined.csv")
```
```r
# Preparing the data - switching "yes" and "no" to "1" and "0"
combined.df$schoolsup <- ifelse(combined.df$schoolsup == "yes", 1, 0)
```

# Understanding the data

## Correlation

```r
# Correlation matrix over the numeric columns, drawn as a heatmap
average <- combined.df$average
res <- cor(combined.df[, c(14:39)], use = "complete.obs")
col <- colorRampPalette(c("blue", "white", "red"))(20)
heatmap(x = res, col = col, symm = TRUE)
```
```r
# Ten variables most negatively correlated with the average grade
targetCol <- which(names(combined.df) == "average")
startCol <- which(names(combined.df) == "traveltime")
endCol <- which(names(combined.df) == "G3.y")
sort(
  cor(combined.df[, c(targetCol, startCol:endCol)], use = "complete.obs")[1, ],
  decreasing = FALSE
)[1:10]
```

## R graphs

```r
# Mean of selected behavioural variables, split by sex
aggregate(
  cbind(goout, romantic, Walc, studytime, internet) ~ sex,
  data = combined.df,
  FUN = mean
)
```
```r
# Mean of selected behavioural variables, split by school
aggregate(
  cbind(goout, romantic, Walc, studytime, internet) ~ school,
  data = combined.df,
  FUN = mean
)
```
```r
# Average grade by mother's job, coloured by guardian
ggplot(combined.df, aes(x = Mjob, y = average, fill = guardian)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = c("#4682B4", "#87CEEB", "#6495ED")) +
  theme_minimal() +
  ylim(0, 1000)
```
```r
# Average grade by father's job, coloured by guardian
ggplot(combined.df, aes(x = Fjob, y = average, fill = guardian)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = c("#4682B4", "#87CEEB", "#6495ED")) +
  theme_minimal() +
  ylim(0, 1000)
```
```r
# Fit a regression tree for the average grade on the training split
library(rpart)
library(rpart.plot)

rt <- rpart(
  average ~ address + Fedu + studytime + schoolsup + famsup + higher +
    internet + romantic + goout,
  data = combined.df.training
)

# Plotting the tree
prp(rt, type = 1, extra = 1)
```

# Generating and Testing Prediction Models

## Lasso Regularisation with optimized lambda

```r
# Build the predictor matrix / response vector from the training split
startCol <- which(names(combined.df) == "traveltime")
endCol <- which(names(combined.df) == "health")
xknown <- as.matrix(combined.df.training[, startCol:endCol])
yknown <- combined.df.training$average

# Basic Lasso regularisation analysis
library("glmnet")
library("Matrix")
library("foreach")
lm.lasso <- glmnet(xknown, yknown, family = "gaussian")
plot(lm.lasso, xvar = "lambda", label = TRUE)
coef(lm.lasso, s = exp(0))

# Using cross validation to optimise lambda for the Lasso model
set.seed(101)
lm.lasso.cv <- cv.glmnet(xknown, yknown, nfolds = 5, family = "gaussian")

# Value of the optimised lambda
lm.lasso.cv$lambda.min
(minLogLambda <- log(lm.lasso.cv$lambda.min))

# Plot the coefficient paths with the optimised lambda marked
plot(lm.lasso, xvar = "lambda", label = TRUE)
abline(v = log(lm.lasso.cv$lambda.min))

# Coefficients of the regularised linear regression at the optimal lambda
coef(lm.lasso.cv, s = "lambda.min")
```
```r
# Determining the generalised error rate for this lasso model
xtest <- as.matrix(combined.df.test[, startCol:endCol])
grades.lasso.pred <- predict(lm.lasso.cv, newx = xtest, s = "lambda.min")

# Calculate RMSE on the held-out test set
# (the original called predict() on the prediction vector itself,
# which is an error — predict from the fitted cv model instead)
rmse <- sqrt(mean((combined.df.test$average - grades.lasso.pred)^2))

# Error (MAE) as a function of log(lambda)
# NOTE(review): mae() is assumed to come from an attached package
# (e.g. Metrics) — confirm it is loaded.
errorvals <- sapply(-6:2, function(loglambda) {
  mae(
    predict(lm.lasso.cv, newx = xtest, s = exp(loglambda)),
    combined.df.test$average
  )
})
plot(-6:2, errorvals, xlab = "log lambda", ylab = "error (mae)", type = "o")
abline(v = log(lm.lasso.cv$lambda.min))
```

## Regression Tree (rpart)

```r
# Fit a regression tree for the average grade on the training split
library(rpart)
library(rpart.plot)

rt <- rpart(
  average ~ address + Fedu + studytime + schoolsup + famsup + higher +
    internet + romantic + goout,
  data = combined.df.training
)

# Plotting the tree
prp(rt, type = 1, extra = 1)
```
```r
# Calculate RMSE for the regression-tree predictions on the test set
# (the original was missing a closing parenthesis)
grades.rpart.pred <- predict(rt, newdata = combined.df.test)
rmse <- sqrt(mean((combined.df.test$average - grades.rpart.pred)^2))
```

## Random Forest

```r
# Random Forest
library(randomForest)
grades.forest <- randomForest(
  average ~ .,
  data = combined.df.boost,
  importance = TRUE,
  ntree = 100
)

# Calculate RMSE on the test set
grades.forest.pred <- predict(grades.forest, newdata = combined.df.test)
rmse <- sqrt(mean((combined.df.test$average - grades.forest.pred)^2))

# Plot the forest's error curve and compare predictions with actual values
plot(grades.forest)
plot(combined.df.test$average, pch = 16)
points(grades.forest.pred, col = "blue", pch = 4)
```

## Ensemble Model

```r
# ENSEMBLE MODEL (rpart & random forest): average the two predictions
ensemble_pred <- (grades.rpart.pred + grades.forest.pred) / 2

# RMSE measured against the actual test values
# (the original compared against an undefined variable `pred`)
rmse <- sqrt(mean((combined.df.test$average - ensemble_pred)^2))
```

# Problem Conclusions and Recommendations

MSci Management Science with Artificial Intelligence student at UCL. Currently on my last year completing a masters concentration in Business Analytics.

## More from Rosa Caminal

MSci Management Science with Artificial Intelligence student at UCL. Currently on my last year completing a masters concentration in Business Analytics.