source("04_validation_croisee_code.R") source("05_d_svd_mca_code.R") source("15_loocv_code.R") source("19_nystroem_approximation_code.R") dataset.housing <- function() { set.seed(1123) # loading dataset in memory dat <- read.csv(file="data/housing.csv", header=TRUE) # transform ocean_proximity into a factor dat$ocean_proximity <- as.factor(dat$ocean_proximity) levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="<1H OCEAN"] <- "O:<1H" levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="ISLAND"] <- "O:ISL" levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="INLAND"] <- "O:INL" levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="NEAR BAY"] <- "O:NB" levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="NEAR OCEAN"] <- "O:NO" # replace total_bedrooms missing values with the median value dat$total_bedrooms[is.na(dat$total_bedrooms)] <- median(dat$total_bedrooms, na.rm=TRUE) # remove individuals corresponding to the Island modality of ocean_proximity dat <- dat[dat$ocean_proximity != "O:ISL", ] # remove empty modalities (here, only "O:ISL") dat <- droplevels(dat) # introduce new variable for number of rooms by households dat['rooms'] <- dat['total_rooms'] / dat['households'] # introduce new variable for number of bedrooms by households dat['bedrooms'] <- dat['total_bedrooms'] / dat['households'] # introduce new variable for number of people by households dat['pop'] <- dat['population'] / dat['households'] # remove individuals with extremely high values of the target dat <- dat[dat$median_house_value < 500001, ] # select variables to retain in the dataset dat <- dat[c('longitude', 'latitude', 'housing_median_age', 'households', 'median_income', 'median_house_value', 'ocean_proximity', 'rooms', 'bedrooms', 'pop')] # one-hot-enconde categorical variable ocean_proximity Z <- onehot_enc(dat[c('ocean_proximity')]) dat <- cbind(dat, as.data.frame(Z)) dat <- dat[,!(colnames(dat) %in% c('ocean_proximity'))] dat.all <- dat # separate observed variables X from the target Y X <- dat[,!(colnames(dat) %in% c('median_house_value'))] Y <- dat[,c('median_house_value')] names(Y) <- rownames(X) dat <- list(X = X, Y = Y) # split the dataset into train and test split <- splitdata(dat, 0.8) train <- split$entr test <- split$test r <- list( dat=dat.all, train=train, test=test ) return(r) } # create a categorical dataset for correspondence analysis of the training data dataset.housing.mca <- function(dat=NULL) { # loading dataset in memory if(is.null(dat)) { dat <- read.csv(file="data/housing.csv", header=TRUE) } # transform ocean_proximity into a factor dat$ocean_proximity <- as.factor(dat$ocean_proximity) levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="<1H OCEAN"] <- "O:<1H" levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="ISLAND"] <- "O:ISL" levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="INLAND"] <- "O:INL" levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="NEAR BAY"] <- "O:NB" levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="NEAR OCEAN"] <- "O:NO" # discretization of longitude cuts <- kcuts(x = dat$longitude, centers = 4) dat$c_longitude <- cut(x = dat$longitude, unique(cuts), include.lowest = TRUE) levels(dat$c_longitude) <- c('LO:W', 'LO:MW', 'LO:ME', 'LO:E') # discretization of latitude cuts <- kcuts(x = dat$latitude, centers = 4) dat$c_latitude <- cut(x = dat$latitude, unique(cuts), include.lowest = TRUE) levels(dat$c_latitude) <- c('LA:S','LA:MS','LA:MN','LA:N') # discretization of housing_median_age cuts <- c(min(dat$housing_median_age), 15, 25, 35, 51, 52) dat$c_age <- cut(x = dat$housing_median_age, unique(cuts), include.lowest = TRUE) levels(dat$c_age) <- c('AG:15]','AG:25]','AG:35]','AG:51]', 'AG:52') # creation and discretization of rooms dat$rooms <- dat$total_rooms / dat$households cuts <- c(min(dat$rooms), 4, 6, 8, max(dat$rooms)) dat$c_rooms <- cut(x = dat$rooms, unique(cuts), include.lowest = TRUE) levels(dat$c_rooms) <- c('RO:4]','RO:6]','RO:8]', 'RO:>8') # creation and discretization of bedrooms dat$bedrooms <- dat$total_bedrooms / dat$households cuts <- c(min(dat$bedrooms, na.rm = TRUE), 1.1, max(dat$bedrooms, na.rm = TRUE)) dat$c_bedrooms <- cut(x = dat$bedrooms, unique(cuts), include.lowest = TRUE) levels(dat$c_bedrooms) <- c('BE:1]','BE:>1') # creation and discretization of pop dat$pop <- dat$population / dat$households cuts <- c(min(dat$pop), 2, 3, 4, max(dat$pop)) dat$c_pop <- cut(x = dat$pop, unique(cuts), include.lowest = TRUE) levels(dat$c_pop) <- c('PO:2]','PO:3]', 'PO:4]', 'PO:>4') # discretization of households cuts <- c(min(dat$households), 300, 400, 600, max(dat$households)) dat$c_households <- cut(x = dat$households, cuts, include.lowest = TRUE) levels(dat$c_households) <- c('HH:3]', 'HH:4]', 'HH:6]', 'HH:>6') # discretization of median_income cuts <- quantile(dat$median_income, probs = seq(0,1,1/4)) cuts <- c(cuts[1:length(cuts)-1], 15, max(dat$median_income)) dat$c_income <- cut(x = dat$median_income, cuts, include.lowest = TRUE) levels(dat$c_income) <- c('IC:L', 'IC:ML', 'IC:MH', 'IC:H', 'IC:>15') # discretization of median_house_value cuts <- c(min(dat$median_house_value), 115000, 175000, 250000, max(dat$median_house_value)) dat$c_house_value <- cut(x = dat$median_house_value, cuts, include.lowest = TRUE) levels(dat$c_house_value) <- c('HV:A', 'HV:B', 'HV:C', 'HV:D') # discretized version of the entire dataset dat <- dat[c('ocean_proximity', 'c_longitude', 'c_latitude', 'c_age', 'c_rooms', 'c_bedrooms', 'c_pop', 'c_households', 'c_income', 'c_house_value')] vent <- list() # ventilation of modality RO:>8 of variable c_rooms vent$c_rooms <- ventilate(dat$c_rooms, "RO:>8") dat$c_rooms[vent$c_rooms$sup_i] <- vent$c_rooms$smpl # ventilation of modality IC:>15 of variable c_income vent$c_income <- ventilate(dat$c_income, "IC:>15") dat$c_income[vent$c_income$sup_i] <- vent$c_income$smpl # ventilation of missing values of variable c_bedrooms vent$c_bedrooms <- ventilate(dat$c_bedrooms, "NA") dat$c_bedrooms[vent$c_bedrooms$sup_i] <- vent$c_bedrooms$smpl # removal of empty modalities following the various ventilations dat <- droplevels(dat) # c_house_value, the target, is considered as a supplementary variable supvar <- which(names(dat) == "c_house_value") r <- list( vent=vent, dat=dat, supvar=supvar ) return(r) } print("load dataset") dat <- dataset.housing() print("Nystroem approx ridge regression") nakrm <- nakr(dat$train$X, dat$train$Y) nakrm.yh <- predict(nakrm, dat$test$X) nakrm.mae <- mean(abs(nakrm.yh - dat$test$Y)) print(paste("Test MAE for Nystroem: ", nakrm.mae)) # correspondence analysis of the training set rawdat <- read.csv(file="data/housing.csv", header=TRUE) train.rawdat <- rawdat[as.numeric(rownames(dat$train$X)),] dat.mca <- dataset.housing.mca(train.rawdat) cam <- mca(dat.mca) # average of the prediction errors by clusters nakrm.yh.train <- predict(nakrm, dat$train$X) nakrm.mae.train <- mean(abs(nakrm.yh.train - dat$train$Y)) nakrm.err.train <- abs(nakrm.yh.train - dat$train$Y) clst.err <- aggregate(nakrm.err.train, list(cam$clsti$cluster), FUN=mean) clst.err$Group.1 <- paste0("clst-", as.character(clst.err$Group.1)) clst.tbl <- clstcor.mca(cam) clst.err <- clst.err[match(rownames(clst.tbl),clst.err$Group.1),] clst.tbl <- cbind(clst.tbl, clst.err[,2]) colnames(clst.tbl)[ncol(clst.tbl)] <- "ERR" clst.tbl <- clst.tbl[order(-clst.tbl[,"ERR"]),] print("training set clusters obtained by correspondence analysis and ordered by amount of error") print(clst.tbl) # From these informations, we could try to understand why for some clusters the model commits greater errors... print("Linear ridge regression") rm <- ridge(dat$train$X, dat$train$Y) rm.yh <- predict(rm, dat$test$X) rm.mae <- mean(abs(rm.yh - dat$test$Y)) print(paste("Test MAE for Ridge: ", rm.mae)) print("Random forest") library(randomForest) rfm <- randomForest(dat$train$X, dat$train$Y) rfm.yh <- predict(rfm, dat$test$X) rfm.mae <- mean(abs(rfm.yh - dat$test$Y)) print(paste("Test MAE for Random Forest: ", rfm.mae))