experiment with associating mca clusters with nakr errors
This commit is contained in:
parent
5689bf6fef
commit
96e55f2c72
@ -57,12 +57,13 @@ function(cat, mod)
|
|||||||
# pre-processing for exploratory analytics of the housing dataset
|
# pre-processing for exploratory analytics of the housing dataset
|
||||||
|
|
||||||
datasetHousing.mca <-
|
datasetHousing.mca <-
|
||||||
function()
|
function(dat=NULL)
|
||||||
{
|
{
|
||||||
# chargement du jeu de données
|
# loading dataset in memory
|
||||||
dat <- read.csv(file="data/housing.csv", header=TRUE)
|
if(is.null(dat)) { dat <- read.csv(file="data/housing.csv", header=TRUE) }
|
||||||
|
|
||||||
# transformation de ocean_proximity en facteur
|
# transform ocean_proximity into a factor
|
||||||
|
# a factor being R representation of categorical variables
|
||||||
dat$ocean_proximity <- as.factor(dat$ocean_proximity)
|
dat$ocean_proximity <- as.factor(dat$ocean_proximity)
|
||||||
levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="<1H OCEAN"] <- "O:<1H"
|
levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="<1H OCEAN"] <- "O:<1H"
|
||||||
levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="ISLAND"] <- "O:ISL"
|
levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="ISLAND"] <- "O:ISL"
|
||||||
@ -70,81 +71,83 @@ function()
|
|||||||
levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="NEAR BAY"] <- "O:NB"
|
levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="NEAR BAY"] <- "O:NB"
|
||||||
levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="NEAR OCEAN"] <- "O:NO"
|
levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="NEAR OCEAN"] <- "O:NO"
|
||||||
|
|
||||||
# quantification de longitude
|
# discretization of longitude
|
||||||
cuts <- kcuts(x = dat$longitude, centers = 4)
|
cuts <- kcuts(x = dat$longitude, centers = 4)
|
||||||
dat$c_longitude <- cut(x = dat$longitude, unique(cuts), include.lowest = TRUE)
|
dat$c_longitude <- cut(x = dat$longitude, unique(cuts), include.lowest = TRUE)
|
||||||
levels(dat$c_longitude) <- c('LO:W', 'LO:MW', 'LO:ME', 'LO:E')
|
levels(dat$c_longitude) <- c('LO:W', 'LO:MW', 'LO:ME', 'LO:E')
|
||||||
|
|
||||||
# quantification de latitude
|
# discretization of latitude
|
||||||
cuts <- kcuts(x = dat$latitude, centers = 4)
|
cuts <- kcuts(x = dat$latitude, centers = 4)
|
||||||
dat$c_latitude <- cut(x = dat$latitude, unique(cuts), include.lowest = TRUE)
|
dat$c_latitude <- cut(x = dat$latitude, unique(cuts), include.lowest = TRUE)
|
||||||
levels(dat$c_latitude) <- c('LA:S','LA:MS','LA:MN','LA:N')
|
levels(dat$c_latitude) <- c('LA:S','LA:MS','LA:MN','LA:N')
|
||||||
|
|
||||||
# quantification de housing_median_age
|
# discretization of housing_median_age
|
||||||
cuts <- c(min(dat$housing_median_age), 15, 25, 35, 51, 52)
|
cuts <- c(min(dat$housing_median_age), 15, 25, 35, 51, 52)
|
||||||
dat$c_age <- cut(x = dat$housing_median_age, unique(cuts), include.lowest = TRUE)
|
dat$c_age <- cut(x = dat$housing_median_age, unique(cuts),
|
||||||
|
include.lowest = TRUE)
|
||||||
levels(dat$c_age) <- c('AG:15]','AG:25]','AG:35]','AG:51]', 'AG:52')
|
levels(dat$c_age) <- c('AG:15]','AG:25]','AG:35]','AG:51]', 'AG:52')
|
||||||
|
|
||||||
# création et quantification de rooms
|
# creation and discretization of rooms
|
||||||
dat$rooms <- dat$total_rooms / dat$households
|
dat$rooms <- dat$total_rooms / dat$households
|
||||||
cuts <- c(min(dat$rooms), 4, 6, 8, max(dat$rooms))
|
cuts <- c(min(dat$rooms), 4, 6, 8, max(dat$rooms))
|
||||||
dat$c_rooms <- cut(x = dat$rooms, unique(cuts), include.lowest = TRUE)
|
dat$c_rooms <- cut(x = dat$rooms, unique(cuts), include.lowest = TRUE)
|
||||||
levels(dat$c_rooms) <- c('RO:4]','RO:6]','RO:8]', 'RO:>8')
|
levels(dat$c_rooms) <- c('RO:4]','RO:6]','RO:8]', 'RO:>8')
|
||||||
|
|
||||||
# création et quantification de bedrooms
|
# creation and discretization of bedrooms
|
||||||
dat$bedrooms <- dat$total_bedrooms / dat$households
|
dat$bedrooms <- dat$total_bedrooms / dat$households
|
||||||
cuts <- c(min(dat$bedrooms, na.rm = TRUE), 1.1, max(dat$bedrooms, na.rm = TRUE))
|
cuts <- c(min(dat$bedrooms, na.rm = TRUE), 1.1, max(dat$bedrooms, na.rm = TRUE))
|
||||||
dat$c_bedrooms <- cut(x = dat$bedrooms, unique(cuts), include.lowest = TRUE)
|
dat$c_bedrooms <- cut(x = dat$bedrooms, unique(cuts), include.lowest = TRUE)
|
||||||
levels(dat$c_bedrooms) <- c('BE:1]','BE:>1')
|
levels(dat$c_bedrooms) <- c('BE:1]','BE:>1')
|
||||||
|
|
||||||
# création et quantification de pop
|
# creation and discretization of pop
|
||||||
dat$pop <- dat$population / dat$households
|
dat$pop <- dat$population / dat$households
|
||||||
cuts <- c(min(dat$pop), 2, 3, 4, max(dat$pop))
|
cuts <- c(min(dat$pop), 2, 3, 4, max(dat$pop))
|
||||||
dat$c_pop <- cut(x = dat$pop, unique(cuts), include.lowest = TRUE)
|
dat$c_pop <- cut(x = dat$pop, unique(cuts), include.lowest = TRUE)
|
||||||
levels(dat$c_pop) <- c('PO:2]','PO:3]', 'PO:4]', 'PO:>4')
|
levels(dat$c_pop) <- c('PO:2]','PO:3]', 'PO:4]', 'PO:>4')
|
||||||
|
|
||||||
# quantification de households
|
# discretization of households
|
||||||
cuts <- c(min(dat$households), 300, 400, 600, max(dat$households))
|
cuts <- c(min(dat$households), 300, 400, 600, max(dat$households))
|
||||||
dat$c_households <- cut(x = dat$households, cuts, include.lowest = TRUE)
|
dat$c_households <- cut(x = dat$households, cuts, include.lowest = TRUE)
|
||||||
levels(dat$c_households) <- c('HH:3]', 'HH:4]', 'HH:6]', 'HH:>6')
|
levels(dat$c_households) <- c('HH:3]', 'HH:4]', 'HH:6]', 'HH:>6')
|
||||||
|
|
||||||
# quantification de median_income
|
# discretization of median_income
|
||||||
cuts <- quantile(dat$median_income, probs = seq(0,1,1/4))
|
cuts <- quantile(dat$median_income, probs = seq(0,1,1/4))
|
||||||
cuts <- c(cuts[1:length(cuts)-1], 15, max(dat$median_income))
|
cuts <- c(cuts[1:length(cuts)-1], 15, max(dat$median_income))
|
||||||
dat$c_income <- cut(x = dat$median_income, cuts, include.lowest = TRUE)
|
dat$c_income <- cut(x = dat$median_income, cuts, include.lowest = TRUE)
|
||||||
levels(dat$c_income) <- c('IC:L', 'IC:ML', 'IC:MH', 'IC:H', 'IC:>15')
|
levels(dat$c_income) <- c('IC:L', 'IC:ML', 'IC:MH', 'IC:H', 'IC:>15')
|
||||||
|
|
||||||
# quantification de median_house_value
|
# discretization of median_house_value
|
||||||
cuts <- c(min(dat$median_house_value), 115000, 175000, 250000, 500000,
|
cuts <- c(min(dat$median_house_value), 115000, 175000, 250000, 500000,
|
||||||
max(dat$median_house_value))
|
max(dat$median_house_value))
|
||||||
dat$c_house_value <- cut(x = dat$median_house_value, cuts, include.lowest = TRUE)
|
dat$c_house_value <- cut(x = dat$median_house_value, cuts,
|
||||||
|
include.lowest = TRUE)
|
||||||
levels(dat$c_house_value) <- c('HV:A', 'HV:B', 'HV:C', 'HV:D', 'HV:E')
|
levels(dat$c_house_value) <- c('HV:A', 'HV:B', 'HV:C', 'HV:D', 'HV:E')
|
||||||
|
|
||||||
# création du jeu de données quantifié
|
# discretized version of the entire dataset
|
||||||
dat <- dat[c('ocean_proximity', 'c_longitude', 'c_latitude', 'c_age',
|
dat <- dat[c('ocean_proximity', 'c_longitude', 'c_latitude', 'c_age',
|
||||||
'c_rooms', 'c_bedrooms', 'c_pop', 'c_households', 'c_income',
|
'c_rooms', 'c_bedrooms', 'c_pop', 'c_households', 'c_income',
|
||||||
'c_house_value')]
|
'c_house_value')]
|
||||||
vent <- list()
|
vent <- list()
|
||||||
# ventilation de la modalité RO:>8 de c_rooms
|
# ventilation of modality RO:>8 of variable c_rooms
|
||||||
vent$c_rooms <- ventilate(dat$c_rooms, "RO:>8")
|
vent$c_rooms <- ventilate(dat$c_rooms, "RO:>8")
|
||||||
dat$c_rooms[vent$c_rooms$sup_i] <- vent$c_rooms$smpl
|
dat$c_rooms[vent$c_rooms$sup_i] <- vent$c_rooms$smpl
|
||||||
|
|
||||||
# ventilation de la modalité IC:>15 de c_income
|
# ventilation of modality IC:>15 of variable c_income
|
||||||
vent$c_income <- ventilate(dat$c_income, "IC:>15")
|
vent$c_income <- ventilate(dat$c_income, "IC:>15")
|
||||||
dat$c_income[vent$c_income$sup_i] <- vent$c_income$smpl
|
dat$c_income[vent$c_income$sup_i] <- vent$c_income$smpl
|
||||||
|
|
||||||
# ventilation de la modalité O:ISL de ocean_proximity
|
# ventilation of modality O:ISL of variable ocean_proximity
|
||||||
vent$ocean_proximity <- ventilate(dat$ocean_proximity, "O:ISL")
|
vent$ocean_proximity <- ventilate(dat$ocean_proximity, "O:ISL")
|
||||||
dat$ocean_proximity[vent$ocean_proximity$sup_i] <- vent$ocean_proximity$smpl
|
dat$ocean_proximity[vent$ocean_proximity$sup_i] <- vent$ocean_proximity$smpl
|
||||||
|
|
||||||
# ventilation des valeurs manquantes de c_bedrooms
|
# ventilation of missing values of variable c_bedrooms
|
||||||
vent$c_bedrooms <- ventilate(dat$c_bedrooms, "NA")
|
vent$c_bedrooms <- ventilate(dat$c_bedrooms, "NA")
|
||||||
dat$c_bedrooms[vent$c_bedrooms$sup_i] <- vent$c_bedrooms$smpl
|
dat$c_bedrooms[vent$c_bedrooms$sup_i] <- vent$c_bedrooms$smpl
|
||||||
|
|
||||||
# suppression des modalités vides après ventilation
|
# removal of empty modalities following the various ventilations
|
||||||
dat <- droplevels(dat)
|
dat <- droplevels(dat)
|
||||||
|
|
||||||
# positionnement de c_house_value en variable supplémentaire
|
# c_house_value, the target, is considered as a supplementary variable
|
||||||
supvar <- which(names(dat) == "c_house_value")
|
supvar <- which(names(dat) == "c_house_value")
|
||||||
|
|
||||||
r <- list( vent=vent, dat=dat, supvar=supvar )
|
r <- list( vent=vent, dat=dat, supvar=supvar )
|
||||||
@ -152,8 +155,7 @@ function()
|
|||||||
return(r)
|
return(r)
|
||||||
}
|
}
|
||||||
|
|
||||||
# one hot encoding (codage disjonctif complet) of a dataframe
|
# one hot encoding of a dataframe made of categorical variables
|
||||||
# made of categorical variables
|
|
||||||
onehot_enc <-
|
onehot_enc <-
|
||||||
function(dat.cat)
|
function(dat.cat)
|
||||||
{
|
{
|
||||||
@ -250,9 +252,21 @@ function(dat, nclst = 100)
|
|||||||
# Principal coordinates of clusters' centers
|
# Principal coordinates of clusters' centers
|
||||||
fclst <- clsti$centers
|
fclst <- clsti$centers
|
||||||
|
|
||||||
r <- list(f=f, ctr=ctr, cor=cor, r=r, sv=sv,
|
r <- list(f=f, # principal coordinates
|
||||||
fsi=fsi, sicor=sicor, fsj=fsj, sjcor=sjcor,
|
ctr=ctr, # contributions of the modalities
|
||||||
clsti=clsti)
|
# to the principal axes
|
||||||
|
cor=cor, # correlations of the modalities
|
||||||
|
# with the principal axes
|
||||||
|
r=r, # marginal profile of the modalities
|
||||||
|
sv=sv, # K (=J-Q) singular values
|
||||||
|
fsi=fsi, # principal coordinates of the individuals
|
||||||
|
sicor=sicor, # correlations of the individuals
|
||||||
|
# with the principal axes
|
||||||
|
fsj=fsj, # principal coordinates of the modalities
|
||||||
|
# of the supplementary variables
|
||||||
|
sjcor=sjcor, # correlations of the supplementary variables
|
||||||
|
# with the principal axes
|
||||||
|
clsti=clsti) # kmeans clustering of the individuals
|
||||||
class(r) <- "mca"
|
class(r) <- "mca"
|
||||||
return(r)
|
return(r)
|
||||||
}
|
}
|
||||||
@ -268,7 +282,7 @@ function(o)
|
|||||||
|
|
||||||
nclst <- length(o$clsti$size)
|
nclst <- length(o$clsti$size)
|
||||||
|
|
||||||
# Correlation of clusters and factorial axes
|
# Correlation of clusters' centers and factorial axes
|
||||||
temp <- o$clsti$centers^2
|
temp <- o$clsti$centers^2
|
||||||
sum_cor <- apply(temp, 1, sum)
|
sum_cor <- apply(temp, 1, sum)
|
||||||
clstcor <- sweep(temp, 1, sum_cor, FUN="/")
|
clstcor <- sweep(temp, 1, sum_cor, FUN="/")
|
||||||
@ -285,7 +299,8 @@ function(o)
|
|||||||
|
|
||||||
tblClstCor <- t(sapply(1:nclst, selMostCorFact))
|
tblClstCor <- t(sapply(1:nclst, selMostCorFact))
|
||||||
tblClstCor <- cbind(tblClstCor, o$clsti$size)
|
tblClstCor <- cbind(tblClstCor, o$clsti$size)
|
||||||
rwithinss <- o$clsti$withinss / o$clsti$size # withinss relative to the cluster size
|
rwithinss <- o$clsti$withinss / o$clsti$size # within sum of squares
|
||||||
|
# relative to cluster size
|
||||||
clstqlty <- round_preserve_sum(1000 * rwithinss / sum(rwithinss))
|
clstqlty <- round_preserve_sum(1000 * rwithinss / sum(rwithinss))
|
||||||
tblClstCor <- cbind(tblClstCor, clstqlty)
|
tblClstCor <- cbind(tblClstCor, clstqlty)
|
||||||
rownames(tblClstCor) <- paste0('clst-', 1:nclst)
|
rownames(tblClstCor) <- paste0('clst-', 1:nclst)
|
||||||
@ -304,10 +319,10 @@ function(o, d1 = NULL, d2 = NULL)
|
|||||||
if(is.null(d1)) d1<-1
|
if(is.null(d1)) d1<-1
|
||||||
if(is.null(d2)) d2<-2
|
if(is.null(d2)) d2<-2
|
||||||
|
|
||||||
# Part de l'inertie du plan factoriel d1-d2 expliquée par chaque profil
|
# proportion of inertia of factorial plan d1-d2 explained by each profile
|
||||||
cont <- o$r * (o$f[,d1]^2 + o$f[,d2]^2) / (o$sv[d1]^2 + o$sv[d2]^2)
|
cont <- o$r * (o$f[,d1]^2 + o$f[,d2]^2) / (o$sv[d1]^2 + o$sv[d2]^2)
|
||||||
names <- rownames(o$f)
|
names <- rownames(o$f)
|
||||||
names[cont < 0.01] <- "."
|
names[cont < 0.05] <- "."
|
||||||
optimPar <- nonlinearFontSize.mca(cont)
|
optimPar <- nonlinearFontSize.mca(cont)
|
||||||
sizes <- log(1 + exp(optimPar[1]) * cont^optimPar[2])
|
sizes <- log(1 + exp(optimPar[1]) * cont^optimPar[2])
|
||||||
sizes[cont < 0.01] <- 1
|
sizes[cont < 0.01] <- 1
|
||||||
@ -330,7 +345,8 @@ function(o, d1 = NULL, d2 = NULL)
|
|||||||
|
|
||||||
plot(o$f[,d1], o$f[,d2], type = "n",
|
plot(o$f[,d1], o$f[,d2], type = "n",
|
||||||
xlab="", ylab="", asp = 1, xaxt = "n", yaxt = "n")
|
xlab="", ylab="", asp = 1, xaxt = "n", yaxt = "n")
|
||||||
text(o$f[,d1], o$f[,d2], ns$names, adj = 0, cex = ns$sizes, col = 'blue', font = 2)
|
text(o$f[,d1], o$f[,d2], ns$names, adj = 0, cex = ns$sizes,
|
||||||
|
col = 'blue', font = 2)
|
||||||
points(0, 0, pch = 3)
|
points(0, 0, pch = 3)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -366,7 +382,7 @@ function(o, d1 = NULL, d2 = NULL)
|
|||||||
if(is.null(d2)) d2<-2
|
if(is.null(d2)) d2<-2
|
||||||
|
|
||||||
nsm <- textSize.mca(o, d1, d2) # names and sizes for modalities
|
nsm <- textSize.mca(o, d1, d2) # names and sizes for modalities
|
||||||
nsi <- textSizeClst.mca(o, d1, d2) # names and sizes for clusters of individuals
|
nsi <- textSizeClst.mca(o, d1, d2) # names and sizes for clust of individuals
|
||||||
|
|
||||||
plot(c(o$f[,d1], o$fsj[,d1], o$clsti$centers[,d1]),
|
plot(c(o$f[,d1], o$fsj[,d1], o$clsti$centers[,d1]),
|
||||||
c(o$f[,d2], o$fsj[,d2], o$clsti$centers[,d2]),
|
c(o$f[,d2], o$fsj[,d2], o$clsti$centers[,d2]),
|
||||||
@ -396,10 +412,10 @@ function(o, d1 = NULL, d2 = NULL)
|
|||||||
clstcor <- sweep(temp, 1, sum_cor, FUN="/")
|
clstcor <- sweep(temp, 1, sum_cor, FUN="/")
|
||||||
clstcor <- rowSums(clstcor[,c(d1,d2)])
|
clstcor <- rowSums(clstcor[,c(d1,d2)])
|
||||||
names <- names(clstcor)
|
names <- names(clstcor)
|
||||||
names[clstcor < 0.01] <- "x"
|
names[clstcor < 0.05] <- "x"
|
||||||
optimPar <- nonlinearFontSize.mca(clstcor)
|
optimPar <- nonlinearFontSize.mca(clstcor)
|
||||||
sizes <- log(1 + exp(optimPar[1]) * clstcor^optimPar[2])
|
sizes <- log(1 + exp(optimPar[1]) * clstcor^optimPar[2])
|
||||||
sizes[clstcor < 0.01] <- 1
|
sizes[clstcor < 0.05] <- 1
|
||||||
r <- list(names=names, sizes=sizes)
|
r <- list(names=names, sizes=sizes)
|
||||||
return(r)
|
return(r)
|
||||||
}
|
}
|
||||||
|
@ -1,11 +1,16 @@
|
|||||||
|
source("04_validation_croisee_code.R")
|
||||||
|
source("05_d_svd_mca_code.R")
|
||||||
source("15_loocv_code.R")
|
source("15_loocv_code.R")
|
||||||
source("19_nystroem_approximation_code.R")
|
source("19_nystroem_approximation_code.R")
|
||||||
|
|
||||||
dataset.housing <-
|
dataset.housing <-
|
||||||
function()
|
function()
|
||||||
{
|
{
|
||||||
|
set.seed(1123)
|
||||||
|
# loading dataset in memory
|
||||||
dat <- read.csv(file="data/housing.csv", header=TRUE)
|
dat <- read.csv(file="data/housing.csv", header=TRUE)
|
||||||
|
|
||||||
|
# transform ocean_proximity into a factor
|
||||||
dat$ocean_proximity <- as.factor(dat$ocean_proximity)
|
dat$ocean_proximity <- as.factor(dat$ocean_proximity)
|
||||||
levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="<1H OCEAN"] <- "O:<1H"
|
levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="<1H OCEAN"] <- "O:<1H"
|
||||||
levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="ISLAND"] <- "O:ISL"
|
levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="ISLAND"] <- "O:ISL"
|
||||||
@ -13,33 +18,144 @@ function()
|
|||||||
levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="NEAR BAY"] <- "O:NB"
|
levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="NEAR BAY"] <- "O:NB"
|
||||||
levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="NEAR OCEAN"] <- "O:NO"
|
levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="NEAR OCEAN"] <- "O:NO"
|
||||||
|
|
||||||
|
# replace total_bedrooms missing values with the median value
|
||||||
dat$total_bedrooms[is.na(dat$total_bedrooms)] <- median(dat$total_bedrooms, na.rm=TRUE)
|
dat$total_bedrooms[is.na(dat$total_bedrooms)] <- median(dat$total_bedrooms, na.rm=TRUE)
|
||||||
|
|
||||||
|
# remove individuals corresponding to the Island modality of ocean_proximity
|
||||||
dat <- dat[dat$ocean_proximity != "O:ISL", ]
|
dat <- dat[dat$ocean_proximity != "O:ISL", ]
|
||||||
# suppression des modalités vides (ici "O:ISL")
|
|
||||||
|
# remove empty modalities (here, only "O:ISL")
|
||||||
dat <- droplevels(dat)
|
dat <- droplevels(dat)
|
||||||
|
|
||||||
|
# introduce new variable for number of rooms by households
|
||||||
dat['rooms'] <- dat['total_rooms'] / dat['households']
|
dat['rooms'] <- dat['total_rooms'] / dat['households']
|
||||||
|
# introduce new variable for number of bedrooms by households
|
||||||
dat['bedrooms'] <- dat['total_bedrooms'] / dat['households']
|
dat['bedrooms'] <- dat['total_bedrooms'] / dat['households']
|
||||||
|
# introduce new variable for number of people by households
|
||||||
dat['pop'] <- dat['population'] / dat['households']
|
dat['pop'] <- dat['population'] / dat['households']
|
||||||
|
|
||||||
|
# remove individuals with extremely high values of the target
|
||||||
dat <- dat[dat$median_house_value < 500001, ]
|
dat <- dat[dat$median_house_value < 500001, ]
|
||||||
|
|
||||||
|
# select variables to retain in the dataset
|
||||||
dat <- dat[c('longitude', 'latitude', 'housing_median_age', 'households',
|
dat <- dat[c('longitude', 'latitude', 'housing_median_age', 'households',
|
||||||
'median_income', 'median_house_value', 'ocean_proximity',
|
'median_income', 'median_house_value', 'ocean_proximity',
|
||||||
'rooms', 'bedrooms', 'pop')]
|
'rooms', 'bedrooms', 'pop')]
|
||||||
|
|
||||||
|
# one-hot-enconde categorical variable ocean_proximity
|
||||||
Z <- onehot_enc(dat[c('ocean_proximity')])
|
Z <- onehot_enc(dat[c('ocean_proximity')])
|
||||||
dat <- cbind(dat, as.data.frame(Z))
|
dat <- cbind(dat, as.data.frame(Z))
|
||||||
dat <- dat[,!(colnames(dat) %in% c('ocean_proximity'))]
|
dat <- dat[,!(colnames(dat) %in% c('ocean_proximity'))]
|
||||||
|
|
||||||
dat.all <- dat
|
dat.all <- dat
|
||||||
|
|
||||||
|
# separate observed variables X from the target Y
|
||||||
X <- dat[,!(colnames(dat) %in% c('median_house_value'))]
|
X <- dat[,!(colnames(dat) %in% c('median_house_value'))]
|
||||||
Y <- dat[,c('median_house_value')]
|
Y <- dat[,c('median_house_value')]
|
||||||
names(Y) <- rownames(X)
|
names(Y) <- rownames(X)
|
||||||
dat <- list(X = X, Y = Y)
|
dat <- list(X = X, Y = Y)
|
||||||
|
|
||||||
|
# split the dataset into train and test
|
||||||
split <- splitdata(dat, 0.8)
|
split <- splitdata(dat, 0.8)
|
||||||
entr <- split$entr
|
train <- split$entr
|
||||||
test <- split$test
|
test <- split$test
|
||||||
|
|
||||||
r <- list( dat=dat.all, entr=entr, test=test )
|
r <- list( dat=dat.all, train=train, test=test )
|
||||||
|
return(r)
|
||||||
|
}
|
||||||
|
|
||||||
|
# create a categorical dataset for correspondence analysis of the training data
|
||||||
|
dataset.housing.mca <-
|
||||||
|
function(dat=NULL)
|
||||||
|
{
|
||||||
|
# loading dataset in memory
|
||||||
|
if(is.null(dat)) { dat <- read.csv(file="data/housing.csv", header=TRUE) }
|
||||||
|
|
||||||
|
# transform ocean_proximity into a factor
|
||||||
|
dat$ocean_proximity <- as.factor(dat$ocean_proximity)
|
||||||
|
levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="<1H OCEAN"] <- "O:<1H"
|
||||||
|
levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="ISLAND"] <- "O:ISL"
|
||||||
|
levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="INLAND"] <- "O:INL"
|
||||||
|
levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="NEAR BAY"] <- "O:NB"
|
||||||
|
levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="NEAR OCEAN"] <- "O:NO"
|
||||||
|
|
||||||
|
# discretization of longitude
|
||||||
|
cuts <- kcuts(x = dat$longitude, centers = 4)
|
||||||
|
dat$c_longitude <- cut(x = dat$longitude, unique(cuts), include.lowest = TRUE)
|
||||||
|
levels(dat$c_longitude) <- c('LO:W', 'LO:MW', 'LO:ME', 'LO:E')
|
||||||
|
|
||||||
|
# discretization of latitude
|
||||||
|
cuts <- kcuts(x = dat$latitude, centers = 4)
|
||||||
|
dat$c_latitude <- cut(x = dat$latitude, unique(cuts), include.lowest = TRUE)
|
||||||
|
levels(dat$c_latitude) <- c('LA:S','LA:MS','LA:MN','LA:N')
|
||||||
|
|
||||||
|
# discretization of housing_median_age
|
||||||
|
cuts <- c(min(dat$housing_median_age), 15, 25, 35, 51, 52)
|
||||||
|
dat$c_age <- cut(x = dat$housing_median_age, unique(cuts),
|
||||||
|
include.lowest = TRUE)
|
||||||
|
levels(dat$c_age) <- c('AG:15]','AG:25]','AG:35]','AG:51]', 'AG:52')
|
||||||
|
|
||||||
|
# creation and discretization of rooms
|
||||||
|
dat$rooms <- dat$total_rooms / dat$households
|
||||||
|
cuts <- c(min(dat$rooms), 4, 6, 8, max(dat$rooms))
|
||||||
|
dat$c_rooms <- cut(x = dat$rooms, unique(cuts), include.lowest = TRUE)
|
||||||
|
levels(dat$c_rooms) <- c('RO:4]','RO:6]','RO:8]', 'RO:>8')
|
||||||
|
|
||||||
|
# creation and discretization of bedrooms
|
||||||
|
dat$bedrooms <- dat$total_bedrooms / dat$households
|
||||||
|
cuts <- c(min(dat$bedrooms, na.rm = TRUE), 1.1, max(dat$bedrooms, na.rm = TRUE))
|
||||||
|
dat$c_bedrooms <- cut(x = dat$bedrooms, unique(cuts), include.lowest = TRUE)
|
||||||
|
levels(dat$c_bedrooms) <- c('BE:1]','BE:>1')
|
||||||
|
|
||||||
|
# creation and discretization of pop
|
||||||
|
dat$pop <- dat$population / dat$households
|
||||||
|
cuts <- c(min(dat$pop), 2, 3, 4, max(dat$pop))
|
||||||
|
dat$c_pop <- cut(x = dat$pop, unique(cuts), include.lowest = TRUE)
|
||||||
|
levels(dat$c_pop) <- c('PO:2]','PO:3]', 'PO:4]', 'PO:>4')
|
||||||
|
|
||||||
|
# discretization of households
|
||||||
|
cuts <- c(min(dat$households), 300, 400, 600, max(dat$households))
|
||||||
|
dat$c_households <- cut(x = dat$households, cuts, include.lowest = TRUE)
|
||||||
|
levels(dat$c_households) <- c('HH:3]', 'HH:4]', 'HH:6]', 'HH:>6')
|
||||||
|
|
||||||
|
# discretization of median_income
|
||||||
|
cuts <- quantile(dat$median_income, probs = seq(0,1,1/4))
|
||||||
|
cuts <- c(cuts[1:length(cuts)-1], 15, max(dat$median_income))
|
||||||
|
dat$c_income <- cut(x = dat$median_income, cuts, include.lowest = TRUE)
|
||||||
|
levels(dat$c_income) <- c('IC:L', 'IC:ML', 'IC:MH', 'IC:H', 'IC:>15')
|
||||||
|
|
||||||
|
# discretization of median_house_value
|
||||||
|
cuts <- c(min(dat$median_house_value), 115000, 175000, 250000,
|
||||||
|
max(dat$median_house_value))
|
||||||
|
dat$c_house_value <- cut(x = dat$median_house_value, cuts,
|
||||||
|
include.lowest = TRUE)
|
||||||
|
levels(dat$c_house_value) <- c('HV:A', 'HV:B', 'HV:C', 'HV:D')
|
||||||
|
|
||||||
|
# discretized version of the entire dataset
|
||||||
|
dat <- dat[c('ocean_proximity', 'c_longitude', 'c_latitude', 'c_age',
|
||||||
|
'c_rooms', 'c_bedrooms', 'c_pop', 'c_households', 'c_income',
|
||||||
|
'c_house_value')]
|
||||||
|
vent <- list()
|
||||||
|
# ventilation of modality RO:>8 of variable c_rooms
|
||||||
|
vent$c_rooms <- ventilate(dat$c_rooms, "RO:>8")
|
||||||
|
dat$c_rooms[vent$c_rooms$sup_i] <- vent$c_rooms$smpl
|
||||||
|
|
||||||
|
# ventilation of modality IC:>15 of variable c_income
|
||||||
|
vent$c_income <- ventilate(dat$c_income, "IC:>15")
|
||||||
|
dat$c_income[vent$c_income$sup_i] <- vent$c_income$smpl
|
||||||
|
|
||||||
|
# ventilation of missing values of variable c_bedrooms
|
||||||
|
vent$c_bedrooms <- ventilate(dat$c_bedrooms, "NA")
|
||||||
|
dat$c_bedrooms[vent$c_bedrooms$sup_i] <- vent$c_bedrooms$smpl
|
||||||
|
|
||||||
|
# removal of empty modalities following the various ventilations
|
||||||
|
dat <- droplevels(dat)
|
||||||
|
|
||||||
|
# c_house_value, the target, is considered as a supplementary variable
|
||||||
|
supvar <- which(names(dat) == "c_house_value")
|
||||||
|
|
||||||
|
r <- list( vent=vent, dat=dat, supvar=supvar )
|
||||||
|
|
||||||
return(r)
|
return(r)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -47,20 +163,42 @@ print("load dataset")
|
|||||||
dat <- dataset.housing()
|
dat <- dataset.housing()
|
||||||
|
|
||||||
print("Nystroem approx ridge regression")
|
print("Nystroem approx ridge regression")
|
||||||
nakrm <- nakr(dat$entr$X, dat$entr$Y)
|
nakrm <- nakr(dat$train$X, dat$train$Y)
|
||||||
nakrm.yh <- predict(nakrm, dat$test$X)
|
nakrm.yh <- predict(nakrm, dat$test$X)
|
||||||
nakrm.mae <- mean(abs(nakrm.yh - dat$test$Y))
|
nakrm.mae <- mean(abs(nakrm.yh - dat$test$Y))
|
||||||
print(paste("MAE for Nystroem: ", nakrm.mae))
|
print(paste("Test MAE for Nystroem: ", nakrm.mae))
|
||||||
|
|
||||||
|
# correspondence analysis of the training set
|
||||||
|
rawdat <- read.csv(file="data/housing.csv", header=TRUE)
|
||||||
|
train.rawdat <- rawdat[as.numeric(rownames(dat$train$X)),]
|
||||||
|
dat.mca <- dataset.housing.mca(train.rawdat)
|
||||||
|
cam <- mca(dat.mca)
|
||||||
|
|
||||||
|
# average of the prediction errors by clusters
|
||||||
|
nakrm.yh.train <- predict(nakrm, dat$train$X)
|
||||||
|
nakrm.mae.train <- mean(abs(nakrm.yh.train - dat$train$Y))
|
||||||
|
nakrm.err.train <- abs(nakrm.yh.train - dat$train$Y)
|
||||||
|
clst.err <- aggregate(nakrm.err.train, list(cam$clsti$cluster), FUN=mean)
|
||||||
|
clst.err$Group.1 <- paste0("clst-", as.character(clst.err$Group.1))
|
||||||
|
clst.tbl <- clstcor.mca(cam)
|
||||||
|
clst.err <- clst.err[match(rownames(clst.tbl),clst.err$Group.1),]
|
||||||
|
clst.tbl <- cbind(clst.tbl, clst.err[,2])
|
||||||
|
colnames(clst.tbl)[ncol(clst.tbl)] <- "ERR"
|
||||||
|
clst.tbl <- clst.tbl[order(-clst.tbl[,"ERR"]),]
|
||||||
|
print("training set clusters obtained by correspondence analysis and ordered by amount of error")
|
||||||
|
print(clst.tbl)
|
||||||
|
# From these informations, we could try to understand why for some clusters the model commits greater errors...
|
||||||
|
|
||||||
print("Linear ridge regression")
|
print("Linear ridge regression")
|
||||||
rm <- ridge(dat$entr$X, dat$entr$Y)
|
rm <- ridge(dat$train$X, dat$train$Y)
|
||||||
rm.yh <- predict(rm, dat$test$X)
|
rm.yh <- predict(rm, dat$test$X)
|
||||||
rm.mae <- mean(abs(rm.yh - dat$test$Y))
|
rm.mae <- mean(abs(rm.yh - dat$test$Y))
|
||||||
print(paste("MAE for Ridge: ", rm.mae))
|
print(paste("Test MAE for Ridge: ", rm.mae))
|
||||||
|
|
||||||
|
|
||||||
print("Random forest")
|
print("Random forest")
|
||||||
library(randomForest)
|
library(randomForest)
|
||||||
rfm <- randomForest(dat$entr$X, dat$entr$Y)
|
rfm <- randomForest(dat$train$X, dat$train$Y)
|
||||||
rfm.yh <- predict(rfm, dat$test$X)
|
rfm.yh <- predict(rfm, dat$test$X)
|
||||||
rfm.mae <- mean(abs(rfm.yh - dat$test$Y))
|
rfm.mae <- mean(abs(rfm.yh - dat$test$Y))
|
||||||
print(paste("MAE for Random Forest: ", rfm.mae))
|
print(paste("Test MAE for Random Forest: ", rfm.mae))
|
@ -4,7 +4,7 @@ author: "Pierre-Edouard Portier"
|
|||||||
documentclass: book
|
documentclass: book
|
||||||
geometry: margin=2cm
|
geometry: margin=2cm
|
||||||
fontsize: 12pt
|
fontsize: 12pt
|
||||||
date: "19 Mar 2023"
|
date: "27 Mar 2023"
|
||||||
toc: true
|
toc: true
|
||||||
classoption: fleqn
|
classoption: fleqn
|
||||||
bibliography: intro_to_ml.bib
|
bibliography: intro_to_ml.bib
|
||||||
|
Loading…
Reference in New Issue
Block a user