111 lines
3.5 KiB
R
111 lines
3.5 KiB
R
source("05_d_svd_mca_code.R")
|
|
source("04_validation_croisee_code.R")
|
|
source("19_nystroem_approximation_code.R")
|
|
|
|
datasetHousing.nakr <-
|
|
function()
|
|
{
|
|
dat <- read.csv(file="data/housing.csv", header=TRUE)
|
|
|
|
dat$ocean_proximity <- as.factor(dat$ocean_proximity)
|
|
levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="<1H OCEAN"] <- "O:<1H"
|
|
levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="ISLAND"] <- "O:ISL"
|
|
levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="INLAND"] <- "O:INL"
|
|
levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="NEAR BAY"] <- "O:NB"
|
|
levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="NEAR OCEAN"] <- "O:NO"
|
|
|
|
dat$total_bedrooms[is.na(dat$total_bedrooms)] <- median(dat$total_bedrooms, na.rm=TRUE)
|
|
dat <- dat[dat$ocean_proximity != "O:ISL", ]
|
|
# suppression des modalités vides (ici "O:ISL")
|
|
dat <- droplevels(dat)
|
|
dat['rooms'] <- dat['total_rooms'] / dat['households']
|
|
dat['bedrooms'] <- dat['total_bedrooms'] / dat['households']
|
|
dat['pop'] <- dat['population'] / dat['households']
|
|
dat <- dat[dat$median_house_value < 500001, ]
|
|
|
|
dat <- dat[c('longitude', 'latitude', 'housing_median_age', 'households',
|
|
'median_income', 'median_house_value', 'ocean_proximity',
|
|
'rooms', 'bedrooms', 'pop')]
|
|
|
|
Z <- onehot_enc(dat[c('ocean_proximity')])
|
|
dat <- cbind(dat, as.data.frame(Z))
|
|
dat <- dat[,!(colnames(dat) %in% c('ocean_proximity'))]
|
|
|
|
dat.all <- dat
|
|
X <- dat[,!(colnames(dat) %in% c('median_house_value'))]
|
|
Y <- dat[,c('median_house_value')]
|
|
names(Y) <- rownames(X)
|
|
dat <- list(X = X, Y = Y)
|
|
split <- splitdata(dat, 0.8)
|
|
entr <- split$entr
|
|
test <- split$test
|
|
|
|
r <- list( dat=dat.all, entr=entr, test=test )
|
|
return(r)
|
|
}
|
|
|
|
# test.tbl <- table(c(4,2,4,2,1,1,1))
|
|
# all(intersperse(test.tbl) == c(1, 2, 4, 1, 2, 4, 1))
|
|
intersperse <-
|
|
function(tbl)
|
|
{
|
|
n <- sum(tbl)
|
|
values <- as.numeric(names(tbl))
|
|
r <- numeric(n)
|
|
i <- 1
|
|
while(i <= n)
|
|
{
|
|
for(j in 1:length(tbl))
|
|
{
|
|
if(tbl[j] != 0)
|
|
{
|
|
r[i] <- values[j]
|
|
i <- i + 1
|
|
tbl[j] <- tbl[j] - 1
|
|
}
|
|
}
|
|
}
|
|
return(r)
|
|
}
|
|
|
|
# sample the landmarks from the clusters of individuals
|
|
# obtained after correspondence analysis
|
|
landmarks.by.ca.clst <-
|
|
function(cam, X, nbLandmarks)
|
|
{
|
|
if(nbLandmarks > nrow(X))
|
|
{
|
|
stop("The number of landmarks must be less than the number of training samples.")
|
|
}
|
|
landmarks <- numeric(nbLandmarks)
|
|
clst <- cam$clsti$cluster[as.numeric(rownames(X))]
|
|
clst.tbl <- table(clst)
|
|
nb.by.clst <- table((intersperse(clst.tbl))[1:nbLandmarks])
|
|
clst.id <- as.numeric(names(nb.by.clst))
|
|
set.seed(1123)
|
|
clst <- sample(clst)
|
|
offset <- 0
|
|
for (i in 1:length(nb.by.clst))
|
|
{
|
|
k <- as.numeric(nb.by.clst[i])
|
|
landmarks[(offset+1):(offset+k)] <- as.numeric(names((clst[clst==clst.id[i]])[1:k]))
|
|
offset <- offset+k
|
|
}
|
|
return(landmarks)
|
|
}
|
|
|
|
hous.dat.nakr <- datasetHousing.nakr()
|
|
X.entr <- hous.dat.nakr$entr$X
|
|
Y.entr <- hous.dat.nakr$entr$Y
|
|
X.test <- hous.dat.nakr$test$X
|
|
Y.test <- hous.dat.nakr$test$Y
|
|
# hous.dat.ca <- datasetHousing.mca()
|
|
# hous.cam <- mca(hous.dat.ca)
|
|
# nb.landmarks <- round(sqrt(nrow(X.entr)))
|
|
# landmarks <- landmarks.by.ca.clst(hous.cam, X.entr, nb.landmarks)
|
|
# nakrm <- kfold.nakr(X.entr, Y.entr, landmarks=landmarks)
|
|
# nakrm.yh <- predict(nakrm, X.test)
|
|
# nakrm.mae <- mean(abs(nakrm.yh - Y.test))
|
|
# nakrm.yh.train <- predict(nakrm, X.entr)
|
|
# rev(order(abs(nakrm.yh.train - Y.entr)))[1:20]
|
|
# hist(Y.entr[rev(order(abs(nakrm.yh.train - Y.entr)))[1:200]]) |