source("05_d_svd_mca_code.R") source("04_validation_croisee_code.R") source("19_nystroem_approximation_code.R") datasetHousing.nakr <- function() { dat <- read.csv(file="data/housing.csv", header=TRUE) dat$ocean_proximity <- as.factor(dat$ocean_proximity) levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="<1H OCEAN"] <- "O:<1H" levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="ISLAND"] <- "O:ISL" levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="INLAND"] <- "O:INL" levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="NEAR BAY"] <- "O:NB" levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="NEAR OCEAN"] <- "O:NO" dat$total_bedrooms[is.na(dat$total_bedrooms)] <- median(dat$total_bedrooms, na.rm=TRUE) dat <- dat[dat$ocean_proximity != "O:ISL", ] # suppression des modalités vides (ici "O:ISL") dat <- droplevels(dat) dat['rooms'] <- dat['total_rooms'] / dat['households'] dat['bedrooms'] <- dat['total_bedrooms'] / dat['households'] dat['pop'] <- dat['population'] / dat['households'] dat <- dat[dat$median_house_value < 500001, ] dat <- dat[c('longitude', 'latitude', 'housing_median_age', 'households', 'median_income', 'median_house_value', 'ocean_proximity', 'rooms', 'bedrooms', 'pop')] Z <- onehot_enc(dat[c('ocean_proximity')]) dat <- cbind(dat, as.data.frame(Z)) dat <- dat[,!(colnames(dat) %in% c('ocean_proximity'))] dat.all <- dat X <- dat[,!(colnames(dat) %in% c('median_house_value'))] Y <- dat[,c('median_house_value')] names(Y) <- rownames(X) dat <- list(X = X, Y = Y) split <- splitdata(dat, 0.8) entr <- split$entr test <- split$test r <- list( dat=dat.all, entr=entr, test=test ) return(r) } # test.tbl <- table(c(4,2,4,2,1,1,1)) # all(intersperse(test.tbl) == c(1, 2, 4, 1, 2, 4, 1)) intersperse <- function(tbl) { n <- sum(tbl) values <- as.numeric(names(tbl)) r <- numeric(n) i <- 1 while(i <= n) { for(j in 1:length(tbl)) { if(tbl[j] != 0) { r[i] <- values[j] i <- i + 1 tbl[j] <- tbl[j] - 1 } } } return(r) } # sample the landmarks from the clusters of individuals # obtained after correspondence analysis landmarks.by.ca.clst <- function(cam, X, nbLandmarks) { if(nbLandmarks > nrow(X)) { stop("The number of landmarks must be less than the number of training samples.") } landmarks <- numeric(nbLandmarks) clst <- cam$clsti$cluster[as.numeric(rownames(X))] clst.tbl <- table(clst) nb.by.clst <- table((intersperse(clst.tbl))[1:nbLandmarks]) clst.id <- as.numeric(names(nb.by.clst)) set.seed(1123) clst <- sample(clst) offset <- 0 for (i in 1:length(nb.by.clst)) { k <- as.numeric(nb.by.clst[i]) landmarks[(offset+1):(offset+k)] <- as.numeric(names((clst[clst==clst.id[i]])[1:k])) offset <- offset+k } return(landmarks) } hous.dat.nakr <- datasetHousing.nakr() X.entr <- hous.dat.nakr$entr$X Y.entr <- hous.dat.nakr$entr$Y X.test <- hous.dat.nakr$test$X Y.test <- hous.dat.nakr$test$Y # hous.dat.ca <- datasetHousing.mca() # hous.cam <- mca(hous.dat.ca) # nb.landmarks <- round(sqrt(nrow(X.entr))) # landmarks <- landmarks.by.ca.clst(hous.cam, X.entr, nb.landmarks) # nakrm <- kfold.nakr(X.entr, Y.entr, landmarks=landmarks) # nakrm.yh <- predict(nakrm, X.test) # nakrm.mae <- mean(abs(nakrm.yh - Y.test)) # nakrm.yh.train <- predict(nakrm, X.entr) # rev(order(abs(nakrm.yh.train - Y.entr)))[1:20] # hist(Y.entr[rev(order(abs(nakrm.yh.train - Y.entr)))[1:200]])