diff --git a/19_b_nystroem_approximation_housing_experiment_code.R b/19_b_nystroem_approximation_housing_experiment_code.R new file mode 100644 index 0000000..4cf3e7c --- /dev/null +++ b/19_b_nystroem_approximation_housing_experiment_code.R @@ -0,0 +1,106 @@ +source("05_d_svd_mca_code.R") +source("04_validation_croisee_code.R") +source("19_nystroem_approximation_code.R") + +datasetHousing.nakr <- +function() +{ + dat <- read.csv(file="data/housing.csv", header=TRUE) + + dat$ocean_proximity <- as.factor(dat$ocean_proximity) + levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="<1H OCEAN"] <- "O:<1H" + levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="ISLAND"] <- "O:ISL" + levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="INLAND"] <- "O:INL" + levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="NEAR BAY"] <- "O:NB" + levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="NEAR OCEAN"] <- "O:NO" + + dat$total_bedrooms[is.na(dat$total_bedrooms)] <- median(dat$total_bedrooms, na.rm=TRUE) + dat <- dat[dat$ocean_proximity != "O:ISL", ] + # suppression des modalités vides (ici "O:ISL") + dat <- droplevels(dat) + dat['rooms'] <- dat['total_rooms'] / dat['households'] + dat['bedrooms'] <- dat['total_bedrooms'] / dat['households'] + dat['pop'] <- dat['population'] / dat['households'] + dat <- dat[dat$median_house_value < 500001, ] + + dat <- dat[c('longitude', 'latitude', 'housing_median_age', 'households', + 'median_income', 'median_house_value', 'ocean_proximity', + 'rooms', 'bedrooms', 'pop')] + + Z <- onehot_enc(dat[c('ocean_proximity')]) + dat <- cbind(dat, as.data.frame(Z)) + dat <- dat[,!(colnames(dat) %in% c('ocean_proximity'))] + + dat.all <- dat + dat <- list(X = dat[,!(colnames(dat) %in% c('median_house_value'))], + Y = dat[,c('median_house_value')]) + split <- splitdata(dat, 0.8) + entr <- split$entr + test <- split$test + + r <- list( dat=dat.all, entr=entr, test=test ) + return(r) +} + +# test.tbl <- table(c(4,2,4,2,1,1,1)) +# all(intersperse(test.tbl) == c(1, 2, 4, 1, 2, 4, 1)) +intersperse <- +function(tbl) +{ + n <- sum(tbl) + values <- as.numeric(names(tbl)) + r <- numeric(n) + i <- 1 + while(i <= n) + { + for(j in 1:length(tbl)) + { + if(tbl[j] != 0) + { + r[i] <- values[j] + i <- i + 1 + tbl[j] <- tbl[j] - 1 + } + } + } + return(r) +} + +# sample the landmarks from the clusters of individuals +# obtained after correspondence analysis +landmarks.by.ca.clst <- +function(cam, X, nbLandmarks) +{ + if(nbLandmarks > nrow(X)) + { + stop("The number of landmarks must be less than the number of training samples.") + } + landmarks <- numeric(nbLandmarks) + clst <- cam$clsti$cluster[as.numeric(rownames(X))] + clst.tbl <- table(clst) + nb.by.clst <- table((intersperse(clst.tbl))[1:nbLandmarks]) + clst.id <- as.numeric(names(nb.by.clst)) + set.seed(1123) + clst <- sample(clst) + offset <- 0 + for (i in 1:length(nb.by.clst)) + { + k <- as.numeric(nb.by.clst[i]) + landmarks[(offset+1):(offset+k)] <- as.numeric(names((clst[clst==clst.id[i]])[1:k])) + offset <- offset+k + } + return(landmarks) +} + +hous.dat.nakr <- datasetHousing.nakr() +X.entr <- hous.dat.nakr$entr$X +Y.entr <- hous.dat.nakr$entr$Y +X.test <- hous.dat.nakr$test$X +Y.test <- hous.dat.nakr$test$Y +hous.dat.ca <- datasetHousing.mca() +hous.cam <- mca(hous.dat.ca) +nb.landmarks <- round(sqrt(nrow(X.entr))) +landmarks <- landmarks.by.ca.clst(hous.cam, X.entr, nb.landmarks) +nakrm <- kfold.nakr(X.entr, Y.entr, landmarks=landmarks) +nakrm.yh <- predict(nakrm, X.test) +nakrm.mae <- mean(abs(nakrm.yh - Y.test)) \ No newline at end of file diff --git a/19_nystroem_approximation.Rmd b/19_nystroem_approximation.Rmd index dfadf16..91b65a8 100644 --- a/19_nystroem_approximation.Rmd +++ b/19_nystroem_approximation.Rmd @@ -196,7 +196,7 @@ Nous reprenons le jeu de données synthétique utilisé depuis le premier module entr <- splitres$entr test <- splitres$test - nakrm <- nakr(entr$X,entr$Y, nspl=15) + nakrm <- nakr(entr$X,entr$Y, nb.landmarks=25) yh <- predict(nakrm,test$X) plt(test,f) points(test$X, yh, pch=4) diff --git a/19_nystroem_approximation_code.R b/19_nystroem_approximation_code.R index 8c524af..49b8aac 100644 --- a/19_nystroem_approximation_code.R +++ b/19_nystroem_approximation_code.R @@ -14,20 +14,20 @@ function(X1, X2, sigma2) # Nystroem Approximation Kernel Ridge Regression nakr <- -function(X, y, sigma2=NULL, lambdas=NULL, splidx=NULL, nspl=NULL) +function(X, y, sigma2=NULL, lambda=1E-4, landmarks=NULL, nb.landmarks=NULL) { X <- as.matrix(X) n <- nrow(X) p <- ncol(X) - if(is.null(lambdas)) { lambdas <- 10^seq(-8, 2,by=0.5) } if(is.null(sigma2)) { sigma2 <- p } - if(is.null(splidx)) { - if(is.null(nspl)) { nspl <- round(sqrt(n)) } - splidx <- sample(1:n, nspl, replace = FALSE) + if(is.null(landmarks)) { + if(is.null(nb.landmarks)) { nb.landmarks <- round(sqrt(n)) } + splidx <- sample(1:n, nb.landmarks, replace = FALSE) } else { - nspl <- length(splidx) + splidx <- which(rownames(X) %in% as.character(landmarks)) + nb.landmarks <- length(splidx) } splidx <- sort(splidx) @@ -39,35 +39,28 @@ function(X, y, sigma2=NULL, lambdas=NULL, splidx=NULL, nspl=NULL) svdK11 <- svd(K11) # K11 will often be ill-formed, thus we drop the bottom singular values - k <- 0.8 * nspl + k <- which(svdK11$d < 1E-12)[1] - 1 + US <- svdK11$u[,1:k] %*% diag(1 / sqrt(svdK11$d[1:k])) L <- C %*% US - LtL <- t(L) %*% L + Ginv <- t(L) %*% L + diag(Ginv) <- diag(Ginv) + lambda + Ginv <- chol2inv(chol(Ginv)) + Ginv <- L %*% Ginv %*% t(L) + Ginv <- - Ginv / lambda + diag(Ginv) <- diag(Ginv) + (1/lambda) + coef <- Ginv %*% y - looe <- double(length(lambdas)) - coef <- matrix(data = NA, nrow = n, ncol = length(lambdas)) - i <- 1 - for(lambda in lambdas) { - Ginv <- LtL - diag(Ginv) <- diag(Ginv) + lambda - Ginv <- solve(Ginv) - Ginv <- L %*% Ginv %*% t(L) - Ginv <- - Ginv / lambda - diag(Ginv) <- diag(Ginv) + (1/lambda) - coef[,i] <- Ginv %*% y - looe[i] <- mean((coef[,i]/diag(Ginv))^2) - i <- i+1 - } - looe.min <- min(looe) - lambda <- lambdas[which(looe == looe.min)] - coef <- coef[,which(looe == looe.min)] + K11inv <- svdK11$v[,1:k] %*% diag(1/svdK11$d[1:k]) %*% t(svdK11$u[,1:k]) + beta <- K11inv %*% t(C) %*% coef r <- list(X=X, y=y, sigma2=sigma2, + lambda=lambda, + splidx=splidx, coef=coef, - looe=looe.min, - lambda=lambda + beta=beta ) class(r) <- "nakr" return(r) @@ -87,7 +80,33 @@ function(o, newdata) } newdata <- scale(newdata,center=attr(o$X,"scaled:center"), scale=attr(o$X,"scaled:scale")) - Ktest <- gausskernel(newdata, o$X, o$sigma2) - yh <- Ktest %*% o$coef + Ktest <- gausskernel(newdata, as.matrix(o$X[o$splidx,]), o$sigma2) + yh <- Ktest %*% o$beta yh <- (yh * attr(o$y,"scaled:scale")) + attr(o$y,"scaled:center") } + +kfold.nakr <- +function(X, y, K=5, lambdas=NULL, sigma2=NULL, landmarks=NULL, nb.landmarks=NULL) +{ + if(is.null(lambdas)) { lambdas <- 10^seq(-8, 2, by=1) } + + N <- nrow(X) + folds <- rep_len(1:K, N) + folds <- sample(folds, N) + maes <- matrix(data = NA, nrow = K, ncol = length(lambdas)) + colnames(maes) <- lambdas + lambda_idx <- 1 + for(lambda in lambdas) { + for(k in 1:K) { + fold <- folds == k + nakrm <- nakr(X[!fold,], y[!fold], sigma2, lambda, landmarks, nb.landmarks) + pred <- predict(nakrm, X[fold,]) + maes[k,lambda_idx] <- mean(abs(pred - y[fold])) + } + lambda_idx <- lambda_idx + 1 + } + mmaes <- colMeans(maes) + minmmaes <- min(mmaes) + bestlambda <- lambdas[which(mmaes == minmmaes)] + nakrm <- nakr(X, y, sigma2, bestlambda, landmarks, nb.landmarks) +} \ No newline at end of file