Nystroem approximation is now correct (and simplified).

This commit is contained in:
Pierre-Edouard Portier 2023-03-19 18:48:47 +01:00
parent 3c83c256b5
commit 5689bf6fef
5 changed files with 69 additions and 231 deletions

View File

@ -1,8 +1,7 @@
source("05_d_svd_mca_code.R")
source("04_validation_croisee_code.R")
source("15_loocv_code.R")
source("19_nystroem_approximation_code.R")
datasetHousing.nakr <-
dataset.housing <-
function()
{
dat <- read.csv(file="data/housing.csv", header=TRUE)
@ -44,68 +43,24 @@ function()
return(r)
}
# test.tbl <- table(c(4,2,4,2,1,1,1))
# all(intersperse(test.tbl) == c(1, 2, 4, 1, 2, 4, 1))
intersperse <-
function(tbl)
{
n <- sum(tbl)
values <- as.numeric(names(tbl))
r <- numeric(n)
i <- 1
while(i <= n)
{
for(j in 1:length(tbl))
{
if(tbl[j] != 0)
{
r[i] <- values[j]
i <- i + 1
tbl[j] <- tbl[j] - 1
}
}
}
return(r)
}
print("load dataset")
dat <- dataset.housing()
# sample the landmarks from the clusters of individuals
# obtained after correspondence analysis
landmarks.by.ca.clst <-
function(cam, X, nbLandmarks)
{
if(nbLandmarks > nrow(X))
{
stop("The number of landmarks must be less than the number of training samples.")
}
landmarks <- numeric(nbLandmarks)
clst <- cam$clsti$cluster[as.numeric(rownames(X))]
clst.tbl <- table(clst)
nb.by.clst <- table((intersperse(clst.tbl))[1:nbLandmarks])
clst.id <- as.numeric(names(nb.by.clst))
set.seed(1123)
clst <- sample(clst)
offset <- 0
for (i in 1:length(nb.by.clst))
{
k <- as.numeric(nb.by.clst[i])
landmarks[(offset+1):(offset+k)] <- as.numeric(names((clst[clst==clst.id[i]])[1:k]))
offset <- offset+k
}
return(landmarks)
}
print("Nystroem approx ridge regression")
nakrm <- nakr(dat$entr$X, dat$entr$Y)
nakrm.yh <- predict(nakrm, dat$test$X)
nakrm.mae <- mean(abs(nakrm.yh - dat$test$Y))
print(paste("MAE for Nystroem: ", nakrm.mae))
hous.dat.nakr <- datasetHousing.nakr()
X.entr <- hous.dat.nakr$entr$X
Y.entr <- hous.dat.nakr$entr$Y
X.test <- hous.dat.nakr$test$X
Y.test <- hous.dat.nakr$test$Y
hous.dat.ca <- datasetHousing.mca()
hous.cam <- mca(hous.dat.ca)
nb.landmarks <- round(sqrt(nrow(X.entr)))
landmarks <- landmarks.by.ca.clst(hous.cam, X.entr, nb.landmarks)
# nakrm <- kfold.nakr(X.entr, Y.entr, landmarks=landmarks)
# nakrm.yh <- predict(nakrm, X.test)
# nakrm.mae <- mean(abs(nakrm.yh - Y.test))
# nakrm.yh.train <- predict(nakrm, X.entr)
# rev(order(abs(nakrm.yh.train - Y.entr)))[1:20]
# hist(Y.entr[rev(order(abs(nakrm.yh.train - Y.entr)))[1:200]])
print("Linear ridge regression")
rm <- ridge(dat$entr$X, dat$entr$Y)
rm.yh <- predict(rm, dat$test$X)
rm.mae <- mean(abs(rm.yh - dat$test$Y))
print(paste("MAE for Ridge: ", rm.mae))
print("Random forest")
library(randomForest)
rfm <- randomForest(dat$entr$X, dat$entr$Y)
rfm.yh <- predict(rfm, dat$test$X)
rfm.mae <- mean(abs(rfm.yh - dat$test$Y))
print(paste("MAE for Random Forest: ", rfm.mae))

View File

@ -3,6 +3,7 @@
```{r, include=FALSE}
source("01_intro_code.R", local = knitr::knit_global())
source("04_validation_croisee_code.R", local = knitr::knit_global())
source("15_loocv_code.R", local = knitr::knit_global())
source("19_nystroem_approximation_code.R", local = knitr::knit_global())
```
@ -103,7 +104,7 @@ $$
\end{aligned}
$$
Ainsi, nous pouvons introduire une matrice $\boldsymbol\Phi$ de dimension $m$ telle que $\mathbf{K} \approx \boldsymbol\Phi \boldsymbol\Phi^T$.
Ainsi, nous pouvons introduire une matrice $\boldsymbol\Phi$ de rang $m$ telle que $\mathbf{K} \approx \boldsymbol\Phi \boldsymbol\Phi^T$.
$$
\boldsymbol\Phi =
\left[
@ -162,45 +163,7 @@ $$
\end{aligned}
$$
## Régression ridge à noyau et approximation de Nyström
Dans le cadre de la régression ridge à noyau (voir un précédent module), nous notons : $\mathbf{G} = \mathbf{K} + \lambda\mathbf{I_n}$. Les coefficients du modèle ridge sont alors donnés par : $\boldsymbol\alpha_\lambda = \mathbf{G}^{-1} \mathbf{y}$. Nous cherchons à calculer efficacement $\mathbf{G}^{-1}$ à partir d'une approximation Nyström de rang $m$ de $\mathbf{K} \approx \mathbf{L}\mathbf{L}^T$. Pour ce faire, nous utilisons une forme de l'identité de Woodbury :
$$
\left(\mathbf{A} + \mathbf{U}\mathbf{C}\mathbf{V}\right)^{-1} = \mathbf{A}^{-1} - \mathbf{A}^{-1}\mathbf{U}\left(\mathbf{C}^{-1} + \mathbf{V}\mathbf{A}^{-1}\mathbf{U}\right)^{-1}\mathbf{V} \mathbf{A}^{-1}
$$
Nous obtenons ainsi :
$$
\begin{aligned}
& \mathbf{G}^{-1} \\
= \{& \text{Définition de $\mathbf{G}$} \} \\
& \left(\lambda\mathbf{I_n} + \mathbf{K}\right)^{-1} \\
\approx \{& \text{Approximation de Nyström} \} \\
& \left(\lambda\mathbf{I_n} + \mathbf{L}\mathbf{L}^T\right)^{-1} \\
= \{& \text{Identité de Woodbury} \} \\
& \lambda^{-1}\mathbf{I_n} - \lambda^{-1}\mathbf{L}\left(\mathbf{I_m}+\lambda^{-1}\mathbf{L}^T\mathbf{L}\right)^{-1}\lambda^{-1}\mathbf{L}^T \\
= \{& \text{Algèbre linéaire : si $\mathbf{A}$ et $\mathbf{B}$ sont des matrices carrés inversibles alors $(\mathbf{A}\mathbf{B})^{-1} = \mathbf{B}^{-1}\mathbf{A}^{-1}$} \} \\
& \lambda^{-1}\mathbf{I_n} - \lambda^{-1}\mathbf{L}\left(\lambda\mathbf{I_m}+\mathbf{L}^T\mathbf{L}\right)^{-1}\mathbf{L}^T \\
\end{aligned}
$$
Montrons comment calculer efficacement la prédiction $\hat{y}_{new}$ d'une nouvelle observation $\mathbf{x_{new}}$. Nous avons montré plus haut que l'approximation de Nyström pouvait s'écrire $\mathbf{K} \approx \mathbf{C} \mathbf{K_{11}}^{-1} \mathbf{C}^T$. Donc la ligne de l'approximation du noyau $\mathbf{K}$ qui correspond à l'observation $\mathbf{x_j}$ du jeu d'entrainement (dont la version non approximée est $[k(\mathbf{x_j},\mathbf{x_1}),\dots,k(\mathbf{x_j},\mathbf{x_n})]$) s'obtient par combinaison linéaire des lignes de $\mathbf{K_{11}}^{-1} \mathbf{C}^T$. Les $m$ coefficients de cette combinaison linéaire sont $(k(\mathbf{x_j},\mathbf{x_1}),\dots,k(\mathbf{x_j},\mathbf{x_m}))$.
Ainsi, l'approximation d'une "nouvelle ligne" du noyau $\mathbf{K}$ formée des similarités d'une nouvelle observation $\mathbf{x_{new}}$ avec les observations $(\mathbf{x_1},\dots,\mathbf{x_n})$ du jeu d'entrainement est :
$$[k(\mathbf{x_{new}},\mathbf{x_1}),\dots,k(\mathbf{x_{new}},\mathbf{x_n})] \approx [k(\mathbf{x_{new}},\mathbf{x_1}),\dots,k(\mathbf{x_{new}},\mathbf{x_m})] \mathbf{K_{11}}^{-1} \mathbf{C}^T$$
Enfin, la prédiction associée à $\mathbf{x_{new}}$ à partir de la valeur approximée du noyau est :
$$
\begin{aligned}
& \hat{y}_{new} \\
\approx \{& \text{voir raisonnement ci-dessus, et } \hat{y} = \mathbf{K} \boldsymbol\alpha_\lambda \} \\
& \left[ k(\mathbf{x_{new}},\mathbf{x_1}),\dots,k(\mathbf{x_{new}},\mathbf{x_m}) \right] \left( \mathbf{K_{11}}^{-1} \mathbf{C}^T \boldsymbol\alpha_\lambda \right) \\
= \{& \text{introduction de $\boldsymbol\beta_\lambda = \mathbf{K_{11}}^{-1} \mathbf{C}^T \boldsymbol\alpha_\lambda$} \} \\
& \left[ k(\mathbf{x_{new}},\mathbf{x_1}),\dots,k(\mathbf{x_{new}},\mathbf{x_m}) \right] \boldsymbol\beta_\lambda \\
\end{aligned}
$$
$\boldsymbol\beta_\lambda$ peut être calculé une seule fois à la fin de l'entrainement pour être ensuite réutilisé pour chaque prédiction.
Nous voyons que la matrice $\mathbf{L}$ correspond exactement à la matrice $\boldsymbol\Phi$ dont les lignes sont les nouvelles variables $m$-dimensionnelles telles que $\boldsymbol\Phi \boldsymbol\Phi^T$ soit une approximation de rang-$m$ de $\mathbf{K}$. Ainsi, pour opérer une régression ridge à noyau avec approximation de Nyström, il suffit de faire une régression ridge sur ces variables transformées.
## Exemple sur un jeu de données synthétique

View File

@ -1,80 +1,38 @@
# compute the gaussian kernel between each row of X1 and each row of X2
# should be done more efficiently (C code, threads)
gausskernel.nakr <-
function(X1, X2, sigma2)
{
n1 <- dim(X1)[1]
n2 <- dim(X2)[1]
K <- matrix(nrow = n1, ncol = n2)
for(i in 1:n1)
for(j in 1:n2)
K[i,j] <- sum((X1[i,] - X2[j,])^2)
K <- exp(-1*K/sigma2)
}
# Nystroem Approximation Kernel Ridge Regression
nakr <-
function(X, y, sigma2=NULL, lambda=1E-8, landmarks=NULL, nb.landmarks=NULL)
function(X, y, sigma2=NULL, lambdas=NULL, nb.landmarks=NULL)
{
set.seed(1123)
X <- as.matrix(X)
n <- nrow(X)
p <- ncol(X)
if(is.null(sigma2)) { sigma2 <- p }
ldm <- landmarks.nakr(X, landmarks, nb.landmarks)
if(is.null(nb.landmarks)) { nb.landmarks <- 15*n^(1/3) }
X <- scale(X)
y <- scale(y)
C <- gausskernel.nakr(X, as.matrix(X[ldm$idx,]), sigma2)
K11 <- C[ldm$idx,]
svdK11 <- svd(K11)
# K11 often ill-formed -> drop small sv
ks <- which(svdK11$d < 1E-12)
if (length(ks)>0) {k <- ks[1]} else {k <- length(svdK11$d)}
US <- svdK11$u[,1:k] %*% diag(1 / sqrt(svdK11$d[1:k]))
L <- C %*% US
Ginv <- t(L) %*% L
diag(Ginv) <- diag(Ginv) + lambda
Ginv <- chol2inv(chol(Ginv))
Ginv <- L %*% Ginv %*% t(L)
Ginv <- - Ginv / lambda
diag(Ginv) <- diag(Ginv) + (1/lambda)
coef <- Ginv %*% y
K11inv <- svdK11$v[,1:k] %*% diag(1/svdK11$d[1:k]) %*% t(svdK11$u[,1:k])
beta <- K11inv %*% t(C) %*% coef
r <- list(X=X,
y=y,
idx.landmarks <- sample(1:n, nb.landmarks, replace = FALSE)
S <- X[idx.landmarks, ]
K11 <- gausskernel.nakr(S, S, sigma2)
C <- gausskernel.nakr(X, S, sigma2)
svd.K11 <- svd(K11)
ks <- which(svd.K11$d < 1E-12) # K11 often ill-formed -> drop small sv
if (length(ks)>0) {k <- ks[1]} else {k <- length(svd.K11$d)}
W <- svd.K11$u[,1:k] %*% diag(1/sqrt(svd.K11$d[1:k]))
Phi <- C %*% W
ridge <- ridge(Phi, y, lambdas)
r <- list(center.X=attr(X,"scaled:center"),
scale.X=attr(X,"scaled:scale"),
center.y=attr(y,"scaled:center"),
scale.y=attr(y,"scaled:scale"),
S=S,
sigma2=sigma2,
lambda=lambda,
ldmidx=ldm$idx,
coef=coef,
beta=beta
W=W,
ridge=ridge
)
class(r) <- "nakr"
return(r)
}
landmarks.nakr <-
function(X, landmarks, nb.landmarks)
{
n <- nrow(X)
if(is.null(landmarks)) {
if(is.null(nb.landmarks)) { nb.landmarks <- round(sqrt(n)) }
ldmidx <- sample(1:n, nb.landmarks, replace = FALSE)
} else {
ldmidx <- which(rownames(X) %in% as.character(landmarks))
}
ldmidx <- sort(ldmidx)
ldmnms <- as.numeric(rownames(X)[ldmidx])
return(list(idx=ldmidx, nms=ldmnms))
}
predict.nakr <-
function(o, newdata)
{
@ -83,42 +41,30 @@ function(o, newdata)
UseMethod("predict")
return(invisible(NULL))
}
newdata <- as.matrix(newdata)
if(ncol(o$X)!=ncol(newdata)) {
stop("Not the same number of variables btwn fitted nakr object and new data")
}
newdata <- scale(newdata,center=attr(o$X,"scaled:center"),
scale=attr(o$X,"scaled:scale"))
Ktest <- gausskernel.nakr(newdata, as.matrix(o$X[o$ldmidx,]), o$sigma2)
yh <- Ktest %*% o$beta
yh <- (yh * attr(o$y,"scaled:scale")) + attr(o$y,"scaled:center")
test <- as.matrix(newdata)
test <- scale(test,center=o$center.X,scale=o$scale.X)
K.test <- gausskernel.nakr(test, o$S, o$sigma2)
Phi.test <- K.test %*% o$W
yh <- predict(o$ridge, Phi.test)
yh <- yh * o$scale.y + o$center.y
}
kfold.nakr <-
function(X, y, K=5, lambdas=NULL, sigma2=NULL, landmarks=NULL, nb.landmarks=NULL)
# compute the gaussian kernel between each row of X1 and each row of X2
# should be done more efficiently (C code, threads)
gausskernel.nakr <-
function(X1, X2, sigma2)
{
if(is.null(lambdas)) { lambdas <- 10^seq(-8, 2, by=1) }
n <- nrow(X)
folds <- rep_len(1:K, n)
folds <- sample(folds, n)
maes <- matrix(data = NA, nrow = K, ncol = length(lambdas))
colnames(maes) <- lambdas
lambda_idx <- 1
ldm <- landmarks.nakr(X, landmarks, nb.landmarks)
for(lambda in lambdas) {
for(k in 1:K) {
fold <- folds == k
ldmnms2keep <- ldm$nms[! ldm$idx %in% which(fold)]
nakrm <- nakr(X[!fold,], y[!fold], sigma2, lambda, landmarks=ldmnms2keep)
pred <- predict(nakrm, X[fold,])
maes[k,lambda_idx] <- mean(abs(pred - y[fold]))
print(paste("lbd =", lambda, "; k =", k, "; mae =", maes[k,lambda_idx]))
}
lambda_idx <- lambda_idx + 1
}
mmaes <- colMeans(maes)
minmmaes <- min(mmaes)
bestlambda <- lambdas[which(mmaes == minmmaes)]
nakrm <- nakr(X, y, sigma2, bestlambda, landmarks=ldm$nms)
if(is(X1,"vector"))
X1 <- as.matrix(X1)
if(is(X2,"vector"))
X2 <- as.matrix(X2)
if (!(dim(X1)[2]==dim(X2)[2]))
stop("X1 and X2 must have the same number of columns")
n1 <- dim(X1)[1]
n2 <- dim(X2)[1]
dotX1 <- rowSums(X1*X1)
dotX2 <- rowSums(X2*X2)
res <- X1%*%t(X2)
for(i in 1:n2) res[,i] <- exp((2*res[,i] - dotX1 - rep(dotX2[i],n1))/sigma2)
return(res)
}

View File

@ -4,7 +4,7 @@ author: "Pierre-Edouard Portier"
documentclass: book
geometry: margin=2cm
fontsize: 12pt
date: "5 Mar 2023"
date: "19 Mar 2023"
toc: true
classoption: fleqn
bibliography: intro_to_ml.bib

26
pad.R
View File

@ -27,29 +27,3 @@
# adj = 0, cex = 0.6)
# points(0, 0, pch = 3)
# ```
# source("19_b_nystroem_approximation_housing_experiment_code.R")
# rdat <- hous.dat.nakr$dat[sample(nrow(hous.dat.nakr$dat), size=2000, replace=FALSE),]
# X <- rdat[,!(colnames(rdat) %in% c('median_house_value'))]
# Y <- rdat[,c('median_house_value')]
# names(Y) <- rownames(X)
# rsplt <- splitdata(list(X = X, Y = Y), 0.8)
# X.entr <- rsplt$entr$X
# Y.entr <- rsplt$entr$Y
# X.test <- rsplt$test$X
# Y.test <- rsplt$test$Y
# source("18_kernel_ridge_regression_code.R")
# krm <- krr(X.entr, Y.entr)
# krm.yh <- predict(krm, X.test)
# krm.mae <- mean(abs(krm.yh - Y.test)) # 35445.1
# nakrm <- nakr(X.entr, Y.entr, nb.landmarks=1600)
# nakrm.yh <- predict(nakrm, X.test)
# nakrm.mae <- mean(abs(nakrm.yh - Y.test)) # 65454.18
# source("15_loocv_code.R")
# rm <- ridge(X.entr, Y.entr)
# rm.yh <- predict(rm, X.test)
# rm.mae <- mean(abs(rm.yh - Y.test)) # 45786.62
# library(randomForest)
# rfm <- randomForest(X.entr, Y.entr)
# rfm.yh <- predict(rfm, X.test)
# rfm.mae <- mean(abs(rfm.yh - Y.test)) # 34229.02