Nystroem approximation is now correct (and simplified).
parent 3c83c256b5
commit 5689bf6fef
@@ -1,8 +1,7 @@
source("05_d_svd_mca_code.R")
source("04_validation_croisee_code.R")
source("15_loocv_code.R")
source("19_nystroem_approximation_code.R")

datasetHousing.nakr <-
dataset.housing <-
function()
{
dat <- read.csv(file="data/housing.csv", header=TRUE)
@@ -44,68 +43,24 @@ function()
return(r)
}

# test.tbl <- table(c(4,2,4,2,1,1,1))
# all(intersperse(test.tbl) == c(1, 2, 4, 1, 2, 4, 1))
# interleave the values of a table: emit each distinct value once per round until all counts are used up
intersperse <-
function(tbl)
{
n <- sum(tbl)
values <- as.numeric(names(tbl))
r <- numeric(n)
i <- 1
while(i <= n)
{
for(j in 1:length(tbl))
{
if(tbl[j] != 0)
{
r[i] <- values[j]
i <- i + 1
tbl[j] <- tbl[j] - 1
}
}
}
return(r)
}
print("load dataset")
dat <- dataset.housing()

# sample the landmarks from the clusters of individuals
# obtained after correspondence analysis
landmarks.by.ca.clst <-
function(cam, X, nbLandmarks)
{
if(nbLandmarks > nrow(X))
{
stop("The number of landmarks must be less than the number of training samples.")
}
landmarks <- numeric(nbLandmarks)
clst <- cam$clsti$cluster[as.numeric(rownames(X))]
clst.tbl <- table(clst)
nb.by.clst <- table((intersperse(clst.tbl))[1:nbLandmarks])
clst.id <- as.numeric(names(nb.by.clst))
set.seed(1123)
clst <- sample(clst)
offset <- 0
for (i in 1:length(nb.by.clst))
{
k <- as.numeric(nb.by.clst[i])
landmarks[(offset+1):(offset+k)] <- as.numeric(names((clst[clst==clst.id[i]])[1:k]))
offset <- offset+k
}
return(landmarks)
}
print("Nystroem approx ridge regression")
nakrm <- nakr(dat$entr$X, dat$entr$Y)
nakrm.yh <- predict(nakrm, dat$test$X)
nakrm.mae <- mean(abs(nakrm.yh - dat$test$Y))
print(paste("MAE for Nystroem: ", nakrm.mae))

hous.dat.nakr <- datasetHousing.nakr()
X.entr <- hous.dat.nakr$entr$X
Y.entr <- hous.dat.nakr$entr$Y
X.test <- hous.dat.nakr$test$X
Y.test <- hous.dat.nakr$test$Y
hous.dat.ca <- datasetHousing.mca()
hous.cam <- mca(hous.dat.ca)
nb.landmarks <- round(sqrt(nrow(X.entr)))
landmarks <- landmarks.by.ca.clst(hous.cam, X.entr, nb.landmarks)
# nakrm <- kfold.nakr(X.entr, Y.entr, landmarks=landmarks)
# nakrm.yh <- predict(nakrm, X.test)
# nakrm.mae <- mean(abs(nakrm.yh - Y.test))
# nakrm.yh.train <- predict(nakrm, X.entr)
# rev(order(abs(nakrm.yh.train - Y.entr)))[1:20]
# hist(Y.entr[rev(order(abs(nakrm.yh.train - Y.entr)))[1:200]])
print("Linear ridge regression")
rm <- ridge(dat$entr$X, dat$entr$Y)
rm.yh <- predict(rm, dat$test$X)
rm.mae <- mean(abs(rm.yh - dat$test$Y))
print(paste("MAE for Ridge: ", rm.mae))

print("Random forest")
library(randomForest)
rfm <- randomForest(dat$entr$X, dat$entr$Y)
rfm.yh <- predict(rfm, dat$test$X)
rfm.mae <- mean(abs(rfm.yh - dat$test$Y))
print(paste("MAE for Random Forest: ", rfm.mae))
@@ -3,6 +3,7 @@
```{r, include=FALSE}
source("01_intro_code.R", local = knitr::knit_global())
source("04_validation_croisee_code.R", local = knitr::knit_global())
source("15_loocv_code.R", local = knitr::knit_global())
source("19_nystroem_approximation_code.R", local = knitr::knit_global())
```

@@ -103,7 +104,7 @@ $$
\end{aligned}
$$

Thus, we can introduce a matrix $\boldsymbol\Phi$ of dimension $m$ such that $\mathbf{K} \approx \boldsymbol\Phi \boldsymbol\Phi^T$.
Thus, we can introduce a matrix $\boldsymbol\Phi$ of rank $m$ such that $\mathbf{K} \approx \boldsymbol\Phi \boldsymbol\Phi^T$.
$$
\boldsymbol\Phi =
\left[
@@ -162,45 +163,7 @@ $$
\end{aligned}
$$

## Kernel ridge regression and the Nyström approximation

In the setting of kernel ridge regression (see a previous module), we write $\mathbf{G} = \mathbf{K} + \lambda\mathbf{I_n}$. The coefficients of the ridge model are then given by $\boldsymbol\alpha_\lambda = \mathbf{G}^{-1} \mathbf{y}$. We want to compute $\mathbf{G}^{-1}$ efficiently from a rank-$m$ Nyström approximation $\mathbf{K} \approx \mathbf{L}\mathbf{L}^T$. To do so, we use a form of the Woodbury identity:
$$
\left(\mathbf{A} + \mathbf{U}\mathbf{C}\mathbf{V}\right)^{-1} = \mathbf{A}^{-1} - \mathbf{A}^{-1}\mathbf{U}\left(\mathbf{C}^{-1} + \mathbf{V}\mathbf{A}^{-1}\mathbf{U}\right)^{-1}\mathbf{V} \mathbf{A}^{-1}
$$

We thus obtain:
$$
\begin{aligned}
& \mathbf{G}^{-1} \\
= \{& \text{Definition of $\mathbf{G}$} \} \\
& \left(\lambda\mathbf{I_n} + \mathbf{K}\right)^{-1} \\
\approx \{& \text{Nyström approximation} \} \\
& \left(\lambda\mathbf{I_n} + \mathbf{L}\mathbf{L}^T\right)^{-1} \\
= \{& \text{Woodbury identity} \} \\
& \lambda^{-1}\mathbf{I_n} - \lambda^{-1}\mathbf{L}\left(\mathbf{I_m}+\lambda^{-1}\mathbf{L}^T\mathbf{L}\right)^{-1}\lambda^{-1}\mathbf{L}^T \\
= \{& \text{Linear algebra: if $\mathbf{A}$ and $\mathbf{B}$ are invertible square matrices, then $(\mathbf{A}\mathbf{B})^{-1} = \mathbf{B}^{-1}\mathbf{A}^{-1}$} \} \\
& \lambda^{-1}\mathbf{I_n} - \lambda^{-1}\mathbf{L}\left(\lambda\mathbf{I_m}+\mathbf{L}^T\mathbf{L}\right)^{-1}\mathbf{L}^T \\
\end{aligned}
$$
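
To make the computational gain concrete, here is a small self-contained R sketch, not part of the commit; the sizes, names and values are purely illustrative. It checks the Woodbury-based expression for $\boldsymbol\alpha_\lambda = \mathbf{G}^{-1}\mathbf{y}$ against a direct solve; only an $m \times m$ system is solved:

```r
# Toy check of the Woodbury-based inverse; n, m, lambda and the data are arbitrary.
set.seed(1)
n <- 100; m <- 10; lambda <- 1e-2
L <- matrix(rnorm(n * m), n, m)        # stands in for the Nystroem factor L
y <- rnorm(n)
G <- L %*% t(L) + lambda * diag(n)     # G = L L^T + lambda I_n
alpha.direct <- solve(G, y)            # O(n^3) reference solution
# Woodbury form: G^{-1} y = ( y - L (lambda I_m + L^T L)^{-1} L^T y ) / lambda
alpha.woodbury <- (y - L %*% solve(lambda * diag(m) + t(L) %*% L, t(L) %*% y)) / lambda
max(abs(alpha.direct - alpha.woodbury))   # close to machine precision
```
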
Let us show how to compute efficiently the prediction $\hat{y}_{new}$ for a new observation $\mathbf{x_{new}}$. We showed above that the Nyström approximation can be written $\mathbf{K} \approx \mathbf{C} \mathbf{K_{11}}^{-1} \mathbf{C}^T$. Hence, the row of the approximation of the kernel $\mathbf{K}$ that corresponds to the training observation $\mathbf{x_j}$ (whose non-approximated version is $[k(\mathbf{x_j},\mathbf{x_1}),\dots,k(\mathbf{x_j},\mathbf{x_n})]$) is obtained as a linear combination of the rows of $\mathbf{K_{11}}^{-1} \mathbf{C}^T$. The $m$ coefficients of this linear combination are $(k(\mathbf{x_j},\mathbf{x_1}),\dots,k(\mathbf{x_j},\mathbf{x_m}))$.

Thus, the approximation of a "new row" of the kernel $\mathbf{K}$, made of the similarities between a new observation $\mathbf{x_{new}}$ and the training observations $(\mathbf{x_1},\dots,\mathbf{x_n})$, is:
$$[k(\mathbf{x_{new}},\mathbf{x_1}),\dots,k(\mathbf{x_{new}},\mathbf{x_n})] \approx [k(\mathbf{x_{new}},\mathbf{x_1}),\dots,k(\mathbf{x_{new}},\mathbf{x_m})] \mathbf{K_{11}}^{-1} \mathbf{C}^T$$

Finally, the prediction associated with $\mathbf{x_{new}}$, computed from the approximated kernel values, is:
$$
\begin{aligned}
& \hat{y}_{new} \\
\approx \{& \text{see the reasoning above, and } \hat{y} = \mathbf{K} \boldsymbol\alpha_\lambda \} \\
& \left[ k(\mathbf{x_{new}},\mathbf{x_1}),\dots,k(\mathbf{x_{new}},\mathbf{x_m}) \right] \left( \mathbf{K_{11}}^{-1} \mathbf{C}^T \boldsymbol\alpha_\lambda \right) \\
= \{& \text{introducing $\boldsymbol\beta_\lambda = \mathbf{K_{11}}^{-1} \mathbf{C}^T \boldsymbol\alpha_\lambda$} \} \\
& \left[ k(\mathbf{x_{new}},\mathbf{x_1}),\dots,k(\mathbf{x_{new}},\mathbf{x_m}) \right] \boldsymbol\beta_\lambda \\
\end{aligned}
$$
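
As an illustration of this training-then-prediction path, here is a minimal self-contained R sketch; all names (`gauss`, `Ktld`, `k.new`, ...) and sizes are made up for the example and do not come from the commit:

```r
# Toy sketch: Nystroem kernel ridge prediction through beta_lambda.
set.seed(1123)
n <- 200; m <- 20; lambda <- 1e-3; sigma2 <- 1
X <- matrix(runif(n, -3, 3), ncol = 1)
y <- sin(X[, 1]) + rnorm(n, sd = 0.1)
gauss <- function(A, B, s2) {                     # Gaussian kernel between rows of A and B
  d2 <- outer(rowSums(A^2), rowSums(B^2), "+") - 2 * A %*% t(B)
  exp(-d2 / s2)
}
idx   <- sample(n, m)                             # random landmarks
C     <- gauss(X, X[idx, , drop = FALSE], sigma2) # n x m
K11   <- C[idx, ] + 1e-8 * diag(m)                # small jitter: K11 is often ill-conditioned
Ktld  <- C %*% solve(K11, t(C))                   # rank-m approximation of K
alpha <- solve(Ktld + lambda * diag(n), y)        # ridge coefficients (direct solve, for clarity)
beta  <- solve(K11, t(C) %*% alpha)               # beta_lambda, computed once after training
x.new <- matrix(0.5, ncol = 1)
k.new <- gauss(x.new, X[idx, , drop = FALSE], sigma2)  # only m kernel evaluations
drop(k.new %*% beta)                              # approximate prediction for x.new
```
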
$\boldsymbol\beta_\lambda$ can be computed once at the end of training and then reused for every prediction.
We see that the matrix $\mathbf{L}$ is exactly the matrix $\boldsymbol\Phi$ whose rows are the new $m$-dimensional features such that $\boldsymbol\Phi \boldsymbol\Phi^T$ is a rank-$m$ approximation of $\mathbf{K}$. Therefore, to perform kernel ridge regression with the Nyström approximation, it suffices to run a ridge regression on these transformed features, as sketched below.
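
Continuing the toy sketch above (it reuses `C`, `K11`, `y`, `lambda`, `m` and `k.new` from it), an ordinary ridge regression on the transformed features gives the same prediction; this mirrors the design of the new `nakr`, which builds `Phi` and delegates to `ridge`:

```r
# Phi = C K11^{-1/2}: ridge on Phi reproduces the kernel prediction.
e   <- eigen(K11, symmetric = TRUE)
Phi <- C %*% e$vectors %*% diag(1 / sqrt(e$values))             # n x m, Phi %*% t(Phi) equals Ktld
w   <- solve(t(Phi) %*% Phi + lambda * diag(m), t(Phi) %*% y)   # plain ridge on the new features
phi.new <- k.new %*% e$vectors %*% diag(1 / sqrt(e$values))
drop(phi.new %*% w)        # matches drop(k.new %*% beta) above, up to numerical error
```
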
## Example on a synthetic dataset

@@ -214,7 +177,7 @@ We revisit the synthetic dataset used since the first module
entr <- splitres$entr
test <- splitres$test

nakrm <- nakr(entr$X,entr$Y, nb.landmarks=25)
nakrm <- nakr(entr$X,entr$Y,nb.landmarks=25)
yh <- predict(nakrm,test$X)
plt(test,f)
points(test$X, yh, pch=4)

@@ -1,80 +1,38 @@
# compute the gaussian kernel between each row of X1 and each row of X2
# should be done more efficiently (C code, threads)
gausskernel.nakr <-
function(X1, X2, sigma2)
{
n1 <- dim(X1)[1]
n2 <- dim(X2)[1]
K <- matrix(nrow = n1, ncol = n2)
for(i in 1:n1)
for(j in 1:n2)
K[i,j] <- sum((X1[i,] - X2[j,])^2)
K <- exp(-1*K/sigma2)
}

# Nystroem Approximation Kernel Ridge Regression
nakr <-
function(X, y, sigma2=NULL, lambda=1E-8, landmarks=NULL, nb.landmarks=NULL)
function(X, y, sigma2=NULL, lambdas=NULL, nb.landmarks=NULL)
{
set.seed(1123)
X <- as.matrix(X)
n <- nrow(X)
p <- ncol(X)

if(is.null(sigma2)) { sigma2 <- p }

ldm <- landmarks.nakr(X, landmarks, nb.landmarks)

if(is.null(nb.landmarks)) { nb.landmarks <- 15*n^(1/3) }
X <- scale(X)
y <- scale(y)

C <- gausskernel.nakr(X, as.matrix(X[ldm$idx,]), sigma2)
K11 <- C[ldm$idx,]

svdK11 <- svd(K11)
# K11 often ill-formed -> drop small sv
ks <- which(svdK11$d < 1E-12)
if (length(ks)>0) {k <- ks[1]} else {k <- length(svdK11$d)}

US <- svdK11$u[,1:k] %*% diag(1 / sqrt(svdK11$d[1:k]))
L <- C %*% US
Ginv <- t(L) %*% L
diag(Ginv) <- diag(Ginv) + lambda
Ginv <- chol2inv(chol(Ginv))
Ginv <- L %*% Ginv %*% t(L)
Ginv <- - Ginv / lambda
diag(Ginv) <- diag(Ginv) + (1/lambda)
coef <- Ginv %*% y

K11inv <- svdK11$v[,1:k] %*% diag(1/svdK11$d[1:k]) %*% t(svdK11$u[,1:k])
beta <- K11inv %*% t(C) %*% coef

r <- list(X=X,
y=y,
idx.landmarks <- sample(1:n, nb.landmarks, replace = FALSE)
S <- X[idx.landmarks, ]
K11 <- gausskernel.nakr(S, S, sigma2)
C <- gausskernel.nakr(X, S, sigma2)
svd.K11 <- svd(K11)
ks <- which(svd.K11$d < 1E-12) # K11 often ill-formed -> drop small sv
if (length(ks)>0) {k <- ks[1]} else {k <- length(svd.K11$d)}
W <- svd.K11$u[,1:k] %*% diag(1/sqrt(svd.K11$d[1:k]))
Phi <- C %*% W
ridge <- ridge(Phi, y, lambdas)
r <- list(center.X=attr(X,"scaled:center"),
scale.X=attr(X,"scaled:scale"),
center.y=attr(y,"scaled:center"),
scale.y=attr(y,"scaled:scale"),
S=S,
sigma2=sigma2,
lambda=lambda,
ldmidx=ldm$idx,
coef=coef,
beta=beta
W=W,
ridge=ridge
)
class(r) <- "nakr"
return(r)
}

landmarks.nakr <-
function(X, landmarks, nb.landmarks)
{
n <- nrow(X)
if(is.null(landmarks)) {
if(is.null(nb.landmarks)) { nb.landmarks <- round(sqrt(n)) }
ldmidx <- sample(1:n, nb.landmarks, replace = FALSE)
} else {
ldmidx <- which(rownames(X) %in% as.character(landmarks))
}
ldmidx <- sort(ldmidx)
ldmnms <- as.numeric(rownames(X)[ldmidx])
return(list(idx=ldmidx, nms=ldmnms))
}

predict.nakr <-
function(o, newdata)
{
@@ -83,42 +41,30 @@ function(o, newdata)
UseMethod("predict")
return(invisible(NULL))
}
newdata <- as.matrix(newdata)
if(ncol(o$X)!=ncol(newdata)) {
stop("Not the same number of variables btwn fitted nakr object and new data")
}
newdata <- scale(newdata,center=attr(o$X,"scaled:center"),
scale=attr(o$X,"scaled:scale"))
Ktest <- gausskernel.nakr(newdata, as.matrix(o$X[o$ldmidx,]), o$sigma2)
yh <- Ktest %*% o$beta
yh <- (yh * attr(o$y,"scaled:scale")) + attr(o$y,"scaled:center")
test <- as.matrix(newdata)
test <- scale(test,center=o$center.X,scale=o$scale.X)
K.test <- gausskernel.nakr(test, o$S, o$sigma2)
Phi.test <- K.test %*% o$W
yh <- predict(o$ridge, Phi.test)
yh <- yh * o$scale.y + o$center.y
}

kfold.nakr <-
function(X, y, K=5, lambdas=NULL, sigma2=NULL, landmarks=NULL, nb.landmarks=NULL)
# compute the gaussian kernel between each row of X1 and each row of X2
# should be done more efficiently (C code, threads)
gausskernel.nakr <-
function(X1, X2, sigma2)
{
if(is.null(lambdas)) { lambdas <- 10^seq(-8, 2, by=1) }

n <- nrow(X)
folds <- rep_len(1:K, n)
folds <- sample(folds, n)
maes <- matrix(data = NA, nrow = K, ncol = length(lambdas))
colnames(maes) <- lambdas
lambda_idx <- 1
ldm <- landmarks.nakr(X, landmarks, nb.landmarks)
for(lambda in lambdas) {
for(k in 1:K) {
fold <- folds == k
ldmnms2keep <- ldm$nms[! ldm$idx %in% which(fold)]
nakrm <- nakr(X[!fold,], y[!fold], sigma2, lambda, landmarks=ldmnms2keep)
pred <- predict(nakrm, X[fold,])
maes[k,lambda_idx] <- mean(abs(pred - y[fold]))
print(paste("lbd =", lambda, "; k =", k, "; mae =", maes[k,lambda_idx]))
}
lambda_idx <- lambda_idx + 1
}
mmaes <- colMeans(maes)
minmmaes <- min(mmaes)
bestlambda <- lambdas[which(mmaes == minmmaes)]
nakrm <- nakr(X, y, sigma2, bestlambda, landmarks=ldm$nms)
if(is(X1,"vector"))
X1 <- as.matrix(X1)
if(is(X2,"vector"))
X2 <- as.matrix(X2)
if (!(dim(X1)[2]==dim(X2)[2]))
stop("X1 and X2 must have the same number of columns")
n1 <- dim(X1)[1]
n2 <- dim(X2)[1]
dotX1 <- rowSums(X1*X1)
dotX2 <- rowSums(X2*X2)
res <- X1%*%t(X2)
for(i in 1:n2) res[,i] <- exp((2*res[,i] - dotX1 - rep(dotX2[i],n1))/sigma2)
return(res)
}

@@ -4,7 +4,7 @@ author: "Pierre-Edouard Portier"
documentclass: book
geometry: margin=2cm
fontsize: 12pt
date: "5 Mar 2023"
date: "19 Mar 2023"
toc: true
classoption: fleqn
bibliography: intro_to_ml.bib

pad.R
@@ -27,29 +27,3 @@
# adj = 0, cex = 0.6)
# points(0, 0, pch = 3)
# ```

# source("19_b_nystroem_approximation_housing_experiment_code.R")
# rdat <- hous.dat.nakr$dat[sample(nrow(hous.dat.nakr$dat), size=2000, replace=FALSE),]
# X <- rdat[,!(colnames(rdat) %in% c('median_house_value'))]
# Y <- rdat[,c('median_house_value')]
# names(Y) <- rownames(X)
# rsplt <- splitdata(list(X = X, Y = Y), 0.8)
# X.entr <- rsplt$entr$X
# Y.entr <- rsplt$entr$Y
# X.test <- rsplt$test$X
# Y.test <- rsplt$test$Y
# source("18_kernel_ridge_regression_code.R")
# krm <- krr(X.entr, Y.entr)
# krm.yh <- predict(krm, X.test)
# krm.mae <- mean(abs(krm.yh - Y.test)) # 35445.1
# nakrm <- nakr(X.entr, Y.entr, nb.landmarks=1600)
# nakrm.yh <- predict(nakrm, X.test)
# nakrm.mae <- mean(abs(nakrm.yh - Y.test)) # 65454.18
# source("15_loocv_code.R")
# rm <- ridge(X.entr, Y.entr)
# rm.yh <- predict(rm, X.test)
# rm.mae <- mean(abs(rm.yh - Y.test)) # 45786.62
# library(randomForest)
# rfm <- randomForest(X.entr, Y.entr)
# rfm.yh <- predict(rfm, X.test)
# rfm.mae <- mean(abs(rfm.yh - Y.test)) # 34229.02