Nystroem approximation is now correct (and simplified).

Pierre-Edouard Portier 2023-03-19 18:48:47 +01:00
parent 3c83c256b5
commit 5689bf6fef
5 changed files with 69 additions and 231 deletions


@@ -1,8 +1,7 @@
source("05_d_svd_mca_code.R")
source("04_validation_croisee_code.R")
source("15_loocv_code.R")
source("19_nystroem_approximation_code.R")
datasetHousing.nakr <-
dataset.housing <-
function()
{
dat <- read.csv(file="data/housing.csv", header=TRUE)
@@ -44,68 +43,24 @@ function()
return(r)
}
# test.tbl <- table(c(4,2,4,2,1,1,1))
# all(intersperse(test.tbl) == c(1, 2, 4, 1, 2, 4, 1))
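# intersperse takes a table of counts and returns its values interleaved:
# it cycles repeatedly over the table entries, emitting each value once per
# pass while its remaining count is non-zero, until all counts are exhausted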
intersperse <-
function(tbl)
{
n <- sum(tbl)
values <- as.numeric(names(tbl))
r <- numeric(n)
i <- 1
while(i <= n)
{
for(j in 1:length(tbl))
{
if(tbl[j] != 0)
{
r[i] <- values[j]
i <- i + 1
tbl[j] <- tbl[j] - 1
}
}
}
return(r)
}
print("load dataset")
dat <- dataset.housing()
# sample the landmarks from the clusters of individuals
# obtained after correspondence analysis
landmarks.by.ca.clst <-
function(cam, X, nbLandmarks)
{
if(nbLandmarks > nrow(X))
{
stop("The number of landmarks must be less than the number of training samples.")
}
landmarks <- numeric(nbLandmarks)
clst <- cam$clsti$cluster[as.numeric(rownames(X))]
clst.tbl <- table(clst)
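# interleave the cluster ids and keep the first nbLandmarks of them;
# the resulting counts say how many landmarks to draw from each cluster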
nb.by.clst <- table((intersperse(clst.tbl))[1:nbLandmarks])
clst.id <- as.numeric(names(nb.by.clst))
set.seed(1123)
clst <- sample(clst)
offset <- 0
for (i in 1:length(nb.by.clst))
{
k <- as.numeric(nb.by.clst[i])
landmarks[(offset+1):(offset+k)] <- as.numeric(names((clst[clst==clst.id[i]])[1:k]))
offset <- offset+k
}
return(landmarks)
}
print("Nystroem approx ridge regression")
nakrm <- nakr(dat$entr$X, dat$entr$Y)
nakrm.yh <- predict(nakrm, dat$test$X)
nakrm.mae <- mean(abs(nakrm.yh - dat$test$Y))
print(paste("MAE for Nystroem: ", nakrm.mae))
hous.dat.nakr <- datasetHousing.nakr()
X.entr <- hous.dat.nakr$entr$X
Y.entr <- hous.dat.nakr$entr$Y
X.test <- hous.dat.nakr$test$X
Y.test <- hous.dat.nakr$test$Y
hous.dat.ca <- datasetHousing.mca()
hous.cam <- mca(hous.dat.ca)
nb.landmarks <- round(sqrt(nrow(X.entr)))
landmarks <- landmarks.by.ca.clst(hous.cam, X.entr, nb.landmarks)
# nakrm <- kfold.nakr(X.entr, Y.entr, landmarks=landmarks)
# nakrm.yh <- predict(nakrm, X.test)
# nakrm.mae <- mean(abs(nakrm.yh - Y.test))
# nakrm.yh.train <- predict(nakrm, X.entr)
# rev(order(abs(nakrm.yh.train - Y.entr)))[1:20]
# hist(Y.entr[rev(order(abs(nakrm.yh.train - Y.entr)))[1:200]])
print("Linear ridge regression")
rm <- ridge(dat$entr$X, dat$entr$Y)
rm.yh <- predict(rm, dat$test$X)
rm.mae <- mean(abs(rm.yh - dat$test$Y))
print(paste("MAE for Ridge: ", rm.mae))
print("Random forest")
library(randomForest)
rfm <- randomForest(dat$entr$X, dat$entr$Y)
rfm.yh <- predict(rfm, dat$test$X)
rfm.mae <- mean(abs(rfm.yh - dat$test$Y))
print(paste("MAE for Random Forest: ", rfm.mae))


@@ -3,6 +3,7 @@
```{r, include=FALSE}
source("01_intro_code.R", local = knitr::knit_global())
source("04_validation_croisee_code.R", local = knitr::knit_global())
source("15_loocv_code.R", local = knitr::knit_global())
source("19_nystroem_approximation_code.R", local = knitr::knit_global())
```
@@ -103,7 +104,7 @@ $$
\end{aligned}
$$
Thus, we can introduce a matrix $\boldsymbol\Phi$ of dimension $m$ such that $\mathbf{K} \approx \boldsymbol\Phi \boldsymbol\Phi^T$.
Thus, we can introduce a matrix $\boldsymbol\Phi$ of rank $m$ such that $\mathbf{K} \approx \boldsymbol\Phi \boldsymbol\Phi^T$.
$$
\boldsymbol\Phi =
\left[
@@ -162,45 +163,7 @@ $$
\end{aligned}
$$
## Kernel ridge regression and Nyström approximation
In the setting of kernel ridge regression (see an earlier module), we write $\mathbf{G} = \mathbf{K} + \lambda\mathbf{I_n}$. The coefficients of the ridge model are then given by $\boldsymbol\alpha_\lambda = \mathbf{G}^{-1} \mathbf{y}$. We want to compute $\mathbf{G}^{-1}$ efficiently from a rank-$m$ Nyström approximation $\mathbf{K} \approx \mathbf{L}\mathbf{L}^T$. To do so, we use a form of the Woodbury identity:
$$
\left(\mathbf{A} + \mathbf{U}\mathbf{C}\mathbf{V}\right)^{-1} = \mathbf{A}^{-1} - \mathbf{A}^{-1}\mathbf{U}\left(\mathbf{C}^{-1} + \mathbf{V}\mathbf{A}^{-1}\mathbf{U}\right)^{-1}\mathbf{V} \mathbf{A}^{-1}
$$
We thus obtain:
$$
\begin{aligned}
& \mathbf{G}^{-1} \\
= \{& \text{Definition of $\mathbf{G}$} \} \\
& \left(\lambda\mathbf{I_n} + \mathbf{K}\right)^{-1} \\
\approx \{& \text{Nyström approximation} \} \\
& \left(\lambda\mathbf{I_n} + \mathbf{L}\mathbf{L}^T\right)^{-1} \\
= \{& \text{Woodbury identity} \} \\
& \lambda^{-1}\mathbf{I_n} - \lambda^{-1}\mathbf{L}\left(\mathbf{I_m}+\lambda^{-1}\mathbf{L}^T\mathbf{L}\right)^{-1}\lambda^{-1}\mathbf{L}^T \\
= \{& \text{Linear algebra: if $\mathbf{A}$ and $\mathbf{B}$ are invertible square matrices, then $(\mathbf{A}\mathbf{B})^{-1} = \mathbf{B}^{-1}\mathbf{A}^{-1}$} \} \\
& \lambda^{-1}\mathbf{I_n} - \lambda^{-1}\mathbf{L}\left(\lambda\mathbf{I_m}+\mathbf{L}^T\mathbf{L}\right)^{-1}\mathbf{L}^T \\
\end{aligned}
$$
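The sketch below is a minimal numerical check of this expression; it is not part of the module's code, and `n`, `m` and `lambda` are arbitrary illustrative values. It compares the Woodbury-based formula with a direct inversion of $\lambda\mathbf{I_n} + \mathbf{L}\mathbf{L}^T$.
```{r}
# compare the Woodbury-based expression with a direct inversion (illustrative sizes)
set.seed(1123)
n <- 50 ; m <- 5 ; lambda <- 1E-2
L <- matrix(rnorm(n*m), nrow=n, ncol=m)
G <- lambda*diag(n) + L %*% t(L)
Ginv.woodbury <- diag(n)/lambda - (L %*% solve(lambda*diag(m) + t(L) %*% L, t(L)))/lambda
max(abs(solve(G) - Ginv.woodbury)) # close to machine precision
```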
Let us show how to compute efficiently the prediction $\hat{y}_{new}$ for a new observation $\mathbf{x_{new}}$. We showed above that the Nyström approximation can be written $\mathbf{K} \approx \mathbf{C} \mathbf{K_{11}}^{-1} \mathbf{C}^T$. Hence, the row of the approximated kernel $\mathbf{K}$ that corresponds to a training observation $\mathbf{x_j}$ (whose exact version is $[k(\mathbf{x_j},\mathbf{x_1}),\dots,k(\mathbf{x_j},\mathbf{x_n})]$) is obtained as a linear combination of the rows of $\mathbf{K_{11}}^{-1} \mathbf{C}^T$; the $m$ coefficients of this linear combination are $(k(\mathbf{x_j},\mathbf{x_1}),\dots,k(\mathbf{x_j},\mathbf{x_m}))$.
Thus, the approximation of a "new row" of the kernel $\mathbf{K}$, made of the similarities between a new observation $\mathbf{x_{new}}$ and the training observations $(\mathbf{x_1},\dots,\mathbf{x_n})$, is:
$$[k(\mathbf{x_{new}},\mathbf{x_1}),\dots,k(\mathbf{x_{new}},\mathbf{x_n})] \approx [k(\mathbf{x_{new}},\mathbf{x_1}),\dots,k(\mathbf{x_{new}},\mathbf{x_m})] \mathbf{K_{11}}^{-1} \mathbf{C}^T$$
Finally, the prediction associated with $\mathbf{x_{new}}$, computed from the approximated kernel, is:
$$
\begin{aligned}
& \hat{y}_{new} \\
\approx \{& \text{see the reasoning above, and } \hat{y} = \mathbf{K} \boldsymbol\alpha_\lambda \} \\
& \left[ k(\mathbf{x_{new}},\mathbf{x_1}),\dots,k(\mathbf{x_{new}},\mathbf{x_m}) \right] \left( \mathbf{K_{11}}^{-1} \mathbf{C}^T \boldsymbol\alpha_\lambda \right) \\
= \{& \text{introducing $\boldsymbol\beta_\lambda = \mathbf{K_{11}}^{-1} \mathbf{C}^T \boldsymbol\alpha_\lambda$} \} \\
& \left[ k(\mathbf{x_{new}},\mathbf{x_1}),\dots,k(\mathbf{x_{new}},\mathbf{x_m}) \right] \boldsymbol\beta_\lambda \\
\end{aligned}
$$
$\boldsymbol\beta_\lambda$ can be computed once, at the end of training, and then reused for every prediction.
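As an illustration, here is a hedged sketch, not part of the module's code: it assumes the Gaussian-kernel helper `gausskernel.nakr` defined in `19_nystroem_approximation_code.R` and small synthetic data, computes $\boldsymbol\beta_\lambda$ once after training, and then predicts a new point from its $m$ landmark similarities only.
```{r}
# illustrative sketch: precompute beta once, then predict from m similarities only
set.seed(1123)
n <- 100 ; m <- 10 ; sigma2 <- 1 ; lambda <- 1E-2
X <- matrix(rnorm(n), ncol=1)
y <- sin(X[,1]) + rnorm(n, sd=0.1)
S <- matrix(seq(min(X), max(X), length.out=m), ncol=1) # landmarks, evenly spaced here so that K11 stays well conditioned
C <- gausskernel.nakr(X, S, sigma2)                    # n x m
K11 <- gausskernel.nakr(S, S, sigma2)                  # m x m
K.approx <- C %*% solve(K11, t(C))                     # Nystroem approximation of K
alpha <- solve(K.approx + lambda*diag(n), y)           # ridge coefficients on the approximated kernel
beta <- solve(K11, t(C) %*% alpha)                     # computed once, at the end of training
x.new <- matrix(0.5, ncol=1)
k.new <- gausskernel.nakr(x.new, S, sigma2)            # only m kernel evaluations at prediction time
y.new <- k.new %*% beta
```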
We see that the matrix $\mathbf{L}$ corresponds exactly to the matrix $\boldsymbol\Phi$ whose rows are the new $m$-dimensional variables such that $\boldsymbol\Phi \boldsymbol\Phi^T$ is a rank-$m$ approximation of $\mathbf{K}$. Thus, to perform kernel ridge regression with a Nyström approximation, it suffices to run a ridge regression on these transformed variables.
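A minimal check of this equivalence, reusing `C`, `K11`, `K.approx`, `alpha`, `lambda`, `m` and `y` from the previous sketch: with the singular value decomposition $\mathbf{K_{11}} = \mathbf{U}\mathbf{D}\mathbf{U}^T$, the feature matrix $\boldsymbol\Phi = \mathbf{C}\mathbf{U}\mathbf{D}^{-1/2}$ satisfies $\boldsymbol\Phi\boldsymbol\Phi^T = \mathbf{C}\mathbf{K_{11}}^{-1}\mathbf{C}^T$, and an ordinary ridge regression on $\boldsymbol\Phi$ gives the same fitted values as the kernel formulation.
```{r}
svd.K11 <- svd(K11)
Phi <- C %*% svd.K11$u %*% diag(1/sqrt(svd.K11$d))        # n x m Nystroem features
w <- solve(t(Phi) %*% Phi + lambda*diag(m), t(Phi) %*% y) # ordinary ridge on Phi
max(abs(Phi %*% w - K.approx %*% alpha))                  # same fitted values, up to numerical error
```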
## Example on a synthetic dataset
@@ -214,7 +177,7 @@ We revisit the synthetic dataset used since the first module
entr <- splitres$entr
test <- splitres$test
nakrm <- nakr(entr$X,entr$Y, nb.landmarks=25)
nakrm <- nakr(entr$X,entr$Y,nb.landmarks=25)
yh <- predict(nakrm,test$X)
plt(test,f)
points(test$X, yh, pch=4)


@@ -1,80 +1,38 @@
# compute the gaussian kernel between each row of X1 and each row of X2
# should be done more efficiently (C code, threads)
gausskernel.nakr <-
function(X1, X2, sigma2)
{
n1 <- dim(X1)[1]
n2 <- dim(X2)[1]
K <- matrix(nrow = n1, ncol = n2)
for(i in 1:n1)
for(j in 1:n2)
K[i,j] <- sum((X1[i,] - X2[j,])^2)
K <- exp(-1*K/sigma2)
}
# Nystroem Approximation Kernel Ridge Regression
nakr <-
function(X, y, sigma2=NULL, lambda=1E-8, landmarks=NULL, nb.landmarks=NULL)
function(X, y, sigma2=NULL, lambdas=NULL, nb.landmarks=NULL)
{
set.seed(1123)
X <- as.matrix(X)
n <- nrow(X)
p <- ncol(X)
if(is.null(sigma2)) { sigma2 <- p }
ldm <- landmarks.nakr(X, landmarks, nb.landmarks)
if(is.null(nb.landmarks)) { nb.landmarks <- round(15*n^(1/3)) } # default number of landmarks (rounded to an integer for sample())
X <- scale(X)
y <- scale(y)
C <- gausskernel.nakr(X, as.matrix(X[ldm$idx,]), sigma2)
K11 <- C[ldm$idx,]
svdK11 <- svd(K11)
# K11 is often ill-conditioned -> drop the smallest singular values
ks <- which(svdK11$d < 1E-12)
if (length(ks)>0) {k <- ks[1]-1} else {k <- length(svdK11$d)} # keep only the components before the first tiny singular value
US <- svdK11$u[,1:k] %*% diag(1 / sqrt(svdK11$d[1:k]))
L <- C %*% US
Ginv <- t(L) %*% L
diag(Ginv) <- diag(Ginv) + lambda
Ginv <- chol2inv(chol(Ginv))
Ginv <- L %*% Ginv %*% t(L)
Ginv <- - Ginv / lambda
diag(Ginv) <- diag(Ginv) + (1/lambda)
coef <- Ginv %*% y
K11inv <- svdK11$v[,1:k] %*% diag(1/svdK11$d[1:k]) %*% t(svdK11$u[,1:k])
beta <- K11inv %*% t(C) %*% coef
r <- list(X=X,
y=y,
idx.landmarks <- sample(1:n, nb.landmarks, replace = FALSE)
S <- X[idx.landmarks, ]
K11 <- gausskernel.nakr(S, S, sigma2)
C <- gausskernel.nakr(X, S, sigma2)
svd.K11 <- svd(K11)
ks <- which(svd.K11$d < 1E-12) # K11 is often ill-conditioned -> drop the smallest singular values
if (length(ks)>0) {k <- ks[1]-1} else {k <- length(svd.K11$d)} # keep only the components before the first tiny singular value
W <- svd.K11$u[,1:k] %*% diag(1/sqrt(svd.K11$d[1:k]))
Phi <- C %*% W
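# Phi %*% t(Phi) approximates K, so a plain ridge regression on the Nystroem features Phi replaces the kernel ridge regression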
ridge <- ridge(Phi, y, lambdas)
r <- list(center.X=attr(X,"scaled:center"),
scale.X=attr(X,"scaled:scale"),
center.y=attr(y,"scaled:center"),
scale.y=attr(y,"scaled:scale"),
S=S,
sigma2=sigma2,
lambda=lambda,
ldmidx=ldm$idx,
coef=coef,
beta=beta
W=W,
ridge=ridge
)
class(r) <- "nakr"
return(r)
}
landmarks.nakr <-
function(X, landmarks, nb.landmarks)
{
n <- nrow(X)
if(is.null(landmarks)) {
if(is.null(nb.landmarks)) { nb.landmarks <- round(sqrt(n)) }
ldmidx <- sample(1:n, nb.landmarks, replace = FALSE)
} else {
ldmidx <- which(rownames(X) %in% as.character(landmarks))
}
ldmidx <- sort(ldmidx)
ldmnms <- as.numeric(rownames(X)[ldmidx])
return(list(idx=ldmidx, nms=ldmnms))
}
predict.nakr <-
function(o, newdata)
{
@@ -83,42 +41,30 @@ function(o, newdata)
UseMethod("predict")
return(invisible(NULL))
}
newdata <- as.matrix(newdata)
if(ncol(o$X)!=ncol(newdata)) {
stop("Not the same number of variables btwn fitted nakr object and new data")
}
newdata <- scale(newdata,center=attr(o$X,"scaled:center"),
scale=attr(o$X,"scaled:scale"))
Ktest <- gausskernel.nakr(newdata, as.matrix(o$X[o$ldmidx,]), o$sigma2)
yh <- Ktest %*% o$beta
yh <- (yh * attr(o$y,"scaled:scale")) + attr(o$y,"scaled:center")
test <- as.matrix(newdata)
test <- scale(test,center=o$center.X,scale=o$scale.X)
K.test <- gausskernel.nakr(test, o$S, o$sigma2)
Phi.test <- K.test %*% o$W
yh <- predict(o$ridge, Phi.test)
yh <- yh * o$scale.y + o$center.y
}
kfold.nakr <-
function(X, y, K=5, lambdas=NULL, sigma2=NULL, landmarks=NULL, nb.landmarks=NULL)
# compute the gaussian kernel between each row of X1 and each row of X2
# should be done more efficiently (C code, threads)
gausskernel.nakr <-
function(X1, X2, sigma2)
{
if(is.null(lambdas)) { lambdas <- 10^seq(-8, 2, by=1) }
n <- nrow(X)
folds <- rep_len(1:K, n)
folds <- sample(folds, n)
maes <- matrix(data = NA, nrow = K, ncol = length(lambdas))
colnames(maes) <- lambdas
lambda_idx <- 1
ldm <- landmarks.nakr(X, landmarks, nb.landmarks)
for(lambda in lambdas) {
for(k in 1:K) {
fold <- folds == k
ldmnms2keep <- ldm$nms[! ldm$idx %in% which(fold)]
nakrm <- nakr(X[!fold,], y[!fold], sigma2, lambda, landmarks=ldmnms2keep)
pred <- predict(nakrm, X[fold,])
maes[k,lambda_idx] <- mean(abs(pred - y[fold]))
print(paste("lbd =", lambda, "; k =", k, "; mae =", maes[k,lambda_idx]))
}
lambda_idx <- lambda_idx + 1
}
mmaes <- colMeans(maes)
minmmaes <- min(mmaes)
bestlambda <- lambdas[which(mmaes == minmmaes)]
nakrm <- nakr(X, y, sigma2, bestlambda, landmarks=ldm$nms)
if(is(X1,"vector"))
X1 <- as.matrix(X1)
if(is(X2,"vector"))
X2 <- as.matrix(X2)
if (!(dim(X1)[2]==dim(X2)[2]))
stop("X1 and X2 must have the same number of columns")
n1 <- dim(X1)[1]
n2 <- dim(X2)[1]
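# use ||x1 - x2||^2 = ||x1||^2 - 2*<x1,x2> + ||x2||^2 so the kernel comes from one matrix product instead of an explicit double loop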
dotX1 <- rowSums(X1*X1)
dotX2 <- rowSums(X2*X2)
res <- X1%*%t(X2)
for(i in 1:n2) res[,i] <- exp((2*res[,i] - dotX1 - rep(dotX2[i],n1))/sigma2)
return(res)
}
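# quick check (illustrative random matrices; not part of the original file):
# the vectorized kernel matches a naive double loop
# A <- matrix(rnorm(12), nrow=4) ; B <- matrix(rnorm(9), nrow=3)
# K.vec <- gausskernel.nakr(A, B, sigma2=3)
# K.loop <- outer(1:4, 1:3, Vectorize(function(i,j) exp(-sum((A[i,]-B[j,])^2)/3)))
# max(abs(K.vec - K.loop)) # ~ 0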


@@ -4,7 +4,7 @@ author: "Pierre-Edouard Portier"
documentclass: book
geometry: margin=2cm
fontsize: 12pt
date: "5 Mar 2023"
date: "19 Mar 2023"
toc: true
classoption: fleqn
bibliography: intro_to_ml.bib

pad.R

@@ -27,29 +27,3 @@
# adj = 0, cex = 0.6)
# points(0, 0, pch = 3)
# ```
# source("19_b_nystroem_approximation_housing_experiment_code.R")
# rdat <- hous.dat.nakr$dat[sample(nrow(hous.dat.nakr$dat), size=2000, replace=FALSE),]
# X <- rdat[,!(colnames(rdat) %in% c('median_house_value'))]
# Y <- rdat[,c('median_house_value')]
# names(Y) <- rownames(X)
# rsplt <- splitdata(list(X = X, Y = Y), 0.8)
# X.entr <- rsplt$entr$X
# Y.entr <- rsplt$entr$Y
# X.test <- rsplt$test$X
# Y.test <- rsplt$test$Y
# source("18_kernel_ridge_regression_code.R")
# krm <- krr(X.entr, Y.entr)
# krm.yh <- predict(krm, X.test)
# krm.mae <- mean(abs(krm.yh - Y.test)) # 35445.1
# nakrm <- nakr(X.entr, Y.entr, nb.landmarks=1600)
# nakrm.yh <- predict(nakrm, X.test)
# nakrm.mae <- mean(abs(nakrm.yh - Y.test)) # 65454.18
# source("15_loocv_code.R")
# rm <- ridge(X.entr, Y.entr)
# rm.yh <- predict(rm, X.test)
# rm.mae <- mean(abs(rm.yh - Y.test)) # 45786.62
# library(randomForest)
# rfm <- randomForest(X.entr, Y.entr)
# rfm.yh <- predict(rfm, X.test)
# rfm.mae <- mean(abs(rfm.yh - Y.test)) # 34229.02