From 3c83c256b5a965f6e31f32ba8be746068f2aa9df Mon Sep 17 00:00:00 2001
From: Pierre-Edouard Portier <p6e7p7@freeshell.org>
Date: Mon, 13 Mar 2023 13:01:40 +0100
Subject: [PATCH] progress on nystroem approximation implementation ; there is
 still a problem: for a small sample size the result of krr should be the same
 as the one of nystroem approx krr where the number of landmarks is equal to
 the number of training points. For now, it isn't.

---
 ...em_approximation_housing_experiment_code.R | 10 ++--
 19_nystroem_approximation_code.R              | 51 +++++++++++--------
 2 files changed, 36 insertions(+), 25 deletions(-)

diff --git a/19_b_nystroem_approximation_housing_experiment_code.R b/19_b_nystroem_approximation_housing_experiment_code.R
index 3c6d285..8af0a3c 100644
--- a/19_b_nystroem_approximation_housing_experiment_code.R
+++ b/19_b_nystroem_approximation_housing_experiment_code.R
@@ -99,13 +99,13 @@ X.entr <- hous.dat.nakr$entr$X
 Y.entr <- hous.dat.nakr$entr$Y
 X.test <- hous.dat.nakr$test$X
 Y.test <- hous.dat.nakr$test$Y
-#  hous.dat.ca <- datasetHousing.mca()
-#  hous.cam <- mca(hous.dat.ca)
-#  nb.landmarks <- round(sqrt(nrow(X.entr)))
-#  landmarks <- landmarks.by.ca.clst(hous.cam, X.entr, nb.landmarks)
+hous.dat.ca <- datasetHousing.mca()
+hous.cam <- mca(hous.dat.ca)
+nb.landmarks <- round(sqrt(nrow(X.entr)))
+landmarks <- landmarks.by.ca.clst(hous.cam, X.entr, nb.landmarks)
 #  nakrm <- kfold.nakr(X.entr, Y.entr, landmarks=landmarks)
 #  nakrm.yh <- predict(nakrm, X.test)
 #  nakrm.mae <- mean(abs(nakrm.yh - Y.test))
 #  nakrm.yh.train <- predict(nakrm, X.entr)
 #  rev(order(abs(nakrm.yh.train - Y.entr)))[1:20]
-#  hist(Y.entr[rev(order(abs(nakrm.yh.train - Y.entr)))[1:200]])
\ No newline at end of file
+#  hist(Y.entr[rev(order(abs(nakrm.yh.train - Y.entr)))[1:200]])
diff --git a/19_nystroem_approximation_code.R b/19_nystroem_approximation_code.R
index c0a534b..8bc05f0 100644
--- a/19_nystroem_approximation_code.R
+++ b/19_nystroem_approximation_code.R
@@ -1,5 +1,5 @@
 # compute the gaussian kernel between each row of X1 and each row of X2
-# should be done more efficiently
+# should be done more efficiently (C code, threads)
 gausskernel.nakr <-
 function(X1, X2, sigma2)
 {
@@ -14,7 +14,7 @@ function(X1, X2, sigma2)
 
 # Nystroem Approximation Kernel Ridge Regression
 nakr <-
-function(X, y, sigma2=NULL, lambda=1E-4, landmarks=NULL, nb.landmarks=NULL)
+function(X, y, sigma2=NULL, lambda=1E-8, landmarks=NULL, nb.landmarks=NULL)
 {
   X <- as.matrix(X)
   n <- nrow(X)
@@ -22,23 +22,16 @@ function(X, y, sigma2=NULL, lambda=1E-4, landmarks=NULL, nb.landmarks=NULL)
 
   if(is.null(sigma2)) { sigma2 <- p }
 
-  if(is.null(landmarks)) {
-    if(is.null(nb.landmarks)) { nb.landmarks <- round(sqrt(n)) }
-    splidx <- sample(1:n, nb.landmarks, replace = FALSE)
-  } else {
-    splidx <- which(rownames(X) %in% as.character(landmarks))
-    nb.landmarks <- length(splidx)
-  }
-  splidx <- sort(splidx)
+  ldm <- landmarks.nakr(X, landmarks, nb.landmarks)
 
   X <- scale(X)
   y <- scale(y)
 
-  C <- gausskernel.nakr(X, as.matrix(X[splidx,]), sigma2)
-  K11 <- C[splidx,]
+  C <- gausskernel.nakr(X, as.matrix(X[ldm$idx,]), sigma2)
+  K11 <- C[ldm$idx,]
 
   svdK11 <- svd(K11)
-  # K11 will often be ill-formed, thus we drop the bottom singular values
+  # K11 often ill-formed -> drop small sv
   ks <- which(svdK11$d < 1E-12)
   if (length(ks)>0) {k <- ks[1]} else {k <- length(svdK11$d)}
 
@@ -59,7 +52,7 @@ function(X, y, sigma2=NULL, lambda=1E-4, landmarks=NULL, nb.landmarks=NULL)
             y=y,
             sigma2=sigma2,
             lambda=lambda,
-            splidx=splidx,
+            ldmidx=ldm$idx,
             coef=coef,
             beta=beta
            )
@@ -67,6 +60,21 @@ function(X, y, sigma2=NULL, lambda=1E-4, landmarks=NULL, nb.landmarks=NULL)
   return(r)
 }
 
+landmarks.nakr <-
+function(X, landmarks, nb.landmarks)
+{
+  n <- nrow(X)
+  if(is.null(landmarks)) {
+    if(is.null(nb.landmarks)) { nb.landmarks <- round(sqrt(n)) }
+    ldmidx <- sample(1:n, nb.landmarks, replace = FALSE)
+  } else {
+    ldmidx <- which(rownames(X) %in% as.character(landmarks))
+  }
+  ldmidx <- sort(ldmidx)
+  ldmnms <- as.numeric(rownames(X)[ldmidx])
+  return(list(idx=ldmidx, nms=ldmnms))
+}
+
 predict.nakr <-
 function(o, newdata)
 {
@@ -81,7 +89,7 @@ function(o, newdata)
   }
   newdata <- scale(newdata,center=attr(o$X,"scaled:center"),
                    scale=attr(o$X,"scaled:scale"))
-  Ktest <- gausskernel.nakr(newdata, as.matrix(o$X[o$splidx,]), o$sigma2)
+  Ktest <- gausskernel.nakr(newdata, as.matrix(o$X[o$ldmidx,]), o$sigma2)
   yh <- Ktest %*% o$beta
   yh <- (yh * attr(o$y,"scaled:scale")) + attr(o$y,"scaled:center")
 }
@@ -91,23 +99,26 @@ function(X, y, K=5, lambdas=NULL, sigma2=NULL, landmarks=NULL, nb.landmarks=NULL
 {
   if(is.null(lambdas)) { lambdas <- 10^seq(-8, 2, by=1) }
 
-  N <- nrow(X)
-  folds <- rep_len(1:K, N)
-  folds <- sample(folds, N)
+  n <- nrow(X)
+  folds <- rep_len(1:K, n)
+  folds <- sample(folds, n)
   maes <- matrix(data = NA, nrow = K, ncol = length(lambdas))
   colnames(maes) <- lambdas
   lambda_idx <- 1
+  ldm <- landmarks.nakr(X, landmarks, nb.landmarks)
   for(lambda in lambdas) {
     for(k in 1:K) {
       fold <- folds == k
-      nakrm <- nakr(X[!fold,], y[!fold], sigma2, lambda, landmarks, nb.landmarks)
+      ldmnms2keep <- ldm$nms[! ldm$idx %in% which(fold)]
+      nakrm <- nakr(X[!fold,], y[!fold], sigma2, lambda, landmarks=ldmnms2keep)
       pred <- predict(nakrm, X[fold,])
       maes[k,lambda_idx] <- mean(abs(pred - y[fold]))
+      print(paste("lbd =", lambda, "; k =", k, "; mae =", maes[k,lambda_idx]))
     }
     lambda_idx <- lambda_idx + 1
   }
   mmaes <- colMeans(maes)
   minmmaes <- min(mmaes)
   bestlambda <- lambdas[which(mmaes == minmmaes)]
-  nakrm <- nakr(X, y, sigma2, bestlambda, landmarks, nb.landmarks)
+  nakrm <- nakr(X, y, sigma2, bestlambda, landmarks=ldm$nms)
 }