progress on nystroem approximation implementation ; there is still a problem: for a small sample size the result of krr should be the same as the one of nystroem approx krr where the number of landmarks is equal to the number of training points. For now, it isn't.

2023-03-13 13:01:40 +01:00 · 2023-03-13 13:01:40 +01:00 · 3c83c256b5
commit 3c83c256b5
parent ef6db6e85a
2 changed files with 36 additions and 25 deletions
--- a/19_b_nystroem_approximation_housing_experiment_code.R
+++ b/19_b_nystroem_approximation_housing_experiment_code.R
@ -99,13 +99,13 @@ X.entr <- hous.dat.nakr$entr$X
 Y.entr <- hous.dat.nakr$entr$Y
 X.test <- hous.dat.nakr$test$X
 Y.test <- hous.dat.nakr$test$Y
-#  hous.dat.ca <- datasetHousing.mca()
-#  hous.cam <- mca(hous.dat.ca)
-#  nb.landmarks <- round(sqrt(nrow(X.entr)))
-#  landmarks <- landmarks.by.ca.clst(hous.cam, X.entr, nb.landmarks)
+hous.dat.ca <- datasetHousing.mca()
+hous.cam <- mca(hous.dat.ca)
+nb.landmarks <- round(sqrt(nrow(X.entr)))
+landmarks <- landmarks.by.ca.clst(hous.cam, X.entr, nb.landmarks)
 #  nakrm <- kfold.nakr(X.entr, Y.entr, landmarks=landmarks)
 #  nakrm.yh <- predict(nakrm, X.test)
 #  nakrm.mae <- mean(abs(nakrm.yh - Y.test))
 #  nakrm.yh.train <- predict(nakrm, X.entr)
 #  rev(order(abs(nakrm.yh.train - Y.entr)))[1:20]
-#  hist(Y.entr[rev(order(abs(nakrm.yh.train - Y.entr)))[1:200]])
+#  hist(Y.entr[rev(order(abs(nakrm.yh.train - Y.entr)))[1:200]])
--- a/19_nystroem_approximation_code.R
+++ b/19_nystroem_approximation_code.R
@ -1,5 +1,5 @@
 # compute the gaussian kernel between each row of X1 and each row of X2
-# should be done more efficiently
+# should be done more efficiently (C code, threads)
 gausskernel.nakr <-
 function(X1, X2, sigma2)
 {
@ -14,7 +14,7 @@ function(X1, X2, sigma2)

 # Nystroem Approximation Kernel Ridge Regression
 nakr <-
-function(X, y, sigma2=NULL, lambda=1E-4, landmarks=NULL, nb.landmarks=NULL)
+function(X, y, sigma2=NULL, lambda=1E-8, landmarks=NULL, nb.landmarks=NULL)
 {
  X <- as.matrix(X)
  n <- nrow(X)
@ -22,23 +22,16 @@ function(X, y, sigma2=NULL, lambda=1E-4, landmarks=NULL, nb.landmarks=NULL)

  if(is.null(sigma2)) { sigma2 <- p }

-  if(is.null(landmarks)) {
-    if(is.null(nb.landmarks)) { nb.landmarks <- round(sqrt(n)) }
-    splidx <- sample(1:n, nb.landmarks, replace = FALSE)
-  } else {
-    splidx <- which(rownames(X) %in% as.character(landmarks))
-    nb.landmarks <- length(splidx)
-  }
-  splidx <- sort(splidx)
+  ldm <- landmarks.nakr(X, landmarks, nb.landmarks)

  X <- scale(X)
  y <- scale(y)

-  C <- gausskernel.nakr(X, as.matrix(X[splidx,]), sigma2)
-  K11 <- C[splidx,]
+  C <- gausskernel.nakr(X, as.matrix(X[ldm$idx,]), sigma2)
+  K11 <- C[ldm$idx,]

  svdK11 <- svd(K11)
-  # K11 will often be ill-formed, thus we drop the bottom singular values
+  # K11 often ill-formed -> drop small sv
  ks <- which(svdK11$d < 1E-12)
  if (length(ks)>0) {k <- ks[1]} else {k <- length(svdK11$d)}

@ -59,7 +52,7 @@ function(X, y, sigma2=NULL, lambda=1E-4, landmarks=NULL, nb.landmarks=NULL)
            y=y,
            sigma2=sigma2,
            lambda=lambda,
-            splidx=splidx,
+            ldmidx=ldm$idx,
            coef=coef,
            beta=beta
           )
@ -67,6 +60,21 @@ function(X, y, sigma2=NULL, lambda=1E-4, landmarks=NULL, nb.landmarks=NULL)
  return(r)
 }

+landmarks.nakr <-
+function(X, landmarks, nb.landmarks)
+{
+  n <- nrow(X)
+  if(is.null(landmarks)) {
+    if(is.null(nb.landmarks)) { nb.landmarks <- round(sqrt(n)) }
+    ldmidx <- sample(1:n, nb.landmarks, replace = FALSE)
+  } else {
+    ldmidx <- which(rownames(X) %in% as.character(landmarks))
+  }
+  ldmidx <- sort(ldmidx)
+  ldmnms <- as.numeric(rownames(X)[ldmidx])
+  return(list(idx=ldmidx, nms=ldmnms))
+}
+
 predict.nakr <-
 function(o, newdata)
 {
@ -81,7 +89,7 @@ function(o, newdata)
  }
  newdata <- scale(newdata,center=attr(o$X,"scaled:center"),
                   scale=attr(o$X,"scaled:scale"))
-  Ktest <- gausskernel.nakr(newdata, as.matrix(o$X[o$splidx,]), o$sigma2)
+  Ktest <- gausskernel.nakr(newdata, as.matrix(o$X[o$ldmidx,]), o$sigma2)
  yh <- Ktest %*% o$beta
  yh <- (yh * attr(o$y,"scaled:scale")) + attr(o$y,"scaled:center")
 }
@ -91,23 +99,26 @@ function(X, y, K=5, lambdas=NULL, sigma2=NULL, landmarks=NULL, nb.landmarks=NULL
 {
  if(is.null(lambdas)) { lambdas <- 10^seq(-8, 2, by=1) }

-  N <- nrow(X)
-  folds <- rep_len(1:K, N)
-  folds <- sample(folds, N)
+  n <- nrow(X)
+  folds <- rep_len(1:K, n)
+  folds <- sample(folds, n)
  maes <- matrix(data = NA, nrow = K, ncol = length(lambdas))
  colnames(maes) <- lambdas
  lambda_idx <- 1
+  ldm <- landmarks.nakr(X, landmarks, nb.landmarks)
  for(lambda in lambdas) {
    for(k in 1:K) {
      fold <- folds == k
-      nakrm <- nakr(X[!fold,], y[!fold], sigma2, lambda, landmarks, nb.landmarks)
+      ldmnms2keep <- ldm$nms[! ldm$idx %in% which(fold)]
+      nakrm <- nakr(X[!fold,], y[!fold], sigma2, lambda, landmarks=ldmnms2keep)
      pred <- predict(nakrm, X[fold,])
      maes[k,lambda_idx] <- mean(abs(pred - y[fold]))
+      print(paste("lbd =", lambda, "; k =", k, "; mae =", maes[k,lambda_idx]))
    }
    lambda_idx <- lambda_idx + 1
  }
  mmaes <- colMeans(maes)
  minmmaes <- min(mmaes)
  bestlambda <- lambdas[which(mmaes == minmmaes)]
-  nakrm <- nakr(X, y, sigma2, bestlambda, landmarks, nb.landmarks)
+  nakrm <- nakr(X, y, sigma2, bestlambda, landmarks=ldm$nms)
 }