# Compute the Gaussian kernel between each row of X1 and each row of X2.
# The double loop is O(n1 * n2 * p) and should be done more efficiently
# (vectorized, C code, threads).
gausskernel.nakr <-
function(X1, X2, sigma2)
{
    n1 <- dim(X1)[1]
    n2 <- dim(X2)[1]
    K <- matrix(nrow = n1, ncol = n2)
    for(i in 1:n1)
        for(j in 1:n2)
            K[i,j] <- sum((X1[i,] - X2[j,])^2)    # squared Euclidean distance
    K <- exp(-K / sigma2)
    return(K)
}
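# A vectorized sketch of the same computation (hypothetical helper, not used
# by the code below): it expands ||x - z||^2 = ||x||^2 + ||z||^2 - 2 * x'z to
# avoid the double loop, assuming X1 and X2 are numeric matrices with the
# same number of columns.
gausskernel.vec.nakr <-
function(X1, X2, sigma2)
{
    D2 <- outer(rowSums(X1^2), rowSums(X2^2), "+") - 2 * X1 %*% t(X2)
    D2[D2 < 0] <- 0    # clip tiny negatives caused by floating-point rounding
    return(exp(-D2 / sigma2))
}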
# Nystroem Approximation Kernel Ridge Regression: Gaussian-kernel ridge
# regression where the full kernel matrix is approximated from a subset
# of landmark points.
nakr <-
function(X, y, sigma2=NULL, lambda=1E-8, landmarks=NULL, nb.landmarks=NULL)
{
    X <- as.matrix(X)
    n <- nrow(X)
    p <- ncol(X)
    if(is.null(sigma2)) { sigma2 <- p }    # default bandwidth: nb of variables
    ldm <- landmarks.nakr(X, landmarks, nb.landmarks)
    X <- scale(X)
    y <- scale(y)
    # C: kernel between all rows and the landmarks; K11: landmarks only
    C <- gausskernel.nakr(X, as.matrix(X[ldm$idx,]), sigma2)
    K11 <- C[ldm$idx,]
    svdK11 <- svd(K11)
    # K11 is often ill-conditioned -> drop the small singular values
    # (they are sorted in decreasing order, so truncate just before the
    # first one below the threshold)
    ks <- which(svdK11$d < 1E-12)
    if (length(ks) > 0) { k <- ks[1] - 1 } else { k <- length(svdK11$d) }
    # Nystroem factorization: K ~ C K11^-1 t(C) = L t(L), with L = C U S^(-1/2)
    US <- svdK11$u[,1:k] %*% diag(1 / sqrt(svdK11$d[1:k]), k, k)
    L <- C %*% US
    # Woodbury identity:
    # (L t(L) + lambda I)^-1 = (I - L (t(L) L + lambda I)^-1 t(L)) / lambda
    Ginv <- t(L) %*% L
    diag(Ginv) <- diag(Ginv) + lambda
    Ginv <- chol2inv(chol(Ginv))
    Ginv <- L %*% Ginv %*% t(L)
    Ginv <- - Ginv / lambda
    diag(Ginv) <- diag(Ginv) + (1/lambda)
    coef <- Ginv %*% y
    # Landmark weights: predictions become k(x, landmarks) %*% beta
    K11inv <- svdK11$v[,1:k] %*% diag(1/svdK11$d[1:k], k, k) %*% t(svdK11$u[,1:k])
    beta <- K11inv %*% t(C) %*% coef
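    # Sanity-check sketch (commented out; an assumption for small n, not part
    # of the original code): the Woodbury route above should match a direct
    # solve on the approximate kernel,
    #   Khat <- L %*% t(L)
    #   coefd <- solve(Khat + diag(lambda, nrow(Khat)), y)
    # coef and coefd should then agree up to numerical error.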
    r <- list(X=X,
              y=y,
              sigma2=sigma2,
              lambda=lambda,
              ldmidx=ldm$idx,
              coef=coef,
              beta=beta
              )
    class(r) <- "nakr"
    return(r)
}
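# Minimal usage sketch (hypothetical data, commented out; assumes X carries
# numeric rownames, which landmarks.nakr below relies on):
#   set.seed(1)
#   X <- matrix(rnorm(500 * 3), ncol = 3); rownames(X) <- 1:nrow(X)
#   y <- sin(X[,1]) + 0.1 * rnorm(500)
#   m <- nakr(X, y, nb.landmarks = 50)
#   yh <- predict(m, X)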
# Select the landmark rows: either a random subset of size nb.landmarks
# (default: round(sqrt(n))), or the rows whose names match 'landmarks'.
# Assumes X carries numeric rownames when landmarks are given by name.
landmarks.nakr <-
function(X, landmarks, nb.landmarks)
{
    n <- nrow(X)
    if(is.null(landmarks)) {
        if(is.null(nb.landmarks)) { nb.landmarks <- round(sqrt(n)) }
        ldmidx <- sample(1:n, nb.landmarks, replace = FALSE)
    } else {
        ldmidx <- which(rownames(X) %in% as.character(landmarks))
    }
    ldmidx <- sort(ldmidx)
    ldmnms <- as.numeric(rownames(X)[ldmidx])
    return(list(idx=ldmidx, nms=ldmnms))
}
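# Both selection modes, sketched (hypothetical calls on the X from the usage
# example above):
#   landmarks.nakr(X, landmarks=NULL, nb.landmarks=50)            # 50 random rows
#   landmarks.nakr(X, landmarks=c(3, 17, 42), nb.landmarks=NULL)  # by row name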
# Predict responses for new data from a fitted nakr model.
predict.nakr <-
function(o, newdata)
{
    if(!inherits(o, "nakr")) {
        warning("Object is not of class 'nakr'")
        UseMethod("predict")
    }
    newdata <- as.matrix(newdata)
    if(ncol(o$X) != ncol(newdata)) {
        stop("Not the same number of variables between fitted nakr object and new data")
    }
    # Apply the centering and scaling learned on the training data
    newdata <- scale(newdata, center=attr(o$X,"scaled:center"),
                     scale=attr(o$X,"scaled:scale"))
    Ktest <- gausskernel.nakr(newdata, as.matrix(o$X[o$ldmidx,]), o$sigma2)
    yh <- Ktest %*% o$beta
    # Map predictions back to the original scale of y
    yh <- (yh * attr(o$y,"scaled:scale")) + attr(o$y,"scaled:center")
    return(yh)
}
# K-fold cross-validation over a grid of lambdas; refits on the full data
# with the lambda minimizing the mean absolute error (MAE).
kfold.nakr <-
function(X, y, K=5, lambdas=NULL, sigma2=NULL, landmarks=NULL, nb.landmarks=NULL)
{
    if(is.null(lambdas)) { lambdas <- 10^seq(-8, 2, by=1) }
    n <- nrow(X)
    folds <- rep_len(1:K, n)
    folds <- sample(folds, n)
    maes <- matrix(data = NA, nrow = K, ncol = length(lambdas))
    colnames(maes) <- lambdas
    lambda_idx <- 1
    ldm <- landmarks.nakr(X, landmarks, nb.landmarks)
    for(lambda in lambdas) {
        for(k in 1:K) {
            fold <- folds == k
            # Keep only the landmarks lying in the training part of the fold
            ldmnms2keep <- ldm$nms[! ldm$idx %in% which(fold)]
            nakrm <- nakr(X[!fold,], y[!fold], sigma2, lambda, landmarks=ldmnms2keep)
            pred <- predict(nakrm, X[fold,])
            maes[k,lambda_idx] <- mean(abs(pred - y[fold]))
            print(paste("lbd =", lambda, "; k =", k, "; mae =", maes[k,lambda_idx]))
        }
        lambda_idx <- lambda_idx + 1
    }
    mmaes <- colMeans(maes)
    bestlambda <- lambdas[which.min(mmaes)]    # first minimizer in case of ties
    # Refit on the whole data set with the selected lambda
    nakrm <- nakr(X, y, sigma2, bestlambda, landmarks=ldm$nms)
    return(nakrm)
}
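# Cross-validation sketch (hypothetical, commented out; continues the usage
# example above):
#   m <- kfold.nakr(X, y, K=5)
#   m$lambda    # lambda selected by mean absolute error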