intro_to_ml/19_nystroem_approximation_c...

# Compute the Gaussian (RBF) kernel between each row of X1 and each row of X2.
#
# Arguments:
#   X1     - numeric matrix (n1 x p), one observation per row
#   X2     - numeric matrix (n2 x p), one observation per row
#   sigma2 - bandwidth; squared distances are divided by it
#
# Returns the n1 x n2 matrix K with
#   K[i, j] = exp(-||X1[i, ] - X2[j, ]||^2 / sigma2)
gausskernel <-
function(X1, X2, sigma2)
{
  X1 <- as.matrix(X1)
  X2 <- as.matrix(X2)
  if(ncol(X1) != ncol(X2)) {
    stop("X1 and X2 must have the same number of columns")
  }

  # Squared Euclidean distances for all row pairs at once, using
  # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 * a.b (no explicit double loop).
  D2 <- outer(rowSums(X1^2), rowSums(X2^2), "+") - 2 * tcrossprod(X1, X2)
  # Keep the result a plain matrix even when the inputs carry row names.
  dimnames(D2) <- NULL
  # Round-off can push (near-)zero distances slightly below zero; clamp them.
  D2[D2 < 0] <- 0

  exp(-D2 / sigma2)
}

# Nystroem Approximation Kernel Ridge Regression
#
# Fits a Gaussian-kernel ridge regression in which the full n x n kernel
# matrix is replaced by a low-rank Nystroem approximation L %*% t(L) built
# from a subset of rows of X ("landmarks"). The ridge penalty is chosen from
# `lambdas` by minimum leave-one-out error.
#
# Arguments:
#   X       - training inputs, coerced with as.matrix(); one row per observation
#   y       - response, one value per row of X
#   sigma2  - kernel bandwidth passed to gausskernel(); defaults to ncol(X)
#   lambdas - candidate ridge penalties; defaults to 10^seq(-8, 2, by=0.5)
#   splidx  - row indices of X used as landmarks; sampled without
#             replacement when NULL
#   nspl    - number of landmarks to sample when splidx is NULL; defaults to
#             round(sqrt(n))
#
# Returns a list of class "nakr" with the scaled X and y (as returned by
# scale()), sigma2, the length-n coefficient vector for the selected lambda,
# the minimum leave-one-out error (looe) and the selected lambda.
nakr <-
function(X, y, sigma2=NULL, lambdas=NULL, splidx=NULL, nspl=NULL)
{
  X <- as.matrix(X)
  n <- nrow(X)
  p <- ncol(X)

  if(NROW(y) != n) {
    stop("y must have one value per row of X")
  }

  if(is.null(lambdas)) { lambdas <- 10^seq(-8, 2,by=0.5) }
  if(is.null(sigma2)) { sigma2 <- p }

  if(is.null(splidx)) {
    if(is.null(nspl)) { nspl <- round(sqrt(n)) }
    splidx <- sample.int(n, nspl, replace = FALSE)
  } else {
    nspl <- length(splidx)
  }
  splidx <- sort(splidx)

  X <- scale(X)
  y <- scale(y)

  # drop = FALSE keeps these as matrices even with a single landmark
  C <- gausskernel(X, X[splidx, , drop = FALSE], sigma2)
  K11 <- C[splidx, , drop = FALSE]

  svdK11 <- svd(K11)
  # K11 is often ill-conditioned, thus we drop the bottom singular values
  # and keep (about) the top 80%, but always at least one.
  k <- max(1L, floor(0.8 * nspl))
  keep <- seq_len(k)
  # Divide column j of U by sqrt(d[j]). Same as U %*% diag(1 / sqrt(d)), but
  # also correct for k == 1, where diag() of a single number would build an
  # identity matrix instead of a 1 x 1 scaling matrix.
  US <- sweep(svdK11$u[, keep, drop = FALSE], 2, sqrt(svdK11$d[keep]), "/")
  L <- C %*% US
  LtL <- crossprod(L)
  Lty <- crossprod(L, y)

  nlambda <- length(lambdas)
  looe <- double(nlambda)
  coef <- matrix(data = NA, nrow = n, ncol = nlambda)
  for(i in seq_len(nlambda)) {
    lambda <- lambdas[i]
    A <- LtL
    diag(A) <- diag(A) + lambda
    # Woodbury identity:
    #   Ginv = (L L' + lambda I)^-1 = (I - L (L'L + lambda I)^-1 L') / lambda
    # Only Ginv %*% y and diag(Ginv) are needed, so the n x n matrix is
    # never formed explicitly.
    LAinv <- L %*% solve(A)
    coef[,i] <- (y - LAinv %*% Lty) / lambda
    Ginv.diag <- (1 - rowSums(LAinv * L)) / lambda
    looe[i] <- mean((coef[,i]/Ginv.diag)^2)
  }

  # which.min() returns exactly one index (the first on ties) and skips
  # NA/NaN entries, so lambda and coef always describe a single model.
  best <- which.min(looe)
  if(length(best) == 0) {
    stop("Leave-one-out error could not be computed for any lambda")
  }

  r <- list(X=X,
            y=y,
            sigma2=sigma2,
            coef=coef[,best],
            looe=looe[best],
            lambda=lambdas[best]
           )
  class(r) <- "nakr"
  r
}

# Predict responses for new observations from a fitted "nakr" model.
#
# Arguments:
#   o       - object of class "nakr" as returned by nakr()
#   newdata - new inputs, coerced with as.matrix(); must have the same number
#             of columns as the training inputs stored in o$X
#
# Returns a one-column matrix of predictions, mapped back to the original
# scale of y using the centering and scaling stored on o$y.
predict.nakr <-
function(o, newdata)
{
  # inherits() also accepts objects carrying additional classes, where
  # class(o) != "nakr" would produce a condition of length > 1.
  if(!inherits(o, "nakr")) {
    warning("Object is not of class 'nakr'")
    # Re-dispatch on the actual class of `o`. UseMethod() does not return,
    # so nothing after this call is evaluated.
    UseMethod("predict")
  }
  newdata <- as.matrix(newdata)
  if(ncol(o$X)!=ncol(newdata)) {
    stop("Not the same number of variables btwn fitted nakr object and new data")
  }
  # Apply the training centering/scaling to the new inputs
  newdata <- scale(newdata,center=attr(o$X,"scaled:center"),
                   scale=attr(o$X,"scaled:scale"))
  Ktest <- gausskernel(newdata, o$X, o$sigma2)
  yh <- Ktest %*% o$coef
  # Undo the scaling of y
  (yh * attr(o$y,"scaled:scale")) + attr(o$y,"scaled:center")
}
implementation of nystroem approx 2023-01-16 23:17:52 +00:00			`# compute the gaussian kernel between each row of X1 and each row of X2`
			`# should be done more efficiently`
			`gausskernel <-`
			`function(X1, X2, sigma2)`
			`{`
			`n1 <- dim(X1)[1]`
			`n2 <- dim(X2)[1]`
			`K <- matrix(nrow = n1, ncol = n2)`
			`for(i in 1:n1)`
			`for(j in 1:n2)`
			`K[i,j] <- sum(X1[i,] - X2[j,])^2`
			`K <- exp(-1*K/sigma2)`
			`}`

			`# Nystroem Approximation Kernel Ridge Regression`
			`nakr <-`
			`function(X, y, sigma2=NULL, lambdas=NULL, splidx=NULL, nspl=NULL)`
			`{`
			`X <- as.matrix(X)`
			`n <- nrow(X)`
			`p <- ncol(X)`

			`if(is.null(lambdas)) { lambdas <- 10^seq(-8, 2,by=0.5) }`
			`if(is.null(sigma2)) { sigma2 <- p }`

			`if(is.null(splidx)) {`
			`if(is.null(nspl)) { nspl <- round(sqrt(n)) }`
			`splidx <- sample(1:n, nspl, replace = FALSE)`
			`} else {`
			`nspl <- length(splidx)`
			`}`
			`splidx <- sort(splidx)`

			`X <- scale(X)`
			`y <- scale(y)`

			`C <- gausskernel(X, as.matrix(X[splidx,]), sigma2)`
			`K11 <- C[splidx,]`

			`svdK11 <- svd(K11)`
			`# K11 will often be ill-formed, thus we drop the bottom singular values`
			`k <- 0.8 * nspl`
			`US <- svdK11$u[,1:k] %*% diag(1 / sqrt(svdK11$d[1:k]))`
			`L <- C %*% US`
			`LtL <- t(L) %*% L`

			`looe <- double(length(lambdas))`
			`coef <- matrix(data = NA, nrow = n, ncol = length(lambdas))`
			`i <- 1`
			`for(lambda in lambdas) {`
			`Ginv <- LtL`
			`diag(Ginv) <- diag(Ginv) + lambda`
			`Ginv <- solve(Ginv)`
			`Ginv <- L %*% Ginv %*% t(L)`
			`Ginv <- - Ginv / lambda`
			`diag(Ginv) <- diag(Ginv) + (1/lambda)`
			`coef[,i] <- Ginv %*% y`
			`looe[i] <- mean((coef[,i]/diag(Ginv))^2)`
			`i <- i+1`
			`}`
			`looe.min <- min(looe)`
			`lambda <- lambdas[which(looe == looe.min)]`
			`coef <- coef[,which(looe == looe.min)]`

			`r <- list(X=X,`
			`y=y,`
			`sigma2=sigma2,`
			`coef=coef,`
			`looe=looe.min,`
			`lambda=lambda`
			`)`
			`class(r) <- "nakr"`
			`return(r)`
			`}`

			`predict.nakr <-`
			`function(o, newdata)`
			`{`
			`if(class(o) != "nakr") {`
			`warning("Object is not of class 'nakr'")`
			`UseMethod("predict")`
			`return(invisible(NULL))`
			`}`
			`newdata <- as.matrix(newdata)`
			`if(ncol(o$X)!=ncol(newdata)) {`
			`stop("Not the same number of variables btwn fitted nakr object and new data")`
			`}`
			`newdata <- scale(newdata,center=attr(o$X,"scaled:center"),`
			`scale=attr(o$X,"scaled:scale"))`
			`Ktest <- gausskernel(newdata, o$X, o$sigma2)`
			`yh <- Ktest %*% o$coef`
			`yh <- (yh * attr(o$y,"scaled:scale")) + attr(o$y,"scaled:center")`
			`}`