# compute the gaussian kernel between each row of X1 and each row of X2 # should be done more efficiently gausskernel.nakr <- function(X1, X2, sigma2) { n1 <- dim(X1)[1] n2 <- dim(X2)[1] K <- matrix(nrow = n1, ncol = n2) for(i in 1:n1) for(j in 1:n2) K[i,j] <- sum(X1[i,] - X2[j,])^2 K <- exp(-1*K/sigma2) } # Nystroem Approximation Kernel Ridge Regression nakr <- function(X, y, sigma2=NULL, lambda=1E-4, landmarks=NULL, nb.landmarks=NULL) { X <- as.matrix(X) n <- nrow(X) p <- ncol(X) if(is.null(sigma2)) { sigma2 <- p } if(is.null(landmarks)) { if(is.null(nb.landmarks)) { nb.landmarks <- round(sqrt(n)) } splidx <- sample(1:n, nb.landmarks, replace = FALSE) } else { splidx <- which(rownames(X) %in% as.character(landmarks)) nb.landmarks <- length(splidx) } splidx <- sort(splidx) X <- scale(X) y <- scale(y) C <- gausskernel.nakr(X, as.matrix(X[splidx,]), sigma2) K11 <- C[splidx,] svdK11 <- svd(K11) # K11 will often be ill-formed, thus we drop the bottom singular values k <- which(svdK11$d < 1E-12)[1] - 1 US <- svdK11$u[,1:k] %*% diag(1 / sqrt(svdK11$d[1:k])) L <- C %*% US Ginv <- t(L) %*% L diag(Ginv) <- diag(Ginv) + lambda Ginv <- chol2inv(chol(Ginv)) Ginv <- L %*% Ginv %*% t(L) Ginv <- - Ginv / lambda diag(Ginv) <- diag(Ginv) + (1/lambda) coef <- Ginv %*% y K11inv <- svdK11$v[,1:k] %*% diag(1/svdK11$d[1:k]) %*% t(svdK11$u[,1:k]) beta <- K11inv %*% t(C) %*% coef r <- list(X=X, y=y, sigma2=sigma2, lambda=lambda, splidx=splidx, coef=coef, beta=beta ) class(r) <- "nakr" return(r) } predict.nakr <- function(o, newdata) { if(class(o) != "nakr") { warning("Object is not of class 'nakr'") UseMethod("predict") return(invisible(NULL)) } newdata <- as.matrix(newdata) if(ncol(o$X)!=ncol(newdata)) { stop("Not the same number of variables btwn fitted nakr object and new data") } newdata <- scale(newdata,center=attr(o$X,"scaled:center"), scale=attr(o$X,"scaled:scale")) Ktest <- gausskernel.nakr(newdata, as.matrix(o$X[o$splidx,]), o$sigma2) yh <- Ktest %*% o$beta yh <- (yh * attr(o$y,"scaled:scale")) + attr(o$y,"scaled:center") } kfold.nakr <- function(X, y, K=5, lambdas=NULL, sigma2=NULL, landmarks=NULL, nb.landmarks=NULL) { if(is.null(lambdas)) { lambdas <- 10^seq(-8, 2, by=1) } N <- nrow(X) folds <- rep_len(1:K, N) folds <- sample(folds, N) maes <- matrix(data = NA, nrow = K, ncol = length(lambdas)) colnames(maes) <- lambdas lambda_idx <- 1 for(lambda in lambdas) { for(k in 1:K) { fold <- folds == k nakrm <- nakr(X[!fold,], y[!fold], sigma2, lambda, landmarks, nb.landmarks) pred <- predict(nakrm, X[fold,]) maes[k,lambda_idx] <- mean(abs(pred - y[fold])) } lambda_idx <- lambda_idx + 1 } mmaes <- colMeans(maes) minmmaes <- min(mmaes) bestlambda <- lambdas[which(mmaes == minmmaes)] nakrm <- nakr(X, y, sigma2, bestlambda, landmarks, nb.landmarks) }