one hot encoding function / correspondence analysis based landmarks for nystroem approx / experiment on housing dataset

2023-01-22 21:27:38 +01:00 · 2023-01-22 21:27:38 +01:00 · d3639fe2ef
commit d3639fe2ef
parent 7a656f6597
3 changed files with 155 additions and 30 deletions
--- a/19_b_nystroem_approximation_housing_experiment_code.R
+++ b/19_b_nystroem_approximation_housing_experiment_code.R
@ -0,0 +1,106 @@
+source("05_d_svd_mca_code.R")
+source("04_validation_croisee_code.R")
+source("19_nystroem_approximation_code.R")
+
+datasetHousing.nakr <-
+function()
+{
+  dat <- read.csv(file="data/housing.csv", header=TRUE)
+
+  dat$ocean_proximity <- as.factor(dat$ocean_proximity)
+  levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="<1H OCEAN"] <- "O:<1H"
+  levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="ISLAND"] <- "O:ISL"
+  levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="INLAND"] <- "O:INL"
+  levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="NEAR BAY"] <- "O:NB"
+  levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="NEAR OCEAN"] <- "O:NO"
+
+  dat$total_bedrooms[is.na(dat$total_bedrooms)] <- median(dat$total_bedrooms, na.rm=TRUE)
+  dat <- dat[dat$ocean_proximity != "O:ISL", ]
+  # suppression des modalités vides (ici "O:ISL")
+  dat <- droplevels(dat)
+  dat['rooms'] <- dat['total_rooms'] / dat['households']
+  dat['bedrooms'] <- dat['total_bedrooms'] / dat['households']
+  dat['pop'] <- dat['population'] / dat['households']
+  dat <- dat[dat$median_house_value < 500001, ]
+
+  dat <- dat[c('longitude', 'latitude', 'housing_median_age', 'households',
+               'median_income', 'median_house_value', 'ocean_proximity',
+               'rooms', 'bedrooms', 'pop')]
+
+  Z <- onehot_enc(dat[c('ocean_proximity')])
+  dat <- cbind(dat, as.data.frame(Z))
+  dat <- dat[,!(colnames(dat) %in% c('ocean_proximity'))]
+
+  dat.all <- dat
+  dat <- list(X = dat[,!(colnames(dat) %in% c('median_house_value'))],
+              Y = dat[,c('median_house_value')])
+  split <- splitdata(dat, 0.8)
+  entr <- split$entr
+  test <- split$test
+
+  r <- list( dat=dat.all, entr=entr, test=test )
+  return(r)
+}
+
+# test.tbl <- table(c(4,2,4,2,1,1,1))
+# all(intersperse(test.tbl) == c(1, 2, 4, 1, 2, 4, 1))
+intersperse <-
+function(tbl)
+{
+  n <- sum(tbl)
+  values <- as.numeric(names(tbl))
+  r <- numeric(n)
+  i <- 1
+  while(i <= n)
+  {
+    for(j in 1:length(tbl))
+    {
+      if(tbl[j] != 0)
+      {
+        r[i] <- values[j]
+        i <- i + 1
+        tbl[j] <- tbl[j] - 1
+      }
+    }
+  }
+  return(r)
+}
+
+# sample the landmarks from the clusters of individuals
+# obtained after correspondence analysis
+landmarks.by.ca.clst <-
+function(cam, X, nbLandmarks)
+{
+  if(nbLandmarks > nrow(X))
+  {
+    stop("The number of landmarks must be less than the number of training samples.")
+  }
+  landmarks <- numeric(nbLandmarks)
+  clst <- cam$clsti$cluster[as.numeric(rownames(X))]
+  clst.tbl <- table(clst)
+  nb.by.clst <- table((intersperse(clst.tbl))[1:nbLandmarks])
+  clst.id <- as.numeric(names(nb.by.clst))
+  set.seed(1123)
+  clst <- sample(clst)
+  offset <- 0
+  for (i in 1:length(nb.by.clst))
+  {
+    k <- as.numeric(nb.by.clst[i])
+    landmarks[(offset+1):(offset+k)] <- as.numeric(names((clst[clst==clst.id[i]])[1:k]))
+    offset <- offset+k
+  }
+  return(landmarks)
+}
+
+hous.dat.nakr <- datasetHousing.nakr()
+X.entr <- hous.dat.nakr$entr$X
+Y.entr <- hous.dat.nakr$entr$Y
+X.test <- hous.dat.nakr$test$X
+Y.test <- hous.dat.nakr$test$Y
+hous.dat.ca <- datasetHousing.mca()
+hous.cam <- mca(hous.dat.ca)
+nb.landmarks <- round(sqrt(nrow(X.entr)))
+landmarks <- landmarks.by.ca.clst(hous.cam, X.entr, nb.landmarks)
+nakrm <- kfold.nakr(X.entr, Y.entr, landmarks=landmarks)
+nakrm.yh <- predict(nakrm, X.test)
+nakrm.mae <- mean(abs(nakrm.yh - Y.test))
--- a/19_nystroem_approximation.Rmd
+++ b/19_nystroem_approximation.Rmd
@ -196,7 +196,7 @@ Nous reprenons le jeu de données synthétique utilisé depuis le premier module
  entr <- splitres$entr
  test <- splitres$test

-  nakrm <- nakr(entr$X,entr$Y, nspl=15)
+  nakrm <- nakr(entr$X,entr$Y, nb.landmarks=25)
  yh <- predict(nakrm,test$X)
  plt(test,f)
  points(test$X, yh, pch=4)
--- a/19_nystroem_approximation_code.R
+++ b/19_nystroem_approximation_code.R
@ -14,20 +14,20 @@ function(X1, X2, sigma2)

 # Nystroem Approximation Kernel Ridge Regression
 nakr <-
-function(X, y, sigma2=NULL, lambdas=NULL, splidx=NULL, nspl=NULL)
+function(X, y, sigma2=NULL, lambda=1E-4, landmarks=NULL, nb.landmarks=NULL)
 {
  X <- as.matrix(X)
  n <- nrow(X)
  p <- ncol(X)

-  if(is.null(lambdas)) { lambdas <- 10^seq(-8, 2,by=0.5) }
  if(is.null(sigma2)) { sigma2 <- p }

-  if(is.null(splidx)) {
-    if(is.null(nspl)) { nspl <- round(sqrt(n)) }
-    splidx <- sample(1:n, nspl, replace = FALSE)
+  if(is.null(landmarks)) {
+    if(is.null(nb.landmarks)) { nb.landmarks <- round(sqrt(n)) }
+    splidx <- sample(1:n, nb.landmarks, replace = FALSE)
  } else {
-    nspl <- length(splidx)
+    splidx <- which(rownames(X) %in% as.character(landmarks))
+    nb.landmarks <- length(splidx)
  }
  splidx <- sort(splidx)

@ -39,35 +39,28 @@ function(X, y, sigma2=NULL, lambdas=NULL, splidx=NULL, nspl=NULL)

  svdK11 <- svd(K11)
  # K11 will often be ill-formed, thus we drop the bottom singular values
-  k <- 0.8 * nspl
+  k <- which(svdK11$d < 1E-12)[1] - 1
+
  US <- svdK11$u[,1:k] %*% diag(1 / sqrt(svdK11$d[1:k]))
  L <- C %*% US
-  LtL <- t(L) %*% L
+  Ginv <- t(L) %*% L
+  diag(Ginv) <- diag(Ginv) + lambda
+  Ginv <- chol2inv(chol(Ginv))
+  Ginv <- L %*% Ginv %*% t(L)
+  Ginv <- - Ginv / lambda
+  diag(Ginv) <- diag(Ginv) + (1/lambda)
+  coef <- Ginv %*% y

-  looe <- double(length(lambdas))
-  coef <- matrix(data = NA, nrow = n, ncol = length(lambdas))
-  i <- 1
-  for(lambda in lambdas) {
-    Ginv <- LtL
-    diag(Ginv) <- diag(Ginv) + lambda
-    Ginv <- solve(Ginv)
-    Ginv <- L %*% Ginv %*% t(L)
-    Ginv <- - Ginv / lambda
-    diag(Ginv) <- diag(Ginv) + (1/lambda)
-    coef[,i] <- Ginv %*% y
-    looe[i] <- mean((coef[,i]/diag(Ginv))^2)
-    i <- i+1
-  }
-  looe.min <- min(looe)
-  lambda <- lambdas[which(looe == looe.min)]
-  coef <- coef[,which(looe == looe.min)]
+  K11inv <- svdK11$v[,1:k] %*% diag(1/svdK11$d[1:k]) %*% t(svdK11$u[,1:k])
+  beta <- K11inv %*% t(C) %*% coef

  r <- list(X=X,
            y=y,
            sigma2=sigma2,
+            lambda=lambda,
+            splidx=splidx,
            coef=coef,
-            looe=looe.min,
-            lambda=lambda
+            beta=beta
           )
  class(r) <- "nakr"
  return(r)
@ -87,7 +80,33 @@ function(o, newdata)
  }
  newdata <- scale(newdata,center=attr(o$X,"scaled:center"),
                   scale=attr(o$X,"scaled:scale"))
-  Ktest <- gausskernel(newdata, o$X, o$sigma2)
-  yh <- Ktest %*% o$coef
+  Ktest <- gausskernel(newdata, as.matrix(o$X[o$splidx,]), o$sigma2)
+  yh <- Ktest %*% o$beta
  yh <- (yh * attr(o$y,"scaled:scale")) + attr(o$y,"scaled:center")
 }
+
+kfold.nakr <-
+function(X, y, K=5, lambdas=NULL, sigma2=NULL, landmarks=NULL, nb.landmarks=NULL)
+{
+  if(is.null(lambdas)) { lambdas <- 10^seq(-8, 2, by=1) }
+
+  N <- nrow(X)
+  folds <- rep_len(1:K, N)
+  folds <- sample(folds, N)
+  maes <- matrix(data = NA, nrow = K, ncol = length(lambdas))
+  colnames(maes) <- lambdas
+  lambda_idx <- 1
+  for(lambda in lambdas) {
+    for(k in 1:K) {
+      fold <- folds == k
+      nakrm <- nakr(X[!fold,], y[!fold], sigma2, lambda, landmarks, nb.landmarks)
+      pred <- predict(nakrm, X[fold,])
+      maes[k,lambda_idx] <- mean(abs(pred - y[fold]))
+    }
+    lambda_idx <- lambda_idx + 1
+  }
+  mmaes <- colMeans(maes)
+  minmmaes <- min(mmaes)
+  bestlambda <- lambdas[which(mmaes == minmmaes)]
+  nakrm <- nakr(X, y, sigma2, bestlambda, landmarks, nb.landmarks)
+}