intro_to_ml/05_d_svd_mca_code.R

# https://www.r-bloggers.com/2016/07/round-values-while-preserve-their-rounded-sum-in-r/
# E.G.
# > sum(c(0.333, 0.333, 0.334))
# [1] 1
# > sum(round(c(0.333, 0.333, 0.334), 2))
# [1] 0.99
# > sum(round_preserve_sum(c(0.333, 0.333, 0.334), 2))
# [1] 1.00
round_preserve_sum <- function(x, digits = 0) {
  up <- 10 ^ digits
  x <- x * up
  y <- floor(x)
  indices <- tail(order(x-y), round(sum(x)) - sum(y))
  y[indices] <- y[indices] + 1
  y / up
}

# `kcuts` cuts variable `x` into `centers` categories with k-means.
kcuts <-
function(x, centers)
{
  set.seed(1123)
  km <- kmeans(x = x, centers = centers)
  cuts <- unlist(lapply(order(km$centers), function(clustId) {
    min(x[km$cluster == clustId]) }))
  cuts <- c(cuts, max(x))
}

# ventilate modality `mod` of categorical var `cat`
# into the remaining modalities according to their relative frequencies.
ventilate <-
function(cat, mod)
{
  if (mod == "NA") isna <- TRUE else isna <- FALSE
  tab <- table(cat)
  if (isna)
  {
    sup_i <- which(is.na(cat))
    sup_n <- length(sup_i)
    act_mod <- tab
  }
  else
  {
    sup_i <- which(cat == mod)
    sup_n <- tab[mod]
    act_mod <- tab[! names(tab) %in% mod]
  }
  prob <- act_mod / sum(act_mod)
  smpl <- sample(names(prob), size = sup_n, replace = TRUE, prob = prob)
  r <- list(sup_mod = mod,
            sup_n = sup_n,
            sup_i = sup_i,
            smpl = smpl)
  return(r)
}

# pre-processing for exploratory analytics of the housing dataset

datasetHousing.mca <-
function(dat=NULL)
{
  # loading dataset in memory
  if(is.null(dat)) { dat <- read.csv(file="data/housing.csv", header=TRUE) }

  # transform ocean_proximity into a factor
  # a factor being R representation of categorical variables
  dat$ocean_proximity <- as.factor(dat$ocean_proximity)
  levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="<1H OCEAN"] <- "O:<1H"
  levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="ISLAND"] <- "O:ISL"
  levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="INLAND"] <- "O:INL"
  levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="NEAR BAY"] <- "O:NB"
  levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="NEAR OCEAN"] <- "O:NO"

  # discretization of longitude
  cuts <- kcuts(x = dat$longitude, centers = 4)
  dat$c_longitude <- cut(x = dat$longitude, unique(cuts), include.lowest = TRUE)
  levels(dat$c_longitude) <- c('LO:W', 'LO:MW', 'LO:ME', 'LO:E')

  # discretization of latitude
  cuts <- kcuts(x = dat$latitude, centers = 4)
  dat$c_latitude <- cut(x = dat$latitude, unique(cuts), include.lowest = TRUE)
  levels(dat$c_latitude) <- c('LA:S','LA:MS','LA:MN','LA:N')

  # discretization of housing_median_age
  cuts <- c(min(dat$housing_median_age), 15, 25, 35, 51, 52)
  dat$c_age <- cut(x = dat$housing_median_age, unique(cuts),
                   include.lowest = TRUE)
  levels(dat$c_age) <- c('AG:15]','AG:25]','AG:35]','AG:51]', 'AG:52')

  # creation and discretization of rooms
  dat$rooms <- dat$total_rooms / dat$households
  cuts <- c(min(dat$rooms), 4, 6, 8, max(dat$rooms))
  dat$c_rooms <- cut(x = dat$rooms, unique(cuts), include.lowest = TRUE)
  levels(dat$c_rooms) <- c('RO:4]','RO:6]','RO:8]', 'RO:>8')

  # creation and discretization of bedrooms
  dat$bedrooms <- dat$total_bedrooms / dat$households
  cuts <- c(min(dat$bedrooms, na.rm = TRUE), 1.1, max(dat$bedrooms, na.rm = TRUE))
  dat$c_bedrooms <- cut(x = dat$bedrooms, unique(cuts), include.lowest = TRUE)
  levels(dat$c_bedrooms) <- c('BE:1]','BE:>1')

  # creation and discretization of pop
  dat$pop <- dat$population / dat$households
  cuts <- c(min(dat$pop), 2, 3, 4, max(dat$pop))
  dat$c_pop <- cut(x = dat$pop, unique(cuts), include.lowest = TRUE)
  levels(dat$c_pop) <- c('PO:2]','PO:3]', 'PO:4]', 'PO:>4')

  # discretization of households
  cuts <- c(min(dat$households), 300, 400, 600, max(dat$households))
  dat$c_households <- cut(x = dat$households, cuts, include.lowest = TRUE)
  levels(dat$c_households) <- c('HH:3]', 'HH:4]', 'HH:6]', 'HH:>6')

  # discretization of median_income
  cuts <- quantile(dat$median_income, probs = seq(0,1,1/4))
  cuts <- c(cuts[1:length(cuts)-1], 15, max(dat$median_income))
  dat$c_income <- cut(x = dat$median_income, cuts, include.lowest = TRUE)
  levels(dat$c_income) <- c('IC:L', 'IC:ML', 'IC:MH', 'IC:H', 'IC:>15')

  # discretization of median_house_value
  cuts <- c(min(dat$median_house_value), 115000, 175000, 250000, 500000,
            max(dat$median_house_value))
  dat$c_house_value <- cut(x = dat$median_house_value, cuts,
                           include.lowest = TRUE)
  levels(dat$c_house_value) <- c('HV:A', 'HV:B', 'HV:C', 'HV:D', 'HV:E')

  # discretized version of the entire dataset
  dat <- dat[c('ocean_proximity', 'c_longitude', 'c_latitude', 'c_age',
                   'c_rooms', 'c_bedrooms', 'c_pop', 'c_households', 'c_income',
                   'c_house_value')]
  vent <- list()
  # ventilation of modality RO:>8 of variable c_rooms
  vent$c_rooms <- ventilate(dat$c_rooms, "RO:>8")
  dat$c_rooms[vent$c_rooms$sup_i] <- vent$c_rooms$smpl

  # ventilation of modality IC:>15 of variable c_income
  vent$c_income <- ventilate(dat$c_income, "IC:>15")
  dat$c_income[vent$c_income$sup_i] <- vent$c_income$smpl

  # ventilation of modality O:ISL of variable ocean_proximity
  vent$ocean_proximity <- ventilate(dat$ocean_proximity, "O:ISL")
  dat$ocean_proximity[vent$ocean_proximity$sup_i] <- vent$ocean_proximity$smpl

  # ventilation of missing values of variable c_bedrooms
  vent$c_bedrooms <- ventilate(dat$c_bedrooms, "NA")
  dat$c_bedrooms[vent$c_bedrooms$sup_i] <- vent$c_bedrooms$smpl

  # removal of empty modalities following the various ventilations
  dat <- droplevels(dat)

  # c_house_value, the target, is considered as a supplementary variable
  supvar <- which(names(dat) == "c_house_value")

  r <- list( vent=vent, dat=dat, supvar=supvar )

  return(r)
}

# one hot encoding of a dataframe made of categorical variables
onehot_enc <-
function(dat.cat)
{
  if(!all(as.logical(lapply(dat.cat, is.factor)))) {
    warning("All variables must be factors.")
    return(invisible(NULL))
  }
  lev_n <- unlist(lapply(dat.cat, nlevels))
  n <- cumsum(lev_n)
  J_t <- sum(lev_n)
  Q_t <- dim(dat.cat)[2]
  I <- dim(dat.cat)[1]
  Z <- matrix(0, nrow = I, ncol = J_t)
  numdat <- lapply(dat.cat, as.numeric)
  offset <- c(0, n[-length(n)])
  for (i in 1:Q_t) Z[1:I + (I * (offset[i] + numdat[[i]] - 1))] <- 1
  dimnames(Z)[[1]] <- as.character(1:I)
  dimnames(Z)[[2]] <- as.character(unlist(lapply(dat.cat, levels)))
  return(Z)
}

mca <-
function(dat, nclst = 100)
{
  # Indicator matrix
  Z <- onehot_enc(dat$dat)
  lev_n <- unlist(lapply(dat$dat, nlevels))
  n <- cumsum(lev_n)
  Z_sup_min <- n[dat$supvar[1] - 1] + 1
  Z_sup_max <- n[dat$supvar[length(dat$supvar)]]
  Z_sup_ind <- Z_sup_min : Z_sup_max
  Z_act <- Z[,-Z_sup_ind]
  J <- dim(Z_act)[2]
  Q <- dim(dat$dat)[2] - length(dat$supvar)

  # Burt matrix
  B <- t(Z_act) %*% Z_act

  # Decomposition of inertia
  P     <- B / sum(B)
  r     <- apply(P, 2, sum)
  rr    <- r %*% t(r)
  S     <- (P - rr) / sqrt(rr)
  dec   <- eigen(S)
  # Last Q eigen value must be zero
  K <- J - Q
  sv  <- dec$values[1 : K]

  # standard coordinates
  a <- sweep(dec$vectors, 1, sqrt(r), FUN = "/")
  a <- a[,(1 : K)]

  # principal coordinates
  f <- a %*% diag(sv)

  fac_names <- paste("F", paste(1 : K), sep = "")
  rownames(a) <- colnames(Z_act)
  colnames(a) <- fac_names
  rownames(f) <- colnames(Z_act)
  colnames(f) <- fac_names

  # contributions
  temp <- sweep(f^2, 1, r, FUN = "*")
  sum_ctr <- apply(temp, 2, sum)
  ctr <- sweep(temp, 2, sum_ctr, FUN = "/")

  # correlations
  temp <- f^2
  sum_cor <- apply(temp, 1, sum)
  cor <- sweep(temp, 1, sum_cor, FUN="/")

  # profiles of "supplementary" individuals
  si <- sweep(Z_act, 1, apply(Z_act, 1, sum), FUN = "/")
  fsi <- si %*% a
  temp <- fsi^2
  sum_cor <- apply(temp, 1, sum)
  sicor <- sweep(temp, 1, sum_cor, FUN="/")

  # profiles of "supplementary" variables
  Z_sup <- Z[,Z_sup_ind]
  sj <- t(Z_sup) %*% Z_act
  sj <- sweep(sj, 1, apply(sj, 1, sum), FUN = "/")
  fsj <- sj %*% a
  temp <- fsj^2
  sum_cor <- apply(temp, 1, sum)
  sjcor <- sweep(temp, 1, sum_cor, FUN="/")

  # clusters of individuals
  # Keep enough singular values to capture at least 90% of variance
  nsv <- which(cumsum(sv / sum(sv)) > 0.9)[1]
  nstart <- 25
  set.seed(1123)
  clsti <- kmeans(fsi[,1:nsv], nclst, nstart)
  # Principal coordinates of clusters' centers
  fclst <- clsti$centers

  r <- list(f=f,          # principal coordinates
            ctr=ctr,      # contributions of the modalities
                          #   to the principal axes
            cor=cor,      # correlations of the modalities
                          #   with the principal axes
            r=r,          # marginal profile of the modalities
            sv=sv,        # K (=J-Q) singular values
            fsi=fsi,      # principal coordinates of the individuals
            sicor=sicor,  # correlations of the individuals
                          #   with the principal axes
            fsj=fsj,      # principal coordinates of the modalities
                          #   of the supplementary variables
            sjcor=sjcor,  # correlations of the supplementary variables
                          #   with the principal axes
            clsti=clsti)  # kmeans clustering of the individuals
  class(r) <- "mca"
  return(r)
}

# compute clusters' correlations with factors
clstcor.mca <-
function(o)
{
  if(class(o) != "mca") {
    warning("Object is not of class 'mca'")
    return(invisible(NULL))
  }

  nclst <- length(o$clsti$size)

  # Correlation of clusters' centers and factorial axes
  temp <- o$clsti$centers^2
  sum_cor <- apply(temp, 1, sum)
  clstcor <- sweep(temp, 1, sum_cor, FUN="/")

  nfact <- 3

  # select first nfact most correlated factors for each clst
  selMostCorFact <- function(i) {
    temp1 <- (sort(clstcor[i,], decreasing = TRUE))[1:nfact]
    temp2 <- round_preserve_sum(1000 * temp1)
    temp3 <- strtoi(sub('.', '', names(temp1)))
    rbind(temp3, temp2)
  }

  tblClstCor <- t(sapply(1:nclst, selMostCorFact))
  tblClstCor <- cbind(tblClstCor, o$clsti$size)
  rwithinss <- o$clsti$withinss / o$clsti$size # within sum of squares
                                               # relative to cluster size
  clstqlty <- round_preserve_sum(1000 * rwithinss / sum(rwithinss))
  tblClstCor <- cbind(tblClstCor, clstqlty)
  rownames(tblClstCor) <- paste0('clst-', 1:nclst)
  colnames(tblClstCor) <- c( rep(c("F", "COR"), nfact), "SIZE", "QLTY" )

  tblClstCor[order(tblClstCor[,1], tblClstCor[,3], tblClstCor[,5]),]
}

textSize.mca <-
function(o, d1 = NULL, d2 = NULL)
{
  if(class(o) != "mca") {
    warning("Object is not of class 'mca'")
    return(invisible(NULL))
  }
  if(is.null(d1)) d1<-1
  if(is.null(d2)) d2<-2

  # proportion of inertia of factorial plan d1-d2 explained by each profile
  cont <- o$r * (o$f[,d1]^2 + o$f[,d2]^2) / (o$sv[d1]^2 + o$sv[d2]^2)
  names <- rownames(o$f)
  names[cont < 0.05] <- "."
  optimPar <- nonlinearFontSize.mca(cont)
  sizes <- log(1 + exp(optimPar[1]) * cont^optimPar[2])
  sizes[cont < 0.01] <- 1
  r <- list(names=names, sizes=sizes)
  return(r)
}

plot.mca <-
function(o, d1 = NULL, d2 = NULL)
{
  if(class(o) != "mca") {
    warning("Object is not of class 'mca'")
    UseMethod("plot")
    return(invisible(NULL))
  }
  if(is.null(d1)) d1<-1
  if(is.null(d2)) d2<-2

  ns <- textSize.mca(o, d1, d2) # names and sizes

  plot(o$f[,d1], o$f[,d2], type = "n",
     xlab="", ylab="", asp = 1, xaxt = "n", yaxt = "n")
  text(o$f[,d1], o$f[,d2], ns$names, adj = 0, cex = ns$sizes,
       col = 'blue', font = 2)
  points(0, 0, pch = 3)
}

plotjsup.mca <-
function(o, d1 = NULL, d2 = NULL)
{
  if(class(o) != "mca") {
    warning("Object is not of class 'mca'")
    return(invisible(NULL))
  }
  if(is.null(d1)) d1<-1
  if(is.null(d2)) d2<-2

  ns <- textSize.mca(o, d1, d2) # names and sizes

  plot(c(o$f[,d1], o$fsj[,d1]), c(o$f[,d2], o$fsj[,d2]), type = "n",
     xlab="", ylab="", asp = 1, xaxt = "n", yaxt = "n")
  text(o$f[,d1], o$f[,d2], ns$names, adj = 0, cex = ns$sizes,
       col = 'blue', font = 2)
  text(o$fsj[,d1], o$fsj[,d2], rownames(o$fsj), adj = 0,
       font = 4, cex = 0.7, col = 'red')
  points(0, 0, pch = 3)
}

plotisupjsup.mca <-
function(o, d1 = NULL, d2 = NULL)
{
  if(class(o) != "mca") {
    warning("Object is not of class 'mca'")
    return(invisible(NULL))
  }
  if(is.null(d1)) d1<-1
  if(is.null(d2)) d2<-2

  nsm <- textSize.mca(o, d1, d2) # names and sizes for modalities
  nsi <- textSizeClst.mca(o, d1, d2) # names and sizes for clust of individuals

  plot(c(o$f[,d1], o$fsj[,d1], o$clsti$centers[,d1]),
       c(o$f[,d2], o$fsj[,d2], o$clsti$centers[,d2]),
       type = "n", xlab="", ylab="", asp = 1, xaxt = "n", yaxt = "n")
  text(o$f[,d1], o$f[,d2], nsm$names, adj = 0, cex = nsm$sizes,
       col = 'blue', font = 2)
  text(o$fsj[,d1], o$fsj[,d2], rownames(o$fsj), adj = 0, font = 4,
       cex = 0.7, col = 'red')
  text(o$clsti$centers[,d1], o$clsti$centers[,d2],
       nsi$names, adj = 0, cex = nsi$sizes, font = 3)
  points(0, 0, pch = 3)
}

textSizeClst.mca <-
function(o, d1 = NULL, d2 = NULL)
{
  if(class(o) != "mca") {
    warning("Object is not of class 'mca'")
    return(invisible(NULL))
  }
  if(is.null(d1)) d1<-1
  if(is.null(d2)) d2<-2

  # correlation of each cluster center profile with factors d1+d2
  temp <- o$clsti$centers^2
  sum_cor <- apply(temp, 1, sum)
  clstcor <- sweep(temp, 1, sum_cor, FUN="/")
  clstcor <- rowSums(clstcor[,c(d1,d2)])
  names <- names(clstcor)
  names[clstcor < 0.05] <- "x"
  optimPar <- nonlinearFontSize.mca(clstcor)
  sizes <- log(1 + exp(optimPar[1]) * clstcor^optimPar[2])
  sizes[clstcor < 0.05] <- 1
  r <- list(names=names, sizes=sizes)
  return(r)
}

nonlinearFontSize.mca <-
function(dat, minFontCex = 0.3, maxFontCex = 1)
{
  # discover a non-linear transform of data that maps to font sizes
  # the smallest font factor should be cex ~= minFontCex
  # the largest font factor should be cex ~= maxFontCex
  optim.err <- function(par) {
    err <- (minFontCex - log(1 + exp(par[1]) * min(dat)^par[2]))^2
    err <- err + (maxFontCex - log(1 + exp(par[1]) * max(dat)^par[2]))^2
    return(err)
  }
  optim.res <- optim(par = c(1,0.3), fn = optim.err)
  return(optim.res$par)
}

# barplot of categorical variable `cat`
# for individuals of cluster `clstnb`
# `o` is the correspondence analysis model
# `dat` is the dataset
barplotClst.mca <-
function(o, dat, clstnb, cat)
{
  barplot(table(dat$dat[which(o$clsti$cluster == clstnb), cat]))
}