# https://www.r-bloggers.com/2016/07/round-values-while-preserve-their-rounded-sum-in-r/
# E.G.
# > sum(c(0.333, 0.333, 0.334))
# [1] 1
# > sum(round(c(0.333, 0.333, 0.334), 2))
# [1] 0.99
# > sum(round_preserve_sum(c(0.333, 0.333, 0.334), 2))
# [1] 1.00
round_preserve_sum <- function(x, digits = 0) {
  up <- 10 ^ digits
  x <- x * up
  y <- floor(x)
  indices <- tail(order(x - y), round(sum(x)) - sum(y))
  y[indices] <- y[indices] + 1
  y / up
}

# `kcuts` cuts variable `x` into `centers` categories with k-means:
# it returns the minimum of `x` in each cluster (clusters ordered by center),
# plus max(x), so the result can be passed to `cut()`.
kcuts <- function(x, centers) {
  set.seed(1123)
  km <- kmeans(x = x, centers = centers)
  cuts <- unlist(lapply(order(km$centers), function(clustId) {
    min(x[km$cluster == clustId])
  }))
  cuts <- c(cuts, max(x))
  cuts
}

# Ventilate modality `mod` of categorical variable `cat` into the remaining
# modalities according to their relative frequencies.
# Use mod = "NA" to ventilate missing values.
ventilate <- function(cat, mod) {
  isna <- (mod == "NA")
  tab <- table(cat)
  if (isna) {
    sup_i <- which(is.na(cat))
    sup_n <- length(sup_i)
    act_mod <- tab
  } else {
    sup_i <- which(cat == mod)
    sup_n <- tab[mod]
    act_mod <- tab[! names(tab) %in% mod]
  }
  prob <- act_mod / sum(act_mod)
  smpl <- sample(names(prob), size = sup_n, replace = TRUE, prob = prob)
  r <- list(sup_mod = mod, sup_n = sup_n, sup_i = sup_i, smpl = smpl)
  return(r)
}
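# Illustrative usage of the helpers above (a minimal sketch, not part of the
# original pipeline; the toy vectors below are made up). Wrapped in
# `if (FALSE)` so that sourcing this file does not execute the demo.
if (FALSE) {
  # kcuts: breakpoints suitable for cut(), derived from a 3-cluster k-means
  x <- c(rnorm(50, mean = 0), rnorm(50, mean = 10), rnorm(50, mean = 20))
  cuts <- kcuts(x = x, centers = 3)
  table(cut(x, unique(cuts), include.lowest = TRUE))

  # ventilate: redistribute the rare modality "c" over "a" and "b"
  catvar <- factor(c(rep("a", 6), rep("b", 3), "c"))
  v <- ventilate(catvar, "c")
  catvar[v$sup_i] <- v$smpl
  table(droplevels(catvar))
}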
# Pre-processing for exploratory analytics of the housing dataset.
datasetHousing.mca <- function(dat = NULL) {
  # load the dataset in memory
  if (is.null(dat)) {
    dat <- read.csv(file = "data/housing.csv", header = TRUE)
  }

  # transform ocean_proximity into a factor
  # (a factor being the R representation of a categorical variable)
  dat$ocean_proximity <- as.factor(dat$ocean_proximity)
  levels(dat$ocean_proximity)[levels(dat$ocean_proximity) == "<1H OCEAN"] <- "O:<1H"
  levels(dat$ocean_proximity)[levels(dat$ocean_proximity) == "ISLAND"] <- "O:ISL"
  levels(dat$ocean_proximity)[levels(dat$ocean_proximity) == "INLAND"] <- "O:INL"
  levels(dat$ocean_proximity)[levels(dat$ocean_proximity) == "NEAR BAY"] <- "O:NB"
  levels(dat$ocean_proximity)[levels(dat$ocean_proximity) == "NEAR OCEAN"] <- "O:NO"

  # discretization of longitude
  cuts <- kcuts(x = dat$longitude, centers = 4)
  dat$c_longitude <- cut(x = dat$longitude, unique(cuts), include.lowest = TRUE)
  levels(dat$c_longitude) <- c('LO:W', 'LO:MW', 'LO:ME', 'LO:E')

  # discretization of latitude
  cuts <- kcuts(x = dat$latitude, centers = 4)
  dat$c_latitude <- cut(x = dat$latitude, unique(cuts), include.lowest = TRUE)
  levels(dat$c_latitude) <- c('LA:S', 'LA:MS', 'LA:MN', 'LA:N')

  # discretization of housing_median_age
  cuts <- c(min(dat$housing_median_age), 15, 25, 35, 51, 52)
  dat$c_age <- cut(x = dat$housing_median_age, unique(cuts), include.lowest = TRUE)
  levels(dat$c_age) <- c('AG:15]', 'AG:25]', 'AG:35]', 'AG:51]', 'AG:52')

  # creation and discretization of rooms
  dat$rooms <- dat$total_rooms / dat$households
  cuts <- c(min(dat$rooms), 4, 6, 8, max(dat$rooms))
  dat$c_rooms <- cut(x = dat$rooms, unique(cuts), include.lowest = TRUE)
  levels(dat$c_rooms) <- c('RO:4]', 'RO:6]', 'RO:8]', 'RO:>8')

  # creation and discretization of bedrooms
  dat$bedrooms <- dat$total_bedrooms / dat$households
  cuts <- c(min(dat$bedrooms, na.rm = TRUE), 1.1, max(dat$bedrooms, na.rm = TRUE))
  dat$c_bedrooms <- cut(x = dat$bedrooms, unique(cuts), include.lowest = TRUE)
  levels(dat$c_bedrooms) <- c('BE:1]', 'BE:>1')

  # creation and discretization of pop
  dat$pop <- dat$population / dat$households
  cuts <- c(min(dat$pop), 2, 3, 4, max(dat$pop))
  dat$c_pop <- cut(x = dat$pop, unique(cuts), include.lowest = TRUE)
  levels(dat$c_pop) <- c('PO:2]', 'PO:3]', 'PO:4]', 'PO:>4')

  # discretization of households
  cuts <- c(min(dat$households), 300, 400, 600, max(dat$households))
  dat$c_households <- cut(x = dat$households, cuts, include.lowest = TRUE)
  levels(dat$c_households) <- c('HH:3]', 'HH:4]', 'HH:6]', 'HH:>6')

  # discretization of median_income
  cuts <- quantile(dat$median_income, probs = seq(0, 1, 1/4))
  cuts <- c(cuts[-length(cuts)], 15, max(dat$median_income))
  dat$c_income <- cut(x = dat$median_income, cuts, include.lowest = TRUE)
  levels(dat$c_income) <- c('IC:L', 'IC:ML', 'IC:MH', 'IC:H', 'IC:>15')

  # discretization of median_house_value
  cuts <- c(min(dat$median_house_value), 115000, 175000, 250000, 500000,
            max(dat$median_house_value))
  dat$c_house_value <- cut(x = dat$median_house_value, cuts, include.lowest = TRUE)
  levels(dat$c_house_value) <- c('HV:A', 'HV:B', 'HV:C', 'HV:D', 'HV:E')

  # discretized version of the entire dataset
  dat <- dat[c('ocean_proximity', 'c_longitude', 'c_latitude', 'c_age',
               'c_rooms', 'c_bedrooms', 'c_pop', 'c_households',
               'c_income', 'c_house_value')]

  vent <- list()

  # ventilation of modality RO:>8 of variable c_rooms
  vent$c_rooms <- ventilate(dat$c_rooms, "RO:>8")
  dat$c_rooms[vent$c_rooms$sup_i] <- vent$c_rooms$smpl

  # ventilation of modality IC:>15 of variable c_income
  vent$c_income <- ventilate(dat$c_income, "IC:>15")
  dat$c_income[vent$c_income$sup_i] <- vent$c_income$smpl

  # ventilation of modality O:ISL of variable ocean_proximity
  vent$ocean_proximity <- ventilate(dat$ocean_proximity, "O:ISL")
  dat$ocean_proximity[vent$ocean_proximity$sup_i] <- vent$ocean_proximity$smpl

  # ventilation of missing values of variable c_bedrooms
  vent$c_bedrooms <- ventilate(dat$c_bedrooms, "NA")
  dat$c_bedrooms[vent$c_bedrooms$sup_i] <- vent$c_bedrooms$smpl

  # removal of empty modalities following the various ventilations
  dat <- droplevels(dat)

  # c_house_value, the target, is considered as a supplementary variable
  supvar <- which(names(dat) == "c_house_value")

  r <- list(vent = vent, dat = dat, supvar = supvar)
  return(r)
}

# One-hot encoding of a data frame made of categorical variables:
# returns the indicator (disjunctive) matrix Z with one column per modality.
onehot_enc <- function(dat.cat) {
  if (!all(as.logical(lapply(dat.cat, is.factor)))) {
    warning("All variables must be factors.")
    return(invisible(NULL))
  }
  lev_n <- unlist(lapply(dat.cat, nlevels))
  n <- cumsum(lev_n)
  J_t <- sum(lev_n)       # total number of modalities
  Q_t <- dim(dat.cat)[2]  # number of variables
  I <- dim(dat.cat)[1]    # number of individuals
  Z <- matrix(0, nrow = I, ncol = J_t)
  numdat <- lapply(dat.cat, as.numeric)
  offset <- c(0, n[-length(n)])
  # set cell (i, j) to 1 when individual i takes modality j
  for (i in 1:Q_t) Z[1:I + (I * (offset[i] + numdat[[i]] - 1))] <- 1
  dimnames(Z)[[1]] <- as.character(1:I)
  dimnames(Z)[[2]] <- as.character(unlist(lapply(dat.cat, levels)))
  return(Z)
}
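# Illustrative usage of `onehot_enc` (a minimal sketch on made-up data, not
# part of the original pipeline). Wrapped in `if (FALSE)` so that sourcing
# this file does not execute it.
if (FALSE) {
  toy <- data.frame(color = factor(c("red", "blue", "red")),
                    size  = factor(c("S", "L", "L")))
  onehot_enc(toy)
  # returns a 3 x 4 indicator matrix with columns blue, red, L, S:
  # each row has exactly one 1 per variable, e.g. row 1 is (0, 1, 0, 1).
}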
# Multiple correspondence analysis (via the Burt matrix) of the categorical
# dataset `dat$dat`; the variables indexed by `dat$supvar` are treated as
# supplementary, and the individuals are clustered with k-means into `nclst`
# clusters in the factorial space.
mca <- function(dat, nclst = 100) {
  # indicator matrix
  Z <- onehot_enc(dat$dat)
  lev_n <- unlist(lapply(dat$dat, nlevels))
  n <- cumsum(lev_n)
  Z_sup_min <- n[dat$supvar[1] - 1] + 1
  Z_sup_max <- n[dat$supvar[length(dat$supvar)]]
  Z_sup_ind <- Z_sup_min : Z_sup_max
  Z_act <- Z[, -Z_sup_ind]
  J <- dim(Z_act)[2]
  Q <- dim(dat$dat)[2] - length(dat$supvar)

  # Burt matrix
  B <- t(Z_act) %*% Z_act

  # decomposition of inertia
  P <- B / sum(B)
  r <- apply(P, 2, sum)
  rr <- r %*% t(r)
  S <- (P - rr) / sqrt(rr)
  dec <- eigen(S)
  # the last Q eigenvalues must be zero
  K <- J - Q
  sv <- dec$values[1 : K]

  # standard coordinates
  a <- sweep(dec$vectors, 1, sqrt(r), FUN = "/")
  a <- a[, (1 : K)]
  # principal coordinates
  f <- a %*% diag(sv)
  fac_names <- paste("F", paste(1 : K), sep = "")
  rownames(a) <- colnames(Z_act)
  colnames(a) <- fac_names
  rownames(f) <- colnames(Z_act)
  colnames(f) <- fac_names

  # contributions of the modalities to the principal axes
  temp <- sweep(f^2, 1, r, FUN = "*")
  sum_ctr <- apply(temp, 2, sum)
  ctr <- sweep(temp, 2, sum_ctr, FUN = "/")

  # correlations of the modalities with the principal axes
  temp <- f^2
  sum_cor <- apply(temp, 1, sum)
  cor <- sweep(temp, 1, sum_cor, FUN = "/")

  # profiles of "supplementary" individuals
  si <- sweep(Z_act, 1, apply(Z_act, 1, sum), FUN = "/")
  fsi <- si %*% a
  temp <- fsi^2
  sum_cor <- apply(temp, 1, sum)
  sicor <- sweep(temp, 1, sum_cor, FUN = "/")

  # profiles of "supplementary" variables
  Z_sup <- Z[, Z_sup_ind]
  sj <- t(Z_sup) %*% Z_act
  sj <- sweep(sj, 1, apply(sj, 1, sum), FUN = "/")
  fsj <- sj %*% a
  temp <- fsj^2
  sum_cor <- apply(temp, 1, sum)
  sjcor <- sweep(temp, 1, sum_cor, FUN = "/")

  # clusters of individuals:
  # keep enough singular values to capture at least 90% of the variance
  nsv <- which(cumsum(sv / sum(sv)) > 0.9)[1]
  nstart <- 25
  set.seed(1123)
  clsti <- kmeans(fsi[, 1:nsv], centers = nclst, nstart = nstart)
  # principal coordinates of the clusters' centers
  fclst <- clsti$centers

  r <- list(f = f,          # principal coordinates of the modalities
            ctr = ctr,      # contributions of the modalities
                            # to the principal axes
            cor = cor,      # correlations of the modalities
                            # with the principal axes
            r = r,          # marginal profile of the modalities
            sv = sv,        # K (= J - Q) singular values
            fsi = fsi,      # principal coordinates of the individuals
            sicor = sicor,  # correlations of the individuals
                            # with the principal axes
            fsj = fsj,      # principal coordinates of the modalities
                            # of the supplementary variables
            sjcor = sjcor,  # correlations of the supplementary variables
                            # with the principal axes
            clsti = clsti)  # k-means clustering of the individuals
  class(r) <- "mca"
  return(r)
}

# Compute the clusters' correlations with the factors: for each cluster,
# report its `nfact` most correlated factors (F / COR pairs, correlations in
# thousandths), its size and a quality indicator (within sum of squares
# relative to cluster size).
clstcor.mca <- function(o) {
  if (!inherits(o, "mca")) {
    warning("Object is not of class 'mca'")
    return(invisible(NULL))
  }
  nclst <- length(o$clsti$size)
  # correlation of the clusters' centers with the factorial axes
  temp <- o$clsti$centers^2
  sum_cor <- apply(temp, 1, sum)
  clstcor <- sweep(temp, 1, sum_cor, FUN = "/")
  nfact <- 3
  # select the `nfact` most correlated factors for each cluster
  selMostCorFact <- function(i) {
    temp1 <- (sort(clstcor[i, ], decreasing = TRUE))[1:nfact]
    temp2 <- round_preserve_sum(1000 * temp1)
    temp3 <- strtoi(sub('.', '', names(temp1)))  # drop the leading "F"
    rbind(temp3, temp2)
  }
  tblClstCor <- t(sapply(1:nclst, selMostCorFact))
  tblClstCor <- cbind(tblClstCor, o$clsti$size)
  # within sum of squares relative to cluster size
  rwithinss <- o$clsti$withinss / o$clsti$size
  clstqlty <- round_preserve_sum(1000 * rwithinss / sum(rwithinss))
  tblClstCor <- cbind(tblClstCor, clstqlty)
  rownames(tblClstCor) <- paste0('clst-', 1:nclst)
  colnames(tblClstCor) <- c(rep(c("F", "COR"), nfact), "SIZE", "QLTY")
  tblClstCor[order(tblClstCor[, 1], tblClstCor[, 3], tblClstCor[, 5]), ]
}
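# Illustrative end-to-end usage (a sketch; it assumes data/housing.csv is
# available, as datasetHousing.mca above does, and it takes a while to run).
# Wrapped in `if (FALSE)` so that sourcing this file does not execute it.
if (FALSE) {
  h <- datasetHousing.mca()
  m <- mca(h, nclst = 100)
  # share of inertia captured by the first factors
  round(cumsum(m$sv / sum(m$sv)), 3)
  # clusters described by their most correlated factors, size and quality
  head(clstcor.mca(m))
}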
# Compute the label and the font size of each modality for the factorial
# plane (d1, d2); modalities that explain little of the plane's inertia are
# replaced by a dot and drawn small.
textSize.mca <- function(o, d1 = NULL, d2 = NULL) {
  if (!inherits(o, "mca")) {
    warning("Object is not of class 'mca'")
    return(invisible(NULL))
  }
  if (is.null(d1)) d1 <- 1
  if (is.null(d2)) d2 <- 2
  # proportion of the inertia of the factorial plane d1-d2 explained by each profile
  cont <- o$r * (o$f[, d1]^2 + o$f[, d2]^2) / (o$sv[d1]^2 + o$sv[d2]^2)
  names <- rownames(o$f)
  names[cont < 0.05] <- "."
  optimPar <- nonlinearFontSize.mca(cont)
  sizes <- log(1 + exp(optimPar[1]) * cont^optimPar[2])
  sizes[cont < 0.01] <- 1
  r <- list(names = names, sizes = sizes)
  return(r)
}

# Plot the modalities of the active variables on the factorial plane (d1, d2).
plot.mca <- function(o, d1 = NULL, d2 = NULL) {
  if (!inherits(o, "mca")) {
    warning("Object is not of class 'mca'")
    return(invisible(NULL))
  }
  if (is.null(d1)) d1 <- 1
  if (is.null(d2)) d2 <- 2
  ns <- textSize.mca(o, d1, d2)  # names and sizes
  plot(o$f[, d1], o$f[, d2], type = "n", xlab = "", ylab = "",
       asp = 1, xaxt = "n", yaxt = "n")
  text(o$f[, d1], o$f[, d2], ns$names, adj = 0, cex = ns$sizes,
       col = 'blue', font = 2)
  points(0, 0, pch = 3)
}

# Plot the modalities of the active variables and of the supplementary
# variables on the factorial plane (d1, d2).
plotjsup.mca <- function(o, d1 = NULL, d2 = NULL) {
  if (!inherits(o, "mca")) {
    warning("Object is not of class 'mca'")
    return(invisible(NULL))
  }
  if (is.null(d1)) d1 <- 1
  if (is.null(d2)) d2 <- 2
  ns <- textSize.mca(o, d1, d2)  # names and sizes
  plot(c(o$f[, d1], o$fsj[, d1]), c(o$f[, d2], o$fsj[, d2]), type = "n",
       xlab = "", ylab = "", asp = 1, xaxt = "n", yaxt = "n")
  text(o$f[, d1], o$f[, d2], ns$names, adj = 0, cex = ns$sizes,
       col = 'blue', font = 2)
  text(o$fsj[, d1], o$fsj[, d2], rownames(o$fsj), adj = 0, font = 4,
       cex = 0.7, col = 'red')
  points(0, 0, pch = 3)
}

# Plot the modalities of the active variables, the modalities of the
# supplementary variables and the clusters of individuals on the factorial
# plane (d1, d2).
plotisupjsup.mca <- function(o, d1 = NULL, d2 = NULL) {
  if (!inherits(o, "mca")) {
    warning("Object is not of class 'mca'")
    return(invisible(NULL))
  }
  if (is.null(d1)) d1 <- 1
  if (is.null(d2)) d2 <- 2
  nsm <- textSize.mca(o, d1, d2)      # names and sizes for the modalities
  nsi <- textSizeClst.mca(o, d1, d2)  # names and sizes for the clusters of individuals
  plot(c(o$f[, d1], o$fsj[, d1], o$clsti$centers[, d1]),
       c(o$f[, d2], o$fsj[, d2], o$clsti$centers[, d2]),
       type = "n", xlab = "", ylab = "", asp = 1, xaxt = "n", yaxt = "n")
  text(o$f[, d1], o$f[, d2], nsm$names, adj = 0, cex = nsm$sizes,
       col = 'blue', font = 2)
  text(o$fsj[, d1], o$fsj[, d2], rownames(o$fsj), adj = 0, font = 4,
       cex = 0.7, col = 'red')
  text(o$clsti$centers[, d1], o$clsti$centers[, d2], nsi$names, adj = 0,
       cex = nsi$sizes, font = 3)
  points(0, 0, pch = 3)
}

# Compute the label and the font size of each cluster of individuals for the
# factorial plane (d1, d2), based on the correlation of the cluster's center
# with the two factors.
textSizeClst.mca <- function(o, d1 = NULL, d2 = NULL) {
  if (!inherits(o, "mca")) {
    warning("Object is not of class 'mca'")
    return(invisible(NULL))
  }
  if (is.null(d1)) d1 <- 1
  if (is.null(d2)) d2 <- 2
  # correlation of each cluster center profile with factors d1 and d2
  temp <- o$clsti$centers^2
  sum_cor <- apply(temp, 1, sum)
  clstcor <- sweep(temp, 1, sum_cor, FUN = "/")
  clstcor <- rowSums(clstcor[, c(d1, d2)])
  names <- names(clstcor)
  names[clstcor < 0.05] <- "x"
  optimPar <- nonlinearFontSize.mca(clstcor)
  sizes <- log(1 + exp(optimPar[1]) * clstcor^optimPar[2])
  sizes[clstcor < 0.05] <- 1
  r <- list(names = names, sizes = sizes)
  return(r)
}

# Discover a non-linear transform of the data that maps it to font sizes:
# the smallest value should map to cex ~= minFontCex,
# the largest value should map to cex ~= maxFontCex.
nonlinearFontSize.mca <- function(dat, minFontCex = 0.3, maxFontCex = 1) {
  optim.err <- function(par) {
    err <- (minFontCex - log(1 + exp(par[1]) * min(dat)^par[2]))^2
    err <- err + (maxFontCex - log(1 + exp(par[1]) * max(dat)^par[2]))^2
    return(err)
  }
  optim.res <- optim(par = c(1, 0.3), fn = optim.err)
  return(optim.res$par)
}

# Barplot of categorical variable `cat` for the individuals of cluster `clstnb`.
# `o` is the correspondence analysis model; `dat` is the result of
# datasetHousing.mca (the list holding the discretized dataset in `dat$dat`).
barplotClst.mca <- function(o, dat, clstnb, cat) {
  barplot(table(dat$dat[which(o$clsti$cluster == clstnb), cat]))
}
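# Illustrative plotting calls (a sketch under the same assumption as the
# previous example: data/housing.csv is available). Wrapped in `if (FALSE)`
# so that sourcing this file does not execute it.
if (FALSE) {
  h <- datasetHousing.mca()   # as in the previous sketch
  m <- mca(h)
  plot(m)                     # active modalities on the first factorial plane
  plotjsup.mca(m)             # + supplementary modalities (c_house_value)
  plotisupjsup.mca(m)         # + clusters of individuals
  barplotClst.mca(m, h, clstnb = 1, cat = "c_income")
}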