You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

447 lines 14 KiB Raw Permalink Blame History

 ```# https://www.r-bloggers.com/2016/07/round-values-while-preserve-their-rounded-sum-in-r/ ``` ```# E.G. ``` ```# > sum(c(0.333, 0.333, 0.334)) ``` ```# [1] 1 ``` ```# > sum(round(c(0.333, 0.333, 0.334), 2)) ``` ```# [1] 0.99 ``` ```# > sum(round_preserve_sum(c(0.333, 0.333, 0.334), 2)) ``` ```# [1] 1.00 ``` ```round_preserve_sum <- function(x, digits = 0) { ``` ``` up <- 10 ^ digits ``` ``` x <- x * up ``` ``` y <- floor(x) ``` ``` indices <- tail(order(x-y), round(sum(x)) - sum(y)) ``` ``` y[indices] <- y[indices] + 1 ``` ``` y / up ``` ```} ``` ``` ``` ```# `kcuts` cuts variable `x` into `centers` categories with k-means. ``` ```kcuts <- ``` ```function(x, centers) ``` ```{ ``` ``` set.seed(1123) ``` ``` km <- kmeans(x = x, centers = centers) ``` ``` cuts <- unlist(lapply(order(km\$centers), function(clustId) { ``` ``` min(x[km\$cluster == clustId]) })) ``` ``` cuts <- c(cuts, max(x)) ``` ```} ``` ``` ``` ```# ventilate modality `mod` of categorical var `cat` ``` ```# into the remaining modalities according to their relative frequencies. ``` ```ventilate <- ``` ```function(cat, mod) ``` ```{ ``` ``` if (mod == "NA") isna <- TRUE else isna <- FALSE ``` ``` tab <- table(cat) ``` ``` if (isna) ``` ``` { ``` ``` sup_i <- which(is.na(cat)) ``` ``` sup_n <- length(sup_i) ``` ``` act_mod <- tab ``` ``` } ``` ``` else ``` ``` { ``` ``` sup_i <- which(cat == mod) ``` ``` sup_n <- tab[mod] ``` ``` act_mod <- tab[! names(tab) %in% mod] ``` ``` } ``` ``` prob <- act_mod / sum(act_mod) ``` ``` smpl <- sample(names(prob), size = sup_n, replace = TRUE, prob = prob) ``` ``` r <- list(sup_mod = mod, ``` ``` sup_n = sup_n, ``` ``` sup_i = sup_i, ``` ``` smpl = smpl) ``` ``` return(r) ``` ```} ``` ``` ``` ```# pre-processing for exploratory analytics of the housing dataset ``` ``` ``` ```datasetHousing.mca <- ``` ```function(dat=NULL) ``` ```{ ``` ``` # loading dataset in memory ``` ``` if(is.null(dat)) { dat <- read.csv(file="data/housing.csv", header=TRUE) } ``` ``` ``` ``` # transform ocean_proximity into a factor ``` ``` # a factor being R representation of categorical variables ``` ``` dat\$ocean_proximity <- as.factor(dat\$ocean_proximity) ``` ``` levels(dat\$ocean_proximity)[levels(dat\$ocean_proximity)=="<1H OCEAN"] <- "O:<1H" ``` ``` levels(dat\$ocean_proximity)[levels(dat\$ocean_proximity)=="ISLAND"] <- "O:ISL" ``` ``` levels(dat\$ocean_proximity)[levels(dat\$ocean_proximity)=="INLAND"] <- "O:INL" ``` ``` levels(dat\$ocean_proximity)[levels(dat\$ocean_proximity)=="NEAR BAY"] <- "O:NB" ``` ``` levels(dat\$ocean_proximity)[levels(dat\$ocean_proximity)=="NEAR OCEAN"] <- "O:NO" ``` ``` ``` ``` # discretization of longitude ``` ``` cuts <- kcuts(x = dat\$longitude, centers = 4) ``` ``` dat\$c_longitude <- cut(x = dat\$longitude, unique(cuts), include.lowest = TRUE) ``` ``` levels(dat\$c_longitude) <- c('LO:W', 'LO:MW', 'LO:ME', 'LO:E') ``` ``` ``` ``` # discretization of latitude ``` ``` cuts <- kcuts(x = dat\$latitude, centers = 4) ``` ``` dat\$c_latitude <- cut(x = dat\$latitude, unique(cuts), include.lowest = TRUE) ``` ``` levels(dat\$c_latitude) <- c('LA:S','LA:MS','LA:MN','LA:N') ``` ``` ``` ``` # discretization of housing_median_age ``` ``` cuts <- c(min(dat\$housing_median_age), 15, 25, 35, 51, 52) ``` ``` dat\$c_age <- cut(x = dat\$housing_median_age, unique(cuts), ``` ``` include.lowest = TRUE) ``` ``` levels(dat\$c_age) <- c('AG:15]','AG:25]','AG:35]','AG:51]', 'AG:52') ``` ``` ``` ``` # creation and discretization of rooms ``` ``` dat\$rooms <- dat\$total_rooms / dat\$households ``` ``` cuts <- c(min(dat\$rooms), 4, 6, 8, max(dat\$rooms)) ``` ``` dat\$c_rooms <- cut(x = dat\$rooms, unique(cuts), include.lowest = TRUE) ``` ``` levels(dat\$c_rooms) <- c('RO:4]','RO:6]','RO:8]', 'RO:>8') ``` ``` ``` ``` # creation and discretization of bedrooms ``` ``` dat\$bedrooms <- dat\$total_bedrooms / dat\$households ``` ``` cuts <- c(min(dat\$bedrooms, na.rm = TRUE), 1.1, max(dat\$bedrooms, na.rm = TRUE)) ``` ``` dat\$c_bedrooms <- cut(x = dat\$bedrooms, unique(cuts), include.lowest = TRUE) ``` ``` levels(dat\$c_bedrooms) <- c('BE:1]','BE:>1') ``` ``` ``` ``` # creation and discretization of pop ``` ``` dat\$pop <- dat\$population / dat\$households ``` ``` cuts <- c(min(dat\$pop), 2, 3, 4, max(dat\$pop)) ``` ``` dat\$c_pop <- cut(x = dat\$pop, unique(cuts), include.lowest = TRUE) ``` ``` levels(dat\$c_pop) <- c('PO:2]','PO:3]', 'PO:4]', 'PO:>4') ``` ``` ``` ``` # discretization of households ``` ``` cuts <- c(min(dat\$households), 300, 400, 600, max(dat\$households)) ``` ``` dat\$c_households <- cut(x = dat\$households, cuts, include.lowest = TRUE) ``` ``` levels(dat\$c_households) <- c('HH:3]', 'HH:4]', 'HH:6]', 'HH:>6') ``` ``` ``` ``` # discretization of median_income ``` ``` cuts <- quantile(dat\$median_income, probs = seq(0,1,1/4)) ``` ``` cuts <- c(cuts[1:length(cuts)-1], 15, max(dat\$median_income)) ``` ``` dat\$c_income <- cut(x = dat\$median_income, cuts, include.lowest = TRUE) ``` ``` levels(dat\$c_income) <- c('IC:L', 'IC:ML', 'IC:MH', 'IC:H', 'IC:>15') ``` ``` ``` ``` # discretization of median_house_value ``` ``` cuts <- c(min(dat\$median_house_value), 115000, 175000, 250000, 500000, ``` ``` max(dat\$median_house_value)) ``` ``` dat\$c_house_value <- cut(x = dat\$median_house_value, cuts, ``` ``` include.lowest = TRUE) ``` ``` levels(dat\$c_house_value) <- c('HV:A', 'HV:B', 'HV:C', 'HV:D', 'HV:E') ``` ``` ``` ``` # discretized version of the entire dataset ``` ``` dat <- dat[c('ocean_proximity', 'c_longitude', 'c_latitude', 'c_age', ``` ``` 'c_rooms', 'c_bedrooms', 'c_pop', 'c_households', 'c_income', ``` ``` 'c_house_value')] ``` ``` vent <- list() ``` ``` # ventilation of modality RO:>8 of variable c_rooms ``` ``` vent\$c_rooms <- ventilate(dat\$c_rooms, "RO:>8") ``` ``` dat\$c_rooms[vent\$c_rooms\$sup_i] <- vent\$c_rooms\$smpl ``` ``` ``` ``` # ventilation of modality IC:>15 of variable c_income ``` ``` vent\$c_income <- ventilate(dat\$c_income, "IC:>15") ``` ``` dat\$c_income[vent\$c_income\$sup_i] <- vent\$c_income\$smpl ``` ``` ``` ``` # ventilation of modality O:ISL of variable ocean_proximity ``` ``` vent\$ocean_proximity <- ventilate(dat\$ocean_proximity, "O:ISL") ``` ``` dat\$ocean_proximity[vent\$ocean_proximity\$sup_i] <- vent\$ocean_proximity\$smpl ``` ``` ``` ``` # ventilation of missing values of variable c_bedrooms ``` ``` vent\$c_bedrooms <- ventilate(dat\$c_bedrooms, "NA") ``` ``` dat\$c_bedrooms[vent\$c_bedrooms\$sup_i] <- vent\$c_bedrooms\$smpl ``` ``` ``` ``` # removal of empty modalities following the various ventilations ``` ``` dat <- droplevels(dat) ``` ``` ``` ``` # c_house_value, the target, is considered as a supplementary variable ``` ``` supvar <- which(names(dat) == "c_house_value") ``` ``` ``` ``` r <- list( vent=vent, dat=dat, supvar=supvar ) ``` ``` ``` ``` return(r) ``` ```} ``` ``` ``` ```# one hot encoding of a dataframe made of categorical variables ``` ```onehot_enc <- ``` ```function(dat.cat) ``` ```{ ``` ``` if(!all(as.logical(lapply(dat.cat, is.factor)))) { ``` ``` warning("All variables must be factors.") ``` ``` return(invisible(NULL)) ``` ``` } ``` ``` lev_n <- unlist(lapply(dat.cat, nlevels)) ``` ``` n <- cumsum(lev_n) ``` ``` J_t <- sum(lev_n) ``` ``` Q_t <- dim(dat.cat)[2] ``` ``` I <- dim(dat.cat)[1] ``` ``` Z <- matrix(0, nrow = I, ncol = J_t) ``` ``` numdat <- lapply(dat.cat, as.numeric) ``` ``` offset <- c(0, n[-length(n)]) ``` ``` for (i in 1:Q_t) Z[1:I + (I * (offset[i] + numdat[[i]] - 1))] <- 1 ``` ``` dimnames(Z)[[1]] <- as.character(1:I) ``` ``` dimnames(Z)[[2]] <- as.character(unlist(lapply(dat.cat, levels))) ``` ``` return(Z) ``` ```} ``` ``` ``` ```mca <- ``` ```function(dat, nclst = 100) ``` ```{ ``` ``` # Indicator matrix ``` ``` Z <- onehot_enc(dat\$dat) ``` ``` lev_n <- unlist(lapply(dat\$dat, nlevels)) ``` ``` n <- cumsum(lev_n) ``` ``` Z_sup_min <- n[dat\$supvar[1] - 1] + 1 ``` ``` Z_sup_max <- n[dat\$supvar[length(dat\$supvar)]] ``` ``` Z_sup_ind <- Z_sup_min : Z_sup_max ``` ``` Z_act <- Z[,-Z_sup_ind] ``` ``` J <- dim(Z_act)[2] ``` ``` Q <- dim(dat\$dat)[2] - length(dat\$supvar) ``` ``` ``` ``` # Burt matrix ``` ``` B <- t(Z_act) %*% Z_act ``` ``` ``` ``` # Decomposition of inertia ``` ``` P <- B / sum(B) ``` ``` r <- apply(P, 2, sum) ``` ``` rr <- r %*% t(r) ``` ``` S <- (P - rr) / sqrt(rr) ``` ``` dec <- eigen(S) ``` ``` # Last Q eigen value must be zero ``` ``` K <- J - Q ``` ``` sv <- dec\$values[1 : K] ``` ``` ``` ``` # standard coordinates ``` ``` a <- sweep(dec\$vectors, 1, sqrt(r), FUN = "/") ``` ``` a <- a[,(1 : K)] ``` ``` ``` ``` # principal coordinates ``` ``` f <- a %*% diag(sv) ``` ``` ``` ``` fac_names <- paste("F", paste(1 : K), sep = "") ``` ``` rownames(a) <- colnames(Z_act) ``` ``` colnames(a) <- fac_names ``` ``` rownames(f) <- colnames(Z_act) ``` ``` colnames(f) <- fac_names ``` ``` ``` ``` # contributions ``` ``` temp <- sweep(f^2, 1, r, FUN = "*") ``` ``` sum_ctr <- apply(temp, 2, sum) ``` ``` ctr <- sweep(temp, 2, sum_ctr, FUN = "/") ``` ``` ``` ``` # correlations ``` ``` temp <- f^2 ``` ``` sum_cor <- apply(temp, 1, sum) ``` ``` cor <- sweep(temp, 1, sum_cor, FUN="/") ``` ``` ``` ``` # profiles of "supplementary" individuals ``` ``` si <- sweep(Z_act, 1, apply(Z_act, 1, sum), FUN = "/") ``` ``` fsi <- si %*% a ``` ``` temp <- fsi^2 ``` ``` sum_cor <- apply(temp, 1, sum) ``` ``` sicor <- sweep(temp, 1, sum_cor, FUN="/") ``` ``` ``` ``` # profiles of "supplementary" variables ``` ``` Z_sup <- Z[,Z_sup_ind] ``` ``` sj <- t(Z_sup) %*% Z_act ``` ``` sj <- sweep(sj, 1, apply(sj, 1, sum), FUN = "/") ``` ``` fsj <- sj %*% a ``` ``` temp <- fsj^2 ``` ``` sum_cor <- apply(temp, 1, sum) ``` ``` sjcor <- sweep(temp, 1, sum_cor, FUN="/") ``` ``` ``` ``` # clusters of individuals ``` ``` # Keep enough singular values to capture at least 90% of variance ``` ``` nsv <- which(cumsum(sv / sum(sv)) > 0.9)[1] ``` ``` nstart <- 25 ``` ``` set.seed(1123) ``` ``` clsti <- kmeans(fsi[,1:nsv], nclst, nstart) ``` ``` # Principal coordinates of clusters' centers ``` ``` fclst <- clsti\$centers ``` ``` ``` ``` r <- list(f=f, # principal coordinates ``` ``` ctr=ctr, # contributions of the modalities ``` ``` # to the principal axes ``` ``` cor=cor, # correlations of the modalities ``` ``` # with the principal axes ``` ``` r=r, # marginal profile of the modalities ``` ``` sv=sv, # K (=J-Q) singular values ``` ``` fsi=fsi, # principal coordinates of the individuals ``` ``` sicor=sicor, # correlations of the individuals ``` ``` # with the principal axes ``` ``` fsj=fsj, # principal coordinates of the modalities ``` ``` # of the supplementary variables ``` ``` sjcor=sjcor, # correlations of the supplementary variables ``` ``` # with the principal axes ``` ``` clsti=clsti) # kmeans clustering of the individuals ``` ``` class(r) <- "mca" ``` ``` return(r) ``` ```} ``` ``` ``` ```# compute clusters' correlations with factors ``` ```clstcor.mca <- ``` ```function(o) ``` ```{ ``` ``` if(class(o) != "mca") { ``` ``` warning("Object is not of class 'mca'") ``` ``` return(invisible(NULL)) ``` ``` } ``` ``` ``` ``` nclst <- length(o\$clsti\$size) ``` ``` ``` ``` # Correlation of clusters' centers and factorial axes ``` ``` temp <- o\$clsti\$centers^2 ``` ``` sum_cor <- apply(temp, 1, sum) ``` ``` clstcor <- sweep(temp, 1, sum_cor, FUN="/") ``` ``` ``` ``` nfact <- 3 ``` ``` ``` ``` # select first nfact most correlated factors for each clst ``` ``` selMostCorFact <- function(i) { ``` ``` temp1 <- (sort(clstcor[i,], decreasing = TRUE))[1:nfact] ``` ``` temp2 <- round_preserve_sum(1000 * temp1) ``` ``` temp3 <- strtoi(sub('.', '', names(temp1))) ``` ``` rbind(temp3, temp2) ``` ``` } ``` ``` ``` ``` tblClstCor <- t(sapply(1:nclst, selMostCorFact)) ``` ``` tblClstCor <- cbind(tblClstCor, o\$clsti\$size) ``` ``` rwithinss <- o\$clsti\$withinss / o\$clsti\$size # within sum of squares ``` ``` # relative to cluster size ``` ``` clstqlty <- round_preserve_sum(1000 * rwithinss / sum(rwithinss)) ``` ``` tblClstCor <- cbind(tblClstCor, clstqlty) ``` ``` rownames(tblClstCor) <- paste0('clst-', 1:nclst) ``` ``` colnames(tblClstCor) <- c( rep(c("F", "COR"), nfact), "SIZE", "QLTY" ) ``` ``` ``` ``` tblClstCor[order(tblClstCor[,1], tblClstCor[,3], tblClstCor[,5]),] ``` ```} ``` ``` ``` ```textSize.mca <- ``` ```function(o, d1 = NULL, d2 = NULL) ``` ```{ ``` ``` if(class(o) != "mca") { ``` ``` warning("Object is not of class 'mca'") ``` ``` return(invisible(NULL)) ``` ``` } ``` ``` if(is.null(d1)) d1<-1 ``` ``` if(is.null(d2)) d2<-2 ``` ``` ``` ``` # proportion of inertia of factorial plan d1-d2 explained by each profile ``` ``` cont <- o\$r * (o\$f[,d1]^2 + o\$f[,d2]^2) / (o\$sv[d1]^2 + o\$sv[d2]^2) ``` ``` names <- rownames(o\$f) ``` ``` names[cont < 0.05] <- "." ``` ``` optimPar <- nonlinearFontSize.mca(cont) ``` ``` sizes <- log(1 + exp(optimPar[1]) * cont^optimPar[2]) ``` ``` sizes[cont < 0.01] <- 1 ``` ``` r <- list(names=names, sizes=sizes) ``` ``` return(r) ``` ```} ``` ``` ``` ```plot.mca <- ``` ```function(o, d1 = NULL, d2 = NULL) ``` ```{ ``` ``` if(class(o) != "mca") { ``` ``` warning("Object is not of class 'mca'") ``` ``` UseMethod("plot") ``` ``` return(invisible(NULL)) ``` ``` } ``` ``` if(is.null(d1)) d1<-1 ``` ``` if(is.null(d2)) d2<-2 ``` ``` ``` ``` ns <- textSize.mca(o, d1, d2) # names and sizes ``` ``` ``` ``` plot(o\$f[,d1], o\$f[,d2], type = "n", ``` ``` xlab="", ylab="", asp = 1, xaxt = "n", yaxt = "n") ``` ``` text(o\$f[,d1], o\$f[,d2], ns\$names, adj = 0, cex = ns\$sizes, ``` ``` col = 'blue', font = 2) ``` ``` points(0, 0, pch = 3) ``` ```} ``` ``` ``` ```plotjsup.mca <- ``` ```function(o, d1 = NULL, d2 = NULL) ``` ```{ ``` ``` if(class(o) != "mca") { ``` ``` warning("Object is not of class 'mca'") ``` ``` return(invisible(NULL)) ``` ``` } ``` ``` if(is.null(d1)) d1<-1 ``` ``` if(is.null(d2)) d2<-2 ``` ``` ``` ``` ns <- textSize.mca(o, d1, d2) # names and sizes ``` ``` ``` ``` plot(c(o\$f[,d1], o\$fsj[,d1]), c(o\$f[,d2], o\$fsj[,d2]), type = "n", ``` ``` xlab="", ylab="", asp = 1, xaxt = "n", yaxt = "n") ``` ``` text(o\$f[,d1], o\$f[,d2], ns\$names, adj = 0, cex = ns\$sizes, ``` ``` col = 'blue', font = 2) ``` ``` text(o\$fsj[,d1], o\$fsj[,d2], rownames(o\$fsj), adj = 0, ``` ``` font = 4, cex = 0.7, col = 'red') ``` ``` points(0, 0, pch = 3) ``` ```} ``` ``` ``` ```plotisupjsup.mca <- ``` ```function(o, d1 = NULL, d2 = NULL) ``` ```{ ``` ``` if(class(o) != "mca") { ``` ``` warning("Object is not of class 'mca'") ``` ``` return(invisible(NULL)) ``` ``` } ``` ``` if(is.null(d1)) d1<-1 ``` ``` if(is.null(d2)) d2<-2 ``` ``` ``` ``` nsm <- textSize.mca(o, d1, d2) # names and sizes for modalities ``` ``` nsi <- textSizeClst.mca(o, d1, d2) # names and sizes for clust of individuals ``` ``` ``` ``` plot(c(o\$f[,d1], o\$fsj[,d1], o\$clsti\$centers[,d1]), ``` ``` c(o\$f[,d2], o\$fsj[,d2], o\$clsti\$centers[,d2]), ``` ``` type = "n", xlab="", ylab="", asp = 1, xaxt = "n", yaxt = "n") ``` ``` text(o\$f[,d1], o\$f[,d2], nsm\$names, adj = 0, cex = nsm\$sizes, ``` ``` col = 'blue', font = 2) ``` ``` text(o\$fsj[,d1], o\$fsj[,d2], rownames(o\$fsj), adj = 0, font = 4, ``` ``` cex = 0.7, col = 'red') ``` ``` text(o\$clsti\$centers[,d1], o\$clsti\$centers[,d2], ``` ``` nsi\$names, adj = 0, cex = nsi\$sizes, font = 3) ``` ``` points(0, 0, pch = 3) ``` ```} ``` ``` ``` ```textSizeClst.mca <- ``` ```function(o, d1 = NULL, d2 = NULL) ``` ```{ ``` ``` if(class(o) != "mca") { ``` ``` warning("Object is not of class 'mca'") ``` ``` return(invisible(NULL)) ``` ``` } ``` ``` if(is.null(d1)) d1<-1 ``` ``` if(is.null(d2)) d2<-2 ``` ``` ``` ``` # correlation of each cluster center profile with factors d1+d2 ``` ``` temp <- o\$clsti\$centers^2 ``` ``` sum_cor <- apply(temp, 1, sum) ``` ``` clstcor <- sweep(temp, 1, sum_cor, FUN="/") ``` ``` clstcor <- rowSums(clstcor[,c(d1,d2)]) ``` ``` names <- names(clstcor) ``` ``` names[clstcor < 0.05] <- "x" ``` ``` optimPar <- nonlinearFontSize.mca(clstcor) ``` ``` sizes <- log(1 + exp(optimPar[1]) * clstcor^optimPar[2]) ``` ``` sizes[clstcor < 0.05] <- 1 ``` ``` r <- list(names=names, sizes=sizes) ``` ``` return(r) ``` ```} ``` ``` ``` ```nonlinearFontSize.mca <- ``` ```function(dat, minFontCex = 0.3, maxFontCex = 1) ``` ```{ ``` ``` # discover a non-linear transform of data that maps to font sizes ``` ``` # the smallest font factor should be cex ~= minFontCex ``` ``` # the largest font factor should be cex ~= maxFontCex ``` ``` optim.err <- function(par) { ``` ``` err <- (minFontCex - log(1 + exp(par[1]) * min(dat)^par[2]))^2 ``` ``` err <- err + (maxFontCex - log(1 + exp(par[1]) * max(dat)^par[2]))^2 ``` ``` return(err) ``` ``` } ``` ``` optim.res <- optim(par = c(1,0.3), fn = optim.err) ``` ``` return(optim.res\$par) ``` ```} ``` ``` ``` ```# barplot of categorical variable `cat` ``` ```# for individuals of cluster `clstnb` ``` ```# `o` is the correspondence analysis model ``` ```# `dat` is the dataset ``` ```barplotClst.mca <- ``` ```function(o, dat, clstnb, cat) ``` ```{ ``` ``` barplot(table(dat\$dat[which(o\$clsti\$cluster == clstnb), cat])) ``` ```} ```