# Round values while preserving their rounded sum (largest-remainder method).
# https://www.r-bloggers.com/2016/07/round-values-while-preserve-their-rounded-sum-in-r/
# E.g.
# > sum(c(0.333, 0.333, 0.334))
# [1] 1
# > sum(round(c(0.333, 0.333, 0.334), 2))
# [1] 0.99
# > sum(round_preserve_sum(c(0.333, 0.333, 0.334), 2))
# [1] 1
round_preserve_sum <- function(x, digits = 0) {
    up <- 10 ^ digits
    x <- x * up
    y <- floor(x)
    # give the missing units to the values with the largest fractional parts
    indices <- tail(order(x - y), round(sum(x)) - sum(y))
    y[indices] <- y[indices] + 1
    y / up
}

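# Illustrative walk-through of the largest-remainder idea (not part of the
# original script). Kept inert behind `if (FALSE)`.
if (FALSE) {
    x <- c(0.333, 0.333, 0.334) * 100       # work in the target precision
    floor(x)                                # 33 33 33: one unit is missing
    # the missing unit goes to the value with the largest fractional part
    round_preserve_sum(c(0.333, 0.333, 0.334), 2)   # 0.33 0.33 0.34
}
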
# `kcuts` cuts variable `x` into `centers` categories with k-means:
# it returns the break points (the minimum of each cluster, plus max(x)),
# suitable for `cut()`.
kcuts <-
function(x, centers)
{
    set.seed(1123)
    km <- kmeans(x = x, centers = centers)
    # lower bound of each cluster, clusters taken in increasing order of their centers
    cuts <- unlist(lapply(order(km$centers), function(clustId) {
        min(x[km$cluster == clustId]) }))
    c(cuts, max(x))
}

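# Illustrative usage sketch for `kcuts` (not part of the original script);
# the vector `x` below is hypothetical. Kept inert behind `if (FALSE)`.
if (FALSE) {
    x <- c(rnorm(50, 0), rnorm(50, 5), rnorm(50, 10))
    breaks <- kcuts(x = x, centers = 3)
    # `unique()` guards against duplicated break points, as in makeDatasetHousing below
    table(cut(x, unique(breaks), include.lowest = TRUE))
}
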
# Ventilate modality `mod` of categorical variable `cat`
# into the remaining modalities according to their relative frequencies.
# The special value mod = "NA" ventilates the missing values.
ventilate <-
function(cat, mod)
{
    isna <- identical(mod, "NA")
    tab <- table(cat)
    if (isna)
    {
        sup_i <- which(is.na(cat))
        sup_n <- length(sup_i)
        act_mod <- tab
    }
    else
    {
        sup_i <- which(cat == mod)
        sup_n <- tab[mod]
        act_mod <- tab[! names(tab) %in% mod]
    }
    prob <- act_mod / sum(act_mod)
    smpl <- sample(names(prob), size = sup_n, replace = TRUE, prob = prob)
    r <- list(sup_mod = mod,
              sup_n = sup_n,
              sup_i = sup_i,
              smpl = smpl)
    return(r)
}

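# Illustrative usage sketch for `ventilate` (not part of the original script);
# the factor `f` below is hypothetical. Kept inert behind `if (FALSE)`.
if (FALSE) {
    f <- factor(c(rep("A", 50), rep("B", 30), rep("RARE", 5)))
    v <- ventilate(f, "RARE")
    f[v$sup_i] <- v$smpl   # reassign the rare modality
    f <- droplevels(f)     # then drop the now-empty level
    table(f)
}
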
# Pre-processing for exploratory analytics of the housing dataset.

makeDatasetHousing <-
function()
{
    # load the dataset
    dat <- read.csv(file = "data/housing.csv", header = TRUE)

    # turn ocean_proximity into a factor with shorter level labels
    dat$ocean_proximity <- as.factor(dat$ocean_proximity)
    levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="<1H OCEAN"] <- "O:<1H"
    levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="ISLAND"] <- "O:ISL"
    levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="INLAND"] <- "O:INL"
    levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="NEAR BAY"] <- "O:NB"
    levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="NEAR OCEAN"] <- "O:NO"

    # discretize longitude
    cuts <- kcuts(x = dat$longitude, centers = 4)
    dat$c_longitude <- cut(x = dat$longitude, unique(cuts), include.lowest = TRUE)
    levels(dat$c_longitude) <- c('LO:W', 'LO:MW', 'LO:ME', 'LO:E')

    # discretize latitude
    cuts <- kcuts(x = dat$latitude, centers = 4)
    dat$c_latitude <- cut(x = dat$latitude, unique(cuts), include.lowest = TRUE)
    levels(dat$c_latitude) <- c('LA:S','LA:MS','LA:MN','LA:N')

    # discretize housing_median_age
    cuts <- c(min(dat$housing_median_age), 15, 25, 35, 51, 52)
    dat$c_age <- cut(x = dat$housing_median_age, unique(cuts), include.lowest = TRUE)
    levels(dat$c_age) <- c('AG:15]','AG:25]','AG:35]','AG:51]', 'AG:52')

    # create and discretize rooms (rooms per household)
    dat$rooms <- dat$total_rooms / dat$households
    cuts <- c(min(dat$rooms), 4, 6, 8, max(dat$rooms))
    dat$c_rooms <- cut(x = dat$rooms, unique(cuts), include.lowest = TRUE)
    levels(dat$c_rooms) <- c('RO:4]','RO:6]','RO:8]', 'RO:>8')

    # create and discretize bedrooms (bedrooms per household)
    dat$bedrooms <- dat$total_bedrooms / dat$households
    cuts <- c(min(dat$bedrooms, na.rm = TRUE), 1.1, max(dat$bedrooms, na.rm = TRUE))
    dat$c_bedrooms <- cut(x = dat$bedrooms, unique(cuts), include.lowest = TRUE)
    levels(dat$c_bedrooms) <- c('BE:1]','BE:>1')

    # create and discretize pop (population per household)
    dat$pop <- dat$population / dat$households
    cuts <- c(min(dat$pop), 2, 3, 4, max(dat$pop))
    dat$c_pop <- cut(x = dat$pop, unique(cuts), include.lowest = TRUE)
    levels(dat$c_pop) <- c('PO:2]','PO:3]', 'PO:4]', 'PO:>4')

    # discretize households
    cuts <- c(min(dat$households), 300, 400, 600, max(dat$households))
    dat$c_households <- cut(x = dat$households, cuts, include.lowest = TRUE)
    levels(dat$c_households) <- c('HH:3]', 'HH:4]', 'HH:6]', 'HH:>6')

    # discretize median_income (quartiles, with a separate category above 15)
    cuts <- quantile(dat$median_income, probs = seq(0, 1, 1/4))
    cuts <- c(cuts[-length(cuts)], 15, max(dat$median_income))
    dat$c_income <- cut(x = dat$median_income, cuts, include.lowest = TRUE)
    levels(dat$c_income) <- c('IC:L', 'IC:ML', 'IC:MH', 'IC:H', 'IC:>15')

    # discretize median_house_value
    cuts <- c(min(dat$median_house_value), 115000, 175000, 250000, 500000,
              max(dat$median_house_value))
    dat$c_house_value <- cut(x = dat$median_house_value, cuts, include.lowest = TRUE)
    levels(dat$c_house_value) <- c('HV:A', 'HV:B', 'HV:C', 'HV:D', 'HV:E')

    # build the discretized dataset
    dat.all <- dat
    dat <- dat.all[c('ocean_proximity', 'c_longitude', 'c_latitude', 'c_age',
                     'c_rooms', 'c_bedrooms', 'c_pop', 'c_households', 'c_income',
                     'c_house_value')]
    vent <- list()
    # ventilate modality RO:>8 of c_rooms
    vent$c_rooms <- ventilate(dat$c_rooms, "RO:>8")
    dat$c_rooms[vent$c_rooms$sup_i] <- vent$c_rooms$smpl

    # ventilate modality IC:>15 of c_income
    vent$c_income <- ventilate(dat$c_income, "IC:>15")
    dat$c_income[vent$c_income$sup_i] <- vent$c_income$smpl

    # ventilate modality O:ISL of ocean_proximity
    vent$ocean_proximity <- ventilate(dat$ocean_proximity, "O:ISL")
    dat$ocean_proximity[vent$ocean_proximity$sup_i] <- vent$ocean_proximity$smpl

    # ventilate the missing values of c_bedrooms
    vent$c_bedrooms <- ventilate(dat$c_bedrooms, "NA")
    dat$c_bedrooms[vent$c_bedrooms$sup_i] <- vent$c_bedrooms$smpl

    # drop the modalities left empty after ventilation
    dat <- droplevels(dat)

    # mark c_house_value as a supplementary variable
    supind <- which(names(dat) == "c_house_value")

    r <- list(vent = vent, dat = dat, supind = supind)

    return(r)
}

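# Illustrative usage sketch (not part of the original script); assumes the
# housing file is available at data/housing.csv. Kept inert behind `if (FALSE)`.
if (FALSE) {
    housing <- makeDatasetHousing()
    str(housing$dat)                      # the discretized factors
    housing$supind                        # index of the supplementary variable
    sapply(housing$vent, `[[`, "sup_n")   # how many observations were ventilated
}
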
# Multiple correspondence analysis (Burt-matrix approach) of the discretized
# dataset; `dat` is the list returned by makeDatasetHousing(),
# `nclst` the number of k-means clusters of individuals.
mca <-
function(dat, nclst = 100)
{
    lev_n <- unlist(lapply(dat$dat, nlevels))
    n <- cumsum(lev_n)
    J_t <- sum(lev_n)        # total number of modalities (active + supplementary)
    Q_t <- dim(dat$dat)[2]   # total number of variables
    I <- dim(dat$dat)[1]     # number of individuals

    # Indicator matrix
    Z <- matrix(0, nrow = I, ncol = J_t)
    numdat <- lapply(dat$dat, as.numeric)
    offset <- c(0, n[-length(n)])
    for (i in 1:Q_t)
        Z[1:I + (I * (offset[i] + numdat[[i]] - 1))] <- 1
    dimnames(Z)[[1]] <- as.character(1:I)
    dimnames(Z)[[2]] <- as.character(unlist(lapply(dat$dat, levels)))
    # columns of the supplementary variable(s)
    Z_sup_min <- n[dat$supind[1] - 1] + 1
    Z_sup_max <- n[dat$supind[length(dat$supind)]]
    Z_sup_ind <- Z_sup_min : Z_sup_max
    Z_act <- Z[, -Z_sup_ind]
    J <- dim(Z_act)[2]
    Q <- dim(dat$dat)[2] - length(dat$supind)

    # Burt matrix
    B <- t(Z_act) %*% Z_act

    # Decomposition of inertia: eigendecomposition of the standardized
    # residuals S = (P - r r') / sqrt(r r'), where P is the Burt matrix
    # scaled to sum 1 and r its (identical row and column) masses
    P <- B / sum(B)
    r <- apply(P, 2, sum)
    rr <- r %*% t(r)
    S <- (P - rr) / sqrt(rr)
    dec <- eigen(S)
    # the last Q eigenvalues must be (numerically) zero
    K <- J - Q
    sv <- dec$values[1 : K]

    # standard coordinates
    a <- sweep(dec$vectors, 1, sqrt(r), FUN = "/")
    a <- a[, (1 : K)]

    # principal coordinates
    f <- a %*% diag(sv)

    fac_names <- paste("F", paste(1 : K), sep = "")
    rownames(a) <- colnames(Z_act)
    colnames(a) <- fac_names
    rownames(f) <- colnames(Z_act)
    colnames(f) <- fac_names

    # contributions of the modalities to each factor
    temp <- sweep(f^2, 1, r, FUN = "*")
    sum_ctr <- apply(temp, 2, sum)
    ctr <- sweep(temp, 2, sum_ctr, FUN = "/")

    # correlations (squared cosines) of the modalities with the factors
    temp <- f^2
    sum_cor <- apply(temp, 1, sum)
    cor <- sweep(temp, 1, sum_cor, FUN = "/")

    # profiles of "supplementary" individuals
    si <- sweep(Z_act, 1, apply(Z_act, 1, sum), FUN = "/")
    fsi <- si %*% a
    temp <- fsi^2
    sum_cor <- apply(temp, 1, sum)
    sicor <- sweep(temp, 1, sum_cor, FUN = "/")

    # profiles of "supplementary" variables
    Z_sup <- Z[, Z_sup_ind]
    sj <- t(Z_sup) %*% Z_act
    sj <- sweep(sj, 1, apply(sj, 1, sum), FUN = "/")
    fsj <- sj %*% a
    temp <- fsj^2
    sum_cor <- apply(temp, 1, sum)
    sjcor <- sweep(temp, 1, sum_cor, FUN = "/")

    # clusters of individuals
    # keep enough singular values to capture at least 90% of the variance
    nsv <- which(cumsum(sv / sum(sv)) > 0.9)[1]
    nstart <- 25
    set.seed(1123)
    clsti <- kmeans(fsi[, 1:nsv], centers = nclst, nstart = nstart)
    # principal coordinates of the clusters' centers
    fclst <- clsti$centers

    r <- list(f = f, ctr = ctr, cor = cor, r = r, sv = sv,
              fsi = fsi, sicor = sicor, fsj = fsj, sjcor = sjcor,
              clsti = clsti)
    class(r) <- "mca"
    return(r)
}

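# Illustrative usage sketch (not part of the original script); assumes
# makeDatasetHousing() above succeeded. Kept inert behind `if (FALSE)`.
if (FALSE) {
    housing <- makeDatasetHousing()
    m <- mca(housing, nclst = 100)
    round(100 * m$sv / sum(m$sv), 1)   # share of inertia per factor (percent)
    head(m$f[, 1:2])                   # principal coordinates of the modalities
}
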
# Compute the clusters' correlations with the factors
clstcor.mca <-
function(o)
{
    if (!inherits(o, "mca")) {
        warning("Object is not of class 'mca'")
        return(invisible(NULL))
    }

    nclst <- length(o$clsti$size)

    # correlation (squared cosine) of the clusters with the factorial axes
    temp <- o$clsti$centers^2
    sum_cor <- apply(temp, 1, sum)
    clstcor <- sweep(temp, 1, sum_cor, FUN = "/")

    nfact <- 3

    # select the nfact most correlated factors for each cluster
    selMostCorFact <- function(i) {
        temp1 <- (sort(clstcor[i, ], decreasing = TRUE))[1:nfact]
        temp2 <- round_preserve_sum(1000 * temp1)      # correlations in per mille
        temp3 <- strtoi(sub('^F', '', names(temp1)))   # factor numbers
        rbind(temp3, temp2)
    }

    tblClstCor <- t(sapply(1:nclst, selMostCorFact))
    tblClstCor <- cbind(tblClstCor, o$clsti$size)
    rwithinss <- o$clsti$withinss / o$clsti$size   # withinss relative to the cluster size
    # per-mille share of the size-normalized within-cluster inertia (lower = tighter cluster)
    clstqlty <- round_preserve_sum(1000 * rwithinss / sum(rwithinss))
    tblClstCor <- cbind(tblClstCor, clstqlty)
    rownames(tblClstCor) <- paste0('clst-', 1:nclst)
    colnames(tblClstCor) <- c(rep(c("F", "COR"), nfact), "SIZE", "QLTY")

    tblClstCor[order(tblClstCor[, 1], tblClstCor[, 3], tblClstCor[, 5]), ]
}

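# Illustrative usage sketch (not part of the original script); `m` is the
# hypothetical result of mca() above. Kept inert behind `if (FALSE)`.
if (FALSE) {
    head(clstcor.mca(m))   # clusters sorted by their most correlated factors
}
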
# Label names and font sizes for the modalities, weighted by their
# contribution to the d1-d2 factorial plane
textSize.mca <-
function(o, d1 = NULL, d2 = NULL)
{
    if (!inherits(o, "mca")) {
        warning("Object is not of class 'mca'")
        return(invisible(NULL))
    }
    if (is.null(d1)) d1 <- 1
    if (is.null(d2)) d2 <- 2

    # share of the inertia of the d1-d2 factorial plane explained by each profile
    cont <- o$r * (o$f[, d1]^2 + o$f[, d2]^2) / (o$sv[d1]^2 + o$sv[d2]^2)
    names <- rownames(o$f)
    names[cont < 0.01] <- "."
    optimPar <- nonlinearFontSize.mca(cont)
    sizes <- log(1 + exp(optimPar[1]) * cont^optimPar[2])
    sizes[cont < 0.01] <- 1
    r <- list(names = names, sizes = sizes)
    return(r)
}

# Plot the active modalities on the d1-d2 factorial plane
plot.mca <-
function(o, d1 = NULL, d2 = NULL)
{
    if (!inherits(o, "mca")) {
        warning("Object is not of class 'mca'")
        return(invisible(NULL))
    }
    if (is.null(d1)) d1 <- 1
    if (is.null(d2)) d2 <- 2

    ns <- textSize.mca(o, d1, d2)   # names and sizes

    plot(o$f[, d1], o$f[, d2], type = "n",
         xlab = "", ylab = "", asp = 1, xaxt = "n", yaxt = "n")
    text(o$f[, d1], o$f[, d2], ns$names, adj = 0, cex = ns$sizes, col = 'blue', font = 2)
    points(0, 0, pch = 3)
}

# Plot the active and supplementary modalities on the d1-d2 factorial plane
plotjsup.mca <-
function(o, d1 = NULL, d2 = NULL)
{
    if (!inherits(o, "mca")) {
        warning("Object is not of class 'mca'")
        return(invisible(NULL))
    }
    if (is.null(d1)) d1 <- 1
    if (is.null(d2)) d2 <- 2

    ns <- textSize.mca(o, d1, d2)   # names and sizes

    plot(c(o$f[, d1], o$fsj[, d1]), c(o$f[, d2], o$fsj[, d2]), type = "n",
         xlab = "", ylab = "", asp = 1, xaxt = "n", yaxt = "n")
    text(o$f[, d1], o$f[, d2], ns$names, adj = 0, cex = ns$sizes,
         col = 'blue', font = 2)
    text(o$fsj[, d1], o$fsj[, d2], rownames(o$fsj), adj = 0,
         font = 4, cex = 0.7, col = 'red')
    points(0, 0, pch = 3)
}

# Plot the active modalities, the supplementary modalities and the
# clusters of individuals on the d1-d2 factorial plane
plotisupjsup.mca <-
function(o, d1 = NULL, d2 = NULL)
{
    if (!inherits(o, "mca")) {
        warning("Object is not of class 'mca'")
        return(invisible(NULL))
    }
    if (is.null(d1)) d1 <- 1
    if (is.null(d2)) d2 <- 2

    nsm <- textSize.mca(o, d1, d2)      # names and sizes for modalities
    nsi <- textSizeClst.mca(o, d1, d2)  # names and sizes for clusters of individuals

    plot(c(o$f[, d1], o$fsj[, d1], o$clsti$centers[, d1]),
         c(o$f[, d2], o$fsj[, d2], o$clsti$centers[, d2]),
         type = "n", xlab = "", ylab = "", asp = 1, xaxt = "n", yaxt = "n")
    text(o$f[, d1], o$f[, d2], nsm$names, adj = 0, cex = nsm$sizes,
         col = 'blue', font = 2)
    text(o$fsj[, d1], o$fsj[, d2], rownames(o$fsj), adj = 0, font = 4,
         cex = 0.7, col = 'red')
    text(o$clsti$centers[, d1], o$clsti$centers[, d2],
         nsi$names, adj = 0, cex = nsi$sizes, font = 3)
    points(0, 0, pch = 3)
}

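# Illustrative usage sketch (not part of the original script); `m` is the
# hypothetical result of mca() above. Kept inert behind `if (FALSE)`.
if (FALSE) {
    plot(m)                 # active modalities on the first factorial plane
    plotjsup.mca(m, 1, 2)   # same plane, with the supplementary modalities
    plotisupjsup.mca(m)     # plus the clusters of individuals
}
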
# Label names and font sizes for the clusters of individuals, weighted by
# their correlation with the d1-d2 factorial plane
textSizeClst.mca <-
function(o, d1 = NULL, d2 = NULL)
{
    if (!inherits(o, "mca")) {
        warning("Object is not of class 'mca'")
        return(invisible(NULL))
    }
    if (is.null(d1)) d1 <- 1
    if (is.null(d2)) d2 <- 2

    # correlation of each cluster center profile with factors d1 + d2
    temp <- o$clsti$centers^2
    sum_cor <- apply(temp, 1, sum)
    clstcor <- sweep(temp, 1, sum_cor, FUN = "/")
    clstcor <- rowSums(clstcor[, c(d1, d2)])
    names <- names(clstcor)
    names[clstcor < 0.01] <- "x"
    optimPar <- nonlinearFontSize.mca(clstcor)
    sizes <- log(1 + exp(optimPar[1]) * clstcor^optimPar[2])
    sizes[clstcor < 0.01] <- 1
    r <- list(names = names, sizes = sizes)
    return(r)
}

nonlinearFontSize.mca <-
function(dat, minFontCex = 0.3, maxFontCex = 1)
{
    # discover a non-linear transform of the data that maps it to font sizes:
    # the smallest value should map to cex ~= minFontCex,
    # the largest value should map to cex ~= maxFontCex
    optim.err <- function(par) {
        err <- (minFontCex - log(1 + exp(par[1]) * min(dat)^par[2]))^2
        err <- err + (maxFontCex - log(1 + exp(par[1]) * max(dat)^par[2]))^2
        return(err)
    }
    optim.res <- optim(par = c(1, 0.3), fn = optim.err)
    return(optim.res$par)
}

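# Illustrative check of the font-size mapping (not part of the original
# script): the fitted transform log(1 + exp(p1) * x^p2) should send the
# smallest contribution to ~0.3 and the largest to ~1. Kept inert.
if (FALSE) {
    cont <- c(0.01, 0.05, 0.1, 0.3)
    p <- nonlinearFontSize.mca(cont)
    log(1 + exp(p[1]) * range(cont)^p[2])   # approximately c(0.3, 1)
}
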
# Barplot of categorical variable `cat`
# for the individuals of cluster `clstnb`;
# `o` is the correspondence analysis model,
# `dat` is the dataset (the list returned by makeDatasetHousing).
barplotClst.mca <-
function(o, dat, clstnb, cat)
{
    barplot(table(dat$dat[which(o$clsti$cluster == clstnb), cat]))
}

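# Illustrative usage sketch (not part of the original script); `m` and
# `housing` are the hypothetical results of mca() and makeDatasetHousing()
# above. Kept inert behind `if (FALSE)`.
if (FALSE) {
    barplotClst.mca(m, housing, clstnb = 1, cat = "c_income")
}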