intro_to_ml/05_d_svd_mca_code.R

242 lines
7.9 KiB
R

# `kcuts` cuts variable `x` into `centers` categories with k-means.
kcuts <-
function(x, centers)
{
km <- kmeans(x = x, centers = centers)
cuts <- unlist(lapply(order(km$centers), function(clustId) {
min(x[km$cluster == clustId]) }))
cuts <- c(cuts, max(x))
}
# ventilate modality `mod` of categorical var `cat`
# into the remaining modalities according to their relative frequencies.
ventilate <-
function(cat, mod)
{
if (mod == "NA") isna <- TRUE else isna <- FALSE
tab <- table(cat)
if (isna)
{
sup_i <- which(is.na(cat))
sup_n <- length(sup_i)
act_mod <- tab
}
else
{
sup_i <- which(cat == mod)
sup_n <- tab[mod]
act_mod <- tab[! names(tab) %in% mod]
}
prob <- act_mod / sum(act_mod)
smpl <- sample(names(prob), size = sup_n, replace = TRUE, prob = prob)
r <- list(sup_mod = mod,
sup_n = sup_n,
sup_i = sup_i,
smpl = smpl)
return(r)
}
# pre-processing for exploratory analytics of the housing dataset
makeDatasetHousing <-
function()
{
# chargement du jeu de données
dat <- read.csv(file="data/housing.csv", header=TRUE)
# transformation de ocean_proximity en facteur
dat$ocean_proximity <- as.factor(dat$ocean_proximity)
levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="<1H OCEAN"] <- "O:<1H"
levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="ISLAND"] <- "O:ISL"
levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="INLAND"] <- "O:INL"
levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="NEAR BAY"] <- "O:NB"
levels(dat$ocean_proximity)[levels(dat$ocean_proximity)=="NEAR OCEAN"] <- "O:NO"
# quantification de longitude
cuts <- kcuts(x = dat$longitude, centers = 4)
dat$c_longitude <- cut(x = dat$longitude, unique(cuts), include.lowest = TRUE)
levels(dat$c_longitude) <- c('LO:W', 'LO:MW', 'LO:ME', 'LO:E')
# quantification de latitude
cuts <- kcuts(x = dat$latitude, centers = 4)
dat$c_latitude <- cut(x = dat$latitude, unique(cuts), include.lowest = TRUE)
levels(dat$c_latitude) <- c('LA:S','LA:MS','LA:MN','LA:N')
# quantification de housing_median_age
cuts <- c(min(dat$housing_median_age), 15, 25, 35, 51, 52)
dat$c_age <- cut(x = dat$housing_median_age, unique(cuts), include.lowest = TRUE)
levels(dat$c_age) <- c('AG:15]','AG:25]','AG:35]','AG:51]', 'AG:52')
# création et quantification de rooms
dat$rooms <- dat$total_rooms / dat$households
cuts <- c(min(dat$rooms), 4, 6, 8, max(dat$rooms))
dat$c_rooms <- cut(x = dat$rooms, unique(cuts), include.lowest = TRUE)
levels(dat$c_rooms) <- c('RO:4]','RO:6]','RO:8]', 'RO:>8')
# création et quantification de bedrooms
dat$bedrooms <- dat$total_bedrooms / dat$households
cuts <- c(min(dat$bedrooms, na.rm = TRUE), 1.1, max(dat$bedrooms, na.rm = TRUE))
dat$c_bedrooms <- cut(x = dat$bedrooms, unique(cuts), include.lowest = TRUE)
levels(dat$c_bedrooms) <- c('BE:1]','BE:>1')
# création et quantification de pop
dat$pop <- dat$population / dat$households
cuts <- c(min(dat$pop), 2, 3, 4, max(dat$pop))
dat$c_pop <- cut(x = dat$pop, unique(cuts), include.lowest = TRUE)
levels(dat$c_pop) <- c('PO:2]','PO:3]', 'PO:4]', 'PO:>4')
# quantification de households
cuts <- c(min(dat$households), 300, 400, 600, max(dat$households))
dat$c_households <- cut(x = dat$households, cuts, include.lowest = TRUE)
levels(dat$c_households) <- c('HH:3]', 'HH:4]', 'HH:6]', 'HH:>6')
# quantification de median_income
cuts <- quantile(dat$median_income, probs = seq(0,1,1/4))
cuts <- c(cuts[1:length(cuts)-1], 15, max(dat$median_income))
dat$c_income <- cut(x = dat$median_income, cuts, include.lowest = TRUE)
levels(dat$c_income) <- c('IC:L', 'IC:ML', 'IC:MH', 'IC:H', 'IC:>15')
# quantification de median_house_value
cuts <- c(min(dat$median_house_value), 115000, 175000, 250000, 500000,
max(dat$median_house_value))
dat$c_house_value <- cut(x = dat$median_house_value, cuts, include.lowest = TRUE)
levels(dat$c_house_value) <- c('HV:A', 'HV:B', 'HV:C', 'HV:D', 'HV:E')
# création du jeu de données quantifié
dat.all <- dat
dat <- dat.all[c('ocean_proximity', 'c_longitude', 'c_latitude', 'c_age',
'c_rooms', 'c_bedrooms', 'c_pop', 'c_households', 'c_income',
'c_house_value')]
vent <- list()
# ventilation de la modalité RO:>8 de c_rooms
vent$c_rooms <- ventilate(dat$c_rooms, "RO:>8")
dat$c_rooms[vent$c_rooms$sup_i] <- vent$c_rooms$smpl
# ventilation de la modalité IC:>15 de c_income
vent$c_income <- ventilate(dat$c_income, "IC:>15")
dat$c_income[vent$c_income$sup_i] <- vent$c_income$smpl
# ventilation de la modalité O:ISL de ocean_proximity
vent$ocean_proximity <- ventilate(dat$ocean_proximity, "O:ISL")
dat$ocean_proximity[vent$ocean_proximity$sup_i] <- vent$ocean_proximity$smpl
# ventilation des valeurs manquantes de c_bedrooms
vent$c_bedrooms <- ventilate(dat$c_bedrooms, "NA")
dat$c_bedrooms[vent$c_bedrooms$sup_i] <- vent$c_bedrooms$smpl
# suppression des modalités vides après ventilation
dat <- droplevels(dat)
# positionnement de c_house_value en variable supplémentaire
supind <- which(names(dat) == "c_house_value")
r <- list( vent=vent, dat=dat, supind=supind )
return(r)
}
mca <-
function(dat)
{
lev_n <- unlist(lapply(dat$dat, nlevels))
n <- cumsum(lev_n)
J_t <- sum(lev_n)
Q_t <- dim(dat$dat)[2]
I <- dim(dat$dat)[1]
# Indicator matrix
Z <- matrix(0, nrow = I, ncol = J_t)
numdat <- lapply(dat$dat, as.numeric)
offset <- c(0, n[-length(n)])
for (i in 1:Q_t)
Z[1:I + (I * (offset[i] + numdat[[i]] - 1))] <- 1
dimnames(Z)[[1]] <- as.character(1:I)
dimnames(Z)[[2]] <- as.character(unlist(lapply(dat$dat, levels)))
Z_sup_min <- n[dat$supind[1] - 1] + 1
Z_sup_max <- n[dat$supind[length(dat$supind)]]
Z_sup_ind <- Z_sup_min : Z_sup_max
Z_act <- Z[,-Z_sup_ind]
J <- dim(Z_act)[2]
Q <- dim(dat$dat)[2] - length(dat$supind)
# Burt matrix
B <- t(Z_act) %*% Z_act
# Decomposition of inertia
P <- B / sum(B)
r <- apply(P, 2, sum)
rr <- r %*% t(r)
S <- (P - rr) / sqrt(rr)
dec <- eigen(S)
# Last Q eigen value must be zero
K <- J - Q
sv <- dec$values[1 : K]
# standard coordinates
a <- sweep(dec$vectors, 1, sqrt(r), FUN = "/")
a <- a[,(1 : K)]
# principal coordinates
f <- a %*% diag(sv)
fac_names <- paste("F", paste(1 : K), sep = "")
rownames(a) <- colnames(Z_act)
colnames(a) <- fac_names
rownames(f) <- colnames(Z_act)
colnames(f) <- fac_names
# contributions
temp <- sweep(f^2, 1, r, FUN = "*")
sum_ctr <- apply(temp, 2, sum)
ctr <- sweep(temp, 2, sum_ctr, FUN = "/")
# correlations
temp <- f^2
sum_cor <- apply(temp, 1, sum)
cor <- sweep(temp, 1, sum_cor, FUN="/")
# profiles of "supplementary" individuals
si <- sweep(Z_act, 1, apply(Z_act, 1, sum), FUN = "/")
fsi <- si %*% a
temp <- fsi^2
sum_cor <- apply(temp, 1, sum)
sicor <- sweep(temp, 1, sum_cor, FUN="/")
# profiles of "supplementary" variables
Z_sup <- Z[,Z_sup_ind]
sj <- t(Z_sup) %*% Z_act
sj <- sweep(sj, 1, apply(sj, 1, sum), FUN = "/")
fsj <- sj %*% a
temp <- fsj^2
sum_cor <- apply(temp, 1, sum)
sjcor <- sweep(temp, 1, sum_cor, FUN="/")
r <- list(f=f, ctr=ctr, cor=cor, r=r, sv=sv,
fsi=fsi, sicor=sicor, fsj=fsj, sjcor=sjcor)
class(r) <- "mca"
return(r)
}
plot.mca <-
function(o, d1 = NULL, d2 = NULL)
{
if(class(o) != "mca") {
warning("Object is not of class 'mca'")
UseMethod("plot")
return(invisible(NULL))
}
if(is.null(d1)) d1<-1
if(is.null(d2)) d2<-2
# Part de l'inertie du plan factoriel d1-d2 expliquée par chaque profil
rowcon <- o$r * (o$f[,d1]^2 + o$f[,d2]^2) / (o$sv[d1]^2 + o$sv[d2]^2)
rnames <- rownames(o$f)
rnames[rowcon < 0.01] <- "."
rsize <- log(1 + exp(1) * rowcon^0.3)
rsize[rowcon < 0.01] <- 1
plot(o$f[,d1], o$f[,d2], type = "n",
xlab="", ylab="", asp = 1, xaxt = "n", yaxt = "n")
text(o$f[,d1], o$f[,d2], rnames, adj = 0, cex = rsize)
points(0, 0, pch = 3)
}