406 lines
13 KiB
R
406 lines
13 KiB
R
## -----------------------------------------------------------------------------
|
|
set.seed(1123)
|
|
source('05_d_svd_mca_code.R')
|
|
|
|
|
|
## ----05-d-mat-Z---------------------------------------------------------------
|
|
Z <- matrix( c(0,1,1,0,0,0,1,1,
|
|
1,0,0,1,1,1,0,0,
|
|
0,1,0,0,0,0,0,0,
|
|
1,0,1,1,0,0,1,0,
|
|
0,0,0,0,1,1,0,1,
|
|
0,0,1,0,0,0,1,0,
|
|
1,1,0,1,1,1,0,1),
|
|
nrow = 8, ncol = 7,
|
|
dimnames = list(
|
|
c("i1", "i2", "i3", "i4",
|
|
"i5", "i6", "i7", "i8"),
|
|
c("j1-1", "j1-2", "j2-1", "j2-2", "j2-3",
|
|
"j3-1", "j3-2")))
|
|
kbl(Z, caption = "Exemple jouet d'un tableau disjonctif complet",
|
|
booktabs = TRUE) %>%
|
|
kable_styling(latex_options = "striped")
|
|
|
|
|
|
## ----05-d-mat-C---------------------------------------------------------------
|
|
C = t(Z) %*% Z
|
|
kbl(C, caption = "Matrice de Burt pour l'exemple jouet",
|
|
booktabs = TRUE) %>%
|
|
kable_styling(latex_options = "striped")
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
dat <- read.csv(file="data/housing.csv", header=TRUE)
|
|
str(dat)
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
dat$ocean_proximity <- as.factor(dat$ocean_proximity)
|
|
summary(dat)
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
hist(dat$longitude)
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
cuts <- quantile(dat$longitude, probs = seq(0,1,1/4))
|
|
hist(dat$longitude)
|
|
abline(v=cuts, lwd=4)
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
cuts <- kcuts(x = dat$longitude, centers = 4)
|
|
cuts
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
hist(dat$longitude)
|
|
abline(v=cuts, lwd=4)
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
dat$c_longitude <- cut(x = dat$longitude, unique(cuts), include.lowest = TRUE)
|
|
levels(dat$c_longitude) <- c('LO-W', 'LO-MW', 'LO-ME', 'LO-E')
|
|
summary(dat$c_longitude)
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
cuts <- kcuts(x = dat$latitude, centers = 4)
|
|
cuts
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
hist(dat$latitude)
|
|
abline(v=cuts, lwd=3)
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
dat$c_latitude <- cut(x = dat$latitude, unique(cuts), include.lowest = TRUE)
|
|
levels(dat$c_latitude) <- c('LA-S','LA-MS','LA-MN','LA-N')
|
|
summary(dat$c_latitude)
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
hist(dat$housing_median_age)
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
table(dat$housing_median_age[dat$housing_median_age>45])
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
nb_age_52 <- length(dat$housing_median_age[dat$housing_median_age == 52])
|
|
pc_age_52 <- round(100 * (nb_age_52 / dim(dat)[1]))
|
|
hist(dat$housing_median_age, breaks = 40)
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
quantile(dat$housing_median_age[dat$housing_median_age<52])
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
cuts <- c(min(dat$housing_median_age), 15, 25, 35, 51, 52)
|
|
hist(dat$housing_median_age, breaks = 40)
|
|
abline(v=cuts, lwd=3)
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
dat$c_age <- cut(x = dat$housing_median_age, unique(cuts), include.lowest = TRUE)
|
|
levels(dat$c_age) <- c('A<=15','A(15,25]','A(25,35]','A(35,51]', 'A=52')
|
|
summary(dat$c_age)
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
cuts <- quantile(dat$total_rooms)
|
|
hist(dat$total_rooms)
|
|
abline(v=cuts, lwd=3)
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
dat$rooms <- dat$total_rooms / dat$households
|
|
nb_rooms_gt_8 <- length(dat$rooms[dat$rooms>8])
|
|
pc_rooms_gt_8 <- round(100 * (nb_rooms_gt_8 / dim(dat)[1]))
|
|
quantile(dat$rooms)
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
cuts <- c(min(dat$rooms), 4, 6, 8, max(dat$rooms))
|
|
hist(log10(dat$rooms))
|
|
abline(v=log10(cuts), lwd=3)
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
dat$c_rooms <- cut(x = dat$rooms, unique(cuts), include.lowest = TRUE)
|
|
levels(dat$c_rooms) <- c('R<=4','R(4,6]','R(6,8]', 'R>8')
|
|
summary(dat$c_rooms)
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
dat$bedrooms <- dat$total_bedrooms / dat$households
|
|
quantile(dat$bedrooms, na.rm = TRUE)
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
cuts <- c(min(dat$bedrooms, na.rm = TRUE), 1.1, max(dat$bedrooms, na.rm = TRUE))
|
|
hist(log10(dat$bedrooms))
|
|
abline(v=log10(cuts), lwd=3)
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
dat$c_bedrooms <- cut(x = dat$bedrooms, unique(cuts), include.lowest = TRUE)
|
|
levels(dat$c_bedrooms) <- c('B<=1','B>1')
|
|
summary(dat$c_bedrooms)
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
dat$pop <- dat$population / dat$households
|
|
quantile(dat$pop, probs = seq(0,1,1/4))
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
cuts <- c(min(dat$pop), 2, 3, 4, max(dat$pop))
|
|
hist(log10(dat$pop))
|
|
abline(v=log10(cuts), lwd=3)
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
dat$c_pop <- cut(x = dat$pop, unique(cuts), include.lowest = TRUE)
|
|
levels(dat$c_pop) <- c('P<=2','P(2,3]', 'P(3,4]', 'P>4')
|
|
summary(dat$c_pop)
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
quantile(dat$households, probs = seq(0,1,1/4))
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
cuts <- c(min(dat$households), 300, 400, 600, max(dat$households))
|
|
hist(log10(dat$households))
|
|
abline(v=log10(cuts), lwd=3)
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
dat$c_households <- cut(x = dat$households, cuts, include.lowest = TRUE)
|
|
levels(dat$c_households) <- c('H<=3', 'H(3,4]', 'H(4,6]', 'H>6')
|
|
summary(dat$c_households)
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
cuts <- quantile(dat$median_income, probs = seq(0,1,1/4))
|
|
hist(dat$median_income)
|
|
abline(v=cuts, lwd=3)
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
table(dat$median_income[dat$median_income>14])
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
nb_income_gt_15 <- length(dat$median_income[dat$median_income > 15])
|
|
pc_income_gt_15 <- round(100 * (nb_income_gt_15 / dim(dat)[1]), digits = 2)
|
|
hist(dat$median_income, breaks = 40)
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
cuts <- c(cuts[1:length(cuts)-1], 15, max(dat$median_income))
|
|
dat$c_income <- cut(x = dat$median_income, cuts, include.lowest = TRUE)
|
|
levels(dat$c_income) <- c('IL', 'IML', 'IMH', 'IH', 'I>15')
|
|
summary(dat$c_income)
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
hist(dat$median_house_value, breaks = 30)
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
loc_mhv_gt_50k <- dat$median_house_value > 500000
|
|
nb_house_value_gt_50k <- length(dat$median_house_value[loc_mhv_gt_50k])
|
|
pc_house_value_gt_50k <- round(100 * (nb_house_value_gt_50k / dim(dat)[1]),
|
|
digits = 2)
|
|
table(dat$median_house_value[dat$median_house_value>499000])
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
quantile(dat$median_house_value[dat$median_house_value < 500000])
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
cuts <- c(min(dat$median_house_value), 115000, 175000, 250000, 500000,
|
|
max(dat$median_house_value))
|
|
dat$c_house_value <- cut(x = dat$median_house_value, cuts, include.lowest = TRUE)
|
|
levels(dat$c_house_value) <- c('V<=115', 'V(115,175]', 'V(175,250]', 'V(250,500]',
|
|
'V>500')
|
|
summary(dat$c_house_value)
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
hist(dat$median_house_value)
|
|
abline(v=cuts, lwd=3)
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
dat.all <- dat
|
|
dat <- dat.all[c('ocean_proximity', 'c_longitude', 'c_latitude', 'c_age',
|
|
'c_rooms', 'c_bedrooms', 'c_pop', 'c_households', 'c_income',
|
|
'c_house_value')]
|
|
summary(dat)
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
c_rooms_sup <- ventilate(dat$c_rooms, "R>8")
|
|
dat$c_rooms[c_rooms_sup$sup_i] <- c_rooms_sup$smpl
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
c_income_sup <- ventilate(dat$c_income, "I>15")
|
|
dat$c_income[c_income_sup$sup_i] <- c_income_sup$smpl
|
|
|
|
ocean_proximity_sup <- ventilate(dat$ocean_proximity, "ISLAND")
|
|
dat$ocean_proximity[ocean_proximity_sup$sup_i] <- ocean_proximity_sup$smpl
|
|
|
|
c_bedrooms_sup <- ventilate(dat$c_bedrooms, "NA")
|
|
dat$c_bedrooms[c_bedrooms_sup$sup_i] <- c_bedrooms_sup$smpl
|
|
|
|
dat <- droplevels(dat)
|
|
summary(dat)
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
# les catégories supplémentaires doivent être en dernières positions
|
|
# du tableau de données
|
|
sup_ind <- which(names(dat) == "c_house_value")
|
|
dat_act <- dat[,-sup_ind]
|
|
dat_sup <- dat[,sup_ind]
|
|
I <- dim(dat_act)[1]
|
|
Q <- dim(dat_act)[2]
|
|
dat[c(1:5, I),]
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
lev_n <- unlist(lapply(dat, nlevels))
|
|
n <- cumsum(lev_n)
|
|
J_t <- sum(lev_n)
|
|
Q_t <- dim(dat)[2]
|
|
Z <- matrix(0, nrow = I, ncol = J_t)
|
|
numdat <- lapply(dat, as.numeric)
|
|
offset <- c(0, n[-length(n)])
|
|
for (i in 1:Q_t)
|
|
Z[1:I + (I * (offset[i] + numdat[[i]] - 1))] <- 1
|
|
cn <- rep(names(dat), lev_n)
|
|
ln <- unlist(lapply(dat, levels))
|
|
dimnames(Z)[[1]] <- as.character(1:I)
|
|
dimnames(Z)[[2]] <- paste(cn, ln, sep = "")
|
|
|
|
Z_sup_min <- n[sup_ind[1] - 1] + 1
|
|
Z_sup_max <- n[sup_ind[length(sup_ind)]]
|
|
Z_sup_ind <- Z_sup_min : Z_sup_max
|
|
Z_act <- Z[,-Z_sup_ind]
|
|
J <- dim(Z_act)[2]
|
|
|
|
# A titre d'illustration, quelques lignes et colonnes de Z_act
|
|
Z_act[c(1:5, I), c(1,2,J)]
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
B <- t(Z_act) %*% Z_act
|
|
B[1:5, 1:5]
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
P <- B / sum(B)
|
|
r <- apply(P, 2, sum)
|
|
rr <- r %*% t(r)
|
|
S <- (P - rr) / sqrt(rr)
|
|
dec <- eigen(S)
|
|
# les Q dernières valeurs propres sont nécessairement nulles.
|
|
delt <- dec$values[1 : (J-Q)]
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
K <- J - Q
|
|
a <- sweep(dec$vectors, 1, sqrt(r), FUN = "/")
|
|
a <- a[,(1 : K)]
|
|
f <- a %*% diag(delt)
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
lbl_dic <- c(
|
|
"O:<1H" = "ocean_proximity<1H OCEAN",
|
|
"O:INL" = "ocean_proximityINLAND",
|
|
"O:NB" = "ocean_proximityNEAR BAY",
|
|
"O:NO" = "ocean_proximityNEAR OCEAN",
|
|
"LO:W" = "c_longitudeLO-W",
|
|
"LO:MW" = "c_longitudeLO-MW",
|
|
"LO:ME" = "c_longitudeLO-ME",
|
|
"LO:E" = "c_longitudeLO-E",
|
|
"LA:S" = "c_latitudeLA-S",
|
|
"LA:MS" = "c_latitudeLA-MS",
|
|
"LA:MN" = "c_latitudeLA-MN",
|
|
"LA:N" = "c_latitudeLA-N",
|
|
"AG:15]" = "c_ageA<=15",
|
|
"AG:25]" = "c_ageA(15,25]",
|
|
"AG:35]" = "c_ageA(25,35]",
|
|
"AG:51]" = "c_ageA(35,51]",
|
|
"AG:52" = "c_ageA=52",
|
|
"RO:4]" = "c_roomsR<=4",
|
|
"RO:6]" = "c_roomsR(4,6]",
|
|
"RO:8]" = "c_roomsR(6,8]",
|
|
"BE:1]" = "c_bedroomsB<=1",
|
|
"BE:>1" = "c_bedroomsB>1",
|
|
"PO:2]" = "c_popP<=2",
|
|
"PO:3]" = "c_popP(2,3]",
|
|
"PO:4]" = "c_popP(3,4]",
|
|
"PO:>4" = "c_popP>4",
|
|
"HH:3]" = "c_householdsH<=3",
|
|
"HH:4]" = "c_householdsH(3,4]",
|
|
"HH:6]" = "c_householdsH(4,6]",
|
|
"HH:>6" = "c_householdsH>6",
|
|
"IC:L" = "c_incomeIL",
|
|
"IC:ML" = "c_incomeIML",
|
|
"IC:MH" = "c_incomeIMH",
|
|
"IC:H" = "c_incomeIH",
|
|
"HV:A" = "c_house_valueV<=115",
|
|
"HV:B" = "c_house_valueV(115,175]",
|
|
"HV:C" = "c_house_valueV(175,250]",
|
|
"HV:D" = "c_house_valueV(250,500]",
|
|
"HV:E" = "c_house_valueV>500"
|
|
)
|
|
|
|
lbl_act_dic <- lbl_dic[1:J]
|
|
fac_names <- paste("F", paste(1 : K), sep = "")
|
|
rownames(a) <- names(lbl_act_dic)
|
|
colnames(a) <- fac_names
|
|
rownames(f) <- names(lbl_act_dic)
|
|
colnames(f) <- fac_names
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
temp <- sweep(f^2, 1, r, FUN = "*")
|
|
sum_ctr <- apply(temp, 2, sum)
|
|
ctr <- sweep(temp, 2, sum_ctr, FUN = "/")
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
temp <- f^2
|
|
sum_cor <- apply(temp, 1, sum)
|
|
cor <- sweep(temp, 1, sum_cor, FUN="/")
|
|
|
|
|
|
## -----------------------------------------------------------------------------
|
|
# Parts de l'inertie du plan factoriel 1-2 expliquée par chaque profil
|
|
rowcon <- r * (f[,1]^2 + f[,2]^2) / sum(delt[1:2]^2)
|
|
rnames <- rownames(f)
|
|
rnames[rowcon < 0.01] <- "."
|
|
rsize <- log(1 + exp(1) * rowcon^0.3)
|
|
rsize[rowcon < 0.01] <- 1
|
|
|
|
|
|
## ----05-d-map-1-2, fig.width = 6, fig.cap = "Carte selon les facteurs 1 (x) et 2 (y)"----
|
|
plot(f[,1], f[,2], type = "n",
|
|
xlab="", ylab="", asp = 1, xaxt = "n", yaxt = "n")
|
|
text(f[,1], f[,2], rnames,
|
|
adj = 0, cex = rsize)
|
|
points(0, 0, pch = 3)
|
|
|