diff --git a/pad.R b/pad.R index 858f206..e609f28 100644 --- a/pad.R +++ b/pad.R @@ -24,4 +24,201 @@ # c(rownames(P)[selI12], colnames(P)[selJ12]), # adj = 0, cex = 0.6) # points(0, 0, pch = 3) -# ``` \ No newline at end of file +# ``` + +# Extrait de 05_d_svd_mca.Rmd +## Synthèse des transformations opérées sur le jeu de données + +Nous rappelons ci-dessous l'ensemble des transformations opérées sur le jeu de données. + +```{r, eval = FALSE} +source('05_d_svd_mca_code.R') + +# chargement du jeu de données +dat <- read.csv(file="data/housing.csv", header=TRUE) + +# transformation de ocean_proximity en facteur +dat$ocean_proximity <- as.factor(dat$ocean_proximity) + +# quantification de longitude +cuts <- kcuts(x = dat$longitude, centers = 4) +dat$c_longitude <- cut(x = dat$longitude, unique(cuts), include.lowest = TRUE) +levels(dat$c_longitude) <- c('LO-W', 'LO-MW', 'LO-ME', 'LO-E') + +# quantification de latitude +cuts <- kcuts(x = dat$latitude, centers = 4) +dat$c_latitude <- cut(x = dat$latitude, unique(cuts), include.lowest = TRUE) +levels(dat$c_latitude) <- c('LA-S','LA-MS','LA-MN','LA-N') + +# quantification de housing_median_age +cuts <- c(min(dat$housing_median_age), 15, 25, 35, 51, 52) +dat$c_age <- cut(x = dat$housing_median_age, unique(cuts), include.lowest = TRUE) +levels(dat$c_age) <- c('A<=15','A(15,25]','A(25,35]','A(35,51]', 'A=52') + +# création et quantification de rooms +dat$rooms <- dat$total_rooms / dat$households +cuts <- c(min(dat$rooms), 4, 6, 8, max(dat$rooms)) +dat$c_rooms <- cut(x = dat$rooms, unique(cuts), include.lowest = TRUE) +levels(dat$c_rooms) <- c('R<=4','R(4,6]','R(6,8]', 'R>8') + +# création et quantification de bedrooms +dat$bedrooms <- dat$total_bedrooms / dat$households +cuts <- c(min(dat$bedrooms, na.rm = TRUE), 1.1, max(dat$bedrooms, na.rm = TRUE)) +dat$c_bedrooms <- cut(x = dat$bedrooms, unique(cuts), include.lowest = TRUE) +levels(dat$c_bedrooms) <- c('B<=1','B>1') + +# création et quantification de pop +dat$pop <- dat$population / dat$households +cuts <- c(min(dat$pop), 2, 3, 4, max(dat$pop)) +dat$c_pop <- cut(x = dat$pop, unique(cuts), include.lowest = TRUE) +levels(dat$c_pop) <- c('P<=2','P(2,3]', 'P(3,4]', 'P>4') + +# quantification de households +cuts <- c(min(dat$households), 300, 400, 600, max(dat$households)) +dat$c_households <- cut(x = dat$households, cuts, include.lowest = TRUE) +levels(dat$c_households) <- c('H<=3', 'H(3,4]', 'H(4,6]', 'H>6') + +# quantification de median_income +cuts <- quantile(dat$median_income, probs = seq(0,1,1/4)) +cuts <- c(cuts[1:length(cuts)-1], 15, max(dat$median_income)) +dat$c_income <- cut(x = dat$median_income, cuts, include.lowest = TRUE) +levels(dat$c_income) <- c('IL', 'IML', 'IMH', 'IH', 'I>15') + +# quantification de median_house_value +cuts <- c(min(dat$median_house_value), 115000, 175000, 250000, 500000, + max(dat$median_house_value)) +dat$c_house_value <- cut(x = dat$median_house_value, cuts, include.lowest = TRUE) +levels(dat$c_house_value) <- c('V<=115', 'V(115,175]', 'V(175,250]', 'V(250,500]', + 'V>500') + +# création du jeu de données quantifié +dat.all <- dat +dat <- dat.all[c('ocean_proximity', 'c_longitude', 'c_latitude', 'c_age', + 'c_rooms', 'c_bedrooms', 'c_pop', 'c_households', 'c_income', + 'c_house_value')] + +# ventilation de la modalité R>8 de c_rooms +c_rooms_sup <- ventilate(dat$c_rooms, "R>8") +dat$c_rooms[c_rooms_sup$sup_i] <- c_rooms_sup$smpl + +# ventilation de la modalité I>15 de c_income +c_income_sup <- ventilate(dat$c_income, "I>15") +dat$c_income[c_income_sup$sup_i] <- c_income_sup$smpl + +# ventilation de la modalité ISLAND de ocean_proximity +ocean_proximity_sup <- ventilate(dat$ocean_proximity, "ISLAND") +dat$ocean_proximity[ocean_proximity_sup$sup_i] <- ocean_proximity_sup$smpl + +# ventilation des valeurs manquantes de c_bedrooms +c_bedrooms_sup <- ventilate(dat$c_bedrooms, "NA") +dat$c_bedrooms[c_bedrooms_sup$sup_i] <- c_bedrooms_sup$smpl + +# suppression des modalités vides après ventilation +dat <- droplevels(dat) + +# positionnement de c_house_value en variable supplémentaire +sup_ind <- which(names(dat) == "c_house_value") +dat_act <- dat[,-sup_ind] +dat_sup <- dat[,sup_ind] +I <- dim(dat_act)[1] +Q <- dim(dat_act)[2] + +# construction du tableau disjonctif complet +lev_n <- unlist(lapply(dat, nlevels)) +n <- cumsum(lev_n) +J_t <- sum(lev_n) +Q_t <- dim(dat)[2] +Z <- matrix(0, nrow = I, ncol = J_t) +numdat <- lapply(dat, as.numeric) +offset <- c(0, n[-length(n)]) +for (i in 1:Q_t) + Z[1:I + (I * (offset[i] + numdat[[i]] - 1))] <- 1 +cn <- rep(names(dat), lev_n) +ln <- unlist(lapply(dat, levels)) +dimnames(Z)[[1]] <- as.character(1:I) +dimnames(Z)[[2]] <- paste(cn, ln, sep = "") + +Z_sup_min <- n[sup_ind[1] - 1] + 1 +Z_sup_max <- n[sup_ind[length(sup_ind)]] +Z_sup_ind <- Z_sup_min : Z_sup_max +Z_act <- Z[,-Z_sup_ind] +J <- dim(Z_act)[2] + +# Construction de la matrice de Burt +B <- t(Z_act) %*% Z_act + +# Analyse des correspondances +P <- B / sum(B) +r <- apply(P, 2, sum) +rr <- r %*% t(r) +S <- (P - rr) / sqrt(rr) +dec <- eigen(S) +# les Q dernières valeurs propres sont nécessairement nulles. +delt <- dec$values[1 : (J-Q)] + +# Calcul des coordonnées standard (a) et principales (f) +K <- J - Q +a <- sweep(dec$vectors, 1, sqrt(r), FUN = "/") +a <- a[,(1 : K)] +f <- a %*% diag(delt) + +# Noms des facteurs et des modalités +lbl_dic <- c( + "O:<1H" = "ocean_proximity<1H OCEAN", + "O:INL" = "ocean_proximityINLAND", + "O:NB" = "ocean_proximityNEAR BAY", + "O:NO" = "ocean_proximityNEAR OCEAN", + "LO:W" = "c_longitudeLO-W", + "LO:MW" = "c_longitudeLO-MW", + "LO:ME" = "c_longitudeLO-ME", + "LO:E" = "c_longitudeLO-E", + "LA:S" = "c_latitudeLA-S", + "LA:MS" = "c_latitudeLA-MS", + "LA:MN" = "c_latitudeLA-MN", + "LA:N" = "c_latitudeLA-N", + "AG:15]" = "c_ageA<=15", + "AG:25]" = "c_ageA(15,25]", + "AG:35]" = "c_ageA(25,35]", + "AG:51]" = "c_ageA(35,51]", + "AG:52" = "c_ageA=52", + "RO:4]" = "c_roomsR<=4", + "RO:6]" = "c_roomsR(4,6]", + "RO:8]" = "c_roomsR(6,8]", + "BE:1]" = "c_bedroomsB<=1", + "BE:>1" = "c_bedroomsB>1", + "PO:2]" = "c_popP<=2", + "PO:3]" = "c_popP(2,3]", + "PO:4]" = "c_popP(3,4]", + "PO:>4" = "c_popP>4", + "HH:3]" = "c_householdsH<=3", + "HH:4]" = "c_householdsH(3,4]", + "HH:6]" = "c_householdsH(4,6]", + "HH:>6" = "c_householdsH>6", + "IC:L" = "c_incomeIL", + "IC:ML" = "c_incomeIML", + "IC:MH" = "c_incomeIMH", + "IC:H" = "c_incomeIH", + "HV:A" = "c_house_valueV<=115", + "HV:B" = "c_house_valueV(115,175]", + "HV:C" = "c_house_valueV(175,250]", + "HV:D" = "c_house_valueV(250,500]", + "HV:E" = "c_house_valueV>500" +) + +lbl_act_dic <- lbl_dic[1:J] +fac_names <- paste("F", paste(1 : K), sep = "") +rownames(a) <- names(lbl_act_dic) +colnames(a) <- fac_names +rownames(f) <- names(lbl_act_dic) +colnames(f) <- fac_names + +# Calcul des contributions +temp <- sweep(f^2, 1, r, FUN = "*") +sum_ctr <- apply(temp, 2, sum) +ctr <- sweep(temp, 2, sum_ctr, FUN = "/") + +# Calcul des corrélations +temp <- f^2 +sum_cor <- apply(temp, 1, sum) +cor <- sweep(temp, 1, sum_cor, FUN="/") +``` \ No newline at end of file