2022-04-16 10:08:31 +00:00
# Font /mnt/font/InputMonoNarrow-Regular/20a/font
# rm(list=ls())
# bookdown::render_book()
2022-06-14 16:27:01 +00:00
# :/^\#
2022-12-30 14:47:58 +00:00
# bash make_chapter 19_nystroem_approximation.Rmd
# knitr::purl("05_c_svd_ca.Rmd")
# Extrait de 05_c_svd_ca.Rmd
#
# Affichons encore une carte avec les coordonnées principales sur les dimensions n°1 et n°2, mais uniquement pour les profils lignes et les profils colonnes considérés importants.
#
# ```{r}
# selI <- CTRI > (1/I)
# selI12 <- selI[,1] | selI[,2]
# selJ <- CTRJ > (1/J)
# selJ12 <- selJ[,1] | selJ[,2]
# par(pty="s") # square plotting region
# plot(c(F[selI12,1], G[selJ12,1]), c(F[selI12,2], G[selJ12,2]),
# main = "x: d1, y: d2", type = "n",
# xlab="", ylab="", asp = 1, xaxt = "n", yaxt = "n")
# text(c(F[selI12,1], G[selJ12,1]), c(F[selI12,2], G[selJ12,2]),
# c(rownames(P)[selI12], colnames(P)[selJ12]),
# adj = 0, cex = 0.6)
# points(0, 0, pch = 3)
2023-01-02 20:48:45 +00:00
# ```
# Extrait de 05_d_svd_mca.Rmd
## Synthèse des transformations opérées sur le jeu de données
Nous rappelons ci - dessous l ' e n s e m b l e d e s t r a n s f o r m a t i o n s o p é r é e s s u r l e j e u d e d o n n é e s .
`` ` { r , eval = FALSE }
source ( ' 05_d_svd_mca_code.R' )
# chargement du jeu de données
dat <- read.csv ( file = " data/housing.csv" , header = TRUE )
# transformation de ocean_proximity en facteur
dat $ ocean_proximity <- as.factor ( dat $ ocean_proximity )
# quantification de longitude
cuts <- kcuts ( x = dat $ longitude , centers = 4 )
dat $ c_longitude <- cut ( x = dat $ longitude , unique ( cuts ) , include.lowest = TRUE )
levels ( dat $ c_longitude ) <- c ( ' LO-W' , ' LO-MW' , ' LO-ME' , ' LO-E' )
# quantification de latitude
cuts <- kcuts ( x = dat $ latitude , centers = 4 )
dat $ c_latitude <- cut ( x = dat $ latitude , unique ( cuts ) , include.lowest = TRUE )
levels ( dat $ c_latitude ) <- c ( ' LA-S' , ' LA-MS' , ' LA-MN' , ' LA-N' )
# quantification de housing_median_age
cuts <- c ( min ( dat $ housing_median_age ) , 15 , 25 , 35 , 51 , 52 )
dat $ c_age <- cut ( x = dat $ housing_median_age , unique ( cuts ) , include.lowest = TRUE )
levels ( dat $ c_age ) <- c ( ' A<=15' , ' A(15,25]' , ' A(25,35]' , ' A(35,51]' , ' A=52' )
# création et quantification de rooms
dat $ rooms <- dat $ total_rooms / dat $ households
cuts <- c ( min ( dat $ rooms ) , 4 , 6 , 8 , max ( dat $ rooms ) )
dat $ c_rooms <- cut ( x = dat $ rooms , unique ( cuts ) , include.lowest = TRUE )
levels ( dat $ c_rooms ) <- c ( ' R<=4' , ' R(4,6]' , ' R(6,8]' , ' R>8' )
# création et quantification de bedrooms
dat $ bedrooms <- dat $ total_bedrooms / dat $ households
cuts <- c ( min ( dat $ bedrooms , na.rm = TRUE ) , 1.1 , max ( dat $ bedrooms , na.rm = TRUE ) )
dat $ c_bedrooms <- cut ( x = dat $ bedrooms , unique ( cuts ) , include.lowest = TRUE )
levels ( dat $ c_bedrooms ) <- c ( ' B<=1' , ' B>1' )
# création et quantification de pop
dat $ pop <- dat $ population / dat $ households
cuts <- c ( min ( dat $ pop ) , 2 , 3 , 4 , max ( dat $ pop ) )
dat $ c_pop <- cut ( x = dat $ pop , unique ( cuts ) , include.lowest = TRUE )
levels ( dat $ c_pop ) <- c ( ' P<=2' , ' P(2,3]' , ' P(3,4]' , ' P>4' )
# quantification de households
cuts <- c ( min ( dat $ households ) , 300 , 400 , 600 , max ( dat $ households ) )
dat $ c_households <- cut ( x = dat $ households , cuts , include.lowest = TRUE )
levels ( dat $ c_households ) <- c ( ' H<=3' , ' H(3,4]' , ' H(4,6]' , ' H>6' )
# quantification de median_income
cuts <- quantile ( dat $ median_income , probs = seq ( 0 , 1 , 1 / 4 ) )
cuts <- c ( cuts [1 : length ( cuts ) -1 ] , 15 , max ( dat $ median_income ) )
dat $ c_income <- cut ( x = dat $ median_income , cuts , include.lowest = TRUE )
levels ( dat $ c_income ) <- c ( ' IL' , ' IML' , ' IMH' , ' IH' , ' I>15' )
# quantification de median_house_value
cuts <- c ( min ( dat $ median_house_value ) , 115000 , 175000 , 250000 , 500000 ,
max ( dat $ median_house_value ) )
dat $ c_house_value <- cut ( x = dat $ median_house_value , cuts , include.lowest = TRUE )
levels ( dat $ c_house_value ) <- c ( ' V<=115' , ' V(115,175]' , ' V(175,250]' , ' V(250,500]' ,
' V>500' )
# création du jeu de données quantifié
dat.all <- dat
dat <- dat.all [c ( ' ocean_proximity' , ' c_longitude' , ' c_latitude' , ' c_age' ,
' c_rooms' , ' c_bedrooms' , ' c_pop' , ' c_households' , ' c_income' ,
' c_house_value' ) ]
# ventilation de la modalité R>8 de c_rooms
c_rooms_sup <- ventilate ( dat $ c_rooms , " R>8" )
dat $ c_rooms [c_rooms_sup $ sup_i ] <- c_rooms_sup $ smpl
# ventilation de la modalité I>15 de c_income
c_income_sup <- ventilate ( dat $ c_income , " I>15" )
dat $ c_income [c_income_sup $ sup_i ] <- c_income_sup $ smpl
# ventilation de la modalité ISLAND de ocean_proximity
ocean_proximity_sup <- ventilate ( dat $ ocean_proximity , " ISLAND" )
dat $ ocean_proximity [ocean_proximity_sup $ sup_i ] <- ocean_proximity_sup $ smpl
# ventilation des valeurs manquantes de c_bedrooms
c_bedrooms_sup <- ventilate ( dat $ c_bedrooms , " NA" )
dat $ c_bedrooms [c_bedrooms_sup $ sup_i ] <- c_bedrooms_sup $ smpl
# suppression des modalités vides après ventilation
dat <- droplevels ( dat )
# positionnement de c_house_value en variable supplémentaire
sup_ind <- which ( names ( dat ) == " c_house_value" )
dat_act <- dat [ , - sup_ind ]
dat_sup <- dat [ , sup_ind ]
I <- dim ( dat_act ) [1 ]
Q <- dim ( dat_act ) [2 ]
# construction du tableau disjonctif complet
lev_n <- unlist ( lapply ( dat , nlevels ) )
n <- cumsum ( lev_n )
J_t <- sum ( lev_n )
Q_t <- dim ( dat ) [2 ]
Z <- matrix ( 0 , nrow = I , ncol = J_t )
numdat <- lapply ( dat , as.numeric )
offset <- c ( 0 , n [ - length ( n ) ] )
for ( i in 1 : Q_t )
Z [1 : I + ( I * ( offset [i ] + numdat [ [i ] ] - 1 ) ) ] <- 1
cn <- rep ( names ( dat ) , lev_n )
ln <- unlist ( lapply ( dat , levels ) )
dimnames ( Z ) [ [1 ] ] <- as.character ( 1 : I )
dimnames ( Z ) [ [2 ] ] <- paste ( cn , ln , sep = " " )
Z_sup_min <- n [sup_ind [1 ] - 1 ] + 1
Z_sup_max <- n [sup_ind [length ( sup_ind ) ] ]
Z_sup_ind <- Z_sup_min : Z_sup_max
Z_act <- Z [ , - Z_sup_ind ]
J <- dim ( Z_act ) [2 ]
# Construction de la matrice de Burt
B <- t ( Z_act ) %*% Z_act
# Analyse des correspondances
P <- B / sum ( B )
r <- apply ( P , 2 , sum )
rr <- r %*% t ( r )
S <- ( P - rr ) / sqrt ( rr )
dec <- eigen ( S )
# les Q dernières valeurs propres sont nécessairement nulles.
delt <- dec $ values [1 : ( J - Q ) ]
# Calcul des coordonnées standard (a) et principales (f)
K <- J - Q
a <- sweep ( dec $ vectors , 1 , sqrt ( r ) , FUN = " /" )
a <- a [ , ( 1 : K ) ]
f <- a %*% diag ( delt )
# Noms des facteurs et des modalités
lbl_dic <- c (
" O:<1H" = " ocean_proximity<1H OCEAN" ,
" O:INL" = " ocean_proximityINLAND" ,
" O:NB" = " ocean_proximityNEAR BAY" ,
" O:NO" = " ocean_proximityNEAR OCEAN" ,
" LO:W" = " c_longitudeLO-W" ,
" LO:MW" = " c_longitudeLO-MW" ,
" LO:ME" = " c_longitudeLO-ME" ,
" LO:E" = " c_longitudeLO-E" ,
" LA:S" = " c_latitudeLA-S" ,
" LA:MS" = " c_latitudeLA-MS" ,
" LA:MN" = " c_latitudeLA-MN" ,
" LA:N" = " c_latitudeLA-N" ,
" AG:15]" = " c_ageA<=15" ,
" AG:25]" = " c_ageA(15,25]" ,
" AG:35]" = " c_ageA(25,35]" ,
" AG:51]" = " c_ageA(35,51]" ,
" AG:52" = " c_ageA=52" ,
" RO:4]" = " c_roomsR<=4" ,
" RO:6]" = " c_roomsR(4,6]" ,
" RO:8]" = " c_roomsR(6,8]" ,
" BE:1]" = " c_bedroomsB<=1" ,
" BE:>1" = " c_bedroomsB>1" ,
" PO:2]" = " c_popP<=2" ,
" PO:3]" = " c_popP(2,3]" ,
" PO:4]" = " c_popP(3,4]" ,
" PO:>4" = " c_popP>4" ,
" HH:3]" = " c_householdsH<=3" ,
" HH:4]" = " c_householdsH(3,4]" ,
" HH:6]" = " c_householdsH(4,6]" ,
" HH:>6" = " c_householdsH>6" ,
" IC:L" = " c_incomeIL" ,
" IC:ML" = " c_incomeIML" ,
" IC:MH" = " c_incomeIMH" ,
" IC:H" = " c_incomeIH" ,
" HV:A" = " c_house_valueV<=115" ,
" HV:B" = " c_house_valueV(115,175]" ,
" HV:C" = " c_house_valueV(175,250]" ,
" HV:D" = " c_house_valueV(250,500]" ,
" HV:E" = " c_house_valueV>500"
)
lbl_act_dic <- lbl_dic [1 : J ]
fac_names <- paste ( " F" , paste ( 1 : K ) , sep = " " )
rownames ( a ) <- names ( lbl_act_dic )
colnames ( a ) <- fac_names
rownames ( f ) <- names ( lbl_act_dic )
colnames ( f ) <- fac_names
# Calcul des contributions
temp <- sweep ( f ^2 , 1 , r , FUN = " *" )
sum_ctr <- apply ( temp , 2 , sum )
ctr <- sweep ( temp , 2 , sum_ctr , FUN = " /" )
# Calcul des corrélations
temp <- f ^2
sum_cor <- apply ( temp , 1 , sum )
cor <- sweep ( temp , 1 , sum_cor , FUN = " /" )
`` `