From 919d5884cd0c0061f733a5070a6677a198ed4732 Mon Sep 17 00:00:00 2001 From: Pierre-Edouard Portier Date: Wed, 18 Jan 2023 16:32:24 +0100 Subject: [PATCH] factorization of onehot encoding in a function --- 05_d_svd_mca_code.R | 50 +++++++++++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/05_d_svd_mca_code.R b/05_d_svd_mca_code.R index fc09674..51809c1 100644 --- a/05_d_svd_mca_code.R +++ b/05_d_svd_mca_code.R @@ -121,8 +121,7 @@ function() levels(dat$c_house_value) <- c('HV:A', 'HV:B', 'HV:C', 'HV:D', 'HV:E') # création du jeu de données quantifié - dat.all <- dat - dat <- dat.all[c('ocean_proximity', 'c_longitude', 'c_latitude', 'c_age', + dat <- dat[c('ocean_proximity', 'c_longitude', 'c_latitude', 'c_age', 'c_rooms', 'c_bedrooms', 'c_pop', 'c_households', 'c_income', 'c_house_value')] vent <- list() @@ -146,36 +145,47 @@ function() dat <- droplevels(dat) # positionnement de c_house_value en variable supplémentaire - supind <- which(names(dat) == "c_house_value") + supvar <- which(names(dat) == "c_house_value") - r <- list( vent=vent, dat=dat, supind=supind ) + r <- list( vent=vent, dat=dat, supvar=supvar ) return(r) } +# one hot encoding (codage disjonctif complet) of a dataframe +# made of categorical variables +onehot_enc <- +function(dat.cat) +{ + if(!all(as.logical(lapply(dat.cat, is.factor)))) { + warning("All variables must be factors.") + return(invisible(NULL)) + } + lev_n <- unlist(lapply(dat.cat, nlevels)) + n <- cumsum(lev_n) + J_t <- sum(lev_n) + Q_t <- dim(dat.cat)[2] + I <- dim(dat.cat)[1] + Z <- matrix(0, nrow = I, ncol = J_t) + numdat <- lapply(dat.cat, as.numeric) + offset <- c(0, n[-length(n)]) + for (i in 1:Q_t) Z[1:I + (I * (offset[i] + numdat[[i]] - 1))] <- 1 + dimnames(Z)[[1]] <- as.character(1:I) + dimnames(Z)[[2]] <- as.character(unlist(lapply(dat.cat, levels))) + return(Z) +} + mca <- function(dat, nclst = 100) { - lev_n <- unlist(lapply(dat$dat, nlevels)) - n <- cumsum(lev_n) - J_t <- sum(lev_n) - Q_t <- dim(dat$dat)[2] - I <- dim(dat$dat)[1] - # Indicator matrix - Z <- matrix(0, nrow = I, ncol = J_t) - numdat <- lapply(dat$dat, as.numeric) - offset <- c(0, n[-length(n)]) - for (i in 1:Q_t) - Z[1:I + (I * (offset[i] + numdat[[i]] - 1))] <- 1 - dimnames(Z)[[1]] <- as.character(1:I) - dimnames(Z)[[2]] <- as.character(unlist(lapply(dat$dat, levels))) - Z_sup_min <- n[dat$supind[1] - 1] + 1 - Z_sup_max <- n[dat$supind[length(dat$supind)]] + Z <- onehot_enc(dat$dat) + Z_sup_min <- n[dat$supvar[1] - 1] + 1 + Z_sup_max <- n[dat$supvar[length(dat$supvar)]] Z_sup_ind <- Z_sup_min : Z_sup_max Z_act <- Z[,-Z_sup_ind] J <- dim(Z_act)[2] - Q <- dim(dat$dat)[2] - length(dat$supind) + Q <- dim(dat$dat)[2] - length(dat$supvar) # Burt matrix B <- t(Z_act) %*% Z_act