factorization of onehot encoding in a function

This commit is contained in:
Pierre-Edouard Portier 2023-01-18 16:32:24 +01:00
parent 0355a4a284
commit 919d5884cd
1 changed files with 30 additions and 20 deletions

View File

@ -121,8 +121,7 @@ function()
levels(dat$c_house_value) <- c('HV:A', 'HV:B', 'HV:C', 'HV:D', 'HV:E') levels(dat$c_house_value) <- c('HV:A', 'HV:B', 'HV:C', 'HV:D', 'HV:E')
# création du jeu de données quantifié # création du jeu de données quantifié
dat.all <- dat dat <- dat[c('ocean_proximity', 'c_longitude', 'c_latitude', 'c_age',
dat <- dat.all[c('ocean_proximity', 'c_longitude', 'c_latitude', 'c_age',
'c_rooms', 'c_bedrooms', 'c_pop', 'c_households', 'c_income', 'c_rooms', 'c_bedrooms', 'c_pop', 'c_households', 'c_income',
'c_house_value')] 'c_house_value')]
vent <- list() vent <- list()
@ -146,36 +145,47 @@ function()
dat <- droplevels(dat) dat <- droplevels(dat)
# positionnement de c_house_value en variable supplémentaire # positionnement de c_house_value en variable supplémentaire
supind <- which(names(dat) == "c_house_value") supvar <- which(names(dat) == "c_house_value")
r <- list( vent=vent, dat=dat, supind=supind ) r <- list( vent=vent, dat=dat, supvar=supvar )
return(r) return(r)
} }
# one hot encoding (codage disjonctif complet) of a dataframe
# made of categorical variables
onehot_enc <-
function(dat.cat)
{
if(!all(as.logical(lapply(dat.cat, is.factor)))) {
warning("All variables must be factors.")
return(invisible(NULL))
}
lev_n <- unlist(lapply(dat.cat, nlevels))
n <- cumsum(lev_n)
J_t <- sum(lev_n)
Q_t <- dim(dat.cat)[2]
I <- dim(dat.cat)[1]
Z <- matrix(0, nrow = I, ncol = J_t)
numdat <- lapply(dat.cat, as.numeric)
offset <- c(0, n[-length(n)])
for (i in 1:Q_t) Z[1:I + (I * (offset[i] + numdat[[i]] - 1))] <- 1
dimnames(Z)[[1]] <- as.character(1:I)
dimnames(Z)[[2]] <- as.character(unlist(lapply(dat.cat, levels)))
return(Z)
}
mca <- mca <-
function(dat, nclst = 100) function(dat, nclst = 100)
{ {
lev_n <- unlist(lapply(dat$dat, nlevels))
n <- cumsum(lev_n)
J_t <- sum(lev_n)
Q_t <- dim(dat$dat)[2]
I <- dim(dat$dat)[1]
# Indicator matrix # Indicator matrix
Z <- matrix(0, nrow = I, ncol = J_t) Z <- onehot_enc(dat$dat)
numdat <- lapply(dat$dat, as.numeric) Z_sup_min <- n[dat$supvar[1] - 1] + 1
offset <- c(0, n[-length(n)]) Z_sup_max <- n[dat$supvar[length(dat$supvar)]]
for (i in 1:Q_t)
Z[1:I + (I * (offset[i] + numdat[[i]] - 1))] <- 1
dimnames(Z)[[1]] <- as.character(1:I)
dimnames(Z)[[2]] <- as.character(unlist(lapply(dat$dat, levels)))
Z_sup_min <- n[dat$supind[1] - 1] + 1
Z_sup_max <- n[dat$supind[length(dat$supind)]]
Z_sup_ind <- Z_sup_min : Z_sup_max Z_sup_ind <- Z_sup_min : Z_sup_max
Z_act <- Z[,-Z_sup_ind] Z_act <- Z[,-Z_sup_ind]
J <- dim(Z_act)[2] J <- dim(Z_act)[2]
Q <- dim(dat$dat)[2] - length(dat$supind) Q <- dim(dat$dat)[2] - length(dat$supvar)
# Burt matrix # Burt matrix
B <- t(Z_act) %*% Z_act B <- t(Z_act) %*% Z_act