factorization of onehot encoding in a function
This commit is contained in:
parent
0355a4a284
commit
919d5884cd
@ -121,8 +121,7 @@ function()
|
||||
levels(dat$c_house_value) <- c('HV:A', 'HV:B', 'HV:C', 'HV:D', 'HV:E')
|
||||
|
||||
# création du jeu de données quantifié
|
||||
dat.all <- dat
|
||||
dat <- dat.all[c('ocean_proximity', 'c_longitude', 'c_latitude', 'c_age',
|
||||
dat <- dat[c('ocean_proximity', 'c_longitude', 'c_latitude', 'c_age',
|
||||
'c_rooms', 'c_bedrooms', 'c_pop', 'c_households', 'c_income',
|
||||
'c_house_value')]
|
||||
vent <- list()
|
||||
@ -146,36 +145,47 @@ function()
|
||||
dat <- droplevels(dat)
|
||||
|
||||
# positionnement de c_house_value en variable supplémentaire
|
||||
supind <- which(names(dat) == "c_house_value")
|
||||
supvar <- which(names(dat) == "c_house_value")
|
||||
|
||||
r <- list( vent=vent, dat=dat, supind=supind )
|
||||
r <- list( vent=vent, dat=dat, supvar=supvar )
|
||||
|
||||
return(r)
|
||||
}
|
||||
|
||||
# one hot encoding (codage disjonctif complet) of a dataframe
|
||||
# made of categorical variables
|
||||
onehot_enc <-
|
||||
function(dat.cat)
|
||||
{
|
||||
if(!all(as.logical(lapply(dat.cat, is.factor)))) {
|
||||
warning("All variables must be factors.")
|
||||
return(invisible(NULL))
|
||||
}
|
||||
lev_n <- unlist(lapply(dat.cat, nlevels))
|
||||
n <- cumsum(lev_n)
|
||||
J_t <- sum(lev_n)
|
||||
Q_t <- dim(dat.cat)[2]
|
||||
I <- dim(dat.cat)[1]
|
||||
Z <- matrix(0, nrow = I, ncol = J_t)
|
||||
numdat <- lapply(dat.cat, as.numeric)
|
||||
offset <- c(0, n[-length(n)])
|
||||
for (i in 1:Q_t) Z[1:I + (I * (offset[i] + numdat[[i]] - 1))] <- 1
|
||||
dimnames(Z)[[1]] <- as.character(1:I)
|
||||
dimnames(Z)[[2]] <- as.character(unlist(lapply(dat.cat, levels)))
|
||||
return(Z)
|
||||
}
|
||||
|
||||
mca <-
|
||||
function(dat, nclst = 100)
|
||||
{
|
||||
lev_n <- unlist(lapply(dat$dat, nlevels))
|
||||
n <- cumsum(lev_n)
|
||||
J_t <- sum(lev_n)
|
||||
Q_t <- dim(dat$dat)[2]
|
||||
I <- dim(dat$dat)[1]
|
||||
|
||||
# Indicator matrix
|
||||
Z <- matrix(0, nrow = I, ncol = J_t)
|
||||
numdat <- lapply(dat$dat, as.numeric)
|
||||
offset <- c(0, n[-length(n)])
|
||||
for (i in 1:Q_t)
|
||||
Z[1:I + (I * (offset[i] + numdat[[i]] - 1))] <- 1
|
||||
dimnames(Z)[[1]] <- as.character(1:I)
|
||||
dimnames(Z)[[2]] <- as.character(unlist(lapply(dat$dat, levels)))
|
||||
Z_sup_min <- n[dat$supind[1] - 1] + 1
|
||||
Z_sup_max <- n[dat$supind[length(dat$supind)]]
|
||||
Z <- onehot_enc(dat$dat)
|
||||
Z_sup_min <- n[dat$supvar[1] - 1] + 1
|
||||
Z_sup_max <- n[dat$supvar[length(dat$supvar)]]
|
||||
Z_sup_ind <- Z_sup_min : Z_sup_max
|
||||
Z_act <- Z[,-Z_sup_ind]
|
||||
J <- dim(Z_act)[2]
|
||||
Q <- dim(dat$dat)[2] - length(dat$supind)
|
||||
Q <- dim(dat$dat)[2] - length(dat$supvar)
|
||||
|
||||
# Burt matrix
|
||||
B <- t(Z_act) %*% Z_act
|
||||
|
Loading…
Reference in New Issue
Block a user