intro_to_ml/05_d_svd_mca_code.R

36 lines
953 B
R

# `kcuts` cuts variable `x` into `centers` categories with k-means.
kcuts <- function(x, centers)
{
km <- kmeans(x = x, centers = centers)
cuts <- unlist(lapply(order(km$centers), function(clustId) {
min(x[km$cluster == clustId]) }))
cuts <- c(cuts, max(x))
}
# ventilate modality `mod` of categorical var `cat` of dataframe `dat`
# into the remaining modalities according to their relative frequencies.
ventilate <- function(cat, mod)
{
if (mod == "NA") isna <- TRUE else isna <- FALSE
tab <- table(cat)
if (isna)
{
sup_i <- which(is.na(cat))
sup_n <- length(sup_i)
act_mod <- tab
}
else
{
sup_i <- which(cat == mod)
sup_n <- tab[mod]
act_mod <- tab[! names(tab) %in% mod]
}
prob <- act_mod / sum(act_mod)
smpl <- sample(names(prob), size = sup_n, replace = TRUE, prob = prob)
r <- list(sup_mod = mod,
sup_n = sup_n,
sup_i = sup_i,
smpl = smpl)
return(r)
}