intro_to_ml/05_c_svd_ca_code.R

176 lines
5.2 KiB
R

# https://www.r-bloggers.com/2016/07/round-values-while-preserve-their-rounded-sum-in-r/
# E.G.
# > sum(c(0.333, 0.333, 0.334))
# [1] 1
# > sum(round(c(0.333, 0.333, 0.334), 2))
# [1] 0.99
# > sum(round_preserve_sum(c(0.333, 0.333, 0.334), 2))
# [1] 1.00
round_preserve_sum <- function(x, digits = 0) {
up <- 10 ^ digits
x <- x * up
y <- floor(x)
indices <- tail(order(x-y), round(sum(x)) - sum(y))
y[indices] <- y[indices] + 1
y / up
}
ca <-
function(N)
{
n <- sum(sum(N))
P <- (1/n) * N
r <- rowSums(P)
c <- colSums(P)
R <- sweep(P, MARGIN = 1, 1/r, '*')
C <- sweep(t(P), MARGIN = 1, 1/c, '*')
scaleR <- 1/sqrt(r)
scaleC <- 1/sqrt(c)
S <- sweep(P - r %*% t(c), 1, scaleR, FUN = "*")
S <- sweep(S, 2, scaleC, FUN = "*")
dec <- svd(S)
Phi <- sweep(dec$u, 1, scaleR, FUN="*")
F <- sweep(Phi, 2, dec$d, FUN="*")
Gam <- sweep(dec$v, 1, scaleC, FUN="*")
G <- sweep(Gam, 2, dec$d, FUN="*")
temp <- sweep(F^2, 1, r, FUN = "*")
CTRI <- sweep(temp, 2, dec$d^2, FUN = "/")
temp <- sweep(G^2, 1, c, FUN = "*")
CTRJ <- sweep(temp, 2, dec$d^2, FUN = "/")
temp <- temp <- F^2
CORI <- sweep(temp, 1, rowSums(temp), FUN = "/")
temp <- temp <- G^2
CORJ <- sweep(temp, 1, rowSums(temp), FUN = "/")
INRI <- rowSums(sweep(F^2, 1, r, FUN="*"))
INRI <- INRI / sum(INRI)
INRJ <- rowSums(sweep(G^2, 1, c, FUN="*"))
INRJ <- INRJ / sum(INRJ)
r <- list(P=P, r=r, c=c, F=F, G=G, Phi=Phi, Gam=Gam, sv=dec$d,
CTRI=CTRI, CTRJ=CTRJ, CORI=CORI, CORJ=CORJ,
INRI=INRI, INRJ=INRJ)
class(r) <- "ca"
return(r)
}
plot.ca <-
function(o, d1 = NULL, d2 = NULL)
{
if(class(o) != "ca") {
warning("Object is not of class 'ca'")
UseMethod("plot")
return(invisible(NULL))
}
if(is.null(d1)) d1<-1
if(is.null(d2)) d2<-2
# Part de l'inertie du plan factoriel d1-d2 expliquée par chaque profil ligne
rowcon <- o$r * (o$F[,d1]^2 + o$F[,d2]^2) / (o$sv[d1]^2 + o$sv[d2]^2)
rnames <- rownames(o$P)
rnames[rowcon < 0.01] <- "."
rsize <- log(1 + exp(1) * rowcon^0.3)
rsize[rowcon < 0.01] <- 1
# Part de l'inertie du plan factoriel d1-d2 expliquée par chaque profil colonne
colcon <- o$c * (o$G[,d1]^2 + o$G[,d2]^2) / (o$sv[d1]^2 + o$sv[d2]^2)
cnames <- colnames(o$P)
cnames[colcon < 0.01] <- "."
csize <- log(1 + exp(1) * colcon^0.3)
csize[colcon < 0.01] <- 1
plot(c(o$F[,d1], o$G[,d1]), c(o$F[,d2], o$G[,d2]), type = "n",
xlab="", ylab="", asp = 1, xaxt = "n", yaxt = "n")
text(c(o$F[,d1], o$G[,d1]), c(o$F[,d2], o$G[,d2]), c(rnames, cnames),
adj = 0, cex = c(rsize, csize))
points(0, 0, pch = 3)
}
tblTotalInr.ca <-
function(o)
{
if(class(o) != "ca") {
warning("Object is not of class 'ca'")
return(invisible(NULL))
}
tblTotalInr <- cumsum(round_preserve_sum(1000 * (o$sv^2 / sum(o$sv^2))))
tblTotalInr <- matrix(tblTotalInr, nrow=1)
colnames(tblTotalInr) <- 1:length(o$sv)
r <- list(tbl=tblTotalInr, title="Proportion de l'inertie cumulée exprimée \
par les axes factoriels successifs (en millièmes)")
return(r)
}
printTbl <-
function(o)
{
kbl(o$tbl, caption = o$title, booktabs = TRUE)
}
printTblWithHeaders <-
function(o)
{
kbl(o$tbl, caption = o$title, booktabs = TRUE) %>%
add_header_above(o$headers) %>%
kable_styling(latex_options = c("striped", "scale_down"))
}
tblRowProfiles.ca <-
function(o, nbFact)
{
if(class(o) != "ca") {
warning("Object is not of class 'ca'")
return(invisible(NULL))
}
tempFactData <- c()
tempFactNames <- c()
subHeaders <- c(" " = 3) # first empty subheader for row names, PDS and INR
for (k in 1:nbFact) {
tempFactData <- c(tempFactData,
round_preserve_sum(1000*o$F[,k]),
round_preserve_sum(1000*o$CORI[,k]),
round_preserve_sum(1000*o$CTRI[,k]))
factIdStr <- paste('F#', k, sep="")
tempFactNames <- c(tempFactNames, factIdStr, 'COR', 'CTR')
subHeaders[[factIdStr]] <- 3
}
tblI <- matrix( c(round_preserve_sum(1000*o$r),
round_preserve_sum(1000*o$INRI),
tempFactData),
ncol = 2 + nbFact * 3)
rownames(tblI) <- rownames(o$P)
colnames(tblI) <- c('PDS', 'INR', tempFactNames)
r <- list(tbl=tblI, title="Synthèse des profils lignes", headers=subHeaders)
}
tblColProfiles.ca <-
function(o, nbFact)
{
if(class(o) != "ca") {
warning("Object is not of class 'ca'")
return(invisible(NULL))
}
tempFactData <- c()
tempFactNames <- c()
subHeaders <- c(" " = 3) # first empty subheader for row names, PDS and INR
for (k in 1:nbFact) {
tempFactData <- c(tempFactData,
round_preserve_sum(1000*o$G[,k]),
round_preserve_sum(1000*o$CORJ[,k]),
round_preserve_sum(1000*o$CTRJ[,k]))
factIdStr <- paste('F#', k, sep="")
tempFactNames <- c(tempFactNames, factIdStr, 'COR', 'CTR')
subHeaders[[factIdStr]] <- 3
}
tblJ <- matrix( c(round_preserve_sum(1000*o$c),
round_preserve_sum(1000*o$INRJ),
tempFactData),
ncol = 2 + nbFact * 3)
rownames(tblJ) <- colnames(o$P)
colnames(tblJ) <- c('PDS', 'INR', tempFactNames)
r <- list(tbl=tblJ, title="Synthèse des profils colonnes", headers=subHeaders)
}