Create 05_b_svd_pca_slides.Rmd and 15_loocv_slides.Rmd
parent b25e3a153a
commit 2f06eaed4b
.gitignore (vendored): 4 additions

@@ -20,6 +20,9 @@
 05_b_svd_pca.pdf
 05_b_svd_pca_cache/
 05_b_svd_pca_files/
+05_b_svd_pca_slides.pdf
+05_b_svd_pca_slides_cache/
+05_b_svd_pca_slides_files/
 06_matrice_definie_non_negative.pdf
 07_multiplicateurs_de_lagrange.pdf
 08_derivation_svd.pdf
@@ -32,6 +35,7 @@
 14_geometrie_ridge_svd.pdf
 14_geometrie_ridge_svd_slides.pdf
 15_loocv.pdf
+15_loocv_slides.pdf
 16_biais_variance_estimateur.pdf
 17_biais_variance_ridge.pdf
 18_kernel_ridge_regression.pdf

05_b_svd_pca_slides.Rmd (new file, 233 lines)
@@ -0,0 +1,233 @@
---
title: "05-b SVD and Principal Component Analysis"
author: Pierre-Edouard Portier
date: March 2022
output:
  beamer_presentation:
    incremental: false
---

# Projection onto the principal axes

>- $\mathbf{X}\in\mathbb{R}^{n \times p} \quad ; \quad \mathbf{X} = \sum_{\alpha}\sqrt{\lambda_\alpha}\mathbf{u_\alpha}\mathbf{v_\alpha}^T \quad\equiv\quad \mathbf{X} = \mathbf{U} \mathbf{D} \mathbf{V}^T$
>- Coordinates (factors) of the observations on the principal axes: $\mathbf{F} \triangleq \mathbf{X}\mathbf{V} = \mathbf{U} \mathbf{D} \mathbf{V}^T\mathbf{V} = \mathbf{U} \mathbf{D}$
>- Covariance of the factors: $\left(\mathbf{U} \mathbf{D}\right)^T\left(\mathbf{U} \mathbf{D}\right) = \mathbf{D}^2$
>- The factors are orthogonal (by construction)
>- $\lambda_\alpha = \sum_i f_{i,\alpha}^2$ is the variance explained by axis $\mathbf{v_\alpha}$
>- Contribution of the observations to the axes: $CTR_{i,\alpha} = \frac{f_{i,\alpha}^2}{\sum_i f_{i,\alpha}^2} = \frac{f_{i,\alpha}^2}{\lambda_\alpha}$
>- $\sum_i CTR_{i,\alpha} = 1$, and $\mathbf{x_i}$ weighs on $\mathbf{v_\alpha}$ when $CTR_{i,\alpha} \geq 1/n$
>- Contribution of the axes to the observations: $COS2_{i,\alpha} = \frac{f_{i,\alpha}^2}{\sum_\alpha f_{i,\alpha}^2} = \frac{f_{i,\alpha}^2}{d_{i,g}^2}$
>- $d_{i,g}^2 = \sum_j \left(x_{i,j}-g_j\right)^2$ ($\sum_j x_{i,j}^2$ for centered data)
>- $\sum_i d_{i,g}^2 = \mathcal{I} = \sum_\alpha \lambda_\alpha$
>- Contribution of the variables to the axes: $VARCTR_{j,\alpha} = \left(\frac{\sqrt{\lambda_\alpha}}{\sqrt{n-1}}\mathbf{v_\alpha}\right)^2$ (see the numeric check below)
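
A minimal sketch checking these identities numerically on a small random centered matrix; `M`, `s`, and `Fac` are illustrative names, separate from the `abalone` example that follows:

```{r, eval=FALSE}
# Sanity check of the identities above on a small centered matrix
M <- scale(matrix(rnorm(20), 5, 4), scale = FALSE)
s <- svd(M)
Fac <- s$u %*% diag(s$d)           # factors F = UD
max(abs(M %*% s$v - Fac))          # XV = UD, should be ~0
colSums(Fac^2) - s$d^2             # lambda_alpha = sum_i f_{i,alpha}^2, ~0
sum(rowSums(M^2)) - sum(s$d^2)     # total inertia = sum of the lambdas, ~0
```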

# Example `abalone`

## Retrieving the dataset

```{r cache=TRUE}
abalone.cols <- c("sex", "length", "diameter", "height",
                  "whole.wt", "shucked.wt", "viscera.wt",
                  "shell.wt", "rings")

url <- 'http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data'
abalone <- read.table(url, sep=",", row.names=NULL,
                      col.names=abalone.cols,
                      nrows=4177)
abalone <- subset(abalone, height!=0)
```

## Selecting the numeric explanatory variables

```{r}
X <- abalone[,c("length", "diameter", "height", "whole.wt",
                "shucked.wt", "viscera.wt", "shell.wt")]
```

# Example `abalone`

## Standardization

```{r}
standard <- function(X) {
  meanX <- apply(X, 2, mean)
  sdX <- apply(X, 2, sd)
  Y <- sweep(X, 2, meanX, FUN = "-")
  Y <- sweep(Y, 2, sdX, FUN = "/")
  list("X" = Y, "meanX" = meanX, "sdX" = sdX)
}
```

# Example `abalone`

## Clustering

```{r warning=FALSE}
set.seed(1123)
std.X <- standard(X)
A <- std.X$X
# summarize the observations by 100 k-means centers before the SVD
nbClust <- 100
clst <- kmeans(A, nbClust, nstart = 25)
```

# Example `abalone`

## SVD of the cluster centers

```{r}
std.centers <- standard(clst$centers)
C <- std.centers$X
svd.C <- svd(C)
```

## Percentage of variance explained by each principal axis

```{r}
round((svd.C$d^2 / sum(svd.C$d^2))*100, 2)
```

# Example `abalone`

## Factors and contributions of the observations to the axes

- $\mathbf{F} \triangleq \mathbf{X}\mathbf{V} = \mathbf{U} \mathbf{D} \mathbf{V}^T\mathbf{V} = \mathbf{U} \mathbf{D}$
- $CTR_{i,\alpha} = \frac{f_{i,\alpha}^2}{\sum_i f_{i,\alpha}^2} = \frac{f_{i,\alpha}^2}{\lambda_\alpha}$

```{r}
fact <- svd.C$u %*% diag(svd.C$d)
fact2 <- fact^2
ctr <- sweep(fact2, 2, colSums(fact2), "/")
```

# Example `abalone`

## Displaying the observations important for the first two principal axes

```{r, eval=FALSE}
n <- dim(fact)[1]
d1BestCtr <- which(ctr[,1] > 1/n)
d2BestCtr <- which(ctr[,2] > 1/n)
d1d2BestCtr <- union(d1BestCtr, d2BestCtr)
plot(fact[d1d2BestCtr,1], fact[d1d2BestCtr,2], pch="")
text(fact[d1d2BestCtr,1], fact[d1d2BestCtr,2],
     d1d2BestCtr, cex=0.8)
```

# Example `abalone`

```{r, echo=FALSE}
n <- dim(fact)[1]
d1BestCtr <- which(ctr[,1] > 1/n)
d2BestCtr <- which(ctr[,2] > 1/n)
d1d2BestCtr <- union(d1BestCtr, d2BestCtr)
plot(fact[d1d2BestCtr,1], fact[d1d2BestCtr,2], pch="")
text(fact[d1d2BestCtr,1], fact[d1d2BestCtr,2],
     d1d2BestCtr, cex=0.8)
```

# Example `abalone`

```{r}
f2.max <- which.max(abs(fact[,2]))
f2.max.size <- clst$size[f2.max]
f2.max.name <- names(clst$cluster[clst$cluster==f2.max])
f2.max.ctr <- round(ctr[f2.max,2]*100, 2)
```

- Cluster `r f2.max` explains `r f2.max.ctr` % of the variance of the second axis.
- It contains `r f2.max.size` element(s).

```{r}
abalone[f2.max.name,]
```

# Example `abalone`

## New analysis

```{r, warning=FALSE}
abalone <- abalone[!(row.names(abalone) %in% f2.max.name),]
X <- abalone[,c("length", "diameter", "height", "whole.wt",
                "shucked.wt", "viscera.wt", "shell.wt")]
std.X <- standard(X)
A <- std.X$X
nbClust <- 100
clst <- kmeans(A, nbClust, nstart = 25)
std.centers <- standard(clst$centers)
C <- std.centers$X
svd.C <- svd(C)
round((svd.C$d^2 / sum(svd.C$d^2))*100, 2)
```

# Example `abalone`

## New analysis (continued) and display of the observations on the first two axes

```{r}
fact <- svd.C$u %*% diag(svd.C$d)
fact2 <- fact^2
ctr <- sweep(fact2, 2, colSums(fact2), "/")
n <- dim(fact)[1]
d1BestCtr <- which(ctr[,1] > 1/n)
d2BestCtr <- which(ctr[,2] > 1/n)
d1d2BestCtr <- union(d1BestCtr, d2BestCtr)
```

```{r, eval=FALSE}
plot(fact[d1d2BestCtr,1], fact[d1d2BestCtr,2], pch="")
text(fact[d1d2BestCtr,1], fact[d1d2BestCtr,2],
     d1d2BestCtr, cex=0.8)
```

# Example `abalone`

```{r, echo=FALSE}
plot(fact[d1d2BestCtr,1], fact[d1d2BestCtr,2], pch="")
text(fact[d1d2BestCtr,1], fact[d1d2BestCtr,2], d1d2BestCtr, cex=0.8)
```

# Example `abalone`

## An intriguing cluster...

```{r}
intrig <- 43
intrig.name <- names(clst$cluster[clst$cluster==intrig])
abalone[intrig.name,]
```

# Example `abalone`

## An intriguing cluster...

```{r}
boxplot(abalone$height)
```

# Example `abalone`

## An intriguing cluster...

- Contribution of the axes to the deviations of the observations from the center of inertia: $COS2_{i,\alpha} = \frac{f_{i,\alpha}^2}{\sum_\alpha f_{i,\alpha}^2} = \frac{f_{i,\alpha}^2}{d_{i,g}^2}$

```{r}
cos2 <- sweep(fact2, 1, rowSums(fact2), "/")
round(cos2[intrig,]*100, 2)
```

- The intriguing observation is entirely explained by the first two axes.
- There is no risk, therefore, in regarding its distance from the origin as significant.

# Example `abalone`

## Contributions of the variables to the principal axes

- $VARCTR_{j,\alpha} = \left(\frac{\sqrt{\lambda_\alpha}}{\sqrt{n-1}}\mathbf{v_\alpha}\right)^2$

```{r}
varctr <- ((svd.C$v %*% diag(svd.C$d))/sqrt(n-1))^2
rownames(varctr) <- colnames(X)
round(varctr, 2)
```

- The variables are highly correlated and all weigh heavily on the variance explained by axis 1.

15_loocv_slides.Rmd (new file, 163 lines)
@@ -0,0 +1,163 @@
---
title: "15 Leave-One-Out Cross-Validation"
author: Pierre-Edouard Portier
date: March 2022
output:
  beamer_presentation:
    incremental: false
---

```{r, include=FALSE}
source("01_intro.R", local = knitr::knit_global())
source("04_validation_croisee.R", local = knitr::knit_global())
source("15_loocv.R", local = knitr::knit_global())
```

# Leave-One-Out Cross-Validation (LOOCV)

>- Cross-validation with $n$ folds!
>- $\hat{\boldsymbol\beta}_\lambda^{(-i)} = \left( \mathbf{X^{(-i)}}^T\mathbf{X^{(-i)}} + \lambda\mathbf{I} \right)^{-1} \mathbf{X^{(-i)}}^T \mathbf{y^{(-i)}}$
>- $LOO_\lambda = \frac{1}{n} \sum_{i=1}^{n} \left( y_i - \mathbf{x_i}^T \hat{\boldsymbol\beta}_\lambda^{(-i)} \right)^2$
>- $\mathbf{X^{(-i)}}^T\mathbf{X^{(-i)}} + \lambda\mathbf{I} = \mathbf{X^T}\mathbf{X} + \lambda\mathbf{I} - \mathbf{x_i}\mathbf{x_i}^T$ (checked numerically below)
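
A quick check of the last identity, as a sketch on simulated data (`X` and `i` are illustrative, not from the course scripts):

```{r, eval=FALSE}
# Removing observation i downdates the Gram matrix by x_i x_i^T
X <- matrix(rnorm(10 * 2), 10, 2)
i <- 3
max(abs(t(X) %*% X - outer(X[i, ], X[i, ]) - t(X[-i, ]) %*% X[-i, ]))  # ~0
```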

# The hat matrix

>- $\mathbf{\hat{y}_\lambda} = \mathbf{X}\hat{\boldsymbol\beta}_\lambda = \mathbf{X} \left( \mathbf{X}^T \mathbf{X} + \lambda \mathbf{I}\right)^{-1} \mathbf{X}^T \mathbf{y} = \mathbf{H} \mathbf{y}$
>- $h_{ij}$ is the influence of $y_j$ on the prediction $\hat{y}_i$
>- $h_{ii}$, the influence of $y_i$ on $\hat{y}_i$, identifies the observations with outsized influence
>- $\mathbf{x_i}^T \left( \mathbf{X}^T \mathbf{X} + \lambda \mathbf{I}\right)^{-1} \mathbf{x_i} = h_{ii} \triangleq h_i$
>- $\mathbf{\hat{y}_\lambda} = \sum_{d_j>0} \mathbf{u_j} \frac{d_j^2}{d_j^2 + \lambda} \mathbf{u_j}^T\mathbf{y}$
>- $h_i = \sum_{d_j>0} \frac{d_j^2}{d_j^2 + \lambda} u_{ij}^2$
>- $tr(\mathbf{H}(\lambda)) = \sum_i h_i = \sum_{d_j>0} \frac{d_j^2}{d_j^2 + \lambda}$
>- $tr(\mathbf{H}(0)) = rank(\mathbf{X})$, the \emph{degrees of freedom} of the regression (see the sketch below)
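
A minimal sketch checking these leverage identities, assuming a small simulated design matrix:

```{r, eval=FALSE}
# Leverages h_i from the SVD, checked against the diagonal of H
set.seed(1123)
X <- matrix(rnorm(50 * 3), 50, 3)
lambda <- 0.1
s <- svd(X)
h <- rowSums(sweep(s$u^2, 2, s$d^2 / (s$d^2 + lambda), "*"))
H <- X %*% solve(t(X) %*% X + lambda * diag(3), t(X))
max(abs(h - diag(H)))                   # ~0
sum(h) - sum(s$d^2 / (s$d^2 + lambda))  # trace identity, ~0
# at lambda = 0 the trace equals rank(X), the degrees of freedom
```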

# The Sherman-Morrison formula and efficient computation of LOOCV

>- $\left( \mathbf{A} + \mathbf{u}\mathbf{v}^T \right)^{-1} = \mathbf{A}^{-1} - \frac{\mathbf{A}^{-1}\mathbf{u}\mathbf{v}^T\mathbf{A}^{-1}}{1 + \mathbf{v}^T\mathbf{A}^{-1}\mathbf{u}}$
>- It simplifies $\left( \mathbf{X^{(-i)}}^T\mathbf{X^{(-i)}} + \lambda\mathbf{I} \right)^{-1}$ and yields...
>- $$y_i - \hat{y}^{(-i)}_{\lambda i} = \frac{y_i - \hat{y}_{\lambda i}}{1 - h_i}$$
>- $$
\begin{aligned}
&LOO_\lambda = \frac{1}{n} \sum_{i=1}^{n} \left( \frac{y_i - \hat{y}_{\lambda i}}{1 - h_i} \right)^2 \\
&\text{with } h_i = \sum_{d_j>0} \frac{d_j^2}{d_j^2 + \lambda} u_{ij}^2
\end{aligned}
$$
>- This criterion can be unstable when some $h_i$ are close to $1$... (see the check below)
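
A minimal sketch comparing the shortcut with a brute-force leave-one-out loop, assuming small simulated data:

```{r, eval=FALSE}
# Check y_i - yhat^(-i) = (y_i - yhat_i) / (1 - h_i) for ridge
set.seed(1123)
X <- matrix(rnorm(30 * 2), 30, 2); y <- rnorm(30)
lambda <- 0.5
beta <- solve(t(X) %*% X + lambda * diag(2), t(X) %*% y)
H <- X %*% solve(t(X) %*% X + lambda * diag(2), t(X))
h <- diag(H); yhat <- c(X %*% beta)
shortcut <- (y - yhat) / (1 - h)
brute <- sapply(1:30, function(i) {
  b <- solve(t(X[-i, ]) %*% X[-i, ] + lambda * diag(2), t(X[-i, ]) %*% y[-i])
  y[i] - c(X[i, ] %*% b)
})
max(abs(shortcut - brute))  # ~0
```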

# Generalized cross-validation (GCV)

Replacing each $h_i$ by the average leverage $\frac{1}{n} tr(\mathbf{H}(\lambda))$ gives a more stable criterion:

$$
\begin{aligned}
&GCV_\lambda = \frac{1}{n} \sum_{i=1}^{n} \left( \frac{y_i - \hat{y}_{\lambda i}}{1 - \frac{1}{n} tr(\mathbf{H}(\lambda))} \right)^2 \\
&\text{with } tr(\mathbf{H}(\lambda)) = \sum_{d_j>0} \frac{d_j^2}{d_j^2 + \lambda}
\end{aligned}
$$

# Implementing GCV

\small
```{r, eval=FALSE}
ridge.svd <- function(data, degre) {
  X <- data$X
  n <- nrow(X)
  # polynomial design matrix, standardized column by column
  A <- scale(outer(c(X), 1:degre, "^"))
  Ym <- mean(data$Y)
  Y <- data$Y - Ym
  As <- svd(A)
  d <- As$d
  function(lambda) {
    # ridge coefficients from the SVD, mapped back to the original scale
    coef <- c(As$v %*% ((d / (d^2 + lambda)) * (t(As$u) %*% Y)))
    coef <- coef / attr(A,"scaled:scale")
    inter <- Ym - coef %*% attr(A,"scaled:center")
    coef <- c(inter, coef)
    trace.H <- sum(d^2 / (d^2 + lambda))
    pred <- polyeval(coef, X)
    # GCV criterion; residuals taken on the original scale of Y
    gcv <- sum( ((data$Y - pred) / (1 - (trace.H / n))) ^ 2 ) / n
    list(coef = coef, gcv = gcv)
  }
}
```
\normalsize

# Implementing GCV

\small
```{r, eval=FALSE}
ridge.loocv <- function(data, deg, lambdas) {
  errs <- double(length(lambdas))
  coefs <- matrix(data = NA, nrow = length(lambdas), ncol = 1+deg)
  ridge <- ridge.svd(data, deg)
  idx <- 1
  for(lambda in lambdas) {
    res <- ridge(lambda)
    coefs[idx,] <- res$coef
    errs[idx] <- res$gcv
    idx <- idx + 1
  }
  # index of the smallest GCV error (first one in case of ties)
  best <- which.min(errs)
  lambda.best <- lambdas[best]
  coef.best <- coefs[best,]
  pred <- polyeval(coef.best, data$X)
  mae <- mean(abs(pred - data$Y))
  list(coef = coef.best, lambda = lambda.best, mae = mae)
}
```
\normalsize

# Example

\small
```{r, eval=FALSE}
set.seed(1123)
n <- 100
deg <- 8
data <- gendat(n, 0.2)
splitres <- splitdata(data, 0.8)
entr <- splitres$entr
test <- splitres$test
lambdas <- c(1E-8, 1E-7, 1E-6, 1E-5, 1E-4, 1E-3, 1E-2, 1E-1, 1)
ridge.res <- ridge.loocv(entr, deg, lambdas)
plt(entr, f)
pltpoly(ridge.res$coef)
```
\normalsize

# Example - training set

```{r, echo=FALSE}
set.seed(1123)
n <- 100
deg <- 8
data <- gendat(n, 0.2)
splitres <- splitdata(data, 0.8)
entr <- splitres$entr
test <- splitres$test
lambdas <- c(1E-8, 1E-7, 1E-6, 1E-5, 1E-4, 1E-3, 1E-2, 1E-1, 1)
ridge.res <- ridge.loocv(entr, deg, lambdas)
plt(entr, f)
pltpoly(ridge.res$coef)
```

```{r, echo=FALSE}
bestLambda <- ridge.res$lambda
maeTrain <- ridge.res$mae
```

- $\lambda$: `r bestLambda`
- Mean absolute error on the training set: `r round(maeTrain,3)`

# Example - test set

```{r, echo=FALSE}
plt(test, f)
pltpoly(ridge.res$coef)
```

# Example - test set

```{r}
testpred <- polyeval(ridge.res$coef, test$X)
testmae <- mean(abs(testpred - test$Y))
```

- Mean absolute error of `r round(testmae,3)` on the test set.