The most standard parametric approach in cluster analysis assumes data can be modelled by a finite mixture distribution. The approach has two steps; first, a finite mixture distribution with probability density function \[ f(\;\cdot\; ; \pi_1, \dots, \pi_k, \theta_1, \dots, \theta_k) = \pi_1 (\;\cdot\; ; \theta_1) + \dots + \pi_k f(\;\cdot\; ; \theta_k), \] with \(\sum_{j=1}^k = \pi_j =1\) is fitted to a sample $ X$, obtaining estimates \(\hat{\pi}_1, \dots, \hat{\pi}_k\) and \(\hat{\theta}_1 \dots \hat{\theta}_k\). After the fitting process, each observation $ x$ is assigned to the finite mixture component \(j\), \(1\leq j \leq k\), with \(\hat{\pi}_j f( x ; \hat{\theta}_j)\) maximum.
We are going to work with the dataset ex4.1 used in Baudry et el. (2010) and available in package mclust
. To fit a finite mixture of gaussian distributions we are going to use the same package.
library(mclust)
library(mixpack)
library(ggplot2)
library(dplyr)
data(Baudry_etal_2010_JCGS_examples)
qplot(data=ex4.1, X1, X2)
Function Mclust
allows us to fit a mixture function to a dataset.
m <- Mclust(ex4.1)
summary(m)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust EEV (ellipsoidal, equal volume and shape) model with 6 components:
##
## log.likelihood n df BIC ICL
## -1957.786 600 25 -4075.494 -4248.158
##
## Clustering table:
## 1 2 3 4 5 6
## 79 122 121 107 132 39
Using the function dmixnorm_solution
we can evaluate the probability density function and calculate the posterior probabilities
dens.mixt = dmixnorm_solution(ex4.1, solution = m)
(df <- lapply(1:6, function(i)
(m$parameters$pro[i] * dmixnorm_solution(ex4.1, solution = m, part=i)) %>%
data.frame %>% {./dens.mixt} %>%
setNames(sprintf('p%02d', i)) ) %>% bind_cols) %>% tbl_df
## # A tibble: 600 × 6
## p01 p02 p03 p04 p05
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2.822820e-02 2.243024e-24 8.770852e-86 7.056364e-102 8.334248e-122
## 2 9.727873e-01 1.736318e-24 3.126308e-61 2.072396e-81 3.027038e-90
## 3 3.714200e-35 9.994207e-01 3.467474e-14 8.608995e-117 1.435930e-80
## 4 1.863401e-42 1.061134e-81 1.111488e-01 8.887923e-01 5.891570e-05
## 5 3.867843e-35 4.611201e-94 1.033876e-03 9.983918e-01 5.742791e-04
## 6 1.034939e-17 8.673403e-226 4.242040e-72 1.854718e-14 1.000000e+00
## 7 1.537934e-14 2.954088e-163 3.138887e-30 7.025241e-06 9.999930e-01
## 8 6.262560e-51 1.300271e-87 3.816734e-01 6.183265e-01 6.164994e-08
## 9 2.142260e-50 4.820499e-76 5.404207e-01 4.595778e-01 1.582597e-06
## 10 5.674921e-61 9.999993e-01 1.246030e-10 2.430742e-99 2.604146e-58
## # ... with 590 more rows, and 1 more variables: p06 <dbl>
The posterior probabilities are also available in the object returned by function Mclust
.
m$z %>% tbl_df
## # A tibble: 600 × 6
## V1 V2 V3 V4 V5
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2.850411e-02 2.310219e-24 9.282937e-86 7.504895e-102 9.048667e-122
## 2 9.727797e-01 1.765001e-24 3.220138e-61 2.153801e-81 3.181109e-90
## 3 3.921316e-35 9.994268e-01 3.474206e-14 9.224481e-117 1.498631e-80
## 4 1.822696e-42 1.106672e-81 1.112261e-01 8.887149e-01 5.894606e-05
## 5 3.774169e-35 4.850409e-94 1.036081e-03 9.983893e-01 5.746424e-04
## 6 1.042332e-17 1.003888e-225 4.429355e-72 1.864876e-14 1.000000e+00
## 7 1.509739e-14 3.263020e-163 3.195662e-30 7.038944e-06 9.999930e-01
## 8 6.098899e-51 1.357183e-87 3.817205e-01 6.182795e-01 6.176813e-08
## 9 2.097704e-50 5.003153e-76 5.404484e-01 4.595500e-01 1.583895e-06
## 10 6.084225e-61 9.999993e-01 1.246642e-10 2.579598e-99 2.676552e-58
## # ... with 590 more rows, and 1 more variables: V6 <dbl>
xlimits = seq(-3, 11, 0.05)
ylimits = seq(-3, 8, 0.05)
cm0 = expand.grid(X1 = xlimits, X2 = ylimits) %>% tbl_df %>%
mutate(z = dmixnorm_solution(., solution=m))
ggplot() +
geom_point(data=ex4.1, aes(x=X1, y=X2),alpha=0.2) +
stat_contour(data = cm0, aes(x=X1, y=X2, z=z))
partition = list(1,2,3,4,5,6)
CN6 = lapply(partition, function(part){
expand.grid(X1 = xlimits, X2 = ylimits) %>%
tbl_df %>%
mutate(z = dmixnorm_solution(., m, part = part),
id = sprintf('{%s}',paste(part, collapse=',')))
}) %>% bind_rows
ggplot() +
geom_point(data=ex4.1, aes(x=X1, y=X2),alpha=0.2) +
stat_contour(data = CN6, aes(x=X1, y=X2, z=z, col=id))
partition = list(c(1,6,2),c(3,4),5)
CN6 = lapply(partition, function(part){
expand.grid(X1 = xlimits, X2 = ylimits) %>%
mutate(z = dmixnorm_solution(., m, part = part),
id = sprintf('{%s}',paste(part, collapse=',')))
}) %>% bind_rows
ggplot() +
geom_point(data=ex4.1, aes(x=X1, y=X2),alpha=0.2) +
stat_contour(data = CN6, aes(x=X1, y=X2, z=z, col=id))