K-means clustering using Mahalanobis distance. NOTE: think about only accepting standardised data to avoid redundant computation in successive calls of procedure. This and repeated Cholesky fractionisation are potential performance bottlenecks.
Type | Intent | Optional | Attributes | Name | ||
---|---|---|---|---|---|---|
real(kind=wp), | intent(in) | :: | x(nd,nv) |
raw data (samples, variables) |
||
integer(kind=i4), | intent(in) | :: | nd |
number of data points |
||
integer(kind=i4), | intent(in) | :: | nv |
number of variables |
||
integer(kind=i4), | intent(in) | :: | nc |
number of clusters |
||
real(kind=wp), | intent(in) | :: | cm_in(nv,nc) |
initial centroids (raw, not standardised) |
||
real(kind=wp), | intent(out) | :: | gm(nv) |
global means |
||
real(kind=wp), | intent(out) | :: | cm(nv,nc) |
centroids (refined, standardised) |
||
integer(kind=i4), | intent(out) | :: | cl(nd) |
cluster assignments |
||
integer(kind=i4), | intent(out) | :: | cc(nc) |
cluster sizes |
||
real(kind=wp), | intent(out) | :: | cov(nv,nv) |
covariance matrix |
||
real(kind=wp), | intent(out) | :: | sigma(nv) |
standard deviations per variable |
||
real(kind=wp), | intent(in), | optional | :: | cov_in(nv,nv) |
optional covariance matrix |
pure subroutine s_nlp_kmeans_core(x, nd, nv, nc, cm_in, gm, & & cm, cl, cc, cov, sigma, cov_in) ! ==== Description !! K-means clustering using Mahalanobis distance. !! NOTE: think about only accepting standardised data to avoid redundant computation !! in successive calls of procedure. This and repeated Cholesky fractionisation are !! potential performance bottlenecks. ! ==== Declarations real(wp) , intent(in) :: x(nd, nv) !! raw data (samples, variables) integer(i4), intent(in) :: nd !! number of data points integer(i4), intent(in) :: nv !! number of variables integer(i4), intent(in) :: nc !! number of clusters real(wp) , intent(in) :: cm_in(nv, nc) !! initial centroids (raw, not standardised) real(wp) , intent(out) :: cm(nv, nc) !! centroids (refined, standardised) real(wp) , intent(out) :: gm(nv) !! global means integer(i4), intent(out) :: cl(nd) !! cluster assignments integer(i4), intent(out) :: cc(nc) !! cluster sizes real(wp) , intent(out) :: cov(nv,nv) !! covariance matrix real(wp) , intent(out) :: sigma(nv) !! standard deviations per variable real(wp) , intent(in) , optional :: cov_in(nv,nv) !! optional covariance matrix real(wp) :: x1(nd,nv) !! standardised data (mutable) real(wp) :: x2(nd,nv) !! standardised data (preserved) real(wp) :: vec(nd) !! temp vector for standardisation real(wp) :: tmp_cm(nv,nc) !! working centroids real(wp) :: total, mini !! total and minima real(wp) :: tmp_r1, tmp_r2 real(wp) :: pre_vec(nc), post_vec(nc) integer(i4) :: idx_in(nc), idx_out(nc) integer(i4) :: counter(nc) !! cluster counts integer(i4) :: i, j, k, l, cl1, iter integer(i4), parameter :: i_max = c_kmeans_i !! max iterations real(wp) , parameter :: tol = c_conv_tol !! convergence tolerance ! ==== Instructions ! ---- standardise variables and compute covariance (same as cluster_h) x1 = x x2 = x ! standardise do i = 1, nv vec = x1(:, i) ! mean tmp_r1 = f_sts_mean_core(vec) gm(i) = tmp_r1 ! standard deviation tmp_r2 = sqrt(f_sts_var_core(vec, ddf=1.0_wp)) sigma(i) = tmp_r2 ! standardise x1(:, i) = (x1(:, i) - tmp_r1) / tmp_r2 x2(:, i) = (x2(:, i) - tmp_r1) / tmp_r2 enddo ! covariance matrix on standardised data if (present(cov_in)) then cov = cov_in else do i = 1, nv do j = 1, nv cov(i, j) = f_sts_cov_core(x1(:, i), x1(:, j), ddf=1.0_wp) enddo enddo endif ! ---- initialise centroids (standardise cm_in) do i = 1, nc do j = 1, nv cm(j, i) = (cm_in(j, i) - gm(j)) / sigma(j) enddo enddo ! ---- iterative refinement do iter = 1, i_max ! assign each point to nearest centroid do j = 1, nd mini = huge(1.0_wp) cl1 = 1 do i = 1, nc total = f_lin_mahalanobis_core(x1(j, :), cm(:, i), cov) if (total .lt. mini) then mini = total cl1 = i endif enddo cl(j) = cl1 enddo ! reset accumulators tmp_cm = 0.0_wp counter = 0 ! sum up members do j = 1, nd i = cl(j) counter(i) = counter(i) + 1 do l = 1, nv tmp_cm(l, i) = tmp_cm(l, i) + x1(j, l) enddo enddo ! compute new centroids do i = 1, nc if (counter(i) .gt. 0) then tmp_cm(:, i) = tmp_cm(:, i) / real(counter(i), wp) else tmp_cm(:, i) = cm(:, i) endif enddo ! check convergence if (all(abs(tmp_cm - cm) .lt. tol)) exit cm = tmp_cm enddo ! ---- sort clusters by first variable ! sorting do i = 1, nc pre_vec(i) = cm(1, i) idx_in(i) = i enddo call s_utl_sort(pre_vec, nc, 2, idx_in, post_vec, idx_out) tmp_cm = cm do i = 1, nc cm(:, i) = tmp_cm(:, idx_out(i)) cc(i) = counter(idx_out(i)) enddo ! remap assignments to clusters do j = 1, nd do i = 1, nc if (cl(j) .eq. idx_out(i)) then cl(j) = i exit endif enddo enddo end subroutine s_nlp_kmeans_core