s_nlp_kmeans_core

public pure subroutine s_nlp_kmeans_core(x, nd, nv, nc, cm_in, gm, cm, cl, cc, cov, sigma, cov_in)

K-means clustering using Mahalanobis distance. NOTE: think about only accepting standardised data to avoid redundant computation in successive calls of procedure. This and repeated Cholesky fractionisation are potential performance bottlenecks.

Arguments

Type	Intent	Optional		Name
real(kind=wp),	intent(in)		::	x(nd,nv)	raw data (samples, variables)
integer(kind=i4),	intent(in)		::	nd	number of data points
integer(kind=i4),	intent(in)		::	nv	number of variables
integer(kind=i4),	intent(in)		::	nc	number of clusters
real(kind=wp),	intent(in)		::	cm_in(nv,nc)	initial centroids (raw, not standardised)
real(kind=wp),	intent(out)		::	gm(nv)	global means
real(kind=wp),	intent(out)		::	cm(nv,nc)	centroids (refined, standardised)
integer(kind=i4),	intent(out)		::	cl(nd)	cluster assignments
integer(kind=i4),	intent(out)		::	cc(nc)	cluster sizes
real(kind=wp),	intent(out)		::	cov(nv,nv)	covariance matrix
real(kind=wp),	intent(out)		::	sigma(nv)	standard deviations per variable
real(kind=wp),	intent(in),	optional	::	cov_in(nv,nv)	optional covariance matrix

Calls

Help

Called by

Help

Source Code

pure subroutine s_nlp_kmeans_core(x, nd, nv, nc, cm_in, gm, &
                                & cm, cl, cc, cov, sigma, cov_in)

! ==== Description
!! K-means clustering using Mahalanobis distance.
!! NOTE: think about only accepting standardised data to avoid redundant computation
!! in successive calls of procedure. This and repeated Cholesky fractionisation are
!! potential performance bottlenecks.

! ==== Declarations
  real(wp)   , intent(in)            :: x(nd, nv)               !! raw data (samples, variables)
  integer(i4), intent(in)            :: nd                      !! number of data points
  integer(i4), intent(in)            :: nv                      !! number of variables
  integer(i4), intent(in)            :: nc                      !! number of clusters
  real(wp)   , intent(in)            :: cm_in(nv, nc)           !! initial centroids (raw, not standardised)
  real(wp)   , intent(out)           :: cm(nv, nc)              !! centroids (refined, standardised)
  real(wp)   , intent(out)           :: gm(nv)                  !! global means
  integer(i4), intent(out)           :: cl(nd)                  !! cluster assignments
  integer(i4), intent(out)           :: cc(nc)                  !! cluster sizes
  real(wp)   , intent(out)           :: cov(nv,nv)              !! covariance matrix
  real(wp)   , intent(out)           :: sigma(nv)               !! standard deviations per variable
  real(wp)   , intent(in) , optional :: cov_in(nv,nv)           !! optional covariance matrix
  real(wp)                           :: x1(nd,nv)               !! standardised data (mutable)
  real(wp)                           :: x2(nd,nv)               !! standardised data (preserved)
  real(wp)                           :: vec(nd)                 !! temp vector for standardisation
  real(wp)                           :: tmp_cm(nv,nc)           !! working centroids
  real(wp)                           :: total, mini             !! total and minima
  real(wp)                           :: tmp_r1, tmp_r2
  real(wp)                           :: pre_vec(nc), post_vec(nc)
  integer(i4)                        :: idx_in(nc), idx_out(nc)
  integer(i4)                        :: counter(nc)             !! cluster counts
  integer(i4)                        :: i, j, k, l, cl1, iter
  integer(i4), parameter             :: i_max = c_kmeans_i      !! max iterations
  real(wp)   , parameter             :: tol = c_conv_tol        !! convergence tolerance

! ==== Instructions

  ! ---- standardise variables and compute covariance (same as cluster_h)
  x1 = x
  x2 = x

  ! standardise
  do i = 1, nv
     vec = x1(:, i)
     ! mean
     tmp_r1 = f_sts_mean_core(vec)
     gm(i)  = tmp_r1
     ! standard deviation
     tmp_r2   = sqrt(f_sts_var_core(vec, ddf=1.0_wp))
     sigma(i) = tmp_r2
     ! standardise
     x1(:, i) = (x1(:, i) - tmp_r1) / tmp_r2
     x2(:, i) = (x2(:, i) - tmp_r1) / tmp_r2
  enddo

  ! covariance matrix on standardised data
  if (present(cov_in)) then
     cov = cov_in
  else
     do i = 1, nv
        do j = 1, nv
           cov(i, j) = f_sts_cov_core(x1(:, i), x1(:, j), ddf=1.0_wp)
        enddo
     enddo
  endif

  ! ---- initialise centroids (standardise cm_in)
  do i = 1, nc
     do j = 1, nv
        cm(j, i) = (cm_in(j, i) - gm(j)) / sigma(j)
     enddo
  enddo

  ! ---- iterative refinement
  do iter = 1, i_max
     ! assign each point to nearest centroid
     do j = 1, nd
        mini = huge(1.0_wp)
        cl1  = 1
        do i = 1, nc
           total = f_lin_mahalanobis_core(x1(j, :), cm(:, i), cov)
           if (total .lt. mini) then
              mini = total
              cl1  = i
           endif
        enddo
        cl(j) = cl1
     enddo

     ! reset accumulators
     tmp_cm  = 0.0_wp
     counter = 0

     ! sum up members
     do j = 1, nd
        i = cl(j)
        counter(i) = counter(i) + 1
        do l = 1, nv
           tmp_cm(l, i) = tmp_cm(l, i) + x1(j, l)
        enddo
     enddo

     ! compute new centroids
     do i = 1, nc
        if (counter(i) .gt. 0) then
           tmp_cm(:, i) = tmp_cm(:, i) / real(counter(i), wp)
        else
           tmp_cm(:, i) = cm(:, i)
        endif
     enddo

     ! check convergence
     if (all(abs(tmp_cm - cm) .lt. tol)) exit
     cm = tmp_cm
  enddo

  ! ---- sort clusters by first variable

  ! sorting
  do i = 1, nc
     pre_vec(i) = cm(1, i)
     idx_in(i)     = i
  enddo
  call s_utl_sort(pre_vec, nc, 2, idx_in, post_vec, idx_out)

  tmp_cm = cm
  do i = 1, nc
     cm(:, i) = tmp_cm(:, idx_out(i))
     cc(i)        = counter(idx_out(i))
  enddo

  ! remap assignments to clusters
  do j = 1, nd
     do i = 1, nc
        if (cl(j) .eq. idx_out(i)) then
           cl(j) = i
           exit
        endif
     enddo
  enddo

end subroutine s_nlp_kmeans_core

s_nlp_kmeans_core Subroutine

Contents

Source Code

public pure subroutine s_nlp_kmeans_core(x, nd, nv, nc, cm_in, gm, cm, cl, cc, cov, sigma, cov_in)

Arguments

Calls

Called by

Source Code