s_nlp_hclust_core

public pure subroutine s_nlp_hclust_core(x, nd, nv, nc, gm, cm, cl, cc, cov, sigma)

Perform agglomerative hierarchical clustering using centroid linkage and the Mahalanobis distance. NOTE: The procedure is exact, but slow for large nd. For most pracitcal purposes, using Lance–Williams algorithm and other distances is advised. TODO: Implement distance switch and L-W algorithm + approx. distance updates.

Arguments

Type	Intent		Name
real(kind=wp),	intent(in)	::	x(nd,nv)	input data matrix (samples, variables)
integer(kind=i4),	intent(in)	::	nd	number of data points
integer(kind=i4),	intent(in)	::	nv	number of variables
integer(kind=i4),	intent(in)	::	nc	number of clusters (target)
real(kind=wp),	intent(out)	::	gm(nv)	global means for each variable
real(kind=wp),	intent(out)	::	cm(nv,nc)	cluster centroids
integer(kind=i4),	intent(out)	::	cl(nd)	cluster assignments for each data point
integer(kind=i4),	intent(out)	::	cc(nc)	cluster sizes
real(kind=wp),	intent(out)	::	cov(nv,nv)	covariance matrix
real(kind=wp),	intent(out)	::	sigma(nv)	standard deviation per variable

Calls

Help

Called by

Help

Source Code

pure subroutine s_nlp_hclust_core(x, nd, nv, nc, gm, cm, cl, cc, cov, sigma)

! ==== Description
!! Perform agglomerative hierarchical clustering using centroid linkage
!! and the Mahalanobis distance.
!! NOTE: The procedure is exact, but slow for large nd. For most pracitcal
!! purposes, using Lance–Williams algorithm and other distances is advised.
!! TODO: Implement distance switch and L-W algorithm + approx. distance updates.

! ==== Declarations
  real(wp)   , intent(in)  :: x(nd, nv)       !! input data matrix (samples, variables)
  integer(i4), intent(in)  :: nd              !! number of data points
  integer(i4), intent(in)  :: nv              !! number of variables
  integer(i4), intent(in)  :: nc              !! number of clusters (target)
  real(wp)   , intent(out) :: gm(nv)          !! global means for each variable
  real(wp)   , intent(out) :: cm(nv,nc)       !! cluster centroids
  integer(i4), intent(out) :: cl(nd)          !! cluster assignments for each data point
  integer(i4), intent(out) :: cc(nc)          !! cluster sizes
  real(wp)   , intent(out) :: cov(nv,nv)      !! covariance matrix
  real(wp)   , intent(out) :: sigma(nv)       !! standard deviation per variable
  real(wp)                 :: x1(nd,nv)       !! working matrix (standardised data, mutable)
  real(wp)                 :: x2(nd,nv)       !! working matrix (standardised data, preserved)
  real(wp)                 :: vec(nd)
  real(wp)                 :: tmp_r1, tmp_r2
  real(wp)                 :: mini, total
  integer(i4)              :: cluster(nd, nd), nde(nd), new(nd)
  logical                  :: remain(nd)
  integer(i4)              :: cl1, cl2, gone
  integer(i4)              :: idx, i, j, k, s
  integer(i4)              :: re_row(nc)
  real(wp)                 :: pre_vec(nc), post_vec(nc)
  integer(i4)              :: cidx(nc)
  real(wp)                 :: cluster_sum(nv, nd) !! running sum for centroids

! ==== Instructions

  ! ---- standardise variables and compute covariance matrix
  x1 = x
  x2 = x
  do i = 1, nv
     vec = x1(:, i)
     tmp_r1 = f_sts_mean_core(vec)
     gm(i)  = tmp_r1
     tmp_r2 = sqrt(f_sts_var_core(vec, ddf=1.0_wp))
     sigma(i) = tmp_r2
     x1(:, i) = (x1(:, i) - tmp_r1) / tmp_r2
     x2(:, i) = (x2(:, i) - tmp_r1) / tmp_r2
  enddo

  do i = 1, nv
     do j = 1, nv
        cov(i,j) = f_sts_cov_core(x1(:,i), x1(:,j), ddf=1.0_wp)
     enddo
  enddo

  ! ---- initialise cluster membership and running sums
  do j = 1, nd
     remain(j) = .true.
     cluster(j,1) = j
     nde(j) = 1
     cluster(j,2:nd) = 0
     cluster_sum(:, j) = x1(j, :)
  enddo

  ! ---- agglomerative merging
  do s = 1, nd - nc
     mini = huge(1.0_wp)
     do j = 2, nd
        if (.not. remain(j)) cycle       ! skip inactive clusters early
        do k = 1, j-1
           if (remain(k)) then
              ! compute Mahalanobis distance between centroids only
              total = f_lin_mahalanobis_core(cluster_sum(:,j)/real(nde(j),wp), &
                                             cluster_sum(:,k)/real(nde(k),wp), cov)
              if (total .le. mini) then
                 mini = total
                 cl1 = j
                 cl2 = k
              endif
           endif
        enddo
     enddo

     idx = min(cl1, cl2)
     gone = max(cl1, cl2)
     remain(gone) = .false.

     ! update centroid using running sum
     cluster_sum(:, idx) = cluster_sum(:, idx) + cluster_sum(:, gone)
     nde(idx) = nde(idx) + nde(gone)
     x1(idx, :) = cluster_sum(:, idx) / real(nde(idx), wp)

     ! merge membership lists
     new(idx) = nde(idx)
     cluster(idx, nde(idx)-nde(gone)+1:new(idx)) = cluster(gone, 1:nde(gone))
  enddo

  ! ---- sort clusters by first variable
  k = 0
  do j = 1, nd
     if (remain(j)) then
        k = k + 1
        pre_vec(k) = x1(j,1)
        re_row(k) = j
     endif
  enddo
  call s_utl_sort(pre_vec, nc, 2, re_row, post_vec, cidx)

  ! ---- assign centroids and memberships
  do i = 1, nc
     cm(:,i) = x1(re_row(i),:)
  enddo

  cc = 0
  do i = 1, nc
     idx = re_row(i)
     cc(i) = nde(idx)
     do j = 1, nde(idx)
        cl(cluster(idx,j)) = i
     enddo
  enddo

end subroutine s_nlp_hclust_core

s_nlp_hclust_core Subroutine

Contents

Source Code

public pure subroutine s_nlp_hclust_core(x, nd, nv, nc, gm, cm, cl, cc, cov, sigma)

Arguments

Calls

Called by

Source Code