clustering_XMeans.js

import { euclidean, euclidean_squared } from "../metrics/index.js";
import { Matrix, linspace } from "../matrix/index.js";
import { KMeans } from "../clustering/index.js";
/**
 * @class
 * @alias XMeans
 */
export class XMeans{
    /**
     * @constructor
     * @memberof module:clustering
     * @alias XMeans
     * @todo needs restructuring and repairing!!
     * @param {Matrix} matrix 
     * @param {Numbers} K_max
     * @param {Numbers} K_min
     * @param {Function} [metric = euclidean] 
     * @param {Number} [seed = 1987]
     * @returns {XMeans}
     * @see {@link https://www.cs.cmu.edu/~dpelleg/download/xmeans.pdf}
     * @see {@link https://github.com/annoviko/pyclustering/blob/master/pyclustering/cluster/xmeans.py}
     * @see {@link https://github.com/haifengl/smile/blob/master/core/src/main/java/smile/clustering/XMeans.java}
     */
    constructor(matrix, K_max = 10, K_min = 2, metric = euclidean, seed=1987) {
        //const first = super(matrix, K_min, metric, seed, false);
        const first = new KMeans(matrix, K_min, metric, seed, false);
        this._K_max = K_max;
        this._K_min = K_min;
        this._metric = metric;
        first.init(K_min, first._get_random_centroids(K_min));
        const randomizer = this._randomizer = first._randomizer;
        const centroids = first._cluster_centroids;
        const candidates = this._candidates = {};
        let K = K_min
        candidates[K] = {
            "kmeans": first,
            "cluster_centroids": centroids,
            "score": null,
        };
        const A = this._matrix = matrix;
        const N = A.shape[0];
        // foreach K in [K_min, K_max];
        do {
            console.log(K, candidates)
            const candidate = candidates[K];
            const clusters = candidate.kmeans.get_clusters();
            const parent_bic = this._bic(clusters, centroids, linspace(0, N - 1))
            candidate.score = parent_bic;
            const child_bic = [];
            const child_kmeans = [];
            const child_indices = [];
            // foreach cluster
            for (let j = 0; j < K; ++j) {
                const cluster = clusters[j];
                console.log(cluster.length)
                if (cluster.length < K_max) continue;
                const subset = Matrix.from(cluster.map(d => A.row(d)));
                const subset_kmeans = new KMeans(subset, 2, metric, 1987, false);
                subset_kmeans._randomizer = randomizer;
                subset_kmeans.init()
                const subset_cluster = subset_kmeans.get_clusters();
                const subset_centroids = subset_kmeans._cluster_centroids;
                const bic = this._bic(subset_cluster, subset_centroids, cluster)
                child_bic.push(bic);
                child_kmeans.push(subset_kmeans);
                child_indices.push(j);
            }
            //if (child_bic.length < (K )) break;
            //choose best
            let best_split = child_indices[0];
            let best_bic = child_bic[0];
            for (let i = 0; i < child_bic.length; ++i) {
                if (best_bic > child_bic[i]) {
                    best_split = child_indices[i];
                    best_bic = child_bic[i];
                }
            }
            const best_cluster_centroids = candidate.cluster_centroids.splice(best_split, 1, ...child_kmeans[best_split]._cluster_centroids);
            console.log(candidate.cluster_centroids, child_kmeans[best_split]._cluster_centroids)
            
            const parent_clusters = candidate.kmeans._clusters;
            const best_candidate_clusters = child_kmeans[best_split]._clusters;
            const best_candidate = new KMeans(A, K + 1, metric, 1987, false)
            best_candidate._randomizer = randomizer;
            // set clusters and centroids
            let counter = 0;
            //let cluster_number = best_cluster_centroids.length;
            best_candidate._clusters = parent_clusters.map(c => {
                if (c == best_split) {
                    return c + best_candidate_clusters[counter++];
                } else if (c > best_split) {
                    return c + 1;
                }
                return c;
            })
            //best_candidate._K = K + 1;
            console.log(best_candidate.get_clusters())
            //best_candidate.init(K + 1, best_cluster_centroids);
            console.log(best_candidate)
            //save best candidate.
            candidates[K + 1] = {
                "kmeans": best_candidate,
                "cluster_centroids": best_cluster_centroids,
                "score": child_bic[best_split],
            }
            

        } while (++K < K_max)


        // return best candidate.
        return this;
    }

    get_clusters() {
        let K_min = this._K_min;
        let K_max = this._K_max;
        const candidates = this._candidates;
        let best_score = candidates[K_min].score;
        let best_candidate = candidates[K_min].kmeans;
        for (let i = K_min + 1; i < K_max; ++i) {
            if (!(i in candidates)) break;
            const candidate = candidates[i];
            const score = candidate.score
            if (best_score < score) {
                best_score = score;
                best_candidate = candidate.kmeans;
            }
        }
        return best_candidate.get_clusters();
    }
    
    _bic(clusters, centroids, indices) {
        const A = this._matrix;
        const D = this._matrix.shape[1];
        const K = centroids.length;
        //const result = new Array(K).fill();
        let result = 0;

        let variance = 0;
        for (let i = 0; i < K; ++i) {
            const cluster = clusters[i];
            const centroid = centroids[i];
            const n = cluster.length;
            for (let j = 0; j < n; ++j) {
                variance += euclidean_squared(centroid, A.row(indices[cluster[j]])) ** 2;
            }
        }
        const N = clusters.reduce((a, b) => a + b.length, 0);
        const p = (K - 1) + D * K + 1;
        variance /= (N - K);

        for (let i = 0; i < K; ++i) {
            const n = clusters[i].length;
            const log_likelihood = 
                (n * Math.log(2 * Math.PI) -
                n * D * Math.log(variance) -
                (n - K)) * .5 + 
                n * Math.log(n) - 
                n * Math.log(N);
            result += log_likelihood - p * .5 * Math.log(N);
        }
        return result;
    }
    
}