Friday, April 22, 2016

STL Version of k-Mean Clustering Algorithm

Today got some time to implement the k-Mean clustering algorithm in C++. It's far from perfection, but probably it's workable. Post here for further improvement.

Note it's done on Visual Studio 2012, older versions is no guarantee to work.

ClusteringUtils.h

#pragma once

#include <vector>
#include <cmath>
#include <limits>
#include <cstddef>
#include <climits>
#include <algorithm>
#include <functional>
#include <iostream>

using namespace std;

// Wrapped class for array, never manages any resource
// The reason for devising such a wrapper because for handling
// the throughput of data feed
template<typename T, unsigned int N>
class Vector 
{
public:
Vector(T* v) : m_v(v) {}
T* begin()
{
return m_v;
}
T* end()
{
return m_v + N;
}

T operator[](unsigned int index) const
{
return m_v[index];
}

T& operator[](unsigned int index)
{
return m_v[index];
}

operator vector<double>()
{
vector<double> v(begin(), end());
return std::move(v);
}

private:
T* m_v;
};

// Data structure for cluster
// Note the elements of center will be of data type double
template<typename T, unsigned int N>
class tagClusInfo
{
public:
void reset() 
{
clus.clear();
}
vector< Vector<T, N>* > clus;
vector<double> centre;
};

// Abstract interface for clustering algorithms
template<typename T, unsigned int N>
class CClustering
{
public:
virtual bool Clustering(vector< Vector<T, N> >& inputs) { return false; }
virtual bool PrintCluster(int index) { return false; }

protected:

double distanceCalc(vector<double>& p, vector<double>& q)
{
double c = 0;
auto fn = [&](double& a, double& b) {
c += (a - b) * (a - b);
};

for(auto itp = p.begin(), itq = q.begin(); itp != p.end() && itq != q.end(); ++ itp, ++ itq)
fn(*itp, *itq);

return sqrt(c);
}

double distanceCalc(vector<double>& p, Vector<T, N>& q)
{
double c = 0;
auto fn = [&](double& a, T& b) {
c += (a - b) * (a - b);
};

auto itp = p.begin();
auto itq = q.begin();
for(; itp != p.end() && itq != q.end(); ++ itp, ++ itq)
fn(*itp, *itq);

return sqrt(c);
}

double distanceCalc(Vector<T, N>& p, vector<double>& q)
{
double c = 0;
auto fn = [&](T& a, double& b) {
c += (a - b) * (a - b);
};

auto itp = p.begin();
auto itq = q.begin();
for(; itp != p.end() && itq != q.end(); ++ itp, ++ itq)
fn(*itp, *itq);

return sqrt(c);
}

double distanceCalc(Vector<T, N>& p, Vector<T, N>& q)
{
double c = 0;
auto fn = [&](T& a, T& b) {
c += (a - b) * (a - b);
};

for(auto itp = p.begin(), itq = q.begin(); itp != p.end() && itq != q.end(); ++ itp, ++ itq)
fn(*itp, *itq);

return sqrt(c);
}

vector<double> centerCalc(vector< Vector<T, N>* >& elems, int count)
{
vector<double> center = *elems[0];
auto it = elems.begin();

for(it ++; it != elems.end(); ++ it)
{
transform((*it)->begin(), (*it)->end(), center.begin(), center.begin(), plus<double>());
}

transform(center.begin(), center.end(), center.begin(), [=](double& c) { return c /= (double)count; });

return std::move(center);
}

vector<double> centerCalc(vector< vector<double>* >& elems, int count)
{
vector<double> center = *elems[0];
auto it = elems.begin();

for(it ++; it != elems.end(); ++ it)
{
transform((*it)->begin(), (*it)->end(), center.begin(), center.begin(), plus<double>());
}

transform(center.begin(), center.end(), center.begin(), [=](double& c) { return c /= (double)count; });

return std::move(center);
}

void printElem(vector<double>& v)
{
for_each(v.begin(), v.end(), [](double& c){
cout << "\t" << c;
});

cout << endl;
}

void printElem(Vector<T, N>& v)
{
for_each(v.begin(), v.end(), [](T& c){
cout << "\t" << c;
});

cout << endl;
}
};

KMeanClustering.h

#pragma once

#include <time.h>

#include "ClusteringUtils.h"

#define KMEANDEBUG 0

template<typename T, unsigned int N>
class tagKMeanClusInfo : public tagClusInfo<T, N>
{
public:
tagKMeanClusInfo() : varClus(0) {}

private:
double varClus;
};

template<typename T, unsigned int N>
class CKMeanClustering : public CClustering<T, N>
{
public:
CKMeanClustering(unsigned numOfClusters = 2, double thres = 1.0) : m_numOfClusters(numOfClusters), m_thres(thres) 
{
for (unsigned i = 0; i < m_numOfClusters; i ++)
m_clusts.push_back(tagKMeanClusInfo<T, N>());
}

~CKMeanClustering(void) {}
virtual bool Clustering(vector< Vector<T, N> >& inputs)
{
if (inputs.size() < m_numOfClusters)
return false;

// Initialize cluster centers
// Random assign cluster centers
vector<int> indices;
srand(clock());
do 
{
int index = rand() % inputs.size();
if (find(indices.begin(), indices.end(), index) == indices.end())
indices.push_back(index);
} while (indices.size() != m_numOfClusters);

#if KMEANDEBUG

cout << "The indices chosen to be centers are" << endl;
for_each(indices.begin(), indices.end(), [=](int& i){
cout << i << '\t';
});
cout << endl;
#endif
// Here use capture method reference to fool the compiler
// Otherwise, it's too wise to try to move
for_each(indices.begin(), indices.end(), [&](int& i){
m_centers.push_back(*(inputs.begin() + i));
});

__repeat:

#if KMEANDEBUG
cout << "The centers are as follows in the current round" << endl;
for_each(m_centers.begin(), m_centers.end(), [=](vector<double>& v){
printElem(v);
});
cout << endl;
#endif

for (unsigned i = 0; i < m_numOfClusters; i ++)
m_clusts[i].reset();

for (unsigned int i = 0; i < inputs.size(); i ++)
{
double distance = numeric_limits<double>::max();
int md_index = -1;
unsigned int k = 0;

for (; k < m_centers.size(); k ++)
{
// find the minimum distance to a center
double t = distanceCalc(inputs[i], m_centers[k]);

if (t < distance)
{
distance = t;
md_index = k;
}
}

m_clusts[md_index].clus.push_back(&inputs[i]);
}

// Compute new average as new center for each cluster
for (unsigned int k = 0; k < m_centers.size(); k ++)
{
m_clusts[k].centre = centerCalc(m_clusts[k].clus, m_clusts[k].clus.size());
}
// Updating to other parameters are neglected


// If center has changed then iterate, else terminate
for (unsigned int k = 0; k < m_centers.size(); k ++)
{
if (distanceCalc(m_clusts[k].centre, m_centers[k]) > m_thres)
{
for (unsigned l = 0; l < m_centers.size(); l ++)
m_centers[l] = m_clusts[l].centre;

goto __repeat;
}
}

return true;
}

virtual bool PrintCluster(int index)
{
if (index < 0 || index >= (int)m_numOfClusters)
return false;
cout << "Cluster " << index << ":" << endl;
cout << "center:" << endl;
printElem(m_clusts[index].centre);

cout << "members:" << endl;
for (auto it = m_clusts[index].clus.begin(); it != m_clusts[index].clus.end(); ++ it)
printElem(**it);

cout << endl;

return true;
}

private:
double m_thres;
unsigned int m_numOfClusters;
vector< vector<double> > m_centers;
vector< tagKMeanClusInfo<T, N> > m_clusts;
double m_varTotal;
};

extern void KMeanClusteringTest();

KMeanClustering.cpp

#include "stdafx.h"
#include "KMeanClustering.h"

void KMeanClusteringTest()
{
CKMeanClustering<int, 2> kMeanClustering;

int data[][2] = {
{2, 1},
{3, 1},
{2, 4},
{3, 2},
{2, 3},
{1, 3}
};

vector< Vector<int, 2> > inputs;
Vector<int, 2> v1(data[0]), 
v2(data[1]), 
v3(data[2]),
v4(data[3]), 
v5(data[4]),
v6(data[5]);


inputs.push_back(v1);
inputs.push_back(v2);
inputs.push_back(v3);
inputs.push_back(v4);
inputs.push_back(v5);
inputs.push_back(v6);

kMeanClustering.Clustering(inputs);
kMeanClustering.PrintCluster(0);
kMeanClustering.PrintCluster(1);
}

Main.cpp


#include "stdafx.h"

#include "KMeanClustering.h"

int _tmain(int argc, _TCHAR* argv[])
{
KMeanClusteringTest();
return 0;
}

Happy coding!