Today got some time to implement the k-Mean clustering algorithm in C++. It's far from perfection, but probably it's workable. Post here for further improvement.
Note it's done on Visual Studio 2012, older versions is no guarantee to work.
ClusteringUtils.h
#pragma once
#include <vector>
#include <cmath>
#include <limits>
#include <cstddef>
#include <climits>
#include <algorithm>
#include <functional>
#include <iostream>
using namespace std;
// Wrapped class for array, never manages any resource
// The reason for devising such a wrapper because for handling
// the throughput of data feed
template<typename T, unsigned int N>
class Vector
{
public:
Vector(T* v) : m_v(v) {}
T* begin()
{
return m_v;
}
T* end()
{
return m_v + N;
}
T operator[](unsigned int index) const
{
return m_v[index];
}
T& operator[](unsigned int index)
{
return m_v[index];
}
operator vector<double>()
{
vector<double> v(begin(), end());
return std::move(v);
}
private:
T* m_v;
};
// Data structure for cluster
// Note the elements of center will be of data type double
template<typename T, unsigned int N>
class tagClusInfo
{
public:
void reset()
{
clus.clear();
}
vector< Vector<T, N>* > clus;
vector<double> centre;
};
// Abstract interface for clustering algorithms
template<typename T, unsigned int N>
class CClustering
{
public:
virtual bool Clustering(vector< Vector<T, N> >& inputs) { return false; }
virtual bool PrintCluster(int index) { return false; }
protected:
double distanceCalc(vector<double>& p, vector<double>& q)
{
double c = 0;
auto fn = [&](double& a, double& b) {
c += (a - b) * (a - b);
};
for(auto itp = p.begin(), itq = q.begin(); itp != p.end() && itq != q.end(); ++ itp, ++ itq)
fn(*itp, *itq);
return sqrt(c);
}
double distanceCalc(vector<double>& p, Vector<T, N>& q)
{
double c = 0;
auto fn = [&](double& a, T& b) {
c += (a - b) * (a - b);
};
auto itp = p.begin();
auto itq = q.begin();
for(; itp != p.end() && itq != q.end(); ++ itp, ++ itq)
fn(*itp, *itq);
return sqrt(c);
}
double distanceCalc(Vector<T, N>& p, vector<double>& q)
{
double c = 0;
auto fn = [&](T& a, double& b) {
c += (a - b) * (a - b);
};
auto itp = p.begin();
auto itq = q.begin();
for(; itp != p.end() && itq != q.end(); ++ itp, ++ itq)
fn(*itp, *itq);
return sqrt(c);
}
double distanceCalc(Vector<T, N>& p, Vector<T, N>& q)
{
double c = 0;
auto fn = [&](T& a, T& b) {
c += (a - b) * (a - b);
};
for(auto itp = p.begin(), itq = q.begin(); itp != p.end() && itq != q.end(); ++ itp, ++ itq)
fn(*itp, *itq);
return sqrt(c);
}
vector<double> centerCalc(vector< Vector<T, N>* >& elems, int count)
{
vector<double> center = *elems[0];
auto it = elems.begin();
for(it ++; it != elems.end(); ++ it)
{
transform((*it)->begin(), (*it)->end(), center.begin(), center.begin(), plus<double>());
}
transform(center.begin(), center.end(), center.begin(), [=](double& c) { return c /= (double)count; });
return std::move(center);
}
vector<double> centerCalc(vector< vector<double>* >& elems, int count)
{
vector<double> center = *elems[0];
auto it = elems.begin();
for(it ++; it != elems.end(); ++ it)
{
transform((*it)->begin(), (*it)->end(), center.begin(), center.begin(), plus<double>());
}
transform(center.begin(), center.end(), center.begin(), [=](double& c) { return c /= (double)count; });
return std::move(center);
}
void printElem(vector<double>& v)
{
for_each(v.begin(), v.end(), [](double& c){
cout << "\t" << c;
});
cout << endl;
}
void printElem(Vector<T, N>& v)
{
for_each(v.begin(), v.end(), [](T& c){
cout << "\t" << c;
});
cout << endl;
}
};
Note it's done on Visual Studio 2012, older versions is no guarantee to work.
ClusteringUtils.h
#pragma once
#include <vector>
#include <cmath>
#include <limits>
#include <cstddef>
#include <climits>
#include <algorithm>
#include <functional>
#include <iostream>
using namespace std;
// Wrapped class for array, never manages any resource
// The reason for devising such a wrapper because for handling
// the throughput of data feed
template<typename T, unsigned int N>
class Vector
{
public:
Vector(T* v) : m_v(v) {}
T* begin()
{
return m_v;
}
T* end()
{
return m_v + N;
}
T operator[](unsigned int index) const
{
return m_v[index];
}
T& operator[](unsigned int index)
{
return m_v[index];
}
operator vector<double>()
{
vector<double> v(begin(), end());
return std::move(v);
}
private:
T* m_v;
};
// Data structure for cluster
// Note the elements of center will be of data type double
template<typename T, unsigned int N>
class tagClusInfo
{
public:
void reset()
{
clus.clear();
}
vector< Vector<T, N>* > clus;
vector<double> centre;
};
// Abstract interface for clustering algorithms
template<typename T, unsigned int N>
class CClustering
{
public:
virtual bool Clustering(vector< Vector<T, N> >& inputs) { return false; }
virtual bool PrintCluster(int index) { return false; }
protected:
double distanceCalc(vector<double>& p, vector<double>& q)
{
double c = 0;
auto fn = [&](double& a, double& b) {
c += (a - b) * (a - b);
};
for(auto itp = p.begin(), itq = q.begin(); itp != p.end() && itq != q.end(); ++ itp, ++ itq)
fn(*itp, *itq);
return sqrt(c);
}
double distanceCalc(vector<double>& p, Vector<T, N>& q)
{
double c = 0;
auto fn = [&](double& a, T& b) {
c += (a - b) * (a - b);
};
auto itp = p.begin();
auto itq = q.begin();
for(; itp != p.end() && itq != q.end(); ++ itp, ++ itq)
fn(*itp, *itq);
return sqrt(c);
}
double distanceCalc(Vector<T, N>& p, vector<double>& q)
{
double c = 0;
auto fn = [&](T& a, double& b) {
c += (a - b) * (a - b);
};
auto itp = p.begin();
auto itq = q.begin();
for(; itp != p.end() && itq != q.end(); ++ itp, ++ itq)
fn(*itp, *itq);
return sqrt(c);
}
double distanceCalc(Vector<T, N>& p, Vector<T, N>& q)
{
double c = 0;
auto fn = [&](T& a, T& b) {
c += (a - b) * (a - b);
};
for(auto itp = p.begin(), itq = q.begin(); itp != p.end() && itq != q.end(); ++ itp, ++ itq)
fn(*itp, *itq);
return sqrt(c);
}
vector<double> centerCalc(vector< Vector<T, N>* >& elems, int count)
{
vector<double> center = *elems[0];
auto it = elems.begin();
for(it ++; it != elems.end(); ++ it)
{
transform((*it)->begin(), (*it)->end(), center.begin(), center.begin(), plus<double>());
}
transform(center.begin(), center.end(), center.begin(), [=](double& c) { return c /= (double)count; });
return std::move(center);
}
vector<double> centerCalc(vector< vector<double>* >& elems, int count)
{
vector<double> center = *elems[0];
auto it = elems.begin();
for(it ++; it != elems.end(); ++ it)
{
transform((*it)->begin(), (*it)->end(), center.begin(), center.begin(), plus<double>());
}
transform(center.begin(), center.end(), center.begin(), [=](double& c) { return c /= (double)count; });
return std::move(center);
}
void printElem(vector<double>& v)
{
for_each(v.begin(), v.end(), [](double& c){
cout << "\t" << c;
});
cout << endl;
}
void printElem(Vector<T, N>& v)
{
for_each(v.begin(), v.end(), [](T& c){
cout << "\t" << c;
});
cout << endl;
}
};
KMeanClustering.h
#pragma once
#include <time.h>
#include "ClusteringUtils.h"
#define KMEANDEBUG 0
template<typename T, unsigned int N>
class tagKMeanClusInfo : public tagClusInfo<T, N>
{
public:
tagKMeanClusInfo() : varClus(0) {}
private:
double varClus;
};
template<typename T, unsigned int N>
class CKMeanClustering : public CClustering<T, N>
{
public:
CKMeanClustering(unsigned numOfClusters = 2, double thres = 1.0) : m_numOfClusters(numOfClusters), m_thres(thres)
{
for (unsigned i = 0; i < m_numOfClusters; i ++)
m_clusts.push_back(tagKMeanClusInfo<T, N>());
}
~CKMeanClustering(void) {}
virtual bool Clustering(vector< Vector<T, N> >& inputs)
{
if (inputs.size() < m_numOfClusters)
return false;
// Initialize cluster centers
// Random assign cluster centers
vector<int> indices;
srand(clock());
do
{
int index = rand() % inputs.size();
if (find(indices.begin(), indices.end(), index) == indices.end())
indices.push_back(index);
} while (indices.size() != m_numOfClusters);
#if KMEANDEBUG
cout << "The indices chosen to be centers are" << endl;
for_each(indices.begin(), indices.end(), [=](int& i){
cout << i << '\t';
});
cout << endl;
#endif
// Here use capture method reference to fool the compiler
// Otherwise, it's too wise to try to move
for_each(indices.begin(), indices.end(), [&](int& i){
m_centers.push_back(*(inputs.begin() + i));
});
__repeat:
#if KMEANDEBUG
cout << "The centers are as follows in the current round" << endl;
for_each(m_centers.begin(), m_centers.end(), [=](vector<double>& v){
printElem(v);
});
cout << endl;
#endif
for (unsigned i = 0; i < m_numOfClusters; i ++)
m_clusts[i].reset();
for (unsigned int i = 0; i < inputs.size(); i ++)
{
double distance = numeric_limits<double>::max();
int md_index = -1;
unsigned int k = 0;
for (; k < m_centers.size(); k ++)
{
// find the minimum distance to a center
double t = distanceCalc(inputs[i], m_centers[k]);
if (t < distance)
{
distance = t;
md_index = k;
}
}
m_clusts[md_index].clus.push_back(&inputs[i]);
}
// Compute new average as new center for each cluster
for (unsigned int k = 0; k < m_centers.size(); k ++)
{
m_clusts[k].centre = centerCalc(m_clusts[k].clus, m_clusts[k].clus.size());
}
// Updating to other parameters are neglected
// If center has changed then iterate, else terminate
for (unsigned int k = 0; k < m_centers.size(); k ++)
{
if (distanceCalc(m_clusts[k].centre, m_centers[k]) > m_thres)
{
for (unsigned l = 0; l < m_centers.size(); l ++)
m_centers[l] = m_clusts[l].centre;
goto __repeat;
}
}
return true;
}
virtual bool PrintCluster(int index)
{
if (index < 0 || index >= (int)m_numOfClusters)
return false;
cout << "Cluster " << index << ":" << endl;
cout << "center:" << endl;
printElem(m_clusts[index].centre);
cout << "members:" << endl;
for (auto it = m_clusts[index].clus.begin(); it != m_clusts[index].clus.end(); ++ it)
printElem(**it);
cout << endl;
return true;
}
private:
double m_thres;
unsigned int m_numOfClusters;
vector< vector<double> > m_centers;
vector< tagKMeanClusInfo<T, N> > m_clusts;
double m_varTotal;
};
extern void KMeanClusteringTest();
KMeanClustering.cpp
#include "stdafx.h"
#include "KMeanClustering.h"
void KMeanClusteringTest()
{
CKMeanClustering<int, 2> kMeanClustering;
int data[][2] = {
{2, 1},
{3, 1},
{2, 4},
{3, 2},
{2, 3},
{1, 3}
};
vector< Vector<int, 2> > inputs;
Vector<int, 2> v1(data[0]),
v2(data[1]),
v3(data[2]),
v4(data[3]),
v5(data[4]),
v6(data[5]);
inputs.push_back(v1);
inputs.push_back(v2);
inputs.push_back(v3);
inputs.push_back(v4);
inputs.push_back(v5);
inputs.push_back(v6);
kMeanClustering.Clustering(inputs);
kMeanClustering.PrintCluster(0);
kMeanClustering.PrintCluster(1);
}
Main.cpp
#include "stdafx.h"
#include "KMeanClustering.h"
int _tmain(int argc, _TCHAR* argv[])
{
KMeanClusteringTest();
return 0;
}
Happy coding!