Deep Learning Pearls: STL Version of k-Mean Clustering Algorithm

Today got some time to implement the k-Mean clustering algorithm in C++. It's far from perfection, but probably it's workable. Post here for further improvement.

Note it's done on Visual Studio 2012, older versions is no guarantee to work.

ClusteringUtils.h

#pragma once

#include <vector>
#include <cmath>
#include <limits>
#include <cstddef>
#include <climits>
#include <algorithm>
#include <functional>
#include <iostream>

using namespace std;

// Wrapped class for array, never manages any resource
// The reason for devising such a wrapper because for handling
// the throughput of data feed
template<typename T, unsigned int N>
class Vector
{
public:
Vector(T* v) : m_v(v) {}
T* begin()
{
return m_v;
}
T* end()
{
return m_v + N;
}

T operator[](unsigned int index) const
{
return m_v[index];
}

T& operator[](unsigned int index)
{
return m_v[index];
}

operator vector<double>()
{
vector<double> v(begin(), end());
return std::move(v);
}

private:
T* m_v;
};

// Data structure for cluster
// Note the elements of center will be of data type double
template<typename T, unsigned int N>
class tagClusInfo
{
public:
void reset()
{
clus.clear();
}
vector< Vector<T, N>* > clus;
vector<double> centre;
};

// Abstract interface for clustering algorithms
template<typename T, unsigned int N>
class CClustering
{
public:
virtual bool Clustering(vector< Vector<T, N> >& inputs) { return false; }
virtual bool PrintCluster(int index) { return false; }

protected:

double distanceCalc(vector<double>& p, vector<double>& q)
{
double c = 0;
auto fn = [&](double& a, double& b) {
c += (a - b) * (a - b);
};

for(auto itp = p.begin(), itq = q.begin(); itp != p.end() && itq != q.end(); ++ itp, ++ itq)
fn(*itp, *itq);

return sqrt(c);
}

double distanceCalc(vector<double>& p, Vector<T, N>& q)
{
double c = 0;
auto fn = [&](double& a, T& b) {
c += (a - b) * (a - b);
};

auto itp = p.begin();
auto itq = q.begin();
for(; itp != p.end() && itq != q.end(); ++ itp, ++ itq)
fn(*itp, *itq);

return sqrt(c);
}

double distanceCalc(Vector<T, N>& p, vector<double>& q)
{
double c = 0;
auto fn = [&](T& a, double& b) {
c += (a - b) * (a - b);
};

auto itp = p.begin();
auto itq = q.begin();
for(; itp != p.end() && itq != q.end(); ++ itp, ++ itq)
fn(*itp, *itq);

return sqrt(c);
}

double distanceCalc(Vector<T, N>& p, Vector<T, N>& q)
{
double c = 0;
auto fn = [&](T& a, T& b) {
c += (a - b) * (a - b);
};

for(auto itp = p.begin(), itq = q.begin(); itp != p.end() && itq != q.end(); ++ itp, ++ itq)
fn(*itp, *itq);

return sqrt(c);
}

vector<double> centerCalc(vector< Vector<T, N>* >& elems, int count)
{
vector<double> center = *elems[0];
auto it = elems.begin();

for(it ++; it != elems.end(); ++ it)
{
transform((*it)->begin(), (*it)->end(), center.begin(), center.begin(), plus<double>());
}

transform(center.begin(), center.end(), center.begin(), [=](double& c) { return c /= (double)count; });

return std::move(center);
}

vector<double> centerCalc(vector< vector<double>* >& elems, int count)
{
vector<double> center = *elems[0];
auto it = elems.begin();

for(it ++; it != elems.end(); ++ it)
{
transform((*it)->begin(), (*it)->end(), center.begin(), center.begin(), plus<double>());
}

transform(center.begin(), center.end(), center.begin(), [=](double& c) { return c /= (double)count; });

return std::move(center);
}

void printElem(vector<double>& v)
{
for_each(v.begin(), v.end(), [](double& c){
cout << "\t" << c;
});

cout << endl;
}

void printElem(Vector<T, N>& v)
{
for_each(v.begin(), v.end(), [](T& c){
cout << "\t" << c;
});

cout << endl;
}
};

KMeanClustering.h

#pragma once

#include <time.h>

#include "ClusteringUtils.h"

#define KMEANDEBUG 0

template<typename T, unsigned int N>

class tagKMeanClusInfo : public tagClusInfo<T, N>

{

public:

tagKMeanClusInfo() : varClus(0) {}

private:

double varClus;

};

template<typename T, unsigned int N>

class CKMeanClustering : public CClustering<T, N>

{

public:

CKMeanClustering(unsigned numOfClusters = 2, double thres = 1.0) : m_numOfClusters(numOfClusters), m_thres(thres)

{

for (unsigned i = 0; i < m_numOfClusters; i ++)

m_clusts.push_back(tagKMeanClusInfo<T, N>());

}

~CKMeanClustering(void) {}

virtual bool Clustering(vector< Vector<T, N> >& inputs)

{

if (inputs.size() < m_numOfClusters)

return false;

// Initialize cluster centers

// Random assign cluster centers

vector<int> indices;

srand(clock());

{

int index = rand() % inputs.size();

if (find(indices.begin(), indices.end(), index) == indices.end())

indices.push_back(index);

} while (indices.size() != m_numOfClusters);

#if KMEANDEBUG

cout << "The indices chosen to be centers are" << endl;

for_each(indices.begin(), indices.end(), [=](int& i){

cout << i << '\t';

});

cout << endl;

#endif

// Here use capture method reference to fool the compiler

// Otherwise, it's too wise to try to move

for_each(indices.begin(), indices.end(), [&](int& i){

m_centers.push_back(*(inputs.begin() + i));

});

__repeat:

#if KMEANDEBUG

cout << "The centers are as follows in the current round" << endl;

for_each(m_centers.begin(), m_centers.end(), [=](vector<double>& v){

printElem(v);

});

cout << endl;

#endif

for (unsigned i = 0; i < m_numOfClusters; i ++)

m_clusts[i].reset();

for (unsigned int i = 0; i < inputs.size(); i ++)

{

double distance = numeric_limits<double>::max();

int md_index = -1;

unsigned int k = 0;

for (; k < m_centers.size(); k ++)

{

// find the minimum distance to a center

double t = distanceCalc(inputs[i], m_centers[k]);

if (t < distance)

{

distance = t;

md_index = k;

}

m_clusts[md_index].clus.push_back(&inputs[i]);

}

// Compute new average as new center for each cluster

for (unsigned int k = 0; k < m_centers.size(); k ++)

{

m_clusts[k].centre = centerCalc(m_clusts[k].clus, m_clusts[k].clus.size());

}

// Updating to other parameters are neglected

// If center has changed then iterate, else terminate

for (unsigned int k = 0; k < m_centers.size(); k ++)

{

if (distanceCalc(m_clusts[k].centre, m_centers[k]) > m_thres)

{

for (unsigned l = 0; l < m_centers.size(); l ++)

m_centers[l] = m_clusts[l].centre;

goto __repeat;

}

return true;

}

virtual bool PrintCluster(int index)

{

if (index < 0 || index >= (int)m_numOfClusters)

return false;

cout << "Cluster " << index << ":" << endl;

cout << "center:" << endl;

printElem(m_clusts[index].centre);

cout << "members:" << endl;

for (auto it = m_clusts[index].clus.begin(); it != m_clusts[index].clus.end(); ++ it)

printElem(**it);

cout << endl;

return true;

}

private:

double m_thres;

unsigned int m_numOfClusters;

vector< vector<double> > m_centers;

vector< tagKMeanClusInfo<T, N> > m_clusts;

double m_varTotal;

};

extern void KMeanClusteringTest();

KMeanClustering.cpp

#include "stdafx.h"

#include "KMeanClustering.h"

void KMeanClusteringTest()

{

CKMeanClustering<int, 2> kMeanClustering;

int data[][2] = {

{2, 1},

{3, 1},

{2, 4},

{3, 2},

{2, 3},

{1, 3}

};

vector< Vector<int, 2> > inputs;

Vector<int, 2> v1(data[0]),

v2(data[1]),

v3(data[2]),

v4(data[3]),

v5(data[4]),

v6(data[5]);

inputs.push_back(v1);

inputs.push_back(v2);

inputs.push_back(v3);

inputs.push_back(v4);

inputs.push_back(v5);

inputs.push_back(v6);

kMeanClustering.Clustering(inputs);

kMeanClustering.PrintCluster(0);

kMeanClustering.PrintCluster(1);

}

Main.cpp

#include "stdafx.h"

#include "KMeanClustering.h"

int _tmain(int argc, _TCHAR* argv[])

{

KMeanClusteringTest();

return 0;

}

Happy coding!

Deep Learning Pearls

Friday, April 22, 2016

STL Version of k-Mean Clustering Algorithm

No comments:

Post a Comment