Monday, March 21, 2016

How to draw a Julia set via CUDA

I am beginning to investigate CUDA to take advantage of the massively parallel paradigm. As expected, not everything goes smoothly. The book "CUDA by Example" is great, but as I went through some of its chapters it never pointed out explicitly (the maxThreadsPerBlock field of the device properties is easy for a newbie to overlook) that there is a hard limit on how many threads can reside simultaneously in one block. Searching the Internet led to an answer that the count cannot exceed 768, and I would bet that answer is correct. The bad news is that when the rule is violated, the most annoying thing happens: nothing at all, with no error reported. So I am logging it here as a reminder.
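
For what it's worth, here is a minimal sketch (not part of the original project) of how one can query the per-block limit and surface a bad launch configuration instead of letting the kernel fail silently; only standard CUDA runtime API calls are used:

#include <cstdio>
#include <cuda_runtime.h>

void reportDeviceLimits(int dev)
{
    // The hard limits live in the device properties.
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, dev);
    printf("maxThreadsPerBlock = %d\n", prop.maxThreadsPerBlock);
    printf("maxThreadsDim      = %d x %d x %d\n",
           prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
}

// After any kernel launch, e.g. someKernel<<<grid, block>>>(...):
//     cudaError_t err = cudaGetLastError();
//     if (err != cudaSuccess)
//         printf("launch failed: %s\n", cudaGetErrorString(err));
// An oversized block is reported as cudaErrorInvalidConfiguration here,
// instead of the kernel silently doing nothing.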

The source code is pasted below for reference:

Utils.h

#pragma once
#include <Windows.h>

class CUtils
{
public:
    CUtils(void);
    ~CUtils(void);
    BOOL PrepBitmap(HDC hDC, int nWidth = 640, int nHeight = 480);
    BOOL DrawBitmap();
    BOOL DispBitmap(HDC hDC);
    BOOL SaveBitmap(HWND hWnd);
private:
    // bool julia(int x, int y);
private:
    HDC m_memDC;        // off-screen DC holding the rendered bitmap
    HBITMAP m_memBM;

    int m_nWidth;
    int m_nHeight;

    bool m_juliaReady;  // true once DrawBitmap() has filled the bitmap
};
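
The window procedure is not part of this post, so the following is only a hypothetical sketch of how the class might be driven from a Win32 WM_PAINT handler (g_utils and hWnd are assumed names):

// Hypothetical usage, not from the original project.
static CUtils g_utils;

case WM_PAINT:
    {
        PAINTSTRUCT ps;
        HDC hdc = BeginPaint(hWnd, &ps);
        g_utils.DispBitmap(hdc);    // prepares and draws the Julia set on first call
        EndPaint(hWnd, &ps);
    }
    break;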


Utils.cu

#include "stdafx.h"
#include "Utils.h"

#include <stdlib.h>
#include <malloc.h>

#include <cuda_runtime.h>

template <typename T>
class cuComplex
{
public:
    __device__ cuComplex(T a, T b) : r(a), i(b) {}
    __device__ T magnitude2(void) { return r * r + i * i; }
    __device__ cuComplex operator*(const cuComplex& a)
    {
        return cuComplex(r * a.r - i * a.i, i * a.r + r * a.i);
    }
    __device__ cuComplex operator+(const cuComplex& a)
    {
        return cuComplex(r + a.r, i + a.i);
    }

private:
    T r;
    T i;
};

__global__ void julia(int width, int height, char* mask)
{
    // One single-thread block is launched per pixel, so the block index
    // directly gives the pixel coordinates.
    int x = blockIdx.x;
    int y = blockIdx.y;

    if (x < width && y < height)
    {
        const float scale = 1.5f;

        float jx = scale * (float)((width >> 1) - x) / (width >> 1);
        float jy = scale * (float)((height >> 1) - y) / (height >> 1);

        cuComplex<float> c(-0.8f, 0.156f);
        cuComplex<float> a(jx, jy);

        for (int i = 0; i < 200; i++)
        {
            a = a * a + c;
            if (a.magnitude2() > 1000)
            {
                mask[y * width + x] = 0;
                return;
            }
        }

        mask[y * width + x] = 1;
    }
}

CUtils::CUtils(void) : m_memDC(NULL), m_memBM(NULL), m_juliaReady(false)
{
}

BOOL CUtils::PrepBitmap(HDC hDC, int nWidth, int nHeight)
{
    m_nWidth = nWidth;
    m_nHeight = nHeight;

    // Off-screen DC and bitmap compatible with the target window DC.
    m_memDC = CreateCompatibleDC(hDC);
    m_memBM = CreateCompatibleBitmap(hDC, nWidth, nHeight);
    SelectObject(m_memDC, m_memBM);

    return TRUE;
}

BOOL CUtils::DrawBitmap()
{
    int size = (m_nWidth * m_nHeight) * sizeof(char);
    char* maskHost = (char*)malloc(size);

    char* maskDevice;

    cudaError_t cuErr;

    // Device index 1 was used on the original machine; adjust as needed.
    int dev = 1;
    cudaSetDevice(dev);

    cuErr = cudaMalloc(&maskDevice, size);

    if (cuErr != cudaSuccess)
    {
        MessageBox(NULL, _T("cudaMalloc() failed"), _T("Error"), MB_OK);
        free(maskHost);
        return FALSE;
    }

    // One single-thread block per pixel.
    dim3 grid(m_nWidth, m_nHeight);
    julia<<<grid, 1>>>(m_nWidth, m_nHeight, maskDevice);

    // A cudaGetLastError() check here would report an invalid launch
    // configuration instead of failing silently.
    cudaDeviceSynchronize();
    cudaMemcpy(maskHost, maskDevice, size, cudaMemcpyDeviceToHost);

    for (int x = 0; x < m_nWidth; x++)
    {
        for (int y = 0; y < m_nHeight; y++)
            SetPixel(m_memDC, x, y, maskHost[y * m_nWidth + x] ? RGB(255, 255, 255) : RGB(0, 0, 0));
    }

    free(maskHost);
    cudaFree(maskDevice);

    m_juliaReady = true;

    return TRUE;
}

BOOL CUtils::DispBitmap(HDC hDC)
{
    if (!m_juliaReady)
    {
        if (!m_memBM)
            PrepBitmap(hDC);

        DrawBitmap();
    }

    BitBlt(hDC, 0, 0, m_nWidth, m_nHeight, m_memDC, 0, 0, SRCCOPY);

    return TRUE;
}

#if 0

// Host-side version of the per-pixel computation, compiled out.
bool CUtils::julia(int x, int y)
{
    const double scale = 1.5;

    double jx = scale * (double)((m_nWidth >> 1) - x) / (m_nWidth >> 1);
    double jy = scale * (double)((m_nHeight >> 1) - y) / (m_nHeight >> 1);

    cuComplex<double> c(-0.8, 0.156);
    cuComplex<double> a(jx, jy);

    for (int i = 0; i < 200; i++)
    {
        a = a * a + c;
        if (a.magnitude2() > 1000)
            return 0;
    }

    return 1;
}

#endif

CUtils::~CUtils(void)
{
    if (m_memBM)
        DeleteObject(m_memBM);
    if (m_memDC)
        DeleteDC(m_memDC);
}

// Refer to https://msdn.microsoft.com/en-us/library/windows/desktop/dd183402(v=vs.85).aspx
BOOL CUtils::SaveBitmap(HWND hWnd)
{
    BITMAP bmpJulia;
    GetObject(m_memBM,sizeof(BITMAP),&bmpJulia);
   
    BITMAPFILEHEADER   bmfHeader;  
    BITMAPINFOHEADER   bi;
   
    bi.biSize = sizeof(BITMAPINFOHEADER);  
    bi.biWidth = bmpJulia.bmWidth;  
    bi.biHeight = bmpJulia.bmHeight;
    bi.biPlanes = 1;  
    bi.biBitCount = 32;  
    bi.biCompression = BI_RGB;  
    bi.biSizeImage = 0;
    bi.biXPelsPerMeter = 0;  
    bi.biYPelsPerMeter = 0;  
    bi.biClrUsed = 0;  
    bi.biClrImportant = 0;

    DWORD dwBmpSize = ((bmpJulia.bmWidth * bi.biBitCount + 31) / 32) * 4 * bmpJulia.bmHeight;

    // Starting with 32-bit Windows, GlobalAlloc and LocalAlloc are implemented as wrapper functions that
    // call HeapAlloc using a handle to the process's default heap. Therefore, GlobalAlloc and LocalAlloc
    // have greater overhead than HeapAlloc.
    HANDLE hDIB = GlobalAlloc(GHND,dwBmpSize);
    char *lpbitmap = (char *)GlobalLock(hDIB);  

    // Gets the "bits" from the bitmap and copies them into a buffer
    // which is pointed to by lpbitmap.
    HDC hdcWindow = GetDC(hWnd);
    GetDIBits(hdcWindow, m_memBM, 0,
        (UINT)bmpJulia.bmHeight,
        lpbitmap,
        (BITMAPINFO *)&bi, DIB_RGB_COLORS);
    ReleaseDC(hWnd, hdcWindow);

    // A file is created, this is where we will save the screen capture.
    HANDLE hFile = CreateFile(L"Julia.bmp",
        GENERIC_WRITE,
        0,
        NULL,
        CREATE_ALWAYS,
        FILE_ATTRIBUTE_NORMAL, NULL);
 
    // Add the size of the headers to the size of the bitmap to get the total file size
    DWORD dwSizeofDIB = dwBmpSize + sizeof(BITMAPFILEHEADER) + sizeof(BITMAPINFOHEADER);

    //Offset to where the actual bitmap bits start.
    bmfHeader.bfOffBits = (DWORD)sizeof(BITMAPFILEHEADER) + (DWORD)sizeof(BITMAPINFOHEADER);
 
    //Size of the file
    bmfHeader.bfSize = dwSizeofDIB;
 
    //bfType must always be BM for Bitmaps
    bmfHeader.bfType = 0x4D42; //BM

    DWORD dwBytesWritten = 0;
    WriteFile(hFile, (LPSTR)&bmfHeader, sizeof(BITMAPFILEHEADER), &dwBytesWritten, NULL);
    WriteFile(hFile, (LPSTR)&bi, sizeof(BITMAPINFOHEADER), &dwBytesWritten, NULL);
    WriteFile(hFile, (LPSTR)lpbitmap, dwBmpSize, &dwBytesWritten, NULL);
 
    //Unlock and Free the DIB from the heap
    GlobalUnlock(hDIB);  
    GlobalFree(hDIB);

    //Close the handle for the file that was created
    CloseHandle(hFile);

    return TRUE;
}
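
As an aside, the listing above launches one single-thread block per pixel, which sidesteps the per-block thread limit entirely. A common alternative is to use 2D thread blocks and derive the pixel from both blockIdx and threadIdx; the following is only a sketch (kernel renamed julia2d, block size 16x16 chosen to stay well below maxThreadsPerBlock, and it reuses the cuComplex template from Utils.cu):

__global__ void julia2d(int width, int height, char* mask)
{
    // Pixel coordinates now come from block *and* thread indices.
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height)
        return;

    const float scale = 1.5f;
    float jx = scale * (float)((width  >> 1) - x) / (width  >> 1);
    float jy = scale * (float)((height >> 1) - y) / (height >> 1);

    cuComplex<float> c(-0.8f, 0.156f);
    cuComplex<float> a(jx, jy);

    for (int i = 0; i < 200; i++)
    {
        a = a * a + c;
        if (a.magnitude2() > 1000)
        {
            mask[y * width + x] = 0;
            return;
        }
    }
    mask[y * width + x] = 1;
}

// Launch: 16x16 = 256 threads per block, safely under the per-block limit.
// dim3 block(16, 16);
// dim3 grid((m_nWidth + block.x - 1) / block.x, (m_nHeight + block.y - 1) / block.y);
// julia2d<<<grid, block>>>(m_nWidth, m_nHeight, maskDevice);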

The result:

Tuesday, March 8, 2016

How to devise an auto-encoder

I think the autoencoder is an important type of MLP: on the one hand it is, to some extent, an extension of the traditional MLP-based classification approach; on the other hand it is the key to understanding some ingenious training ideas that weigh heavily in applications of Deep Learning.

In this post I will walk through building a very simple autoencoder from an implementation perspective. I have to state first that I do not intend to break Ng's rule about not sharing solution code, especially since most of the content of this post is borrowed from his famous website http://ufldl.stanford.edu/wiki/index.php/UFLDL_Tutorial. The problem is that, even when I honestly write the code myself, some time later I have no idea what my original thinking was. So I write it down here mainly for my own reference. The encouragement is to become familiar with it, not to copy it directly.
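
For reference, the objective implemented below is the sparse autoencoder cost from the UFLDL notes, in their notation (m training examples, \rho the target sparsity, \rho_hat_j the average activation of hidden unit j):

    J_sparse(W,b) = (1/m) \sum_{i=1}^{m} (1/2) ||h_{W,b}(x^{(i)}) - x^{(i)}||^2
                    + (\lambda/2) \sum_{l} \sum_{i} \sum_{j} (W^{(l)}_{ji})^2
                    + \beta \sum_{j=1}^{hiddenSize} KL(\rho || \rho_hat_j)

    KL(\rho || \rho_hat_j) = \rho log(\rho / \rho_hat_j)
                             + (1 - \rho) log((1 - \rho) / (1 - \rho_hat_j))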

The implementation is in MATLAB; specifically, the work needs to be done in sparseAutoencoderCost.m. I believe I have implemented the autoencoder correctly, but there is no guarantee:

function [cost,grad] = sparseAutoencoderCost(theta, visibleSize, hiddenSize, ...
                                             lambda, sparsityParam, beta, data)

% visibleSize: the number of input units (probably 64)
% hiddenSize: the number of hidden units (probably 25)
% lambda: weight decay parameter
% sparsityParam: The desired average activation for the hidden units (denoted in the lecture
%                           notes by the greek alphabet rho, which looks like a lower-case "p").
% beta: weight of sparsity penalty term
% data: Our 64x10000 matrix containing the training data.  So, data(:,i) is the i-th training example.
 
% The input theta is a vector (because minFunc expects the parameters to be a vector).
% We first convert theta to the (W1, W2, b1, b2) matrix/vector format, so that this
% follows the notation convention of the lecture notes.

W1 = reshape(theta(1:hiddenSize*visibleSize), hiddenSize, visibleSize);
W2 = reshape(theta(hiddenSize*visibleSize+1:2*hiddenSize*visibleSize), visibleSize, hiddenSize);
b1 = theta(2*hiddenSize*visibleSize+1:2*hiddenSize*visibleSize+hiddenSize);
b2 = theta(2*hiddenSize*visibleSize+hiddenSize+1:end);

% Cost and gradient variables (your code needs to compute these values).
% Here, we initialize them to zeros.
cost = 0;
W1grad = zeros(size(W1));
W2grad = zeros(size(W2));
b1grad = zeros(size(b1));
b2grad = zeros(size(b2));

%% ---------- YOUR CODE HERE --------------------------------------
%  Instructions: Compute the cost/optimization objective J_sparse(W,b) for the Sparse Autoencoder,
%                and the corresponding gradients W1grad, W2grad, b1grad, b2grad.
%
% W1grad, W2grad, b1grad and b2grad should be computed using backpropagation.
% Note that W1grad has the same dimensions as W1, b1grad has the same dimensions
% as b1, etc.  Your code should set W1grad to be the partial derivative of J_sparse(W,b) with
% respect to W1.  I.e., W1grad(i,j) should be the partial derivative of J_sparse(W,b)
% with respect to the input parameter W1(i,j).  Thus, W1grad should be equal to the term
% [(1/m) \Delta W^{(1)} + \lambda W^{(1)}] in the last block of pseudo-code in Section 2.2
% of the lecture notes (and similarly for W2grad, b1grad, b2grad).
%
% Stated differently, if we were using batch gradient descent to optimize the parameters,
% the gradient descent update to W1 would be W1 := W1 - alpha * W1grad, and similarly for W2, b1, b2.
%

% Since this is a special case, explicit iteration over training examples is
% mostly avoided here; first we calculate the activations of the hidden layer.

% W1 has dimensions hiddenSize x visibleSize, so W1 * data gives the hidden
% pre-activations; to account for the bias term we augment W1 with b1 and the
% data with a row of ones.

% Keep in mind the intent is to use vectorized computation as thoroughly as possible.

ss = size(data, 2);

% Catering for the bias term, so we manipulate the weights and inputs 
% for the input layer
W1_e = [b1 W1];
data_e = [ones(1, ss); data];

% Calculation for the hidden layer
u1 = W1_e * data_e;
x2 = sigmoid(u1);


% The same augmentation applies to the hidden layer activations
W2_e = [b2 W2];
x2_e = [ones(1, ss); x2];


% Calculation for the output layer
u2 = W2_e * x2_e;
output = sigmoid(u2);

% Up to now, the forward propagation is completed

% Now it's time to build the cost function

% First the average reconstruction cost, without weight decay, can be calculated
diff_output = output - data;

cost = mean(sum(diff_output .^ 2)) / 2;

% Take the weight decay into consideration
cost = cost + lambda * (sum(sum(W1 .^ 2)) + sum(sum(W2 .^ 2))) / 2;

% The average activity of the hidden layer can be computed as follows
rho = mean(x2, 2);

sparsity_cost = sparsityParam * log(sparsityParam ./ rho) + ...
    (1 - sparsityParam) * log((1 - sparsityParam) ./ (1 - rho));

cost = cost + beta * sum(sparsity_cost);

% Now we compute the gradients via back-propagation.
% The derivation is element-wise, but the computation is carried out in a
% vectorized way.
delta_output = (output - data) .* output .* (1 - output);

b2grad = mean(delta_output, 2);

for i = 1 : ss;
    W2grad = W2grad + delta_output(:, i) * x2(:, i)';
end

W2grad = W2grad / ss + lambda * W2;

delta_sparsity = -1 * sparsityParam ./ rho + ...
    (1 - sparsityParam) ./ (1 - rho);

delta_hidden = W2' * delta_output + repmat(beta * delta_sparsity, 1, ss);

delta_hidden = delta_hidden .* x2 .* (1 - x2);

b1grad = mean (delta_hidden, 2);

for i = 1 : ss;
    W1grad = W1grad + delta_hidden(:, i) * data(:, i)';
end

W1grad = W1grad / ss + lambda * W1;


%-------------------------------------------------------------------
% After computing the cost and gradient, we will convert the gradients back
% to a vector format (suitable for minFunc).  Specifically, we will unroll
% your gradient matrices into a vector.

grad = [W1grad(:) ; W2grad(:) ; b1grad(:) ; b2grad(:)];

end

%-------------------------------------------------------------------
% Here's an implementation of the sigmoid function, which you may find useful
% in your computation of the costs and the gradients.  This inputs a (row or
% column) vector (say (z1, z2, z3)) and returns (f(z1), f(z2), f(z3)). 

function sigm = sigmoid(x)
  
    sigm = 1 ./ (1 + exp(-x));
end
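
For completeness, these are the error terms and gradients the code above computes, written in the UFLDL notation with MATLAB-style .* and ./ for element-wise operations (a^{(2)} = x2, a^{(3)} = output, m = ss):

    \delta^{(3)} = (a^{(3)} - x) .* a^{(3)} .* (1 - a^{(3)})

    \delta^{(2)} = ( (W^{(2)})' * \delta^{(3)}
                     + \beta * ( -\rho ./ \rho_hat + (1 - \rho) ./ (1 - \rho_hat) ) )
                   .* a^{(2)} .* (1 - a^{(2)})

    W2grad = (1/m) * \delta^{(3)} * (a^{(2)})' + \lambda * W2
    W1grad = (1/m) * \delta^{(2)} * x'         + \lambda * W1
    b2grad = (1/m) * \sum_i \delta^{(3),(i)}
    b1grad = (1/m) * \sum_i \delta^{(2),(i)}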


The most difficult part is probably the implementation of the autoencoder itself; once we have one, some interesting results can be observed. The first figure below shows some of the samples to be learnt from, and the second shows the learnt weights, with each hidden unit's weight vector displayed as a square image patch:
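
For the record, the weight visualization comes from the "Visualizing a Trained Autoencoder" part of the UFLDL notes: the input that maximally activates hidden unit i, subject to ||x||_2 <= 1, is the unit's normalized weight vector, and each such vector is reshaped into a square patch:

    x_j = W^{(1)}_{ij} / sqrt( \sum_{j=1}^{visibleSize} (W^{(1)}_{ij})^2 )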



Happy exploring, happy coding.