I am beginning investigate CUDA for leveraging the massive parallel paradigm. As expected, not everything goes on smoothly. When I went through some chapters from the book "CUDA by Example", it's great but it failed mentioned (or at least explicitly. The maxThreadsPerBlock field of the device property could easily be overlooked for a newbie) to me that at the beginning there's constrain on how many threads can resident simultaneously in one block. Surfing the Internet leads to an answer that the volume cannot exceed 768. I would bet the answer is correct. The bad news if the rule is violated, the most annoying thing is nothing happened. So just logged here for reminding.
The following pasted the source code for reference:
Utils.h
#pragma once
#include <Windows.h>
class CUtils
{
public:
CUtils(void);
~CUtils(void);
BOOL PrepBitmap(HDC hDC, int nWidth, int nHeight);
BOOL DrawBitmap();
BOOL DispBitmap(HDC hDC);
BOOL SaveBitmap(HWND hWnd);
private:
// bool julia(int x, int y);
private:
HDC m_memDC;
HBITMAP m_memBM;
int m_nWidth;
int m_nHeight;
bool m_juliaReady;
};
Utils.cu
#include "stdafx.h"
#include "Utils.h"
#include <stdlib.h>
#include <malloc.h>
#include <cuda_runtime.h>
template <typename T>
class cuComplex
{
public:
__device__ cuComplex(T a, T b) : r(a), i(b) {}
__device__ T magnitude2(void) { return r * r + i * i; }
__device__ cuComplex operator*(const cuComplex& a)
{
return cuComplex(r * a.r - i * a.i, i * a.r + r * a.i);
}
__device__ cuComplex operator+(const cuComplex& a)
{
return cuComplex(r + a.r, i + a.i);
}
private:
T r;
T i;
};
__global__ void julia(int width, int height, char* mask)
{
int x = blockIdx.x;
int y = blockIdx.y;
if (x < width && y < height)
{
const float scale = 1.5;
float jx = scale * (float)((width >> 1) - x) / (width >> 1);
float jy = scale * (float)((height >> 1) - y) / (height >> 1);
cuComplex<float> c(-0.8, 0.156);
cuComplex<float> a(jx, jy);
for (int i = 0; i < 200; i++)
{
a = a * a + c;
if (a.magnitude2() > 1000)
{
mask[y * width + x] = 0;
return ;
}
}
mask[y * width + x] = 1;
}
}
CUtils::CUtils(void) : m_memDC(NULL), m_memBM(NULL), m_juliaReady(false)
{
}
BOOL CUtils::PrepBitmap(HDC hDC, int nWidth = 640, int nHeight = 480)
{
m_nWidth = nWidth;
m_nHeight = nHeight;
m_memDC = CreateCompatibleDC(hDC);
m_memBM = CreateCompatibleBitmap(hDC, nWidth, nHeight);
SelectObject(m_memDC, m_memBM);
return TRUE;
}
BOOL CUtils::DrawBitmap()
{
int size = (m_nWidth * m_nHeight) * sizeof(char);
char* maskHost = (char*)malloc(size);
char* maskDevice;
cudaError_t cuErr;
int dev = 1;
cudaSetDevice(dev);
cuErr = cudaMalloc(&maskDevice, size);
if (cuErr != cudaSuccess)
{
MessageBox(NULL, _T("cudaMalloc() failed"), _T("Error"), MB_OK);
return FALSE;
}
dim3 grid(m_nWidth, m_nHeight);
julia<<<grid, 1>>>(m_nWidth, m_nHeight, maskDevice);
cudaDeviceSynchronize();
cudaMemcpy(maskHost, maskDevice, size, cudaMemcpyDeviceToHost);
for (int x = 0; x < m_nWidth; x ++)
{
for (int y = 0; y < m_nHeight; y ++)
SetPixel(m_memDC, x, y, maskHost[y * m_nWidth + x]? RGB(255, 255, 255): RGB(0, 0, 0));
}
free(maskHost);
cudaFree(maskDevice);
m_juliaReady = true;
return TRUE;
}
BOOL CUtils::DispBitmap(HDC hDC)
{
if (!m_juliaReady)
{
if (!m_memBM)
PrepBitmap(hDC);
DrawBitmap();
}
BitBlt(hDC, 0, 0, m_nWidth, m_nHeight, m_memDC, 0, 0,SRCCOPY);
return TRUE;
}
#if 0
bool CUtils::julia(int x, int y)
{
const double scale = 1.5;
double jx = scale * (double)((m_nWidth >> 1) - x) / (m_nWidth >> 1);
double jy = scale * (double)((m_nHeight >> 1) - y) / (m_nHeight >> 1);
cuComplex<double> c(-0.8, 0.156);
cuComplex<double> a(jx, jy);
int i = 0;
for (i=0; i<200; i++)
{
a = a * a + c;
if (a.magnitude2() > 1000)
return 0;
}
return 1;
}
#endif
CUtils::~CUtils(void)
{
if (m_memBM)
DeleteObject(m_memBM);
if (m_memDC)
DeleteObject(m_memDC);
}
// Refer to https://msdn.microsoft.com/en-us/library/windows/desktop/dd183402(v=vs.85).aspx
BOOL CUtils::SaveBitmap(HWND hWnd)
{
BITMAP bmpJulia;
GetObject(m_memBM,sizeof(BITMAP),&bmpJulia);
BITMAPFILEHEADER bmfHeader;
BITMAPINFOHEADER bi;
bi.biSize = sizeof(BITMAPINFOHEADER);
bi.biWidth = bmpJulia.bmWidth;
bi.biHeight = bmpJulia.bmHeight;
bi.biPlanes = 1;
bi.biBitCount = 32;
bi.biCompression = BI_RGB;
bi.biSizeImage = 0;
bi.biXPelsPerMeter = 0;
bi.biYPelsPerMeter = 0;
bi.biClrUsed = 0;
bi.biClrImportant = 0;
DWORD dwBmpSize = ((bmpJulia.bmWidth * bi.biBitCount + 31) / 32) * 4 * bmpJulia.bmHeight;
// Starting with 32-bit Windows, GlobalAlloc and LocalAlloc are implemented as wrapper functions that
// call HeapAlloc using a handle to the process's default heap. Therefore, GlobalAlloc and LocalAlloc
// have greater overhead than HeapAlloc.
HANDLE hDIB = GlobalAlloc(GHND,dwBmpSize);
char *lpbitmap = (char *)GlobalLock(hDIB);
// Gets the "bits" from the bitmap and copies them into a buffer
// which is pointed to by lpbitmap.
GetDIBits(GetDC(hWnd), m_memBM, 0,
(UINT)bmpJulia.bmHeight,
lpbitmap,
(BITMAPINFO *)&bi, DIB_RGB_COLORS);
// A file is created, this is where we will save the screen capture.
HANDLE hFile = CreateFile(L"Julia.bmp",
GENERIC_WRITE,
0,
NULL,
CREATE_ALWAYS,
FILE_ATTRIBUTE_NORMAL, NULL);
// Add the size of the headers to the size of the bitmap to get the total file size
DWORD dwSizeofDIB = dwBmpSize + sizeof(BITMAPFILEHEADER) + sizeof(BITMAPINFOHEADER);
//Offset to where the actual bitmap bits start.
bmfHeader.bfOffBits = (DWORD)sizeof(BITMAPFILEHEADER) + (DWORD)sizeof(BITMAPINFOHEADER);
//Size of the file
bmfHeader.bfSize = dwSizeofDIB;
//bfType must always be BM for Bitmaps
bmfHeader.bfType = 0x4D42; //BM
DWORD dwBytesWritten = 0;
WriteFile(hFile, (LPSTR)&bmfHeader, sizeof(BITMAPFILEHEADER), &dwBytesWritten, NULL);
WriteFile(hFile, (LPSTR)&bi, sizeof(BITMAPINFOHEADER), &dwBytesWritten, NULL);
WriteFile(hFile, (LPSTR)lpbitmap, dwBmpSize, &dwBytesWritten, NULL);
//Unlock and Free the DIB from the heap
GlobalUnlock(hDIB);
GlobalFree(hDIB);
//Close the handle for the file that was created
CloseHandle(hFile);
return TRUE;
}
The result:
The following pasted the source code for reference:
Utils.h
#pragma once
#include <Windows.h>
class CUtils
{
public:
CUtils(void);
~CUtils(void);
BOOL PrepBitmap(HDC hDC, int nWidth, int nHeight);
BOOL DrawBitmap();
BOOL DispBitmap(HDC hDC);
BOOL SaveBitmap(HWND hWnd);
private:
// bool julia(int x, int y);
private:
HDC m_memDC;
HBITMAP m_memBM;
int m_nWidth;
int m_nHeight;
bool m_juliaReady;
};
Utils.cu
#include "stdafx.h"
#include "Utils.h"
#include <stdlib.h>
#include <malloc.h>
#include <cuda_runtime.h>
template <typename T>
class cuComplex
{
public:
__device__ cuComplex(T a, T b) : r(a), i(b) {}
__device__ T magnitude2(void) { return r * r + i * i; }
__device__ cuComplex operator*(const cuComplex& a)
{
return cuComplex(r * a.r - i * a.i, i * a.r + r * a.i);
}
__device__ cuComplex operator+(const cuComplex& a)
{
return cuComplex(r + a.r, i + a.i);
}
private:
T r;
T i;
};
__global__ void julia(int width, int height, char* mask)
{
int x = blockIdx.x;
int y = blockIdx.y;
if (x < width && y < height)
{
const float scale = 1.5;
float jx = scale * (float)((width >> 1) - x) / (width >> 1);
float jy = scale * (float)((height >> 1) - y) / (height >> 1);
cuComplex<float> c(-0.8, 0.156);
cuComplex<float> a(jx, jy);
for (int i = 0; i < 200; i++)
{
a = a * a + c;
if (a.magnitude2() > 1000)
{
mask[y * width + x] = 0;
return ;
}
}
mask[y * width + x] = 1;
}
}
CUtils::CUtils(void) : m_memDC(NULL), m_memBM(NULL), m_juliaReady(false)
{
}
BOOL CUtils::PrepBitmap(HDC hDC, int nWidth = 640, int nHeight = 480)
{
m_nWidth = nWidth;
m_nHeight = nHeight;
m_memDC = CreateCompatibleDC(hDC);
m_memBM = CreateCompatibleBitmap(hDC, nWidth, nHeight);
SelectObject(m_memDC, m_memBM);
return TRUE;
}
BOOL CUtils::DrawBitmap()
{
int size = (m_nWidth * m_nHeight) * sizeof(char);
char* maskHost = (char*)malloc(size);
char* maskDevice;
cudaError_t cuErr;
int dev = 1;
cudaSetDevice(dev);
cuErr = cudaMalloc(&maskDevice, size);
if (cuErr != cudaSuccess)
{
MessageBox(NULL, _T("cudaMalloc() failed"), _T("Error"), MB_OK);
return FALSE;
}
dim3 grid(m_nWidth, m_nHeight);
julia<<<grid, 1>>>(m_nWidth, m_nHeight, maskDevice);
cudaDeviceSynchronize();
cudaMemcpy(maskHost, maskDevice, size, cudaMemcpyDeviceToHost);
for (int x = 0; x < m_nWidth; x ++)
{
for (int y = 0; y < m_nHeight; y ++)
SetPixel(m_memDC, x, y, maskHost[y * m_nWidth + x]? RGB(255, 255, 255): RGB(0, 0, 0));
}
free(maskHost);
cudaFree(maskDevice);
m_juliaReady = true;
return TRUE;
}
BOOL CUtils::DispBitmap(HDC hDC)
{
if (!m_juliaReady)
{
if (!m_memBM)
PrepBitmap(hDC);
DrawBitmap();
}
BitBlt(hDC, 0, 0, m_nWidth, m_nHeight, m_memDC, 0, 0,SRCCOPY);
return TRUE;
}
#if 0
bool CUtils::julia(int x, int y)
{
const double scale = 1.5;
double jx = scale * (double)((m_nWidth >> 1) - x) / (m_nWidth >> 1);
double jy = scale * (double)((m_nHeight >> 1) - y) / (m_nHeight >> 1);
cuComplex<double> c(-0.8, 0.156);
cuComplex<double> a(jx, jy);
int i = 0;
for (i=0; i<200; i++)
{
a = a * a + c;
if (a.magnitude2() > 1000)
return 0;
}
return 1;
}
#endif
CUtils::~CUtils(void)
{
if (m_memBM)
DeleteObject(m_memBM);
if (m_memDC)
DeleteObject(m_memDC);
}
// Refer to https://msdn.microsoft.com/en-us/library/windows/desktop/dd183402(v=vs.85).aspx
BOOL CUtils::SaveBitmap(HWND hWnd)
{
BITMAP bmpJulia;
GetObject(m_memBM,sizeof(BITMAP),&bmpJulia);
BITMAPFILEHEADER bmfHeader;
BITMAPINFOHEADER bi;
bi.biSize = sizeof(BITMAPINFOHEADER);
bi.biWidth = bmpJulia.bmWidth;
bi.biHeight = bmpJulia.bmHeight;
bi.biPlanes = 1;
bi.biBitCount = 32;
bi.biCompression = BI_RGB;
bi.biSizeImage = 0;
bi.biXPelsPerMeter = 0;
bi.biYPelsPerMeter = 0;
bi.biClrUsed = 0;
bi.biClrImportant = 0;
DWORD dwBmpSize = ((bmpJulia.bmWidth * bi.biBitCount + 31) / 32) * 4 * bmpJulia.bmHeight;
// Starting with 32-bit Windows, GlobalAlloc and LocalAlloc are implemented as wrapper functions that
// call HeapAlloc using a handle to the process's default heap. Therefore, GlobalAlloc and LocalAlloc
// have greater overhead than HeapAlloc.
HANDLE hDIB = GlobalAlloc(GHND,dwBmpSize);
char *lpbitmap = (char *)GlobalLock(hDIB);
// Gets the "bits" from the bitmap and copies them into a buffer
// which is pointed to by lpbitmap.
GetDIBits(GetDC(hWnd), m_memBM, 0,
(UINT)bmpJulia.bmHeight,
lpbitmap,
(BITMAPINFO *)&bi, DIB_RGB_COLORS);
// A file is created, this is where we will save the screen capture.
HANDLE hFile = CreateFile(L"Julia.bmp",
GENERIC_WRITE,
0,
NULL,
CREATE_ALWAYS,
FILE_ATTRIBUTE_NORMAL, NULL);
// Add the size of the headers to the size of the bitmap to get the total file size
DWORD dwSizeofDIB = dwBmpSize + sizeof(BITMAPFILEHEADER) + sizeof(BITMAPINFOHEADER);
//Offset to where the actual bitmap bits start.
bmfHeader.bfOffBits = (DWORD)sizeof(BITMAPFILEHEADER) + (DWORD)sizeof(BITMAPINFOHEADER);
//Size of the file
bmfHeader.bfSize = dwSizeofDIB;
//bfType must always be BM for Bitmaps
bmfHeader.bfType = 0x4D42; //BM
DWORD dwBytesWritten = 0;
WriteFile(hFile, (LPSTR)&bmfHeader, sizeof(BITMAPFILEHEADER), &dwBytesWritten, NULL);
WriteFile(hFile, (LPSTR)&bi, sizeof(BITMAPINFOHEADER), &dwBytesWritten, NULL);
WriteFile(hFile, (LPSTR)lpbitmap, dwBmpSize, &dwBytesWritten, NULL);
//Unlock and Free the DIB from the heap
GlobalUnlock(hDIB);
GlobalFree(hDIB);
//Close the handle for the file that was created
CloseHandle(hFile);
return TRUE;
}
The result: