Tuesday, April 24, 2012

CUDA: bad performance with shared memory and no parallelism

I'm trying to exploit shared memory in this kernel function, but the performance is not as good as I was expecting. This function is called many times in my application (about 1000 times or more), so I was hoping shared memory would hide the global memory latency. But something is apparently wrong, because my application has become really slow since I started using shared memory.

This is the kernel:



__global__ void AndBitwiseOperation(int* _memory_device, int b1_size, int* b1_memory, int* b2_memory)
{
    int j = 0;

    // index GPU - Transaction-wise
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int tid = threadIdx.x;

    // shared variables
    extern __shared__ int shared_memory_data[];
    extern __shared__ int shared_b1_data[];
    extern __shared__ int shared_b2_data[];

    // copy from global memory into shared memory and sync threads
    shared_b1_data[tid] = b1_memory[tid];
    shared_b2_data[tid] = b2_memory[tid];
    __syncthreads();

    // AND each int bitwise
    for (j = 0; j < b1_size; j++)
        shared_memory_data[tid] = (shared_b1_data[tid] & shared_b2_data[tid]);

    // write result for this block to global memory
    _memory_device[i] = shared_memory_data[i];
}


The shared arrays are declared extern because I don't know the size of b1 and b2 in advance: they depend on the number of customers, which I only know at runtime (but both always have the same size).
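
From what I've read, every extern __shared__ declaration in a kernel refers to the same dynamically allocated buffer, so my three arrays presumably alias each other. The usual pattern seems to be one extern array that is partitioned by hand. Something like this, I think (just a sketch, the names are made up):

__global__ void AndBitwiseShared(int* out, int n, const int* b1, const int* b2)
{
    // one dynamic shared allocation, partitioned manually:
    // first blockDim.x ints for b1's tile, next blockDim.x ints for b2's
    extern __shared__ int smem[];
    int* tile_b1 = smem;
    int* tile_b2 = smem + blockDim.x;

    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int tid = threadIdx.x;

    // stage this block's slice of b1 and b2 into shared memory
    if (i < n) {
        tile_b1[tid] = b1[i];
        tile_b2[tid] = b2[i];
    }
    __syncthreads();

    // AND each int bitwise and write the result back to global memory
    if (i < n)
        out[i] = tile_b1[tid] & tile_b2[tid];
}

If I understand correctly, the total size (here 2 * blockDim.x * sizeof(int)) also has to be passed as the third argument of the <<<...>>> launch configuration, which my call below doesn't do.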

This is how I call the kernel:



void Bitmap::And(const Bitmap &b1, const Bitmap &b2)
{
    int* _memory_device;
    int* b1_memory;
    int* b2_memory;

    int b1_size = b1.getIntSize();

    // allocate memory on the GPU
    cudaMalloc((void **)&b1_memory, _memSizeInt * SIZE_UINT);
    cudaMalloc((void **)&b2_memory, _memSizeInt * SIZE_UINT);
    cudaMalloc((void **)&_memory_device, _memSizeInt * SIZE_UINT);

    // copy values to the GPU
    cudaMemcpy(b1_memory, b1._memory, _memSizeInt * SIZE_UINT, cudaMemcpyHostToDevice);
    cudaMemcpy(b2_memory, b2._memory, _memSizeInt * SIZE_UINT, cudaMemcpyHostToDevice);
    cudaMemcpy(_memory_device, _memory, _memSizeInt * SIZE_UINT, cudaMemcpyHostToDevice);

    dim3 dimBlock(1, 1);
    dim3 dimGrid(1, 1);

    AndBitwiseOperation<<<dimGrid, dimBlock>>>(_memory_device, b1_size, b1_memory, b2_memory);

    // copy the result back to the host
    cudaMemcpy(_memory, _memory_device, _memSizeInt * SIZE_UINT, cudaMemcpyDeviceToHost);

    // free device memory
    cudaFree(b1_memory);
    cudaFree(b2_memory);
    cudaFree(_memory_device);
}
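
I'm also not checking the return codes of the CUDA calls, so errors could be passing silently. I was thinking of wrapping them with a small helper like this (hypothetical, not in my code yet):

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// hypothetical helper, not in my code yet: abort with a message
// when a CUDA runtime call fails
#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t err = (call);                                     \
        if (err != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error %s at %s:%d\n",               \
                    cudaGetErrorString(err), __FILE__, __LINE__);     \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

// usage: CUDA_CHECK(cudaMalloc((void **)&b1_memory, _memSizeInt * SIZE_UINT));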


b1 and b2 are bitmaps with 4 bits per element. The number of elements depends on the number of customers. I also have a problem with the kernel's launch parameters: if I add more blocks or more threads, AndBitwiseOperation() no longer gives me the correct result. With just 1 block and 1 thread per block the result is correct, but then the kernel doesn't run in parallel at all.
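
I suspect two things: the launch never passes a dynamic shared memory size (so the extern arrays have no backing storage), and the indexing mixes the per-block index tid with the global index i (I load with tid but read shared_memory_data[i] at the end). If that's right, a launch covering all b1_size ints would look something like this, using the sketched kernel from above (the block size of 256 is an arbitrary choice):

// sketch: cover all b1_size ints with one thread each
int threadsPerBlock = 256;
int blocks = (b1_size + threadsPerBlock - 1) / threadsPerBlock;  // round up
size_t sharedBytes = 2 * threadsPerBlock * sizeof(int);          // tiles for b1 and b2

AndBitwiseShared<<<blocks, threadsPerBlock, sharedBytes>>>(
        _memory_device, b1_size, b1_memory, b2_memory);

I also wonder whether shared memory helps at all here: each int of b1 and b2 is read exactly once, so there is no reuse, and a plain out[i] = b1[i] & b2[i] with a bounds check might be just as fast. And my loop over b1_size just recomputes the same AND b1_size times.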

Any advice is welcome :)

Thank you




