Hello
I did not completely understand your code, so I reduced it (with some glimpses at the NVIDIA reduction sample) to a simple test.
One of the most important points (which I already mentioned above) is that you did not assign any shared memory to the kernel! Its size in bytes has to be given as the third argument inside the <<< >>> brackets:
theKernel<<<blocks, threads, sharedMemorySize>>>(…);
In general, I’ll probably not have the time to dig through this sort of code to find possible errors for you in the future. However, here’s what I did (all put into a single file) :
// ====================== c++ include files ==========================
#include <iostream>
#include <stdlib.h>
using namespace std;
// ====================== cuda include files ==========================
#include <cuda_runtime.h>
//#include "async_kernel.cu"
/// Block-level sum reduction kernel.
///
/// Every block copies up to blockDim.x consecutive elements of d_iVector into
/// shared memory (threads whose global index is >= size contribute 0), folds
/// them together and lets thread 0 store the block total in
/// d_oVector[blockIdx.x].
///
/// Launch requirements:
///  - 1D grid of 1D blocks; gridDim.x * blockDim.x should cover `size`.
///  - Dynamic shared memory of blockDim.x * sizeof(float) bytes must be passed
///    as the third <<< >>> argument.
///  - d_oVector must have room for gridDim.x floats.
///  - d_iVector and d_oVector may only alias when the grid has a single block:
///    there is no ordering between blocks, so one block could overwrite an
///    element another block has not read yet.
///
/// @param d_iVector device pointer to the input values
/// @param d_oVector device pointer receiving one partial sum per block
/// @param size      number of valid elements in d_iVector
extern "C" __global__ void reductionVector(float* d_iVector, float* d_oVector, int size)
{
    unsigned int t_ID = threadIdx.x;
    unsigned int index = blockIdx.x * blockDim.x + threadIdx.x;

    // The accumulator must be float: an int array would truncate every
    // fractional input on the store below.
    extern __shared__ float partialSum[];

    // The explicit size > 0 test keeps a negative size from turning into a
    // huge unsigned bound in the comparison.
    partialSum[t_ID] = (size > 0 && index < (unsigned int)size) ? d_iVector[index] : 0.0f;
    __syncthreads();

    // Fold the upper part of the active range onto the lower part until one
    // element is left. Rounding the stride up keeps the middle element of an
    // odd-sized range alive, so blockDim.x need not be a power of two; for
    // power-of-two block sizes the strides are blockDim.x/2, /4, ... as usual.
    // The loop bounds depend on blockDim.x only, so every thread reaches each
    // barrier.
    for (unsigned int active = blockDim.x; active > 1; )
    {
        unsigned int stride = (active + 1) / 2;
        if (t_ID + stride < active) {
            partialSum[t_ID] += partialSum[t_ID + stride];
        }
        __syncthreads();
        active = stride;
    }

    // Thread 0 publishes this block's total.
    if (t_ID == 0) d_oVector[blockIdx.x] = partialSum[0];
}
/// ====================================================================
/// ======================= Functions ============================
// Forward declarations of the host helpers that are defined after main().
void generateValues(float *vector, int size);
int roundBlocks(int value1, int value2);
//extern"C" float launchReductionVector( dim3 dimBlocks , dim3 dimThreads, float* d_vector, int size);
// Repeats the signature of the reductionVector kernel, which is already
// defined earlier in this file.
extern "C" __global__ void reductionVector(float* d_iVector, float* d_oVector, int size);
// Interactive menu prompt; main() in this file does not call it.
int menu( void );
/// ====================================================================
// Upper bound on the block size main() picks for the follow-up reduction passes.
int maxThreads = 256; // number of threads per block
// NOTE(review): not referenced anywhere else in the visible source — confirm before removing.
int maxBlocks = 64;
/// Rounds x up to the nearest power of two (values that already are a power
/// of two are returned unchanged).
///
/// Works by copying the highest set bit of x - 1 into every lower bit
/// position and then adding one. Because the arithmetic is unsigned and
/// wraps, an input of 0 yields 0.
unsigned int nextPow2( unsigned int x ) {
    unsigned int smeared = x - 1u;
    // Shifts of 1, 2, 4, 8 and 16 cover all bits of a 32-bit value.
    for (unsigned int shift = 1u; shift <= 16u; shift <<= 1) {
        smeared |= smeared >> shift;
    }
    return smeared + 1u;
}
/// Debug helper: copies `size` floats from device memory to the host and
/// prints them one per line as "Element <i>: <value>".
///
/// @param dVector device pointer to at least `size` floats
/// @param size    number of elements to print; nothing is printed if <= 0
void debugPrint(float *dVector, int size)
{
    // A non-positive count would otherwise reach new[] with a bogus length.
    if (size <= 0)
    {
        return;
    }

    float *hVector = new float[size];
    cudaError_t status = cudaMemcpy (hVector, dVector, size * sizeof(float), cudaMemcpyDeviceToHost);
    if (status != cudaSuccess)
    {
        // Report the failure instead of printing uninitialised host memory.
        std::cerr << "debugPrint: cudaMemcpy failed: " << cudaGetErrorString(status) << std::endl;
        delete[] hVector;
        return;
    }

    for (int i = 0; i < size; i++)
    {
        std::cout << "Element " << i << ": " << hVector[i] << std::endl;
    }
    delete[] hVector;
}
/// Stops the program with a diagnostic when a CUDA runtime call has failed.
static void checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess)
    {
        std::cerr << "CUDA error in " << what << ": " << cudaGetErrorString(status) << std::endl;
        exit(EXIT_FAILURE);
    }
}

/// Test driver: fills a pinned host array with generated values, sums it on
/// the CPU, reduces it on the GPU with repeated reductionVector launches and
/// prints both results.
int main ( void )
{
    const int size = 20000;/// 16384;
    int threads = 256;
    int blocks = roundBlocks(size, threads);

    // Pinned host input. It is host memory, so it is cleared with a plain
    // loop rather than with cudaMemset.
    float* vector = NULL;
    checkCuda(cudaMallocHost((void**)&vector, size * sizeof(float)), "cudaMallocHost");
    for (int i = 0; i < size; i++)
        vector[i] = 0.0f;
    generateValues(vector, size);

    // CPU reference sum.
    float cpuSum = 0;
    for (int i = 0; i < size; i++)
        cpuSum += vector[i];

    float* d_vector = NULL;
    checkCuda(cudaMalloc((void**)&d_vector, size * sizeof(float)), "cudaMalloc(d_vector)");
    checkCuda(cudaMemcpy (d_vector, vector, size * sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy(H2D)");

    // Two partial-sum buffers used alternately as input and output, so that a
    // multi-block pass never reads and writes the same array.
    float* d_partial[2] = { NULL, NULL };
    for (int b = 0; b < 2; b++)
    {
        checkCuda(cudaMalloc((void**)&d_partial[b], sizeof(float) * blocks), "cudaMalloc(d_partial)");
        checkCuda(cudaMemset(d_partial[b], 0, sizeof(float) * blocks), "cudaMemset(d_partial)");
    }

    // First pass: one partial sum per block of the input.
    std::cout << "For " << size << " elements using " << blocks << " blocks of " << threads << " threads, total: " << (blocks*threads) << " threads" << std::endl;
    reductionVector<<< blocks, threads, threads * sizeof(float) >>>(d_vector, d_partial[0], size);
    checkCuda(cudaGetLastError(), "reductionVector launch");

    // Keep reducing the partial sums until a single value is left.
    int current = 0;   // index of the buffer holding the latest partial sums
    int s = blocks;
    while(s > 1)
    {
        threads = (s < maxThreads) ? (int)nextPow2(s) : maxThreads;
        blocks = (s + threads - 1) / threads;
        std::cout << "For " << s << " elements using " << blocks << " blocks of " << threads << " threads, total: " << (blocks*threads) << " threads" << std::endl;
        //debugPrint(d_partial[current], s);
        reductionVector<<<blocks, threads, threads * sizeof(float)>>>(d_partial[current], d_partial[1 - current], s);
        checkCuda(cudaGetLastError(), "reductionVector launch");
        current = 1 - current;
        s = blocks;
    }

    // The blocking copy also waits for the kernels above; exactly one float
    // holds the final result.
    float sum = -1;
    checkCuda(cudaMemcpy (&sum, d_partial[current], sizeof(float), cudaMemcpyDeviceToHost), "cudaMemcpy(D2H)");
    std::cout << "GPU Result = " << sum << std::endl;
    std::cout << "CPU Result = " << cpuSum << std::endl;

    checkCuda(cudaFreeHost(vector), "cudaFreeHost");
    checkCuda(cudaFree(d_vector), "cudaFree(d_vector)");
    checkCuda(cudaFree(d_partial[0]), "cudaFree(d_partial)");
    checkCuda(cudaFree(d_partial[1]), "cudaFree(d_partial)");
    std::cout << "Done" << std::endl;
    return 0;
}
/// ====================================================================
/// ==================== Function Implementations ====================
/// ====================================================================
/// Prints the menu and reads an option from standard input, repeating until
/// the value read is 0 or 1.
///
/// @return the selected option (1 = reduce a random vector, 0 = exit)
int menu( void )
{
    int choice = -1;
    do
    {
        std::cout << "==========================================" << std::endl;
        std::cout << "========= MENU =========" << std::endl;
        std::cout << "==========================================" << std::endl << std::endl;
        std::cout << " 1 - Reduce a random Vector" << std::endl;
        std::cout << " 0 - Exit" << std::endl;
        std::cout << "--->";
        std::cin >> choice;
    } while (!(choice >= 0 && choice <= 1));
    return choice;
}
/// Fills the first `size` elements of `vector` with pseudo-random whole
/// numbers in the range 0..9, drawn from rand().
///
/// @param vector host array with room for at least `size` floats
/// @param size   number of elements to write; nothing is written if <= 0
void generateValues(float *vector, int size)
{
    for (int i = 0; i < size; i++)
        vector[i] = static_cast<float>(rand() % 10);
}
/// Returns value1 / value2 rounded up, but never less than 1.
///
/// Used to compute how many blocks of value2 threads are needed to cover
/// value1 elements. The division is done in integers so the result stays
/// exact for large counts (a float cannot represent every int above 2^24).
///
/// @param value1 number of elements
/// @param value2 elements handled per block
/// @return ceil(value1 / value2), clamped to a minimum of 1; 1 if value2 is 0
int roundBlocks(int value1, int value2)
{
    // Division by zero has no meaningful block count; fall back to one block.
    if (value2 == 0)
        return 1;

    long long numerator = value1;
    long long denominator = value2;
    long long quotient = numerator / denominator;   // truncates toward zero

    // Truncation rounds a positive, inexact quotient down; bump it to the ceiling.
    bool sameSign = (numerator < 0) == (denominator < 0);
    if (numerator % denominator != 0 && sameSign)
        ++quotient;

    if (quotient <= 1)
        return 1;
    return (int)quotient;
}
bye
Marco