【发布时间】:2016-02-13 15:48:41
【问题描述】:
我编写了一个代码来通过减少找到最小值。但是,结果始终为零。我不知道是什么问题。请帮我。
这是内核代码:我修改了 Nvidia 的求和代码。
#include <limits.h>
#define NumThread 128
#define NumBlock 32
__global__ void min_reduce(int* In, int* Out, int n){
__shared__ int sdata[NumThread];
unsigned int i = blockIdx.x * NumThread + threadIdx.x;
unsigned int tid = threadIdx.x;
unsigned int gridSize = NumBlock * NumThread;
int myMin = INT_MAX;
while (i < n){
if(In[i] < myMin)
myMin = In[i];
i += gridSize;
}
sdata[tid] = myMin;
__syncthreads();
if (NumThread >= 1024){
if (tid < 512)
if(sdata[tid] > sdata[tid + 512] ) sdata[tid] = sdata[tid + 512];
__syncthreads();
}
if (NumThread >= 512){
if(sdata[tid] > sdata[tid + 256] ) sdata[tid] = sdata[tid + 256];
__syncthreads();
}
if (NumThread >= 256){
if(sdata[tid] > sdata[tid + 128] && sdata[tid + 128] !=0) sdata[tid] = sdata[tid + 128];
__syncthreads();
}
if (NumThread >= 128){
if(sdata[tid] > sdata[tid + 64] ) sdata[tid] = sdata[tid + 64];
__syncthreads();
}
//the following practice is deprecated
if (tid < 32){
volatile int *smem = sdata;
if (NumThread >= 64) if(smem[tid] > smem[tid + 32] ) smem[tid] = smem[tid+32];
if (NumThread >= 32) if(smem[tid] > smem[tid + 16]) smem[tid] = smem[tid+16];
if (NumThread >= 16) if(smem[tid] > smem[tid + 8]) smem[tid] = smem[tid+8];
if (NumThread >= 8) if(smem[tid] > smem[tid + 4] ) smem[tid] = smem[tid+4];
if (NumThread >= 4) if(smem[tid] > smem[tid + 2] ) smem[tid] = smem[tid+2];
if (NumThread >= 2) if(smem[tid] > smem[tid + 1] ) smem[tid] = smem[tid+1];
}
if (tid == 0)
if(sdata[0] < sdata[1] ) Out[blockIdx.x] = sdata[0];
else Out[blockIdx.x] = sdata[1];
}
这是我的主要代码:
#include <stdio.h>
#include <stdlib.h>
#include "min_reduction.cu"
int main(int argc, char* argv[]){
unsigned int length = 1048576;
int i, Size, min;
int *a, *out, *gpuA, *gpuOut;
cudaSetDevice(0);
Size = length * sizeof(int);
a = (int*)malloc(Size);
out = (int*)malloc(NumBlock*sizeof(int));
for(i=0;i<length;i++) a[i] = (i + 10);
cudaMalloc((void**)&gpuA,Size);
cudaMalloc((void**)&gpuOut,NumBlock*sizeof(int));
cudaMemcpy(gpuA,a,Size,cudaMemcpyHostToDevice);
min_reduce<<<NumBlock,NumThread>>>(gpuA,gpuOut,length);
cudaDeviceSynchronize();
cudaMemcpy(out,gpuOut,NumBlock*sizeof(int),cudaMemcpyDeviceToHost);
min = out[0];
for(i=1;i<NumBlock;i++) if(min < out[i]) min = out[i];
return 0;
}
【问题讨论】:
标签: cuda