【问题标题】:terminate called after throwing an instance of 'thrust::system::system_error' what(): parallel_for failed: cudaErrorInvalidValue: invalid argument(在抛出 'thrust::system::system_error' 实例后调用 terminate,what(): parallel_for failed: cudaErrorInvalidValue: invalid argument)
【发布时间】:2020-12-27 05:32:16
【问题描述】:

我正在尝试统计 curand_uniform() 返回 1.0 的次数,但下面的代码无法正常运行:

#include <stdio.h>
#include <stdlib.h>  
#include <thrust/device_vector.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <curand_kernel.h>
using namespace std;

// Kernel: counts how many curand_uniform() draws are exactly 1.0.
//   length - total number of draws shared across all launched threads
//   sum    - device int that receives each thread's tally via atomicAdd
//   state  - per-thread Philox generator states, indexed by global thread id
// Each thread works on a private copy of its generator state; the copy is not
// written back to `state`.
__global__
void counts(int length, int *sum, curandStatePhilox4_32_10_t*  state) {
  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const unsigned int stride = blockDim.x * gridDim.x;  // total threads in the launch
  curandStatePhilox4_32_10_t rng = state[tid];

  int hits = 0;
  int idx = tid;
  while (idx < length) {
    // curand_uniform returns a float; it is widened to double before the compare.
    const double sample = curand_uniform( &rng );
    hits += ( sample == 1.0 ) ? 1 : 0;
    idx += stride;
  }

  // Every thread contributes its tally, including threads that counted zero.
  atomicAdd(sum, hits);
}

// Kernel: initialises one Philox generator state per launched thread.
//   state - output array with one slot per global thread id
//   seed  - seed shared by all threads; the global thread id is passed as the
//           second curand_init argument (subsequence) and 0 as the third (offset)
__global__
void curand_setup(curandStatePhilox4_32_10_t *state, long seed) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    curandStatePhilox4_32_10_t *slot = state + tid;
    curand_init(seed, tid, 0, slot);
}

// Host driver: allocates a device counter, creates one Philox state per launched
// thread, runs curand_setup and counts over N draws, then prints the result.
// NOTE(review): none of the CUDA API return codes or kernel launches are checked.
int main(int argc, char *argv[]) {
  const int N = 1e5;

  // NOTE(review): count_h is a null pointer; no host int is allocated behind it,
  // yet it is used below as the host side of both cudaMemcpy calls.
  int* count_h = 0;
  int* count_d;
  cudaMalloc(&count_d, sizeof(int) );
  // Intended to zero the device counter, but the host source pointer is null.
  cudaMemcpy(count_d, count_h, sizeof(int), cudaMemcpyHostToDevice);

  int threads_per_block = 64;
  int Nblocks = 32*6;

  // One generator state per thread of the launches below. This constructor is the
  // statement that raises the thrust::system::system_error reported with this code.
  thrust::device_vector<curandStatePhilox4_32_10_t> d_state(Nblocks*threads_per_block);
  // Seed every state from the current time, then run the counting kernel.
  curand_setup<<<Nblocks, threads_per_block>>>(d_state.data().get(), time(0));
  counts<<<Nblocks, threads_per_block>>>(N, count_d, d_state.data().get());

  // Copies the device counter back; the host destination is still the null pointer.
  cudaMemcpy(count_h, count_d, sizeof(int), cudaMemcpyDeviceToHost);

  // NOTE(review): this streams the pointer count_h itself, not the int it points to.
  cout << count_h << endl;

  cudaFree(count_d);
  // count_h was never obtained from malloc; it is still the null pointer set above.
  free(count_h);
}

我收到终端错误(在 linux):

terminate called after throwing an instance of 'thrust::system::system_error'
  what():  parallel_for failed: cudaErrorInvalidValue: invalid argument
Aborted (core dumped)

我是这样编译的:

nvcc -Xcompiler "-fopenmp" -o test uniform_one_hit_count.cu

我不明白这个错误信息。

【问题讨论】:

    标签: cuda thrust curand


    【解决方案1】:

    这一行:

    thrust::device_vector<curandStatePhilox4_32_10_t> d_state(Nblocks*threads_per_block);
    

    会在设备上初始化一个新向量。Thrust 在这样做时,会调用元素类型的构造函数,在本例中为 curandStatePhilox4_32_10,它是定义在 /usr/local/cuda/include/curand_philox4x32_x.h 中的一个结构体(至少在 Linux 上是这个路径)。不幸的是,该结构体的定义没有提供任何用 __device__ 修饰的构造函数,这就给 Thrust 带来了麻烦。

    一个简单的解决方法是在主机上组装矢量并将其复制到设备:

    thrust::host_vector<curandStatePhilox4_32_10_t> h_state(Nblocks*threads_per_block);
    thrust::device_vector<curandStatePhilox4_32_10_t> d_state = h_state;
    

    或者,只需使用 cudaMalloc 分配空间:

    curandStatePhilox4_32_10_t *d_state;
    cudaMalloc(&d_state, (Nblocks*threads_per_block)*sizeof(d_state[0]));
    

    您的代码至少还有一个其他问题。下面这一行实际上并没有为该指针应指向的内容分配任何存储空间:

    int* count_h = 0;
    

    之后,您应该执行以下操作:

    count_h = (int *)malloc(sizeof(int));
    memset(count_h, 0, sizeof(int));
    

    在您的打印行上,您很可能希望这样做:

    cout << count_h[0] << endl;
    

    解决count_h 问题的另一种方法是从以下开始:

    int count_h = 0;
    

    这将需要对您的代码(cudaMemcpy 操作)进行一组不同的更改。

    【讨论】:

    • 感谢罗伯特。一如既往的出色。
    猜你喜欢
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 2019-05-04
    • 1970-01-01
    • 2018-06-13
    • 2022-01-19
    • 2016-03-04
    • 2017-09-26
    相关资源
    最近更新 更多