分叉后的CUDA初始化错误答案

【问题标题】：CUDA initialization error after fork分叉后的CUDA初始化错误
【发布时间】：2015-10-21 15:22:25
【问题描述】：

调用 fork() 后出现“初始化错误”。如果我在没有 fork 的情况下运行相同的程序，一切正常。

if (fork() == 0) {
    ...
    cudaMalloc(....);
    ...
}

这是什么原因造成的？

一个完整的例子如下。如果我注释掉 cudaGetDeviceCount 调用，它工作正常。

#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <cuda_runtime.h>

#define PERR(call) \
  if (call) {\
   fprintf(stderr, "%s:%d Error [%s] on "#call"\n", __FILE__, __LINE__,\
      cudaGetErrorString(cudaGetLastError()));\
   exit(1);\
  }

int
main(int argc, char **argv)
{
  float *v_d;
  int gpucount;

  cudaGetDeviceCount(&gpucount);

  if (fork() == 0) {
    cudaSetDevice(0);
    PERR(cudaMalloc(&v_d, 1000*sizeof(float)));
  }
  wait(NULL);
  return 0;
}

简单的 Makefile：

PROGS = fork
CUDA_PATH = /usr/local/cuda
CXXFLAGS = -g -O0 -Wall
CXXINCLUDES = -I$(CUDA_PATH)/include
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(CXX) -Xcompiler "$(CXXFLAGS)"

fork: fork.cxx
        $(NVCC) $^ -o $@ $(LIBS)

clean:
        (rm $(PROGS) *.o)

在这种情况下，我只是想从父进程中获取可用设备的数量。这种解决方法可以做到：

  if (fork() == 0) {
    PERR(cudaGetDeviceCount(&gpucount));
    return(gpucount);
  }
  wait(&gpucount);
  gpucount =  WEXITSTATUS(gpucount);

【问题讨论】：

你能提供一个完整的代码来演示你在做什么吗？我对fork 和cudaMalloc 没有任何问题。
我想我现在有一点线索了。该程序在分叉之前调用 cudaSetDevice。如果我将呼叫移到叉内，它就会运行。我会整理一个小例子。

标签： cuda

【解决方案1】：

fork() 创建一个子进程。进程有自己的地址空间。一个 CUDA 上下文不能在两个不同的进程之间共享，原因有很多，其中一个原因是各种指针在不同的地址空间中将毫无意义。

如果您在 fork() 之前创建 CUDA 上下文，则不能在子进程中使用它。 cudaSetDevice(0); 调用尝试共享 CUDA 上下文，当您调用 cudaGetDeviceCount(); 时在父进程中隐式创建

正如您所暗示的，解决方案是在父进程或子进程中进行 CUDA 工作。如果您在多设备系统中，应该可以将单独的设备分配给单独的进程（CUDA simpleIPC sample code 正是这样做的）。（关键是不要在fork之前创建CUDA上下文。）

您可能对this question/answer 和this one 感兴趣。

这是一个完整的示例（需要 2 个 CUDA 设备），显示一个子进程和一个父进程使用不同的 GPU：

$ cat t345.cu
#include <unistd.h>     /* Symbolic Constants */
#include <sys/types.h>  /* Primitive System Data Types */
#include <errno.h>      /* Errors */
#include <stdio.h>      /* Input/Output */
#include <sys/wait.h>   /* Wait for Process Termination */
#include <stdlib.h>     /* General Utilities */


#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)


__global__ void addkernel(int *data){
  *data += 1;
}

int main()
{
    pid_t childpid; /* variable to store the child's pid */
    int retval;     /* child process: user-provided return code */
    int status;     /* parent process: child's exit status */

    /* only 1 int variable is needed because each process would have its
       own instance of the variable
       here, 2 int variables are used for clarity */

    /* now create new process */
    childpid = fork();

    if (childpid >= 0) /* fork succeeded */
    {
        if (childpid == 0) /* fork() returns 0 to the child process */
        {
            printf("CHILD: I am the child process!\n");
            printf("CHILD: Here's my PID: %d\n", getpid());
            printf("CHILD: My parent's PID is: %d\n", getppid());
            printf("CHILD: The value of my copy of childpid is: %d\n", childpid);
            int *h_a, *d_a;
            h_a = (int *)malloc(sizeof(int));
            cudaSetDevice(0);
            cudaCheckErrors("CHILD cudaSetDevice fail");
            cudaMalloc(&d_a, sizeof(int));
            cudaCheckErrors("cudaMalloc fail");
            *h_a = 1;
            cudaMemcpy(d_a, h_a, sizeof(int), cudaMemcpyHostToDevice);
            cudaCheckErrors("cudaMemcpy H2D fail");
            addkernel<<<1,1>>>(d_a);
            cudaDeviceSynchronize();
            cudaCheckErrors("kernel fail");
            cudaMemcpy(h_a, d_a, sizeof(int), cudaMemcpyDeviceToHost);
            cudaCheckErrors("cudaMemcpy D2H fail");
            printf("CHILD: result: %d\n", *h_a);

            printf("CHILD: Sleeping for 1 second...\n");
            sleep(1); /* sleep for 1 second */
            cudaDeviceReset();
            printf("CHILD: Enter an exit value (0 to 255): ");
            scanf(" %d", &retval);
            printf("CHILD: Goodbye!\n");
            exit(retval); /* child exits with user-provided return code */
        }
        else /* fork() returns new pid to the parent process */
        {
            printf("PARENT: I am the parent process!\n");
            printf("PARENT: Here's my PID: %d\n", getpid());
            printf("PARENT: The value of my copy of childpid is %d\n", childpid);
            printf("PARENT: I will now wait for my child to exit.\n");
            int *h_a, *d_a;
            h_a = (int *)malloc(sizeof(int));
            cudaSetDevice(1);
            cudaCheckErrors("PARENT cudaSetDevice fail");
            cudaMalloc(&d_a, sizeof(int));
            cudaCheckErrors("cudaMalloc fail");
            *h_a = 2;
            cudaMemcpy(d_a, h_a, sizeof(int), cudaMemcpyHostToDevice);
            cudaCheckErrors("cudaMemcpy H2D fail");
            addkernel<<<1,1>>>(d_a);
            cudaDeviceSynchronize();
            cudaCheckErrors("kernel fail");
            cudaMemcpy(h_a, d_a, sizeof(int), cudaMemcpyDeviceToHost);
            cudaCheckErrors("cudaMemcpy D2H fail");
            printf("PARENT: result: %d\n", *h_a);
            wait(&status); /* wait for child to exit, and store its status */
            printf("PARENT: Child's exit code is: %d\n", WEXITSTATUS(status));
            cudaSetDevice(0);
            cudaCheckErrors("PARENT cudaSetDevice  2 fail");
            int *h_a2, *d_a2;
            cudaMalloc(&d_a2, sizeof(int));
            cudaCheckErrors("cudaMalloc fail");
            h_a2 = (int *)malloc(sizeof(int));
            *h_a2 = 5;
            cudaMemcpy(d_a2, h_a2, sizeof(int), cudaMemcpyHostToDevice);
            cudaCheckErrors("cudaMemcpy H2D fail");
            addkernel<<<1,1>>>(d_a2);
            cudaDeviceSynchronize();
            cudaCheckErrors("kernel fail");
            cudaMemcpy(h_a2, d_a2, sizeof(int), cudaMemcpyDeviceToHost);
            cudaCheckErrors("cudaMemcpy D2H fail");
            printf("PARENT: result2: %d\n", *h_a2);
            printf("PARENT: Goodbye!\n");
            exit(0);  /* parent exits */
        }
    }
    else /* fork returns -1 on failure */
    {
        perror("fork"); /* display error message */
        exit(0);
    }
}
$ nvcc -arch=sm_20 -o t345 t345.cu
$ ./t345
CHILD: I am the child process!
CHILD: Here's my PID: 23603
CHILD: My parent's PID is: 23602
CHILD: The value of my copy of childpid is: 0
PARENT: I am the parent process!
PARENT: Here's my PID: 23602
PARENT: The value of my copy of childpid is 23603
PARENT: I will now wait for my child to exit.
CHILD: result: 2
CHILD: Sleeping for 1 second...
PARENT: result: 3
CHILD: Enter an exit value (0 to 255): 10
CHILD: Goodbye!
PARENT: Child's exit code is: 10
PARENT: result2: 6
PARENT: Goodbye!
$

（修改自here）

【讨论】：

我很失望“工具包文档”中没有提到它。示例“0_Simple/simpleIPC/simpleIPC.cu”解释了如何解决这个问题。我还在原始问题中为我的用例添加了一个简单的解决方法。参考“cudaDeviceReset();”中还有另一个误导性的调用。这似乎没有做任何事情来重置底层上下文。
但是如果你需要在fork之前调用cudaGetDeviceCount怎么办？如果我使用多个 GPU 并且每个 GPU 使用一个进程，我想知道提前生成多少进程。
每个 GPU 使用一个线程可能比每个 GPU 使用一个进程更好。如果您想使用进程，则为您的程序可以处理的最大 GPU 数量创建进程（这可能不大于 8）。然后每个进程都有自己的唯一ID，可以查询cudaGetDeviceCount。如果不存在对应于该进程的 GPU，则该进程将退出。例如，如果您有 4 个 GPU，但启动了 8 个进程，那么进程 0-3 将分别获得一个 GPU，而进程 4-7 将看到只有 4 个 GPU，因此将退出。我敢肯定还有其他方法。
对于另一种方法，请查看simpleIPC cuda sample code。它启动一个工作进程来查询设备计数，然后让该进程退出，这会破坏其上下文。设备计数会传回给原始进程，然后由原始进程使用它为每个 GPU 生成一个进程。