使用驱动程序 API 编译 CUDA 动态并行代码时出错答案

【问题标题】：Error compiling CUDA dynamic parallelism code with Driver API使用驱动程序 API 编译 CUDA 动态并行代码时出错
【发布时间】：2021-11-07 00:43:29
【问题描述】：

我正在尝试实现 NestedHelloWorld 作为动态并行的示例。环境是VS2019，必须实现为Driver API接口，用C语言。代码是

CUdevice* gpu_initialize(CUdevice *dev_ptr, int int_of_dev);        //signature of the init function

// Host code
int main()
{
printf("Nested Parallelism : Hello World (Driver API)\n");

CUdevice device;
if (gpu_initialize(&device,0) == NULL) {                //Initialize the device[0] and populate the device pointer contents
    printf("Error Initializing GPU... exiting");
    exit(-1);
}

CUresult result;
char* error_string;

//Create a context
CUcontext  context;
unsigned int flags = CU_CTX_SCHED_YIELD;            //set to mode where the GPU yields when awaiting CPU : increases latency
if ((result = cuCtxCreate(&context,flags,device)) != CUDA_SUCCESS) {
    if (cuGetErrorName(result, &error_string) != CUDA_ERROR_INVALID_VALUE) {
        printf("Error creating context : %s\n", error_string);
        return -1;
    }
    else {
        printf("Unknown error creating context\n");
        return -1;
    }
}

//Load the module by specifying the filename
CUmodule module;
char filename[] = "C:\\Users\\gautam\\OneDrive\\Projects\\VS 2019\\repos\\learn_cuda_nesting\\dynamic_parallelism\\x64\\Debug\\nestedHelloWorld.cu.obj";        //Use for VS 2019 compilation

if ((result = cuModuleLoad(&module,filename)) != CUDA_SUCCESS) {
    if (cuGetErrorName(result, &error_string) != CUDA_ERROR_INVALID_VALUE) {
        printf("Error loading Module using filename %s: %s \n",filename,error_string);
        return -1;
    }
    else {
        printf("Unknown error loading Module from filename %s\n",filename);
        return -1;
    }
}
else printf("Successfully loaded module %s\n", filename);

//Load the function from the module
CUfunction function;
char        function_name[120];
strcpy_s(function_name,120,"nestedHelloWorld");
if ((result = cuModuleGetFunction(&function,module,function_name)) != CUDA_SUCCESS) {
    if (cuGetErrorName(result, &error_string) != CUDA_ERROR_INVALID_VALUE) {
        printf("Error loading function %s: %s\n",function_name, error_string);
        return -1;
    }
    else {
        printf("Unknown error loading function %s\n",function_name);
        return -1;
    }
}
else printf("Successfully loaded function %s\n", function_name);

//Set up kernel grid parameters

int size = 8;
int blocksize = 8;   // initial block size
int igrid = 1;
int threads_per_block_x = 8;
int threads_per_block_y = 1;
int blocks_per_grid_x = (size + threads_per_block_x - 1) / threads_per_block_x;
int blocks_per_grid_y = 1;

//Launch the first function in the kernel
//Case 1 : Invoke kernel with 8 x 1 grid size
void* args[] = {(void *)&size, (void *)&igrid};

if ((result = cuLaunchKernel(function, blocks_per_grid_x, blocks_per_grid_y, 1, threads_per_block_x, threads_per_block_y, 1, 0, 0, args, 0)) != CUDA_SUCCESS) {
    if (cuGetErrorName(result, &error_string) != CUDA_ERROR_INVALID_VALUE) {
        printf("Error launching kernel %s: %s\n",function_name, error_string);
        result = cuCtxDestroy(context);
        return -1;
    }
    else {
        printf("Unknown error launching kernel %s\n", function_name);
        result = cuCtxDestroy(context);
        return -1;
    }
}
else printf("CUDA kernel launch with (%d,%d) blocks per grid, each with (%d,%d) threads per block\n", blocks_per_grid_x, blocks_per_grid_y, threads_per_block_x, threads_per_block_y);

result = cuCtxSynchronize();        //wait for kernel run to finish
result = cuCtxDestroy(context);
exit(0);
}

对应的.cu文件代码为：

extern "C" __global__ void nestedHelloWorld(int const iSize, int iDepth)
{
int tid = threadIdx.x;
printf("Recursion=%d: Hello World from thread %d block %d\n", iDepth, tid,blockIdx.x);

// condition to stop recursive execution
if (iSize == 1) return;

// reduce block size to half
int nthreads = iSize >> 1;

// thread 0 launches child grid recursively
if(tid == 0 && nthreads > 0)
{
    //nestedHelloWorld(nthreads, ++iDepth);
    nestedHelloWorld<<<1, nthreads>>>(nthreads, ++iDepth);
    printf("-------> nested execution depth: %d\n", iDepth);
}
}

设置如下：

最后，gpu 参数和错误提示无效 PTX。

虽然这似乎是“基本代码”，但真正的目的是识别正在执行的编译参数错误。编译通过，没有任何错误。

感谢所有帮助...

编辑：已添加成功编译截图，以及单独尝试编译文件然后链接时使用的外部命令截图。错误没有改变。

【问题讨论】：

响应第一部分 - 从源代码编译 .cu.obj 是 VS 2019“构建”或“重建”指令的一部分。第二部分 - 我尝试使用 nvcc 进行独立编译使用 -rdc=true 并且具有相同的错误。正在编辑问题以添加这两个方面... Tks

标签： cuda visual-studio-2019

【解决方案1】：

我们可以在这里构建一个完整的示例，从 ptxjit CUDA sample code 开始，然后按照附加说明 here。这基本上就是您所需要的一切。

这是一个完整的 linux 示例，包括 Makefile：

$ cat kernel.cu
#include <cstdio>
extern "C" __global__ void k(int N)
{
    printf("kernel level %d\n", N);
    if ((N > 1) && (threadIdx.x == 0))  k<<<1,1>>>(N-1);
}
$ cat ptxjit.cpp

/*
 */

// System includes
#include <math.h>
#include <stdio.h>
#include <string.h>
#include <iostream>

// CUDA driver & runtime
#include <cuda.h>
#include <cuda_runtime.h>

// helper functions and utilities to work with CUDA
#define CUDA_DRIVER_API
#include <helper_cuda.h>
#include <helper_cuda_drvapi.h>
#include <helper_functions.h>  // helper for shared that are common to CUDA Samples

#define PTX_FILE "kernel.ptx"

const char *sSDKname = "CDP Recursion Test (Driver API)";

bool inline findModulePath(const char *module_file, std::string &module_path,
                           char **argv, std::string &ptx_source) {
  char *actual_path = sdkFindFilePath(module_file, argv[0]);

  if (actual_path) {
    module_path = actual_path;
  } else {
    printf("> findModulePath file not found: <%s> \n", module_file);
    return false;
  }

  if (module_path.empty()) {
    printf("> findModulePath file not found: <%s> \n", module_file);
    return false;
  } else {
    printf("> findModulePath <%s>\n", module_path.c_str());

    if (module_path.rfind(".ptx") != std::string::npos) {
      FILE *fp = fopen(module_path.c_str(), "rb");
      fseek(fp, 0, SEEK_END);
      int file_size = ftell(fp);
      char *buf = new char[file_size + 1];
      fseek(fp, 0, SEEK_SET);
      fread(buf, sizeof(char), file_size, fp);
      fclose(fp);
      buf[file_size] = '\0';
      ptx_source = buf;
      delete[] buf;
    }

    return true;
  }
}

void ptxJIT(int argc, char **argv, CUmodule *phModule, CUfunction *phKernel,
            CUlinkState *lState) {
  const int options_num = 5;
  CUjit_option options[options_num];
  void *optionVals[options_num];
  float walltime;
  char error_log[8192], info_log[8192];
  unsigned int logSize = 8192;
  void *cuOut;
  size_t outSize;
  int myErr = 0;
  std::string module_path, ptx_source;

  // Setup linker options
  // Return walltime from JIT compilation
  options[0] = CU_JIT_WALL_TIME;
  optionVals[0] = (void *)&walltime;
  // Pass a buffer for info messages
  options[1] = CU_JIT_INFO_LOG_BUFFER;
  optionVals[1] = (void *)info_log;
  // Pass the size of the info buffer
  options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
  optionVals[2] = (void *)(long)logSize;
  // Pass a buffer for error message
  options[3] = CU_JIT_ERROR_LOG_BUFFER;
  optionVals[3] = (void *)error_log;
  // Pass the size of the error buffer
  options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  optionVals[4] = (void *)(long)logSize;

  // Create a pending linker invocation
  checkCudaErrors(cuLinkCreate(options_num, options, optionVals, lState));

  // first search for the module path before we load the results
  if (!findModulePath(PTX_FILE, module_path, argv, ptx_source)) {
    printf("> findModulePath could not find <kernel> ptx\n");
    exit(EXIT_FAILURE);
  } else {
    printf("> initCUDA loading module: <%s>\n", module_path.c_str());
  }

  // Load the PTX from the ptx file
  printf("Loading ptxjit_kernel[] program\n");
  myErr = cuLinkAddData(*lState, CU_JIT_INPUT_PTX, (void *)ptx_source.c_str(),
                        strlen(ptx_source.c_str()) + 1, 0, 0, 0, 0);

  if (myErr != CUDA_SUCCESS) {
    // Errors will be put in error_log, per CU_JIT_ERROR_LOG_BUFFER option
    // above.
    fprintf(stderr, "PTX Linker Error:\n%s\n", error_log);
  }
  myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_LIBRARY, "/usr/local/cuda/lib64/libcudadevrt.a", 0, NULL, NULL);
  if (myErr != CUDA_SUCCESS) {
    // Errors will be put in error_log, per CU_JIT_ERROR_LOG_BUFFER option
    // above.
    fprintf(stderr, "Library Linker Error:\n%s\n", error_log);
  }
  // Complete the linker step
  checkCudaErrors(cuLinkComplete(*lState, &cuOut, &outSize));

  // Linker walltime and info_log were requested in options above.
  printf("CUDA Link Completed in %fms. Linker Output:\n%s\n", walltime,
         info_log);

  // Load resulting cuBin into module
  checkCudaErrors(cuModuleLoadData(phModule, cuOut));

  // Locate the kernel entry poin
  checkCudaErrors(cuModuleGetFunction(phKernel, *phModule, "k"));

  // Destroy the linker invocation
  checkCudaErrors(cuLinkDestroy(*lState));
}

// Variables
CUcontext cuContext;

int main(int argc, char **argv) {
  const unsigned int nThreads = 1;
  const unsigned int nBlocks = 1;

  CUmodule hModule = 0;
  CUfunction hKernel = 0;
  CUlinkState lState;

  int cuda_device = 0;

  printf("[%s] - Starting...\n", sSDKname);
  // Initialize
  checkCudaErrors(cuInit(0));

  CUdevice dev = findCudaDeviceDRV(argc, (const char **)argv);
  int driverVersion;
  cudaDriverGetVersion(&driverVersion);
  if (driverVersion < CUDART_VERSION) {
    printf("driverVersion = %d < CUDART_VERSION = %d \n"
      "Enhanced compatibility is not supported for this sample.. waving execution\n", driverVersion, CUDART_VERSION);
    exit(EXIT_WAIVED);
  }
  // Create context
  checkCudaErrors(cuCtxCreate(&cuContext, 0, dev));


  // JIT Compile the Kernel from PTX and get the Handles (Driver API)
  ptxJIT(argc, argv, &hModule, &hKernel, &lState);

  // Set the kernel parameters (Driver API)
  dim3 block(nThreads, 1, 1);
  dim3 grid(nBlocks, 1, 1);
  int my_N = 4;
  void *args[1] = {&my_N};

  // Launch the kernel (Driver API_)
  checkCudaErrors(cuLaunchKernel(hKernel, grid.x, grid.y, grid.z, block.x,
                                 block.y, block.z, 0, NULL, args, NULL));
  std::cout << "CUDA kernel launched" << std::endl;

  cuCtxSynchronize();

  if (hModule) {
    checkCudaErrors(cuModuleUnload(hModule));
    hModule = 0;
  }

  return EXIT_SUCCESS;
}
$ cat Makefile
# Location of the CUDA Toolkit
CUDA_PATH ?= /usr/local/cuda

##############################
# start deprecated interface #
##############################
ifeq ($(x86_64),1)
    $(info WARNING - x86_64 variable has been deprecated)
    $(info WARNING - please use TARGET_ARCH=x86_64 instead)
    TARGET_ARCH ?= x86_64
endif
ifeq ($(ARMv7),1)
    $(info WARNING - ARMv7 variable has been deprecated)
    $(info WARNING - please use TARGET_ARCH=armv7l instead)
    TARGET_ARCH ?= armv7l
endif
ifeq ($(aarch64),1)
    $(info WARNING - aarch64 variable has been deprecated)
    $(info WARNING - please use TARGET_ARCH=aarch64 instead)
    TARGET_ARCH ?= aarch64
endif
ifeq ($(ppc64le),1)
    $(info WARNING - ppc64le variable has been deprecated)
    $(info WARNING - please use TARGET_ARCH=ppc64le instead)
    TARGET_ARCH ?= ppc64le
endif
ifneq ($(GCC),)
    $(info WARNING - GCC variable has been deprecated)
    $(info WARNING - please use HOST_COMPILER=$(GCC) instead)
    HOST_COMPILER ?= $(GCC)
endif
ifneq ($(abi),)
    $(error ERROR - abi variable has been removed)
endif
############################
# end deprecated interface #
############################

# architecture
HOST_ARCH   := $(shell uname -m)
TARGET_ARCH ?= $(HOST_ARCH)
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
    ifneq ($(TARGET_ARCH),$(HOST_ARCH))
        ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
            TARGET_SIZE := 64
        else ifneq (,$(filter $(TARGET_ARCH),armv7l))
            TARGET_SIZE := 32
        endif
    else
        TARGET_SIZE := $(shell getconf LONG_BIT)
    endif
else
    $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
endif

# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
ifeq ($(HOST_ARCH),aarch64)
    ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
        HOST_ARCH := sbsa
        TARGET_ARCH := sbsa
    endif
endif

ifneq ($(TARGET_ARCH),$(HOST_ARCH))
    ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
        $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
    endif
endif

# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
    TARGET_ARCH = armv7l
endif

# operating system
HOST_OS   := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
TARGET_OS ?= $(HOST_OS)
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
    $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
endif

# host compiler
ifeq ($(TARGET_OS),darwin)
    ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
        HOST_COMPILER ?= clang++
    endif
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
    ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
        ifeq ($(TARGET_OS),linux)
            HOST_COMPILER ?= arm-linux-gnueabihf-g++
        else ifeq ($(TARGET_OS),qnx)
            ifeq ($(QNX_HOST),)
                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
            endif
            ifeq ($(QNX_TARGET),)
                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
            endif
            export QNX_HOST
            export QNX_TARGET
            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
        else ifeq ($(TARGET_OS),android)
            HOST_COMPILER ?= arm-linux-androideabi-g++
        endif
    else ifeq ($(TARGET_ARCH),aarch64)
        ifeq ($(TARGET_OS), linux)
            HOST_COMPILER ?= aarch64-linux-gnu-g++
        else ifeq ($(TARGET_OS),qnx)
            ifeq ($(QNX_HOST),)
                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
            endif
            ifeq ($(QNX_TARGET),)
                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
            endif
            export QNX_HOST
            export QNX_TARGET
            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
        else ifeq ($(TARGET_OS), android)
            HOST_COMPILER ?= aarch64-linux-android-clang++
        endif
    else ifeq ($(TARGET_ARCH),sbsa)
        HOST_COMPILER ?= aarch64-linux-gnu-g++
    else ifeq ($(TARGET_ARCH),ppc64le)
        HOST_COMPILER ?= powerpc64le-linux-gnu-g++
    endif
endif
HOST_COMPILER ?= g++
NVCC          := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)

# internal flags
NVCCFLAGS   := -m${TARGET_SIZE}
CCFLAGS     :=
LDFLAGS     :=

# build flags
ifeq ($(TARGET_OS),darwin)
    LDFLAGS += -rpath $(CUDA_PATH)/lib
    CCFLAGS += -arch $(HOST_ARCH)
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
    LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
    CCFLAGS += -mfloat-abi=hard
else ifeq ($(TARGET_OS),android)
    LDFLAGS += -pie
    CCFLAGS += -fpie -fpic -fexceptions
endif

ifneq ($(TARGET_ARCH),$(HOST_ARCH))
    ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
        ifneq ($(TARGET_FS),)
            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
            ifeq ($(GCCVERSIONLTEQ46),1)
                CCFLAGS += --sysroot=$(TARGET_FS)
            endif
            LDFLAGS += --sysroot=$(TARGET_FS)
            LDFLAGS += -rpath-link=$(TARGET_FS)/lib
            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
        endif
    endif
    ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
        ifneq ($(TARGET_FS),)
            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
            ifeq ($(GCCVERSIONLTEQ46),1)
                CCFLAGS += --sysroot=$(TARGET_FS)
            endif
            LDFLAGS += --sysroot=$(TARGET_FS)
            LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
            LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
            LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
            CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
            CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
        endif
    endif
    ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
        NVCCFLAGS += -D_QNX_SOURCE
        NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le
        CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
        LDFLAGS += -lsocket
        LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
        CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
        ifdef TARGET_OVERRIDE
            LDFLAGS += -lslog2
        endif

        ifneq ($(TARGET_FS),)
            LDFLAGS += -L$(TARGET_FS)/usr/lib
            CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
            LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
            CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
            CCFLAGS += -I$(TARGET_FS)/../include
        endif
    endif
endif

ifdef TARGET_OVERRIDE # cuda toolkit targets override
    NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
endif

# Install directory of different arch
CUDA_INSTALL_TARGET_DIR :=
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
    CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
    CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
    CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
else ifeq ($(TARGET_ARCH),ppc64le)
    CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
endif

# Debug build flags
ifeq ($(dbg),1)
      NVCCFLAGS += -g -G
      BUILD_TYPE := debug
else
      BUILD_TYPE := release
endif

ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))

UBUNTU = $(shell lsb_release -i -s 2>/dev/null | grep -i ubuntu)

SAMPLE_ENABLED := 1

ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))

# Common includes and paths for CUDA
INCLUDES  := -I$(CUDA_PATH)/samples/common/inc
LIBRARIES :=

################################################################################

PTX_FILE := kernel.ptx

#Detect if installed version of GCC supports required C++11
ifeq ($(TARGET_OS),linux)
    empty :=
    space := $(empty) $(empty)
    GCCVERSIONSTRING := $(shell expr `$(HOST_COMPILER) -dumpversion`)
#Create version number without "."
    GCCVERSION := $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f1 -d.)
    GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f2 -d.)
    GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f3 -d.)
# Make sure the version number has at least 3 decimals
    GCCVERSION += 00
# Remove spaces from the version number
    GCCVERSION := $(subst $(space),$(empty),$(GCCVERSION))
#$(warning $(GCCVERSION))

    IS_MIN_VERSION := $(shell expr `echo $(GCCVERSION)` \>= 47000)

    ifeq ($(IS_MIN_VERSION), 1)
        $(info >>> GCC Version is greater or equal to 4.7.0 <<<)
    else
        $(info >>> Waiving build. Minimum GCC version required is 4.7.0<<<)
        SAMPLE_ENABLED := 0
    endif
endif

# Gencode arguments
SMS ?= 70

ifeq ($(GENCODE_FLAGS),)
# Generate SASS code for each SM architecture listed in $(SMS)
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))

ifeq ($(SMS),)
# Generate PTX code from SM 35
GENCODE_FLAGS += -gencode arch=compute_35,code=compute_35
endif

# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
HIGHEST_SM := $(lastword $(sort $(SMS)))
ifneq ($(HIGHEST_SM),)
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
endif
endif

ifeq ($(TARGET_OS),darwin)
  ALL_LDFLAGS += -Xcompiler -F/Library/Frameworks -Xlinker -framework -Xlinker CUDA
else
  ifeq ($(TARGET_ARCH),x86_64)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/lib64/stubs
    CUDA_SEARCH_PATH += $(CUDA_PATH)/targets/x86_64-linux/lib/stubs
  endif

  ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-gnueabihf/lib/stubs
  endif

  ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux/lib/stubs
  endif

  ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/sbsa-linux/lib/stubs
  endif

  ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-androideabi/lib/stubs
  endif

  ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux-androideabi/lib/stubs
  endif

  ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ARMv7-linux-QNX/lib/stubs
  endif

  ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-qnx/lib/stubs
    ifdef TARGET_OVERRIDE
        CUDA_SEARCH_PATH := $(CUDA_PATH)/targets/$(TARGET_OVERRIDE)/lib/stubs
    endif
  endif

  ifeq ($(TARGET_ARCH),ppc64le)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ppc64le-linux/lib/stubs
  endif

  ifeq ($(HOST_ARCH),ppc64le)
    CUDA_SEARCH_PATH += $(CUDA_PATH)/lib64/stubs
  endif

  CUDALIB ?= $(shell find -L $(CUDA_SEARCH_PATH) -maxdepth 1 -name libcuda.so 2> /dev/null)
  ifeq ("$(CUDALIB)","")
    $(info >>> WARNING - libcuda.so not found, CUDA Driver is not installed.  Please re-install the driver. <<<)
    SAMPLE_ENABLED := 0
  else
    CUDALIB := $(shell echo $(CUDALIB) | sed "s/ .*//" | sed "s/\/libcuda.so//" )
    LIBRARIES += -L$(CUDALIB) -lcuda
  endif
endif

ALL_CCFLAGS += --threads 0 --std=c++11

LIBRARIES += -lcudart_static

ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= @echo "[@]"
endif

################################################################################

# Target rules
all: build

build: ptxjit $(PTX_FILE)

check.deps:
ifeq ($(SAMPLE_ENABLED),0)
        @echo "Sample will be waived due to the above missing dependencies"
else
        @echo "Sample is ready - all dependencies have been met"
endif

$(PTX_FILE): kernel.cu
        $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -rdc=true -o $@ -ptx $<

ptxjit.o:ptxjit.cpp
        $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<

ptxjit: ptxjit.o
        $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)

run: build
        $(EXEC) ./ptxjit

clean:
        rm -f ptxjit ptxjit.o  $(PTX_FILE)

clobber: clean
$ make clean
>>> GCC Version is greater or equal to 4.7.0 <<<
rm -f ptxjit ptxjit.o  kernel.ptx
$ make
>>> GCC Version is greater or equal to 4.7.0 <<<
/usr/local/cuda/bin/nvcc -ccbin g++ -I/usr/local/cuda/samples/common/inc  -m64    --threads 0 --std=c++11 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_70,code=compute_70 -o ptxjit.o -c ptxjit.cpp
/usr/local/cuda/bin/nvcc -ccbin g++   -m64      -gencode arch=compute_70,code=sm_70 -gencode arch=compute_70,code=compute_70 -o ptxjit ptxjit.o  -L/usr/local/cuda/lib64/stubs -lcuda -lcudart_static
/usr/local/cuda/bin/nvcc -ccbin g++ -I/usr/local/cuda/samples/common/inc  -m64    --threads 0 --std=c++11 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_70,code=compute_70 -rdc=true -o kernel.ptx -ptx kernel.cu
$ cuda-memcheck ./ptxjit
========= CUDA-MEMCHECK
[CDP Recursion Test (Driver API)] - Starting...
> Using CUDA Device [0]: Tesla V100-PCIE-32GB
> findModulePath <./kernel.ptx>
> initCUDA loading module: <./kernel.ptx>
Loading ptxjit_kernel[] program
CUDA Link Completed in 0.000000ms. Linker Output:

CUDA kernel launched
kernel level 4
kernel level 3
kernel level 2
kernel level 1
========= ERROR SUMMARY: 0 errors
$

在 Windows 上，您将使用上述文件遵循相同的路径。从 ptxjit 项目开始。为简单起见，您可能需要重命名内核，使其与 ptxjit 项目中使用的内核文件的名称完全匹配。

这是我使用 VS2019 遵循的详细步骤：

打开 ptxjit 解决方案，在我的机器上它在这里：C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\6_Advanced\ptxjit
从上述 linux 版本中获取 ptxjit.cpp 代码，并用它来替换该解决方案/项目中 ptxjit.cpp 的内容。
将定义语句改回：#define PTX_FILE "ptxjit_kernel64.ptx"
更改设备运行时库的位置以匹配您的机器。特别是这一行：myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_LIBRARY, "/usr/local/cuda/lib64/libcudadevrt.a", 0, NULL, NULL); 需要更改为 myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_LIBRARY, "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.1\\lib\\x64\\cudadevrt.lib", 0, NULL, NULL);
在该项目的 ptxjit_kernel.cu 文件中，将该文件的内容替换为上述 linux 版本的 kernel.cu 文件内容。
在解决方案资源管理器窗口中，右键单击 ptxjit_kernel.cu 文件并选择“属性”。在左侧的“配置属性”窗格中，展开 CUDA C/C++ 部分，然后选择“通用”。在右侧窗格中，将“生成可重定位设备代码”选项从“否”更改为“是”。点击“确定”。
在同一解决方案资源管理器窗口中，右键单击 ptxjit 项目，然后选择属性。进入配置属性...CUDA Linker...General，将“Perform Device Link”从 Yes 更改为 No。点击“OK”。
选择构建...重建解决方案

当我这样做时，我会得到这样的构建控制台输出：

1>------ Rebuild All started: Project: ptxjit, Configuration: Debug x64 ------
1>Compiling CUDA source file ptxjit_kernel.cu...
1>
1>C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\6_Advanced\ptxjit>"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\bin\nvcc.exe" -gencode=arch=compute_35,code=\"compute_35,compute_35\" --use-local-env -ccbin "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.26.28801\bin\HostX86\x64" -x cu -rdc=true  -I./ -I../../common/inc -I./ -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\/include" -I../../common/inc -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\include" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\include"  -G   --keep-dir x64\Debug -maxrregcount=0  --machine 64 -ptx -cudart static -Xcompiler "/wd 4819" -o data/ptxjit_kernel64.ptx "C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\6_Advanced\ptxjit\ptxjit_kernel.cu"
1>CUDACOMPILE : nvcc warning : The 'compute_35', 'compute_37', 'compute_50', 'sm_35', 'sm_37' and 'sm_50' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning).
1>ptxjit_kernel.cu
1>Done building project "ptxjit_vs2019.vcxproj".
1>ptxjit.cpp
1>C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\6_Advanced\ptxjit\ptxjit.cpp(318,41): warning C4312: 'type cast': conversion from 'long' to 'void *' of greater size
1>C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\6_Advanced\ptxjit\ptxjit.cpp(324,41): warning C4312: 'type cast': conversion from 'long' to 'void *' of greater size
1>LINK : ..\..\bin\win64\Debug\\ptxjit.exe not found or not built by the last incremental link; performing full link
1>   Creating library ../../bin/win64/Debug/ptxjit.lib and object ../../bin/win64/Debug/ptxjit.exp
1>ptxjit_vs2019.vcxproj -> C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\6_Advanced\ptxjit\../../bin/win64/Debug/ptxjit.exe
1>Done building project "ptxjit_vs2019.vcxproj".
========== Rebuild All: 1 succeeded, 0 failed, 0 skipped ==========

此时我们可以转到exe的指定位置：

C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\6_Advanced\ptxjit\../../bin/win64/Debug/ptxjit.exe

并在命令控制台中运行它。当我这样做时，我会看到这样的输出：

C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\bin\win64\Debug>ptxjit.exe
[CDP Recursion Test (Driver API)] - Starting...
> Using CUDA Device [0]: Quadro P4000
sdkFindFilePath <ptxjit_kernel64.ptx> in ./
...
sdkFindFilePath <ptxjit_kernel64.ptx> in ../../../6_Advanced/ptxjit/data/
> findModulePath <../../../6_Advanced/ptxjit/data/ptxjit_kernel64.ptx>
> initCUDA loading module: <../../../6_Advanced/ptxjit/data/ptxjit_kernel64.ptx>
Loading ptxjit_kernel[] program
CUDA Link Completed in -107374176.000000ms. Linker Output:

CUDA kernel launched
kernel level 4
kernel level 3
kernel level 2
kernel level 1

C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\bin\win64\Debug>

注意事项：

上面编译中的 C4312 警告在原始项目中，可以通过在相关行上从 long 切换到 long long 来删除。这不是一个实际问题。
可以通过将 ptx 文件从其位置复制到 exe 文件位置来缩短运行时打印的 sdkFindFilePath 消息的扩展序列。最终的 sdkFindFilePath 输出将告诉您它在哪里找到了 ptx 文件。

【讨论】：

该解决方案运行顺利。进一步探索 - 有没有办法在仍然使用驱动程序 API 接口的同时避免这种“运行时的 jit 链接”过程？这种链接大大减慢了那些需要该模块的主机线程的运行时间。

【解决方案2】：

回复在我的other answer 下发布的问题：

有没有办法在仍然使用驱动程序 API 接口的同时避免这种“运行时 jit 链接”过程

是的。 （我提供了一个单独的答案，因为我在之前的答案中遇到了字符限制）。

在这种情况下，我们希望在内核代码本身的编译期间创建a fatbin object 而不是ptx。该 fatbin 需要使用 -rdc=true 编译，正如您对动态并行性所期望的那样，还需要与 CUDA 设备运行时库一起进行设备链接。

在这种情况下，主机端机制更简单，因为我们不需要任何链接步骤。似乎比较接近此流程的 CUDA 示例代码是 vectorAddDrv，因此我将从该代码/示例项目开始以演示这一点。

这里是linux版本：

$ cat vectorAdd_kernel.cu
#include <cstdio>
extern "C" __global__ void k(int N)
{
    printf("kernel level %d\n", N);
    if ((N > 1) && (threadIdx.x == 0))  k<<<1,1>>>(N-1);
}
$ cat vectorAddDrv.cpp
// Includes
#include <stdio.h>
#include <string.h>
#include <iostream>
#include <cstring>
#include <cuda.h>

// includes, project
#include <helper_cuda_drvapi.h>
#include <helper_functions.h>

// includes, CUDA
#include <builtin_types.h>

using namespace std;

// Variables
CUdevice cuDevice;
CUcontext cuContext;
CUmodule cuModule;
CUfunction vecAdd_kernel;

// Functions
bool findModulePath(const char *, string &, char **, string &);

//define input fatbin file
#ifndef FATBIN_FILE
#define FATBIN_FILE "vectorAdd_kernel64.fatbin"
#endif

// Host code
int main(int argc, char **argv)
{
    printf("Linked CDP demo (Driver API)\n");
    int N = 4, devID = 0;

    // Initialize
    checkCudaErrors(cuInit(0));

    cuDevice = findCudaDeviceDRV(argc, (const char **)argv);
    // Create context
    checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice));

    // first search for the module path before we load the results
    string module_path;

    std::ostringstream fatbin;

    if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin))
    {
        exit(EXIT_FAILURE);
    }
    else
    {
        printf("> initCUDA loading module: <%s>\n", module_path.c_str());
    }

    if (!fatbin.str().size())
    {
        printf("fatbin file empty. exiting..\n");
        exit(EXIT_FAILURE);
    }

    // Create module from binary file (FATBIN)
    checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));

    // Get function handle from module
    checkCudaErrors(cuModuleGetFunction(&vecAdd_kernel, cuModule, "k"));


        // Grid/Block configuration
        int threadsPerBlock = 1;
        int blocksPerGrid   = 1;

        void *args[] = { &N };

        // Launch the CUDA kernel
        checkCudaErrors(cuLaunchKernel(vecAdd_kernel,  blocksPerGrid, 1, 1,
                               threadsPerBlock, 1, 1,
                               0,
                               NULL, args, NULL));

    checkCudaErrors(cuCtxSynchronize());

    exit(EXIT_SUCCESS);
}
$ cat Makefile
# Location of the CUDA Toolkit
CUDA_PATH ?= /usr/local/cuda

##############################
# start deprecated interface #
##############################
ifeq ($(x86_64),1)
    $(info WARNING - x86_64 variable has been deprecated)
    $(info WARNING - please use TARGET_ARCH=x86_64 instead)
    TARGET_ARCH ?= x86_64
endif
ifeq ($(ARMv7),1)
    $(info WARNING - ARMv7 variable has been deprecated)
    $(info WARNING - please use TARGET_ARCH=armv7l instead)
    TARGET_ARCH ?= armv7l
endif
ifeq ($(aarch64),1)
    $(info WARNING - aarch64 variable has been deprecated)
    $(info WARNING - please use TARGET_ARCH=aarch64 instead)
    TARGET_ARCH ?= aarch64
endif
ifeq ($(ppc64le),1)
    $(info WARNING - ppc64le variable has been deprecated)
    $(info WARNING - please use TARGET_ARCH=ppc64le instead)
    TARGET_ARCH ?= ppc64le
endif
ifneq ($(GCC),)
    $(info WARNING - GCC variable has been deprecated)
    $(info WARNING - please use HOST_COMPILER=$(GCC) instead)
    HOST_COMPILER ?= $(GCC)
endif
ifneq ($(abi),)
    $(error ERROR - abi variable has been removed)
endif
############################
# end deprecated interface #
############################

# architecture
HOST_ARCH   := $(shell uname -m)
TARGET_ARCH ?= $(HOST_ARCH)
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
    ifneq ($(TARGET_ARCH),$(HOST_ARCH))
        ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
            TARGET_SIZE := 64
        else ifneq (,$(filter $(TARGET_ARCH),armv7l))
            TARGET_SIZE := 32
        endif
    else
        TARGET_SIZE := $(shell getconf LONG_BIT)
    endif
else
    $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
endif

# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
ifeq ($(HOST_ARCH),aarch64)
    ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
        HOST_ARCH := sbsa
        TARGET_ARCH := sbsa
    endif
endif

ifneq ($(TARGET_ARCH),$(HOST_ARCH))
    ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
        $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
    endif
endif

# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
    TARGET_ARCH = armv7l
endif

# operating system
HOST_OS   := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
TARGET_OS ?= $(HOST_OS)
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
    $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
endif

# host compiler
ifeq ($(TARGET_OS),darwin)
    ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
        HOST_COMPILER ?= clang++
    endif
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
    ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
        ifeq ($(TARGET_OS),linux)
            HOST_COMPILER ?= arm-linux-gnueabihf-g++
        else ifeq ($(TARGET_OS),qnx)
            ifeq ($(QNX_HOST),)
                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
            endif
            ifeq ($(QNX_TARGET),)
                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
            endif
            export QNX_HOST
            export QNX_TARGET
            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
        else ifeq ($(TARGET_OS),android)
            HOST_COMPILER ?= arm-linux-androideabi-g++
        endif
    else ifeq ($(TARGET_ARCH),aarch64)
        ifeq ($(TARGET_OS), linux)
            HOST_COMPILER ?= aarch64-linux-gnu-g++
        else ifeq ($(TARGET_OS),qnx)
            ifeq ($(QNX_HOST),)
                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
            endif
            ifeq ($(QNX_TARGET),)
                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
            endif
            export QNX_HOST
            export QNX_TARGET
            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
        else ifeq ($(TARGET_OS), android)
            HOST_COMPILER ?= aarch64-linux-android-clang++
        endif
    else ifeq ($(TARGET_ARCH),sbsa)
        HOST_COMPILER ?= aarch64-linux-gnu-g++
    else ifeq ($(TARGET_ARCH),ppc64le)
        HOST_COMPILER ?= powerpc64le-linux-gnu-g++
    endif
endif
HOST_COMPILER ?= g++
NVCC          := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)

# internal flags
NVCCFLAGS   := -m${TARGET_SIZE}
CCFLAGS     :=
LDFLAGS     :=

# build flags
ifeq ($(TARGET_OS),darwin)
    LDFLAGS += -rpath $(CUDA_PATH)/lib
    CCFLAGS += -arch $(HOST_ARCH)
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
    LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
    CCFLAGS += -mfloat-abi=hard
else ifeq ($(TARGET_OS),android)
    LDFLAGS += -pie
    CCFLAGS += -fpie -fpic -fexceptions
endif

ifneq ($(TARGET_ARCH),$(HOST_ARCH))
    ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
        ifneq ($(TARGET_FS),)
            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
            ifeq ($(GCCVERSIONLTEQ46),1)
                CCFLAGS += --sysroot=$(TARGET_FS)
            endif
            LDFLAGS += --sysroot=$(TARGET_FS)
            LDFLAGS += -rpath-link=$(TARGET_FS)/lib
            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
        endif
    endif
    ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
        ifneq ($(TARGET_FS),)
            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
            ifeq ($(GCCVERSIONLTEQ46),1)
                CCFLAGS += --sysroot=$(TARGET_FS)
            endif
            LDFLAGS += --sysroot=$(TARGET_FS)
            LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
            LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
            LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
            CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
            CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
        endif
    endif
    ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
        NVCCFLAGS += -D_QNX_SOURCE
        NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le
        CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
        LDFLAGS += -lsocket
        LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
        CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
        ifdef TARGET_OVERRIDE
            LDFLAGS += -lslog2
        endif

        ifneq ($(TARGET_FS),)
            LDFLAGS += -L$(TARGET_FS)/usr/lib
            CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
            LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
            CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
            CCFLAGS += -I$(TARGET_FS)/../include
        endif
    endif
endif

ifdef TARGET_OVERRIDE # cuda toolkit targets override
    NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
endif

# Install directory of different arch
CUDA_INSTALL_TARGET_DIR :=
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
    CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
    CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
    CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
else ifeq ($(TARGET_ARCH),ppc64le)
    CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
endif

# Debug build flags
ifeq ($(dbg),1)
      NVCCFLAGS += -g -G
      BUILD_TYPE := debug
else
      BUILD_TYPE := release
endif

ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))

UBUNTU = $(shell lsb_release -i -s 2>/dev/null | grep -i ubuntu)

SAMPLE_ENABLED := 1

ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))

# Common includes and paths for CUDA
INCLUDES  := -I$(CUDA_PATH)/samples/common/inc
LIBRARIES :=

################################################################################

FATBIN_FILE := vectorAdd_kernel${TARGET_SIZE}.fatbin

#Detect if installed version of GCC supports required C++11
ifeq ($(TARGET_OS),linux)
    empty :=
    space := $(empty) $(empty)
    GCCVERSIONSTRING := $(shell expr `$(HOST_COMPILER) -dumpversion`)
#Create version number without "."
    GCCVERSION := $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f1 -d.)
    GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f2 -d.)
    GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f3 -d.)
# Make sure the version number has at least 3 decimals
    GCCVERSION += 00
# Remove spaces from the version number
    GCCVERSION := $(subst $(space),$(empty),$(GCCVERSION))
#$(warning $(GCCVERSION))

    IS_MIN_VERSION := $(shell expr `echo $(GCCVERSION)` \>= 47000)

    ifeq ($(IS_MIN_VERSION), 1)
        $(info >>> GCC Version is greater or equal to 4.7.0 <<<)
    else
        $(info >>> Waiving build. Minimum GCC version required is 4.7.0<<<)
        SAMPLE_ENABLED := 0
    endif
endif

# Gencode arguments
SMS ?= 52 60 61 70 75 80 86

ifeq ($(GENCODE_FLAGS),)
# Generate SASS code for each SM architecture listed in $(SMS)
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))

ifeq ($(SMS),)
# Generate PTX code from SM 35
GENCODE_FLAGS += -gencode arch=compute_35,code=compute_35
endif

# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
HIGHEST_SM := $(lastword $(sort $(SMS)))
ifneq ($(HIGHEST_SM),)
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
endif
endif

ifeq ($(TARGET_OS),darwin)
  ALL_LDFLAGS += -Xcompiler -F/Library/Frameworks -Xlinker -framework -Xlinker CUDA
else
  ifeq ($(TARGET_ARCH),x86_64)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/lib64/stubs
    CUDA_SEARCH_PATH += $(CUDA_PATH)/targets/x86_64-linux/lib/stubs
  endif

  ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-gnueabihf/lib/stubs
  endif

  ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux/lib/stubs
  endif

  ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/sbsa-linux/lib/stubs
  endif

  ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-androideabi/lib/stubs
  endif

  ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux-androideabi/lib/stubs
  endif

  ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ARMv7-linux-QNX/lib/stubs
  endif

  ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-qnx/lib/stubs
    ifdef TARGET_OVERRIDE
        CUDA_SEARCH_PATH := $(CUDA_PATH)/targets/$(TARGET_OVERRIDE)/lib/stubs
    endif
  endif

  ifeq ($(TARGET_ARCH),ppc64le)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ppc64le-linux/lib/stubs
  endif

  ifeq ($(HOST_ARCH),ppc64le)
    CUDA_SEARCH_PATH += $(CUDA_PATH)/lib64/stubs
  endif

  CUDALIB ?= $(shell find -L $(CUDA_SEARCH_PATH) -maxdepth 1 -name libcuda.so 2> /dev/null)
  ifeq ("$(CUDALIB)","")
    $(info >>> WARNING - libcuda.so not found, CUDA Driver is not installed.  Please re-install the driver. <<<)
    SAMPLE_ENABLED := 0
  else
    CUDALIB := $(shell echo $(CUDALIB) | sed "s/ .*//" | sed "s/\/libcuda.so//" )
    LIBRARIES += -L$(CUDALIB) -lcuda
  endif
endif

ALL_CCFLAGS += --threads 0 --std=c++11

ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= @echo "[@]"
endif

################################################################################

# Target rules
all: build

build: vectorAddDrv $(FATBIN_FILE)

check.deps:
ifeq ($(SAMPLE_ENABLED),0)
        @echo "Sample will be waived due to the above missing dependencies"
else
        @echo "Sample is ready - all dependencies have been met"
endif

$(FATBIN_FILE): vectorAdd_kernel.cu
        $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -rdc=true -lcudadevrt -dlink  -o $@ -fatbin $<

vectorAddDrv.o:vectorAddDrv.cpp
        $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<

vectorAddDrv: vectorAddDrv.o
        $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)

run: build
        $(EXEC) ./vectorAddDrv

clean:
        rm -f vectorAddDrv vectorAddDrv.o  $(FATBIN_FILE)

clobber: clean
$ make clean
>>> GCC Version is greater or equal to 4.7.0 <<<
rm -f vectorAddDrv vectorAddDrv.o  vectorAdd_kernel64.fatbin
$ make
>>> GCC Version is greater or equal to 4.7.0 <<<
/usr/local/cuda/bin/nvcc -ccbin g++ -I/usr/local/cuda/samples/common/inc  -m64    --threads 0 --std=c++11 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_86,code=compute_86 -o vectorAddDrv.o -c vectorAddDrv.cpp
/usr/local/cuda/bin/nvcc -ccbin g++   -m64      -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_86,code=compute_86 -o vectorAddDrv vectorAddDrv.o  -L/usr/local/cuda/lib64/stubs -lcuda
/usr/local/cuda/bin/nvcc -ccbin g++ -I/usr/local/cuda/samples/common/inc  -m64    --threads 0 --std=c++11 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_86,code=compute_86 -rdc=true -lcudadevrt -dlink  -o vectorAdd_kernel64.fatbin -fatbin vectorAdd_kernel.cu
$ cuda-memcheck ./vectorAddDrv
========= CUDA-MEMCHECK
Linked CDP demo (Driver API)
> Using CUDA Device [0]: Tesla V100-PCIE-32GB
> findModulePath found file at <./vectorAdd_kernel64.fatbin>
> initCUDA loading module: <./vectorAdd_kernel64.fatbin>
kernel level 4
kernel level 3
kernel level 2
kernel level 1
========= ERROR SUMMARY: 0 errors
$

在 Windows/VS 2019/CUDA 11.1 上，我按照以下步骤操作：

打开vectorAddDrv 项目/解决方案，在我的机器上它位于：C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\0_Simple\vectorAddDrv
将 vectorAddDrv.cpp 文件中的代码替换为上述 linux 示例中同一文件中的代码。
将 vectorAdd_kernel.cu 文件中的代码替换为上述 linux 示例中同一文件中的代码。
在左侧的解决方案资源管理器窗格中，右键单击vectorAdd_kernel.cu 文件，然后打开属性。然后在 Configuration Properties...CUDA C/C++...Common 中将“Generate Relocatable Device Code”从 No 更改为 Yes。然后在配置属性...CUDA C/C++...命令行中添加-dlink。还要确保配置属性...CUDA C/C++...设备...代码生成与您要运行的设备相匹配。点击“确定”。
在左侧相同的解决方案资源管理器窗格中，右键单击 vectorAddDrv project，选择 Properties，然后在 Configuration Properties...CUDA Linker...General 将“Perform Device Link”从是对否。点击“确定”。
选择构建...重新构建解决方案。

当我这样做时，我会看到如下控制台构建输出：

1>------ Rebuild All started: Project: vectorAddDrv, Configuration: Debug x64 ------
1>Compiling CUDA source file vectorAdd_kernel.cu...
1>
1>C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\0_Simple\vectorAddDrv>"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\bin\nvcc.exe" -gencode=arch=compute_61,code=sm_61 --use-local-env -ccbin "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.26.28801\bin\HostX86\x64" -x cu -rdc=true  -I./ -I../../common/inc -I./ -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\/include" -I../../common/inc -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\include"  -G   --keep-dir x64\Debug -maxrregcount=0  --machine 64 -fatbin -cudart static -dlink -Xcompiler "/wd 4819" -o data/vectorAdd_kernel64.fatbin "C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\0_Simple\vectorAddDrv\vectorAdd_kernel.cu"
1>vectorAdd_kernel.cu
1>vectorAddDrv.cpp
1>LINK : ..\..\bin\win64\Debug\\vectorAddDrv.exe not found or not built by the last incremental link; performing full link
1>vectorAddDrv_vs2019.vcxproj -> C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\0_Simple\vectorAddDrv\../../bin/win64/Debug/vectorAddDrv.exe
========== Rebuild All: 1 succeeded, 0 failed, 0 skipped ==========

如果我们随后打开命令提示符并导航到可执行文件的指定位置并运行它，我看到了：

C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\bin\win64\Debug>vectorAddDrv
Linked CDP demo (Driver API)
> Using CUDA Device [0]: Quadro P4000
sdkFindFilePath <vectorAdd_kernel64.fatbin> in ./
...
sdkFindFilePath <vectorAdd_kernel64.fatbin> in ../../../0_Simple/vectorAddDrv/data/
> findModulePath found file at <../../../0_Simple/vectorAddDrv/data/vectorAdd_kernel64.fatbin>
> initCUDA loading module: <../../../0_Simple/vectorAddDrv/data/vectorAdd_kernel64.fatbin>
kernel level 4
kernel level 3
kernel level 2
kernel level 1

C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\bin\win64\Debug>

另一个答案中的一个注释也适用于此：通过将 fatbin 文件从其位置复制到 exe 文件的位置，可以缩短运行时打印的 sdkFindFilePath 消息的扩展序列。最终的 sdkFindFilePath 输出会告诉你它在哪里找到了 fatbin 文件。

【讨论】：