我们可以在这里构建一个完整的示例,从 ptxjit CUDA sample code 开始,然后按照附加说明 here。这基本上就是您所需要的一切。
这是一个完整的 linux 示例,包括 Makefile:
$ cat kernel.cu
#include <cstdio>
extern "C" __global__ void k(int N)
{
printf("kernel level %d\n", N);
if ((N > 1) && (threadIdx.x == 0)) k<<<1,1>>>(N-1);
}
$ cat ptxjit.cpp
/*
*/
// System includes
#include <math.h>
#include <stdio.h>
#include <string.h>
#include <iostream>
// CUDA driver & runtime
#include <cuda.h>
#include <cuda_runtime.h>
// helper functions and utilities to work with CUDA
#define CUDA_DRIVER_API
#include <helper_cuda.h>
#include <helper_cuda_drvapi.h>
#include <helper_functions.h> // helper for shared that are common to CUDA Samples
#define PTX_FILE "kernel.ptx"
const char *sSDKname = "CDP Recursion Test (Driver API)";
bool inline findModulePath(const char *module_file, std::string &module_path,
char **argv, std::string &ptx_source) {
char *actual_path = sdkFindFilePath(module_file, argv[0]);
if (actual_path) {
module_path = actual_path;
} else {
printf("> findModulePath file not found: <%s> \n", module_file);
return false;
}
if (module_path.empty()) {
printf("> findModulePath file not found: <%s> \n", module_file);
return false;
} else {
printf("> findModulePath <%s>\n", module_path.c_str());
if (module_path.rfind(".ptx") != std::string::npos) {
FILE *fp = fopen(module_path.c_str(), "rb");
fseek(fp, 0, SEEK_END);
int file_size = ftell(fp);
char *buf = new char[file_size + 1];
fseek(fp, 0, SEEK_SET);
fread(buf, sizeof(char), file_size, fp);
fclose(fp);
buf[file_size] = '\0';
ptx_source = buf;
delete[] buf;
}
return true;
}
}
void ptxJIT(int argc, char **argv, CUmodule *phModule, CUfunction *phKernel,
CUlinkState *lState) {
const int options_num = 5;
CUjit_option options[options_num];
void *optionVals[options_num];
float walltime;
char error_log[8192], info_log[8192];
unsigned int logSize = 8192;
void *cuOut;
size_t outSize;
int myErr = 0;
std::string module_path, ptx_source;
// Setup linker options
// Return walltime from JIT compilation
options[0] = CU_JIT_WALL_TIME;
optionVals[0] = (void *)&walltime;
// Pass a buffer for info messages
options[1] = CU_JIT_INFO_LOG_BUFFER;
optionVals[1] = (void *)info_log;
// Pass the size of the info buffer
options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
optionVals[2] = (void *)(long)logSize;
// Pass a buffer for error message
options[3] = CU_JIT_ERROR_LOG_BUFFER;
optionVals[3] = (void *)error_log;
// Pass the size of the error buffer
options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
optionVals[4] = (void *)(long)logSize;
// Create a pending linker invocation
checkCudaErrors(cuLinkCreate(options_num, options, optionVals, lState));
// first search for the module path before we load the results
if (!findModulePath(PTX_FILE, module_path, argv, ptx_source)) {
printf("> findModulePath could not find <kernel> ptx\n");
exit(EXIT_FAILURE);
} else {
printf("> initCUDA loading module: <%s>\n", module_path.c_str());
}
// Load the PTX from the ptx file
printf("Loading ptxjit_kernel[] program\n");
myErr = cuLinkAddData(*lState, CU_JIT_INPUT_PTX, (void *)ptx_source.c_str(),
strlen(ptx_source.c_str()) + 1, 0, 0, 0, 0);
if (myErr != CUDA_SUCCESS) {
// Errors will be put in error_log, per CU_JIT_ERROR_LOG_BUFFER option
// above.
fprintf(stderr, "PTX Linker Error:\n%s\n", error_log);
}
myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_LIBRARY, "/usr/local/cuda/lib64/libcudadevrt.a", 0, NULL, NULL);
if (myErr != CUDA_SUCCESS) {
// Errors will be put in error_log, per CU_JIT_ERROR_LOG_BUFFER option
// above.
fprintf(stderr, "Library Linker Error:\n%s\n", error_log);
}
// Complete the linker step
checkCudaErrors(cuLinkComplete(*lState, &cuOut, &outSize));
// Linker walltime and info_log were requested in options above.
printf("CUDA Link Completed in %fms. Linker Output:\n%s\n", walltime,
info_log);
// Load resulting cuBin into module
checkCudaErrors(cuModuleLoadData(phModule, cuOut));
// Locate the kernel entry poin
checkCudaErrors(cuModuleGetFunction(phKernel, *phModule, "k"));
// Destroy the linker invocation
checkCudaErrors(cuLinkDestroy(*lState));
}
// Variables
CUcontext cuContext;
int main(int argc, char **argv) {
const unsigned int nThreads = 1;
const unsigned int nBlocks = 1;
CUmodule hModule = 0;
CUfunction hKernel = 0;
CUlinkState lState;
int cuda_device = 0;
printf("[%s] - Starting...\n", sSDKname);
// Initialize
checkCudaErrors(cuInit(0));
CUdevice dev = findCudaDeviceDRV(argc, (const char **)argv);
int driverVersion;
cudaDriverGetVersion(&driverVersion);
if (driverVersion < CUDART_VERSION) {
printf("driverVersion = %d < CUDART_VERSION = %d \n"
"Enhanced compatibility is not supported for this sample.. waving execution\n", driverVersion, CUDART_VERSION);
exit(EXIT_WAIVED);
}
// Create context
checkCudaErrors(cuCtxCreate(&cuContext, 0, dev));
// JIT Compile the Kernel from PTX and get the Handles (Driver API)
ptxJIT(argc, argv, &hModule, &hKernel, &lState);
// Set the kernel parameters (Driver API)
dim3 block(nThreads, 1, 1);
dim3 grid(nBlocks, 1, 1);
int my_N = 4;
void *args[1] = {&my_N};
// Launch the kernel (Driver API_)
checkCudaErrors(cuLaunchKernel(hKernel, grid.x, grid.y, grid.z, block.x,
block.y, block.z, 0, NULL, args, NULL));
std::cout << "CUDA kernel launched" << std::endl;
cuCtxSynchronize();
if (hModule) {
checkCudaErrors(cuModuleUnload(hModule));
hModule = 0;
}
return EXIT_SUCCESS;
}
$ cat Makefile
# Location of the CUDA Toolkit
CUDA_PATH ?= /usr/local/cuda
##############################
# start deprecated interface #
##############################
ifeq ($(x86_64),1)
$(info WARNING - x86_64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=x86_64 instead)
TARGET_ARCH ?= x86_64
endif
ifeq ($(ARMv7),1)
$(info WARNING - ARMv7 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=armv7l instead)
TARGET_ARCH ?= armv7l
endif
ifeq ($(aarch64),1)
$(info WARNING - aarch64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=aarch64 instead)
TARGET_ARCH ?= aarch64
endif
ifeq ($(ppc64le),1)
$(info WARNING - ppc64le variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=ppc64le instead)
TARGET_ARCH ?= ppc64le
endif
ifneq ($(GCC),)
$(info WARNING - GCC variable has been deprecated)
$(info WARNING - please use HOST_COMPILER=$(GCC) instead)
HOST_COMPILER ?= $(GCC)
endif
ifneq ($(abi),)
$(error ERROR - abi variable has been removed)
endif
############################
# end deprecated interface #
############################
# architecture
HOST_ARCH := $(shell uname -m)
TARGET_ARCH ?= $(HOST_ARCH)
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
TARGET_SIZE := 64
else ifneq (,$(filter $(TARGET_ARCH),armv7l))
TARGET_SIZE := 32
endif
else
TARGET_SIZE := $(shell getconf LONG_BIT)
endif
else
$(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
endif
# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
ifeq ($(HOST_ARCH),aarch64)
ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
HOST_ARCH := sbsa
TARGET_ARCH := sbsa
endif
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
$(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
endif
endif
# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
TARGET_ARCH = armv7l
endif
# operating system
HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
TARGET_OS ?= $(HOST_OS)
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
$(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
endif
# host compiler
ifeq ($(TARGET_OS),darwin)
ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
HOST_COMPILER ?= clang++
endif
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
ifeq ($(TARGET_OS),linux)
HOST_COMPILER ?= arm-linux-gnueabihf-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
else ifeq ($(TARGET_OS),android)
HOST_COMPILER ?= arm-linux-androideabi-g++
endif
else ifeq ($(TARGET_ARCH),aarch64)
ifeq ($(TARGET_OS), linux)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
else ifeq ($(TARGET_OS), android)
HOST_COMPILER ?= aarch64-linux-android-clang++
endif
else ifeq ($(TARGET_ARCH),sbsa)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_ARCH),ppc64le)
HOST_COMPILER ?= powerpc64le-linux-gnu-g++
endif
endif
HOST_COMPILER ?= g++
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
# internal flags
NVCCFLAGS := -m${TARGET_SIZE}
CCFLAGS :=
LDFLAGS :=
# build flags
ifeq ($(TARGET_OS),darwin)
LDFLAGS += -rpath $(CUDA_PATH)/lib
CCFLAGS += -arch $(HOST_ARCH)
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
CCFLAGS += -mfloat-abi=hard
else ifeq ($(TARGET_OS),android)
LDFLAGS += -pie
CCFLAGS += -fpie -fpic -fexceptions
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
NVCCFLAGS += -D_QNX_SOURCE
NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le
CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
LDFLAGS += -lsocket
LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
ifdef TARGET_OVERRIDE
LDFLAGS += -lslog2
endif
ifneq ($(TARGET_FS),)
LDFLAGS += -L$(TARGET_FS)/usr/lib
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
CCFLAGS += -I$(TARGET_FS)/../include
endif
endif
endif
ifdef TARGET_OVERRIDE # cuda toolkit targets override
NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
endif
# Install directory of different arch
CUDA_INSTALL_TARGET_DIR :=
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
else ifeq ($(TARGET_ARCH),ppc64le)
CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
endif
# Debug build flags
ifeq ($(dbg),1)
NVCCFLAGS += -g -G
BUILD_TYPE := debug
else
BUILD_TYPE := release
endif
ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
UBUNTU = $(shell lsb_release -i -s 2>/dev/null | grep -i ubuntu)
SAMPLE_ENABLED := 1
ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
# Common includes and paths for CUDA
INCLUDES := -I$(CUDA_PATH)/samples/common/inc
LIBRARIES :=
################################################################################
PTX_FILE := kernel.ptx
#Detect if installed version of GCC supports required C++11
ifeq ($(TARGET_OS),linux)
empty :=
space := $(empty) $(empty)
GCCVERSIONSTRING := $(shell expr `$(HOST_COMPILER) -dumpversion`)
#Create version number without "."
GCCVERSION := $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f1 -d.)
GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f2 -d.)
GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f3 -d.)
# Make sure the version number has at least 3 decimals
GCCVERSION += 00
# Remove spaces from the version number
GCCVERSION := $(subst $(space),$(empty),$(GCCVERSION))
#$(warning $(GCCVERSION))
IS_MIN_VERSION := $(shell expr `echo $(GCCVERSION)` \>= 47000)
ifeq ($(IS_MIN_VERSION), 1)
$(info >>> GCC Version is greater or equal to 4.7.0 <<<)
else
$(info >>> Waiving build. Minimum GCC version required is 4.7.0<<<)
SAMPLE_ENABLED := 0
endif
endif
# Gencode arguments
SMS ?= 70
ifeq ($(GENCODE_FLAGS),)
# Generate SASS code for each SM architecture listed in $(SMS)
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
ifeq ($(SMS),)
# Generate PTX code from SM 35
GENCODE_FLAGS += -gencode arch=compute_35,code=compute_35
endif
# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
HIGHEST_SM := $(lastword $(sort $(SMS)))
ifneq ($(HIGHEST_SM),)
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
endif
endif
ifeq ($(TARGET_OS),darwin)
ALL_LDFLAGS += -Xcompiler -F/Library/Frameworks -Xlinker -framework -Xlinker CUDA
else
ifeq ($(TARGET_ARCH),x86_64)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/lib64/stubs
CUDA_SEARCH_PATH += $(CUDA_PATH)/targets/x86_64-linux/lib/stubs
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-gnueabihf/lib/stubs
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux/lib/stubs
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/sbsa-linux/lib/stubs
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-androideabi/lib/stubs
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux-androideabi/lib/stubs
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ARMv7-linux-QNX/lib/stubs
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-qnx/lib/stubs
ifdef TARGET_OVERRIDE
CUDA_SEARCH_PATH := $(CUDA_PATH)/targets/$(TARGET_OVERRIDE)/lib/stubs
endif
endif
ifeq ($(TARGET_ARCH),ppc64le)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ppc64le-linux/lib/stubs
endif
ifeq ($(HOST_ARCH),ppc64le)
CUDA_SEARCH_PATH += $(CUDA_PATH)/lib64/stubs
endif
CUDALIB ?= $(shell find -L $(CUDA_SEARCH_PATH) -maxdepth 1 -name libcuda.so 2> /dev/null)
ifeq ("$(CUDALIB)","")
$(info >>> WARNING - libcuda.so not found, CUDA Driver is not installed. Please re-install the driver. <<<)
SAMPLE_ENABLED := 0
else
CUDALIB := $(shell echo $(CUDALIB) | sed "s/ .*//" | sed "s/\/libcuda.so//" )
LIBRARIES += -L$(CUDALIB) -lcuda
endif
endif
ALL_CCFLAGS += --threads 0 --std=c++11
LIBRARIES += -lcudart_static
ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= @echo "[@]"
endif
################################################################################
# Target rules
all: build
build: ptxjit $(PTX_FILE)
check.deps:
ifeq ($(SAMPLE_ENABLED),0)
@echo "Sample will be waived due to the above missing dependencies"
else
@echo "Sample is ready - all dependencies have been met"
endif
$(PTX_FILE): kernel.cu
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -rdc=true -o $@ -ptx $<
ptxjit.o:ptxjit.cpp
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
ptxjit: ptxjit.o
$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
run: build
$(EXEC) ./ptxjit
clean:
rm -f ptxjit ptxjit.o $(PTX_FILE)
clobber: clean
$ make clean
>>> GCC Version is greater or equal to 4.7.0 <<<
rm -f ptxjit ptxjit.o kernel.ptx
$ make
>>> GCC Version is greater or equal to 4.7.0 <<<
/usr/local/cuda/bin/nvcc -ccbin g++ -I/usr/local/cuda/samples/common/inc -m64 --threads 0 --std=c++11 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_70,code=compute_70 -o ptxjit.o -c ptxjit.cpp
/usr/local/cuda/bin/nvcc -ccbin g++ -m64 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_70,code=compute_70 -o ptxjit ptxjit.o -L/usr/local/cuda/lib64/stubs -lcuda -lcudart_static
/usr/local/cuda/bin/nvcc -ccbin g++ -I/usr/local/cuda/samples/common/inc -m64 --threads 0 --std=c++11 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_70,code=compute_70 -rdc=true -o kernel.ptx -ptx kernel.cu
$ cuda-memcheck ./ptxjit
========= CUDA-MEMCHECK
[CDP Recursion Test (Driver API)] - Starting...
> Using CUDA Device [0]: Tesla V100-PCIE-32GB
> findModulePath <./kernel.ptx>
> initCUDA loading module: <./kernel.ptx>
Loading ptxjit_kernel[] program
CUDA Link Completed in 0.000000ms. Linker Output:
CUDA kernel launched
kernel level 4
kernel level 3
kernel level 2
kernel level 1
========= ERROR SUMMARY: 0 errors
$
在 Windows 上,您将使用上述文件遵循相同的路径。从 ptxjit 项目开始。为简单起见,您可能需要重命名内核,使其与 ptxjit 项目中使用的内核文件的名称完全匹配。
这是我使用 VS2019 遵循的详细步骤:
- 打开 ptxjit 解决方案,在我的机器上它在这里:
C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\6_Advanced\ptxjit
- 从上述 linux 版本中获取 ptxjit.cpp 代码,并用它来替换该解决方案/项目中 ptxjit.cpp 的内容。
- 将定义语句改回:
#define PTX_FILE "ptxjit_kernel64.ptx"
- 更改设备运行时库的位置以匹配您的机器。特别是这一行:
myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_LIBRARY, "/usr/local/cuda/lib64/libcudadevrt.a", 0, NULL, NULL); 需要更改为 myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_LIBRARY, "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.1\\lib\\x64\\cudadevrt.lib", 0, NULL, NULL);
- 在该项目的 ptxjit_kernel.cu 文件中,将该文件的内容替换为上述 linux 版本的 kernel.cu 文件内容。
- 在解决方案资源管理器窗口中,右键单击 ptxjit_kernel.cu 文件并选择“属性”。在左侧的“配置属性”窗格中,展开 CUDA C/C++ 部分,然后选择“通用”。在右侧窗格中,将“生成可重定位设备代码”选项从“否”更改为“是”。点击“确定”。
- 在同一解决方案资源管理器窗口中,右键单击 ptxjit 项目,然后选择属性。进入配置属性...CUDA Linker...General,将“Perform Device Link”从 Yes 更改为 No。点击“OK”。
- 选择构建...重建解决方案
当我这样做时,我会得到这样的构建控制台输出:
1>------ Rebuild All started: Project: ptxjit, Configuration: Debug x64 ------
1>Compiling CUDA source file ptxjit_kernel.cu...
1>
1>C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\6_Advanced\ptxjit>"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\bin\nvcc.exe" -gencode=arch=compute_35,code=\"compute_35,compute_35\" --use-local-env -ccbin "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.26.28801\bin\HostX86\x64" -x cu -rdc=true -I./ -I../../common/inc -I./ -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\/include" -I../../common/inc -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\include" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\include" -G --keep-dir x64\Debug -maxrregcount=0 --machine 64 -ptx -cudart static -Xcompiler "/wd 4819" -o data/ptxjit_kernel64.ptx "C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\6_Advanced\ptxjit\ptxjit_kernel.cu"
1>CUDACOMPILE : nvcc warning : The 'compute_35', 'compute_37', 'compute_50', 'sm_35', 'sm_37' and 'sm_50' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning).
1>ptxjit_kernel.cu
1>Done building project "ptxjit_vs2019.vcxproj".
1>ptxjit.cpp
1>C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\6_Advanced\ptxjit\ptxjit.cpp(318,41): warning C4312: 'type cast': conversion from 'long' to 'void *' of greater size
1>C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\6_Advanced\ptxjit\ptxjit.cpp(324,41): warning C4312: 'type cast': conversion from 'long' to 'void *' of greater size
1>LINK : ..\..\bin\win64\Debug\\ptxjit.exe not found or not built by the last incremental link; performing full link
1> Creating library ../../bin/win64/Debug/ptxjit.lib and object ../../bin/win64/Debug/ptxjit.exp
1>ptxjit_vs2019.vcxproj -> C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\6_Advanced\ptxjit\../../bin/win64/Debug/ptxjit.exe
1>Done building project "ptxjit_vs2019.vcxproj".
========== Rebuild All: 1 succeeded, 0 failed, 0 skipped ==========
此时我们可以转到exe的指定位置:
C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\6_Advanced\ptxjit\../../bin/win64/Debug/ptxjit.exe
并在命令控制台中运行它。当我这样做时,我会看到这样的输出:
C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\bin\win64\Debug>ptxjit.exe
[CDP Recursion Test (Driver API)] - Starting...
> Using CUDA Device [0]: Quadro P4000
sdkFindFilePath <ptxjit_kernel64.ptx> in ./
...
sdkFindFilePath <ptxjit_kernel64.ptx> in ../../../6_Advanced/ptxjit/data/
> findModulePath <../../../6_Advanced/ptxjit/data/ptxjit_kernel64.ptx>
> initCUDA loading module: <../../../6_Advanced/ptxjit/data/ptxjit_kernel64.ptx>
Loading ptxjit_kernel[] program
CUDA Link Completed in -107374176.000000ms. Linker Output:
CUDA kernel launched
kernel level 4
kernel level 3
kernel level 2
kernel level 1
C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\bin\win64\Debug>
注意事项:
- 上面编译中的 C4312 警告在原始项目中,可以通过在相关行上从
long 切换到 long long 来删除。这不是一个实际问题。
- 可以通过将 ptx 文件从其位置复制到 exe 文件位置来缩短运行时打印的
sdkFindFilePath 消息的扩展序列。最终的 sdkFindFilePath 输出将告诉您它在哪里找到了 ptx 文件。