未找到 Ctypes 函数答案

【问题标题】：Ctypes function not found未找到 Ctypes 函数
【发布时间】：2021-04-14 22:13:51
【问题描述】：

我尝试使用ctypes 在python 中运行一些cuda 代码。编译并加载.so 文件后，我遇到一个错误，告诉我cuda 函数不存在。我之前尝试过使用纯 c 的示例并且效果很好。我的编译有什么问题吗？

Cuda 代码

#include <stdio.h>
#include <stdlib.h>
#define BLOCK_SIZE 16

struct Matrix {
    int width;
    int height;
    float *elements;
};

__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C){

    // runs for each col - row pair
    float tmpVal = 0;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    for (int i = 0; i < A.width; ++i)
        tmpVal += A.elements[row * A.width + i] *
                  B.elements[i * B.width + col];
    C.elements[ row * C.width + col ] = tmpVal;
}

void mMul( Matrix *A, Matrix *B, Matrix *C ){

    Matrix d_A, d_B, d_C;

    // Matrix d_A
    d_A.width    =   A->width;
    d_A.height   =   A->height;
    size_t sizeA =   A->width * A->height * sizeof(float);
    // dynamically allocate cudaMemory for elemenst array
    cudaMalloc(&d_A.elements, sizeA);
    cudaMemcpy(d_A.elements, A->elements, sizeA, cudaMemcpyHostToDevice);

    // Matrix d_B
    d_B.width    =   B->width;
    d_B.height   =   B->height;
    size_t sizeB =   B->width * B->height * sizeof(float);
    // dynamically allocate cudaMemory for elemenst array
    cudaMalloc(&d_B.elements, sizeB);
    cudaMemcpy(d_B.elements, B->elements, sizeB, cudaMemcpyHostToDevice);

    // Matrix d_C
    d_C.width    =   C->width;
    d_C.height   =   C->height;
    size_t sizeC =   C->width * C->height * sizeof(float);

    // dynamically allocate cudaMemory for elemenst array
    cudaMalloc(&d_C.elements, sizeC);

    // 16 * 16 = 256 threads per block
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);

    // Blocks per grid
    dim3 dimGrid(B->width / dimBlock.x, A->height / dimBlock.y);

    // calling the Kernel
    MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);

    // copy results from result matrix C to the host again
    cudaMemcpy(C->elements, d_C.elements, sizeC, cudaMemcpyDeviceToHost);

    // free the cuda memory
    cudaFree(d_A.elements);
    cudaFree(d_B.elements);
    cudaFree(d_C.elements);
}

然后我编译成Sequential_Cuda_Pyton.so

nvcc --shared --compiler-options '-fPIC' -o Sequential_Cuda_Python.so Sequential_Cuda_Python.cu

python - ctypes 代码：

import numpy as np
from numpy.ctypeslib import ndpointer
from ctypes import *

class Matrix(Structure):
    _fields_ = [("width", c_int),
                ("height", c_int),
                ("elements", POINTER(c_float))]

libc = CDLL("./Sequential_Cuda_Python.so")

libc.mMul.argtypes = [ POINTER(Matrix), POINTER(Matrix), POINTER(Matrix) ]

错误，好像没有找到函数

Traceback (most recent call last):
  File "cuda_arr.py", line 17, in <module>
    libc.mMul.argtypes = [ POINTER(Matrix), POINTER(Matrix), POINTER(Matrix) ]
  File "/usr/lib/python3.8/ctypes/__init__.py", line 386, in __getattr__
    func = self.__getitem__(name)
  File "/usr/lib/python3.8/ctypes/__init__.py", line 391, in __getitem__
    func = self._FuncPtr((name_or_ordinal, self))
AttributeError: ... /Sequential_Cuda_Python.so: undefined symbol: mMul

【问题讨论】：

您可能缺少的是 CUDA 是使用 C++ 编译器和 C++ 链接编译的。您可以使 C 示例工作的事实可能无关紧要。

标签： python numpy cuda ctypes

【解决方案1】：

根据评论，你需要extern "C"

C++（以及 cuda 的扩展）做了一些叫做 name mangling

试试这个有没有extern "C"

readelf --symbols Sequential_Cuda_Python.so | grep mMul

#include <stdio.h>
#include <stdlib.h>
#define BLOCK_SIZE 16

struct Matrix {
    int width;
    int height;
    float *elements;
};

__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C){

    // runs for each col - row pair
    float tmpVal = 0;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    for (int i = 0; i < A.width; ++i)
        tmpVal += A.elements[row * A.width + i] *
                  B.elements[i * B.width + col];
    C.elements[ row * C.width + col ] = tmpVal;
}

extern "C" {
void mMul( Matrix *A, Matrix *B, Matrix *C ){

    Matrix d_A, d_B, d_C;

    // Matrix d_A
    d_A.width    =   A->width;
    d_A.height   =   A->height;
    size_t sizeA =   A->width * A->height * sizeof(float);
    // dynamically allocate cudaMemory for elemenst array
    cudaMalloc(&d_A.elements, sizeA);
    cudaMemcpy(d_A.elements, A->elements, sizeA, cudaMemcpyHostToDevice);

    // Matrix d_B
    d_B.width    =   B->width;
    d_B.height   =   B->height;
    size_t sizeB =   B->width * B->height * sizeof(float);
    // dynamically allocate cudaMemory for elemenst array
    cudaMalloc(&d_B.elements, sizeB);
    cudaMemcpy(d_B.elements, B->elements, sizeB, cudaMemcpyHostToDevice);

    // Matrix d_C
    d_C.width    =   C->width;
    d_C.height   =   C->height;
    size_t sizeC =   C->width * C->height * sizeof(float);

    // dynamically allocate cudaMemory for elemenst array
    cudaMalloc(&d_C.elements, sizeC);

    // 16 * 16 = 256 threads per block
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);

    // Blocks per grid
    dim3 dimGrid(B->width / dimBlock.x, A->height / dimBlock.y);

    // calling the Kernel
    MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);

    // copy results from result matrix C to the host again
    cudaMemcpy(C->elements, d_C.elements, sizeC, cudaMemcpyDeviceToHost);

    // free the cuda memory
    cudaFree(d_A.elements);
    cudaFree(d_B.elements);
    cudaFree(d_C.elements);
}
}

【讨论】：