Openacc/openmp :: 致命错误：数据子句中的变量部分存在于设备上：name=Tnew答案

【问题标题】：Openacc/openmp :: FATAL ERROR: variable in data clause is partially present on the device: name=TnewOpenacc/openmp :: 致命错误：数据子句中的变量部分存在于设备上：name=Tnew
【发布时间】：2020-07-25 09:31:30
【问题描述】：

我对并行编程很陌生。我一直在做一个课堂项目，必须使用 openmp 和 openacc 实现一个混合模型，通过计算 cpu 上的一部分行和 GPU 上的其余行来计算离散化的 2D 拉普拉斯方程。

编译成功，但是当我运行时出现“致命错误：数据子句中的变量部分存在于设备上：name=Tnew”错误。

#include <omp.h>
#include <openacc.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <iostream>
#include <chrono>
#include <iomanip>


using namespace std;


int main(int argc, char *argv[])
{
    //Total size of the grid
    int grid_size = atoi(argv[1]);
    // a variable to determine the row to split the entire grid between CPU and GPU
   int split = atoi(argv[2]);

    double * T = new double[(grid_size+2)*(grid_size+2)];
   double * Tnew = new double[(grid_size+2)*(grid_size+2)];
   double tol = 1e-5;

    int nthreads = atoi(argv[3]);
    omp_set_num_threads(nthreads);

    cout << "Grid size is " << grid_size << "number of threads " << nthreads << endl;
    //Initialize arrays
    for (int i=0; i<grid_size+2; ++i) {
      for (int j=0; j<grid_size+2; ++j) {
         T[i*(grid_size+2) + j] = 0;
            if (0 == i && 0 != j && grid_size+1 != j) { T[i*(grid_size+2) + j] = 100; }
            else if (grid_size+1 == i) T[i*(grid_size+2) + j] = 0;
            else if (0 == j && 0 != i && grid_size+1 != i) { T[i*(grid_size+2) + j] = 75; }
             else if (grid_size+1 == j && 0 != i && grid_size+1 != i) { T[i*(grid_size+2) + j] = 50; }
        }
    }

    //Print out array 
    if (grid_size <= 20) {
        for (int i=0; i<grid_size+2; ++i) {
         for (int j=0; j<grid_size+2; ++j) {
            cout << T[i*(grid_size+2) + j] << '\t';
            }
            cout << endl;
        }
    }


    double calc_time = omp_get_wtime();

    #pragma omp parallel
    {
        int tid = omp_get_thread_num();

        /* Select the last thread to interact with gpu. Push the contents of array T beggining 
        from the split location till the end to the gpu
        */ 

        if(tid==nthreads-1){

            int iteration = 0;
            double error = 1.0;

        // Copy Rows of T begining from a row before split location till end and copyout split location till the end.
     #pragma acc enter data copyin(T[split*(grid_size+2):(grid_size+2)*(grid_size+1)]) create(Tnew[split*(grid_size+2):(grid_size+2)*(grid_size+1)]) 
             while (error > tol && iteration <3000)
            {

                error = 0.0;
                iteration++;


                #pragma acc loop independent reduction(+:error)

                for(int a = split+1; a < grid_size+1; a++){
                    for(int b = 1; b < grid_size+1; b++){
                        Tnew[a*(grid_size+2)+b] = 0.25 * (T[(a-1)*(grid_size+2)+b]
                                        +T[(a+1)*(grid_size+2)+b]
                                        +T[a*(grid_size+2)+(b-1)]
                                        +T[a*(grid_size+2)+(b+1)]);
                        //error = fabs(Tnew[a*(grid_size+2)+b]-T[a*(grid_size+2)+b]);
                        error = fmaxf(error,fabs(Tnew[a*(grid_size+2)+b]-T[a*(grid_size+2)+b]));
                    }
                }

               #pragma acc loop independent 

                for(int ai = split+1; ai < grid_size+1; ai++){
                    for(int bi = 1; bi < grid_size + 1; bi++){
                         T[ai*(grid_size+2)+bi] = Tnew[ai*(grid_size+2)+bi];
                    }
                }

                // Update the  gpu's boundary row in main memory
                #pragma acc update self(T[(split+1)*(grid_size+2):((split+1)*(grid_size+2)+ grid_size)])
                // Update the threads boundary row in GPU
                #pragma acc update device(T[(split)*(grid_size+2):(split*(grid_size+2)+ grid_size)])
            }
                #pragma acc exit data copyout(T[(split+1)*(grid_size+2):(grid_size+2)*(grid_size+1)])

            cout << "GPU Portion Completed" <<  iteration << " Iterations" << endl;
        }

        // The first N rows until the split location gets computed by the rest of omp threads
        else 
        {
            double error = 1.0;
            int  iteration = 0;

              while (error > tol && iteration <3000) {
                    error=0;
                #pragma omp for collapse(2) nowait
                //#pragma acc kernels         
                for(int a = 1; a < split+1; a++){
                    for(int b = 1; b < grid_size+1; b++){
                        Tnew[a*(grid_size+2)+b] = 0.25 * (T[(a-1)*(grid_size+2)+b]
                                        +T[(a+1)*(grid_size+2)+b]
                                        +T[a*(grid_size+2)+(b-1)]
                                        +T[a*(grid_size+2)+(b+1)]);

                        error = fmaxf(error,fabs(Tnew[a*(grid_size+2)+b]-T[a*(grid_size+2)+b]));
                    }
                }

                #pragma omp for collapse(2) nowait
                for(int ai = 1; ai < split+1; ai++){
                    for(int bi = 1; bi < grid_size + 1; bi++){
                        T[ai*(grid_size+2)+bi] = Tnew[ai*(grid_size+2)+bi];
                    }
                }               
            }
        }       
    }

    calc_time = omp_get_wtime() - calc_time;
    cout << "calc time " << calc_time << endl;


    if (grid_size <= 20) {
        for (int i=0; i<grid_size+2; ++i) {
                for (int j=0; j<grid_size+2; ++j) {
                cout << setprecision(5) << T[i*(grid_size+2) + j] << '\t';
            }
            cout << endl;
        }
    }

   delete [] T;
   delete [] Tnew;
}

以下是我编译时得到的消息


pgc++ -mp -acc -Minfo mixed_omp_acc.cpp -o omp_acc

main:
      7, include "iostream"
          35, include "iostream"
                4, include "ostream"
                    38, include "ios"
                         44, include "basic_ios.h"
                              53, Parallel region activated
                             128, Parallel region terminated
     64, Generating copyout(T[(grid_size+1)*(split+1):(grid_size+2)*(grid_size+1)]) [if not already present]
         Generating create(iteration) [if not already present]
         Generating copyin(tol) [if not already present]
         Generating create(Tnew[split:(grid_size+2)*(grid_size+1)]) [if not already present]
         Generating copyout(T[(grid_size+2)*split:(grid_size+2)*(grid_size+1)]) [if not already present]
         Generating copyin(error) [if not already present]
     94, Generating update self(T[(grid_size+2)*(split+1):(grid_size+2)*(grid_size+1)])
         Generating update device(T[(grid_size+2)*split:(grid_size+2)*(grid_size+1)])
    106, Parallel loop activated with static block schedule
    114, Barrier
    118, Parallel loop activated with static block schedule
    122, Barrier

以下是我运行时遇到的错误。

第一个参数是网格大小，第二个是openmp和openacc之间的划分索引，第三个是cpu线程数。我试图分配最后一个 cpu 线程与 gpu 交互。

T lives at 0x8cc130 size 3696 partially present
Present table dump for device[1]: NVIDIA Tesla GPU 0, compute capability 7.0, threadid=1
host:0x8cc080 device:0x7f09b3afa000 size:3696 presentcount:0+1 line:69 name:T
host:0x8ccfb0 device:0x7f09b3afb000 size:3696 presentcount:0+1 line:69 name:Tnew
allocated block device:0x7f09b3afa000 size:4096 thread:1
allocated block device:0x7f09b3afb000 size:4096 thread:1
FATAL ERROR: variable in data clause is partially present on the device: name=T
******* mixed_omp_acc.cpp main_1F252L55 line:106

【问题讨论】：

标签： c++ openmp openacc

【解决方案1】：

对于 C/C++ 中的 OpenACC 数组整形语法，起始元素后跟要复制的元素数，即“arr[start:length]”。尽管您似乎将它用作“arr [start：end]”，因此当它到达更新子句时，主机 T 数组太小而无法容纳结果。要解决此问题，请更新您的数组形状以使用起始元素，后跟要复制的元素数量，而不是范围。

【讨论】：

感谢您的回复。你能举一个元素数量的例子吗？例如;我的数组的大小是 (grid_size+2)*(grid_size+2)，我想将数组的一部分从行索引“split”开始复制到最后一行，包括行中包含的整个列。这下面看起来对吗？ ``` #pragma acc 输入数据copyin(T [ split *(grid_size+2) : (grid_size+2)*(grid_size+2) - (split*(grid_size+2)) ]) ```
是的，这似乎是正确的。就个人而言，我可能会将大小放入变量中。像“int split_start=split *(grid_size+2);”、“int size= (grid_size+2)*(grid_size+2)”、“输入数据copyin(T[split_start:size-split_start)]”，所以是更容易阅读，但这不是必需的。