【Question Title】: How can one parallelize geopandas' "to_file" function?
【Posted】: 2019-04-04 02:31:10
【Question Description】:

I am trying to implement a parallelized function for GeoPandas that takes a single vector dataset (e.g., a shapefile containing MultiPolygon data) and converts it into a regular cell grid with user-defined cell sizes in x and y.

Since this function can cause serious memory problems (e.g., when the spatial resolution is too fine), I would like to know whether the data could be saved iteratively into the given destination file. That way, as each parallel process runs the "GRID" function, the same process could save its data iteratively in append mode. I believe this would avoid memory problems.
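
For illustration, here is a minimal sketch of that append idea. It assumes a recent GeoPandas/Fiona stack in which GeoDataFrame.to_file accepts mode="a" and a driver that supports appending (GeoPackage does; appending to shapefiles depends on the Fiona/GDAL build). The output name, the make_chunk helper, and the CRS are purely illustrative:

import geopandas as gpd
from shapely.geometry import Polygon

def make_chunk(i, crs="EPSG:32633"):
    # hypothetical helper: build one small GeoDataFrame worth of grid cells
    cell = Polygon([(i, 0), (i + 1, 0), (i + 1, 1), (i, 1)])
    return gpd.GeoDataFrame(geometry=[cell], crs=crs)

out = "grid.gpkg"  # GeoPackage, because the driver must support appending
for i in range(3):
    chunk = make_chunk(i)
    # the first write creates the file; later writes append instead of
    # overwriting, so the full grid never has to sit in memory at once
    chunk.to_file(out, driver="GPKG", mode="w" if i == 0 else "a")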

Here is my "SHP_to_GRID_Function". Note that the code below still requires all the data produced by the multiprocessing step to be held in memory at once (so an overflow is practically guaranteed for large datasets).

import pandas as pd
import numpy as np
import geopandas as gpd
from shapely.geometry import Polygon
from multiprocessing import Pool
import os
from functools import partial


def info(title):
    print(title)
    print('module name:', __name__)
    print('parent process:', os.getppid())
    print('process id:', os.getpid())


def parallelize_df(gdf, func, n_cores, dx=100, dy=100, verbose=False):

    Geometries= gdf.loc[:, 'geometry'].values

    pool = Pool(processes=n_cores)
    func_partial = partial(func, dx, dy, verbose)  # fix dx, dy and verbose; pool.map supplies each geometry as the final argument

    results = pool.map(func_partial, Geometries)

    pool.close()
    pool.join()

    print(np.shape(results))

    # flatten the per-geometry lists of cells into a single GeoSeries
    GRID = gpd.GeoSeries([cell for cells in results for cell in cells])

    print("GRID well created") 

    return GRID

def generate_grid_from_Poligon(dx=100, dy=100, verbose=False, polygon=None):
    if verbose:
        info('function generate_grid_from_Poligon')

    xmin,ymin,xmax,ymax = polygon.bounds

    cell_width = dx   # cell size along x
    cell_height = dy  # cell size along y

    cols = list(np.arange(int(np.floor(xmin)), int(np.ceil(xmax)), cell_width))
    rows = list(np.arange(int(np.floor(ymin)), int(np.ceil(ymax)), cell_height))
    rows.reverse()

    subpolygons = []
    for x in cols:
        for y in rows:
            subpolygons.append(Polygon([(x, y), (x + cell_width, y), (x + cell_width, y - cell_height), (x, y - cell_height)]))


    return subpolygons


def main(GDF, n_cores='standard', dx=100, dy=100, verbose= False):
    """
    GDF: geodataframe
    n_cores: use standard or a positive numerical (int) value. It will set the number of cores to use in the multiprocessing

    args: (dx: dimension in the x coordinate to make the grid
            dy: dimenion in the y coordinate to make the grid)

    """

    if isinstance(n_cores, str):
        import multiprocessing
        N_cores = multiprocessing.cpu_count() - 1

    elif isinstance(n_cores, int):
        N_cores = n_cores

    else:
        raise TypeError("n_cores must be 'standard' or a positive int")


    GRID_GDF = parallelize_df(GDF, generate_grid_from_Poligon, n_cores=N_cores, dx=dx, dy=dy, verbose=verbose)

    return GRID_GDF
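
For completeness, a minimal way to drive this function might look like the sketch below (the input path is hypothetical; the __main__ guard matters because multiprocessing re-imports the module when it spawns workers, e.g. on Windows):

if __name__ == '__main__':
    # hypothetical input: any polygon or multipolygon layer would do
    GDF = gpd.read_file("my_polygons.shp")
    GRID = main(GDF, n_cores='standard', dx=500, dy=500, verbose=False)
    print(len(GRID), "grid cells generated")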

Thank you for your time,

Best regards,

菲利普·利尔

【Question Comments】:

    Tags: python-3.x python-multiprocessing geopandas


    【Solution 1】:

    I finally found a solution to my problem. It is not perfect, since it requires multiple writing processes and one final concatenation step over all the temporary files created during the run.

    Feel free to suggest alternatives.

    Here is the solution I found.

    import numpy as np
    import geopandas as gpd
    import pandas as pd
    from shapely.geometry import Polygon
    from multiprocessing import Pool, Lock, freeze_support
    import os
    from functools import partial
    import time
    
    def info(time_value):
    
        print('module name:', __name__)
        print('parent process:', os.getppid())
        print('process id:', os.getpid())
        print("Time spent: ", time.time() - time_value)
    
    def init(l):

        # store the shared Lock in a global so every worker process can use it
        global lock

        lock = l
    
    def Data_Arranger(to_filename):

        """Concatenate the temporary files written by the worker processes into
        the final output file, then delete them."""
    
        Base = os.path.join(os.path.dirname(to_filename), 'temp')
    
    
        # collect the temporary files written by the worker processes
        Strings = [os.path.join(Base, file) for file in os.listdir(Base)]
    
        if not os.path.exists(os.path.dirname(to_filename)):
            os.mkdir(os.path.dirname(to_filename))
    
        Sq = [S for S in Strings if S.endswith('.shp')]
    
        # concatenate every partial grid into the final output file; GDF is the
        # global GeoDataFrame loaded in the __main__ block, so its CRS is reused here
        gpd.GeoDataFrame(pd.concat([gpd.read_file(sq1) for sq1 in Sq], ignore_index=True), crs=GDF.crs).to_file(to_filename)
    
        for sq1 in Sq:
            os.remove(sq1) 
    
        import shutil
    
        shutil.rmtree(Base, ignore_errors=True) 
    
    
    
    
    def parallelize_df(gdf, func, n_cores, dx=100, dy=100, verbose=False, to_filename=None):
    
    
    
        Geometries= gdf.loc[:, 'geometry'].values
        crs = gdf.crs
    
        pool = Pool(processes=n_cores, initializer=init, initargs=(Lock(), ) )
    
        func_partial = partial(func, dx, dy, verbose, to_filename, crs)  # fix everything except the geometry, which pool.map supplies
    
    
        pool.map(func_partial, Geometries)
    
        pool.close()
        pool.join()
    
    
    def generate_grid_from_gdf(dx=100, dy=100, verbose=False, to_filename=None, crs=None, polygon=None):
        if verbose:
            info(time.time())
    
        xmin,ymin,xmax,ymax = polygon.bounds
    
        cell_width = dx   # cell size along x
        cell_height = dy  # cell size along y

        cols = list(np.arange(int(np.floor(xmin)), int(np.ceil(xmax)), cell_width))
        rows = list(np.arange(int(np.floor(ymin)), int(np.ceil(ymax)), cell_height))
        rows.reverse()

        subpolygons = []
        for x in cols:
            for y in rows:
                subpolygons.append(Polygon([(x, y), (x + cell_width, y), (x + cell_width, y - cell_height), (x, y - cell_height)]))
    
    
    
        lock.acquire()
    
        print('process ', os.getpid(), ' has acquired the Lock')
        GDF = gpd.GeoDataFrame(geometry=subpolygons, crs=crs)
    
    
        # each worker writes its chunk to a unique temporary file named after its pid and a timestamp
        to_filename = os.path.join(os.path.dirname(to_filename), 'temp',
                                   str(os.getpid()) + '_' + str(time.time()) + '.' + os.path.basename(to_filename).split('.')[-1])
    
        if not os.path.exists(os.path.dirname(to_filename)):
            os.mkdir(os.path.dirname(to_filename))
    
        try:
            print("to_filename: ", to_filename)
            GDF.to_file(to_filename)
        except Exception as err:
            print("error while saving the file:", err)
        lock.release()
    
        print('process ', os.getpid(), ' has released the Lock')
    
    
    
    
    def main(GDF, n_cores='standard', dx=100, dy=100, verbose= False, to_filename=None):
        """
        GDF: geodataframe
        n_cores: use standard or a positive numerical (int) value. It will set the number of cores to use in the multiprocessing
    
        dx: dimension in the x coordinate to make the grid
        dy: dimenion in the y coordinate to make the grid)
        verbose: whether or not to show info from the processing. Appliable only if applying the function not
                in Windows (LINUX, UBUNTU, etc.), or when running in separte console in Windows.
    
        to_filename: the path which will be used to save the resultant file.
        """
    
        if isinstance(n_cores, str):
            import multiprocessing
            N_cores = multiprocessing.cpu_count() - 1

        elif isinstance(n_cores, int):
            N_cores = n_cores

        else:
            raise TypeError("n_cores must be 'standard' or a positive int")
    
    
    
        parallelize_df(GDF, generate_grid_from_gdf, n_cores=N_cores, dx=dx, dy=dy, verbose=verbose, to_filename=to_filename)
        Data_Arranger(to_filename)
    
        ####################################################################################
    
    if "__main__" == __name__:
        freeze_support()
        GDF = gpd.read_file("Someone's_file.shp")
    
    
        to_filename = "To_file_directory/To_file_name.shp"
    
        dx = 500  # resampling to 500 units. Ex: assuming the coordinate reference system is in meters, the output polygons will be 500 m along the longitudinal (x) dimension.

        dy = 500  # same here: assuming the CRS is in meters, the resulting polygons will be 500 m along the latitudinal (y) dimension.
    
        main(GDF, dx=dx, dy=dy, verbose=True, to_filename=to_filename)
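
    As a quick sanity check, one could read the result back at the end of the "__main__" block and inspect it; a minimal sketch, using the same to_filename as above:

        grid = gpd.read_file(to_filename)
        print(len(grid), "grid cells written")
        print("grid bounding box:", grid.total_bounds)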
    

    Thank you for your time.

    菲利普·利尔

    【Discussion】:
