【Question Title】: AttributeError: 'DataFrame' object has no attribute 'seek'
【Posted】: 2021-11-24 13:32:23
【Question Description】:

In the DataProcesser class, the raw_file_processing, dataset_csv, classes_csv, and idset_csv functions process the raw data files and output csv files that can be read by the read_archive function. My code raises AttributeError: 'DataFrame' object has no attribute 'seek'.

import pandas as pd
import warnings
import numpy as np
import os
import zipfile
import re
from sklearn.model_selection import train_test_split


class DataProcesser:
    def __init__(self, archive_path, col_id='ID', col_class='class', col_classname='class_name', col_set='set',
                 read_on_init=True, **kwargs):

        self.archive_path = archive_path
        self.archive = zipfile.ZipFile(self.archive_path, 'r')
        self.col_id = col_id
        self.col_class = col_class
        self.col_classname = col_classname
        self.col_set = col_set
        self.dataset = None
        self.dataset_cropped = None
        self.id_set = None
        self.classes = None
        self.train_set = None
        self.validation_set = None
        self.test_set = None
        self.logs = []
        self.stats = None
        self.flag_subset = False
        self.flag_process = False
        self.flag_split = False
        self.measurement_df = None
        if read_on_init:
            self.read_archive(**kwargs)

    def raw_file_processing(self):

        # If the path contains HTAN CODEX data, perform the following processing steps
        if os.path.isdir(archive_path):

            # 'Class' refers to the independent variable
            # The class info is the 3rd column tile_num in the current example
            # The rationale for looking at tile_num is that if we're examining tumor progression, we can observe the relative positions of the tumor growth
            # Tumor progression may be denoted by the corresponding values of tumor progression markers/antibodies such as CEA
            # In the future, we may append all the tumor patient files and normal patient files and then assign patient number as "class"
            self.col_classname = self.archive_path.iloc[2]

            # Dummy-code the classes
            self.col_class = pd.get_dummies(self.col_classname)

            # Create the ID series by concatenating columns 1-3
            self.col_id = self.archive_path.assign(
                ID=self.archive_path[['cell_id:cell_id', 'region:region', 'tile_num:tile_num']].apply(
                    lambda row: '_'.join([str(each) for each in row]), axis=1))
            self.col_id = self.archive_path.drop(columns=['cell_id:cell_id', 'region:region', 'tile_num:tile_num'])

            # Obtain measurement info
            # Normalize data against blank/empty columns
            # log-transform the data
            for col in self.archive_path[9:]:
                if re.findall(r"Blank|Empty", col):
                    background = col
                else:
                    for index, row in col:
                        norm_data = row / background
                        self.measurement_df = np.log2(norm_data)

        return self.archive_path, self.col_id, self.col_class, self.measurement_df

    def dataset_csv(self):

        # If the path contains HTAN CODEX data, perform the following processing steps
        if os.path.isdir(self.archive_path):
            """Col 1: ID
            Col 2: class
            Col 3-n: measurements"""
            id_col = self.col_id

            self.col_class = self.col_class.to_frame()

            frames = [id_col, self.col_class, self.measurement_df]
            self.dataset = pd.concat(frames, axis=1)  # concatenate column-wise: ID, class, measurements
            data_csv = self.dataset.to_csv("../input_data/dataset.csv")

        return data_csv

    def classes_csv(self):

        # If the path contains HTAN CODEX data, perform the following processing steps
        if os.path.isdir(self.archive_path):
            # Remove any duplicate rows with the same col_class and cls_col info
            self.cls_df = pd.DataFrame({'class': [self.col_class], 'class_name': [self.col_classname]})
            self.cls_df.drop_duplicates(keep=False, inplace=True)

            # Save as csv file
            self.cls_df.to_csv('../input_data/classes.csv')

        return self.cls_df

    def idset_csv(self):

        # If the path contains HTAN CODEX data, perform the following processing steps
        if os.path.isdir(self.archive_path):
            # Get the ids
            ids = self.archive_path[0]

            # Train-test-validation split
            ids = ids.sample(frac=1)  # sample returns a new object, so reassign to actually shuffle
            train, test = train_test_split(ids, test_size=0.2, random_state=1)
            train, val = train_test_split(train, test_size=0.25, random_state=1)

            # Assuming train, val, test are dataframes
            # A string is assigned to the "set" column.
            train.loc[:, 'set'] = 'train'
            val.loc[:, 'set'] = 'val'
            test.loc[:, 'set'] = 'test'

            # Save as csv file
            id_set = pd.concat([train, val, test], axis=0)
            id_set_csv = id_set.to_csv('../input_data/id_set.csv', index=False)

        return id_set_csv

    def zip_files(self):

        # Create a ZipFile object for dataset.csv, classes.csv, and id_set.csv
        zip_file = zipfile.ZipFile("data.zip", "w")
        zip_file.write("dataset.csv")
        zip_file.write("classes.csv")
        zip_file.write("id_set.csv")
        zip_file.close()
        return zip_file

    def read_archive(self, datatable=True, **kwargs):
        """
        Read a zip archive, without extraction, that contains:
        * data as .csv, observations in rows, measurements in columns. Names of columns must have the format:
         A_1, A_2, A_3,..., C_1, C_2,... where A and C are groups (sensors) and 1,2,3... measurement time
        * IDs of training/validation/test as .csv
        * Explicit names of classes as .csv
        :return: 2 pandas DataFrames, one with raw data, one with IDs
        """
        if datatable:
            try:
                from datatable import fread
                self.dataset = fread(self.archive.open('dataset.csv'), **kwargs).to_pandas()
                self.id_set = fread(self.archive.open('id_set.csv'), **kwargs).to_pandas()
                self.classes = fread(self.archive.open('classes.csv'), **kwargs).to_pandas()
            except ModuleNotFoundError:
                warnings.warn('datatable module not found, using pandas instead. To prevent this message from appearing'
                              ' use "datatable = False" when reading the archive.')
                self.dataset = pd.read_csv(self.archive.open('dataset.csv'))
                self.id_set = pd.read_csv(self.archive.open('id_set.csv'))
                self.classes = pd.read_csv(self.archive.open('classes.csv'))
        else:
            self.dataset = pd.read_csv(self.archive.open('dataset.csv'))
            self.id_set = pd.read_csv(self.archive.open('id_set.csv'))
            self.classes = pd.read_csv(self.archive.open('classes.csv'))
        self.check_datasets()
        self.logs.append('Read archive: {0}'.format(self.archive_path))
        return None


input_path = "//wsl$/Ubuntu-20.04/home/melissachua/CODEX/input_data"

meas_var = None
start_time = None
end_time = None

# Open all the subfolders within path
for root, dirs, files in os.walk(input_path):
    for file in files:
        with open(os.path.join(root, file), "r") as data:
            data_file = pd.read_csv(data)

            # The data object is used to automatically derive some parameters (e.g. number of classes)
            data = DataProcesser(data_file, datatable=False)

Traceback

Traceback (most recent call last):
  File "C:/Users/User/PycharmProjects/CODEX/main.py", line 171, in <module>
    data = DataProcesser(data_file, datatable=False)
  File "C:/Users/User/PycharmProjects/CODEX/main.py", line 16, in __init__
    self.archive = zipfile.ZipFile(self.archive_path, 'r')
  File "C:\Users\User\AppData\Local\Programs\Python\Python38\lib\zipfile.py", line 1269, in __init__
    self._RealGetContents()
  File "C:\Users\User\AppData\Local\Programs\Python\Python38\lib\zipfile.py", line 1332, in _RealGetContents
    endrec = _EndRecData(fp)
  File "C:\Users\User\AppData\Local\Programs\Python\Python38\lib\zipfile.py", line 264, in _EndRecData
    fpin.seek(0, 2)
  File "C:\Users\User\PycharmProjects\CODEX\venv\lib\site-packages\pandas\core\generic.py", line 5487, in __getattr__
    return object.__getattribute__(self, name)
AttributeError: 'DataFrame' object has no attribute 'seek'

Process finished with exit code 1

【Question Discussion】:

    Tags: python pandas dataframe machine-learning deep-learning


    【Solution 1】:

    The error you are getting comes from the zipfile.ZipFile call. You should pass the path of a .zip file to the constructor, not a pandas DataFrame.
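
    For illustration, a minimal sketch of the corrected call, assuming the archive lives at a hypothetical path such as ../input_data/data.zip:

    archive_path = "../input_data/data.zip"  # hypothetical path to an actual .zip file
    data = DataProcesser(archive_path, datatable=False)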

    【Discussion】:

      【Solution 2】:

      In your code, you have the following lines:

      
      data_file = pd.read_csv(data)
      data = DataProcesser(data_file, datatable=False)
      

      In the first line you read the csv file into a DataFrame and store that DataFrame in the variable data_file. The second line uses this DataFrame as the input to the DataProcesser constructor. However, the constructor is defined as follows:

      def __init__(self, archive_path, col_id='ID', col_class='class', col_classname='class_name', col_set='set',
                       read_on_init=True, **kwargs):
      

      You are passing the DataFrame as archive_path, which is not what your constructor expects. The constructor expects a str filename (a file object would also work), as used by the zipfile constructor or the os functions. See, for example, the zipfile documentation.
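
      To see why the traceback ends in seek: zipfile.ZipFile accepts either a filename or a seekable file-like object, and internally calls .seek() to locate the archive's end-of-central-directory record. A DataFrame is neither, hence the AttributeError. A small sketch (data.zip is a placeholder name):

      import zipfile
      import pandas as pd

      zipfile.ZipFile("data.zip", "r")         # works: a path string
      zipfile.ZipFile(open("data.zip", "rb"))  # works: a seekable file object
      zipfile.ZipFile(pd.DataFrame(), "r")     # AttributeError: 'DataFrame' object has no attribute 'seek'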

      Therefore, store your DataFrame in a separate variable and use archive_path for the file path. Unfortunately, you have mixed up the DataFrame instance and archive_path several times in your code. Below are a few examples; a corrected sketch follows them.

      
      # Constructor
      self.archive = zipfile.ZipFile(self.archive_path, 'r')
      
      ...
      
      # First method
      if os.path.isdir(archive_path):
      
      ...
      
      self.col_classname = self.archive_path.iloc[2]
      
      ...
      for col in self.archive_path[9:]:
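
      Keeping the two apart, a hedged sketch of a corrected driver loop (the archive location and variable names here are illustrative, not from the original code):

      # Keep the measurements DataFrame in its own variable ...
      measurements = pd.read_csv(os.path.join(root, file))

      # ... and pass a path (str) to the constructor
      data = DataProcesser("../input_data/data.zip", datatable=False)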
      
      

      【Discussion】: