【问题标题】:Scrapy - SQLalchemy Foreign Key not created in SQLiteScrapy - SQLite 中未创建 SQLalchemy 外键
【发布时间】:2020-08-22 08:18:26
【问题描述】:

我尝试使用 itemLoader 运行 Scrapy 以收集所有数据并将它们放入 SQLite 3。我成功收集了我想要的所有信息,但我无法使用 @ 在我的 ThreadInfo 和 PostInfo 表中生成外键987654321@ 带外键。我确实尝试过back_ref,但也没有用。 我的 Scrapy 完成后,所有其他信息都被插入到 SQLite 数据库中。

我的目标是让四个表相互链接,boardInfo、threadInfo、postInfo 和 authorInfo。

  • boardInfo 将与 threadInfo 具有一对多的关系
  • threadInfo 与 postInfo 是一对多的关系
  • authorInfo 将与 threadInfo 具有一对多的关系,并且
    发布信息。

我使用 DB Browser for SQLite 发现我的外键值是Null。 我尝试查询值(threadInfo.boardInfos_id),它显示None。我尝试解决这个问题很多天并通读文档但无法解决问题。

如何在我的 threadInfo 和 postInfo 表中生成外键?

感谢您的所有指导和 cmets。

这是我的models.py

from sqlalchemy import create_engine, Column, Table, ForeignKey, MetaData
from sqlalchemy import Integer, String, Date, DateTime, Float, Boolean, Text
from sqlalchemy.orm import relationship
from sqlalchemy.ext.declarative import declarative_base
from scrapy.utils.project import get_project_settings

Base = declarative_base()

def db_connect():
    '''
    Performs database connection using database settings from settings.py.
    Returns sqlalchemy engine instance
    '''
    return create_engine(get_project_settings().get('CONNECTION_STRING'))

def create_table(engine):
    Base.metadata.create_all(engine)

class BoardInfo(Base): 
    __tablename__ = 'boardInfos'
    id = Column(Integer, primary_key=True)
    boardName = Column('boardName', String(100)) 
    threadInfosLink = relationship('ThreadInfo', back_populates='boardInfosLink') # One-to-Many with threadInfo

class ThreadInfo(Base):
    __tablename__ = 'threadInfos'
    id = Column(Integer, primary_key=True)
    threadTitle = Column('threadTitle', String())
    threadLink = Column('threadLink', String())
    threadAuthor = Column('threadAuthor', String())
    threadPost = Column('threadPost', Text())
    replyCount = Column('replyCount', Integer)
    readCount = Column('readCount', Integer)

    boardInfos_id = Column(Integer, ForeignKey('boardInfos.id')) # Many-to-One with boardInfo
    boardInfosLink = relationship('BoardInfo', back_populates='threadInfosLink') # Many-to-One with boardInfo

    postInfosLink = relationship('PostInfo', back_populates='threadInfosLink') # One-to-Many with postInfo
    
    authorInfos_id = Column(Integer, ForeignKey('authorInfos.id')) # Many-to-One with authorInfo
    authorInfosLink = relationship('AuthorInfo', back_populates='threadInfosLink') # Many-to-One with authorInfo

class PostInfo(Base):
    __tablename__ = 'postInfos'
    id = Column(Integer, primary_key=True)
    postOrder = Column('postOrder', Integer, nullable=True)
    postAuthor = Column('postAuthor', Text(), nullable=True)
    postContent = Column('postContent', Text(), nullable=True)
    postTimestamp = Column('postTimestamp', Text(), nullable=True)

    threadInfos_id = Column(Integer, ForeignKey('threadInfos.id')) # Many-to-One with threadInfo 
    threadInfosLink = relationship('ThreadInfo', back_populates='postInfosLink') # Many-to-One with threadInfo 
    
    authorInfos_id = Column(Integer, ForeignKey('authorInfos.id')) # Many-to-One with authorInfo
    authorInfosLink = relationship('AuthorInfo', back_populates='postInfosLink') # Many-to-One with authorInfo

class AuthorInfo(Base):
    __tablename__ = 'authorInfos'
    id = Column(Integer, primary_key=True)
    threadAuthor = Column('threadAuthor', String())

    postInfosLink = relationship('PostInfo', back_populates='authorInfosLink') # One-to-Many with postInfo
    threadInfosLink = relationship('ThreadInfo', back_populates='authorInfosLink') # One-to-Many with threadInfo

这是我的 pipelines.py

from sqlalchemy import exists, event
from sqlalchemy.orm import sessionmaker
from scrapy.exceptions import DropItem
from .models import db_connect, create_table, BoardInfo, ThreadInfo, PostInfo, AuthorInfo
from sqlalchemy.engine import Engine
from sqlite3 import Connection as SQLite3Connection
import logging

@event.listens_for(Engine, "connect")
def _set_sqlite_pragma(dbapi_connection, connection_record):
    if isinstance(dbapi_connection, SQLite3Connection):
        cursor = dbapi_connection.cursor()
        cursor.execute("PRAGMA foreign_keys=ON;")
        # print("@@@@@@@ PRAGMA prog is running!! @@@@@@")
        cursor.close()

class DuplicatesPipeline(object):

    def __init__(self):
        '''
        Initializes database connection and sessionmaker.
        Creates tables.
        '''
        engine = db_connect()
        create_table(engine)
        self.Session = sessionmaker(bind=engine)
        logging.info('****DuplicatesPipeline: database connected****')

    def process_item(self, item, spider):

        session = self.Session()
        
        exist_threadLink = session.query(exists().where(ThreadInfo.threadLink == item['threadLink'])).scalar()
        exist_thread_replyCount = session.query(ThreadInfo.replyCount).filter_by(threadLink = item['threadLink']).scalar()
        if exist_threadLink is True: # threadLink is in DB
            if exist_thread_replyCount < item['replyCount']: # check if replyCount is more?
                return item
                session.close()
            else:
                raise DropItem('Duplicated item found and replyCount is not changed')
                session.close()
        else: # New threadLink to be added to BoardPipeline
            return item
            session.close()

class BoardPipeline(object):
    def __init__(self):
        '''
        Initializes database connection and sessionmaker
        Creates tables
        '''
        engine = db_connect()
        create_table(engine)
        self.Session = sessionmaker(bind=engine)

    def process_item(self, item, spider):
        '''
        Save scraped info in the database
        This method is called for every item pipeline component
        '''

        session = self.Session()

        # Input info to boardInfos
        boardInfo = BoardInfo()
        boardInfo.boardName = item['boardName']
        
        # Input info to threadInfos
        threadInfo = ThreadInfo()
        threadInfo.threadTitle = item['threadTitle']
        threadInfo.threadLink = item['threadLink']
        threadInfo.threadAuthor = item['threadAuthor']
        threadInfo.threadPost = item['threadPost']
        threadInfo.replyCount = item['replyCount']
        threadInfo.readCount = item['readCount']

        # Input info to postInfos
        # Due to info is in list, so we have to loop and add it.
        for num in range(len(item['postOrder'])):
            postInfoNum = 'postInfo' + str(num)
            postInfoNum = PostInfo()
            postInfoNum.postOrder = item['postOrder'][num]
            postInfoNum.postAuthor = item['postAuthor'][num]
            postInfoNum.postContent = item['postContent'][num]
            postInfoNum.postTimestamp = item['postTimestamp'][num]
            session.add(postInfoNum)
        
        # Input info to authorInfo
        authorInfo = AuthorInfo()
        authorInfo.threadAuthor = item['threadAuthor'] 

        # check whether the boardName exists
        exist_boardName = session.query(exists().where(BoardInfo.boardName == item['boardName'])).scalar()
        if exist_boardName is False:  # the current boardName does not exists
            session.add(boardInfo)

        # check whether the threadAuthor exists
        exist_threadAuthor = session.query(exists().where(AuthorInfo.threadAuthor == item['threadAuthor'])).scalar()
        if exist_threadAuthor is False:  # the current threadAuthor does not exists
            session.add(authorInfo)

        try:
            session.add(threadInfo)
            session.commit()

        except:
            session.rollback()
            raise

        finally:
            session.close()

        return item

【问题讨论】:

    标签: python sqlite sqlalchemy scrapy


    【解决方案1】:

    从我看到的代码来看,在我看来,您并没有在任何地方设置 ThreadInfo.authorInfosLinkThreadInfo.authorInfos_id(您的所有 FK/关系也是如此)。

    对于要附加到 ThreadInfo 实例的相关对象,您需要创建它们,然后附加它们,如下所示:

            # Input info to authorInfo
            authorInfo = AuthorInfo()
            authorInfo.threadAuthor = item['threadAuthor'] 
            
            threadInfo.authorInfosLink = authorInfo
    

    如果每个对象通过 FK 关联,您可能不想 session.add()。你会想要:

    1. 实例化一个BoardInfo对象bi
    2. 然后实例化附加你的相关ThreadInfo对象ti
    3. 附加您的相关对象,例如bi.threadInfosLink = ti
    4. 在所有链式关系结束时,您只需使用 session.add(bi)bi 添加到会话中 - 所有相关对象都将通过它们的关系添加,并且 FK 将是正确的。

    【讨论】:

    • 感谢您的解释。当我添加以下代码时,出现错误 TypeError: Incompatible collection type: ThreadInfo is not list-like。 'boardInfo.threadInfosLink = threadInfo' 当我添加'threadInfo.authorInfosLink = authorInfo'时,它可以工作,但这会在我的数据库中创建重复的'authorInfo.threadAuthor',并且我从authorInfo表中获得了FK作为主键。所以我的表中有一些重复的 authorName 并且 FK 链接了新创建的 authorName 的 PK。如何链接到相同的 BoardInfo.boardName 或 AuthorInfo.authorName?
    • 因此,根据关系,如果它是多对一并且您决定将其置于何处,您可能需要将 .append() 附加到属性而不是分配相关对象给它。给定变量名称等,我很难阅读您的模型,所以我没有遵循您的意图,但是如果 Board 对象将包含线程集合,则为关系属性指定一个清晰的名称,例如 threads = relationship('ThreadInfo', back_populates='boardInfosLink') --那么如果你想将一个线程附加到 BoardInfo bi 它将是:bi.threads.append(ti) 或类似
    • 我已经设法开始为您的评论工作。我需要threadInfo.authorInfosLink = authorInfo来生成FK的链接。
    【解决方案2】:

    根据我其他答案的 cmets 中的讨论,以下是我将如何合理化您的模型以使它们对我更有意义。

    注意:

    1. 我已经删除了所有不必要的“信息”
    2. 我已从您的模型定义中删除了显式列名,并将依赖 SQLAlchemy 根据我的属性名称为我推断这些列名的能力
    3. 在“Post”对象中,我没有将属性命名为 PostContent,这暗示内容与 Post 相关,因为这是我们访问它的方式 - 而是简单地调用属性“Post”
    4. 我已删除所有“链接”术语 - 在我认为您希望引用相关对象集合的地方,我提供了该对象的复数属性作为关系。
    5. 我在 Post 模型中留下了一条线供您删除。如您所见,您不需要两次“作者”——一次作为相关对象,一次在 Post 上,这违背了 FK 的目的。

    通过这些更改,当您尝试在其他代码中使用这些模型时,您需要在哪里使用 .append() 以及在哪里简单地分配相关对象变得很明显。对于给定的 Board 对象,您知道“线程”是仅基于属性名称的集合,因此您将执行类似 b.threads.append(thread)

    的操作
    from sqlalchemy import create_engine, Column, Table, ForeignKey, MetaData
    from sqlalchemy import Integer, String, Date, DateTime, Float, Boolean, Text
    from sqlalchemy.orm import relationship
    from sqlalchemy.ext.declarative import declarative_base
    
    class Board(Base): 
        __tablename__ = 'board'
        id = Column(Integer, primary_key=True)
        name = Column(String(100)) 
        threads = relationship(back_populates='board')
    
    class Thread(Base):
        __tablename__ = 'thread'
        id = Column(Integer, primary_key=True)
        title = Column(String())
        link = Column(String())
        author = Column(String())
        post = Column(Text())
        reply_count = Column(Integer)
        read_count = Column(Integer)
    
        board_id = Column(Integer, ForeignKey('Board.id'))
        board = relationship('Board', back_populates='threads')
    
        posts = relationship('Post', back_populates='threads')
        
        author_id = Column(Integer, ForeignKey('Author.id'))
        author = relationship('Author', back_populates='threads')
    
    class Post(Base):
        __tablename__ = 'post'
        id = Column(Integer, primary_key=True)
        order = Column(Integer, nullable=True)
        author = Column(Text(), nullable=True)    # remove this line and instead use the relationship below
        content = Column(Text(), nullable=True)
        timestamp = Column(Text(), nullable=True)
    
        thread_id = Column(Integer, ForeignKey('Thread.id'))
        thread = relationship('Thread', back_populates='posts')
        
        author_id = Column(Integer, ForeignKey('Author.id')) 
        author = relationship('Author', back_populates='posts')
    
    class AuthorInfo(Base):
        __tablename__ = 'author'
        id = Column(Integer, primary_key=True)
        name = Column(String())
    
        posts = relationship('Post', back_populates='author') 
        threads = relationship('Thread', back_populates='author')
    

    【讨论】:

      猜你喜欢
      • 2010-11-23
      • 2018-03-06
      • 2014-09-26
      • 1970-01-01
      • 2011-02-06
      • 2012-07-15
      相关资源
      最近更新 更多