【Question title】: Scrapy selenium middleware memory leak
【Posted】: 2016-01-28 15:16:11
【Question】:

I am using a selenium middleware in my scrapy crawler:

from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support.ui import WebDriverWait

class JSMiddleware(object):
    def __init__(self):
        dcaps = dict(DesiredCapabilities.PHANTOMJS)
        service = ['--ignore-ssl-errors=true', '--ssl-protocol=any', '--web-security=false']
        dcaps.update({'handlesAlerts': False, 'javascriptEnabled': True, 'takesScreenshot': False})
        dcaps["phantomjs.page.settings.userAgent"] = (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36")
        self.driver = webdriver.PhantomJS(desired_capabilities=dcaps, service_args=service)
        self.driver.set_window_size(1120, 550)
        self.driver.set_page_load_timeout(15)

    def ajax_complete(self, driver):
        jquery=False
        jscomplete=False
        try:
            jquery = (0 == driver.execute_script("return jQuery.active"))
        except WebDriverException:
            pass

        try:
            if driver.execute_script("return document.readyState") == "complete":
                jscomplete = True
        except WebDriverException:
            pass        
        return jquery and jscomplete

    def process_request(self, request, spider):        
        self.driver.get(request.url)      
        WebDriverWait(self.driver, 20).until(
                                             self.ajax_complete, "Wait till loaded")
        body = self.driver.page_source
        response = HtmlResponse(self.driver.current_url, body=body, encoding='utf-8', request=request)        
        return response

When the crawler has finished, it waits for new incoming jobs (the job information is basically retrieved via MySQL).

The problem is that PhantomJS stays open and therefore causes a memory leak. How and where should I close it?

Here is my spider:

from bs4 import BeautifulSoup
from items import Item
from jobs import DoneJob
from model import CrawlerSettings
import re
from readability.readability import Document
from scrapy.exceptions import CloseSpider
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider
from scrapy.spiders import Rule
from urlparse import urlparse
from utility import MysqlConnector

class MySpider(CrawlSpider):          

    def __init__(self, job):    
        self.counter = 0
        self.itemCounter = 0
        #Get the hosts
        self.job = job        
        allowedDomainsPre = job.url.split(",")
        allowedDomains = []        
        for domains in allowedDomainsPre:
            parsed_uri = urlparse(domains)
            domain = '{uri.netloc}'.format(uri=parsed_uri)
            print "DOMAIN"
            print domain
            allowedDomains.append(domain)

        self.allowed_domains = allowedDomains
        self.start_urls = allowedDomainsPre
        #Get job patterns
        jobPatterns = job.processing_patterns.split(",")
        allowedPatterns = []
        deniedPatterns = []
        for pattern in jobPatterns:
            if '-' in pattern:
                deniedPatterns.append(pattern.replace("-", ""))
            else:
                allowedPatterns.append(pattern)        

        self._rules = [
            Rule(LinkExtractor(allow=(allowedPatterns), deny=(deniedPatterns)), callback=self.parse_items, follow=True)
                ]    
        self.name = job.id   
        self.settings = CrawlerSettings.normal_settings

    def closed(self, spider):
        #stats = spider.crawler.stats.get_stats()
        itemCount = 0
        if self.itemCounter:
            itemCount = self.itemCounter  
        DoneJob.DoneJob().jobDone(self.job, itemCount)

    def parse_items(self, response):           
        item = Item()
        if self.counter >= 30:
            self.checkActive()
        #if the user wants a minimum description             
        if self.job.min_description > 0:            
            item['html'] = response.body  
            item['url'] = response.url
            #Job
            item['job'] = {}
            item['job']['id'] = self.job.id
            item['job']['user_id'] = self.job.user_id
            item['job']['name'] = self.job.name
            item['job']['url'] = self.job.url
            item['job']['api'] = self.job.api
            item['job']['max_pages'] = self.job.max_pages
            item['job']['crawl_depth'] = self.job.crawl_depth
            item['job']['processing_patterns'] = self.job.processing_patterns
            item['job']['days'] = self.job.days
            item['job']['ajax'] = self.job.ajax
            item['job']['min_description'] = self.job.min_description  
            soup = BeautifulSoup(response.body, 'html.parser')        
            article = Document(soup.prettify()).summary()
            article_soup = BeautifulSoup(article) 
            text = re.sub(' +', ' ', article_soup.get_text().rstrip())
            text_length = len(text.split(' '))            
            if text_length > self.job.min_description:
                self.counter = self.counter + 1
                self.itemCounter=self.itemCounter+1
                return item
        else:
            item['html'] = response.body  
            item['url'] = response.url
            item['job'] = {}
            #Job
            item['job']['id'] = self.job.id
            item['job']['user_id'] = self.job.user_id
            item['job']['name'] = self.job.name
            item['job']['url'] = self.job.url
            item['job']['api'] = self.job.api
            item['job']['max_pages'] = self.job.max_pages
            item['job']['crawl_depth'] = self.job.crawl_depth
            item['job']['processing_patterns'] = self.job.processing_patterns
            item['job']['days'] = self.job.days
            item['job']['ajax'] = self.job.ajax
            item['job']['min_description'] = self.job.min_description     
            self.counter = self.counter + 1
            self.itemCounter=self.itemCounter+1
            return item


    def checkActive(self): 
        self.counter = 0      
        mysql = MysqlConnector.Mysql()
        db = mysql.getConnection()
        cur = db.cursor()
        cur.execute("SELECT status FROM job WHERE id=" + str(self.job.id))
        for row in cur.fetchall():            
            status = int(row[0])
            break  
        db.close() 
        if status == 3:
            raise CloseSpider(reason='Job cancelled')

And here is my initialization:

from jobs import GetJob
import time
from twisted.internet import reactor
from twisted.internet import task

def schedule():
    jobs = GetJob.Job()
    jobs.getJobs()

if __name__ == "__main__":    
    t = task.LoopingCall(schedule)    
    t.start(15)
    reactor.run()

【Comments】:

    Tags: mysql selenium memory-leaks scrapy


    【Solution 1】:

    I think you can use signals to perform some actions when the spider ends:

    from scrapy import signals

    class JSMiddleware(object):
        @classmethod
        def from_crawler(cls, crawler):
            return cls(crawler)
    
        def __init__(self, crawler):
            ...
            crawler.signals.connect(self.spider_closed, signals.spider_closed)
    
        def spider_closed(self, spider):
            # actions when spider ends
            ...
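
    A minimal sketch of how this could fit together, assuming the driver is created in __init__ as in the question's middleware (the PhantomJS capabilities and service args are abbreviated here):

    from scrapy import signals
    from selenium import webdriver

    class JSMiddleware(object):
        @classmethod
        def from_crawler(cls, crawler):
            # Scrapy calls this with the running crawler and uses the returned instance
            return cls(crawler)

        def __init__(self, crawler):
            # abbreviated: pass your desired_capabilities / service_args here
            self.driver = webdriver.PhantomJS()
            # call spider_closed when this spider finishes
            crawler.signals.connect(self.spider_closed, signals.spider_closed)

        def process_request(self, request, spider):
            self.driver.get(request.url)
            # ... wait for ajax and return the HtmlResponse as before

        def spider_closed(self, spider):
            # quit() shuts down the PhantomJS process so it no longer lingers
            self.driver.quit()

    Note that driver.quit() (rather than driver.close()) is what terminates the PhantomJS process; close() only closes the current window.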
    

    【Discussion】:

    • Just a thought: several crawlers can run at the same time -> wouldn't this solution interrupt the other crawlers?
    • Each job (each spider run) creates its own process, so they do not interfere with one another.
    • Let me know if this helps
    • But where do I get the crawler from? (sorry - not that much into python)