【发布时间】:2019-12-05 18:22:35
【问题描述】:
我正在尝试对这个网站进行网络抓取，并在走完该网站的整个表单流程后下载可用的 PDF 文件。我为此使用 Scrapy。我在正确的时间获取验证码时遇到了一些问题。
此站点是一个 ASPX 网页,并使用“Viewstates”来跟踪每个 POST 请求。现在,如果您浏览此站点,您将了解到,每当您填写任何下拉字段时,它都会将带有“Viewstate”值的 POST 请求发送到某个 URL 路径,您可以在浏览器控制台中看到该路径。但同时,它向另一个 URL 发送另一个 GET 请求以获取“CAPTCHA”图像。但我无法得到这个回应。我不知道使用 Scrapy 是否可以同时捕获多个请求多个响应。
现在,我试图找到解决此问题的方法。我已经关注了StackOverflow 帖子中提到的几乎所有内容,但作为回应,我收到了带有 javascript 警报代码的 HTML 代码,其中提到“插入了错误的文本,请输入图像文本框中显示的新字符”。所以,这个解决方案也不适合我。
这是我的爬虫代码:
# -*- coding: utf-8 -*-
import scrapy
import cv2
import pytesseract
from PIL import Image
from io import BytesIO
from election_data.items import ElectionDataItem
class ElectionSpider(scrapy.Spider):
    """Drive the SearchRollPDF.aspx form step by step and submit its captcha.

    The form is posted once per dropdown (district, assembly, part); every
    post echoes the hidden ``__VIEWSTATE`` / ``__EVENTVALIDATION`` values read
    from the page that was just received.  After the last dropdown the captcha
    image is fetched, run through OCR, and the form is submitted with the
    recognised text.  The final response is handed to :meth:`get_pdfs`.
    """

    name = 'election'
    allowed_domains = ['ceo.maharashtra.gov.in']
    start_urls = ['https://ceo.maharashtra.gov.in/searchlist/SearchRollPDF.aspx']
    dist_dict = []

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _hidden_state(response):
        """Return the hidden postback fields as found on ``response``."""
        return {
            '__EVENTARGUMENT': '',
            '__LASTFOCUS': '',
            '__VIEWSTATE': response.css('input#__VIEWSTATE::attr(value)').extract_first(),
            '__EVENTVALIDATION': response.css('input#__EVENTVALIDATION::attr(value)').extract_first(),
        }

    def _second_option(self, response, select_id):
        """Return the value of the second ``<option>`` of ``select#select_id``.

        Returns ``None`` (after logging a warning) when the dropdown has fewer
        than two options, e.g. on an error page that reached the callback
        because ``handle_httpstatus_all`` is set.
        """
        values = response.css('select#%s > option::attr(value)' % select_id).extract()
        if len(values) < 2:
            self.logger.warning(
                'select#%s has %d option(s) on %s; cannot continue',
                select_id, len(values), response.url,
            )
            return None
        return values[1]

    def _postback(self, response, select_id, selections, callback, carry):
        """Build the POST that reports a change of dropdown ``select_id``.

        ``selections`` holds the dropdown form fields chosen so far and
        ``carry`` is the meta the next callback needs.
        """
        data = {
            '__EVENTTARGET': response.css('select#%s::attr(name)' % select_id).extract_first(),
        }
        data.update(self._hidden_state(response))
        data.update(selections)
        data['ctl00$Content$txtcaptcha'] = ''

        meta = {'handle_httpstatus_all': True}
        meta.update(carry)
        return scrapy.FormRequest(
            url=self.start_urls[0],
            method='POST',
            formdata=data,
            meta=meta,
            callback=callback,
        )

    # ------------------------------------------------------------------
    # Callbacks
    # ------------------------------------------------------------------
    def parse(self, response):
        """Select the second district option and post the form."""
        district = self._second_option(response, 'Content_DistrictList')
        if district is None:
            return
        yield self._postback(
            response,
            'Content_DistrictList',
            {'ctl00$Content$DistrictList': district},
            self.parse_assembly,
            {'district': district},
        )

    def parse_assembly(self, response):
        """Select the second assembly option and post the form."""
        print('parse_assembly')
        assembly = self._second_option(response, 'Content_AssemblyList')
        if assembly is None:
            return
        district = response.meta['district']
        yield self._postback(
            response,
            'Content_AssemblyList',
            {
                'ctl00$Content$DistrictList': district,
                'ctl00$Content$AssemblyList': assembly,
            },
            self.parse_part,
            {'district': district, 'assembly': assembly},
        )

    def parse_part(self, response):
        """Select the second part option and post the form."""
        print('parse_part')
        part = self._second_option(response, 'Content_PartList')
        if part is None:
            return
        district = response.meta['district']
        assembly = response.meta['assembly']
        yield self._postback(
            response,
            'Content_PartList',
            {
                'ctl00$Content$DistrictList': district,
                'ctl00$Content$AssemblyList': assembly,
                'ctl00$Content$PartList': part,
            },
            self.parse_captcha,
            {'district': district, 'assembly': assembly, 'part': part},
        )

    def parse_captcha(self, response):
        """Request the captcha image, keeping the form page for the final post."""
        yield scrapy.Request(
            url='https://ceo.maharashtra.gov.in/searchlist/Captcha.aspx',
            callback=self.store_image,
            meta={
                'district': response.meta['district'],
                'assembly': response.meta['assembly'],
                'part': response.meta['part'],
                # The page returned by the part postback: its form (and its
                # hidden state) is what gets submitted with the captcha text.
                'data_for_later': response,
            },
            # The captcha URL never changes; without this the duplicate
            # filter would drop every captcha request after the first one.
            dont_filter=True,
        )

    def store_image(self, response):
        """Save the captcha image, OCR it and submit the form."""
        captcha_target_filename = 'filename.png'
        # Save the image to disk because solve_captcha reads it by filename.
        Image.open(BytesIO(response.body)).save(captcha_target_filename)
        captcha_text = self.solve_captcha(captcha_target_filename)
        print(captcha_text)

        form_page = response.meta['data_for_later']
        data = {'__EVENTTARGET': ''}
        # Take the hidden state from the page being submitted, not from an
        # earlier step: values captured before the part postback are stale.
        data.update(self._hidden_state(form_page))
        data.update({
            'ctl00$Content$DistrictList': response.meta['district'],
            'ctl00$Content$AssemblyList': response.meta['assembly'],
            'ctl00$Content$PartList': response.meta['part'],
            'ctl00$Content$txtcaptcha': captcha_text,
            'ctl00$Content$OpenButton': 'Open PDF',
        })
        yield scrapy.FormRequest.from_response(
            form_page,
            method='POST',
            formdata=data,
            meta={'handle_httpstatus_all': True},
            callback=self.get_pdfs,
        )

    def get_pdfs(self, response):
        """Receive the response to the captcha submission and print its text."""
        # THIS IS WHERE FINAL RESPONSE IS CAPTURED
        print(response.text)

    def solve_captcha(self, image):
        """Return the OCR text of the captcha image stored at path ``image``.

        The image is read as grayscale, thresholded, morphologically closed
        and inverted before being passed to Tesseract.  Surrounding
        whitespace in the OCR output is removed so it is not posted as part
        of the captcha answer.
        """
        gray = cv2.imread(image, 0)
        thresh = cv2.threshold(gray, 220, 255, cv2.THRESH_BINARY)[1]
        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
        close = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel)
        result = 255 - close
        # No cv2.imshow here: a crawler has no GUI loop, and the call raises
        # on headless OpenCV builds.
        return pytesseract.image_to_string(result).strip()
如果您浏览上述网站并填写所有表单详细信息，同时观察浏览器控制台的网络（Network）选项卡，您就会对这个问题有所了解。
请指导我如何解决此问题。谢谢。
【问题讨论】: