Source code:
# coding=utf-8
import requests
from lxml import etree


class ChaxunSpider:
    def __init__(self):
        self.start_url = 'http://111.40.232.237:9000/eoms35/sheet/complaint/complaint.do?method=performQuery'
        self.part_url = 'http://111.40.232.237:9000/eoms35/sheet/complaint/'
        self.headers = {
            'Connection': 'keep-alive',
            # Session cookie expires, so it has to be replaced with a fresh one before each run
            'Cookie': 'TSJSESSIONID=0000YvxNFfPYx8EBo8lsKNrKIl6:1bkt8lo7d',
            'Host': '111.40.232.237:9000',
            'Referer': 'http://111.40.232.237:9000/eoms35/sheet/complaint/complaint.do?method=showQueryPage&type=interface&urlType=complaint&userName=liuhaoce&workSerial=0&isDutyMaster=false&workSerialTime=&startDuty=&endDuty=',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36',
        }

    def parse_url(self, url):
        formdata = {
            'sheetIdStringExpression': 'like',
            'main.sheetId': '',  # work order serial number
            'titleStringExpression': 'like',
            'main.title': '',
            'main.status': '',
            'statusChoiceExpression': '0',
            'task.taskName': '',
            'sendRoleIdStringExpression': 'in',
            'main.sendRoleId': '',
            'sendDeptIdStringExpression': 'in',
            'main.sendDeptId': '',
            'sendUserIdStringExpression': 'in',
            'main.sendUserId': '',
            'operateRoleIdStringExpression': 'in',
            'link.operateRoleId': '',
            'operateDeptIdStringExpression': 'in',
            'link.operateDeptId': '',
            'operateUserIdStringExpression': 'in',
            'link.operateUserId': '',
            'toDeptIdStringExpression': 'in',
            'showArea': '大庆, 铁通',  # region accepting the complaint
            'main.toDeptId': '1005, 1021',
            'main.complaintType1': '',
            'complaintType1ChoiceExpression': '1010615100202',  # complaint type 1: home broadband service
            'main.complaintType2': '',
            'complaintType2ChoiceExpression': '',
            'main.complaintType': '',
            'main.complaintType4': '',
            'main.complaintType5': '',
            'main.complaintType6': '',
            'main.complaintType7': '',
            'complaintNumStringExpression': '',
            'main.complaintNum': '',
            'parentCorrelationStringExpression': '',
            'main.parentCorrelation': '',
            'customAttributionStringExpression': 'like',
            'main.customAttribution': '',
            'repeatComplaintTimesStringExpression': '>=',
            'main.repeatComplaintTimes': '',
            'complaintDescStringExpression': 'like',
            'main.complaintDesc': '',
            'main.sendTime': '',
            'sendTimeStartDateExpression': '>=',
            'sendTimeStartDate': '2020-02-02 20:13:35',  # query window start
            'sendTimeLogicExpression': 'and',
            'sendTimeEndDateExpression': '<=',
            'sendTimeEndDate': '2020-02-23 20:13:35',  # query window end
            'queryType': 'record',
        }
        response = requests.post(url, data=formdata, headers=self.headers)
        return response.content

    def get_content_list(self, html_raw):
        html = etree.HTML(html_raw)
        tr_list = html.xpath('//tbody/tr')  # each <tr> holds one complaint row
        content_list = []
        for content in tr_list:
            item = {}
            zineirong = content.xpath('./td')  # a row's fields sit in its <td> cells
            item['工单主题'] = zineirong[0].xpath('.//text()')[0]     # work order subject
            item['工单流水号'] = zineirong[1].xpath('./a/text()')[0]  # work order serial number
            # item['处理时限'] = zineirong[3].xpath('./text()')[0]    # handling deadline (disabled)
            detail_link = self.part_url + zineirong[1].xpath('./a/@href')[0]
            detail_dict = self.get_gongdan_detail(detail_link)
            item['xiangqing'] = detail_dict  # detail-page fields
            content_list.append(item)
        # The link text '下一页' means "next page"; when it is absent we are on the last page
        next_page = html.xpath("//a[text()='下一页']/@href")
        next_gongdan_url = self.part_url + next_page[0] if next_page else None
        return content_list, next_gongdan_url

    def get_gongdan_detail(self, url):
        html_raw = self.parse_url(url)
        html = etree.HTML(html_raw)
        xiangqing_dict = {}
        xiangqing_dict['投诉内容'] = html.xpath('//*[@id="complainttext"]/text()')  # complaint text
        # Dispatch target; it is rendered inside an iframe, so this XPath finds nothing
        xiangqing_dict['派往对象'] = html.xpath('//div[@id="ext-gen47"]/table/tbody/tr[4]/td[4]/text()')
        xiangqing_dict['qita'] = html.xpath('//*[@id="ext-gen47"]/text()')  # other fields
        return xiangqing_dict

    def save_content_list(self, content_list):
        for i, v in enumerate(content_list, start=1):
            print(i, v)

    def run(self):
        next_url = self.start_url  # main work order query page
        content_total_list = []
        while next_url is not None:
            html_raw = self.parse_url(next_url)  # fetch one page of the work order list
            content_list, next_url = self.get_content_list(html_raw)  # parse the rows, grab the next-page link
            content_total_list.extend(content_list)  # accumulate every page's rows
        self.save_content_list(content_total_list)  # print each collected work order


if __name__ == '__main__':
    spider = ChaxunSpider()
    spider.run()
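The hard-coded Cookie header is the weak point: TSJSESSIONID dies with the session, which is why it has to be swapped by hand before every run. A minimal sketch of the usual workaround follows, assuming the EOMS login is a plain form POST; the endpoint path and field names are hypothetical placeholders that would have to be read from the browser's network panel during a manual login. requests.Session then carries the cookie automatically.

import requests

session = requests.Session()
session.post(
    'http://111.40.232.237:9000/eoms35/login.do',      # hypothetical login URL
    data={'userName': 'liuhaoce', 'password': '***'},  # hypothetical form field names
)
# From here on, replace requests.post(...) inside parse_url with session.post(...)
# and drop the 'Cookie' entry from self.headers entirely.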
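The '派往对象' XPath fails because the field lives inside an iframe, and lxml only sees the outer document. A common fix is to read the iframe's src from the detail page, fetch that URL separately, and run the XPath against the iframe's own document. The sketch below shows the idea as a standalone helper; the //iframe selector and the inner-table XPath are assumptions that would need checking in the browser's developer tools.

from urllib.parse import urljoin

import requests
from lxml import etree

def get_dispatch_target(spider, detail_url):
    """Fetch the detail page's iframe document and read 派往对象 from it."""
    html = etree.HTML(spider.parse_url(detail_url))
    src_list = html.xpath('//iframe/@src')         # hypothetical: locate the detail iframe
    if not src_list:
        return []
    iframe_url = urljoin(detail_url, src_list[0])  # resolve a relative src against the page URL
    iframe_html = etree.HTML(requests.get(iframe_url, headers=spider.headers).content)
    # Same cell the original XPath targeted, now against the iframe's document
    return iframe_html.xpath('//table/tbody/tr[4]/td[4]/text()')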
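Finally, save_content_list only prints the results. If the goal is to keep them, a small standard-library variant can write the same items to CSV instead; this sketch reuses the exact keys built in get_content_list and stores the nested detail dict as plain text for simplicity.

import csv

def save_content_list(self, content_list):
    with open('gongdan.csv', 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.DictWriter(f, fieldnames=['工单主题', '工单流水号', 'xiangqing'])
        writer.writeheader()
        writer.writerows(content_list)  # non-string values are stringified by csv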