突发奇想想读取12306的车票信息,最开始想用requests,但是突然又想试试selenium的无界面浏览器。有部分正则没调好,写好就懒得调了。
套用我师傅的话就是:我凭本事写的bug,凭什么要改!
二、方案思路
url = https://kyfw.12306.cn/otn/leftTicket/init
1、模拟用户查询车票信息的操作步骤,然后通过selenium去操作浏览器。
2、最后输出字典(每个车次对应一个字典,组成列表)。
三、源码
# -*- coding: utf-8 -*-
# __author__: "Klay Zhu"
# date: 2018/9/7
import re
from time import sleep

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.ui import WebDriverWait

# Matches a seat cell that carries an explicit remaining-ticket count.
_COUNT_RE = re.compile(r"<div>(?P<num>\d+)</div>")


class TicketQuery:
    """Query 12306 for remaining train tickets by driving a browser.

    The class fills in the search form on the 12306 "left ticket" page with
    Selenium, grabs the HTML of the result table and (in ``outData``) parses
    it with a regular expression into a list of dictionaries.

    Keyword arguments accepted by the constructor:
        browser: optional, an already-created Selenium WebDriver to use.
            When omitted a headless Chrome instance is started.
        fromStation: departure station name as typed into the form.
        toStation: destination station name as typed into the form.
        trainDate: travel date string written into the date field
            (the demo at the bottom of the file passes ``YYYY-MM-DD``).
    """

    # NOTE: the original author states this regex was never fully tuned;
    # it is kept byte-for-byte, only split across lines for readability.
    pattern = (
        r"<tr[^<>]*><td[^<>]*>\s*<div[^<>]*><div[^<>]*><div[^<>]*>"
        r"<a[^<>]*>(?P<train_num>[^<>]*)</a>[^<>]*<span[^<>]*></span></div>"
        r"<span[^<>]*><span[^<>]*>[^<>]*</span><b[^<>]*></b></span></div>"
        r"<div[^<>]*><strong[^<>]*>(?P<from_station>[^<>]*)</strong>"
        r"<strong[^<>]*>(?P<to_station>[^<>]*)</strong></div>"
        r"<div[^<>]*><strong[^<>]*>(?P<arrival_time>[^<>]*)</strong>"
        r"<strong[^<>]*>(?P<departure_time>[^<>]*)</strong></div>"
        r"<div[^<>]*><strong[^<>]*>(?P<over_time>[^<>]*)</strong>"
        r"<span[^<>]*>(?P<intraday>[^<>]*)</span></div></div></td>"
        r"<td[^<>]*?>(?P<param1>([^<>]{1,4})|(<div>\d+</div>))</td>\s*"
        r"<td[^<>]*?>(?P<param2>([^<>]{1,4})|(<div>\d+</div>))</td>\s*"
        r"<td[^<>]*?>(?P<param3>([^<>]{1,4})|(<div>\d+</div>))</td>\s*"
        r"<td[^<>]*?>(?P<param4>([^<>]{1,4})|(<div>\d+</div>))</td>\s*"
        r"<td[^<>]*?>(?P<param5>([^<>]{1,4})|(<div>\d+</div>))</td>\s*"
        r"<td[^<>]*?>(?P<param6>([^<>]{1,4})|(<div>\d+</div>))</td>\s*"
        r"<td[^<>]*?>(?P<param7>([^<>]{1,4})|(<div>\d+</div>))</td>\s*"
        r"<td[^<>]*?>(?P<param8>([^<>]{1,4})|(<div>\d+</div>))</td>\s*"
        r"<td[^<>]*?>(?P<param9>([^<>]{1,4})|(<div>\d+</div>))</td>\s*"
        r"<td[^<>]*?>(?P<param10>([^<>]{1,4})|(<div>\d+</div>))</td>\s*"
        r"<td[^<>]*?>(?P<param11>([^<>]{1,4})|(<div>\d+</div>))</td>\s*"
        r"<td[^<>]*?>.*?</a></td></tr>"
    )
    __url = r"https://kyfw.12306.cn/otn/leftTicket/init"

    def __init__(self, **parame):
        """Store the search parameters and obtain a WebDriver.

        Raises:
            KeyError: if ``fromStation``, ``toStation`` or ``trainDate`` is
                missing.
        """
        # Read the mandatory parameters first so that a missing key fails
        # before a browser process has been launched.
        self.str_from_station = parame["fromStation"]  # departure station
        self.str_to_station = parame["toStation"]  # destination station
        self.str_train_date = parame["trainDate"]  # travel date

        if "browser" in parame:
            self.obj_driver = parame["browser"]
            self._owns_driver = False
        else:
            # No browser supplied: start a headless Chrome of our own.
            options = Options()
            options.add_argument('-headless')  # headless switch
            # `options=` replaces the deprecated `chrome_options=`; the
            # driver binary is looked up on PATH as "chromedriver", which is
            # what the former explicit executable_path requested as well.
            self.obj_driver = webdriver.Chrome(options=options)
            self._owns_driver = True

    def waitElement(self, value, way=By.ID):
        """Block for up to 10 seconds until the located element is visible.

        Args:
            value: locator value (an element id by default).
            way: Selenium ``By`` strategy used with ``value``.

        Returns:
            True once the element is visible.

        Raises:
            selenium.common.exceptions.TimeoutException: raised by
                ``WebDriverWait.until`` when the element does not become
                visible in time.
        """
        WebDriverWait(self.obj_driver, 10).until(
            expected_conditions.visibility_of_element_located((way, value))
        )
        return True

    def __jsOperation(self, element_id, value):
        """Set the ``value`` of the element with ``element_id`` via JavaScript."""
        if self.waitElement(element_id):
            # Pass the id and value as script arguments instead of splicing
            # them into the JavaScript source, so quotes cannot break it.
            self.obj_driver.execute_script(
                "document.getElementById(arguments[0]).value = arguments[1];",
                element_id,
                value,
            )

    def __getStation(self, element_id, string):
        """Type ``string`` into a station input and pick the matching suggestion."""
        field = self.obj_driver.find_element(By.ID, element_id)
        field.clear()
        field.click()
        field.send_keys(string)
        # Suggestions are the elements with class "ralign"; click the one
        # whose text equals the requested station exactly.
        for candidate in self.obj_driver.find_elements(By.CLASS_NAME, "ralign"):
            if candidate.text == string:
                candidate.click()
                break

    def run(self):
        """Perform the search and return the result table's outer HTML.

        The driver is unusable afterwards: a driver created by this class is
        quit (also when the search fails, so no browser process is leaked);
        a caller-supplied driver has its window closed after a successful
        search and is left untouched on failure.

        Returns:
            The ``outerHTML`` of the element with id ``queryLeftTable``.
        """
        driver = self.obj_driver
        owns_driver = getattr(self, "_owns_driver", False)
        try:
            # Without this 2 s pause the page may not have refreshed and no
            # data can be fetched (per the original author's note).
            sleep(2)
            driver.get(self.__url)
            driver.maximize_window()
            self.__getStation("fromStationText", self.str_from_station)
            self.__getStation("toStationText", self.str_to_station)
            self.__jsOperation("train_date", self.str_train_date)
            # The search button is tried under two ids; fall back to the
            # second when the first cannot be found or clicked.
            try:
                driver.find_element(By.ID, "a_search_ticket").click()
            except WebDriverException:
                driver.find_element(By.ID, "query_ticket").click()
            # Same as above: give the result table time to refresh.
            sleep(2)
            html = driver.find_element(By.ID, "queryLeftTable").get_attribute(
                "outerHTML"
            )
            if not owns_driver:
                driver.close()
            return html
        finally:
            if owns_driver:
                # quit() also terminates the chromedriver process, which
                # close() alone leaves running.
                driver.quit()

    def outData(self, string=""):
        """Parse result-table HTML into a list of per-train dictionaries.

        Args:
            string: HTML to parse. When empty, ``run()`` is called to fetch
                it from the website.

        Returns:
            A list with one dict per regex match, with keys ``train_num``,
            ``from_station``, ``to_station``, ``arrival_time``,
            ``departure_time``, ``over_time``, ``intraday`` (True when the
            page text is "当日到达", i.e. same-day arrival) and ``data``.

            ``data`` holds 11 entries, per the original author's notes:
            [0] premier class, [1] first class, [2] second class,
            [3] deluxe soft sleeper, [4] soft sleeper, [5] EMU sleeper,
            [6] hard sleeper, [7] soft seat, [8] hard seat, [9] standing,
            [10] other. An int is the number of tickets left, True means
            tickets are available but the amount is unknown, False means
            none.
        """
        if not string:
            string = self.run()

        list_data = []
        for match in re.finditer(self.pattern, string):
            seats = []
            # param1..param11 inclusive: one entry per seat column.
            for index in range(1, 12):
                cell = match.group("param" + str(index))
                count = _COUNT_RE.search(cell)
                if count:
                    seats.append(int(count.group("num")))
                else:
                    # "有" is the page's marker for "tickets available".
                    seats.append(cell == "有")
            list_data.append({
                "train_num": match.group("train_num"),  # train number
                "from_station": match.group("from_station"),  # departure station
                "to_station": match.group("to_station"),  # arrival station
                "arrival_time": match.group("arrival_time"),
                "departure_time": match.group("departure_time"),
                "over_time": match.group("over_time"),  # journey duration
                "intraday": match.group("intraday") == "当日到达",
                "data": seats,
            })
        return list_data


if __name__ == '__main__':
    import datetime

    # Demo: query Beijing -> Shanghai for the date ten days from now.
    m_time = (datetime.datetime.now() + datetime.timedelta(days=10)).strftime("%Y-%m-%d")
    testdata = {
        "fromStation": "北京",
        "toStation": "上海",
        "trainDate": m_time,  # ten days from today
        # "browser": webdriver.Chrome()  # pass a visible browser when debugging
    }
    data = TicketQuery(**testdata).run()
    print(data)