概览页抓取链接
"""Overview-page crawler: collects product detail-page links from lvmama.com
channel/search pages and stores them in the ``gly`` MySQL table."""
import requests
import re
import pymysql
import hashlib
import datetime


class Demo(object):
    """Scrape product links from a fixed list of lvmama.com channel pages.

    Each entry in ``channel_link`` pairs positionally with the human-readable
    name at the same index in ``channel_name``.
    """

    def __init__(self):
        # MySQL connection settings.
        # NOTE(review): credentials are hard-coded; consider moving them to
        # configuration or environment variables.
        self.host = '127.0.0.1'
        self.db = 'app_mark'
        self.user = 'root'
        self.passwd = '123456'
        self.charset = 'utf8mb4'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
        }
        self.url = 'http://www.lvmama.com/'
        # Channel search-result pages to crawl (keyword is URL-encoded Chinese).
        self.channel_link = [
            'http://s.lvmama.com/group/H13K110000?keyword=%E6%99%AE%E5%90%89%E5%B2%9B&k=0#list',  # islands
            'http://s.lvmama.com/route/H13K310000?keyword=%E6%96%B0%E5%8A%A0%E5%9D%A1&k=0#list',  # Southeast Asia
            'http://s.lvmama.com/route/H13K310000?keyword=%E9%A6%99%E6%B8%AF&k=0#list',  # HK/Macao/Taiwan
            'http://s.lvmama.com/group/H13K110000?keyword=%E8%BF%AA%E6%8B%9C&k=0#list',  # Dubai
            'http://s.lvmama.com/group/C262H13K110000?keyword=%E4%BF%84%E7%BD%97%E6%96%AF&tabType=group#list',  # Russia
            'http://s.lvmama.com/group/H13K110000Y4?keyword=%E8%B6%8A%E5%8D%97#list#list',  # Vietnam
            'http://s.lvmama.com/group/C265H13K110000?keyword=%E6%B3%95%E5%9B%BD&tabType=group#list%22',  # France/Switzerland/Italy/Germany
            'http://s.lvmama.com/group/H13K110000?keyword=%E5%B7%B4%E5%8E%98%E5%B2%9B&k=0#list',  # Bali
            'http://s.lvmama.com/route/H13K310000?keyword=%E6%97%A5%E6%9C%AC&k=0#list',  # Japan
            'http://s.lvmama.com/route/H13K310000?keyword=%E6%AC%A7%E6%B4%B2&k=0#list',  # Europe
            'http://s.lvmama.com/route/H13K440100?keyword=%E6%96%B0%E5%8A%A0%E5%9D%A1&k=0#list',  # Singapore
            'http://s.lvmama.com/route/H13K310000?keyword=%E9%A6%99%E6%B8%AF&k=0#list',  # Hong Kong
            'http://s.lvmama.com/route/H13K310000?keyword=%E6%BE%B3%E6%B4%B2&k=0#list',  # Australia
            'http://s.lvmama.com/route/H13K310000?keyword=%E6%B3%B0%E5%9B%BD&k=0#list',  # Thailand
            'http://s.lvmama.com/route/H13K440300?keyword=%E4%B8%89%E4%BA%9A&k=0#list',  # Sanya
            'http://s.lvmama.com/route/H13K440300P2?keyword=%E4%B8%89%E4%BA%9A&tabType=route350',  # Sanya p2
            'http://s.lvmama.com/route/H13K440300P3?keyword=%E4%B8%89%E4%BA%9A&tabType=route350',  # Sanya p3
            'http://s.lvmama.com/route/H13K440300P4?keyword=%E4%B8%89%E4%BA%9A&tabType=route350',  # Sanya p4
            'http://s.lvmama.com/route/H13K440300?keyword=%E5%8E%A6%E9%97%A8&k=0#list',  # Xiamen
            'http://s.lvmama.com/route/H13K440300?keyword=%E5%B9%BF%E4%B8%9C&k=0#list',  # Guangdong
            'http://s.lvmama.com/route/H13K440300?keyword=%E4%BA%91%E5%8D%97&k=0#list',  # Yunnan
            'http://s.lvmama.com/route/H13K440300?keyword=%E4%B8%8A%E6%B5%B7&k=0#list',  # Shanghai
            'http://s.lvmama.com/route/H13K440300?keyword=%E8%A5%BF%E5%AE%89&k=0#list',  # Xi'an
            'http://s.lvmama.com/route/H13K440300?keyword=%E6%88%90%E9%83%BD&k=0#list',  # Chengdu
            'http://s.lvmama.com/route/H13K440300?keyword=%E5%90%89%E6%9E%97&k=0#list',  # Jilin
            'http://s.lvmama.com/route/H13K440300?keyword=%E8%A5%BF%E5%8C%97&k=0#list',  # Northwest
            'http://s.lvmama.com/scenictour/K110000?keyword=%E5%8C%97%E4%BA%AC&k=0#list',  # Beijing
            'http://s.lvmama.com/scenictour/K110000?keyword=%E5%B1%B1%E4%B8%9C&k=0#list',  # Shandong
            'http://s.lvmama.com/scenictour/K110000?keyword=%E5%B1%B1%E8%A5%BF&k=0#list',  # Shanxi
            'http://s.lvmama.com/scenictour/K110000?keyword=%E6%B2%B3%E5%8C%97&k=0#list',  # Hebei
            'http://s.lvmama.com/scenictour/K110000?keyword=%E8%BE%BD%E5%AE%81&k=0#list',  # Liaoning
        ]
        # Display names, index-aligned with channel_link above.
        self.channel_name = [
            '海岛',
            '东南亚',
            '中国港澳台',
            '迪拜',
            '俄罗斯',
            '越南',
            '法瑞意德',
            '巴厘岛',
            '日本',
            '欧洲',
            '新加坡',
            '香港',
            '澳洲',
            '泰国',
            '三亚',
            '三亚p2',
            '三亚p3',
            '三亚p4',
            '厦门',
            '广东',
            '云南',
            '上海',
            '西安',
            '成都',
            '吉林',
            '西北',
            '北京',
            '山东',
            '山西',
            '河北',
            '辽宁',
        ]

    def get_html(self, url):
        """Fetch *url* and return its body text decoded with the detected charset.

        A timeout is set so a stalled server cannot hang the crawl forever.
        """
        response = requests.get(url, headers=self.headers, timeout=15)
        # apparent_encoding sniffs the real charset from the content, which is
        # more reliable than the (often missing) Content-Type header here.
        response.encoding = response.apparent_encoding
        return response.text

    def get_data(self):
        """Walk every channel page and save each product detail link found.

        Pages whose expected product-list markup is absent (layout change,
        blocked request) are logged and skipped instead of raising IndexError.
        """
        for name, channel in zip(self.channel_name, self.channel_link):
            html = self.get_html(channel)
            # Bound the search to the product-list region of the page.
            region = re.findall(r'<div class="product-left".*<div class="paging orangestyle"', html, re.S)
            if not region:
                print('no product list found:', name)
                continue
            for section in re.findall(r'<div class="product-section">.*?</div>', region[0], re.S):
                print(name)
                links = re.findall(r'<a href="(.*?)"', section, re.S)
                if links:
                    # First anchor in the section is the detail-page link.
                    self.save_data(links[0])

    def save_data(self, url):
        """Insert *url* into the ``gly`` table keyed by its md5 hash.

        tag='0' marks the link as not yet parsed by the detail-page crawler.
        Duplicate keys (or any other DB error) roll back and are logged so a
        single failure does not abort the whole crawl.
        """
        print(url)
        hkey = hashlib.md5(url.encode(encoding='utf-8')).hexdigest()
        sitename = '驴妈妈旅游'
        lasttime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        tag = '0'
        row = [url, hkey, tag, sitename, lasttime]
        con = pymysql.connect(host=self.host, db=self.db, user=self.user,
                              passwd=self.passwd, charset=self.charset)
        try:
            cur = con.cursor()
            sql = 'insert into gly(link, hkey, tag, sitename, lasttime) values (%s, %s, %s, %s, %s)'
            try:
                cur.execute(sql, row)
            except Exception as e:
                con.rollback()
                print('error~', e)
            else:
                con.commit()
                print('insert success')
            cur.close()
        finally:
            # Always release the connection, even if execute/commit blew up.
            con.close()


if __name__ == '__main__':
    demo = Demo()
    demo.get_data()
细览页解析字段
"""Detail-page parser: reads product links from the ``gly`` table, scrapes
each lvmama.com product page in a thread pool, and stores the parsed fields
into the ``lvmama`` table."""
import pymysql
import re
import requests
from multiprocessing.dummy import Pool as ThreadPool
import datetime


class XLY(object):
    """Multi-threaded detail-page scraper for lvmama.com product pages."""

    def __init__(self):
        # MySQL connection settings.
        # NOTE(review): credentials are hard-coded; consider config/env vars.
        self.host = '127.0.0.1'
        self.db = 'app_mark'
        self.user = 'root'
        self.passwd = '123456'
        self.charset = 'utf8mb4'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
        }
        # Wall-clock start, used by __main__ to report total elapsed time.
        self.start = datetime.datetime.now()

    def get_data(self):
        """Return rows of links to parse from ``gly`` and advance the tag state.

        Returns a tuple of ``(link,)`` rows, or ``None`` on a database error.

        NOTE(review): this selects rows with tag="1" and *then* flips tag 0->1,
        so links freshly inserted by the overview crawler are only parsed on
        the next run. Confirm this two-phase behaviour is intentional; if not,
        the select should target tag="0" and the update should run after
        parsing succeeds.
        """
        con = pymysql.connect(host=self.host, db=self.db, user=self.user,
                              passwd=self.passwd, charset=self.charset)
        try:
            cur = con.cursor()
            sql = 'select link from gly where tag = "1" and sitename="驴妈妈旅游"'
            after_sql = 'update gly set tag="1" where tag="0" and sitename = "驴妈妈旅游"'
            try:
                cur.execute(sql)
                results = cur.fetchall()
                cur.execute(after_sql)
            except Exception as e:
                con.rollback()
                results = None
                print('error~', e)
            else:
                con.commit()
            cur.close()
        finally:
            con.close()
        return results

    @staticmethod
    def _first_match(html, patterns):
        """Return the first capture group of the first pattern that matches
        *html* (tried in order, DOTALL), or None if none match."""
        for pattern in patterns:
            found = re.findall(pattern, html, re.S)
            if found:
                return found[0]
        return None

    def parse_data(self, url):
        """Parse one ``(link,)`` row: fetch the page, extract the product
        fields, and insert them via :meth:`save_data`.

        Hotel and scenic-spot pages use different layouts and are skipped
        (before fetching, to avoid a wasted request).
        """
        print(url)
        url = url[0]
        if 'scenic' in url or 'hotel' in url:
            return
        # Product id is the last path segment, minus any query string.
        item_id = re.sub(r'\?.*', '', url.split('/')[-1])
        response = requests.get(url, headers=self.headers, timeout=15)
        html = response.text
        # --- title ---
        title = self._first_match(html, [
            r'<h.*?tit">(.*?)</h.*?>',
            r'<p class="nchtitle">(.*?)</p>',
        ])
        if title:
            # Strip newlines, nbsp entities, the "self-operated" badge and tags.
            title = re.sub(r'\n|\r|&nbsp;|自营|<[\s\S]*?>', '', title).strip()
        else:
            title = None  # never leave an un-matched [] to hit the DB layer
        # --- price (several page templates, tried in order) ---
        price = self._first_match(html, [
            r'<dfn.*?>(\d+)</dfn>',
            r'<span class="product_price">.*?(\d+).*?</span>',
            r'&yen;<em>(\d+)</em>',
            r'<span class="product-price-value">.*?(\d+).*?</span>',
        ])
        # --- praise rate (several page templates, tried in order) ---
        praise = self._first_match(html, [
            r'<p class="product_top_dp">[\s\S]*?<span>([\s\S]*?)</span>[\s\S]*?</p>',
            r'<a href="#pro_comment".*?<span>([\s\S]*?)</span>',
            r'<span class="c_f60">([\s\S]*?)</span>',
            r'<p class="product_top_dp">[\s\S]*?<span>([\s\S]*?)<small>%</small>[\s\S]*?</span>',
            r'<span class="val">([\s\S]*?)</span>',
        ])
        if praise:
            praise = re.sub(r'<.*?>', '', praise).strip().replace('%', '')
            try:
                praise = float(praise)
            except ValueError:
                # Scraped text was not purely numeric; don't kill the worker.
                praise = None
                print('好评率抓取错误')
            else:
                if praise > 100:
                    praise = None
                    print('好评率抓取错误')
        else:
            praise = None
        # --- departure city / destination / trip length ---
        starting_city = target_city = days_spent = None
        start_match = re.findall(
            r'<dl class="info-city">[\s\S]*?出发城市[\s\S]*?<ii>([\s\S]*?)</ii></dd>', html, re.S)
        if start_match:
            starting_city = re.sub(r'<.*?>', '', start_match[0])
            dest_match = re.findall(r'<dt>目的地[\s\S]*?<dd>([\s\S]*?)</dd>', html, re.S)
            if dest_match:
                target_city = re.sub(r'<.*?>', '', dest_match[0])
            days_match = re.findall(r'<dt>出游天数[\s\S]*?<dd>([\s\S]*?)</dd>', html, re.S)
            if days_match:
                days_spent = re.sub(r'<.*?>', '', days_match[0])
        # --- product type (several page templates, tried in order) ---
        type_ = self._first_match(html, [
            r'<i class="t-category">([\s\S]*?)</i>',
            r'<span class="product_top_type product_type_zyx">([\s\S]*?)</span>',
            r'<span class="dpn_group">([\s\S]*?)</span>',
        ])
        list_data = [item_id, title, price, praise, starting_city,
                     target_city, days_spent, type_, url]
        self.save_data(list_data)

    def save_data(self, list_data):
        """Insert one parsed product row into the ``lvmama`` table.

        Errors roll back and are logged so one bad row doesn't abort the pool.
        """
        con = pymysql.connect(host=self.host, db=self.db, user=self.user,
                              passwd=self.passwd, charset=self.charset)
        try:
            cur = con.cursor()
            sql = ('insert into lvmama(id_num, title, price, praise, starting_city, '
                   'target_city, days_spent, type_, link) '
                   'values (%s, %s, %s, %s, %s, %s, %s, %s, %s)')
            try:
                cur.execute(sql, list_data)
            except Exception as e:
                con.rollback()
                print('error~', e)
            else:
                con.commit()
                print('insert success')
            cur.close()
        finally:
            # Each worker thread owns its own connection; always release it.
            con.close()


if __name__ == '__main__':
    xly = XLY()
    urls = xly.get_data()
    if urls:
        # Thread pool (multiprocessing.dummy) — appropriate for I/O-bound work.
        pool = ThreadPool(20)
        pool.map(xly.parse_data, urls)
        pool.close()
        pool.join()
        end = datetime.datetime.now()
        print('耗时:', (end - xly.start))