【需求】输入关键字,如书包,可以搜索出对应商品的信息,包括:商品标题、商品链接、价格范围;且最终的商品信息需要符合:包邮、价格差不会超过某数值
# -*- coding: utf-8 -*-
"""Search Taobao for a keyword and export the matching items to Excel.

For every item on the first ``page`` result pages the script collects the
title, detail link, lowest price, price range and shipping fee.  Every item
is appended to ``taobao_test.txt``; items with free shipping whose price
spread is below ``price_interval_max`` are also written to ``tb_result.xls``
(a fresh copy of the ``result.xls`` template in the working directory).

The script targets Python 2 (it relies on ``sys.setdefaultencoding``) and
needs ``requests``, ``selenium`` with PhantomJS, ``lxml``, ``xlrd`` and
``xlutils``.

The three settings below may be changed freely:
``search_keyword``, ``page`` and ``price_interval_max``.
"""
import os
import re
import shutil
import sys
import time

import requests
from lxml import etree
from selenium import webdriver
from xlrd import open_workbook
from xlutils.copy import copy

# Keyword to search for.
search_keyword = "戒指"
# Number of result pages to scan; Taobao shows 44 items per page by default.
page = 10
# Largest accepted difference between the highest and lowest price of an item.
price_interval_max = 1000

if sys.version_info[0] == 2:
    # Python 2 only: make implicit str <-> unicode conversions use UTF-8 so
    # the UTF-8 byte-string literals below can be concatenated with the
    # unicode text scraped from the pages.
    reload(sys)
    sys.setdefaultencoding("utf-8")

time1 = time.time()

# To use a phantomjs.exe located in the working directory, pass
# executable_path=phantomjs_path to PhantomJS() instead of the fixed path.
phantomjs_path = os.path.join(os.getcwd(), "phantomjs.exe")
driver = webdriver.PhantomJS(executable_path='D:/Python27/Scripts/phantomjs.exe')

search_url = 'https://s.taobao.com/search'
payload = {'q': search_keyword, 's': '1', 'ie': 'utf8'}  # URL query parameters
payload1 = {'ie': 'utf8'}  # not used by this script

# Start every run from a fresh copy of the template workbook.
excel_path_ori = os.path.join(os.getcwd(), "result.xls")
excel_path = os.path.join(os.getcwd(), "tb_result.xls")
if os.path.exists(excel_path):
    os.remove(excel_path)
shutil.copy(excel_path_ori, excel_path)

result_file = open('taobao_test.txt', 'w')

sheetName = "Sheet1"
# Column index of each exported field.
url_lineindex = 0
title_lineindex = 1
price_lineindex = 2
price_interval_lineindex = 3
interval_lineindex = 4
fee_lineindex = 5


def Write_Excel(rowIndex, lineIndex, content):
    """Write one cell of sheet ``sheetName`` in the result workbook and save it.

    :param rowIndex: row index of the cell
    :param lineIndex: column index of the cell
    :param content: value to store in the cell
    """
    # xlrd workbooks are read-only, so the workbook is copied into a writable
    # one, modified and saved back.  The second positional argument of
    # open_workbook is a log file, not a file mode, so none is passed.
    rbook = open_workbook(excel_path)
    wb = copy(rbook)
    sheetIndex = rbook.sheet_names().index(sheetName)
    wb.get_sheet(sheetIndex).write(int(rowIndex), int(lineIndex), content)
    wb.save(excel_path)


def get_detail_price(url):
    """Load an item's detail page in PhantomJS and extract its price text.

    :param url: detail page URL; a different XPath is used depending on
        whether it contains "tmall" or "taobao"
    :return: list of price strings found on the page; empty when nothing
        matched or the URL contains neither "tmall" nor "taobao"
    """
    driver.get(url)
    time.sleep(1)  # give the page a moment to render the price
    selector = etree.HTML(driver.page_source)
    if "tmall" in url:
        return selector.xpath(
            '//div[@class="tm-promo-price"]/span[@class="tm-price"]/text()')
    if "taobao" in url:
        return selector.xpath('//em[@class="tb-rmb-num"]/text()')
    return []


def get_price_interval(price):
    """Return the price range text and the spread between its two ends.

    Some items are priced as a range such as ``12.00-25.00``.

    :param price: list of price strings; only the first entry is used
    :return: tuple ``(price_interval, interval)`` where ``price_interval`` is
        the first price string and ``interval`` is ``end - start`` for a
        range, or ``0`` for a single price
    :raises ValueError: if ``price`` is empty
    """
    print(price)
    if not price:
        raise ValueError("no detail price found")
    price_interval = ''.join(price[0])
    if "-" in price_interval:
        bounds = price_interval.split("-")
        interval = float(bounds[1]) - float(bounds[0])
    else:
        interval = 0
    return price_interval, interval


def _write_row(row, values):
    """Write url, title, price, price range, spread and fee into ``row``."""
    columns = (url_lineindex, title_lineindex, price_lineindex,
               price_interval_lineindex, interval_lineindex, fee_lineindex)
    for column, value in zip(columns, values):
        Write_Excel(row, column, value)


def get_url_test():
    """Scrape the search result pages and export the item data.

    Every item whose detail price could be read is appended to the text
    file; items with a "0.00" shipping fee and a price spread below
    ``price_interval_max`` are also written to the Excel workbook.

    :return: None
    """
    row = 0
    _write_row(row, (u"商品链接", u"商品标题", u"最低价格",
                     u"价格范围", u"价格差", u"运费"))
    for k in range(page):
        # "s" is the offset of the first item: 1 -> page 1, 45 -> page 2, ...
        payload['s'] = 44 * k + 1
        resp = requests.get(search_url, params=payload)
        text = resp.text
        # The item data is embedded in the page as JSON; pull the fields out
        # with regular expressions.
        titles = re.findall(r'"raw_title":"([^"]+)"', text, re.I)
        prices = re.findall(r'"view_price":"([^"]+)"', text, re.I)
        urls = re.findall(r'"detail_url":"([^"]+)"', text, re.I)
        fees = re.findall(r'"view_fee":"([^"]+)"', text, re.I)
        # zip() stops at the shortest list, so a field missing from some
        # items cannot cause an IndexError.
        # NOTE(review): a missing field still shifts later values onto the
        # wrong item; parsing the embedded JSON would be needed to avoid that.
        items = zip(titles, prices, urls, fees)
        for i, (title, price, url, fee) in enumerate(items):
            print(i)
            print('商品标题:' + title)
            print('最低价格:' + price)
            print('运费:' + fee)
            # The link is JSON-escaped: undo the literal \u003d and \u0026
            # escape sequences and add the missing scheme.
            url = url.replace("\\u003d", "=").replace("\\u0026", "&")
            url = "https:" + url
            print('商品链接:' + url)
            try:
                # Price range and spread come from the detail page.
                detail_price = get_detail_price(url)
                price_interval, interval = get_price_interval(detail_price)
                print('price_interval:' + price_interval)
                print('interval:' + str(interval))
                # Keep a plain-text record of every item.
                result_file.write(
                    str(k * 44 + i + 1) + '商品链接:' + url + '\n' +
                    '商品标题:' + title + '\n' +
                    '最低价格:' + price + '\n' +
                    '价格范围:' + str(price_interval) + '\n' +
                    '价格差:' + str(interval) + '\n')
                # Only items that pass the filter go into the workbook.
                if fee == "0.00" and interval < int(price_interval_max):
                    print("该商品符合要求:包邮,且最大价格与最小价格差小于%s"
                          % price_interval_max)
                    row = row + 1
                    _write_row(row, (url, title, price,
                                     price_interval, interval, fee))
            except Exception as err:
                # Best effort: one failing item must not abort the whole run.
                print("该商品信息获取失败,跳过")
                print(repr(err))
                continue


try:
    get_url_test()
finally:
    # Clean up even when the scrape fails part-way through.
    result_file.close()
    driver.quit()

time2 = time.time()
print(u'ok,结束!')
print(u'总共耗时:' + str((time2 - time1) / 60) + '分钟')