使用 PyQuery + Selenium 抓取淘宝商品信息

配置文件（config.py）：配置 MongoDB 地址、数据库名称、表（集合）名称、PhantomJS 启动参数、要搜索的产品关键词，以及最多爬取的页数。

# MongoDB connection settings: server address, database name and
# collection ("table") name used by the scraper.
MONGO_URL = 'localhost'
MONGO_DB = 'taobao'
MONGO_TABLE = 'phone'

# Command-line switches handed to PhantomJS when the browser is started.
SERVICE_ARGS = [
    '--disk-cache=true',  # enable PhantomJS's disk cache
    '--load-images=false',  # do not load images
]

# Search keyword typed into the Taobao search box.
KEYWORD = '手机'
# Upper bound on the number of result pages to crawl.
MAXPAGE = 5

主程序

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date    : 2018-06-14 22:02:26
# @Author  : Chenjun (320316430@qq.com;)
# @Link    : http://example.org
# @Version : $Id$

import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from pyquery import PyQuery as pq
from config import *
import pymongo   #使用mongodb数据库存储，在此python提供pymongo库方便使用

# Module-level setup: one MongoDB client/database handle and one PhantomJS
# browser session (with its explicit-wait helper), created at import time
# and used by the functions in this file.
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]
# NOTE(review): PhantomJS support was deprecated in Selenium 3.8 and dropped
# in Selenium 4 -- confirm the pinned Selenium version still provides
# webdriver.PhantomJS before relying on this.
browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)  # headless PhantomJS browser, configured through SERVICE_ARGS
browser.set_window_size(1400, 900)
wait = WebDriverWait(browser, 10)  # explicit wait: allow up to 10 seconds for elements to load
 

def search(max_retries=3):
    """Search Taobao for KEYWORD and scrape the first page of results.

    Opens the Taobao home page, types KEYWORD into the search box, submits
    the form, stores the products of the first result page via
    ``get_products()`` and finally reads the pager's "total pages" element.

    A timeout anywhere in that sequence restarts the whole sequence. The
    number of restarts is bounded so a permanently broken page cannot
    recurse/loop forever.

    Args:
        max_retries: Number of additional attempts after the first one
            times out. Values below zero are treated as zero.

    Returns:
        The text of the "total pages" element of the result pager.

    Raises:
        TimeoutException: If every attempt timed out.
    """
    attempts = max(max_retries, 0) + 1
    for attempt in range(attempts):
        print('正在搜索...')
        try:
            browser.get('https://www.taobao.com')
            # Wait for the search box to be present in the DOM.
            search_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#q')))
            # Wait for the search button to become clickable.
            search_button = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR,
                 '#J_TSearchForm > div.search-button > button')))
            search_box.send_keys(KEYWORD)  # simulate the user typing
            search_button.click()  # simulate the user clicking
            get_products()
            # The pager element whose text contains the total page count.
            total = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR,
                 '#mainsrp-pager > div > div > div > div.total')))
            return total.text
        except TimeoutException:
            if attempt == attempts - 1:
                raise


def next_page(page_number, max_retries=3):
    """Jump to result page ``page_number`` and scrape its products.

    Types the page number into the pager's input box, clicks the jump
    button, waits until the pager highlights that page number as active
    and then stores the page's products via ``get_products()``.

    A timeout anywhere in that sequence restarts the whole sequence. The
    number of restarts is bounded so a permanently broken page cannot
    recurse/loop forever.

    Args:
        page_number: The result page to jump to.
        max_retries: Number of additional attempts after the first one
            times out. Values below zero are treated as zero.

    Raises:
        TimeoutException: If every attempt timed out.
    """
    pager = '#mainsrp-pager > div > div > div'
    attempts = max(max_retries, 0) + 1
    for attempt in range(attempts):
        print('正在翻页...')
        try:
            # Wait for the page-number input box to be present.
            page_box = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, pager + ' > div.form > input')))
            # Wait for the jump button to become clickable.
            jump_button = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR,
                 pager + ' > div.form > span.btn.J_Submit')))
            page_box.clear()  # remove the page number currently shown
            page_box.send_keys(str(page_number))  # type the new page number
            jump_button.click()
            # Wait until the active pager item shows the requested page.
            wait.until(EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR, pager + ' > ul > li.item.active > span'),
                str(page_number)))
            get_products()
            return
        except TimeoutException:
            if attempt == attempts - 1:
                raise


def get_products():
    """Parse every product on the current result page and store it.

    Waits until at least one product item is present, parses the page
    source with PyQuery, builds one dict per item and passes it to
    ``save_to_mongo`` together with its 1-based position on the page.

    Raises:
        TimeoutException: If no product item appears within the wait
            timeout (propagated from ``wait.until``).
    """
    item_selector = '#mainsrp-itemlist .items .item'
    # Wait for the product list to be loaded.
    wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, item_selector)))
    doc = pq(browser.page_source)  # parse the current page's DOM
    for count, item in enumerate(doc(item_selector).items(), start=1):
        product = {
            'image': item.find('.pic .img').attr('src'),
            'price': item.find('.price').text(),
            # Drops the last three characters of the deal-count text --
            # presumably a fixed three-character suffix; TODO confirm
            # against the live page markup.
            'deal': item.find('.deal-cnt').text()[:-3],
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text(),
        }
        save_to_mongo(product, count)

def save_to_mongo(result, count):
    """Insert one product document into the MONGO_TABLE collection.

    Storage is best-effort: a failure is reported on stdout together with
    the exception and is not re-raised, so one bad document does not abort
    the crawl.

    Args:
        result: The product dict to store.
        count: Position of the product on its page; only used in the
            success message.
    """
    try:
        # insert_one replaces Collection.insert, which was deprecated in
        # PyMongo 3 and removed in PyMongo 4.
        db[MONGO_TABLE].insert_one(result)
    except Exception as exc:
        print('存储失败', exc)
    else:
        print(f'存储{count}到了MONGODB成功')


def main():
    """Run the crawl: search, then walk result pages 2..N.

    N is the total page count reported by the pager, capped at MAXPAGE.
    Page 1 is already scraped by ``search()``. Any error is reported on
    stdout; the browser session is shut down in every case.
    """
    try:
        total_text = search()
        # Pull the first run of digits out of the pager text.
        total = int(re.search(r'(\d+)', total_text).group(1))
        last_page = min(total, MAXPAGE)
        for page in range(2, last_page + 1):
            next_page(page)
    except Exception as exc:
        print('出错啦!', exc)
    finally:
        # quit() ends the whole driver session (and its browser process);
        # close() would only close the current window.
        browser.quit()

# Run the crawl only when this file is executed as a script.
if __name__ == '__main__':
    main()