2bjiujiu

需求:

  获取西刺网代理ip信息,包括ip地址、端口号、ip类型

  西刺网:http://www.xicidaili.com/nn/

那,如何解决这个问题?

  分析页面结构和url设计得知:

    数据都在本页面可以全部获取,没有单独的详情页面

    翻页是通过修改当前页面URL末尾的页码后缀来实现的,因此只要按页码拼接URL,就可以依次访问每一页

那,软件的运行环境?

    python3.5

    scrapy

    twisted

    requests

    pymysql

  以上是第三方包,通过pip安装

  MySQL服务

其中db,user,password的值根据实际情况而定

#!/usr/bin/python3

# Module metadata: author handle and blog URL.
__author__ = 'beimenchuixue'
__blog__ = 'http://www.cnblogs.com/2bjiujiu/'

import requests
import pymysql
from time import sleep
from random import randint, choice
from scrapy.selector import Selector
from twisted.enterprise import adbapi
from twisted.internet import reactor

# Basic database connection settings; adjust these to your own environment.
# The dict is unpacked as keyword arguments when the connection pool is built.
db_settings = {
    'host': 'localhost',
    'db': 'db_name',
    'user': 'user_name',
    'password': 'password',
    'charset': 'utf8',
    'use_unicode': True
}
# conn = pymysql.connect(**db_settings)
# cursor = conn.cursor()

# Create the Twisted adbapi connection pool, using pymysql as the DB-API
# driver and the settings above as connection keyword arguments.
db_conn = adbapi.ConnectionPool('pymysql', **db_settings)


def go_sleep():
    """Block for a random amount of time to imitate a human visitor.

    Repeatedly draws ``randint(0, 1)``; each time the draw is 1 it sleeps
    for a delay picked at random from 0.1-0.6 seconds, and it returns as
    soon as the draw is 0 (possibly without sleeping at all).
    """
    while True:
        keep_waiting = randint(0, 1)
        if not keep_waiting:
            break
        pause = choice([0.1, 0.2, 0.3, 0.4, 0.5, 0.6])
        sleep(pause)


def get_sql(ip, port, ip_type):
    """Build the upsert statement and its parameters for one proxy entry.

    Returns a ``(sql, params)`` tuple where ``params`` is
    ``(ip, int(port), ip_type)``. Returns ``None`` when any of the three
    fields is falsy, or when ``int(port)`` fails (the error is printed).
    """
    # Guard clause: all three fields must be present.
    if not (ip and port and ip_type):
        return None

    try:
        port_number = int(port)
    except Exception as err:
        # Report the conversion problem and signal "no statement".
        print(err)
        return None

    statement = """insert into
              ip_server(ip, port, ip_type)
               value (%s, %s, %s)
              on DUPLICATE key update ip=values(ip), port=values(port), ip_type=values(ip_type)"""
    return statement, (ip, port_number, ip_type)


def go_insert(cursor, sql, params):
    """Execute one parameterized statement on ``cursor``, best-effort.

    Any exception raised by ``cursor.execute`` is printed and swallowed,
    so the function always returns ``None``.
    """
    run_statement = cursor.execute
    try:
        run_statement(sql, params)
    except Exception as err:
        # Report the failure instead of propagating it to the caller.
        print(err)
    return None


def get_ip(pages=50):
    """Scrape proxy entries from xicidaili.com and schedule their insertion.

    For each listing page from 1 to ``pages`` (inclusive) this fetches the
    page, extracts the IP address, port and proxy type from every matching
    table row, and hands an upsert for that row to the adbapi connection
    pool via ``db_conn.runInteraction``.

    Args:
        pages: Number of listing pages to fetch, starting at page 1.
            Defaults to 50.
    """
    # Request headers sent with every page fetch.
    headers = {
        'Referer': 'http://www.xicidaili.com/nn/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
    }
    # 1-based <td> positions of each field inside a row; named for
    # readability and hoisted because they never change between pages.
    ip_index, port_index, type_index = 2, 3, 6

    # range() excludes its upper bound, so add 1 to really fetch `pages` pages.
    for page in range(1, pages + 1):
        url = 'http://www.xicidaili.com/nn/{page}'.format(page=page)

        go_sleep()

        try:
            # A timeout keeps one stalled connection from hanging the crawl.
            response = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException as e:
            # A single failed page should not abort the remaining pages.
            print(e)
            continue
        # Print the HTTP status code of the fetched page.
        print(response.status_code)

        # Parse the page and select the candidate table rows.
        selectors = Selector(text=response.text)
        # NOTE(review): only rows carrying the "odd" class are selected;
        # confirm that rows without that class are intentionally skipped.
        all_trs = selectors.css('#ip_list .odd')
        for tr in all_trs:
            ip = tr.css('td:nth-child(%s)::text' % ip_index).extract_first()
            port = tr.css('td:nth-child(%s)::text' % port_index).extract_first()
            ip_type = tr.css('td:nth-child(%s)::text' % type_index).extract_first()

            # get_sql() returns None for incomplete/invalid rows, so the
            # result must be checked before it is unpacked.
            result = get_sql(ip, port, ip_type)
            if result is None:
                # Stop processing this page at the first unusable row.
                break
            sql, params = result

            try:
                # Schedule the insert on the connection pool.
                deferred = db_conn.runInteraction(go_insert, sql, params)
                # Failures that occur outside go_insert (e.g. connecting)
                # arrive through the Deferred, not as a raised exception.
                deferred.addErrback(
                    lambda failure: print(failure.getErrorMessage()))
            except Exception as e:
                print(e)

if __name__ == '__main__':
    get_ip()
    # Let Twisted carry out the queued SQL operations: run the reactor and
    # stop it after 4 seconds.
    # NOTE(review): the 4-second window is a fixed guess; inserts still
    # pending when it expires may not complete.
    reactor.callLater(4, reactor.stop)
    reactor.run()

  

分类:

技术点:

相关文章:

  • 2022-02-07
  • 2022-12-23
  • 2022-12-23
  • 2021-07-02
  • 2022-02-07
  • 2022-02-07
  • 2021-10-07
  • 2022-02-07
猜你喜欢
  • 2022-12-23
  • 2022-02-07
  • 2022-02-07
  • 2021-04-08
  • 2022-02-07
  • 2022-02-07
相关资源
相似解决方案