dadazunzhe

爬虫之 网易新闻(体育)

https://sports.163.com/{i}, i in ['nba', 'cba', 'china']

import os
import re
import requests

# Crawler for NetEase News (sports channels).
# Fetches the nba/cba/china channel index pages, collects article links,
# extracts each article's title and body text, and writes one .txt per article
# into the local folder '网易新闻'.

if not os.path.exists('网易新闻'):  # create the output folder once
    os.mkdir('网易新闻')

# Regexes are loop-invariant — compile them once up front.
LINK_RE = re.compile(r'href="(https://sports.163.com/.*?)"')
TITLE_RE = re.compile(r'<h1>(.*?)</h1>', re.S)
BODY_RE = re.compile(
    r'<div class="post_text" id="endText" style="border-top:1px solid #ddd;">'
    r'(.*?责任编辑:.*?)</span>',
    re.S)
TAG_RE = re.compile(r'<.*?>')  # strips HTML tags from the extracted body
# Characters not wanted in file names (punctuation and whitespace).
DIRTY_RE = re.compile(r'[!"#$%&()*+,-./:;<=>?@[\]^_‘{|}~,…]|\s')

count = 0
for channel in ['nba', 'cba', 'china']:
    # Fetch the channel index page and collect the unique article URLs.
    response = requests.get(f'https://sports.163.com/{channel}/')
    article_urls = set(LINK_RE.findall(response.text))

    # Process each article page.
    for url in article_urls:
        page = requests.get(url).text

        try:
            title = TITLE_RE.findall(page)[0]
            body = BODY_RE.findall(page)[0]
        except IndexError:
            # Not a regular article page (no title/body match) — skip it.
            continue
        body = TAG_RE.sub('', body)

        # Remove characters that are unsafe in file names from the title.
        title = DIRTY_RE.sub('', title)
        title_path = os.path.join('网易新闻', f'{title}.txt')  # article output path
        with open(title_path, 'w', encoding='utf8') as f:  # context manager closes/flushes
            f.write(body)
        count += 1

        print(f'完成{count}篇, {title} done...')

分类:

技术点:

相关文章: