【Question Title】: Writing multiple files as output when webscraping - python bs4
【Posted】: 2019-07-06 15:35:22
【Question Description】:

Preface - I am very new to Python and my HTML skills are at kindergarten level.

So, I am trying to save the quotes from this website, which contains many links, one for each US election candidate.

I have managed to get the actual code that extracts the quotes working (with the help of some Stack Overflow users), but I have no idea how to write those quotes into a separate text file for each candidate.

For example, the first page, containing all of Justin Amash's quotes, should be written to a file: JustinAmash.txt. The second page, with all of Michael Bennet's quotes, should go to MichaelBennet.txt (or something of that form), and so on. Is there a way to do this?
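To make that concrete, the end result I have in mind is something like the following (a rough sketch only; the quotes_by_candidate dict and everything in it are made up for illustration):

quotes_by_candidate = {
    "JustinAmash": ["first quote...", "second quote..."],
    "MichaelBennet": ["another quote..."],
}

for name, quotes in quotes_by_candidate.items():
    # one output file per candidate, named after the candidate
    with open(name + ".txt", "w", encoding="utf-8") as f:
        f.write("\n".join(quotes))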

For reference, the following code works for scraping the pages:

from urllib.request import Request, urlopen as uReq, HTTPError
#HTTPError is imported so that links with a missing resource can be skipped
from bs4 import BeautifulSoup as soup_
import re
#define url of interest
my_url = 'http://archive.ontheissues.org/Free_Trade.htm'


def make_soup(url):
    # set a known browser User-Agent header on the request so the server does not reject it with an HTTPError
    req=Request(url,headers={'User-Agent': 'Mozilla/5.0'})

    #opening up connection, grabbing the page
    uClient = uReq(req)
    page_html = uClient.read()
    uClient.close()

    #parse the raw HTML into a BeautifulSoup object for structured access
    soup = soup_(page_html, "lxml") 
    return soup

# Test: print title of page
#soup.title

soup = make_soup(my_url)
tags = soup.find_all("a", href=re.compile(r"javascript:pop\("))
#print(tags)

# open a text file for writing (created if it doesn't already exist)
file1 = open("Quotefile.txt","w")

# loop over all extracted URLs
for links in tags:
    link = links.get('href')
    if "java" in link: 
        print("http://archive.ontheissues.org" + link[18:len(link)-3])
        main_url = "http://archive.ontheissues.org" + link[18:len(link)-3] 
        try:
            sub_soup = make_soup(main_url)
            content_collexn = sub_soup.body.contents #Splitting up the page into contents for iterative access 
            #text_data = [] #This list can be used to store data related to every person
            for item in content_collexn:
                #Accept an item if it belongs to the following classes
                if isinstance(item, str): #NavigableString subclasses str, so bare text nodes match here
                    print(item)
                elif(item.name == "h3"):
                    #Note that over here, every h3 tagged title has a string following it
                    print(item.get_text())   
                    #Hence, grab that string too
                    print(item.next_sibling) 
                elif(item.name in ["p", "ul", "ol"]):
                    print(item.get_text())

        except HTTPError: #Takes care of missing pages and related HTTP exception
            print("[INFO] Resource not found. Skipping to next link.")

        #print(text_data)
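A side note on the link[18:len(link)-3] slicing above: it peels the javascript:pop('...') wrapper off the href by fixed character positions, which silently breaks if the wrapper ever changes length. A regular expression is a less brittle way to pull out the quoted path (a sketch, assuming the path is the first quoted argument of the pop(...) call; the example href below is made up):

import re

link = "javascript:pop('/2020/Justin_Amash_Free_Trade.htm')"  # example href (made-up path)
match = re.search(r"pop\w*\(['\"]([^'\"]+)['\"]", link)
if match:
    main_url = "http://archive.ontheissues.org" + match.group(1)
    print(main_url)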

【Question Comments】:

    Tags: python-3.x for-loop web-scraping beautifulsoup


    【Solution 1】:

    You can store that text data in the text_data list. Join all of those items and then write the result to a file:

    For example:

    from urllib.request import Request, urlopen as uReq, HTTPError
    #HTTPError is imported so that links with a missing resource can be skipped
    from bs4 import BeautifulSoup as soup_
    import re
    #define url of interest
    my_url = 'http://archive.ontheissues.org/Free_Trade.htm'
    
    
    def make_soup(url):
        # set a known browser User-Agent header on the request so the server does not reject it with an HTTPError
        req=Request(url,headers={'User-Agent': 'Mozilla/5.0'})
    
        #opening up connection, grabbing the page
        uClient = uReq(req)
        page_html = uClient.read()
        uClient.close()
    
        #parse the raw HTML into a BeautifulSoup object for structured access
        soup = soup_(page_html, "lxml") 
        return soup
    
    # Test: print title of page
    #soup.title
    
    soup = make_soup(my_url)
    tags = soup.find_all("a", href=re.compile(r"javascript:pop\("))
    #print(tags)
    
    # open a text file for writing (created if it doesn't already exist)
    #file1 = open("Quotefile.txt","w")
    
    # loop over all extracted URLs
    candidates = []
    for links in tags:
    
        link = links.get('href')
        if "java" in link: 
            #print("http://archive.ontheissues.org" + link[18:len(link)-3])
            main_url = "http://archive.ontheissues.org" + link[18:len(link)-3]
            #derive the candidate's name from the link, e.g. .../Justin_Amash_Free_Trade.htm -> Justin_Amash
            candidate = link.split('/')[-1].split('_Free_Trade')[0]
    
            if candidate in candidates:
                continue
            else:
                candidates.append(candidate)
    
            try:
                sub_soup = make_soup(main_url)
                content_collexn = sub_soup.body.contents #Splitting up the page into contents for iterative access 
                text_data = [] #This list can be used to store data related to every person
                for item in content_collexn:
                    #Accept an item if it belongs to the following classes
                    if isinstance(item, str): #NavigableString subclasses str, so bare text nodes match here
                        #print(item)
                        text_data.append(str(item))
                    elif(item.name == "h3"):
                        #Note that over here, every h3 tagged title has a string following it
                        #print(item.get_text()) 
                        text_data.append(item.get_text())
                        #Hence, grab that string too; coerce it to str so the later join cannot fail on a Tag
                        #print(item.next_sibling)
                        text_data.append(str(item.next_sibling))
                    elif(item.name in ["p", "ul", "ol"]):
                        #print(item.get_text())
                        text_data.append(item.get_text())
    
            except HTTPError: #Takes care of missing pages and related HTTP exception
                print("[INFO] Resource not found. Skipping to next link.")
                candidates.remove(candidate)
                continue
    
            text_data = '\n'.join(text_data)
            with open("C:/%s.txt" %(candidate), "w") as text_file: #adjust this output path for your system
                text_file.write(text_data)
            print('Acquired: %s' %(candidate))
    
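    A caveat on the write step: "C:/%s.txt" writes to the drive root, which often fails without elevated permissions, and a candidate string scraped from a URL can contain characters that are not legal in a filename. A hedged alternative sketch that writes into a local output folder instead (the folder name and the example values are my own choices):

    from pathlib import Path
    import re

    candidate = "Justin_Amash"        # example value; comes from the loop above
    text_data = "example quote text"  # example value; comes from the loop above

    out_dir = Path("output")
    out_dir.mkdir(exist_ok=True)  # create the folder on first run

    # keep only filename-safe characters from the scraped candidate string
    safe_name = re.sub(r"[^\w\-]", "_", candidate)
    (out_dir / (safe_name + ".txt")).write_text(text_data, encoding="utf-8")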

    【Discussion】:
