【发布时间】:2021-12-30 03:08:38
【问题描述】:
我正在使用 BeautifulSoup 从单个议员(MP)页面中提取信息,例如 https://publications.parliament.uk/pa/cm/cmregmem/211115/cox_geoffrey.htm
我想提取每个带编号的粗体标题(例如“1. 就业和收入”)下的文本,并分别保存。不同议员页面上的标题各不相同(例如,有些页面有“3. 来自英国的礼物、福利和款待”这一项,有些则没有)——我想要一个适用于任何议员页面的脚本。
目前,我试图用循环来实现这一点,结果陷入了一团糟。我对 BeautifulSoup(和 Python)都很陌生,所以我觉得自己可能漏掉了某个技巧。有人有什么想法吗?
import requests
from bs4 import BeautifulSoup
# Base URL of the register edition being scraped.
home_url = "https://publications.parliament.uk/pa/cm/cmregmem/211101/"

# Fetch the contents page and collect (name, relative link) tuples in mp_list.
# A timeout keeps the script from hanging forever on a stalled connection, and
# raise_for_status() stops us from silently parsing an HTTP error page.
home_page = requests.get(home_url + 'contents.htm', timeout=30)
home_page.raise_for_status()
home_soup = BeautifulSoup(home_page.content, "html.parser")
mp_list = []
mp_elements = home_soup.find_all("p", attrs={'class': None, 'xmlns': 'http://www.w3.org/1999/xhtml'})
for mp_element in mp_elements:
    try:
        # The second child of the paragraph is expected to be the link to the
        # MP's page; look it up once and reuse it.
        mp_link = list(mp_element.children)[1]
        mp_name = mp_link.text.strip()
        mp_url = mp_link['href']
    except (IndexError, KeyError, TypeError, AttributeError):
        # Paragraphs that do not have the expected shape (fewer than two
        # children, second child is plain text, or no href attribute) are
        # skipped; any other error is a real problem and should surface.
        continue
    mp_list.append((mp_name, mp_url))

# Extract the text of every <p> element from a single MP page.
mp_url = home_url + mp_list[115][1]  # arbitrary example MP page to test with
print(mp_url)
mp_page = requests.get(mp_url, timeout=30)
mp_page.raise_for_status()
mp_soup = BeautifulSoup(mp_page.content, "html.parser")
mp_text_all = mp_soup.find_all("p")
mp_text_list = [item.text for item in mp_text_all]
编辑:我最终想出了这个。见下文。
def compile_indv_mp_page_dict(text):
    """Group the lines of an MP page under the headings they follow.

    Args:
        text: Indexable sequence of line strings for one MP page (presumably
            the list of paragraph texts scraped from the page; confirm against
            the caller). Must be non-empty, because ``text[0]`` is read.

    Returns:
        A dict mapping each full heading string found in ``text`` to the
        lines that follow it (up to the next heading found, or to the end of
        ``text`` for the last one), joined with newlines. Heading lines
        themselves are left out of the joined text.

    Relies on the module-level ``headings_dict`` (heading key -> full heading
    text) and on ``get_constituency``, both defined elsewhere in the file.
    """
    # NOTE(review): the constituency is computed but never used or returned
    # here. The call is kept because get_constituency is not visible from this
    # block and may validate the first line; confirm and either store the
    # result or drop the call.
    mp_constituency = get_constituency(text[0])

    # Reverse index built once: full heading text -> heading keys using it.
    # A list of keys is kept so that several keys sharing one heading text
    # are all recorded, as a scan over every key would do.
    keys_by_heading = {}
    for key, heading in headings_dict.items():
        keys_by_heading.setdefault(heading, []).append(key)

    # Position of the first occurrence of every heading present in text.
    # Insertion order follows the order in which headings appear on the page.
    section_starts = {}
    for position, line in enumerate(text):
        for key in keys_by_heading.get(line, ()):
            section_starts.setdefault(key, position)

    # Each section runs from its heading to the next heading found, or to the
    # end of the page for the last one.
    ordered_starts = list(section_starts.items())
    page_end = len(text)
    mp_page_dict = {}
    for rank, (key, start) in enumerate(ordered_starts):
        if rank + 1 < len(ordered_starts):
            end = ordered_starts[rank + 1][1]
        else:
            end = page_end
        body_lines = [
            line for line in text[start:end] if line not in keys_by_heading
        ]
        mp_page_dict[headings_dict[key]] = "\n".join(body_lines)
    return mp_page_dict
【问题讨论】:
标签: python python-3.x web-scraping beautifulsoup scrapy