此脚本从页面中抓取人员信息,并以 JSON 格式的字符串打印出来:
import re
import json
import requests
from bs4 import BeautifulSoup
# Page that the person records are scraped from.
url = 'https://ofsistorage.blob.core.windows.net/publishlive/ConList.html'

# Placeholder stored for any field that cannot be found in an entry.
_MISSING = '-'

# Whole-word match only: without the \b anchors, a case-insensitive search for
# "male" also fires inside ordinary words and names (e.g. "MALEK").
_GENDER_RE = re.compile(r'\b((?:fe)?male)\b', flags=re.I)


def _label_text(entry, label):
    """Return the text node that follows the ``<b>`` tag containing *label*.

    The text is stripped and has non-breaking spaces removed. ``'-'`` is
    returned when *entry* has no such ``<b>`` tag or no text node follows it.
    """
    # ``:-soup-contains`` is the current spelling of the deprecated ``:contains``.
    tag = entry.select_one('b:-soup-contains("{}")'.format(label))
    if tag is None:
        return _MISSING
    node = tag.find_next_sibling(string=True)
    if node is None:
        return _MISSING
    return node.strip().replace('\xa0', '')


def _split_name(entry):
    """Return ``(first, last)`` built from the first six ``<b>`` tags of *entry*.

    The text following each tag is one name part. With two or more usable
    parts the first part is the last name and the final part is the first
    name; with exactly one part it is the first name and the last name is
    ``'-'``; with none, both are ``'-'``.
    """
    parts = []
    for tag in entry.select('b')[:6]:
        node = tag.find_next_sibling(string=True)
        if node is None:
            continue
        value = node.strip()
        # Parts containing '/' are skipped (presumably "n/a" placeholders).
        if '/' not in value:
            parts.append(value)
    if not parts:
        return _MISSING, _MISSING
    if len(parts) == 1:
        return parts[0], _MISSING
    return parts[-1], parts[0]


# Fail loudly on a hung connection or an HTTP error instead of parsing an
# error page into an empty result.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

persons = []
# Only <li> elements that carry a "Name 6:" label are treated as person entries.
for li in soup.select('li:has(b:-soup-contains("Name 6:"))'):
    first, last = _split_name(li)
    # Gender has no label lookup here; it is searched for in the entry's full text.
    gender_match = _GENDER_RE.search(li.get_text(strip=True, separator=' '))
    persons.append({
        'firstname': first,
        'lastname': last,
        'about': {
            'date_of_birth': _label_text(li, 'DOB:'),
            'place_of_birth': _label_text(li, 'POB:'),
            'nationality': _label_text(li, 'Nationality:'),
            'gender': gender_match.group(1) if gender_match else _MISSING
        },
        'other': _label_text(li, 'Other Information:')
    })

print(json.dumps(persons, indent=4))
输出结果:
[
{
"firstname": "ABDUL AZIZ",
"lastname": "ABBASIN",
"about": {
"date_of_birth": "--/--/1969.",
"place_of_birth": "Sheykhan village, Pirkowti Area, Orgun District, Paktika Province, Afghanistan",
"nationality": "-",
"gender": "-"
},
"other": "UN Ref TAi.155. Key commander in the Haqqani Network (TAe.012) under Sirajuddin Jallaloudine Haqqani (TAi.144). Taliban Shadow Governor of Orgun District, Paktika Province, as of early 2010. Operated a training camp for non-Afghan fighters in Paktika Province. Has been involved in the transport of weapons to Afghanistan."
},
{
"firstname": "AZIZIRAHMAN",
"lastname": "ABDUL AHAD",
"about": {
"date_of_birth": "--/--/1972.",
"place_of_birth": "Shega District, Kandahar Province, Afghanistan",
"nationality": "Afghan",
"gender": "-"
},
"other": "UN Ref TAi.121. Belongs to Hotak tribe."
},
{
"firstname": "BARADAR",
"lastname": "ABDUL AHMAD TURK",
"about": {
"date_of_birth": "--/--/1968.",
"place_of_birth": "Yatimak village, Dehrawood District, Uruzgan Province, Afghanistan",
"nationality": "Afghan",
"gender": "-"
},
"other": "UN Ref TAi.024. Arrested in Feb 2010 and in custody in Pakistan. Extradition request to Afghanistan pending in Lahore High Court, Pakistan as of June 2011. Belongs to Popalzai tribe. Senior Taliban military commander and member of Taliban Quetta Council as of May 2007. DOB is approximate."
},
... and so on.