【发布时间】:2021-11-02 10:32:41
【问题描述】:
我的代码应该提供以下格式的输出。
我尝试修改代码但我破坏了它。
import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver
import threading
from multiprocessing.pool import ThreadPool
class Driver:
def __init__(self):
options = webdriver.ChromeOptions()
options.add_argument("--headless")
# Un-comment next line to supress logging:
# options.add_experimental_option('excludeSwitches', ['enable-logging'])
self.driver = webdriver.Chrome(options=options)
def __del__(self):
self.driver.quit() # clean up driver when we are cleaned up
# print('The driver has been "quitted".')
threadLocal = threading.local()
def create_driver():
the_driver = getattr(threadLocal, 'the_driver', None)
if the_driver is None:
the_driver = Driver()
setattr(threadLocal, 'the_driver', the_driver)
return the_driver.driver
class GameData:
def __init__(self):
self.date = []
self.time = []
self.game = []
self.score = []
self.home_odds = []
self.draw_odds = []
self.away_odds = []
self.country = []
self.league = []
def parse_data(url):
try:
browser = create_driver()
browser.get(url)
df = pd.read_html(browser.page_source)[0]
except KeyError:
print('KeyError')
return None
html = browser.page_source
soup = bs(html, "lxml")
cont = soup.find('div', {'id': 'wrap'})
content = cont.find('div', {'id': 'col-content'})
content = content.find('table', {'class': 'table-main'}, {'id': 'tournamentTable'})
main = content.find('th', {'class': 'first2 tl'})
if main is None:
return None
count = main.findAll('a')
country = count[1].text
league = count[2].text
game_data = GameData()
game_date = None
for row in df.itertuples():
if not isinstance(row[1], str):
continue
elif ':' not in row[1]:
game_date = row[1].split('-')[0]
continue
game_data.date.append(game_date)
game_data.time.append(row[1])
game_data.game.append(row[2])
game_data.score.append(row[3])
game_data.home_odds.append(row[4])
game_data.draw_odds.append(row[5])
game_data.away_odds.append(row[6])
game_data.country.append(country)
game_data.league.append(league)
return game_data
# URLs go here
urls = {
"https://www.oddsportal.com/matches/soccer/20210903/",
}
if __name__ == '__main__':
results = None
# To limit the number of browsers we will use
# (set to a large number if you don't want a limit):
MAX_BROWSERS = 5
pool = ThreadPool(min(MAX_BROWSERS, len(urls)))
for game_data in pool.imap(parse_data, urls):
if game_data is None:
continue
result = pd.DataFrame(game_data.__dict__)
if results is None:
results = result
else:
results = results.append(result, ignore_index=True)
print(results)
# ensure all the drivers are "quitted":
del threadLocal
import gc
gc.collect() # a little extra insurance
print(results.head())
我收到了这个错误:
Traceback (most recent call last):
File "C:\Users\harsh\AppData\Roaming\JetBrains\PyCharmCE2021.2\scratches\scratch_13.py", line 107, in <module>
for game_data in pool.imap(parse_data, urls):
File "C:\Program Files\Python39\lib\multiprocessing\pool.py", line 870, in next
raise value
File "C:\Program Files\Python39\lib\multiprocessing\pool.py", line 125, in worker
result = (True, func(*args, **kwds))
File "C:\Users\harsh\AppData\Roaming\JetBrains\PyCharmCE2021.2\scratches\scratch_13.py", line 72, in parse_data
league = count[2].text
IndexError: list index out of range
结果通常采用以下格式:
date time game score home_odds draw_odds away_odds country league
0 None 15:30 Wolves - Manchester Utd 0:1 393/100 69/25 39/50 England Premier League
1 None 13:00 Burnley - Leeds 1:1 231/100 64/25 123/100 England Premier League
2 None 13:00 Tottenham - Watford 1:0 23/50 87/25 709/100 England Premier League
3 28 Aug 2021 16:30 Liverpool - Chelsea 1:1 29/20 59/25 207/100 England Premier League
4 28 Aug 2021 14:00 Aston Villa - Brentford 1:1 109/100 58/25 74/25 England Premier League
5 28 Aug 2021 14:00 Brighton - Everton 0:2 33/25 113/50 239/100 England Premier League
6 28 Aug 2
021 14:00 Newcastle - Southampton 2:2 73/50 257/100 189/100 England Premier League
如何获取数据?
详细:
我有一个代码循环运行 url 以进行下一场比赛,我想修改它。用于oddsportal 的匹配“下一个匹配”的 Xpath 是://*[@id="col-content"]/div[3]/div/div/span 这个图片。
请帮忙
【问题讨论】:
-
@booboo 我正在尝试将behaviour of this code 模拟为您非常有用的解决方案。
-
每次您认为我们的回答不够快时,您都无法创建一个新问题
-
同意,但是这是另一个解决方案/问题
-
如果您的代码中有四个不同的索引错误来抓取同一个网站,那么您显然需要在此处询问之前对该主题进行更多的个人研究
标签: python web-scraping beautifulsoup index-error