【发布时间】:2021-04-01 10:59:32
【问题描述】:
我正在尝试为新闻头条构建一个边缘档案刮板,我的主要目标是从给定的月份和年份中刮取数据。该代码几天前可以正常工作,并且可以正常滚动,但现在无法滚动并且每次都卡住了。我试图滚动 CTRL+END 的动作链,但它不起作用。我也尝试了其他方法,但没有运气
def scrolling_func(wait, driver):
    """Scroll to the bottom of an infinite-scroll archive page.

    Repeatedly presses Ctrl+End, clicks the "load more" button
    (CSS class ``.p-button``) and waits, until the document height
    stops growing. Finally returns to the top with Ctrl+Home.

    Args:
        wait: a ``WebDriverWait`` bound to ``driver``, used to wait for
            the load button.
        driver: the active Selenium WebDriver.
    """
    print("It is trying to scroll")
    SCROLL_PAUSE_TIME = 5
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        # BUG FIX: send_keys('END') types the literal characters E, N, D.
        # Keys.END is the actual End key, which is what Ctrl+End needs —
        # this is why the scroll stopped working.
        ActionChains(driver).key_down(Keys.CONTROL).send_keys(Keys.END).key_up(Keys.CONTROL).perform()
        # Wait until the button is actually clickable, then click the
        # element the wait returned (no separate find_element needed).
        load_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.p-button')))
        load_button.click()
        time.sleep(SCROLL_PAUSE_TIME)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # Height unchanged after a click+pause: nothing more to load.
            break
        last_height = new_height
        time.sleep(1)
    # BUG FIX: Keys.HOME, not the literal string 'HOME' (same issue as above).
    ActionChains(driver).key_down(Keys.CONTROL).send_keys(Keys.HOME).key_up(Keys.CONTROL).perform()
而刮刀是
def scraper(years, months):
    """Scrape archive headlines, dates and article links.

    For every archive URL produced by ``parse_dates(years, months)``,
    load the page, scroll until all articles are visible, parse the
    headlines/dates/links, and collect them across all URLs.

    Args:
        years: list of year strings, e.g. ``["2021"]``.
        months: list of month strings, e.g. ``["3"]``.

    Returns:
        A pandas DataFrame with columns ``Headlines``, ``Dates`` and
        ``Links``; the same data is also written to ``file1.csv``.

    Raises:
        ValueError: if the three collected lists end up with different
            lengths (indicates a parsing problem).
    """
    PATH = r"C:\Users\astar\Stock market tutorials\chromedriver_win64\chromedriver.exe"
    options = webdriver.ChromeOptions()
    options.use_chromium = True
    options.add_argument('--ignore-certificate-errors')
    options.add_argument('--ignore-ssl-errors')
    driver = webdriver.Chrome(executable_path=PATH, options=options)
    try:
        driver.maximize_window()
        urls = parse_dates(years, months)
        final_headlines = []
        final_dates = []
        final_links = []
        for url in urls:
            driver.get(url)
            done = True
            while done:
                try:
                    wait = WebDriverWait(driver, 10)
                    scrolling_func(wait, driver)
                # BUG FIX: a bare `except:` also swallows KeyboardInterrupt
                # and SystemExit; catch Exception so Ctrl+C still works.
                except Exception:
                    done = False
            # BUG FIX: Keys.HOME (the special key), not the string 'HOME'.
            ActionChains(driver).key_down(Keys.CONTROL).send_keys(Keys.HOME).key_up(Keys.CONTROL).perform()
            soup = BeautifulSoup(driver.page_source, 'lxml')
            # Title looks like "Archives for <month> (<count>)"; pull out
            # the month label and the advertised article count.
            num_articles = soup.find("h1", class_="p-page-title").text
            current = num_articles[num_articles.find("for") + 4:num_articles.find("(")]
            articles_num = num_articles[num_articles.find("(") + 1:-1]
            titles = soup.find_all("h2", class_="c-entry-box--compact__title")
            dates = soup.find_all("time", class_="c-byline__item")
            # BUG FIX: articles_num is a string, so `articles_num !=
            # len(titles)` was always True; compare like with like.
            if articles_num != str(len(titles)):
                logger.warning("Actual #articles {} and #scraped articles {} for {}".format(articles_num, len(titles), current))
            print(len(titles), len(dates))
            # list(...) over map replaces the redundant list_process helper.
            headlines = [title_extractor(t) for t in titles]
            dates = [date_extractor(d) for d in dates]
            links = [link_extractor(t) for t in titles]
            final_headlines.extend(headlines)
            final_dates.extend(dates)
            final_links.extend(links)
            time.sleep(15)
        print(len(final_headlines), len(final_dates), len(final_links))
        # BUG FIX: was an `assert` (stripped under -O) whose message used
        # loop-local lists from the *last* URL only; validate explicitly
        # against the accumulated totals.
        if not (len(final_headlines) == len(final_dates) == len(final_links)):
            raise ValueError(
                f'Different lengths of headlines {len(final_headlines)}, '
                f'dates {len(final_dates)} and links {len(final_links)}'
            )
        data = {"Headlines": final_headlines, "Dates": final_dates, "Links": final_links}
        df = pd.DataFrame(data)
        df.to_csv('file1.csv')
        return df
    finally:
        # BUG FIX: the driver was never shut down, leaking a Chrome
        # process on every run (and on any exception).
        driver.quit()
if __name__ == "__main__": scraper(["2021"], ["3"])
正如我所说,它无法滚动,几天前它运行良好,但现在它坏了。同样早些时候,我遇到了无法加载页面的整个列表的问题,因为它被卡住了。有人能帮我一下吗?提前致谢。
【问题讨论】:
标签: python-3.x selenium selenium-webdriver web-scraping selenium-chromedriver