【发布时间】:2022-01-14 17:28:51
【问题描述】:
嘿,我正在尝试使用 python selenium 从网站中抓取数据。虽然我植入了 time.sleeps() 在 2 次输入抓取后仍然被阻止,但网站限制是每秒总共不超过 10 个请求。我还尝试在每一行之后插入 time.sleep,但我仍然被阻止。是什么原因,能帮帮我吗?!!!
相关代码如下：
def get_results(search_term):
    """Search the site for *search_term* and record the scraped CIK value.

    Writes the result into the module-level DataFrame ``df`` at row ``i``
    (both globals, as in the original script); writes the sentinel 'ND'
    when the search produces no result row.

    NOTE(review): relies on module-level globals ``url``, ``df`` and ``i``
    defined elsewhere in the script — confirm they exist before calling.
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-gpu')
    # Replace YOUR-PATH-TO-CHROMEDRIVER with your chromedriver location
    driver = webdriver.Chrome(executable_path=r'\\chromedriver_win32\\chromedriver.exe',
                              options=chrome_options)
    try:
        driver.get(url)
        # Throttle before interacting: the site allows at most 10 requests
        # per second, and each call issues several page interactions.
        time.sleep(1)
        try:
            # Type the search term into the search bar and submit.
            search_input = driver.find_elements_by_css_selector("#entity-full-form")[0]
            search_input.clear()
            search_input.send_keys(search_term)
            driver.find_elements_by_css_selector('#search')[0].click()
            driver.save_screenshot("screenshot-before-search.png")
            # Wait for the result column to appear instead of sleeping blindly.
            WebDriverWait(driver, 20).until(
                EC.visibility_of_element_located((By.CSS_SELECTOR, "#col-cik")))
            ticker_link = driver.find_elements_by_css_selector('#col-cik')[0]
            driver.save_screenshot("screenshot-after-search.png")
            # Bug fix: the original built the ActionChains but never called
            # .perform(), so the move was never executed.
            ActionChains(driver).move_to_element(ticker_link).perform()
            ticker_link.send_keys(Keys.SPACE)
            time.sleep(10)
            driver.save_screenshot("screenshot-after-checkbox-click.png")
            ticker_symbol = driver.find_elements_by_css_selector("td.cik")[0].text
            print(ticker_symbol)
            print(search_term)
            # .loc avoids pandas chained assignment (df[col][i] may write
            # to a temporary copy and silently lose the value).
            df.loc[i, 'cik_col_e'] = ticker_symbol
            df.loc[i, 'cik_col_e_names'] = search_term
            driver.save_screenshot("ticker1.png")
            print(i)
        except IndexError:
            # Selector matched nothing -> no result for this term; record 'ND'.
            print('ND')
            print(search_term)
            df.loc[i, 'cik_col_e'] = 'ND'
            df.loc[i, 'cik_col_e_names'] = search_term
            print(i)
            time.sleep(10)
    finally:
        # Always release the browser. The original leaked one headless
        # Chrome per call, which exhausts memory over hundreds of lookups.
        driver.quit()
# Walk the requested slice of names, checkpointing the DataFrame to Excel
# after every lookup so progress survives a crash or a block by the site.
dfee = dfE[1669:2362]
for names in dfee:
    get_results(names)
    i = i + 1
    df.to_excel("pythonrun_crsp_e.xlsx")
    # Safety stop once the row counter runs past the full source frame.
    if i >= len(dfE):
        break
【问题讨论】:
标签: python selenium web-scraping