【发布时间】:2021-12-27 15:53:45
【问题描述】:
我必须抓取 CCI 网站 (https://www.cci.fr/agent-immobilier?company_name=agences%20immobili%C3%A8res%20&brand_name=&siren=&numero_carte=&code_region=84&city=&code_postal=&person_name=&state_recherche=1&name_region=AUVERGNE-RHONE-ALPES&page=0) 并且我需要转到每个证明(链接与“证明 de ...”)抓取整个页面并返回主页(第一个链接) 并继续为网站的所有链接的所有页面执行此操作。所以我设法为一个链接做它并返回,但我正在努力为整个第一页然后其他页面做它,直到没有更多页面离开。此外,这可能需要几天甚至几周的时间,因为我必须为所有地区做这件事,我听说过多处理,但我正在努力实现它,无论如何,我可能需要一些帮助。谢谢 !这是我的代码:
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.select import Select
import time
with open('output_test_auvergne.csv', 'w') as file:
file.write("business_names; town_pc; region \n")
driver = webdriver.Chrome(ChromeDriverManager().install()) # initialise chrome driver
driver.get(
'https://www.cci.fr/agent-immobilier?company_name=agences%20immobili%C3%A8res%20&brand_name=&siren=&numero_carte=&code_region=84&city=&code_postal=&person_name=&state_recherche=1&name_region=AUVERGNE-RHONE-ALPES&__cf_chl_captcha_tk__=pmd_74hrnIdUsNgz2TJJCM33kpVYFY4hRG420hx18Sk1ITA-1634596843-0-gqNtZGzNBBCjcnBszQil')
driver.maximize_window()
time.sleep(3)
cookie = driver.find_element_by_xpath("//*[@id='tarteaucitronPersonalize2']")
try:
cookie.click()
finally:
pass
visited_pages = ['1']
with open('output_test_auvergne.csv', 'w') as file:
while True:
table_rows = driver.find_elements_by_css_selector('table tr')
business_name = None
for row in table_rows: #loop for identifying elements
try:
business_name = row.find_element_by_css_selector('.titre_entreprise').text
continue
except:
pass
try:
if row.get_attribute('class') == 'lien-fiche':
for index, td in enumerate(row.find_elements_by_css_selector('td')):
if index == 0:
attestation_name = td.text
if index == 1:
city = td.text
if index == 2:
region = td.text
except:
pass
link2 = row.find_element_by_xpath('//*[@id="main-content"]/div[5]/table/tbody/tr/td/a')
for links2 in link2:
link2.click()
num_attestation = driver.find_element_by_xpath('//*[@id="agent-immobilier__document"]/div[2]/div[1]/strong').text
date_delivrance = driver.find_element_by_xpath('//*[@id="agent-immobilier__document"]/div[2]/div[2]/div[1]/strong').text
delivre_par = driver.find_element_by_xpath('//*[@id="agent-immobilier__document"]/div[2]/div[2]/div[1]/div/strong').text
date_disponibilite = driver.find_element_by_xpath('//*[@id="agent-immobilier__document"]/div[2]/div[2]/div[2]/strong').text
president = driver.find_element_by_xpath('//*[@id="agent-immobilier__document"]/div[2]/div[2]/div[3]/strong').text
fonction = driver.find_element_by_xpath('//*[@id="agent-immobilier__document"]/div[2]/div[3]/div[3]/strong').text
etendue_pouvoir = driver.find_element_by_xpath('//*[@id="agent-immobilier__document"]/div[2]/div[3]/div[4]/strong').text
num_carte_pro = driver.find_element_by_xpath('//*[@id="agent-immobilier__document"]/div[2]/div[4]/div[1]/div[1]/strong[1]').text
dispo_carte_pro = driver.find_element_by_xpath('//*[@id="agent-immobilier__document"]/div[2]/div[4]/div[1]/div[1]/strong[2]').text
date_delivrance_carte_pro = driver.find_element_by_xpath('//*[@id="agent-immobilier__document"]/div[2]/div[4]/div[1]/div[2]/strong').text
organisme_delivrance = driver.find_element_by_xpath('//*[@id="agent-immobilier__document"]/div[2]/div[4]/div[1]/div[3]/strong').text
titulaire_carte = driver.find_element_by_xpath('//*[@id="agent-immobilier__document"]/div[2]/div[4]/div[2]/div[2]/strong[1]').text
forme_juridique = driver.find_element_by_xpath('//*[@id="agent-immobilier__document"]/div[2]/div[4]/div[2]/div[2]/strong[2]').text
adresse = driver.find_element_by_xpath('//*[@id="agent-immobilier__document"]/div[2]/div[4]/div[2]/div[4]/strong/div[1]').text
nom_commercial = driver.find_element_by_xpath('//*[@id="agent-immobilier__document"]/div[2]/div[4]/div[2]/div[5]/strong[1]').text
num_identification = driver.find_element_by_xpath('//*[@id="agent-immobilier__document"]/div[2]/div[4]/div[2]/div[5]/strong[2]').text
representant_legal_nom = driver.find_element_by_xpath('//*[@id="agent-immobilier__document"]/div[2]/div[5]/div/strong[1]').text
representant_legal_prnom = driver.find_element_by_xpath('//*[@id="agent-immobilier__document"]/div[2]/div[5]/div/strong[2]').text
garantie_fonciere = driver.find_element_by_xpath('//*[@id="agent-immobilier__document"]/div[2]/div[6]/div[2]/div[3]/strong').text
detention_fonds = driver.find_element_by_xpath('//*[@id="agent-immobilier__document"]/div[2]/div[6]/div[2]/div[2]').text
assurance_nom = driver.find_element_by_xpath('//*[@id="agent-immobilier__document"]/div[2]/div[7]/div[3]/strong').text
adresse_assurance = driver.find_element_by_xpath('//*[@id="agent-immobilier__document"]/div[2]/div[7]/div[4]/strong').text
cp_ville_assurance = driver.find_element_by_xpath('//*[@id="agent-immobilier__document"]/div[2]/div[7]/div[6]/strong').text
driver.back();
file.write(business_name + ";" + attestation_name + ';' + city + ";" + region + ";" + num_attestation + ";" + date_delivrance + ";" + delivre_par + ";" + date_disponibilite + ";" (president) + ";" + fonction + ";" + etendue_pouvoir + ";" + num_carte_pro + ";" + dispo_carte_pro + ";" + date_delivrance_carte_pro + ";" + organisme_delivrance + ";" + titulaire_carte + ";" + forme_juridique + ";" + adresse + ";" + nom_commercial + ";" + num_identification + ";" + representant_legal_nom + ";" + representant_legal_prnom + ";" + garantie_fonciere + ";" + detention_fonds + ";" + assurance_nom + ";" + adresse_assurance + ";" + cp_ville_assurance + "\n")
pass
number_of_pages = driver.find_element_by_css_selector("a[rel='next']").click()
current_page = driver.find_element_by_css_selector('a[title="Page courante"]').text
if current_page in visited_pages:
break
visited_pages =+ current_page
file.close()
driver.close()
【问题讨论】:
标签: python loops selenium-webdriver web-scraping pagination