Title: Scraping with Beautiful Soup and Python to CSV
Posted: 2020-04-22 12:08:42
Question:

Trying to get the floor size (sq ft) and lot size (hectares) from listings on a real-estate website using Beautiful Soup and Selenium.

The floor size prints fine in the console,

but when writing to the csv file, the "sq ft" information under the floor-size column is not extracted.

It seems that if BS4 finds "sq ft" inside the specified ID element, that ID element is returned, but when writing to the csv the "sq ft" text is passed over on every other url. As you can see on the (image), both of these listings have this even though the two links also have hectares:

http://property.shw.co.uk/propertyInfo/11080/145151-London-Road-Croydon--CR0-2RG
http://property.shw.co.uk/propertyInfo/16162/Public-HouseRestaurant-Site-Westvale-Park-Horley-Surrey--RH6-0HJ

Can someone explain why the sq ft prints in the console but is not written to the csv? Any help would be much appreciated.

CP2_CPContent_conDetails1_divDetails is the relevant HTML for the floor-size and lot-size locators:

<div id="CP2_CPContent_conDetails1_divDetails">
                0.3 Acres <br>(0.12 Hectares)
                <div class="clear"></div>

                <div id="CP2_CPContent_conDetails1_divDes" class="divInfo">
                      Potential building size of 6,458 sq ft (600 sq m)<br>
                </div>
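Note how the markup above nests: find_next() walks forward through the document, so the "sq ft" text inside the inner divDes div still counts as coming "after" the outer div. A minimal, self-contained sketch of that behaviour, parsing the snippet above from a string rather than the live page:

```python
import re
from bs4 import BeautifulSoup

html = """
<div id="CP2_CPContent_conDetails1_divDetails">
    0.3 Acres <br>(0.12 Hectares)
    <div class="clear"></div>
    <div id="CP2_CPContent_conDetails1_divDes" class="divInfo">
        Potential building size of 6,458 sq ft (600 sq m)<br>
    </div>
</div>
"""

soup = BeautifulSoup(html, "html.parser")
div = soup.find("div", {"id": "CP2_CPContent_conDetails1_divDetails"})

# find_next() searches forward in document order, which includes descendants
lot_text = div.find_next(text=re.compile("Hectares")).strip()
floor_text = div.find_next(text=re.compile("sq ft")).strip()

print(lot_text)    # (0.12 Hectares)
print(floor_text)  # Potential building size of 6,458 sq ft (600 sq m)
```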

The code is below:

from selenium import webdriver
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import time
import re

driver = webdriver.Chrome()
shw_search_url = "http://property.shw.co.uk/searchproperties/Level2-0/Level1-0-181-236-167-165/Units/Development-or-House-and-Flat-or-Investment-or-Land-or-Office-or-Other/UnitIds-0/For-Sale"
driver.get(shw_search_url)


#identify and extract listing links from each page
def get_house_links(url, driver, pages=3):
    house_links = []
    driver.get(url)
    for i in range(pages):
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        listings = soup.find_all("a", class_="L")
        page_data = [row['href'] for row in listings]
        house_links.append(page_data)
        time.sleep(np.random.lognormal(0, 1))
        next_button = soup.select('img[src*="propNext"]')
        if next_button:
            next_button = next_button[0].find_parent('a')
            next_button_link = 'http://property.shw.co.uk' + next_button['href']
            driver.get(next_button_link)
    return house_links

#get html data from url and return as object
def get_html_data(url, driver):
    driver.get(url)
    time.sleep(np.random.lognormal(0,1))
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    return soup

def get_lot_size(soup):
    try:
        for element in soup.find_all('div', {'id':'CP2_CPContent_conDetails1_divDetails'}):
            lot_size = element.find_next(text=re.compile('Hectares'))
        lot_size = lot_size.replace("(", "").replace(")", "")
        print(lot_size)
        return lot_size
    except:
        return 'NA'

def get_floor_size(soup):
    try:
        for element in soup.find('div', {'id': 'CP2_CPContent_conDetails1_divDetails'}):
            floor_size = element.find_next(text=re.compile('sq ft'))
        print(floor_size)
        return floor_size
    except:
        return 'NA'

def flatten_list(house_links):
    house_links_flat = []
    for sublist in house_links:
        for item in sublist:
            house_links_flat.append(item)
    return house_links_flat

def get_house_data(driver, house_links_flat):
    house_data = []
    for link in house_links_flat:
        soup = get_html_data(link, driver)
        floor_size = get_floor_size(soup)
        lot_size = get_lot_size(soup)
        house_data.append([floor_size, lot_size])

    return house_data

house_links_3pages = get_house_links(shw_search_url,driver,pages=3)
house_links_flat = flatten_list(house_links_3pages)
house_data_3pages = get_house_data(driver,house_links_flat)


#open and write results to csv
file_name = "SHW %s_%s.csv" % (str(time.strftime("%Y-%m-%d")),
                           str(time.strftime("%H-%M-%S")))
columns = ["Floor_Size", "Lot_Size"]
pd.DataFrame(house_data_3pages, columns = columns).to_csv(
    file_name, index = False, encoding = "UTF-8"
)

Comments:

  • Use print() in different places to see the values in the variables - maybe you forgot to add them to the list/dataframe.
  • Or maybe you put it in a different column - i.e. with the address.
  • You should always use except Exception as ex: print('ex:', ex) - maybe you have an error causing all these problems and you don't even know it.
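The last point is worth illustrating: a bare except: swallows every error silently, so any mistake inside the try block just produces 'NA' with no explanation. A hypothetical minimal example (the function and data are made up, not from the question's code):

```python
def get_value(data):
    try:
        # any mistake in here (missing key, None value, typo) raises
        return data["floor_size"].strip()
    except Exception as ex:
        # with a bare "except:" this message would never appear
        print("ex:", ex)
        return "NA"

print(get_value({"floor_size": " 7342 sq ft "}))  # 7342 sq ft
print(get_value({}))                              # ex: 'floor_size', then NA
```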

Tags: python pandas selenium beautifulsoup export-to-csv


Answer 1:

There is no problem getting Hectares with your code.

The problem is with sq ft - it doesn't show up at all, because you used find() instead of find_all() in

 for element in soup.find()

find() doesn't return a list of matching elements but a single element, so the for loop doesn't take that element from a list - it iterates over the element's children instead, and then it searches for sq ft in the wrong places.
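The difference is easy to see in isolation (a toy snippet, not the site's markup):

```python
from bs4 import BeautifulSoup

soup = BeautifulSoup("<div id='outer'>0.3 Acres <b>6,458 sq ft</b></div>",
                     "html.parser")

# find() returns a single Tag; looping over a Tag yields its *children*
children = list(soup.find("div"))
print(children)  # ['0.3 Acres ', <b>6,458 sq ft</b>]

# find_all() returns a list of matching Tags, which is what the loop expects
matches = soup.find_all("div")
print(len(matches))  # 1
```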


from selenium import webdriver
import numpy as np
import time
import re
from bs4 import BeautifulSoup
import pandas as pd

driver = webdriver.Chrome()
shw_search_url = "http://property.shw.co.uk/searchproperties/Level2-0/Level1-0-181-236-167-165/Units/Development-or-House-and-Flat-or-Investment-or-Land-or-Office-or-Other/UnitIds-0/For-Sale"
driver.get(shw_search_url)


#identify and extract listing links from each page
def get_house_links(url, driver, pages=3):
    house_links = []
    driver.get(url)
    for i in range(pages):
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        listings = soup.find_all("a", class_="L")
        page_data = [row['href'] for row in listings]
        house_links.append(page_data)
        time.sleep(np.random.lognormal(0, 1))
        next_button = soup.select('img[src*="propNext"]')
        if next_button:
            next_button = next_button[0].find_parent('a')
            next_button_link = 'http://property.shw.co.uk' + next_button['href']
            driver.get(next_button_link)
    return house_links

#get html data from url and return as object
def get_html_data(url, driver):
    driver.get(url)
    time.sleep(np.random.lognormal(0,1))
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    return soup

def get_lot_size(soup):
    try:
        for element in soup.find_all('div', {'id':'CP2_CPContent_conDetails1_divDetails'}):
            lot_size = element.find_next(text=re.compile('Hectares'))
            if lot_size:
                lot_size = lot_size.replace("(", "").replace(")", "")
                lot_size = lot_size.strip()
            print('lot_size:', lot_size)
        return lot_size
    except Exception as ex:
        print("EX:", ex)
        return 'NA'

def get_floor_size(soup):
    try:
        for element in soup.find_all('div', {'id': 'CP2_CPContent_conDetails1_divDetails'}):
            floor_size = element.find_next(text=re.compile('sq ft'))
            if floor_size:
                floor_size = floor_size.strip()
            print('floor_size:', floor_size)
        return floor_size
    except Exception as ex:
        print("EX:", ex)
        return 'NA'

def flatten_list(house_links):
    house_links_flat = []
    for sublist in house_links:
        for item in sublist:
            house_links_flat.append(item)
    return house_links_flat

def get_house_data(driver, house_links_flat):
    house_data = []
    for link in house_links_flat:
        soup = get_html_data(link, driver)
        floor_size = get_floor_size(soup)
        lot_size = get_lot_size(soup)
        house_data.append([floor_size, lot_size])
        print('-------------------')

    return house_data

house_links_3pages = get_house_links(shw_search_url,driver,pages=3)
house_links_flat = flatten_list(house_links_3pages)
house_data_3pages = get_house_data(driver,house_links_flat)


#open and write results to csv
file_name = "SHW %s_%s.csv" % (str(time.strftime("%Y-%m-%d")),
                           str(time.strftime("%H-%M-%S")))
columns = ["Floor_Size", "Lot_Size"]
pd.DataFrame(house_data_3pages, columns = columns).to_csv(
    file_name, index = False, encoding = "UTF-8"
)

CSV:

Floor_Size,Lot_Size
,0.21 Hectares
7342 sq ft,
1665 sq ft,
"The existing property extends to approximately 2,290 sq m (24,649 sq ft) GIA and sits within an L-shaped site extending to approximately 0.6 acres (0.25 hectares). Fronting London Road is a four storey commercial building, built as a garage with offices above which is currently occupied by a motor company at ground floor level, and by a church across the upper floors and basement. To the rear of the site fronting Montague Road are a number of single storey industrial buildings, currently occupied by a hand carwash. The remainder of the front forecourt and rear of the site is hard standing, predominantly used as car parking.",0.25 Hectares
4672 to 20302 sq ft,
,0.36 Hectares
,0.08 Hectares
,0.18 Hectares
2325 sq ft,
,0.02 Hectares
5288 sq ft,
0 sq ft,
,0.36 Hectares
,0.18 Hectares
"*  Potential building size of 6,458 sq ft (600 sq m)",0.12 Hectares
1258 to 5385 sq ft,
,0.13 Hectares
3600 sq ft,
,0.24 Hectares
6781 to 6871 sq ft,
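As the CSV shows, some cells contain a whole paragraph rather than just the figure. If only the number is wanted, one possible follow-up cleanup is a regex pass over each cell (a sketch of my own, not part of the answer; the pattern and function name are assumptions):

```python
import re

def extract_sq_ft(cell):
    """Pull an 'N sq ft' (or 'N to M sq ft') figure out of a messy cell."""
    m = re.search(r"[\d,]+(?:\s+to\s+[\d,]+)?\s*sq ft", cell)
    return m.group(0) if m else "NA"

print(extract_sq_ft("Potential building size of 6,458 sq ft (600 sq m)"))
# 6,458 sq ft
print(extract_sq_ft("4672 to 20302 sq ft"))
# 4672 to 20302 sq ft
```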

Comments:

  • Could you show me a way to write the URLs from house_links to the csv? @furas
  • house_data.append([link, floor_size, lot_size]) and columns = ["Link", "Floor_Size", "Lot_Size"] - it should add the links to the CSV
  • Thank you so much @furas! Really appreciate your help