【问题标题】:Best way to store the data and re-access it in Python存储数据并在 Python 中重新访问数据的最佳方式
【发布时间】:2021-08-30 09:01:42
【问题描述】:

我正在构建一个可以连续运行拍卖网站的刮板。刮板先刮汽车的链接,然后去每个链接检查汽车是否出售。如果汽车已售出,刮刀会将数据刮到 CSV 文件中。如果未售出,则继续下一个链接并完成该过程。

该过程一旦完成,就会从头开始。刮掉汽车链接,然后将链接附加到列表中,然后从该链接中刮掉每辆车。 现在,这个过程的缺点是,如果脚本因任何原因停止,存储在列表中的数据也会丢失。

那么，存储数据的最佳方式是什么，才能在脚本因任何原因中断时不丢失数据，并且在再次运行脚本时还能重新读取这些数据？我曾尝试把链接写入文本文件，但写入之后再去读取该文件时，里面没有显示任何已存储的链接。

下面是我的代码。

# --- User-tunable settings --------------------------------------------------
print('***Please enter the years range***')
year_from = 2000  # lower bound of the vehicle-year filter; you can change this value.
year_to = 2021  # upper bound; was "202" (typo), which made the range 2000..202 match nothing.
pause = 8  # will run again after 24 hours.
# NOTE(review): `pause` is never referenced by the loop below — presumably a
# sleep between scrape cycles was intended; confirm and wire it in if desired.

import requests
from scrapy.selector import Selector
import csv
import re
from time import sleep
import datetime
from random import randint


# Static browser-like request headers used for every request to pickles.com.au.
# NOTE(review): 'sec-ch-ua' and 'if-modified-since' are hard-coded snapshots of
# one Chrome session — the dated 'if-modified-since' value in particular may
# cause 304 (Not Modified) responses with empty bodies; verify it is wanted.
headers = {
    'authority': 'www.pickles.com.au',
    'cache-control': 'max-age=0',
    'sec-ch-ua': '^\\^Chromium^\\^;v=^\\^92^\\^, ^\\^',
    'sec-ch-ua-mobile': '?0',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'sec-fetch-site': 'none',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-user': '?1',
    'sec-fetch-dest': 'document',
    'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    'if-modified-since': 'Sun, 29 Aug 2021 20:36:16 GMT',
}

# Links already written to the CSV as sold. Kept OUTSIDE the main loop so the
# de-duplication survives across scrape cycles. (In the original draft this
# list was re-created inside the per-link loop, so the membership check below
# could never filter anything.)
sold_auctions_list = []

while True:
    # Phase 1: collect every car-item link from the currently listed auctions.
    pickles_links_list = []
    live_auctions_api = 'https://www.pickles.com.au/PWR-Web/services/api/sales/future'
    api_request = requests.get(url=live_auctions_api, headers=headers)
    for auctions in api_request.json():
        auction_link = auctions.get('viewSaleListingLink')
        if 'cars/item/search/-/listing/listSaleItems/' in auction_link:
            auction_request = requests.get(url=auction_link, headers=headers)
            response = Selector(text=auction_request.text)

            # The numeric sale id is embedded in an inline <script>; page
            # variants use one of two different variable names.
            sales_id_re = response.xpath('//script[contains(text(), "Product_Type_Sequence")]/text() | //script[contains(text(), "lot_number_suffix_sequence")]/text()').get()
            if sales_id_re is None:
                # Neither script variant is present on this page;
                # re.findall(pattern, None) would raise TypeError, so skip it.
                continue
            sales_id = re.findall(r'"Product_Type_Sequence";var n="(.*?)"', sales_id_re) or re.findall(r'"lot_number_suffix_sequence";var n="(.*?)"', sales_id_re)
            if sales_id == []:
                continue
            auction_sale_link = f'https://www.pickles.com.au/v4/caradvert/saleid-{sales_id[0]}-public?count=true&inav=Car%7Cbc%7Cha%7Cu&q=(And.ProductType.Vehicles._.Year.range({year_from}..{year_to}).)&sr=%7Clot_number_suffix_sequence%7C0%7C30'
            auction_sale_link_requests = requests.get(url=auction_sale_link, headers=headers)

            # 'SearchResults' may be absent from the payload; treat that the
            # same as an empty result set instead of iterating over None.
            auctions_data = auction_sale_link_requests.json().get('SearchResults') or []
            if auctions_data == []:
                print({"No results for": auction_sale_link_requests.url})
            for auction_data in auctions_data:
                ids = auction_data.get('TargetId')
                main_title = auction_data.get('Title')
                link_path = main_title.replace(' ', '-').replace('/', '-').replace(',', '-') + '/' + str(ids)
                each_auction_link = f'https://www.pickles.com.au/cars/item/-/details/{link_path}'
                pickles_links_list.append(each_auction_link)
                print({'Link': each_auction_link})

    # Phase 2: visit each collected link and append sold items to the CSV.
    with open('pickles.csv', 'a+', newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_header = [
            'Title', 'Make','Model', 'Variant', 
            'Transmission', 'Odometer', 'State', 
            'Sale Price', 'Link', 'Sold Date & Time',
            'Sold To', 'Condition Report', 'Description',
            ]
        # csv_writer.writerow(csv_header)
        unique_links_list = list(set(pickles_links_list))
        print('''
            ###################################
            #                                 #
            #                                 #
            #     Now scraping sold items     #
            #                                 #
            #                                 #
            ###################################
                ''')
        sleep(1)
        print({'Total links': f'*** {len(unique_links_list)} ***'})
        sleep(3)
        for each_link in unique_links_list:
            print({'Scraping': each_link})
            # Randomized delay between requests to stay polite to the server.
            random_delay = randint(1, 7)
            print(f'*** Sleeping for [{random_delay}] seconds ***')
            sleep(random_delay)
            each_auction_request = requests.get(each_link, headers=headers)
            response = Selector(text=each_auction_request.text)
            current_status = response.xpath('//h6[@class="mt-2"]/text()[2]').get()

            # Note the trailing space in the status text — it is part of the
            # page markup and the comparison depends on it.
            if current_status == 'This item has been sold. ' and each_link not in sold_auctions_list:
                ids = each_link.split('/')[-1]
                title = response.xpath('//div[@class="row"]//h1/text()').get()
                description = response.xpath('//td[@itemprop="description"]/text()').get()
                if description is None:
                    # Without a description the make/model/variant splits
                    # below would raise AttributeError; skip this item.
                    continue
                condition_report = response.xpath('//a[contains(text(), "Condition Report")]/@href').get()
                # Description is assumed comma-separated as
                # "<...>, <make>, <model>, <variant>, ..." — TODO confirm.
                make = description.split(', ')[1]
                model = description.split(', ')[2]
                variant = description.split(', ')[3]
                transmission = response.xpath('//i[contains(@class, "transmission")]/following-sibling::span/text()').get()
                odometer = response.xpath('//i[contains(@class, "mileage")]/following-sibling::span/text()').get()
                state = response.xpath('//td[contains(text(), "Location")]/following-sibling::td/text()').get().split(', ')[-1]

                # Bid history API: first entry holds the winning bid details.
                bid_history = f'https://www.pickles.com.au/PWR-Web/services/api/bidHistoryService/bidHistory?item={ids}'
                sold_item_request = requests.get(url=bid_history, headers=headers)
                sold_item_resp = sold_item_request.json()[0]
                winning_price = sold_item_resp.get('actualBid')
                sold_time_in_ms = sold_item_resp.get('bidTimeInMilliSeconds')
                sold_date_time = datetime.datetime.fromtimestamp(sold_time_in_ms / 1000.0, tz=datetime.timezone.utc).isoformat()
                sold_to = sold_item_resp.get('bidderAnonName')

                auction_values = [
                    title,  make, model, variant, transmission, odometer, 
                    state, "${:,.2f}".format(winning_price).strip() , 
                    each_auction_request.url, sold_date_time, sold_to,
                    f'https://www.pickles.com.au{condition_report}', description,
                ]

                csv_writer.writerow(auction_values)
                print('*** Sold item found and added to the CSV file ***')
                sold_auctions_list.append(each_link)
            else:
                print('*** This item is not sold yet ***')
                continue
    # NOTE(review): the `pause` setting defined at the top of the script is
    # never used; a sleep(pause) between cycles was presumably intended here —
    # confirm before adding it, since it changes the scraping cadence.

【问题讨论】:

  • 数据库。
  • 您确定将其写入文本文件吗?它在代码中显示了一个扩展名为“.csv”的文件。 ://
  • 是的，我确实把链接保存到了文本文件里，但在文件仍处于打开状态时就去读取，读不到刚写入的内容，所以我放弃了这种做法，改用列表。

标签: python database web-scraping python-requests scrapy


【解决方案1】:

Python sqlitedb 方法: 参考:https://www.tutorialspoint.com/sqlite/sqlite_python.htm

  • 创建 sqlitedb。
  • 创建一个用于存放待抓取 URL 的表，其架构类似于：CREATE TABLE company (url TEXT NOT NULL UNIQUE, status TEXT NOT NULL DEFAULT 'Not Started')
  • 现在只读取状态为“未开始”的行。
  • 抓取完成后,您可以将 URL 的状态列更改为成功。
  • 因此，无论脚本何时重新启动，它都只会处理状态仍为“Not Started”的 URL，不会重复抓取已完成的链接。

【讨论】:

    【解决方案2】:

    您可以使用数据框来跟踪提取的链接,并使用 try catch 保存数据框以防脚本中断。这是示例代码。

    import pandas as pd
    import os
    
    class Scraping_data():
        """Track which links have already been scraped so an interrupted run
        can resume later without re-extracting finished links.

        Progress is persisted to '<data_directory>/extracted_links.csv' with
        columns ['links', 'status'].
        """

        def __init__(self):
            # Directory where the progress file (extracted_links.csv) lives.
            self.data_directory = 'your_data_directory'

        def load_links(self):
            """Return (df_extracted, df_links).

            df_extracted: links already processed (empty frame with columns
            ['links', 'status'] on the first run).
            df_links: rows from './links_file.csv' still pending, i.e. whose
            'links' value is not present in the progress file.
            """
            df_links = pd.read_csv('./links_file.csv')
            if 'extracted_links.csv' in os.listdir(self.data_directory):
                df_extracted = pd.read_csv(os.path.join(self.data_directory, 'extracted_links.csv'))
                # Keep only links not yet recorded as extracted.
                df_links = df_links[~df_links['links'].isin(df_extracted['links'])]
                df_links.reset_index(drop=True, inplace=True)
            else:
                df_extracted = pd.DataFrame(columns=['links', 'status'])

            return df_extracted, df_links

        def _save_progress(self, df_extracted, extracted_users):
            # Append the rows extracted this run and persist the progress file.
            df_extracted = pd.concat([df_extracted, pd.DataFrame(data=extracted_users)], ignore_index=True)
            df_extracted.to_csv(os.path.join(self.data_directory, 'extracted_links.csv'), index=False)

        def scrap_data(self):
            """Process every pending link, saving progress even when the
            scraping logic raises part-way through.
            """
            df_extracted, df_links = self.load_links()
            extracted_users = []

            try:
                for index, row in df_links.iterrows():
                    #Your Scrapping Logic Here.
                    #row['links'] will give you the current link.

                    # Upon successful extraction of a link. (Fixed: the
                    # original read row['link'], but the column is named
                    # 'links', which raised KeyError on every row.)
                    data_row = {'links': row['links'], 'status': 'extracted'}
                    extracted_users.append(data_row)
            except Exception:
                # Save partial progress, then re-raise so the failure is not
                # silently swallowed (the original used a bare `except:` that
                # hid every error, including the KeyError above).
                self._save_progress(df_extracted, extracted_users)
                raise
            else:
                self._save_progress(df_extracted, extracted_users)
    

    【讨论】:

      猜你喜欢
      • 1970-01-01
      • 1970-01-01
      • 2021-12-04
      • 2015-01-09
      • 2016-07-19
      • 2020-07-09
      • 2021-06-07
      • 1970-01-01
      • 1970-01-01
      相关资源
      最近更新 更多