【发布时间】:2020-06-21 08:50:22
【问题描述】:
我正在编写一个脚本,它会从一个站点中抓取所有 gpu,并将它们放入 csv 中。然后它应该将今天的csv与昨天的csv进行比较,看看是否有同名但价格不同的gpus(这样我就可以看到价格下降了)。我目前坚持这一点。有人可以帮忙吗?
代码:
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import datetime
import pandas as pd
import numpy as np
class Scraper():
def __init__(self):
url = 'https://www.megekko.nl/Computer/Componenten/Videokaarten'
PATH = 'E:/win21/chromedriver_win32/chromedriver.exe'
self.driver = webdriver.Chrome(PATH)
self.today = str(datetime.date.today()).replace('-', '').replace(' ', '').replace(':','').replace('.', '')
self.yesterday = str(int(self.today) - 1)
self.fname = self.today + "products.csv"
self.f = open(self.fname, 'a')
self.fname2 = self.yesterday + "products.csv"
self.f = open(self.fname2, "a")
self.names = []
self.deliverytimes = []
self.prices = []
self.differences = []
self.driver.get(url)
self.filter()
# self.main(28)
self.compare(self.today + "products.csv", self.yesterday + "products.csv")
self.driver.close()
def main(self, ScrapedPages):
#Loop trough each page and get its data
for i in range(ScrapedPages):
self.soup = BeautifulSoup(self.driver.page_source, 'html.parser')
self.get_data()
self.export_data()
#Function to go to the next page
def next_page(self):
time.sleep(1)
next_page_element = self.driver.find_element_by_xpath('html/body/div[1]/main/div[1]/div[5]/div[1]/div[3]/img')
self.driver.execute_script("arguments[0].click();", next_page_element)
time.sleep(1)
def export_data(self):
raw_data = {"name": self.names,
"deliverytime": self.deliverytimes,
"price": self.prices}
self.df = pd.DataFrame(raw_data, columns = ['name', 'deliverytime', 'price'])
self.df.to_csv(self.today + "products.csv", index=False)
def append_data(self):
self.names.append(self.name)
self.deliverytimes.append(self.delivery_time)
self.prices.append(self.price)
print(self.name)
print(self.delivery_time)
print(self.price + "\n")
def get_data(self):
for self.container in self.soup.find_all('div', {'class':'navProductListitem'}):
self.name = self.container.div.img['title']
price_container = self.container.find_all('div', {'class':'euro'})
self.price = price_container[0]
self.price = self.price.text.strip()
self.price = self.price.replace(",", "").replace("-", "").replace('"', "")
self.price += ",-"
delivery_time_container = self.container.find_all('div', {'class':'voorraad'})
self.delivery_time = delivery_time_container[0].text.strip()
self.append_data()
self.next_page()
def compare(self, file1, file2):
df1 = pd.read_csv(file1)
df2 = pd.read_csv(file2)
df_merged = pd.concat([df1, df2])
duplicates_df = df_merged[df_merged.duplicated(['name'])]
duplicates_df.to_csv("TEST1.csv")
print(duplicates_df)
# df_merged.insert(3, "Difference", "")
df_merged.to_csv("TEST2.csv", index=False)
def filter(self):
amd_checkbox_elem = self.driver.find_element_by_xpath('html/body/div/main/div[1]/div[5]/div[3]/div/div[5]/div/div[2]/label')
self.driver.execute_script("arguments[0].click();",amd_checkbox_elem)
time.sleep(2)
nvidia_checkbox_elem = self.driver.find_element_by_xpath('html/body/div/main/div[1]/div[5]/div[3]/div/div[5]/div/div[1]/label')
self.driver.execute_script("arguments[0].click();",nvidia_checkbox_elem)
time.sleep(2)
if __name__ == '__main__':
app = Scraper()
print("finished")
您可以看到我在搞乱比较功能,但它还没有工作。 它应该做什么的例子:
昨天的文件: RTX 2080Ti,1250 美元 RX 580,200 美元
今天的文件: RTX 2080Ti,1200 美元 RX 580,200 美元
比较函数应该创建一个包含 rtx 2080ti 及其从今天开始的价格的 csv/dataframe。 rx 580 应该被忽略
csv 文件结构示例:
【问题讨论】:
-
你能提供一个小例子来说明你的 df1 和 df2 在 compare() 中的结构吗?我猜 gpu 名称是列名?
-
在帖子中添加了一张图片。它们的结构相同
标签: python pandas web-scraping