【发布时间】:2020-11-16 21:20:58
【问题描述】:
我抓取了 PS4 游戏的网页数据,想用它画一个条形图:x 轴使用我的 product_name 变量,y 轴使用我的 price(价格)变量。另外,我还想为最便宜的 PS4 游戏画一个饼图:根据 price 变量,计算数据中出现的 5 个品牌(都保存在我的 brand 变量中)——PlayStation、Ubisoft、Activision、Sega 和 Electronic Arts——各自所占的百分比(合计 100%)。饼图上要显示每个品牌的名称和百分比,其余的百分比归入“其他”标签。请问我该如何着手?有什么提示吗?
import re
from collections import defaultdict
from urllib.request import urlopen as uReq

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup as soup
# Scrape one page of a PS4 video-game listing and build two tables:
#   df  -> Product, shipping
#   df1 -> Product, Brand, price
# Both are written to CSV and then merged into df2 on the product name.
url = 'https://www.newegg.com/PS4-Video-Games/SubCategory/ID-3141'
with uReq(url) as uClient:
    page = uClient.read()

# parsing
page_soup = soup(page, "html.parser")
# grabs products: one "item-container" div per listing
containers = page_soup.findAll("div", {"class": "item-container"})

# NOTE: not used below; the CSV file names are hard-coded where they are written
filename = "products.csv"

# column name -> list of values, one entry per scraped product
d = defaultdict(list)
d1 = defaultdict(list)

# for loop fills dict
for container in containers:
    # product name; a container without a title link is not a usable row
    title = container.findAll("a", {"class": "item-title"})
    if not title:
        continue
    product_name = title[0].text

    # brand name comes from the title attribute of the nested logo image;
    # fall back to None when any link in that chain is missing
    try:
        brand = container.div.div.a.img["title"]
    except (AttributeError, TypeError, KeyError):
        brand = None

    # shipping text, e.g. "Free Shipping" or "$4.99 Shipping"
    shipping_container = container.findAll("li", {"class": "price-ship"})
    shipping = shipping_container[0].text.strip() if shipping_container else ""

    # price column
    pricec = container.find("li", {"class": "price-current"})
    # strip() with no argument removes surrounding whitespace; passing
    # 'price-current' would strip that *set of characters* instead
    price = pricec.text.strip() if pricec is not None else ""

    d['Product'].append(product_name)
    d['shipping'].append(shipping)
    d1['Product'].append(product_name)
    d1['Brand'].append(brand)
    d1['price'].append(price)

# create dataframe; explicit columns keep the frames usable when nothing was scraped
df = pd.DataFrame(d, columns=['Product', 'shipping'])  # product and shipping
df1 = pd.DataFrame(d1, columns=['Product', 'Brand', 'price'])  # product, brand and price

# cleaning price column: drop thousands separators, then take the first
# number (integer or decimal); rows without a number become NaN
df1['price'] = (
    df1['price']
    .str.replace(',', '', regex=False)
    .str.extract(r'(\d+(?:\.\d+)?)', expand=False)
    .astype(float)
)

# clean shipping column: "Free Shipping" costs nothing; "Special Shipping" is
# also treated as 0 (probably should be handled in a special way)
no_charge = df['shipping'].isin(['Free Shipping', 'Special Shipping'])
# keep the decimal point so "$4.99 Shipping" becomes 4.99 rather than 499;
# unrecognised text becomes NaN instead of raising in the float conversion
df['shipping'] = (
    df['shipping']
    .str.replace(',', '', regex=False)
    .str.extract(r'(\d+(?:\.\d+)?)', expand=False)
    .astype(float)
    .mask(no_charge, 0.0)
)

# save dataframe to csv file
df.to_csv('dataframe.csv', index=False)
df1.to_csv('dataframe1.csv', index=False)

# merge on the only shared column; note that repeated product names
# produce one merged row per matching pair
df2 = pd.merge(df, df1, how='inner', on='Product')
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import pandas as pd
import numpy as np
from collections import defaultdict
import re
# Scrape one page of a PS4 video-game listing and build two tables:
#   df  -> Product, shipping
#   df1 -> Product, Brand, price
# Both are written to CSV and then merged into df2 on the product name.
url = 'https://www.newegg.com/PS4-Video-Games/SubCategory/ID-3141'
with uReq(url) as uClient:
    page = uClient.read()

# parsing
page_soup = soup(page, "html.parser")
# grabs products: one "item-container" div per listing
containers = page_soup.findAll("div", {"class": "item-container"})

# NOTE: not used below; the CSV file names are hard-coded where they are written
filename = "products.csv"

# column name -> list of values, one entry per scraped product
d = defaultdict(list)
d1 = defaultdict(list)

# for loop fills dict
for container in containers:
    # product name; a container without a title link is not a usable row
    title = container.findAll("a", {"class": "item-title"})
    if not title:
        continue
    product_name = title[0].text

    # brand name comes from the title attribute of the nested logo image;
    # fall back to None when any link in that chain is missing
    try:
        brand = container.div.div.a.img["title"]
    except (AttributeError, TypeError, KeyError):
        brand = None

    # shipping text, e.g. "Free Shipping" or "$4.99 Shipping"
    shipping_container = container.findAll("li", {"class": "price-ship"})
    shipping = shipping_container[0].text.strip() if shipping_container else ""

    # price column
    pricec = container.find("li", {"class": "price-current"})
    # strip() with no argument removes surrounding whitespace; passing
    # 'price-current' would strip that *set of characters* instead
    price = pricec.text.strip() if pricec is not None else ""

    d['Product'].append(product_name)
    d['shipping'].append(shipping)
    d1['Product'].append(product_name)
    d1['Brand'].append(brand)
    d1['price'].append(price)

# create dataframe; explicit columns keep the frames usable when nothing was scraped
df = pd.DataFrame(d, columns=['Product', 'shipping'])  # product and shipping
df1 = pd.DataFrame(d1, columns=['Product', 'Brand', 'price'])  # product, brand and price

# cleaning price column: drop thousands separators, then take the first
# number (integer or decimal); rows without a number become NaN
df1['price'] = (
    df1['price']
    .str.replace(',', '', regex=False)
    .str.extract(r'(\d+(?:\.\d+)?)', expand=False)
    .astype(float)
)

# clean shipping column: "Free Shipping" costs nothing; "Special Shipping" is
# also treated as 0 (probably should be handled in a special way)
no_charge = df['shipping'].isin(['Free Shipping', 'Special Shipping'])
# keep the decimal point so "$4.99 Shipping" becomes 4.99 rather than 499;
# unrecognised text becomes NaN instead of raising in the float conversion
df['shipping'] = (
    df['shipping']
    .str.replace(',', '', regex=False)
    .str.extract(r'(\d+(?:\.\d+)?)', expand=False)
    .astype(float)
    .mask(no_charge, 0.0)
)

# save dataframe to csv file
df.to_csv('dataframe.csv', index=False)
df1.to_csv('dataframe1.csv', index=False)

# merge on the only shared column; note that repeated product names
# produce one merged row per matching pair
df2 = pd.merge(df, df1, how='inner', on='Product')
# Bar chart of price per product, drawn from the merged table df2.
# Bars are placed at integer positions and labelled with the product names,
# so repeated product names still get one bar each.
positions = range(len(df2))
plt.bar(positions, df2["price"], color='blue')
plt.xlabel("Product")
plt.ylabel("Price")
plt.title("PS4 game prices")
# product names are long, so rotate the tick labels to keep them readable
plt.xticks(positions, df2["Product"], rotation=90)
# Turn on the grid
plt.minorticks_on()
plt.grid(which='major', linestyle='-', linewidth=0.5, color='red')
# Customize the minor grid
plt.grid(which='minor', linestyle=':', linewidth=0.5, color='black')
# make room for the rotated labels before showing the figure
plt.tight_layout()
plt.show()
【问题讨论】:
标签: python dataframe matplotlib