【发布时间】:2026-01-13 11:10:01
【问题描述】:
我想从这个website的表中提取数据
天气历史表
我需要什么
- 提取所有
<td>in<table class="responsive">\<tbody>\<tr>的所有内容 - 从 python 中生成包含完整数据的 Excel 文件。
我不需要的东西
- 数字旁边的所有单位(即:22°C)
(我不想在
<td>中加入<span>)<td class="data-cell alt-cell">33.8 <span class="table-unit">°C</span></td> - 排除
<tbody>\<tr class="column-heading">和<tr class="row-subheading">中的重复标头
谁能展示并解释我将如何在 Excel 文件中提取这些数据?
HTML 代码
<table id="history_table" class="responsive">
<thead>
<tr class="column-heading">
<th class="year-cell">2016</th>
<th colspan="3">Temperature</th>
<th colspan="3">Dew Point</th>
<th colspan="3">Humidity</th>
<th colspan="3">Speed</th>
<th colspan="3">Pressure</th>
<th>Precip. Accum.</th>
</tr>
<tr class="row-subheading"><th>Sep</th>
<th class="alt-cell">High</th>
<th class="alt-cell">Avg</th>
<th class="alt-cell">Low</th>
<th>High</th>
<th>Avg</th>
<th>Low</th>
<th class="alt-cell">High</th>
<th class="alt-cell">Avg</th>
<th class="alt-cell">Low</th>
<th>High</th>
<th>Avg</th>
<th>Gust</th>
<th class="alt-cell">High</th>
<th class="alt-cell">Avg</th>
<th class="alt-cell">Low</th>
<th>Sum</th>
</tr>
</thead>
<tbody>
<tr>
<td class="data-cell">12</td>
<td class="data-cell alt-cell">33.8 <span class="table-unit">°C</span></td>
<td class="data-cell alt-cell">26.1 <span class="table-unit">°C</span></td>
<td class="data-cell alt-cell">18.4 <span class="table-unit">°C</span></td>
<td class="data-cell">17.6 <span class="table-unit">°C</span></td>
<td class="data-cell">16 <span class="table-unit">°C</span></td>
<td class="data-cell">13.4 <span class="table-unit">°C</span></td>
<td class="data-cell alt-cell">88 <span class="table-unit">%</span></td>
<td class="data-cell alt-cell">55 <span class="table-unit">%</span></td>
<td class="data-cell alt-cell">30 <span class="table-unit">%</span></td>
<td class="data-cell">12 <span class="table-unit">kph</span></td>
<td class="data-cell">1 <span class="table-unit">kph</span></td>
<td class="data-cell">16 <span class="table-unit">kph</span></td>
<td class="data-cell alt-cell">1016 <span class="table-unit">hPa</span></td>
<td class="data-cell alt-cell">1014 <span class="table-unit">hPa</span></td>
<td class="data-cell alt-cell">1012 <span class="table-unit">hPa</span></td>
<td class="data-cell">0 <span class="table-unit">mm</span></td>
</tr>
<tr>
<td class="data-cell">13</td>
<td class="data-cell alt-cell">34.2 <span class="table-unit">°C</span></td>
<td class="data-cell alt-cell">29 <span class="table-unit">°C</span></td>
<td class="data-cell alt-cell">23.8 <span class="table-unit">°C</span></td>
<td class="data-cell">17.4 <span class="table-unit">°C</span></td>
<td class="data-cell">15.6 <span class="table-unit">°C</span></td>
<td class="data-cell">12.7 <span class="table-unit">°C</span></td>
<td class="data-cell alt-cell">61 <span class="table-unit">%</span></td>
<td class="data-cell alt-cell">49 <span class="table-unit">%</span></td>
<td class="data-cell alt-cell">29 <span class="table-unit">%</span></td>
<td class="data-cell">12 <span class="table-unit">kph</span></td>
<td class="data-cell">3 <span class="table-unit">kph</span></td>
<td class="data-cell">16 <span class="table-unit">kph</span></td>
<td class="data-cell alt-cell">1013 <span class="table-unit">hPa</span></td>
<td class="data-cell alt-cell">1010 <span class="table-unit">hPa</span></td>
<td class="data-cell alt-cell">1008 <span class="table-unit">hPa</span></td>
<td class="data-cell">0 <span class="table-unit">mm</span></td>
</tr>
<tr class="column-heading">
<td class="year-cell">2017</td>
<td colspan="3">Temperature</td>
<td colspan="3">Dew Point</td>
<td colspan="3">Humidity</td>
<td colspan="3">Speed</td>
<td colspan="3">Pressure</td>
<td>Precip. Accum.</td>
</tr>
<tr class="row-subheading">
<td>Apr</td>
<td class="alt-cell">High</td>
<td class="alt-cell">Avg</td>
<td class="alt-cell">Low</td>
<td>High</td>
<td>Avg</td>
<td>Low</td>
<td class="alt-cell">High</td>
<td class="alt-cell">Avg</td>
<td class="alt-cell">Low</td>
<td>High</td>
<td>Avg</td>
<td>Gust</td>
<td class="alt-cell">High</td>
<td class="alt-cell">Avg</td>
<td class="alt-cell">Low</td>
<td>Sum</td>
</tr>
<tr>
<td class="data-cell">1</td>
<td class="data-cell alt-cell">17.4 <span class="table-unit">°C</span></td>
<td class="data-cell alt-cell">14.1 <span class="table-unit">°C</span></td>
<td class="data-cell alt-cell">10.7 <span class="table-unit">°C</span></td>
<td class="data-cell">10.2 <span class="table-unit">°C</span></td>
<td class="data-cell">7.4 <span class="table-unit">°C</span></td>
<td class="data-cell">4.7 <span class="table-unit">°C</span></td>
<td class="data-cell alt-cell">82 <span class="table-unit">%</span></td>
<td class="data-cell alt-cell">68 <span class="table-unit">%</span></td>
<td class="data-cell alt-cell">45 <span class="table-unit">%</span></td>
<td class="data-cell">11 <span class="table-unit">kph</span></td>
<td class="data-cell">5 <span class="table-unit">kph</span></td>
<td class="data-cell">18 <span class="table-unit">kph</span></td>
<td class="data-cell alt-cell">1016 <span class="table-unit">hPa</span></td>
<td class="data-cell alt-cell">1015 <span class="table-unit">hPa</span></td>
<td class="data-cell alt-cell">1013 <span class="table-unit">hPa</span></td>
<td class="data-cell">0 <span class="table-unit">mm</span></td>
</tr>...
Python 代码
from xlsxwriter import Workbook
from bs4 import BeautifulSoup
def read_file():
file = open('meteo.html', 'rt', encoding='UTF8')
data = file.read()
file.close()
return data
data_path ='/Users/Xtro/Dropbox/Work/test/data/out/meteo'
def write_data_to_excel_file(datas,data_path):
#print(datas[8])
workbook=Workbook(data_path +'/meteo.xlsx')
worksheet = workbook.add_worksheet()
row=0
worksheet.write(row,0,'Date')
worksheet.write(row,1,'Température haute en °C')
worksheet.write(row,2,'Température moyenne en °C')
worksheet.write(row,3,'Température basse en °C')
worksheet.write(row,4,'Point de rosée haut en °C')
worksheet.write(row,5,'Point de rosée moyenne en °C')
worksheet.write(row,6,'Point de rosée bas')
worksheet.write(row,7,'humidité haute en %')
worksheet.write(row,8,'humidité moyenne en %')
worksheet.write(row,9,'humidité basse en %')
worksheet.write(row,10,'vitesse haute en km/h')
worksheet.write(row,11,'raffale en km/h')
worksheet.write(row,12,'Pression haute en hPa')
worksheet.write(row,13,'Pression moyenne en hPa')
worksheet.write(row,14,'Pression basse en hPa')
worksheet.write(row,15,'précipitation/jour en mm')
row+=1
for data in datas:
print(data[1])
cellule0=data[1]
worksheet.write(row,0,cellule0)
cellule0=data[0]
cellule1=data[1]
cellule2=data[2]
cellule3=data[3]
cellule4=data[4]
cellule5=data[5]
cellule6=data[6]
cellule7=data[7]
cellule8=data[8]
cellule9=data[9]
cellule10=data[10]
cellule11=data[11]
cellule12=data[12]
cellule13=data[13]
cellule14=data[14]
cellule15=data[15]
#cellule[i]=data[i]
worksheet.write(row,0,cellule0)
worksheet.write(row,1,cellule1)
worksheet.write(row,2,cellule2)
worksheet.write(row,3,cellule3)
worksheet.write(row,4,cellule4)
worksheet.write(row,5,cellule5)
worksheet.write(row,6,cellule6)
worksheet.write(row,7,cellule7)
worksheet.write(row,8,cellule8)
worksheet.write(row,9,cellule9)
worksheet.write(row,10,cellule10)
worksheet.write(row,11,cellule11)
worksheet.write(row,12,cellule12)
worksheet.write(row,13,cellule13)
worksheet.write(row,14,cellule14)
worksheet.write(row,14,cellule15)
row +=1
workbook.close()
soup = BeautifulSoup(read_file(),'lxml')
data = []
table = soup.find('table',class_='responsive')
table_body = table.find('tbody')
rows = table_body.find_all('tr')
for tr in rows:
spans = tr.find_all('span')
#print(spans)
if spans:
continue
#print (rows)
for row in rows:
cols = row.find_all('td')
#print (cols)
cols = [ele.text.strip() for ele in cols]
data.append([ele for ele in cols if ele])
write_data_to_excel_file(data,data_path)
【问题讨论】:
标签: python excel python-3.x beautifulsoup export-to-excel