【发布时间】:2017-12-05 08:20:30
【问题描述】:
我正在开发一个 Naver 爬虫(Naver 相当于韩国的 Google :P)。这段代码我已经写了一周,现在只剩最后一个问题没有解决。下面的代码通过 Naver API 抓取数据,每次循环都把返回的数据保存到 “js” 中。我需要做的是把每次循环得到的 DataFrame(dfdfdf)收集起来,并在最后合并成一个结果。但我得到的结果始终只包含最后一次循环的数据。总之,我想把每次循环生成的 DataFrame 都追加到最终结果里。我尝试过 merge 和 join,但似乎都不起作用。如果下面的代码有不合理(或太乱)的地方,也请指出!
import os
import sys
import urllib.request
import pandas as pd
import json
import numpy as np
from datetime import datetime, timedelta
import time
# Load the keyword hierarchy sheet. The context manager closes the workbook
# handle as soon as the sheet has been parsed.
with pd.ExcelFile('mat_hierarchy.xlsx') as workbook:
    ex = workbook.parse('Sheet1')

# Candidate keywords: every value of the column at position 3, followed by
# every value of the column at position 2 (positions are 0-based).
DNA1 = []
DNA1.extend(ex.iloc[:, 3])
DNA1.extend(ex.iloc[:, 2])

# De-duplicate while keeping first-seen order. A dict preserves insertion
# order, so its keys are exactly the ordered unique values of DNA1.
# Values are left untouched so positions in DNA stay the same as before.
DNA = list(dict.fromkeys(DNA1))
# --- Date range and the frame every result is aligned to --------------------
# Read the clock once so the two derived dates below cannot disagree if the
# script happens to run across midnight.
today = datetime.now().date()
# Weekly grid: every Monday from 2016-01-01 up to two days ago.
# (Dropping `freq` would give a daily grid instead.)
dd = pd.date_range(start='2016-01-01', end=today - timedelta(2), freq='W-MON')
# The API query itself ends yesterday.
setendDate = today - timedelta(1)
endDate = setendDate.strftime('%Y-%m-%d')
# Column-less frame indexed by the weekly grid; each batch of results is
# joined onto it.
Data = pd.DataFrame(index=dd)

# --- Naver API connection ----------------------------------------------------
# Credentials are taken from the environment when available so real secrets
# need not be committed; the original placeholders remain the defaults.
client_id = os.environ.get("NAVER_CLIENT_ID", "ID")
client_secret = os.environ.get("NAVER_CLIENT_SECRET", "PW")
url = "https://openapi.naver.com/v1/datalab/search"

# --- Request body fragments --------------------------------------------------
# Pieces of the JSON request body; a full body is assembled as
#   body_intro + <end date> + body_endDate
#   + <name> + body_keywords + <keyword>            (first group)
#   + body_groupName + <name> + body_keywords + <keyword>   (each further group)
#   + body_last
body_intro = '{"startDate":"2016-01-01","endDate":"'
body_endDate = '","timeUnit":"date","keywordGroups":[{"groupName":"'
body_keywords = '","keywords":["'
body_groupName = '"]},{"groupName":"'
body_last = '"]}],"ages":["1","2","3","4","5","6","7","8","9","10","11"]}'
# --- Crawl the API in keyword batches and merge everything -------------------

# Index into DNA of the first keyword to request.
# NOTE(review): 2270 looks like a resume offset left over from an earlier
# partial run; set it to 0 to crawl the whole list -- confirm.
START_INDEX = 2270
# Number of keyword groups sent per request (the original sent up to five).
BATCH_SIZE = 5
# Age-group codes sent with every request.
AGES = [str(code) for code in range(1, 12)]


def _build_request_body(keywords, end_date):
    """Return the JSON request body for one batch of keywords.

    Each keyword becomes its own group whose name is the keyword itself.
    Serialising with ``json.dumps`` (instead of concatenating fragments)
    keeps the body valid even when a keyword contains quotes or backslashes.

    Args:
        keywords: The keywords of this batch; any length from 1 upward.
        end_date: Last day of the query as a ``YYYY-MM-DD`` string.

    Returns:
        The request body as a ``str``.
    """
    groups = [
        # str(): cells read from a spreadsheet are not guaranteed to be text.
        {"groupName": str(keyword), "keywords": [str(keyword)]}
        for keyword in keywords
    ]
    return json.dumps(
        {
            "startDate": "2016-01-01",
            "endDate": end_date,
            "timeUnit": "date",
            "keywordGroups": groups,
            "ages": AGES,
        },
        # Send non-ASCII keywords as-is; the body is UTF-8 encoded on send.
        ensure_ascii=False,
    )


def _fetch_trends(body):
    """POST *body* to the API and return the decoded JSON response.

    Args:
        body: JSON request body as a ``str``.

    Returns:
        The parsed response, or ``None`` when the status code is not 200
        (the code is printed). Errors raised by ``urlopen`` itself are not
        caught here and propagate to the caller.
    """
    request = urllib.request.Request(url)
    request.add_header("X-Naver-Client-Id", client_id)
    request.add_header("X-Naver-Client-Secret", client_secret)
    request.add_header("Content-Type", "application/json")
    # The context manager closes the connection even if decoding fails.
    with urllib.request.urlopen(request, data=body.encode("utf-8")) as response:
        rescode = response.getcode()
        if rescode != 200:
            # str(): the status code is an int and cannot be added to a str.
            print("Error Code:" + str(rescode))
            return None
        return json.loads(response.read().decode('utf-8'))


def _results_to_frame(payload):
    """Turn one API response into a frame with one column per result title.

    A result without data points becomes a single NaN row at the response's
    start date so that its column still appears in the output.

    Args:
        payload: Parsed response containing ``results`` and ``startDate``.

    Returns:
        A ``DataFrame`` indexed by date with one column per result.
    """
    frames = []
    for result in payload['results']:
        if len(result['data']) > 0:
            frame = (
                pd.DataFrame(result['data'])
                .set_index('period')
                .rename(columns={'ratio': result['title']})
            )
        else:
            frame = pd.DataFrame(
                [np.nan],
                columns=[result['title']],
                index=[payload['startDate']],
            )
        frames.append(frame)
    combined = pd.concat(frames, axis=1)
    # Periods arrive as strings; parse them explicitly instead of relying on
    # implicit coercion when joining against the DatetimeIndex of `Data`.
    combined.index = pd.to_datetime(combined.index)
    return combined


df_list = []
for i in range(START_INDEX, len(DNA), BATCH_SIZE):
    # Slicing yields a full batch, or the shorter remainder at the end of the
    # list, which replaces the separate 5/4/3/2/1 branches.
    batch = DNA[i:i + BATCH_SIZE]
    print(str(len(batch)))
    payload = _fetch_trends(_build_request_body(batch, endDate))
    if payload is not None:
        # Align this batch to the shared date grid and keep it for the merge.
        df_list.append(Data.join(_results_to_frame(payload)))
    print("end")
    # Pause between requests.
    time.sleep(.5)

# Merge once, after the loop, so every batch ends up in the output.
# With no batches at all, fall back to the empty date grid instead of failing.
Final = pd.concat(df_list, axis=1) if df_list else Data.copy()
Final.to_csv("Naver123.csv")
【问题讨论】:
-
只有 body 在各个 if 块之间有变化吗?
-
是的。只有请求“body”的关键字在变化。
标签: python join dataframe merge web-crawler