【发布时间】:2018-04-25 08:32:00
【问题描述】:
我正在尝试解析一个 JSON 文件,该文件由使用 tweepy 流式传输的多条推文组成。尝试读取仅包含一条推文的 JSON 文件时,我可以毫无问题地读取它。尝试读取包含多条推文的 JSON 文件时,出现错误。我认为要么我缺少某种类型的编码,要么 JSON 格式不正确。
这是错误:
Traceback (most recent call last):
File "C:\Repos\Testing\parse_json.py", line 16, in <module>
tweet = json.load(data_file)
File "C:\Python\Python36-32\lib\json\__init__.py", line 299, in load
parse_constant=parse_constant, object_pairs_hook=object_pairs_hook, **kw)
File "C:\Python\Python36-32\lib\json\__init__.py", line 354, in loads
return _default_decoder.decode(s)
File "C:\Python\Python36-32\lib\json\decoder.py", line 342, in decode
raise JSONDecodeError("Extra data", s, end)
json.decoder.JSONDecodeError: Extra data: line 3 column 1 (char 2753)
这是我的 parse_json.py 代码:
import json
with open('raw_tweets.json') as data_file:
tweet = json.load(data_file)
print(f"Created: {tweet['created_at']} \t Tweet: {tweet['text']} \t Language: {tweet['lang']}")
这是我的 raw_tweets.json:
{"created_at":"Sun Nov 12 16:11:40 +0000 2017","id":929743506096001024,"id_str":"929743506096001024","text":"#Sentiment analysis of The Velveteen Rabbit https:\/\/t.co\/MLSItn74wz #keyphrase #language #nltk #pattern #python","source":"\u003ca href=\"http:\/\/www.ajaymatharu.com\/\" rel=\"nofollow\"\u003eTweet Old Post\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":811014723328090112,"id_str":"811014723328090112","name":"Addie","screen_name":"RealThinks_","location":null,"url":"http:\/\/students.washington.edu\/adelak\/","description":"Chemist. Data scientist. Dinosaur enthusiast. Pretty big on the whole \"being nice to people\" thing.","translator_type":"none","protected":false,"verified":false,"followers_count":439,"friends_count":388,"listed_count":36,"favourites_count":105,"statuses_count":974,"created_at":"Tue Dec 20 01:05:52 +0000 2016","utc_offset":-28800,"time_zone":"Pacific Time (US & Canada)","geo_enabled":false,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_link_color":"1B95E0","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"000000","profile_text_color":"000000","profile_use_background_image":false,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/811016747037782017\/V32aJln5_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/811016747037782017\/V32aJln5_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/811014723328090112\/1482462522","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"quote_count":0,"reply_count":0,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[{"text":"Sentiment","indices":[0,10]},{"text":"keyphrase","indices":[68,78]},{"text":"language","indices":[79,88]},{"text":"nltk","indices":[89,94]},{"text":"pattern","indices":[95,103]},{"text":"python","indices":[104,111]}],"urls":[{"url":"https:\/\/t.co\/MLSItn74wz","expanded_url":"http:\/\/students.washington.edu\/adelak\/2016\/12\/?p=18","display_url":"students.washington.edu\/adelak\/2016\/12\u2026","indices":[44,67]}],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en","timestamp_ms":"1510503100309"}
{"created_at":"Sun Nov 12 16:11:53 +0000 2017","id":929743561217671168,"id_str":"929743561217671168","text":"RT @kdnuggets: Tips for Getting Started with Text Mining in #rstats and #Python https:\/\/t.co\/dWLXBTu7Ap https:\/\/t.co\/XRyRE9lej8","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":219290077,"id_str":"219290077","name":"Lilia Quituisaca S","screen_name":"LiliaQuituisaca","location":"Guasuntos","url":"http:\/\/liliaquituisaca-poesias.blogspot.com\/","description":"Ingeniera Inform\u00e1tica. OpenSource. JAVA. R-project. Weka. Datamining. Simulaci\u00f3n. Escribo y declamo poes\u00eda -destino,sin miedo- El arte perfecciona los sentidos.","translator_type":"none","protected":false,"verified":false,"followers_count":1125,"friends_count":2308,"listed_count":631,"favourites_count":1228,"statuses_count":26375,"created_at":"Wed Nov 24 13:10:16 +0000 2010","utc_offset":-18000,"time_zone":"Quito","geo_enabled":true,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/pbs.twimg.com\/profile_background_images\/378800000032598931\/b322745f32a73f540fa42b54c27ac75a.jpeg","profile_background_image_url_https":"https:\/\/pbs.twimg.com\/profile_background_images\/378800000032598931\/b322745f32a73f540fa42b54c27ac75a.jpeg","profile_background_tile":true,"profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/780178494466711552\/9ONWkWlC_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/780178494466711552\/9ONWkWlC_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/219290077\/1357069442","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Sun Nov 12 15:42:02 +0000 2017","id":929736050687332354,"id_str":"929736050687332354","text":"Tips for Getting Started with Text Mining in #rstats and #Python https:\/\/t.co\/dWLXBTu7Ap https:\/\/t.co\/XRyRE9lej8","display_text_range":[0,88],"source":"\u003ca href=\"http:\/\/bufferapp.com\" rel=\"nofollow\"\u003eBuffer\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":20167623,"id_str":"20167623","name":"KDnuggets","screen_name":"kdnuggets","location":"Brookline, MA, USA","url":"https:\/\/www.kdnuggets.com\/","description":"Covering #AI, #Analytics, #BigData, #DataMining, #DataScience #MachineLearning, #DeepLearning. Founded by Gregory Piatetsky-Shapiro.","translator_type":"regular","protected":false,"verified":false,"followers_count":98870,"friends_count":400,"listed_count":5477,"favourites_count":448,"statuses_count":42854,"created_at":"Thu Feb 05 17:37:26 +0000 2009","utc_offset":-18000,"time_zone":"Eastern Time (US & Canada)","geo_enabled":false,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"0099B9","profile_background_image_url":"http:\/\/pbs.twimg.com\/profile_background_images\/378800000050609482\/c32d3d654da4eb7c0518e7bba2609522.jpeg","profile_background_image_url_https":"https:\/\/pbs.twimg.com\/profile_background_images\/378800000050609482\/c32d3d654da4eb7c0518e7bba2609522.jpeg","profile_background_tile":true,"profile_link_color":"0099B9","profile_sidebar_border_color":"5ED4DC","profile_sidebar_fill_color":"95E8EC","profile_text_color":"3C3940","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/795788806008041473\/0nTPcRja_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/795788806008041473\/0nTPcRja_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/20167623\/1476546748","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"quote_count":0,"reply_count":0,"retweet_count":9,"favorite_count":6,"entities":{"hashtags":[{"text":"rstats","indices":[45,52]},{"text":"Python","indices":[57,64]}],"urls":[{"url":"https:\/\/t.co\/dWLXBTu7Ap","expanded_url":"https:\/\/buff.ly\/2zyOtix","display_url":"buff.ly\/2zyOtix","indices":[65,88]}],"user_mentions":[],"symbols":[],"media":[{"id":929736048183336960,"id_str":"929736048183336960","indices":[89,112],"media_url":"http:\/\/pbs.twimg.com\/media\/DOcWAobX4AAf9ze.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/DOcWAobX4AAf9ze.jpg","url":"https:\/\/t.co\/XRyRE9lej8","display_url":"pic.twitter.com\/XRyRE9lej8","expanded_url":"https:\/\/twitter.com\/kdnuggets\/status\/929736050687332354\/photo\/1","type":"photo","sizes":{"large":{"w":800,"h":338,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"medium":{"w":800,"h":338,"resize":"fit"},"small":{"w":680,"h":287,"resize":"fit"}}}]},"extended_entities":{"media":[{"id":929736048183336960,"id_str":"929736048183336960","indices":[89,112],"media_url":"http:\/\/pbs.twimg.com\/media\/DOcWAobX4AAf9ze.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/DOcWAobX4AAf9ze.jpg","url":"https:\/\/t.co\/XRyRE9lej8","display_url":"pic.twitter.com\/XRyRE9lej8","expanded_url":"https:\/\/twitter.com\/kdnuggets\/status\/929736050687332354\/photo\/1","type":"photo","sizes":{"large":{"w":800,"h":338,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"medium":{"w":800,"h":338,"resize":"fit"},"small":{"w":680,"h":287,"resize":"fit"}}}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en"},"is_quote_status":false,"quote_count":0,"reply_count":0,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[{"text":"rstats","indices":[60,67]},{"text":"Python","indices":[72,79]}],"urls":[{"url":"https:\/\/t.co\/dWLXBTu7Ap","expanded_url":"https:\/\/buff.ly\/2zyOtix","display_url":"buff.ly\/2zyOtix","indices":[80,103]}],"user_mentions":[{"screen_name":"kdnuggets","name":"KDnuggets","id":20167623,"id_str":"20167623","indices":[3,13]}],"symbols":[],"media":[{"id":929736048183336960,"id_str":"929736048183336960","indices":[104,127],"media_url":"http:\/\/pbs.twimg.com\/media\/DOcWAobX4AAf9ze.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/DOcWAobX4AAf9ze.jpg","url":"https:\/\/t.co\/XRyRE9lej8","display_url":"pic.twitter.com\/XRyRE9lej8","expanded_url":"https:\/\/twitter.com\/kdnuggets\/status\/929736050687332354\/photo\/1","type":"photo","sizes":{"large":{"w":800,"h":338,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"medium":{"w":800,"h":338,"resize":"fit"},"small":{"w":680,"h":287,"resize":"fit"}},"source_status_id":929736050687332354,"source_status_id_str":"929736050687332354","source_user_id":20167623,"source_user_id_str":"20167623"}]},"extended_entities":{"media":[{"id":929736048183336960,"id_str":"929736048183336960","indices":[104,127],"media_url":"http:\/\/pbs.twimg.com\/media\/DOcWAobX4AAf9ze.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/DOcWAobX4AAf9ze.jpg","url":"https:\/\/t.co\/XRyRE9lej8","display_url":"pic.twitter.com\/XRyRE9lej8","expanded_url":"https:\/\/twitter.com\/kdnuggets\/status\/929736050687332354\/photo\/1","type":"photo","sizes":{"large":{"w":800,"h":338,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"medium":{"w":800,"h":338,"resize":"fit"},"small":{"w":680,"h":287,"resize":"fit"}},"source_status_id":929736050687332354,"source_status_id_str":"929736050687332354","source_user_id":20167623,"source_user_id_str":"20167623"}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en","timestamp_ms":"1510503113451"}
这是我的听众的代码:
import tweepy
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import json
consumer_key = ''
consumer_secret = ''
access_token = ''
access_token_secret = ''
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)
class MyListener(StreamListener):
def on_data(self, data):
try:
with open('raw_tweets.json', 'a') as f:
f.write(data)
print("Tweet")
return True
except BaseException as e:
print("Error on_data: %s" % str(e))
return True
def on_error(self, status):
print(status)
if __name__ == '__main__':
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
twitter_stream = Stream(auth, MyListener())
print("Streaming Tweets...")
twitter_stream.filter(track=['#python'], languages=['en'])
【问题讨论】:
-
这不是有效的 JSON。至少,
json模块不会读取它。您需要将各个推文对象放入列表中。
标签: python json twitter tweepy