从 JSON 文件到 R 中的数据框答案

【问题标题】：From JSON file to Data Frame in R从 JSON 文件到 R 中的数据框
【发布时间】：2017-12-15 18:09:25
【问题描述】：

我有一个 JSON 格式的推文摘录。我附上了数据样本。我需要将此 JSON 转换为数据框。

到目前为止，我设法使用“jsonlite”包对其进行了转换：

json_data <- jsonlite::stream_in(file("myjsonfile.txt"))

但它不会加载推文中包含的所有信息。例如，我只看到转发推文的用户，但看不到发布推文的用户。您可以通过复制粘贴文件并选择格式来更好地使用此网站查看 json 文件：http://jsonviewer.stack.hu/

数据来自 Twitter API（有关此数据的更多信息，请点击此处：https://dev.twitter.com/overview/api/tweets

提前感谢您的时间和帮助。

ML_Enthousiast

{"favorited": false, "in_reply_to_status_id_str": null, "in_reply_to_user_id": null, "truncated": false, "in_reply_to_user_id_str": null, "coordinates": null, "retweeted": false, "text": "RT @Antoniotalks: Revenue streams for #OpenData companies!\n#Cloud #StartUp #SMM #AI #IoT #Fintech #BigData #deeplearning #Mpgvip\u2026 ", "retweet_count": 0, "filter_level": "low", "created_at": "Thu Jun 29 18:47:18 +0000 2017", "favorite_count": 0, "retweeted_status": {"favorited": false, "in_reply_to_status_id_str": null, "in_reply_to_user_id": null, "display_text_range": [0, 140], "truncated": true, "in_reply_to_user_id_str": null, "coordinates": null, "retweeted": false, "text": "Revenue streams for #OpenData companies!\n#Cloud #StartUp #SMM #AI #IoT #Fintech #BigData #deeplearning #Mpgvip\u2026 ", "retweet_count": 38, "filter_level": "low", "created_at": "Wed Jun 28 12:45:08 +0000 2017", "favorite_count": 48, "in_reply_to_screen_name": null, "extended_tweet": {"extended_entities": {"media": [{"media_url_https": "", "sizes": {"thumb": {"w": 150, "h": 150, "resize": "crop"}, "large": {"w": 1200, "h": 927, "resize": "fit"}, "medium": {"w": 1200, "h": 927, "resize": "fit"}, "small": {"w": 680, "h": 525, "resize": "fit"}}, "type": "photo", "expanded_url": "", "id": 880044388679901184, "media_url": "http://pbs.twimg.com/media/DDaLtXXXYAAI2eM.jpg", "id_str": "880044388679901184", "display_url": "pic.twitter.com/aw9HeukUYv", "indices": [139, 162], "url": ""}]}, "full_text": "Revenue streams for #OpenData companies!\n#Cloud #StartUp #SMM #AI #IoT #Fintech #BigData #deeplearning #Mpgvip #defstar5 #DataScience #CIO ", "entities": {"user_mentions": [], "hashtags": [{"text": "OpenData", "indices": [20, 29]}, {"text": "Cloud", "indices": [41, 47]}, {"text": "StartUp", "indices": [48, 56]}, {"text": "SMM", "indices": [57, 61]}, {"text": "AI", "indices": [62, 65]}, {"text": "IoT", "indices": [66, 70]}, {"text": "Fintech", "indices": [71, 79]}, {"text": "BigData", "indices": [80, 88]}, {"text": "deeplearning", "indices": [89, 102]}, {"text": "Mpgvip", "indices": [103, 110]}, {"text": "defstar5", "indices": [111, 120]}, {"text": "DataScience", "indices": [121, 133]}, {"text": "CIO", "indices": [134, 138]}], "media": [{"media_url_https": "", "sizes": {"thumb": {"w": 150, "h": 150, "resize": "crop"}, "large": {"w": 1200, "h": 927, "resize": "fit"}, "medium": {"w": 1200, "h": 927, "resize": "fit"}, "small": {"w": 680, "h": 525, "resize": "fit"}}, "type": "photo", "expanded_url": "", "id": 880044388679901184, "media_url": "", "id_str": "880044388679901184", "display_url": "pic.twitter.com/aw9HeukUYv", "indices": [139, 162], "url": ""}], "symbols": [], "urls": []}, "display_text_range": [0, 138]}, "in_reply_to_status_id": null, "source": "<a href=\"\" rel=\"nofollow\">Buffer</a>", "id_str": "880044392110796800", "entities": {"user_mentions": [], "hashtags": [{"text": "OpenData", "indices": [20, 29]}, {"text": "Cloud", "indices": [41, 47]}, {"text": "StartUp", "indices": [48, 56]}, {"text": "SMM", "indices": [57, 61]}, {"text": "AI", "indices": [62, 65]}, {"text": "IoT", "indices": [66, 70]}, {"text": "Fintech", "indices": [71, 79]}, {"text": "BigData", "indices": [80, 88]}, {"text": "deeplearning", "indices": [89, 102]}, {"text": "Mpgvip", "indices": [103, 110]}], "symbols": [], "urls": [{"display_url": "twitter.com/i/web/status/8\u2026", "indices": [112, 135], "expanded_url": "", "url": "8H"}]}, "lang": "en", "id": 880044392110796800, "is_quote_status": false, "geo": null, "user": {"screen_name": "Antoniotalks", "profile_background_image_url": "", "profile_image_url": "jpg", "follow_request_sent": null, "profile_background_tile": false, "id": 2445890839, "is_translator": false, "description": "A father & CEO of Recruitd (@imrecruitd).  Helping companies magnify their #employer and #recruitment #brand and #jobseekers with the #skillstosucceed.", "listed_count": 198, "favourites_count": 398, "created_at": "Tue Apr 15 19:13:52 +0000 2014", "notifications": null, "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme1/bg.png", "contributors_enabled": false, "profile_background_color": "C0DEED", "following": null, "friends_count": 6792, "protected": false, "default_profile": true, "profile_use_background_image": true, "name": "Antonio Giugno", "location": "London, England", "geo_enabled": true, "id_str": "2445890839", "utc_offset": -25200, "profile_banner_url": "0", "profile_text_color": "333333", "lang": "en-gb", "statuses_count": 4058, "profile_sidebar_fill_color": "DDEEF6", "default_profile_image": false, "profile_image_url_https": "4433/dVeGYfTX_normal.jpg", "profile_link_color": "1DA1F2", "url": "rnontubein", "verified": false, "profile_sidebar_border_color": "C0DEED", "followers_count": 6323, "time_zone": "Pacific Time (US & Canada)"}, "contributors": null, "possibly_sensitive": false, "place": null}, "in_reply_to_screen_name": null, "timestamp_ms": "1498762038396", "in_reply_to_status_id": null, "source": "<a href=\".com\" rel=\"nofollow\">Mobile Web (M2)</a>", "id_str": "880497923150286848", "entities": {"user_mentions": [{"screen_name": "Antoniotalks", "id": 2445890839, "id_str": "2445890839", "name": "Antonio Giugno", "indices": [3, 16]}], "hashtags": [{"text": "OpenData", "indices": [38, 47]}, {"text": "Cloud", "indices": [59, 65]}, {"text": "StartUp", "indices": [66, 74]}, {"text": "SMM", "indices": [75, 79]}, {"text": "AI", "indices": [80, 83]}, {"text": "IoT", "indices": [84, 88]}, {"text": "Fintech", "indices": [89, 97]}, {"text": "BigData", "indices": [98, 106]}, {"text": "deeplearning", "indices": [107, 120]}, {"text": "Mpgvip", "indices": [121, 128]}], "symbols": [], "urls": [{"indices": [130, 130], "expanded_url": null, "url": ""}]}, "lang": "en", "id": 880497923150286848, "is_quote_status": false, "geo": null, "user": {"screen_name": "henrymbuguak", "profile_background_image_url": "://abs.twimg.com/images/themes/theme3/bg.gif", "profile_image_url": "://pbs.twimg.com/profile_images/822772556818239489/0yTbHCGj_normal.jpg", "follow_request_sent": null, "profile_background_tile": false, "id": 310697279, "is_translator": false, "description": "I enjoy coding. Visit my github project: :// ://github.com/henrymbuguak", "listed_count": 62, "favourites_count": 978, "created_at": "Sat Jun 04 05:55:09 +0000 2011", "notifications": null, "profile_background_image_url_https": "://abs.twimg.com/images/themes/theme3/bg.gif", "contributors_enabled": false, "profile_background_color": "EDECE9", "following": null, "friends_count": 2540, "protected": false, "default_profile": false, "profile_use_background_image": true, "name": "kiarie henry mbugua", "location": "Njoro, Kenya.", "geo_enabled": false, "id_str": "310697279", "utc_offset": 10800, "profile_banner_url": "://pbs.twimg.com/profile_banners/310697279/1484999353", "profile_text_color": "634047", "lang": "en", "statuses_count": 3775, "profile_sidebar_fill_color": "E3E2DE", "default_profile_image": false, "profile_image_url_https": "//pbs.twimg.com/profile_images/822772556818239489/0yTbHCGj_normal.jpg", "profile_link_color": "088253", "url": null, "verified": false, "profile_sidebar_border_color": "D3D2CF", "followers_count": 2141, "time_zone": "Nairobi"}, "contributors": null, "place": null}

【问题讨论】：

我不相信您已经完全检查了结果数据框。 retweeted_status、entities 和 user 包含相当多的信息，因为它们本身就是数据帧

标签： json r twitter

【解决方案1】：

如果我使用

读取您的数据

indata <- jsonlite::read_json("myjsonfile.json")

然后我得到 JSON 文件中包含的所有信息。它是一个嵌套列表，因此您可能需要从列表中的一个元素中提取您想要的信息

> names(indata)
 [1] "favorited"                 "in_reply_to_status_id_str"
 [3] "in_reply_to_user_id"       "truncated"                
 [5] "in_reply_to_user_id_str"   "coordinates"              
 [7] "retweeted"                 "text"                     
 [9] "retweet_count"             "filter_level"             
[11] "created_at"                "favorite_count"           
[13] "retweeted_status"          "in_reply_to_screen_name"  
[15] "timestamp_ms"              "in_reply_to_status_id"    
[17] "source"                    "id_str"                   
[19] "entities"                  "lang"                     
[21] "id"                        "is_quote_status"          
[23] "geo"                       "user"                     
[25] "contributors"              "place"

用户信息例如（仅显示一部分）

> indata$user
$screen_name
[1] "henrymbuguak"

$profile_background_image_url
[1] "://abs.twimg.com/images/themes/theme3/bg.gif"

$profile_image_url
[1] "://pbs.twimg.com/profile_images/822772556818239489/0yTbHCGj_normal.jpg"

$follow_request_sent
NULL

$profile_background_tile
[1] FALSE

$id
[1] 310697279

这样您就可以使用indata$user$screen_name 获取用户

【讨论】：

您好，感谢您的回复。我设法获得了所有这些数据。但在我看来，“screen_name”代表的是转发而不是发布数据的用户。这是否意味着发布推文的原始用户的信息不在数据中？即：@Antoniotalks 是我认为的转发者。
该信息应包含在in_reply_to_screen_name 中，但在您提供的数据中是null。有关 API 的信息，请参阅this page。您请求的数据不在您的示例中。你能提取一些可用的数据吗？
好的，谢谢。我读到“Nullable 如果表示的推文是回复，则此字段将包含原始推文作者的网名。”所以我不确定是否清楚地理解 screen_name 和 in_reply_to_screen_name 之间的区别。无论如何谢谢！