【问题标题】:How to load and process data from multiple JSON files in a directory into a data frame in R?如何将目录中多个 JSON 文件中的数据加载和处理到 R 中的数据框中?
【发布时间】:2026-01-25 20:15:02
【问题描述】:

我们有几个 JSON 文件存储在一个目录中。这些 JSON 文件具有嵌套结构。我们编写了以下代码来从每个 JSON 文件中读取数据:

library("jsonlite")

# `pattern` is a regular expression, not a shell glob: "*.JSON" begins with a
# dangling `*` and leaves the dot unescaped. Anchor on a literal ".json"
# suffix instead, ignoring case so "x.JSON" and "x.json" are both listed.
temp <- list.files(pattern = "\\.json$", ignore.case = TRUE)

# One file is parsed per iteration; `data` is overwritten each time, so after
# the loop it only holds the contents of the last file.
for (files in temp) {
  data <- fromJSON(files, flatten = TRUE)
  ...
}

class(data) 显示 data 是一个 "list"。其结构如下:names(data) 给出各列的名称:"a"、"b"、"c"、"d"、"e"、"f"……等等。

列 "a" 是嵌套的:names(data$a) 给出:"nest1"、"nest2"、"nest3"……等等。

我们希望编写逻辑来读取所有 JSON 文件:如果 data$e == 1 且 data$a$nest1 == TRUE,则 count_nest1 += 1。最终,我们希望统计所有 nest1 == TRUE 的实例数,同样统计所有 nest2 == TRUE 的实例数……依此类推。

实际数据文件1:

{"scans": {"Bkav": {"detected": false, "version": "1.3.0.8876", "result": null, "update": "20170613"}, "TotalDefense": {"detected": false, "version": "37.1.62.1", "result": null, "update": "20170613"}, "MicroWorld-eScan": {"detected": false, "version": "12.0.250.0", "result": null, "update": "20170613"}, "nProtect": {"detected": false, "version": "2017-06-13.02", "result": null, "update": "20170613"}, "CMC": {"detected": false, "version": "1.1.0.977", "result": null, "update": "20170613"}, "CAT-QuickHeal": {"detected": false, "version": "14.00", "result": null, "update": "20170613"}, "McAfee": {"detected": false, "version": "6.0.6.653", "result": null, "update": "20170613"}, "Malwarebytes": {"detected": false, "version": "2.1.1.1115", "result": null, "update": "20170613"}, "Zillya": {"detected": false, "version": "2.0.0.3311", "result": null, "update": "20170613"}, "SUPERAntiSpyware": {"detected": false, "version": "5.6.0.1032", "result": null, "update": "20170613"}, "TheHacker": {"detected": false, "version": "6.8.0.5.1623", "result": null, "update": "20170612"}, "K7GW": {"detected": false, "version": "10.15.23651", "result": null, "update": "20170613"}, "K7AntiVirus": {"detected": false, "version": "10.15.23640", "result": null, "update": "20170613"}, "Arcabit": {"detected": false, "version": "1.0.0.806", "result": null, "update": "20170613"}, "Baidu": {"detected": false, "version": "1.0.0.2", "result": null, "update": "20170613"}, "F-Prot": {"detected": false, "version": "4.7.1.166", "result": null, "update": "20170613"}, "Symantec": {"detected": false, "version": "1.3.1.0", "result": null, "update": "20170613"}, "ESET-NOD32": {"detected": false, "version": "15577", "result": null, "update": "20170613"}, "TrendMicro-HouseCall": {"detected": false, "version": "9.900.0.1004", "result": null, "update": "20170613"}, "Avast": {"detected": false, "version": "8.0.1489.320", "result": null, "update": "20170613"}, "ClamAV": {"detected": false, "version": "0.99.2.0", 
"result": null, "update": "20170613"}, "Kaspersky": {"detected": false, "version": "15.0.1.13", "result": null, "update": "20170613"}, "BitDefender": {"detected": false, "version": "7.2", "result": null, "update": "20170613"}, "NANO-Antivirus": {"detected": false, "version": "1.0.76.17389", "result": null, "update": "20170613"}, "Paloalto": {"detected": false, "version": "1.0", "result": null, "update": "20170613"}, "ViRobot": {"detected": false, "version": "2014.3.20.0", "result": null, "update": "20170613"}, "Tencent": {"detected": false, "version": "1.0.0.1", "result": null, "update": "20170613"}, "Ad-Aware": {"detected": false, "version": "3.0.3.1010", "result": null, "update": "20170613"}, "Emsisoft": {"detected": false, "version": "4.0.1.883", "result": null, "update": "20170613"}, "Comodo": {"detected": false, "version": "27271", "result": null, "update": "20170613"}, "F-Secure": {"detected": false, "version": "11.0.19100.45", "result": null, "update": "20170613"}, "DrWeb": {"detected": false, "version": "7.0.28.2020", "result": null, "update": "20170613"}, "VIPRE": {"detected": false, "version": "58800", "result": null, "update": "20170613"}, "Invincea": {"detected": false, "version": "6.3.0.25415", "result": null, "update": "20170607"}, "McAfee-GW-Edition": {"detected": false, "version": "v2015", "result": null, "update": "20170613"}, "Sophos": {"detected": false, "version": "4.98.0", "result": null, "update": "20170613"}, "Ikarus": {"detected": false, "version": "0.1.5.2", "result": null, "update": "20170613"}, "Cyren": {"detected": false, "version": "5.4.30.7", "result": null, "update": "20170613"}, "Jiangmin": {"detected": false, "version": "16.0.100", "result": null, "update": "20170613"}, "Webroot": {"detected": false, "version": "1.0.0.207", "result": null, "update": "20170613"}, "Avira": {"detected": false, "version": "8.3.3.4", "result": null, "update": "20170613"}, "Kingsoft": {"detected": false, "version": "2013.8.14.323", "result": null, 
"update": "20170613"}, "Endgame": {"detected": false, "version": "0.7.0", "result": null, "update": "20170612"}, "Microsoft": {"detected": false, "version": "1.1.13804.0", "result": null, "update": "20170613"}, "AegisLab": {"detected": false, "version": "4.2", "result": null, "update": "20170613"}, "ZoneAlarm": {"detected": false, "version": "1.0", "result": null, "update": "20170613"}, "GData": {"detected": false, "version": "A:25.12848B:25.9761", "result": null, "update": "20170613"}, "AhnLab-V3": {"detected": false, "version": "3.9.1.17781", "result": null, "update": "20170613"}, "ALYac": {"detected": false, "version": "1.0.1.9", "result": null, "update": "20170613"}, "AVware": {"detected": false, "version": "1.5.0.42", "result": null, "update": "20170613"}, "VBA32": {"detected": false, "version": "3.12.26.4", "result": null, "update": "20170613"}, "Zoner": {"detected": false, "version": "1.0", "result": null, "update": "20170613"}, "Rising": {"detected": false, "version": "28.0.0.1", "result": null, "update": "20170613"}, "Yandex": {"detected": false, "version": "5.5.1.3", "result": null, "update": "20170608"}, "SentinelOne": {"detected": false, "version": "1.0.0.12", "result": null, "update": "20170516"}, "Fortinet": {"detected": false, "version": "5.4.233.0", "result": null, "update": "20170613"}, "AVG": {"detected": false, "version": "8.0.1489.320", "result": null, "update": "20170613"}, "Panda": {"detected": false, "version": "4.6.4.2", "result": null, "update": "20170613"}, "CrowdStrike": {"detected": false, "version": "1.0", "result": null, "update": "20170420"}, "Qihoo-360": {"detected": false, "version": "1.0.0.1120", "result": null, "update": "20170613"}}, "scan_id": "00d9d7d8e563ae71dcecc808f35f7d0845ffd91a1731d3f69e6ea5204fd7a3d7-1497385194", "sha1": "c6a6e3977402e76379f48f09a052f0f3c50f5964", "resource": "00D9D7D8E563AE71DCECC808F35F7D0845FFD91A1731D3F69E6EA5204FD7A3D7", "response_code": 1, "scan_date": "2017-06-13 20:19:54", "permalink": 
"https://www.virustotal.com/file/00d9d7d8e563ae71dcecc808f35f7d0845ffd91a1731d3f69e6ea5204fd7a3d7/analysis/1497385194/", "verbose_msg": "Scan finished, information embedded", "total": 60, "positives": 0, "sha256": "00d9d7d8e563ae71dcecc808f35f7d0845ffd91a1731d3f69e6ea5204fd7a3d7", "md5": "8d95236c637c042ff7df7fd7cc502ddb"}

实际数据文件2:

{"scans": {"MicroWorld-eScan": {"detected": false, "version": "12.0.250.0", "result": null, "update": "20170610"}, "nProtect": {"detected": false, "version": "2017-06-10.02", "result": null, "update": "20170610"}, "CMC": {"detected": false, "version": "1.1.0.977", "result": null, "update": "20170610"}, "CAT-QuickHeal": {"detected": true, "version": "14.00", "result": "TrojDownloader.NSIS.Genome.V", "update": "20170610"}, "ALYac": {"detected": false, "version": "1.0.1.9", "result": null, "update": "20170610"}, "Malwarebytes": {"detected": true, "version": "2.1.1.1115", "result": "PUP.Optional.MyPCBackup", "update": "20170610"}, "Zillya": {"detected": false, "version": "2.0.0.3308", "result": null, "update": "20170610"}, "AegisLab": {"detected": false, "version": "4.2", "result": null, "update": "20170610"}, "TheHacker": {"detected": false, "version": "6.8.0.5.1596", "result": null, "update": "20170607"}, "K7GW": {"detected": false, "version": "10.14.23624", "result": null, "update": "20170610"}, "K7AntiVirus": {"detected": false, "version": "10.14.23624", "result": null, "update": "20170610"}, "Arcabit": {"detected": false, "version": "1.0.0.806", "result": null, "update": "20170610"}, "TrendMicro": {"detected": false, "version": "9.740.0.1012", "result": null, "update": "20170610"}, "Baidu": {"detected": true, "version": "1.0.0.2", "result": "Win32.*.WisdomEyes.16070401.9500.9976", "update": "20170608"}, "F-Prot": {"detected": false, "version": "4.7.1.166", "result": null, "update": "20170610"}, "Symantec": {"detected": true, "version": "1.3.1.0", "result": "PUA.MyPCBackup", "update": "20170610"}, "TotalDefense": {"detected": false, "version": "37.1.62.1", "result": null, "update": "20170610"}, "TrendMicro-HouseCall": {"detected": false, "version": "9.900.0.1004", "result": null, "update": "20170610"}, "Paloalto": {"detected": false, "version": "1.0", "result": null, "update": "20170610"}, "ClamAV": {"detected": false, "version": "0.99.2.0", "result": null, 
"update": "20170610"}, "Kaspersky": {"detected": false, "version": "15.0.1.13", "result": null, "update": "20170610"}, "BitDefender": {"detected": false, "version": "7.2", "result": null, "update": "20170610"}, "NANO-Antivirus": {"detected": true, "version": "1.0.76.17389", "result": "Riskware.Win32.Unwanted.dmgktv", "update": "20170610"}, "SUPERAntiSpyware": {"detected": false, "version": "5.6.0.1032", "result": null, "update": "20170610"}, "Avast": {"detected": false, "version": "8.0.1489.320", "result": null, "update": "20170610"}, "Tencent": {"detected": false, "version": "1.0.0.1", "result": null, "update": "20170610"}, "Ad-Aware": {"detected": false, "version": "3.0.3.1010", "result": null, "update": "20170610"}, "Emsisoft": {"detected": false, "version": "4.0.1.883", "result": null, "update": "20170610"}, "Comodo": {"detected": false, "version": "27254", "result": null, "update": "20170610"}, "F-Secure": {"detected": false, "version": "11.0.19100.45", "result": null, "update": "20170610"}, "DrWeb": {"detected": true, "version": "7.0.28.2020", "result": "Program.Unwanted.567", "update": "20170610"}, "VIPRE": {"detected": false, "version": "58730", "result": null, "update": "20170610"}, "Invincea": {"detected": false, "version": "6.3.0.25415", "result": null, "update": "20170607"}, "McAfee-GW-Edition": {"detected": false, "version": "v2015", "result": null, "update": "20170610"}, "Sophos": {"detected": false, "version": "4.98.0", "result": null, "update": "20170610"}, "Ikarus": {"detected": false, "version": "0.1.5.2", "result": null, "update": "20170610"}, "Cyren": {"detected": false, "version": "5.4.30.7", "result": null, "update": "20170610"}, "Jiangmin": {"detected": false, "version": "16.0.100", "result": null, "update": "20170610"}, "Webroot": {"detected": false, "version": "1.0.0.207", "result": null, "update": "20170610"}, "Avira": {"detected": true, "version": "8.3.3.4", "result": "PUA/MyPCBackup.Gen", "update": "20170610"}, "Kingsoft": {"detected": 
false, "version": "2013.8.14.323", "result": null, "update": "20170610"}, "Endgame": {"detected": false, "version": "0.5.0", "result": null, "update": "20170515"}, "Microsoft": {"detected": false, "version": "1.1.13804.0", "result": null, "update": "20170610"}, "ViRobot": {"detected": false, "version": "2014.3.20.0", "result": null, "update": "20170610"}, "ZoneAlarm": {"detected": false, "version": "1.0", "result": null, "update": "20170610"}, "GData": {"detected": true, "version": "A:25.12800B:25.9740", "result": "NSIS.Adware.MyPCBackup.E", "update": "20170610"}, "AhnLab-V3": {"detected": false, "version": "3.9.0.17697", "result": null, "update": "20170610"}, "McAfee": {"detected": false, "version": "6.0.6.653", "result": null, "update": "20170610"}, "AVware": {"detected": false, "version": "1.5.0.42", "result": null, "update": "20170610"}, "VBA32": {"detected": false, "version": "3.12.26.4", "result": null, "update": "20170609"}, "Zoner": {"detected": false, "version": "1.0", "result": null, "update": "20170610"}, "ESET-NOD32": {"detected": true, "version": "15562", "result": "MSIL/MyPCBackup.D potentially unwanted", "update": "20170610"}, "Rising": {"detected": true, "version": "28.0.0.1", "result": "Malware.Undefined!8.C (cloud:I1YBt1VpobT) ", "update": "20170610"}, "Yandex": {"detected": true, "version": "5.5.1.3", "result": "Riskware.Agent!", "update": "20170608"}, "SentinelOne": {"detected": false, "version": "1.0.0.12", "result": null, "update": "20170516"}, "Fortinet": {"detected": false, "version": "5.4.233.0", "result": null, "update": "20170610"}, "AVG": {"detected": false, "version": "8.0.1489.320", "result": null, "update": "20170610"}, "Panda": {"detected": false, "version": "4.6.4.2", "result": null, "update": "20170610"}, "CrowdStrike": {"detected": false, "version": "1.0", "result": null, "update": "20170420"}, "Qihoo-360": {"detected": false, "version": "1.0.0.1120", "result": null, "update": "20170610"}}, "scan_id": 
"00d468fa26813736cd14ff91e84f5e31fe30eaef6b35af44cafe540870ea7873-1497129945", "sha1": "7b890323abfe8f3bd33be0bc439076b5525d03b0", "resource": "00D468FA26813736CD14FF91E84F5E31FE30EAEF6B35AF44CAFE540870EA7873", "response_code": 1, "scan_date": "2017-06-10 21:25:45", "permalink": "https://www.virustotal.com/file/00d468fa26813736cd14ff91e84f5e31fe30eaef6b35af44cafe540870ea7873/analysis/1497129945/", "verbose_msg": "Scan finished, information embedded", "total": 60, "positives": 11, "sha256": "00d468fa26813736cd14ff91e84f5e31fe30eaef6b35af44cafe540870ea7873", "md5": "45922155c9628e11441aa869c6287bb7"}

实际数据文件3:

{"response_code": 0, "resource": "0E28BEDFBA37CEE5BD639AC86AC08A422C8944C3749CD2C5D7F5A0C2B37115B3", "verbose_msg": "The requested resource is not among the finished, queued or pending scans"}

我们读取每个文件并检查响应代码。如果响应代码为 0,则 count_not_detected += 1;否则读取 JSON 数据,并统计每种防病毒引擎检测到的样本数量,这样最后我们就可以说:防病毒 A 在总共 500 个文件中检测到 323 个,防病毒 B 在总共 500 个文件中检测到 224 个,等等。

如果有东西可以完全展平数据并将其全部存储在数据框中,那就太好了。我们为此查看了tidyjson 包,但没有成功。

【问题讨论】:

    标签: json r


    【解决方案1】:

    虽然这些更改尚未发布到 CRAN,但我认为 tidyjson 的开发版本会很好地满足您的需求。您可以使用devtools::install_github('jeremystan/tidyjson') 安装最新的稳定开发版本。

    也就是说,我很难准确理解您想要的是什么。如果您想了解对象的大小/结构,可以使用 json_structure()、json_lengths() 和 json_types() 进行查看:

    ## Attach the packages without printing their start-up messages.
    suppressMessages(library(jsonlite))
    suppressMessages(library(dplyr))
    suppressMessages(library(tidyjson))

    ## Build one tbl_json object per sample document.
    ## NOTE(review): the strings are file names, so this presumably relies on
    ## the underlying JSON parser resolving them as paths -- confirm with the
    ## tidyjson version in use.
    rawjson1 <- as.tbl_json("raw_json_1.json")
    rawjson2 <- as.tbl_json("raw_json_2.json")
    rawjson3 <- as.tbl_json("raw_json_3.json")
    
    ## Inspect the structure of the first document.
    json_structure(rawjson1)
    #> # A tbl_json: 313 x 9 tibble with a "JSON" attribute
    #>          `attr(., "JSON")` document.id parent.id level index child.id
    #>                      <chr>       <int>     <chr> <int> <int>    <chr>
    #>  1 "{\"scans\":{\"Bkav..."           1      <NA>     0     1        1
    #>  2 "{\"Bkav\":{\"detec..."           1         1     1     1      1.1
    #>  3   "\"00d9d7d8e563ae..."           1         1     1     2      1.2
    #>  4   "\"c6a6e3977402e7..."           1         1     1     3      1.3
    #>  5   "\"00D9D7D8E563AE..."           1         1     1     4      1.4
    #>  6                       1           1         1     1     5      1.5
    #>  7   "\"2017-06-13 20:..."           1         1     1     6      1.6
    #>  8   "\"https://www.vi..."           1         1     1     7      1.7
    #>  9   "\"Scan finished,..."           1         1     1     8      1.8
    #> 10                      60           1         1     1     9      1.9
    #> # ... with 303 more rows, and 4 more variables: seq <list>, name <chr>,
    #> #   type <fctr>, length <int>
    
    
    ## Length of the value stored under each top-level key.
    rawjson1 %>%
      gather_object() %>%
      json_lengths()
    #> # A tbl_json: 12 x 3 tibble with a "JSON" attribute
    #>          `attr(., "JSON")` document.id          name length
    #>                      <chr>       <int>         <chr>  <int>
    #>  1 "{\"Bkav\":{\"detec..."           1         scans     60
    #>  2   "\"00d9d7d8e563ae..."           1       scan_id      1
    #>  3   "\"c6a6e3977402e7..."           1          sha1      1
    #>  4   "\"00D9D7D8E563AE..."           1      resource      1
    #>  5                       1           1 response_code      1
    #>  6   "\"2017-06-13 20:..."           1     scan_date      1
    #>  7   "\"https://www.vi..."           1     permalink      1
    #>  8   "\"Scan finished,..."           1   verbose_msg      1
    #>  9                      60           1         total      1
    #> 10                       0           1     positives      1
    #> 11   "\"00d9d7d8e563ae..."           1        sha256      1
    #> 12   "\"8d95236c637c04..."           1           md5      1
    
    ## JSON type of the value stored under each top-level key.
    rawjson1 %>%
      gather_object() %>%
      json_types()
    #> # A tbl_json: 12 x 3 tibble with a "JSON" attribute
    #>          `attr(., "JSON")` document.id          name   type
    #>                      <chr>       <int>         <chr> <fctr>
    #>  1 "{\"Bkav\":{\"detec..."           1         scans object
    #>  2   "\"00d9d7d8e563ae..."           1       scan_id string
    #>  3   "\"c6a6e3977402e7..."           1          sha1 string
    #>  4   "\"00D9D7D8E563AE..."           1      resource string
    #>  5                       1           1 response_code number
    #>  6   "\"2017-06-13 20:..."           1     scan_date string
    #>  7   "\"https://www.vi..."           1     permalink string
    #>  8   "\"Scan finished,..."           1   verbose_msg string
    #>  9                      60           1         total number
    #> 10                       0           1     positives number
    #> 11   "\"00d9d7d8e563ae..."           1        sha256 string
    #> 12   "\"8d95236c637c04..."           1           md5 string
    

    话虽如此,如果您的最终目标是得到一个数据框以便进一步分析,您可以使用 spread_all() 得到一个非常宽的数据框,或者得到下面这个我认为更有用的数据集(第 1 级的键作为列,每次扫描占一行)。请注意,我同时处理了多个文件(每个文件都有唯一的 document.id)。

    ## Parse both documents together; each one gets its own document.id.
    files <- c("raw_json_1.json", "raw_json_2.json")
    j <- as.tbl_json(files)

    clean <- j %>%
      ## level-1 keys become columns
      spread_all(recursive = FALSE) %>%
      ## step into "scans" and emit one row per scan entry
      enter_object("scans") %>%
      gather_object() %>%
      ## the fields of each scan entry become columns
      spread_all(recursive = FALSE)

    ## Columns available in the flattened result.
    names(clean)
    #>  [1] "document.id"   "scan_id"       "sha1"          "resource"     
    #>  [5] "response_code" "scan_date"     "permalink"     "verbose_msg"  
    #>  [9] "total"         "positives"     "sha256"        "md5"          
    #> [13] "name"          "detected"      "version"       "result"       
    #> [17] "update"
    
    ## Convert to a plain tibble once parsing is done to strip the JSON
    ## component. as_tibble() is used because dplyr has deprecated tbl_df().
    ## Then count, per document, the scan rows and how many flagged the file.
    clean %>%
      as_tibble() %>%
      group_by(document.id) %>%
      summarize(count = n(), detected_count = sum(detected))
    #> # A tibble: 2 x 3
    #>   document.id count detected_count
    #>         <int> <int>          <int>
    #> 1           1    60              0
    #> 2           2    60             11
    
    ## Keep only the scan rows with detected == TRUE and show which engine
    ## (name, version) reported which result. as_tibble() is used because
    ## dplyr has deprecated tbl_df().
    clean %>%
      as_tibble() %>%
      filter(detected) %>%
      select(document.id, name, version, result)
    #> # A tbl_json: 11 x 4 tibble with a "JSON" attribute
    #>         `attr(., "JSON")` document.id           name             version
    #>                     <chr>       <int>          <chr>               <chr>
    #>  1 "{\"detected\":tru..."           2  CAT-QuickHeal               14.00
    #>  2 "{\"detected\":tru..."           2   Malwarebytes          2.1.1.1115
    #>  3 "{\"detected\":tru..."           2          Baidu             1.0.0.2
    #>  4 "{\"detected\":tru..."           2       Symantec             1.3.1.0
    #>  5 "{\"detected\":tru..."           2 NANO-Antivirus        1.0.76.17389
    #>  6 "{\"detected\":tru..."           2          DrWeb         7.0.28.2020
    #>  7 "{\"detected\":tru..."           2          Avira             8.3.3.4
    #>  8 "{\"detected\":tru..."           2          GData A:25.12800B:25.9740
    #>  9 "{\"detected\":tru..."           2     ESET-NOD32               15562
    #> 10 "{\"detected\":tru..."           2         Rising            28.0.0.1
    #> 11 "{\"detected\":tru..."           2         Yandex             5.5.1.3
    #> # ... with 1 more variables: result <chr>
    

    【讨论】: