【问题标题】:List of list of lists (from API call) into data frame in R列表列表(来自API调用)到R中的数据框
【发布时间】:2018-07-18 11:42:30
【问题描述】:

我知道以前有人问过这种性质的问题(将列表转换为数据框),但是我遇到了一个特殊的问题,即我想将其转换为数据框的嵌套列表列表。我得到的数据来自 R 中的 API 调用,因此我要处理这个列表结构的嵌套列表。这是我正在使用的 API 返回对象的一个​​小示例(5 场运动数据):

dput(soccer_data)
    list(structure(list(id = 1603158L, league_id = 779L, season_id = 914L, 
    stage_id = 1810L, round_id = 29156L, group_id = NULL, aggregate_id = NULL, 
    venue_id = 139L, referee_id = 656L, localteam_id = 607L, 
    visitorteam_id = 3639L, weather_report = NULL, commentaries = TRUE, 
    attendance = NULL, pitch = NULL, winning_odds_calculated = FALSE, 
    formations = structure(list(localteam_formation = "4-2-3-1", 
        visitorteam_formation = "4-1-4-1"), .Names = c("localteam_formation", 
    "visitorteam_formation")), scores = structure(list(localteam_score = 5L, 
        visitorteam_score = 1L, localteam_pen_score = 0L, visitorteam_pen_score = 0L, 
        ht_score = "1-0", ft_score = "5-1", et_score = NULL), .Names = c("localteam_score", 
    "visitorteam_score", "localteam_pen_score", "visitorteam_pen_score", 
    "ht_score", "ft_score", "et_score")), time = structure(list(
        status = "FT", starting_at = structure(list(date_time = "2017-03-04 05:30:00", 
            date = "2017-03-04", time = "05:30:00", timestamp = 1488605400L, 
            timezone = "UTC"), .Names = c("date_time", "date", 
        "time", "timestamp", "timezone")), minute = 90L, extra_minute = NULL, 
        injury_time = NULL), .Names = c("status", "starting_at", 
    "minute", "extra_minute", "injury_time")), coaches = structure(list(
        localteam_coach_id = 429924L, visitorteam_coach_id = 429940L), .Names = c("localteam_coach_id", 
    "visitorteam_coach_id")), standings = structure(list(localteam_position = NULL, 
        visitorteam_position = NULL), .Names = c("localteam_position", 
    "visitorteam_position")), deleted = FALSE), .Names = c("id", 
"league_id", "season_id", "stage_id", "round_id", "group_id", 
"aggregate_id", "venue_id", "referee_id", "localteam_id", "visitorteam_id", 
"weather_report", "commentaries", "attendance", "pitch", "winning_odds_calculated", 
"formations", "scores", "time", "coaches", "standings", "deleted"
)), structure(list(id = 1603159L, league_id = 779L, season_id = 914L, 
    stage_id = 1810L, round_id = 29156L, group_id = NULL, aggregate_id = NULL, 
    venue_id = 113L, referee_id = 3614L, localteam_id = 577L, 
    visitorteam_id = 75L, weather_report = NULL, commentaries = FALSE, 
    attendance = NULL, pitch = NULL, winning_odds_calculated = FALSE, 
    formations = structure(list(localteam_formation = "4-2-3-1", 
        visitorteam_formation = "4-2-3-1"), .Names = c("localteam_formation", 
    "visitorteam_formation")), scores = structure(list(localteam_score = 1L, 
        visitorteam_score = 1L, localteam_pen_score = 0L, visitorteam_pen_score = 0L, 
        ht_score = "1-0", ft_score = "1-1", et_score = NULL), .Names = c("localteam_score", 
    "visitorteam_score", "localteam_pen_score", "visitorteam_pen_score", 
    "ht_score", "ft_score", "et_score")), time = structure(list(
        status = "FT", starting_at = structure(list(date_time = "2017-03-04 22:00:00", 
            date = "2017-03-04", time = "22:00:00", timestamp = 1488664800L, 
            timezone = "UTC"), .Names = c("date_time", "date", 
        "time", "timestamp", "timezone")), minute = 90L, extra_minute = NULL, 
        injury_time = NULL), .Names = c("status", "starting_at", 
    "minute", "extra_minute", "injury_time")), coaches = structure(list(
        localteam_coach_id = 455860L, visitorteam_coach_id = 176760L), .Names = c("localteam_coach_id", 
    "visitorteam_coach_id")), standings = structure(list(localteam_position = NULL, 
        visitorteam_position = NULL), .Names = c("localteam_position", 
    "visitorteam_position")), deleted = FALSE), .Names = c("id", 
"league_id", "season_id", "stage_id", "round_id", "group_id", 
"aggregate_id", "venue_id", "referee_id", "localteam_id", "visitorteam_id", 
"weather_report", "commentaries", "attendance", "pitch", "winning_odds_calculated", 
"formations", "scores", "time", "coaches", "standings", "deleted"
)), structure(list(id = 1603160L, league_id = 779L, season_id = 914L, 
    stage_id = 1810L, round_id = 29156L, group_id = NULL, aggregate_id = NULL, 
    venue_id = 28L, referee_id = 555L, localteam_id = 413L, visitorteam_id = 583L, 
    weather_report = NULL, commentaries = FALSE, attendance = 23554L, 
    pitch = NULL, winning_odds_calculated = FALSE, formations = structure(list(
        localteam_formation = "4-4-1-1", visitorteam_formation = "4-4-2"), .Names = c("localteam_formation", 
    "visitorteam_formation")), scores = structure(list(localteam_score = 1L, 
        visitorteam_score = 2L, localteam_pen_score = 0L, visitorteam_pen_score = 0L, 
        ht_score = "0-0", ft_score = "1-2", et_score = NULL), .Names = c("localteam_score", 
    "visitorteam_score", "localteam_pen_score", "visitorteam_pen_score", 
    "ht_score", "ft_score", "et_score")), time = structure(list(
        status = "FT", starting_at = structure(list(date_time = "2017-03-05 00:00:00", 
            date = "2017-03-05", time = "00:00:00", timestamp = 1488672000L, 
            timezone = "UTC"), .Names = c("date_time", "date", 
        "time", "timestamp", "timezone")), minute = 90L, extra_minute = NULL, 
        injury_time = NULL), .Names = c("status", "starting_at", 
    "minute", "extra_minute", "injury_time")), coaches = structure(list(
        localteam_coach_id = 429914L, visitorteam_coach_id = 429917L), .Names = c("localteam_coach_id", 
    "visitorteam_coach_id")), standings = structure(list(localteam_position = NULL, 
        visitorteam_position = NULL), .Names = c("localteam_position", 
    "visitorteam_position")), deleted = FALSE), .Names = c("id", 
"league_id", "season_id", "stage_id", "round_id", "group_id", 
"aggregate_id", "venue_id", "referee_id", "localteam_id", "visitorteam_id", 
"weather_report", "commentaries", "attendance", "pitch", "winning_odds_calculated", 
"formations", "scores", "time", "coaches", "standings", "deleted"
)), structure(list(id = 1603161L, league_id = 779L, season_id = 914L, 
    stage_id = 1810L, round_id = 29156L, group_id = NULL, aggregate_id = NULL, 
    venue_id = 411L, referee_id = 274L, localteam_id = 1062L, 
    visitorteam_id = 111L, weather_report = NULL, commentaries = FALSE, 
    attendance = NULL, pitch = NULL, winning_odds_calculated = FALSE, 
    formations = structure(list(localteam_formation = "4-2-3-1", 
        visitorteam_formation = "3-5-2"), .Names = c("localteam_formation", 
    "visitorteam_formation")), scores = structure(list(localteam_score = 0L, 
        visitorteam_score = 0L, localteam_pen_score = 0L, visitorteam_pen_score = 0L, 
        ht_score = "0-0", ft_score = "0-0", et_score = NULL), .Names = c("localteam_score", 
    "visitorteam_score", "localteam_pen_score", "visitorteam_pen_score", 
    "ht_score", "ft_score", "et_score")), time = structure(list(
        status = "FT", starting_at = structure(list(date_time = "2017-03-05 00:30:00", 
            date = "2017-03-05", time = "00:30:00", timestamp = 1488673800L, 
            timezone = "UTC"), .Names = c("date_time", "date", 
        "time", "timestamp", "timezone")), minute = 90L, extra_minute = NULL, 
        injury_time = NULL), .Names = c("status", "starting_at", 
    "minute", "extra_minute", "injury_time")), coaches = structure(list(
        localteam_coach_id = 456638L, visitorteam_coach_id = 516577L), .Names = c("localteam_coach_id", 
    "visitorteam_coach_id")), standings = structure(list(localteam_position = NULL, 
        visitorteam_position = NULL), .Names = c("localteam_position", 
    "visitorteam_position")), deleted = FALSE), .Names = c("id", 
"league_id", "season_id", "stage_id", "round_id", "group_id", 
"aggregate_id", "venue_id", "referee_id", "localteam_id", "visitorteam_id", 
"weather_report", "commentaries", "attendance", "pitch", "winning_odds_calculated", 
"formations", "scores", "time", "coaches", "standings", "deleted"
)), structure(list(id = 1603162L, league_id = 779L, season_id = 914L, 
    stage_id = 1810L, round_id = 29157L, group_id = NULL, aggregate_id = NULL, 
    venue_id = 11573L, referee_id = 370L, localteam_id = 179L, 
    visitorteam_id = 641L, weather_report = NULL, commentaries = FALSE, 
    attendance = NULL, pitch = NULL, winning_odds_calculated = FALSE, 
    formations = structure(list(localteam_formation = "4-2-3-1", 
        visitorteam_formation = "4-3-1-2"), .Names = c("localteam_formation", 
    "visitorteam_formation")), scores = structure(list(localteam_score = 1L, 
        visitorteam_score = 0L, localteam_pen_score = 0L, visitorteam_pen_score = 0L, 
        ht_score = "0-0", ft_score = "1-0", et_score = NULL), .Names = c("localteam_score", 
    "visitorteam_score", "localteam_pen_score", "visitorteam_pen_score", 
    "ht_score", "ft_score", "et_score")), time = structure(list(
        status = "FT", starting_at = structure(list(date_time = "2017-03-05 02:00:00", 
            date = "2017-03-05", time = "02:00:00", timestamp = 1488679200L, 
            timezone = "UTC"), .Names = c("date_time", "date", 
        "time", "timestamp", "timezone")), minute = 90L, extra_minute = NULL, 
        injury_time = NULL), .Names = c("status", "starting_at", 
    "minute", "extra_minute", "injury_time")), coaches = structure(list(
        localteam_coach_id = 524071L, visitorteam_coach_id = 261458L), .Names = c("localteam_coach_id", 
    "visitorteam_coach_id")), standings = structure(list(localteam_position = NULL, 
        visitorteam_position = NULL), .Names = c("localteam_position", 
    "visitorteam_position")), deleted = FALSE), .Names = c("id", 
"league_id", "season_id", "stage_id", "round_id", "group_id", 
"aggregate_id", "venue_id", "referee_id", "localteam_id", "visitorteam_id", 
"weather_report", "commentaries", "attendance", "pitch", "winning_odds_calculated", 
"formations", "scores", "time", "coaches", "standings", "deleted"
)))

soccer_data 有 5 场 MLS 足球数据,这是我目前正在做的将其转换为数据框的方法:

# grab the "scores" info from the nested list $scores (from each game)
season_scores <- data.frame()
for(i in 1:length(soccer_data)) {  
  game_scores <- as.data.frame(t(unlist(soccer_data[[i]]$scores)), stringsAsFactors = FALSE)
  game_scores$date <- as.Date(soccer_data[[i]]$time$starting_at$date)
  season_scores <- rbind.fill(season_scores, game_scores)
}
season_scores <- season_scores %>% readr::type_convert()

# create df of the game scores, add the season scores, and drop the bad cols
season_boxscores <- as.data.frame(do.call(rbind, soccer_data), stringsAsFactors = FALSE) %>%
  dplyr::select(-one_of(c('scores', 'group_id', 'aggregate_id', 'time', 'standings'))) %>%
  cbind(season_scores) %>%
  readr::type_convert()

不幸的是,这种方法的问题是最后一个 type_convert() 函数调用没有做我想要的,并且生成的 season_boxscores 数据帧的列的类主要是 class== 列表。

# check yourself
sapply(season_boxscores, class) 

那么我的问题是:

  1. 我怎样才能做到这一点,以使 season_boxscores 中所有列的类不都是 class== 列表?还有,
  2. 我是否使用 do.call、rbind 和 as.data.frame 以最佳方式执行此操作(从列表列表转换)?

提前致谢!

编辑:如果所有嵌套列表(在这种情况下,soccer_data 有一些:编队、分数、时间、教练、排名)本身都没有嵌套,就像我取消嵌套它们一样,那就太好了分数的 for 循环。

编辑 2:很抱歉只为 5 场比赛分享这么大的列表对象。在列表列表或像这样的大嵌套对象中,我实际上不知道如何从每个嵌套列表中删除相同的项目,我会为这篇文章做的。 (即从每个soccer_data[[i]] 中删除league_id、round_id 等)。如果有人知道该怎么做,那就太好了!

编辑 3:因为 football_data 不仅仅是一个列表列表,而是一个列表列表(每个列表列表中都有其他非列表对象),这里没有任何解决方案 - Force list of lists into dataframe - 继续足球数据。

【问题讨论】:

    标签: r data-manipulation


    【解决方案1】:

    我还在努力学习这些东西。我测试了一百万件事,这是我能想到的最简单的:

    library(tidyverse)
    soccer_data %>% 
      map(unlist) %>% 
      map(t) %>% 
      map(as_tibble) %>% 
      bind_rows()
    

    想法:获取您的列表soccer_data,将unlist 映射到每个元素(因此它不会在第二级列出,这意味着它将所有游戏保留在最顶层列表的单独元素中)。然后使用 map transpose t 将列表转换为看起来像一行的东西,然后将其转换为tibble,然后将它们全部转换为bind_rows

    结果:

    # A tibble: 5 x 30
      id      league_id season_id stage_id round_id venue_id referee_id localteam_id
      <chr>   <chr>     <chr>     <chr>    <chr>    <chr>    <chr>      <chr>       
    1 1603158 779       914       1810     29156    139      656        607         
    2 1603159 779       914       1810     29156    113      3614       577         
    3 1603160 779       914       1810     29156    28       555        413         
    4 1603161 779       914       1810     29156    411      274        1062        
    5 1603162 779       914       1810     29157    11573    370        179         
    # ... with 22 more variables: visitorteam_id <chr>, commentaries <chr>,
    #   winning_odds_calculated <chr>, formations.localteam_formation <chr>,
    #   formations.visitorteam_formation <chr>, scores.localteam_score <chr>,
    #   scores.visitorteam_score <chr>, scores.localteam_pen_score <chr>,
    #   scores.visitorteam_pen_score <chr>, scores.ht_score <chr>, scores.ft_score <chr>,
    #   time.status <chr>, time.starting_at.date_time <chr>, time.starting_at.date <chr>,
    #   time.starting_at.time <chr>, time.starting_at.timestamp <chr>,
    #   time.starting_at.timezone <chr>, time.minute <chr>,
    #   coaches.localteam_coach_id <chr>, coaches.visitorteam_coach_id <chr>,
    #   deleted <chr>, attendance <chr>
    

    看起来对吗?祝你好运!

    【讨论】:

    • 现在试一试,谢谢! - map 来自哪个 tidyverse 库?编辑:它来自 purrr
    • 当我使用不同的 API 时,我会像每月一次一样重用此代码。再次感谢
    【解决方案2】:

    下面的基本 R 方法怎么样(使用 unlist):

    1. 将列表列表折叠到listchar 向量:

      # Collapse list of list of list to list of character vectors
      lst <- lapply(soccer_data, unlist);
      
    2. 确保所有列表条目具有相同的键。例如,只有样本数据的 list 条目 3 具有键 attendance

      # Make sure that all list entries have values for the same keys
      keys <- unique(unlist(lapply(lst, names)));
      
    3. NA填充缺少的键条目

      # Fill missing entries with NULL
      lst <- lapply(lst, function(x) x[match(keys, names(x))]);
      
    4. rbind 转为data.frame

      # Combind in dataframe
      df <- do.call(rbind.data.frame, lst);
      colnames(df) <- keys;
      
      
      df;
      #id league_id season_id stage_id round_id venue_id referee_id
      #1 1603158       779       914     1810    29156      139        656
      #2 1603159       779       914     1810    29156      113       3614
      #3 1603160       779       914     1810    29156       28        555
      #4 1603161       779       914     1810    29156      411        274
      #5 1603162       779       914     1810    29157    11573        370
      #localteam_id visitorteam_id commentaries winning_odds_calculated
      #1          607           3639         TRUE                   FALSE
      #2          577             75        FALSE                   FALSE
      #3          413            583        FALSE                   FALSE
      #4         1062            111        FALSE                   FALSE
      #5          179            641        FALSE                   FALSE
      #formations.localteam_formation formations.visitorteam_formation
      #1                        4-2-3-1                          4-1-4-1
      #2                        4-2-3-1                          4-2-3-1
      #3                        4-4-1-1                            4-4-2
      #4                        4-2-3-1                            3-5-2
      #5                        4-2-3-1                          4-3-1-2
      #scores.localteam_score scores.visitorteam_score scores.localteam_pen_score
      #1                      5                        1                          0
      #2                      1                        1                          0
      #3                      1                        2                          0
      #4                      0                        0                          0
      #5                      1                        0                          0
      #scores.visitorteam_pen_score scores.ht_score scores.ft_score time.status
      #1                            0             1-0             5-1          FT
      #2                            0             1-0             1-1          FT
      #3                            0             0-0             1-2          FT
      #4                            0             0-0             0-0          FT
      #5                            0             0-0             1-0          FT
      #time.starting_at.date_time time.starting_at.date time.starting_at.time
      #1        2017-03-04 05:30:00            2017-03-04              05:30:00
      #2        2017-03-04 22:00:00            2017-03-04              22:00:00
      #3        2017-03-05 00:00:00            2017-03-05              00:00:00
      #4        2017-03-05 00:30:00            2017-03-05              00:30:00
      #5        2017-03-05 02:00:00            2017-03-05              02:00:00
      #time.starting_at.timestamp time.starting_at.timezone time.minute
      #1                 1488605400                       UTC          90
      #2                 1488664800                       UTC          90
      #3                 1488672000                       UTC          90
      #4                 1488673800                       UTC          90
      #5                 1488679200                       UTC          90
      #coaches.localteam_coach_id coaches.visitorteam_coach_id deleted attendance
      #1                     429924                       429940   FALSE       <NA>
      #2                     455860                       176760   FALSE       <NA>
      #3                     429914                       429917   FALSE      23554
      #4                     456638                       516577   FALSE       <NA>
      #5                     524071                       261458   FALSE       <NA>
      

    如果你删除所有多余的文字/解释,这很短。


    更新

    很遗憾,unlist 导致列类型丢失。您可以通过以下方式将factors 转换回numeric

    # Smart-convert to numeric
    is.num <- apply(df, 2, function(x) {
        x <- x[!is.na(x)];
        all(suppressWarnings(!is.na(as.numeric(as.character(x)))));
    })
    df[, is.num] <- apply(df[, is.num], 2, function(x) as.numeric(as.character(x)));
    

    虽然有点乱,但是很管用。

    【讨论】:

    • 我不喜欢列都是 class== "factor",因为 readr::type_convert() 无法转换列类型
    • @Canovic 是的,你是对的;我没有意识到所有的列都变成了factors。我已包含将“数字”factors 转换回 numeric 的更新。
    猜你喜欢
    • 2013-02-11
    • 2012-01-18
    • 1970-01-01
    • 2012-09-12
    • 1970-01-01
    • 2021-11-15
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    相关资源
    最近更新 更多