【发布时间】:2021-03-23 06:27:32
【问题描述】:
我想把连续的行归为同一组，直到满足某个条件时再开始新的一组，但不知道该如何实现。这是我的数据框。
| Gene | directon |intergenic_distance|
| -------- | -------------- |-------------------|
| fixA | 11 |NA |
| fixB | 11 |15 |
| fixC | 11 |51 |
| fixX | 11 |-3 |
| kefF | 11 |108 |
| kefC | 11 |-7 |
| apaH | 12 |NA |
| apaG | 12 |7 |
我想在同一个 directon 内对行进行分组：每当某一行的 intergenic_distance > 50 时，就从该行开始一个新的分组（operon）；directon 发生变化时同样开始新的分组。期望的结果如下：
| Gene | directon |intergenic_distance|operon|
| -------- | -------------- |-------------------|------|
| fixA | 11 |NA |1 |
| fixB | 11 |15 |1 |
| fixC | 11 |51 |2 |
| fixX | 11 |-3 |2 |
| kefF | 11 |108 |3 |
| kefC | 11 |-7 |3 |
| apaH | 12 |NA |4 |
| apaG | 12 |7 |4 |
我正在考虑使用 with、rle、rep、seq_along，但我不知道该怎么做。提前谢谢！
dput(head(e_coli_operon,10))
structure(list(name = c("thrA", "thrB", "thrC", "yaaW", "yaaI",
"mokC", "hokC", "insB", "insA", "yaaY"), start = c(337, 2801,
3734, 10643, 11382, 16751, 16751, 19811, 20233, 21181), end = c(2799,
3733, 5020, 11356, 11786, 16960, 16903, 20314, 20508, 21399),
strand = c(1, 1, 1, -1, -1, -1, -1, -1, -1, 1), length = c(820L,
310L, 428L, 237L, 134L, 69L, 50L, 167L, 91L, 72L), pid = c(16127996L,
16127997L, 16127998L, 16128005L, 16128007L, 16128012L, 49175991L,
16128015L, 16128016L, 16128018L), gene = c("thrA", "thrB",
"thrC", "yaaW", "yaaI", "mokC", "hokC", "insB", "insA", "yaaY"
), synonym = c("b0002", "b0003", "b0004", "b0011", "b0013",
"b0018", "b4412", "b0021", "b0022", "b0024"), code = c("-",
"-", "-", "-", "-", "-", "-", "-", "-", "-"), cog = c("COG0527E",
"COG0083E", "COG0498E", "COG4735S", "-", "-", "-", "COG1662L",
"COG3677L", "-"), product = c("fused aspartokinase I and homoserine dehydrogenase I",
"homoserine kinase", "threonine synthase", "conserved protein, UPF0174 family",
"conserved protein, UPF0412 family", "regulatory protein for HokC, overlaps CDS of hokC",
"toxic membrane protein, small", "IS1 transposase B", "IS1 repressor TnpA",
"predicted protein"), col = c("blue", "blue", "blue", "blue",
"blue", "blue", "blue", "blue", "blue", "blue"), fill = c("blue",
"blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue",
"blue"), lty = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1), lwd = c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1), pch = c(8, 8, 8, 8, 8, 8, 8,
8, 8, 8), cex = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1), gene_type = c("arrows",
"arrows", "arrows", "arrows", "arrows", "arrows", "arrows",
"arrows", "arrows", "arrows"), directon = c("1", "1", "1",
"4", "4", "6", "6", "8", "8", "9"), intergenic_distance = c(82,
2, 1, 149, 26, NA, -209, NA, -81, NA)), row.names = c(NA,
-10L), groups = structure(list(directon = c("1", "4", "6", "8",
"9"), .rows = structure(list(1:3, 4:5, 6:7, 8:9, 10L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, 5L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
【问题讨论】:
标签: r dataframe grouping sequence