【问题标题】:How to make a compound legend with color tag that correspond to the relative abundance plot?如何使用与相对丰度图相对应的颜色标签制作复合图例?
【发布时间】:2020-08-20 15:20:01
【问题描述】:

我想从下面的paper 复制一个图。

它在分离 X1 列时卡住了。我想使用正则表达式,但不知道如何使用。 我计划用下划线分隔符分隔每个单词(我有一个列表),然后将 [-tes & -ria] 和 [-ceae] 后缀的单词分别提取到 Phylum 和 Family 中。在那之后,家庭之后的词应该被收集到属中。为了准确起见,可能会将“未分类”和少于5个字符的单词分组到前面的单词中。

另外,是否可以为每个家庭添加对应于相对丰度图的小颜色标签?

library(tidyverse)
james <- read_csv("tableS2a.csv")
james <- james %>% mutate(
    Cecum = rowSums(select(james, contains("Caecum"))),
    Transverse = rowSums(select(james, contains("Transv"))),
    Sigmoid = rowSums(select(james, contains("Sigmoi")))
  )
james2 <- james %>% 
  select(X1, Cecum, Transverse, Sigmoid) 

james.tab <- james2 %>%
  mutate(meanAbundance = 
           rowMeans(
             column_to_rownames(james2, var = "X1")
             )
         ) %>%
  arrange(desc(meanAbundance)) %>%
  top_n(30, meanAbundance) # extract top30

write.csv2(james.tab, "jamestab.csv")

james.tab2 <- 
  as.data.frame(
    apply(
      select(
        james.tab, 
        Cecum, 
        Transverse, 
        Sigmoid), 2, 
      function(x) x / sum(x) * 100)
    )

james.tab3 <-
  bind_cols(
    as.data.frame(
      select(james.tab, X1)), 
    as.data.frame(james.tab2)
    )

james.X1 <- select(james.tab3, X1)

# Separate X1 to Phylum(-tes/-ria), Family (-ceae), and genus
james.list <- strsplit(pull(james.X1, X1), "_")
james.class <-
  if_else(grepl("(ceae)", james.X1) == T,
          mutate(james.X1, Family =
                   grep(
                     "[[:alpha:]]ceae(_)", 
                     strsplit(pull(james.X1, X1), "_"), 
                     value = T
                   )))

我是 R 新手,上面的代码大部分是从我以前的工作中粘贴的。如果代码效率低下,请原谅我。数据集:Original table -> Top30 csv (pastebin)

追加

这是最近的结果 我没有成功实现ggtext包,可能是主题地址错误?

library(tidyverse)
library(patchwork)
library(ggtext)
library(glue)

james <- read_csv("tableS2a.csv")
james2 <- james %>% 
  mutate(
  Cecum = rowSums(select(james, contains("Caecum"))),
  Transverse = rowSums(select(james, contains("Transv"))),
  Sigmoid = rowSums(select(james, contains("Sigmoi")))
  ) %>% 
  select(X1, Cecum, Transverse, Sigmoid) %>% 
  filter(grepl("(ceae)", james$X1)) # Filter rows with -ceae suffix only

# extract family value with selecting -ceae/les suffix word
family.naming0 <-
  regmatches(james2$X1,
             regexpr("(?<=_)(.*?(ceae|les)(?=_))", james2$X1, perl = T))
#in between "_" to fail-safe double -ceae. E.g. Bacteria_Bacteriaceae_Aceae

family.naming1 <-
  regmatches(james2$X1, regexpr("(?<=ceae_|les_)\\d", james2$X1, perl = T))

family.naming2 <- 
  regmatches(james2$X1, regexpr("(?<=ceae_|les_)unclassified", james2$X1, perl = T))

family.naming3 <-
  ifelse(
    grepl("(?<=[(ceae_)|(les_)])\\d", james2$X1, perl = T),
    paste0(family.naming0, " ", family.naming1),
    ifelse(
      grepl("(?<=[(ceae_)|(les_)])unclassified", james2$X1, perl = T),
      paste0(family.naming0, " ", family.naming2),
      paste0(family.naming0)
    ))  

james3 <- james2 %>% 
  gather("Cecum", "Transverse", "Sigmoid", key = "location", value = "abundance") %>% 
  mutate(relativeAbundance=abundance/sum(abundance) * 100) %>%
  mutate(phylum=gsub("(_.*)","", X1)) %>% # extract phylum value with selecting first word
  mutate(family=
           ifelse(
             grepl("(?<=[(ceae_)|(les_)])\\d", X1, perl = T),
             paste0(family.naming0, " ", family.naming1),
             ifelse(
               grepl("(?<=[(ceae_)|(les_)])unclassified", X1, perl = T),
               paste0(family.naming0, " ", family.naming2),
               paste0(family.naming0)
             ))) %>% 
  mutate(genus=gsub("_", " ", sub("(.*ceae)+?_((unclassified|\\d)*(_)*)", "", X1)))

# change it into percentage
james4 <-
  bind_cols(select(james2, X1), as.data.frame(
    apply(
      select(
        james2, 
        Cecum, 
        Transverse, 
        Sigmoid), 2, 
      function(x) x / sum(x) * 100)))

jamesReg <- james4 %>% 
  mutate(james4, 
         meanAbundance=rowMeans(select(james4, Cecum, Transverse, Sigmoid))) %>% 
  arrange(desc(meanAbundance)) %>% 
  top_n(30, meanAbundance) %>% 
  pull(X1)

# collect top 30 from james4X reference
james5 <- james3 %>% 
  filter(X1 %in% jamesReg)

# change order
james5$location_f <- 
  factor(james5$location, labels = c("Cecum", "Transverse", "Sigmoid"))

james6 <- 
  select(james5, location_f, relativeAbundance, genus)

# First plot
james.plot <-
  ggplot(james6,
         aes(x = location_f, y = relativeAbundance, fill = genus)) +
  geom_bar(position = "fill", stat = "identity", show.legend = F) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) + # y axis percentage
  #theme_minimal() +
  theme(axis.title.x = element_blank(),
        panel.background = element_blank()) +
  ylab("Relative abundances (%)") +
  scale_fill_hue(l=60, c=80)


james.table <- data.frame("relativeAbundance"=james5$relativeAbundance[1:30]+
                            james5$relativeAbundance[31:60]+
                            james5$relativeAbundance[61:90],
                          "phylum"=james5$phylum[1:30],
                          "family"=james5$family[1:30],
                          "genus"=james5$genus[1:30])

# get colour pattern 
ggplotColours <- function(n = 6, h = c(0, 360) + 15) {
  if ((diff(h) %% 360) < 1)
    h[2] <- h[2] - 360 / n
  hcl(h = (seq(h[1], h[2], length = n)), c = 100, l = 65)
}

family <- pull(select(james.table, family))
genus <- pull(select(james.table, genus))
james.table2 <- james.table %>% 
  mutate(color=ggplotColours(nrow(james.table))) %>% # just in case 
  mutate(asv=glue("{family}: <i>{genus}</i>"))

# color for long vertical tile (phylum tile)
james.phyl.col <- c("#fddb47", "#58b9b2", "#6585c3", "#e25a4b")

# legend making or second plot
james.legend <- 
  ggplot(james.table2, aes(y = asv)) +
  geom_tile(aes(x = 1, fill = asv), width = 0.9, height = 0.9) +
  geom_tile(aes(x = 0.2), 
            fill = james.phyl.col[as.numeric(as.factor(james.table2$phylum))], 
            width = 0.4) +
  scale_y_discrete(position = "right", expand = c(0,0),
                   name = "") +
  scale_x_continuous(expand = c(0,0), breaks = NULL, name = "") +
  scale_fill_discrete(guide = "none") +
  facet_grid(phylum ~ ., scales = "free_y", space = "free_y",
             switch = "y") +
  theme(axis.ticks = element_blank(),
        strip.background = element_blank(),
        aspect.ratio = 1,
        axis.text.y = element_markdown())

# patchwork
james.plot + james.legend

最终图片final

【问题讨论】:

  • 我不认为这在 ggplot2 中是一种原生的可能性,但你可能会在网格系统中乱搞来达到你想要的效果。但是,由于您是 R 新手,因此要完全正确可能有点棘手。
  • 嗨@teunbrand!你能指出我应该寻找哪个关键字吗?是ggplot的网格系统吗?是否可以从 gg 对象中提取变量/行颜色值?也许最好是分开制作,然后拼凑或类似的组合?
  • 是的,它是grid 包和gtable 包。一个起点是打电话给ggplotGrob(my_ggplot_object),然后搞砸。将它们单独制作并拼凑在一起也是一个好主意!
  • 嗨,我目前正在尝试制作第二部分。我认为可以使用ggplotColours() 函数来制作它们,尽管它不会像我将来想象的那样多才多艺。你知道如何制作第二个情节吗?我正在考虑制作小型热图。

标签: r ggplot2 bioinformatics


【解决方案1】:

这是一个示例,说明如何开始将图例制作为单独的情节,稍后您可以将其拼凑在主情节旁边。

基本上,您正在为每个项目制作图块,然后按组对它们进行刻面。将图块与刻面精确成 1:1 有点棘手,因此您必须使用width = ...height = ... 来使其看起来正确。

library(ggplot2)

# Example of item-group relations
df <- data.frame(
  group = c("Actinobacteria", "Actinobacteria", "Bacteroidetes", "Bacteroidetes",
            "Firmicutes", "Firmicutes", "Firmicutes"),
  item = c("Bifidobacteriaceae", "Coriobacteriaceae",
           "Bacteroidaceae", "Porphyromonadacea",
           "Acidaminococcacaea", "Clostridiacea", "Clostridiales")
)

group_colours <- c("blue", "green", "red")

ggplot(df, aes(y = item)) +
  geom_tile(aes(x = 1, fill = item), width = 0.9, height = 0.9) +
  geom_tile(aes(x = 0.2), 
            fill = group_colours[as.numeric(as.factor(df$group))], 
            width = 0.4) +
  scale_y_discrete(position = "right", expand = c(0,0),
                   name = "") +
  scale_x_continuous(expand = c(0,0), breaks = NULL, name = "") +
  scale_fill_discrete(guide = "none") +
  facet_grid(group ~ ., scales = "free_y", space = "free_y",
             switch = "y") +
  theme(axis.ticks = element_blank(),
        strip.background = element_blank(),
        aspect.ratio = 1)

reprex package (v0.3.0) 于 2020-08-18 创建

【讨论】:

  • 非常感谢! geom_tile 是关键!顺便说一句,你知道如何修复下标越界错误吗?我尝试使用 ggplotGrob,但是在对对象进行字符串处理时,它每次都会返回错误。编辑:我也是stackoverflow的新手,如果我偏离主题并应该打开新查询,请告诉我。
  • 我不确定发生此错误的上下文,因此很难说,但我可以想象fill = group_colours[as.numeric(as.factor(df$group))] 会抛出该错误。解决方法通常是提供与 group_colours 变量中的组相同数量的颜色。
  • 很抱歉不清楚。我试图str() 我的ggplotGrob() 对象,它变成了“下标越界”错误。谢谢你的提示!无论如何,我已经从传说中得到了颜色!它位于james.gtable&lt;-ggplotGrobs(james.plot),然后是james.gtable$grobs[[15]]$grobs$"99_bcd2bc442c81652d80f816640caed205"$grobs[[4]][["gp"]][["fill"]]。但是我不知道如何查看图案,我唯一知道的是只有 rect, grob 类型具有这些颜色。而且我只能希望它们不是随机排列的,即便如此,我希望有structure图书馆躺在某个地方。
  • 啊,我明白了。是的,这可能是 gtable 的问题。不过,我不确定您要追求什么更大的目标。只要有可能避免使用 gtable 对象,我宁愿在 ggplot 中而不是在 gtable 中这样做。
  • 嗨,我正在成功地重新制作“情节”。可以在这里发布代码和图片作为答案吗?它还没有完成,也许你可以帮助我。例如如何斜体,重新排列瓷砖标签,缺少上部放线菌门,以及小方块瓷砖的不同颜色。
猜你喜欢
  • 1970-01-01
  • 1970-01-01
  • 2015-10-22
  • 2016-07-12
  • 2015-12-27
  • 1970-01-01
  • 1970-01-01
  • 2023-03-24
  • 2020-08-27
相关资源
最近更新 更多