plot Orthogroups.GeneCount venn and stat.R

#!/usr/bin/Rscript
###############################################################################################

library("argparse")
parser <- ArgumentParser(description='plot Orthogroups.GeneCount venn and stat ')

parser$add_argument( "-i", "--input", type="character", required=T,
                     help="nwk format [required]",
                     metavar="filepath")
# parser$add_argument("-T", "--title", type="character", default="",
#                     help="the label for main title [default %(default)s]",
#                     metavar="title")
parser$add_argument( "-x", "--xlab", type="character", required=F, default="",
                     help="input xlab [default %(default)s]",
                     metavar="xlab")
parser$add_argument( "-y", "--ylab", type="character", required=F, default="Gene number",
                     help="input ylab [default %(default)s]",
                     metavar="ylab")
parser$add_argument( "-P", "--palette", type="character", required=F, default="lancet",
                     help="fill palette in ggsci : eg npg lancet... for more info:https://nanx.me/ggsci/articles/ggsci.html [default %(default)s]",
                     metavar="palette")
parser$add_argument( "-s", "--species.order", type="character", required=F, default=NULL, nargs="+",
                     help="species name order for bar plot [default %(default)s]",
                     metavar="species.order")
parser$add_argument( "-o", "--outdir", type="character", default=getwd(),
                     help="output file directory [default %(default)s]",
                     metavar="path")
parser$add_argument("-n", "--name", type="character", default="GF",
                    help="out file name prefix [default %(default)s]",
                    metavar="name")
parser$add_argument( "-H", "--height", type="double", default=5,
                     help="the height of pic   inches  [default %(default)s]",
                     metavar="number")
parser$add_argument("-W", "--width", type="double", default=5,
                    help="the width of pic   inches [default %(default)s]",
                    metavar="number")

opt <- parser$parse_args()

# parser$print_help()

# 检查输出目录是否存在,否则创建
if (!file.exists(opt$outdir)) {
  if (!dir.create(opt$outdir, showWarnings = FALSE, recursive = TRUE)) {
    stop(paste("dir.create failed: outdir=", opt$outdir, sep=""))
  }
}

require(ggplot2)
require(reshape2)
require(RColorBrewer)
library(tidyr)
library(dplyr)
library(ggsci)
library(getopt)
script <- get_Rscript_filename()
source(paste(dirname(script), "venn_util.r", sep="/"))

#-----------------------------------------------------------------
# 读取数据
#-----------------------------------------------------------------
df <- read.table(opt$input, sep="\t", row.names=1, header=TRUE, check.names=FALSE, comment.char="")
df <- df[, c(-ncol(df))]

single_copy <- rowSums(df == 1) == ncol(df)
species_number <- rowSums(df > 0)
species_specific <- rowSums(df == 0) == (ncol(df)-1) & rowSums(df) > 1
all_species <- rowSums(df > 0) == ncol(df) & rowSums(df) > ncol(df)
unassigined_genes <- rowSums(df == 0) == (ncol(df)-1) & rowSums(df) == 1

# other <- !(single_copy | species_number | species_specific | unassigined_genes)

stat_df <- df
stat_df$species_number <- species_number
stat_df$type <- ifelse(single_copy, "Single-copy orthologs",
                       ifelse(species_specific, "Unique paralogs",
                              ifelse(unassigined_genes, "Unclusters genes",
                                     ifelse(all_species, "Multi-copy orthologs", "Other orthologs"))))

write.table(data.frame(Orthogroup = rownames(stat_df), stat_df),
            file = paste(opt$outdir, "/", opt$name, ".type_table.tsv", sep=""),
            quote = FALSE, sep = '\t', row.names = FALSE, col.names = TRUE)

##########################################################################################
table(stat_df$type)

# 构建柱状图数据框
stat_bar_df <- stat_df %>% 
  select(-species_number) %>% 
  pivot_longer(!type, names_to = "species", values_to = "count") %>% 
  group_by(species, type) %>%  
  summarize(gene_number = sum(count), .groups = "drop")

# 设置type因子的顺序
stat_bar_df$type <- factor(stat_bar_df$type, ordered = TRUE,
                           levels = rev(c("Single-copy orthologs", "Multi-copy orthologs",
                                          "Unique paralogs", "Other orthologs", "Unclusters genes")))

# 如果用户指定了species.order则采用指定顺序,否则采用输入文件中的列顺序
if (!is.null(opt$species.order)) {
  stat_bar_df$species <- factor(stat_bar_df$species, ordered = TRUE,
                                levels = opt$species.order)
} else {
  stat_bar_df$species <- factor(stat_bar_df$species, levels = colnames(df))
}

p <- ggplot(stat_bar_df, aes(species, as.double(gene_number))) +
  geom_bar(aes(fill = type), stat = "identity") +
  get(paste0("scale_fill_", opt$palette))() +
  labs(y = "Gene number", fill = "", x = "") +
  scale_y_continuous(expand = expansion(mult = c(0, .1))) +
  theme_classic() + 
  theme(axis.text.x = element_text(colour = "black", angle = 45, hjust = 1),
        axis.text.y = element_text(colour = "black"))

pdf(file = paste(opt$outdir, "/", opt$name, ".gene_number_bar.pdf", sep=""),
    height = opt$height, width = opt$width * 1.5)
print(p)
dev.off()

png(filename = paste(opt$outdir, "/", opt$name, ".gene_number_bar.png", sep=""),
    height = opt$height * 300, width = opt$width * 300 * 1.5, res = 300, units = "px")
print(p)
dev.off()

stat_bar_df_res <- stat_bar_df %>% pivot_wider(names_from = type, values_from = gene_number)
write.table(stat_bar_df_res,
            file = paste(opt$outdir, "/", opt$name, ".type_gene_number_stat.tsv", sep=""),
            quote = FALSE, sep = '\t', row.names = FALSE, col.names = TRUE)

###########################################################################################
# 数据预处理用于venn图
df[df == 0] <- NA
otu.id <- rownames(df)
map.id <- function(x) {
  x[!is.na(x)] <- otu.id[!is.na(x)]
  x
}
mydf <- sapply(df, map.id)

write.table(mydf, file = paste(opt$outdir, "/", opt$name, ".data_for_venn.tsv", sep=""),
            quote = FALSE, sep = '\t', row.names = FALSE, col.names = TRUE)

mydf <- sapply(as.data.frame(mydf), na.omit)

samn <- length(mydf)

coreID <- Reduce(intersect, mydf)
flower <- outersect(mydf, length(coreID), unlist(mydf))
write.table(flower, file = paste(opt$outdir, "/", opt$name, ".flower.tsv", sep=""),
            quote = FALSE, sep = '\t', row.names = FALSE, col.names = TRUE)

if (samn >= 2 && samn <= 5) {
  myvenn <- get(paste0("venn", samn))
  myvenn(mydf, paste(opt$outdir, "/", opt$name, sep=""), h = opt$height, w = opt$width)
} else {
  cat("more than 5 species, venn not plot\n")
}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

终是蝶衣梦晓楼

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值