
DESeq2: differential relative-abundance testing for MAGs

First, link contigs to their MAGs with dRep's parse_stb.py helper script:

https://github.com/MrOlm/drep/blob/master/helper_scripts/parse_stb.py

~/parse_stb.py --reverse -f ~/bin_dir/* -o ~/bin_dir/genomes.stb
# Check the first column for duplicated contigs (duplicates would corrupt the
# downstream mapping); no output means no duplicates
awk '{print $1}' ~/bin_dir/genomes.stb | sort | uniq -d
# Inspect the format
head ~/bin_dir/genomes.stb
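For reference, genomes.stb is a two-column, tab-separated table with no header, mapping every contig to the bin (genome file) it belongs to. The contig and bin names below are hypothetical placeholders, shown only to illustrate the expected layout:

contig_00001	bin.1.fa
contig_00002	bin.1.fa
contig_00147	bin.2.fa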


# Merge the individual MAG .fa files into one reference
# (the sed pair makes sure every header starts on its own line, then drops empty lines)
sed 's/>/\n>/g' ~/bin_dir/*.fa | sed '/^$/d' > ~/bin_dir/all_MAGs.fa
# Build the index
bwa-mem2 index ~/bin_dir/all_MAGs.fa
# Batch-align all samples
#!/bin/bash

# Path to the reference genome
REF_FA=/home/zhongpei/diarrhoea/MJ/Data/RGIG_merge.fa
GENOME_DIR=./merged_rmg_usg_genomes/

# Create the output directory (optional)
mkdir -p bam_output

for i in *.fastp.1.fq.gz; do
    num=${i%%.fastp.1.fq.gz}

    # Step 1: align to the MAGs
    /home/zhongpei/hard_disk_sda2/zhongpei/Software/bwa-mem2/bwa-mem2 mem -t 36 \
        $REF_FA ${num}.fastp.1.fq.gz ${num}.fastp.2.fq.gz > ${num}_RGIG.sam

    # Step 2: convert SAM to sorted BAM and build the index
    samtools view -Sb -@ 36 ${num}_RGIG.sam | samtools sort -@ 36 -o bam_output/${num}_RGIG_sorted.bam
    samtools index bam_output/${num}_RGIG_sorted.bam

    # Remove the intermediate SAM file
    rm -f ${num}_RGIG.sam

    # Step 3: use samtools idxstats to get per-contig read counts for this sample
    samtools idxstats -@ 32 bam_output/${num}_RGIG_sorted.bam > bam_output/${num}_align_result.txt
    sed -i '1iGeneID\tlength\tmapped_read\tunmapped_read' bam_output/${num}_align_result.txt
done
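A note on this output: samtools idxstats prints one tab-separated row per reference sequence (name, sequence length, mapped read segments, unmapped read segments), plus a trailing '*' row that collects the reads that did not map anywhere; the sed call above only prepends a header with matching column names. If you do not want the '*' row carried into the count table as if it were a contig, a minimal cleanup (assuming GNU sed; "S1" is a hypothetical sample name) is:

sed -i '/^\*\t/d' bam_output/S1_align_result.txt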

Combine the raw read counts from the different samples

#! /usr/bin/env python
# Combine raw counts from align_result.txt files
# Written by zp, adapted from PeiZhong's TPM combiner

import argparse
import os

import pandas as pd

parser = argparse.ArgumentParser(description='Combine raw mapped read counts from align result files')
parser.add_argument('--work_path', '-p', help='Directory containing your align_result.txt files')
parser.add_argument('--file_maker', '-m', nargs='+', help='Keywords to filter relevant files (e.g., align_result)')
parser.add_argument('--output_name', '-o', help='Name of output OTU table (tsv format)')
args = parser.parse_args()

OperaPath = args.work_path
file_makers = args.file_maker
output_name = args.output_name

os.chdir(OperaPath)
files = os.listdir(OperaPath)

selected_files = []
for file in files:
    if all(keyword in file for keyword in file_makers):
        selected_files.append(file)
selected_files.sort()
print(f"Files to process: {selected_files}")

all_data = pd.DataFrame()
for file_name in selected_files:
    # Extract sample name from file name (customize this parsing if needed)
    sample_name = file_name.replace('_align_result.txt', '')
    df = pd.read_csv(file_name, sep='\t', usecols=['GeneID', 'mapped_read'])
    df.columns = ['GeneID', sample_name]  # Rename mapped_read to sample name
    if all_data.empty:
        all_data = df
    else:
        all_data = pd.merge(all_data, df, on='GeneID', how='outer')

# Replace NaNs with 0 and convert to integers
all_data = all_data.fillna(0)
all_data.iloc[:, 1:] = all_data.iloc[:, 1:].astype(int)

# Optional: sort by GeneID
all_data = all_data.sort_values(by='GeneID')

# Save the combined count table
all_data.to_csv(output_name, sep='\t', index=False)
print(f"Combined raw count table saved as: {output_name}")
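Assuming the combiner above is saved as combine_raw_counts.py (the filename is arbitrary), a typical call looks like the line below. Pass --work_path as an absolute path, since the script first chdirs into it and then lists that same path; -m takes one or more substrings that every selected file name must contain:

python combine_raw_counts.py -p /abs/path/to/bam_output -m align_result -o all_samples_raw_reads.tsv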

Match the contig-level read counts to their bins

#!/usr/bin/env python
#########################################################
# Add contig raw read counts by bin mapping
# Written by PeiZhong in IFR of CAAS
# Optimized by ChatGPT for robustness

import argparse

import pandas as pd

parser = argparse.ArgumentParser(description='Aggregate contig raw read counts into bins')
parser.add_argument('--stb', '-m', required=True, help='Mapping file: contig to bin (TSV format)')
parser.add_argument('--raw_reads', '-r', required=True, help='Contig-level raw read count table (TSV format)')
parser.add_argument('--output_name', '-o', required=True, help='Output file name for bin-level count table (TSV)')
args = parser.parse_args()

# 1. Load contig-to-bin mapping
map_df = pd.read_csv(args.stb, sep='\t', header=None, names=["Contig", "Bin"])

# 2. Load contig-level raw count matrix
count_df = pd.read_csv(args.raw_reads, sep='\t')

# 3. Merge to add Bin info to the contig count table
merged_df = pd.merge(map_df, count_df, left_on="Contig", right_on="GeneID", how='inner')

# 4. Aggregate counts by bin (sum across contigs in the same bin)
bin_counts = merged_df.drop(columns=["Contig", "GeneID"]).groupby("Bin").sum()

# 5. Save as TSV
bin_counts.to_csv(args.output_name, sep='\t')
print(f"Bin-level count matrix saved to: {args.output_name}")
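Again with a hypothetical script name, the bin-level aggregation can then be run as below; the output file is the one the R code in the next step reads in:

python aggregate_counts_by_bin.py -m ~/bin_dir/genomes.stb -r all_samples_raw_reads.tsv -o rumen_data_RGIG_raw_reads_byBin.txt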

Run the DESeq2 differential abundance analysis

# Load packages
library(DESeq2)
library(tidyverse)

# Step 1: read the bin-level raw count table
# Replace the path with your actual file name, e.g. "bin_counts.tsv"
setwd("deseq2")
count_data <- read.table("rumen_data_RGIG_raw_reads_byBin.txt", header = TRUE, row.names = 1,
                         sep = "\t", check.names = FALSE)

# Step 2: build the grouping information
# Groups are inferred automatically from the sample names (assuming ATCC vs CK)
sample_names <- colnames(count_data)
group <- ifelse(grepl("^ATCC", sample_names), "ATCC", "CK")
col_data <- data.frame(row.names = sample_names,
                       group = factor(group, levels = c("CK", "ATCC")))

# Step 3: build the DESeq2 data object
dds <- DESeqDataSetFromMatrix(countData = count_data,
                              colData = col_data,
                              design = ~ group)

# Step 4: filter low-abundance bins (optional but recommended)
dds <- dds[rowSums(counts(dds)) >= 100, ]

# Step 5: run the differential analysis
dds <- DESeq(dds)
res <- results(dds, contrast = c("group", "ATCC", "CK"))

# Step 6: extract the significant results
resSig <- res %>%
  as.data.frame() %>%
  rownames_to_column("Bin") %>%
  filter(padj < 0.05 & abs(log2FoldChange) > 1) %>%
  arrange(padj)

# Step 7: export the results
write.table(as.data.frame(res), file = "DESeq2_all_bins.tsv", sep = "\t", quote = FALSE, col.names = NA)
write.table(resSig, file = "DESeq2_significant_bins.tsv", sep = "\t", quote = FALSE, row.names = FALSE)

# Step 8: optional visualization (volcano plot)
#if (!requireNamespace('BiocManager', quietly = TRUE))
#  install.packages('BiocManager')
#BiocManager::install('EnhancedVolcano')
library(EnhancedVolcano)

# Set up colors and labels
keyvals <- rep('gray', nrow(res))
names(keyvals) <- rep('Mid', nrow(res))

# Up in ATCC: log2FC > 1 and padj < 0.05
keyvals[which(res$log2FoldChange > 1 & res$padj < 0.05)] <- '#DC143C'
names(keyvals)[which(res$log2FoldChange > 1 & res$padj < 0.05)] <- 'high'

# Up in CK: log2FC < -1 and padj < 0.05
keyvals[which(res$log2FoldChange < -1 & res$padj < 0.05)] <- '#7B68EE'
names(keyvals)[which(res$log2FoldChange < -1 & res$padj < 0.05)] <- 'low'

# Draw the EnhancedVolcano plot
EnhancedVolcano(res,
                lab = rownames(res),
                x = 'log2FoldChange',
                y = 'padj',
                xlim = c(-4, 4),
                ylim = c(0, 15),
                title = 'ATCC vs CK (Bin Level)',
                pCutoff = 0.05,
                FCcutoff = 1,
                xlab = bquote(~Log[2]~ 'fold change'),
                ylab = bquote(~-Log[10]~adjusted~italic(P)),
                selectLab = rownames(res)[which(names(keyvals) %in% c('high', 'low'))],
                pointSize = 3.0,
                labSize = 3.0,
                colAlpha = 1,
                cutoffLineType = 'twodash',
                cutoffLineWidth = 0.8,
                colCustom = keyvals,
                border = 'full',
                legendLabels = c('NS', 'Log2 FC', 'Adjusted P', 'Adjusted P & Log2 FC'),
                legendPosition = 'right',
                drawConnectors = FALSE,
                boxedLabels = FALSE,
                legendLabSize = 14,
                legendIconSize = 4.0)

# Save the figure
ggsave("volcano_plot.pdf", width = 8, height = 6)
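Note on interpretation: with contrast = c("group", "ATCC", "CK"), a positive log2FoldChange means a bin is more abundant in the ATCC samples and a negative one means it is more abundant in CK, which is exactly what the 'high'/'low' coloring above encodes.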

