Available Bioinformatics Tools
ShennongTools provides a comprehensive collection of pre-configured bioinformatics tools. Each tool comes with multiple commands and standardized parameters, making it easy to build reproducible workflows.
🧬 Sequence Analysis Tools
FastP - FASTQ Quality Control
FastP is an all-in-one FASTQ preprocessor for quality control and filtering.
library(ShennongTools)

# Single-end mode: one FASTQ in, one filtered FASTQ out, plus HTML/JSON reports
result <- sn_run(
  "fastp", "filter",
  input1 = "sample.fastq.gz",
  output1 = "clean.fastq.gz",
  html = "fastp_report.html",
  json = "fastp_report.json",
  threads = 8
)

# Paired-end mode: R1 and R2 are processed together, with paired-end adapter
# detection switched on
result <- sn_run(
  "fastp", "filter",
  input1 = "sample_R1.fastq.gz",
  input2 = "sample_R2.fastq.gz",
  output1 = "clean_R1.fastq.gz",
  output2 = "clean_R2.fastq.gz",
  html = "fastp_report.html",
  json = "fastp_report.json",
  detect_adapter_for_pe = TRUE,
  threads = 8
)
SeqKit - FASTA/Q Manipulation
SeqKit provides ultrafast tools for FASTA/Q file processing.
# Summary statistics for a FASTA file
result <- sn_run(
  "seqkit", "stats",
  input = "sequences.fasta"
)

# FASTQ -> FASTA conversion
result <- sn_run(
  "seqkit", "fq2fa",
  input = "reads.fastq.gz",
  output = "reads.fasta"
)

# Keep only the sequences matching a pattern
result <- sn_run(
  "seqkit", "grep",
  input = "sequences.fasta",
  output = "filtered.fasta",
  pattern = "ATP"
)
🗺️ Read Mapping & Alignment
HISAT2 - RNA-seq Alignment
Fast and sensitive alignment program for mapping RNA-seq reads.
# Step 1: build the HISAT2 index from the reference FASTA
result <- sn_run(
  "hisat2", "build",
  reference = "reference_genome.fa",
  index_base = "genome_index",
  threads = 8
)

# Step 2: align paired-end reads against that index; the alignment summary
# goes to a separate text file
result <- sn_run(
  "hisat2", "align",
  index = "genome_index",
  read1 = "sample_R1.fastq.gz",
  read2 = "sample_R2.fastq.gz",
  bam = "aligned.bam",
  threads = 8,
  summary_file = "alignment_summary.txt"
)
STAR - Universal RNA-seq Aligner
Ultrafast universal RNA-seq aligner with splice junction detection.
# Step 1: generate the STAR genome index from a FASTA plus GTF annotation
result <- sn_run(
  "star", "generate_index",
  genome_dir = "star_index",
  genome_fasta = "reference.fa",
  gtf = "annotations.gtf",
  threads = 16,
  sjdb_overhang = 99
)

# Step 2: align paired-end reads using the index directory created above
result <- sn_run(
  "star", "align",
  genome_dir = "star_index",
  read1 = "sample_R1.fastq.gz",
  read2 = "sample_R2.fastq.gz",
  output_dir = "star_alignment",
  threads = 16,
  outSAMtype = "BAM SortedByCoordinate"
)
🧮 Quantification & Assembly
Salmon - Transcript Quantification
Fast, bias-aware quantification of transcript expression from RNA-seq reads.
# Step 1: index the transcriptome FASTA
result <- sn_run(
  "salmon", "index",
  transcripts = "transcripts.fa",
  index = "salmon_index",
  threads = 8
)

# Step 2: quantify a paired-end sample against the index built above
result <- sn_run(
  "salmon", "quant",
  index = "salmon_index",
  mates1 = "sample_R1.fastq.gz",
  mates2 = "sample_R2.fastq.gz",
  output = "salmon_quant",
  threads = 8,
  lib_type = "A"
)
Kallisto - RNA-seq Quantification
Near-optimal probabilistic RNA-seq quantification without alignment.
# Step 1: index the transcriptome FASTA
result <- sn_run(
  "kallisto", "index",
  transcripts = "transcripts.fa",
  index = "kallisto_index"
)

# Step 2: quantify paired-end reads against the index, with 100 bootstrap
# samples
result <- sn_run(
  "kallisto", "quant",
  index = "kallisto_index",
  read1 = "sample_R1.fastq.gz",
  read2 = "sample_R2.fastq.gz",
  output_dir = "kallisto_output",
  bootstrap_samples = 100,
  threads = 8
)
StringTie - Transcript Assembly
Transcript assembly and quantification for RNA-seq.
# Assemble transcripts from an aligned BAM, guided by a reference GTF; an
# abundance table is written alongside the assembled GTF
result <- sn_run(
  "stringtie", "assemble",
  input = "aligned.bam",
  gtf = "reference.gtf",
  output = "assembled.gtf",
  threads = 8,
  abundance = "abundance.tab"
)

# Merge several assemblies (listed in a text file) into a single GTF
result <- sn_run(
  "stringtie", "merge",
  gtf_list = "gtf_files.txt",
  reference_gtf = "reference.gtf",
  output = "merged.gtf"
)
🔧 File Processing Tools
SAMtools - SAM/BAM Processing
Essential tools for manipulating SAM/BAM alignment files.
# Convert SAM to BAM (sorting is done separately in the next step)
result <- sn_run(
  "samtools", "view",
  input = "alignment.sam",
  output = "alignment.bam",
  flags = "-Sb",
  threads = 4
)

# Sort the BAM produced above
result <- sn_run(
  "samtools", "sort",
  input = "alignment.bam",
  output = "sorted.bam",
  threads = 4
)

# Index the sorted BAM
result <- sn_run(
  "samtools", "index",
  input = "sorted.bam"
)

# Write alignment statistics for the sorted BAM to a text file
result <- sn_run(
  "samtools", "stats",
  input = "sorted.bam",
  output = "alignment_stats.txt"
)
BEDtools - Genome Arithmetic
Swiss-army knife for genome interval operations.
# Intersect two interval sets (a vs. b) and write the overlaps
result <- sn_run(
  "bedtools", "intersect",
  a = "peaks.bed",
  b = "genes.bed",
  output = "overlaps.bed"
)

# Genome-wide coverage from a BAM, with the bg option enabled
result <- sn_run(
  "bedtools", "genomecov",
  input = "alignment.bam",
  genome = "genome.txt",
  output = "coverage.bedgraph",
  bg = TRUE
)

# Collapse overlapping intervals into merged intervals
result <- sn_run(
  "bedtools", "merge",
  input = "intervals.bed",
  output = "merged.bed"
)
DeepTools - ChIP-seq/ATAC-seq Analysis
Tools for exploring deep-sequencing data.
# Step 1: BAM -> bigWig coverage track (10 bp bins, RPKM normalization)
result <- sn_run(
  "deeptools", "bamCoverage",
  bam = "treatment.bam",
  output = "coverage.bw",
  binSize = 10,
  threads = 8,
  normalizeUsing = "RPKM"
)

# Step 2: score matrix around the TSS reference point, using the bigWig
# from step 1 and a window of 2000 on each side
result <- sn_run(
  "deeptools", "computeMatrix",
  regions = "genes.bed",
  scores = "coverage.bw",
  output = "matrix.gz",
  referencePoint = "TSS",
  beforeRegionStartLength = 2000,
  afterRegionStartLength = 2000
)

# Step 3: render the matrix from step 2 as a heatmap image
result <- sn_run(
  "deeptools", "plotHeatmap",
  matrix = "matrix.gz",
  output = "heatmap.png",
  colorMap = "Blues"
)
🎯 Specialized Analysis
MACS2 - Peak Calling
Model-based Analysis of ChIP-Seq (MACS) for peak calling.
# Peak calling for a ChIP sample against its input control, with summit
# calling and the bdg option enabled
result <- sn_run(
  "macs2", "callpeak",
  treatment = "ChIP.bam",
  control = "Input.bam",
  name = "sample",
  format = "BAM",
  gsize = "hs", # human genome
  qvalue = 0.01,
  call_summits = TRUE,
  bdg = TRUE
)

# Broad-peak mode, e.g. for histone marks such as H3K27me3
result <- sn_run(
  "macs2", "callpeak",
  treatment = "H3K27me3.bam",
  control = "Input.bam",
  name = "H3K27me3",
  format = "BAM",
  gsize = "hs",
  broad = TRUE,
  broad_cutoff = 0.1
)
Kraken2 - Taxonomic Classification
Ultra-fast metagenomic sequence classification.
# Classify a paired-end sample against a Kraken2 database; per-read
# classifications and a report are written to separate files
result <- sn_run(
  "kraken2", "classify",
  input1 = "sample_R1.fastq.gz",
  input2 = "sample_R2.fastq.gz",
  database = "/path/to/kraken2_db",
  output = "classifications.txt",
  report = "kraken_report.txt",
  threads = 8,
  paired = TRUE
)
MultiQC - Report Aggregation
Aggregate bioinformatics analysis results across samples.
# Aggregate the results found under input_dir into one titled HTML report
result <- sn_run(
  "multiqc", "report",
  input_dir = "analysis_results",
  output_dir = "multiqc_report",
  filename = "analysis_report.html",
  title = "RNA-seq Analysis Report"
)
🐍 Single-cell Analysis
Scanpy - Single-cell Python Analysis
Single-cell analysis in Python, covering preprocessing, clustering, and visualization.
# Step 1: preprocess raw counts with gene/cell thresholds and a
# mitochondrial gene-name pattern
result <- sn_run(
  "scanpy", "preprocess",
  input = "raw_counts.h5ad",
  output = "preprocessed.h5ad",
  min_genes = 200,
  min_cells = 3,
  max_genes = 5000,
  mt_gene_names = "^MT-"
)

# Step 2: clustering and UMAP visualization on the preprocessed object
# from step 1
result <- sn_run(
  "scanpy", "cluster",
  input = "preprocessed.h5ad",
  output = "clustered.h5ad",
  n_neighbors = 15,
  n_pcs = 40,
  resolution = 0.5
)
pySCENIC - Regulatory Network Inference
Single-cell regulatory network inference and cell state prediction.
# SCENIC workflow, step 1 (grn): build adjacencies from the expression
# matrix and a list of transcription factors
result <- sn_run(
  "pyscenic", "grn",
  expression_mtx = "expression.tsv",
  tf_names = "transcription_factors.txt",
  output = "adjacencies.tsv",
  num_workers = 8
)

# SCENIC workflow, step 2 (ctx): combine the adjacencies from step 1 with a
# motif database and write the regulons
result <- sn_run(
  "pyscenic", "ctx",
  adjacencies = "adjacencies.tsv",
  database = "motif_database.feather",
  output = "regulons.csv",
  num_workers = 8
)
Getting Tool Help
For any tool, you can get detailed help and see all available commands:
# Overview of all available tools
sn_list_tools()
# Detailed help for a single tool (here: samtools)
sn_help("samtools")
# Help narrowed to one command of that tool (here: samtools view)
sn_help("samtools", "view")
# The tool's own raw help output
sn_help("samtools", raw = TRUE)
Tool Configuration
Each tool is defined by a YAML configuration file that specifies:
- Environment: Conda dependencies and channels
- Commands: Individual tool subcommands
- Parameters: Input files, output files, and options
- Templates: Shell or Python execution templates
This standardized approach ensures consistent behavior across all tools while maintaining the flexibility to accommodate tool-specific requirements.
For more information on creating custom tools, see the YAML Specification.