Advanced ShennongTools Usage
This guide covers advanced features and power-user techniques for ShennongTools.
Custom Toolbox Management
Creating Custom Toolboxes
# Create a custom toolbox with specific base directory
custom_toolbox <- sn_initialize_toolbox(base_dir = "/path/to/custom/tools")
# Add tools to custom toolbox
custom_toolbox <- sn_add_tool(custom_toolbox, "samtools", version = "1.19.2")
custom_toolbox <- sn_add_tool(custom_toolbox, "hisat2", version = "2.2.1")
# Use custom toolbox (requires manual management)
# Note: sn_run() uses the default global toolbox
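You can verify that a tool was registered by looking it up in the custom toolbox. The sketch below reuses the accessors introduced in the next section and assumes they accept any toolbox object, not just the global one.
# Check that the tool was registered (assumes the accessors work on custom toolboxes)
tool <- sn_get_tool(custom_toolbox, "samtools")
print(sn_get_installed_versions(tool))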
Tool Installation Management
# Install tool without running
sn_install_tool("star", version = "2.7.11b")
# Check installation status
toolbox <- sn_initialize_toolbox()
tool <- sn_get_tool(toolbox, "star")
is_installed <- sn_is_tool_installed(tool, "2.7.11b")
# Get all installed versions
installed_versions <- sn_get_installed_versions(tool)
print(installed_versions)
# Validate installation
is_valid <- sn_validate_tool("star", version = "2.7.11b")
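These checks compose naturally into a preflight helper that installs whatever is missing before a pipeline starts. This is a sketch built only from the functions above; it assumes sn_get_tool() returns a usable tool object even before installation.
# Hypothetical preflight helper: install any missing tools
ensure_tools <- function(required) {
  toolbox <- sn_initialize_toolbox()
  for (name in names(required)) {
    version <- required[[name]]
    tool <- sn_get_tool(toolbox, name)
    if (!sn_is_tool_installed(tool, version)) {
      sn_install_tool(name, version = version)
    }
  }
  invisible(TRUE)
}
ensure_tools(list(star = "2.7.11b", samtools = "1.19.2"))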
Environment and Version Management
Environment Information
# Get environment details
result <- sn_run("samtools", "view",
input = "test.bam",
dry_run = TRUE
)
# Environment path is shown in dry run output
# You can also access the toolbox directly
toolbox <- sn_initialize_toolbox()
env_path <- file.path(toolbox@base_dir, "samtools", "1.19.2")
print(env_path)
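For an overview of everything on disk, a plain directory listing of the base directory is enough; this assumes the base_dir/tool/version layout implied above.
# List tool environments by scanning the toolbox base directory
envs <- list.dirs(toolbox@base_dir, recursive = FALSE)
print(basename(envs))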
Advanced Logging and Monitoring
Detailed Resource Monitoring
# Run with detailed monitoring
result <- sn_run("star", "align",
genome_dir = "star_index",
read1 = "sample_R1.fastq.gz",
read2 = "sample_R2.fastq.gz",
output_dir = "star_output",
threads = 16,
log_level = "normal"
)
# Comprehensive resource information
resources <- result@resources
cat("Command:", result@rendered_command, "\n")
cat("Exit Code:", resources$exit_code, "\n")
cat("Runtime:", resources$runtime_seconds, "seconds\n")
cat("Peak Memory:", resources$peak_memory_mb, "MB\n")
cat("CPU Usage:", resources$cpu_percent, "%\n")
# Timestamps (format() renders date-time objects legibly inside cat())
cat("Started:", format(result@start_time), "\n")
cat("Finished:", format(result@end_time), "\n")
Custom Log Management
# Set global log directory
sn_options(log_dir = "/path/to/analysis/logs")
# Per-command log directory
result <- sn_run("hisat2", "align",
index = "genome_index",
read1 = "sample_R1.fastq.gz",
read2 = "sample_R2.fastq.gz",
bam = "aligned.bam",
log_dir = "./hisat2_logs",
log_level = "normal"
)
# Access log file path
log_file <- result@log_file
cat("Log saved to:", log_file, "\n")
Template System Deep Dive
Testing Templates
# Test a custom template
template <- "{{ binary }} view -@ {{ threads }} {{ input }} -o {{ output }}"
test_params <- list(
  binary = "samtools",
  threads = 4,
  input = "test.bam",
  output = "filtered.bam"
)
rendered <- sn_test_template(template, test_params)
print(rendered)
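You can also exercise one template against several parameter sets in a single pass, which helps when checking edge cases; modifyList() is base R.
# Render the same template under varied parameters
param_sets <- list(
  modifyList(test_params, list(threads = 1)),
  modifyList(test_params, list(threads = 16, input = "big.bam"))
)
lapply(param_sets, function(p) sn_test_template(template, p))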
Understanding Parameter Resolution
# Parameters are resolved in this order:
# 1. User-provided parameters
# 2. Command defaults from YAML
# 3. Automatic parameter generation (binary, version, etc.)
# Example: See what parameters are actually used
result <- sn_run("samtools", "view",
input = "test.bam",
dry_run = TRUE,
log_level = "normal" # Shows parameter resolution
)
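Conceptually, the resolution order behaves like layered modifyList() calls, where each later layer overrides the one before it. The sketch below is an illustration in base R, not the package's actual internals.
# Illustration only: higher-priority layers are applied last
auto_params <- list(binary = "samtools", threads = 1)
yaml_defaults <- list(threads = 2, flags = "")
user_params <- list(threads = 8)
resolved <- modifyList(modifyList(auto_params, yaml_defaults), user_params)
str(resolved) # threads = 8 wins; binary and flags fall through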
Workflow Integration
Building Analysis Pipelines
# Create a complete RNA-seq processing pipeline
process_rnaseq <- function(sample_name, read1, read2, genome_index, gtf) {
  # 1. Quality control
  qc_result <- sn_run("fastp", "filter",
    input1 = read1,
    input2 = read2,
    output1 = paste0(sample_name, "_clean_R1.fastq.gz"),
    output2 = paste0(sample_name, "_clean_R2.fastq.gz"),
    html = paste0(sample_name, "_fastp.html"),
    json = paste0(sample_name, "_fastp.json"),
    threads = 8
  )
  if (!sn_is_toolcall_success(qc_result)) {
    stop("Quality control failed for ", sample_name)
  }
  # 2. Alignment
  align_result <- sn_run("hisat2", "align",
    index = genome_index,
    read1 = paste0(sample_name, "_clean_R1.fastq.gz"),
    read2 = paste0(sample_name, "_clean_R2.fastq.gz"),
    bam = paste0(sample_name, "_aligned.bam"),
    threads = 8,
    summary_file = paste0(sample_name, "_summary.txt")
  )
  if (!sn_is_toolcall_success(align_result)) {
    stop("Alignment failed for ", sample_name)
  }
  # 3. Sort and index
  sort_result <- sn_run("samtools", "sort",
    input = paste0(sample_name, "_aligned.bam"),
    output = paste0(sample_name, "_sorted.bam"),
    threads = 4
  )
  sn_run("samtools", "index", input = paste0(sample_name, "_sorted.bam"))
  # 4. Quantification
  quant_result <- sn_run("stringtie", "assemble",
    input = paste0(sample_name, "_sorted.bam"),
    gtf = gtf,
    output = paste0(sample_name, "_assembled.gtf"),
    abundance = paste0(sample_name, "_abundance.tab"),
    threads = 8
  )
  return(list(
    qc = qc_result,
    alignment = align_result,
    sorting = sort_result,
    quantification = quant_result
  ))
}
# Use the pipeline
results <- process_rnaseq(
  sample_name = "sample1",
  read1 = "sample1_R1.fastq.gz",
  read2 = "sample1_R2.fastq.gz",
  genome_index = "genome_index",
  gtf = "annotations.gtf"
)
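Once a run has succeeded you may want to drop intermediates such as the unsorted BAM and the trimmed reads; the names below mirror the files produced by process_rnaseq() for this sample.
# Optional cleanup of intermediate files after a successful run
intermediates <- c(
  "sample1_aligned.bam",
  "sample1_clean_R1.fastq.gz",
  "sample1_clean_R2.fastq.gz"
)
file.remove(intermediates[file.exists(intermediates)])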
Parallel Processing
# Process multiple samples in parallel
library(parallel)
samples <- data.frame(
  name = c("sample1", "sample2", "sample3"),
  read1 = c("s1_R1.fastq.gz", "s2_R1.fastq.gz", "s3_R1.fastq.gz"),
  read2 = c("s1_R2.fastq.gz", "s2_R2.fastq.gz", "s3_R2.fastq.gz")
)
# Set up cluster
cl <- makeCluster(3)
clusterEvalQ(cl, library(ShennongTools))
clusterExport(cl, "samples") # workers need their own copy of the sample table
# Process samples in parallel
results <- parLapply(cl, seq_len(nrow(samples)), function(i) {
  row <- samples[i, ]
  # Quality control for each sample
  sn_run("fastp", "filter",
    input1 = row$read1,
    input2 = row$read2,
    output1 = paste0(row$name, "_clean_R1.fastq.gz"),
    output2 = paste0(row$name, "_clean_R2.fastq.gz"),
    html = paste0(row$name, "_fastp.html"),
    threads = 2 # Reduced threads per job
  )
})
stopCluster(cl)
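On Unix-alikes, parallel::mclapply() forks the current session, so samples and the loaded package are already visible in the workers without any export step (on Windows, mc.cores must be 1).
# Fork-based alternative (Unix only); no cluster setup or export needed
results <- parallel::mclapply(seq_len(nrow(samples)), function(i) {
  row <- samples[i, ]
  sn_run("fastp", "filter",
    input1 = row$read1,
    input2 = row$read2,
    output1 = paste0(row$name, "_clean_R1.fastq.gz"),
    output2 = paste0(row$name, "_clean_R2.fastq.gz"),
    threads = 2
  )
}, mc.cores = 3)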
Error Handling and Debugging
Robust Error Handling
# Wrap tool calls in error handling
safe_run <- function(tool, command, ..., max_retries = 3) {
  for (attempt in seq_len(max_retries)) {
    tryCatch(
      {
        result <- sn_run(tool, command, ...)
        if (sn_is_toolcall_success(result)) {
          return(result)
        } else {
          warning(
            "Command failed on attempt ", attempt,
            ", exit code: ", result@resources$exit_code
          )
        }
      },
      error = function(e) {
        warning("Error on attempt ", attempt, ": ", e$message)
      }
    )
    if (attempt < max_retries) {
      Sys.sleep(2^attempt) # Exponential backoff
    }
  }
  stop("Command failed after ", max_retries, " attempts")
}
# Use safe wrapper
result <- safe_run("samtools", "view",
input = "problematic.bam",
output = "output.bam",
flags = "-q 30"
)
Debugging Failed Commands
# When a command fails, inspect the details
result <- sn_run("star", "align",
genome_dir = "star_index",
read1 = "sample_R1.fastq.gz",
read2 = "sample_R2.fastq.gz",
output_dir = "star_output"
)
if (!sn_is_toolcall_success(result)) {
# Check exit code
cat("Exit code:", result@resources$exit_code, "\n")
# Check rendered command
cat("Command:", result@rendered_command, "\n")
# Check log file
cat("Log file:", result@log_file, "\n")
# Read log contents
if (file.exists(result@log_file)) {
log_contents <- readLines(result@log_file)
cat("Last 10 lines of log:\n")
cat(tail(log_contents, 10), sep = "\n")
}
}
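The same inspection steps can be wrapped into a reusable helper for interactive debugging; this sketch uses only the slots shown above.
# Hypothetical helper: print a compact failure report
describe_failure <- function(result, n = 10) {
  cat("Exit code:", result@resources$exit_code, "\n")
  cat("Command:", result@rendered_command, "\n")
  cat("Log file:", result@log_file, "\n")
  if (file.exists(result@log_file)) {
    cat(tail(readLines(result@log_file), n), sep = "\n")
  }
  invisible(result)
}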
Performance Optimization
Resource Optimization
# Monitor system resources before running heavy jobs
# (shells out to `free` and `df`, so this helper is Linux-only)
system_info <- function() {
  list(
    cpu_cores = parallel::detectCores(),
    memory_gb = as.numeric(system("free -g | awk '/^Mem:/{print $2}'", intern = TRUE)),
    disk_space = system("df -h . | awk 'NR==2{print $4}'", intern = TRUE)
  )
}
info <- system_info()
cat("Available cores:", info$cpu_cores, "\n")
cat("Available memory:", info$memory_gb, "GB\n")
cat("Available disk:", info$disk_space, "\n")
# Adjust thread counts based on system resources
optimal_threads <- min(info$cpu_cores - 1, 16) # Leave one core free, max 16
result <- sn_run("star", "align",
genome_dir = "star_index",
read1 = "sample_R1.fastq.gz",
read2 = "sample_R2.fastq.gz",
output_dir = "star_output",
threads = optimal_threads
)
Batch Processing Strategies
# Process files in batches to manage memory
process_batch <- function(files, batch_size = 5) {
  results <- list()
  for (i in seq(1, length(files), by = batch_size)) {
    batch_end <- min(i + batch_size - 1, length(files))
    batch_files <- files[i:batch_end]
    cat(
      "Processing batch", ceiling(i / batch_size),
      ":", length(batch_files), "files\n"
    )
    batch_results <- lapply(batch_files, function(file) {
      sn_run("seqkit", "stats", input = file)
    })
    results <- c(results, batch_results)
    # Clean up memory between batches
    gc()
  }
  return(results)
}
# Use batch processing
fasta_files <- list.files(pattern = "\\.fa$", full.names = TRUE)
results <- process_batch(fasta_files, batch_size = 3)
Integration with Other R Packages
With Bioconductor
# Example: Integrate with GenomicRanges
library(GenomicRanges)
library(rtracklayer)
# Use ShennongTools to create coverage, then import to R
sn_run("deeptools", "bamCoverage",
bam = "aligned.bam",
output = "coverage.bw",
binSize = 10,
threads = 8
)
# Import the bigWig file into R
coverage <- import("coverage.bw", format = "BigWig")
print(coverage)
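The imported object is an ordinary GRanges with a score column holding the per-bin coverage, so the usual Bioconductor operations apply directly.
# Summarize the imported signal
summary(score(coverage))
mean(width(coverage))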
With data.table/dplyr
library(data.table)
# Process multiple samples and collect results
samples <- c("sample1", "sample2", "sample3")
results_dt <- rbindlist(lapply(samples, function(sample) {
  result <- sn_run("seqkit", "stats",
    input = paste0(sample, ".fasta"),
    log_level = "silent"
  )
  # Parse tool output (would need actual output format)
  # This is a simplified example
  data.table(
    sample = sample,
    runtime = sn_get_toolcall_runtime(result),
    success = sn_is_toolcall_success(result),
    memory_mb = result@resources$peak_memory_mb
  )
}))
print(results_dt)
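An equivalent dplyr version simply swaps rbindlist() for bind_rows() and data.table() for tibble(), with the same caveat about parsing real tool output.
library(dplyr)
results_tbl <- bind_rows(lapply(samples, function(sample) {
  result <- sn_run("seqkit", "stats",
    input = paste0(sample, ".fasta"),
    log_level = "silent"
  )
  tibble(
    sample = sample,
    runtime = sn_get_toolcall_runtime(result),
    success = sn_is_toolcall_success(result)
  )
}))
print(results_tbl)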
This advanced guide covers the power-user features of ShennongTools. With these techniques, you can build robust, scalable bioinformatics workflows that integrate seamlessly with the broader R ecosystem.