"""Configuration dataclass for the CAPELLINI pipeline."""
from __future__ import annotations
import os
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
import yaml
[docs]
@dataclass
class CapelliniConfig:
"""All settings for the CAPELLINI pipeline, mirroring the notebook Settings section.
Required fields (no defaults) must be provided explicitly or via from_yaml/from_dict.
Derived path fields are computed in __post_init__ when left as empty strings.
"""
# ── Global ────────────────────────────────────────────────────────────────
base: str = ""
download_path: str = ""
# Derived folder paths (computed from base in __post_init__ if empty)
input_fasta_folder: str = ""
dada2_folder: str = ""
mmseq_folder: str = ""
sp_folder: str = ""
procs_folder: str = ""
enhanced_networks_folder: str = ""
# Reference paths
silva_ref_path: str = ""
silva_taxmap_path: str = ""
full_ncbi_taxonomy_path: str = ""
virus_fasta_name: str = ""
metadata_path: str = ""
bacterial_raw_fasta_folder: str = ""
species_level: bool = False
fresh_start: bool = False
ref_removal: bool = True
# Force-regenerate bundled references (off by default — bundled files are reused)
regenerate_16S_reference: bool = False
regenerate_spacers_collection: bool = False
# External download URLs (fixed)
genes_reference_url: str = (
"http://progenomes3.embl.de/data/repGenomes/"
"progenomes3.genes.representatives.fasta.bz2"
)
bacContigs_reference_url: str = (
"http://progenomes3.embl.de/data/repGenomes/"
"progenomes3.contigs.representatives.fasta.bz2"
)
protein_reference_url: str = (
"http://progenomes3.embl.de/data/repGenomes/"
"progenomes3.proteins.representatives.fasta.bz2"
)
# ── DADA2 ─────────────────────────────────────────────────────────────────
direction: str = "forward"
bacteria_fasta_name: str = "16S_DADA2_bacteria.fasta"
fasta_generation: bool = True
# ── MMSeqs2 ───────────────────────────────────────────────────────────────
isolate_ref_16S: bool = True
mapping_saving: bool = True
min_bitscore: int = 50
max_matches: int = 20
add_taxonomy: bool = True
extend_taxonomy: bool = True
# ── SpacePHARER ───────────────────────────────────────────────────────────
min_n_spacers: int = 3
min_length: int = 23
max_length: int = 47
fdr: float = 0.05
keep_spacers_collection: bool = True
remove_decomp_fasta: bool = True
# ── ProCs ─────────────────────────────────────────────────────────────────
proteins_extraction_path: str = ""
clustering_path: str = ""
matrix_type: str = "count"
save_single_bacgenome_collection: bool = False
keep_coords: bool = False
filter_1bac_1vir: bool = False
remove_collections: bool = False
batch_size: int = 1500
# ── Network ───────────────────────────────────────────────────────────────
OUTPUT_ROOT: str = ""
OVERWRITE: bool = False
VERBOSE: bool = True
RUN_COMMON_ABUNDANCE: bool = True
RUN_SHRINKAGE_CORRELATIONS: bool = True
RUN_RAW_CRISPR_NETWORKS: bool = True
RUN_SMOOTH_CRISPR: bool = True
RUN_XSTAR: bool = True
PREVALENCE: float = 0.10
KEEP_COLUMN: str = "keep_for_analysis"
BACTERIA_TAXONOMY_RANK: str = "target_taxids"
BACTERIAL_RANKS: list = field(
default_factory=lambda: ["Phylum", "Class", "Order", "Family", "progenomes_taxid_genus"]
)
BACTERIAL_WEIGHTS: list = field(default_factory=lambda: [1, 2, 3, 6, 8])
CRISPR_SMOOTH_ALPHA: float = 0.95
TRANSPOSE_RAW_CRISPR_AFTER_LOAD: bool = True
PSEUDOCOUNT: float = 1e-6
LAM: float = 0.5
N_STEPS: int = 1
PRESERVE_SCALE: bool = False
STUDY: str = "default"
# Network input file paths
virus_abundance_raw: str = ""
bacteria_otu: str = ""
bacteria_taxonomy: str = ""
phage_host_predictions: str = ""
tax_bac_for_smoothing: str = ""
tax_vir: str = ""
viral_ranks: list = field(
default_factory=lambda: ["lev8", "lev7", "lev6", "lev5", "lev4", "lev3", "lev2", "lev1", "lev0"]
)
viral_weights: list = field(default_factory=lambda: [1, 1, 2, 3, 4, 6, 8, 10, 12])
aggregate_viral_rank: str = "lev0"
def __post_init__(self) -> None:
if self.base:
if not self.input_fasta_folder:
self.input_fasta_folder = self.base + "/Inputs/Fasta Collection"
if not self.dada2_folder:
self.dada2_folder = self.base + "/DADA2 output"
if not self.mmseq_folder:
self.mmseq_folder = self.base + "/MMSeqs2 Output"
if not self.sp_folder:
self.sp_folder = self.base + "/SpacePHARER output"
if not self.procs_folder:
self.procs_folder = self.base + "/Procs Estimations"
if not self.enhanced_networks_folder:
self.enhanced_networks_folder = self.base + "/Enhanced Networks"
if self.download_path:
if not self.full_ncbi_taxonomy_path:
self.full_ncbi_taxonomy_path = os.path.join(self.download_path, "names.dmp")
if self.procs_folder:
if not self.proteins_extraction_path:
self.proteins_extraction_path = self.procs_folder + "/Targets Proteins Extraction/"
if not self.clustering_path:
self.clustering_path = self.procs_folder + "/Clustering"
if self.enhanced_networks_folder and not self.OUTPUT_ROOT:
self.OUTPUT_ROOT = self.enhanced_networks_folder
if self.dada2_folder:
suffix = {"forward": "F", "reverse": "R", "paired": "P"}.get(self.direction, "F")
if not self.bacteria_taxonomy:
self.bacteria_taxonomy = (
f"{self.dada2_folder}/taxonomy_table_{suffix}_progenomeLikeNCBIIDs.csv"
)
if not self.bacteria_otu:
self.bacteria_otu = f"{self.dada2_folder}/OTU_table_{suffix}.csv"
if self.sp_folder and not self.phage_host_predictions:
self.phage_host_predictions = f"{self.sp_folder}/output/phage_host_predictions.tsv"
if self.dada2_folder and not self.tax_bac_for_smoothing:
suffix = {"forward": "F", "reverse": "R", "paired": "P"}.get(self.direction, "F")
self.tax_bac_for_smoothing = (
f"{self.dada2_folder}/taxonomy_table_{suffix}_progenomeLikeNCBIIDs.csv"
)
# ── Class methods ─────────────────────────────────────────────────────────
[docs]
@classmethod
def default(cls) -> "CapelliniConfig":
"""Return a config with all default values (paths will be empty until base is set)."""
return cls()
[docs]
@classmethod
def from_dict(cls, d: dict[str, Any]) -> "CapelliniConfig":
"""Load config from a plain Python dict."""
known = {f.name for f in cls.__dataclass_fields__.values()} # type: ignore[attr-defined]
filtered = {k: v for k, v in d.items() if k in known}
return cls(**filtered)
[docs]
@classmethod
def from_yaml(cls, path: str | Path) -> "CapelliniConfig":
"""Load config from a YAML file.
Args:
path: Path to the YAML configuration file.
Returns:
CapelliniConfig populated from the YAML.
"""
with open(path, "r") as fh:
data = yaml.safe_load(fh) or {}
return cls.from_dict(data)
[docs]
def to_yaml(self, path: str | Path) -> None:
"""Serialize config to a YAML file.
Args:
path: Destination YAML file path.
"""
import dataclasses
data = dataclasses.asdict(self)
Path(path).parent.mkdir(parents=True, exist_ok=True)
with open(path, "w") as fh:
yaml.dump(data, fh, default_flow_style=False, allow_unicode=True, sort_keys=False)
[docs]
def virus_fasta_path(self) -> Path:
"""Resolved absolute path to the virus FASTA file."""
return Path(self.input_fasta_folder) / self.virus_fasta_name