"""Configuration dataclass for the CAPELLINI pipeline."""
from __future__ import annotations
import os
import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
import yaml
[docs]
@dataclass
class CapelliniConfig:
"""All settings for the CAPELLINI pipeline, mirroring the notebook Settings section.
Required fields (no defaults) must be provided explicitly or via from_yaml/from_dict.
Derived path fields are computed in __post_init__ when left as empty strings.
"""
# ── Global ────────────────────────────────────────────────────────────────
base: str = ""
download_path: str = ""
# Derived folder paths (computed from base in __post_init__ if empty)
input_fasta_folder: str = ""
dada2_folder: str = ""
mmseq_folder: str = ""
sp_folder: str = ""
procs_folder: str = ""
enhanced_networks_folder: str = ""
# Reference paths
silva_ref_path: str = ""
silva_taxmap_path: str = ""
full_ncbi_taxonomy_path: str = ""
ncbi_accessory_path: str = ""
virus_fasta_name: str = ""
metadata_path: str = ""
bacterial_raw_fasta_folder: str = ""
species_level: bool = False
fresh_start: bool = False
ref_removal: bool = True
# Force-regenerate bundled references (off by default — bundled files are reused)
regenerate_16S_reference: bool = False
regenerate_spacers_collection: bool = False
# External download URLs (fixed; mirror the Resources block of the notebook)
# SILVA 138.1
silva_ref_url: str = (
"https://zenodo.org/records/4587955/files/"
"silva_nr99_v138.1_train_set.fa.gz"
)
silva_taxmap_url: str = (
"https://www.arb-silva.de/fileadmin/silva_databases/release_138_1/"
"Exports/taxonomy/tax_slv_ssu_138.1.txt.gz"
)
# NCBI taxonomy
full_ncbi_taxonomy_url: str = "" # filled in later when the asset is published
# NCBI taxonomy accessory (taxdmp ZIP, names.dmp is extracted from it)
ncbi_taxdmp_url: str = "https://ftp.ncbi.nih.gov/pub/taxonomy/taxdmp.zip"
# proGenomes3
genes_reference_url: str = (
"http://progenomes3.embl.de/data/repGenomes/"
"progenomes3.genes.representatives.fasta.bz2"
)
bacContigs_reference_url: str = (
"http://progenomes3.embl.de/data/repGenomes/"
"progenomes3.contigs.representatives.fasta.bz2"
)
protein_reference_url: str = (
"http://progenomes3.embl.de/data/repGenomes/"
"progenomes3.proteins.representatives.fasta.bz2"
)
# ── DADA2 ─────────────────────────────────────────────────────────────────
direction: str = "forward"
bacteria_fasta_name: str = "16S_DADA2_bacteria.fasta"
fasta_generation: bool = True
# ── MMSeqs2 ───────────────────────────────────────────────────────────────
isolate_ref_16S: bool = True
mapping_saving: bool = True
min_bitscore: int = 50
max_matches: int = 20
add_taxonomy: bool = True
extend_taxonomy: bool = True
# ── SpacePHARER ───────────────────────────────────────────────────────────
min_n_spacers: int = 3
min_length: int = 23
max_length: int = 47
fdr: float = 0.05
keep_spacers_collection: bool = True
remove_decomp_fasta: bool = True
# ── ProCs ─────────────────────────────────────────────────────────────────
proteins_extraction_path: str = ""
clustering_path: str = ""
matrix_type: str = "count"
save_single_bacgenome_collection: bool = False
keep_coords: bool = False
filter_1bac_1vir: bool = False
remove_collections: bool = False
batch_size: int = 1500
# ── Network ───────────────────────────────────────────────────────────────
output_root: str = ""
overwrite: bool = False
verbose: bool = True
run_common_abundance: bool = True
run_shrinkage_correlations: bool = True
run_raw_crispr_networks: bool = True
run_smooth_crispr: bool = True
run_xstar: bool = True
prevalence: float = 0.10
keep_column: str = "keep_for_analysis"
bacteria_taxonomy_rank: str = "target_taxids"
bacterial_ranks: list = field(
default_factory=lambda: ["Phylum", "Class", "Order", "Family", "progenomes_taxid_genus"]
)
bacterial_weights: list = field(default_factory=lambda: [1, 2, 3, 6, 8])
crispr_smooth_alpha: float = 0.95
transpose_raw_crispr_after_load: bool = True
pseudocount: float = 1e-6
lam: float = 0.5
n_steps: int = 1
preserve_scale: bool = False
# Network input file paths
virus_abundance_raw: str = ""
bacteria_otu: str = ""
bacteria_taxonomy: str = ""
phage_host_predictions: str = ""
tax_bac_for_smoothing: str = ""
tax_vir: str = ""
viral_ranks: list = field(
default_factory=lambda: ["lev8", "lev7", "lev6", "lev5", "lev4", "lev3", "lev2", "lev1", "lev0"]
)
viral_weights: list = field(default_factory=lambda: [1, 1, 2, 3, 4, 6, 8, 10, 12])
aggregate_viral_rank: str = "lev0"
def __post_init__(self) -> None:
if self.base:
if not self.input_fasta_folder:
self.input_fasta_folder = self.base + "/Inputs/Fasta Collection"
if not self.dada2_folder:
self.dada2_folder = self.base + "/DADA2 output"
if not self.mmseq_folder:
self.mmseq_folder = self.base + "/MMSeqs2 Output"
if not self.sp_folder:
self.sp_folder = self.base + "/SpacePHARER output"
if not self.procs_folder:
self.procs_folder = self.base + "/Procs Estimations"
if not self.enhanced_networks_folder:
self.enhanced_networks_folder = self.base + "/Enhanced Networks"
if self.download_path:
if not self.full_ncbi_taxonomy_path:
self.full_ncbi_taxonomy_path = os.path.join(
self.download_path, "ncbi_taxonomy_all_taxids.tsv"
)
if not self.ncbi_accessory_path:
self.ncbi_accessory_path = os.path.join(self.download_path, "names.dmp")
if not self.silva_ref_path:
self.silva_ref_path = os.path.join(
self.download_path, "silva_nr99_v138.1_train_set.fa.gz"
)
if not self.silva_taxmap_path:
self.silva_taxmap_path = os.path.join(
self.download_path, "tax_slv_ssu_138.1.txt"
)
if self.procs_folder:
if not self.proteins_extraction_path:
self.proteins_extraction_path = self.procs_folder + "/Targets Proteins Extraction/"
if not self.clustering_path:
self.clustering_path = self.procs_folder + "/Clustering"
if self.enhanced_networks_folder and not self.output_root:
self.output_root = self.enhanced_networks_folder
if self.dada2_folder:
suffix = {"forward": "F", "reverse": "R", "paired": "P"}.get(self.direction, "F")
if not self.bacteria_taxonomy:
self.bacteria_taxonomy = (
f"{self.dada2_folder}/taxonomy_table_{suffix}_progenomeLikeNCBIIDs.csv"
)
if not self.bacteria_otu:
self.bacteria_otu = f"{self.dada2_folder}/OTU_table_{suffix}.csv"
if self.sp_folder and not self.phage_host_predictions:
self.phage_host_predictions = f"{self.sp_folder}/output/phage_host_predictions.tsv"
if self.dada2_folder and not self.tax_bac_for_smoothing:
suffix = {"forward": "F", "reverse": "R", "paired": "P"}.get(self.direction, "F")
self.tax_bac_for_smoothing = (
f"{self.dada2_folder}/taxonomy_table_{suffix}_progenomeLikeNCBIIDs.csv"
)
# ── Class methods ─────────────────────────────────────────────────────────
[docs]
@classmethod
def default(cls) -> "CapelliniConfig":
"""Return a config with all default values (paths will be empty until base is set)."""
return cls()
# Legacy → current key map for the network section. Old YAMLs that still
# use UPPER_CASE keys keep working transparently. The ``STUDY`` field is
# silently dropped (no per-study subfolder anymore).
_LEGACY_KEY_MAP = {
"OUTPUT_ROOT": "output_root",
"OVERWRITE": "overwrite",
"VERBOSE": "verbose",
"RUN_COMMON_ABUNDANCE": "run_common_abundance",
"RUN_SHRINKAGE_CORRELATIONS": "run_shrinkage_correlations",
"RUN_RAW_CRISPR_NETWORKS": "run_raw_crispr_networks",
"RUN_SMOOTH_CRISPR": "run_smooth_crispr",
"RUN_XSTAR": "run_xstar",
"PREVALENCE": "prevalence",
"KEEP_COLUMN": "keep_column",
"BACTERIA_TAXONOMY_RANK": "bacteria_taxonomy_rank",
"BACTERIAL_RANKS": "bacterial_ranks",
"BACTERIAL_WEIGHTS": "bacterial_weights",
"CRISPR_SMOOTH_ALPHA": "crispr_smooth_alpha",
"TRANSPOSE_RAW_CRISPR_AFTER_LOAD": "transpose_raw_crispr_after_load",
"PSEUDOCOUNT": "pseudocount",
"LAM": "lam",
"N_STEPS": "n_steps",
"PRESERVE_SCALE": "preserve_scale",
}
_LEGACY_DROP = {"STUDY"}
# YAML doesn't interpret Python f-string syntax: a value like
# ``f"{dada2_folder}/OTU_table_F.csv"`` is loaded as a literal string and
# would crash downstream. Detect those and treat them as empty so the
# ``__post_init__`` auto-derivation kicks in.
_F_STRING_RE = re.compile(r"^\s*f['\"].*['\"]\s*$")
[docs]
@classmethod
def from_dict(cls, d: dict[str, Any]) -> "CapelliniConfig":
"""Load config from a plain Python dict.
- Old UPPER_CASE network keys are auto-translated to the new
lowercase names.
- ``STUDY`` is silently dropped.
- YAML values that look like un-interpolated Python f-strings
(``f"{dada2_folder}/..."``) are normalised to empty so the
path-derivation logic in ``__post_init__`` can fill them in.
"""
known = {f.name for f in cls.__dataclass_fields__.values()} # type: ignore[attr-defined]
translated: dict[str, Any] = {}
for k, v in d.items():
if k in cls._LEGACY_DROP:
continue
new_k = cls._LEGACY_KEY_MAP.get(k, k)
if new_k not in known:
continue
if isinstance(v, str) and cls._F_STRING_RE.match(v):
print(
f"[capellini] config: '{new_k}' looks like an "
f"un-interpolated Python f-string ({v!r}); ignoring it "
f"and letting the package auto-derive the path."
)
v = ""
translated[new_k] = v
return cls(**translated)
[docs]
@classmethod
def from_yaml(cls, path: str | Path) -> "CapelliniConfig":
"""Load config from a YAML file.
Args:
path: Path to the YAML configuration file.
Returns:
CapelliniConfig populated from the YAML.
"""
with open(path, "r") as fh:
data = yaml.safe_load(fh) or {}
return cls.from_dict(data)
[docs]
def to_yaml(self, path: str | Path) -> None:
"""Serialize config to a YAML file.
Args:
path: Destination YAML file path.
"""
import dataclasses
data = dataclasses.asdict(self)
Path(path).parent.mkdir(parents=True, exist_ok=True)
with open(path, "w") as fh:
yaml.dump(data, fh, default_flow_style=False, allow_unicode=True, sort_keys=False)
[docs]
def virus_fasta_path(self) -> Path:
"""Resolved absolute path to the virus FASTA file."""
return Path(self.input_fasta_folder) / self.virus_fasta_name