# Source code for capellini.config

"""Configuration dataclass for the CAPELLINI pipeline."""

from __future__ import annotations

import os
import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any

import yaml


@dataclass
class CapelliniConfig:
    """All settings for the CAPELLINI pipeline, mirroring the notebook Settings section.

    Every field has a default, so ``CapelliniConfig()`` is always
    constructible.  Path fields left as empty strings are derived in
    ``__post_init__`` from ``base``, ``download_path`` and the folder fields
    (which may themselves have been derived from ``base``).  A derived value
    never overwrites a non-empty value supplied by the caller.

    Instances can also be built with ``from_dict`` / ``from_yaml`` and
    written back with ``to_yaml``.
    """

    # ── Global ────────────────────────────────────────────────────────────────
    base: str = ""
    download_path: str = ""

    # Derived folder paths (computed from base in __post_init__ if empty)
    input_fasta_folder: str = ""
    dada2_folder: str = ""
    mmseq_folder: str = ""
    sp_folder: str = ""
    procs_folder: str = ""
    enhanced_networks_folder: str = ""

    # Reference paths
    silva_ref_path: str = ""
    silva_taxmap_path: str = ""
    full_ncbi_taxonomy_path: str = ""
    ncbi_accessory_path: str = ""

    virus_fasta_name: str = ""
    metadata_path: str = ""
    bacterial_raw_fasta_folder: str = ""

    species_level: bool = False
    fresh_start: bool = False
    ref_removal: bool = True

    # Force-regenerate bundled references (off by default — bundled files are reused)
    regenerate_16S_reference: bool = False
    regenerate_spacers_collection: bool = False

    # External download URLs (fixed; mirror the Resources block of the notebook)
    # SILVA 138.1
    silva_ref_url: str = (
        "https://zenodo.org/records/4587955/files/"
        "silva_nr99_v138.1_train_set.fa.gz"
    )
    silva_taxmap_url: str = (
        "https://www.arb-silva.de/fileadmin/silva_databases/release_138_1/"
        "Exports/taxonomy/tax_slv_ssu_138.1.txt.gz"
    )
    # NCBI taxonomy
    full_ncbi_taxonomy_url: str = ""  # filled in later when the asset is published
    # NCBI taxonomy accessory (taxdmp ZIP, names.dmp is extracted from it)
    ncbi_taxdmp_url: str = "https://ftp.ncbi.nih.gov/pub/taxonomy/taxdmp.zip"
    # proGenomes3
    genes_reference_url: str = (
        "http://progenomes3.embl.de/data/repGenomes/"
        "progenomes3.genes.representatives.fasta.bz2"
    )
    bacContigs_reference_url: str = (
        "http://progenomes3.embl.de/data/repGenomes/"
        "progenomes3.contigs.representatives.fasta.bz2"
    )
    protein_reference_url: str = (
        "http://progenomes3.embl.de/data/repGenomes/"
        "progenomes3.proteins.representatives.fasta.bz2"
    )

    # ── DADA2 ─────────────────────────────────────────────────────────────────
    direction: str = "forward"
    bacteria_fasta_name: str = "16S_DADA2_bacteria.fasta"
    fasta_generation: bool = True

    # ── MMSeqs2 ───────────────────────────────────────────────────────────────
    isolate_ref_16S: bool = True
    mapping_saving: bool = True
    min_bitscore: int = 50
    max_matches: int = 20
    add_taxonomy: bool = True
    extend_taxonomy: bool = True

    # ── SpacePHARER ───────────────────────────────────────────────────────────
    min_n_spacers: int = 3
    min_length: int = 23
    max_length: int = 47
    fdr: float = 0.05
    keep_spacers_collection: bool = True
    remove_decomp_fasta: bool = True

    # ── ProCs ─────────────────────────────────────────────────────────────────
    proteins_extraction_path: str = ""
    clustering_path: str = ""
    matrix_type: str = "count"
    save_single_bacgenome_collection: bool = False
    keep_coords: bool = False
    filter_1bac_1vir: bool = False
    remove_collections: bool = False
    batch_size: int = 1500

    # ── Network ───────────────────────────────────────────────────────────────
    output_root: str = ""
    overwrite: bool = False
    verbose: bool = True

    run_common_abundance: bool = True
    run_shrinkage_correlations: bool = True
    run_raw_crispr_networks: bool = True
    run_smooth_crispr: bool = True
    run_xstar: bool = True

    prevalence: float = 0.10
    keep_column: str = "keep_for_analysis"
    bacteria_taxonomy_rank: str = "target_taxids"

    bacterial_ranks: list = field(
        default_factory=lambda: [
            "Phylum", "Class", "Order", "Family", "progenomes_taxid_genus",
        ]
    )
    bacterial_weights: list = field(default_factory=lambda: [1, 2, 3, 6, 8])

    crispr_smooth_alpha: float = 0.95
    transpose_raw_crispr_after_load: bool = True
    pseudocount: float = 1e-6
    lam: float = 0.5
    n_steps: int = 1
    preserve_scale: bool = False

    # Network input file paths
    virus_abundance_raw: str = ""
    bacteria_otu: str = ""
    bacteria_taxonomy: str = ""
    phage_host_predictions: str = ""
    tax_bac_for_smoothing: str = ""
    tax_vir: str = ""

    viral_ranks: list = field(
        default_factory=lambda: [
            "lev8", "lev7", "lev6", "lev5", "lev4", "lev3", "lev2", "lev1", "lev0",
        ]
    )
    viral_weights: list = field(
        default_factory=lambda: [1, 1, 2, 3, 4, 6, 8, 10, 12]
    )
    aggregate_viral_rank: str = "lev0"

    # Maps ``direction`` to the one-letter suffix used in DADA2 output file
    # names. Deliberately un-annotated so the dataclass machinery does not
    # treat it as a field.
    _DIRECTION_SUFFIXES = {"forward": "F", "reverse": "R", "paired": "P"}

    def __post_init__(self) -> None:
        """Fill in every path field that was left empty.

        Derivation order matters: folders come from ``base`` first, because
        the later steps read those (possibly just-derived) folder values.
        """
        self._derive_base_folders()
        self._derive_download_paths()
        self._derive_procs_paths()
        if self.enhanced_networks_folder and not self.output_root:
            self.output_root = self.enhanced_networks_folder
        self._derive_network_inputs()

    # ── Derivation helpers (private) ──────────────────────────────────────────

    def _fill_if_empty(self, name: str, value: str) -> None:
        """Assign ``value`` to attribute ``name`` only if it is currently falsy."""
        if not getattr(self, name):
            setattr(self, name, value)

    def _derive_base_folders(self) -> None:
        """Derive the per-stage folders from ``base`` (no-op when ``base`` is empty)."""
        if not self.base:
            return
        self._fill_if_empty("input_fasta_folder", f"{self.base}/Inputs/Fasta Collection")
        self._fill_if_empty("dada2_folder", f"{self.base}/DADA2 output")
        self._fill_if_empty("mmseq_folder", f"{self.base}/MMSeqs2 Output")
        self._fill_if_empty("sp_folder", f"{self.base}/SpacePHARER output")
        self._fill_if_empty("procs_folder", f"{self.base}/Procs Estimations")
        self._fill_if_empty("enhanced_networks_folder", f"{self.base}/Enhanced Networks")

    def _derive_download_paths(self) -> None:
        """Derive reference-file paths from ``download_path`` (no-op when empty)."""
        if not self.download_path:
            return
        self._fill_if_empty(
            "full_ncbi_taxonomy_path",
            os.path.join(self.download_path, "ncbi_taxonomy_all_taxids.tsv"),
        )
        self._fill_if_empty(
            "ncbi_accessory_path",
            os.path.join(self.download_path, "names.dmp"),
        )
        self._fill_if_empty(
            "silva_ref_path",
            os.path.join(self.download_path, "silva_nr99_v138.1_train_set.fa.gz"),
        )
        self._fill_if_empty(
            "silva_taxmap_path",
            os.path.join(self.download_path, "tax_slv_ssu_138.1.txt"),
        )

    def _derive_procs_paths(self) -> None:
        """Derive the ProCs sub-paths from ``procs_folder`` (no-op when empty)."""
        if not self.procs_folder:
            return
        self._fill_if_empty(
            "proteins_extraction_path",
            f"{self.procs_folder}/Targets Proteins Extraction/",
        )
        self._fill_if_empty("clustering_path", f"{self.procs_folder}/Clustering")

    def _derive_network_inputs(self) -> None:
        """Derive the network-stage input files from the DADA2 / SpacePHARER folders."""
        if self.dada2_folder:
            # An unrecognised ``direction`` falls back to the forward suffix.
            suffix = self._DIRECTION_SUFFIXES.get(self.direction, "F")
            taxonomy_csv = (
                f"{self.dada2_folder}/taxonomy_table_{suffix}_progenomeLikeNCBIIDs.csv"
            )
            self._fill_if_empty("bacteria_taxonomy", taxonomy_csv)
            self._fill_if_empty("bacteria_otu", f"{self.dada2_folder}/OTU_table_{suffix}.csv")
            # The smoothing taxonomy defaults to the same taxonomy table.
            self._fill_if_empty("tax_bac_for_smoothing", taxonomy_csv)
        if self.sp_folder:
            self._fill_if_empty(
                "phage_host_predictions",
                f"{self.sp_folder}/output/phage_host_predictions.tsv",
            )

    # ── Class methods ─────────────────────────────────────────────────────────

    @classmethod
    def default(cls) -> "CapelliniConfig":
        """Return a config with all default values (paths will be empty until base is set)."""
        return cls()

    # Legacy → current key map for the network section. Old YAMLs that still
    # use UPPER_CASE keys keep working transparently. The ``STUDY`` field is
    # silently dropped (no per-study subfolder anymore).
    _LEGACY_KEY_MAP = {
        "OUTPUT_ROOT": "output_root",
        "OVERWRITE": "overwrite",
        "VERBOSE": "verbose",
        "RUN_COMMON_ABUNDANCE": "run_common_abundance",
        "RUN_SHRINKAGE_CORRELATIONS": "run_shrinkage_correlations",
        "RUN_RAW_CRISPR_NETWORKS": "run_raw_crispr_networks",
        "RUN_SMOOTH_CRISPR": "run_smooth_crispr",
        "RUN_XSTAR": "run_xstar",
        "PREVALENCE": "prevalence",
        "KEEP_COLUMN": "keep_column",
        "BACTERIA_TAXONOMY_RANK": "bacteria_taxonomy_rank",
        "BACTERIAL_RANKS": "bacterial_ranks",
        "BACTERIAL_WEIGHTS": "bacterial_weights",
        "CRISPR_SMOOTH_ALPHA": "crispr_smooth_alpha",
        "TRANSPOSE_RAW_CRISPR_AFTER_LOAD": "transpose_raw_crispr_after_load",
        "PSEUDOCOUNT": "pseudocount",
        "LAM": "lam",
        "N_STEPS": "n_steps",
        "PRESERVE_SCALE": "preserve_scale",
    }
    _LEGACY_DROP = {"STUDY"}

    # YAML doesn't interpret Python f-string syntax: a value like
    # ``f"{dada2_folder}/OTU_table_F.csv"`` is loaded as a literal string and
    # would crash downstream. Detect those and treat them as empty so the
    # ``__post_init__`` auto-derivation kicks in.
    _F_STRING_RE = re.compile(r"^\s*f['\"].*['\"]\s*$")

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> "CapelliniConfig":
        """Load config from a plain Python dict.

        - Old UPPER_CASE network keys are auto-translated to the new
          lowercase names.
        - ``STUDY`` is silently dropped.
        - Keys that do not correspond to a dataclass field are ignored.
        - YAML values that look like un-interpolated Python f-strings
          (``f"{dada2_folder}/..."``) are normalised to empty so the
          path-derivation logic in ``__post_init__`` can fill them in; a
          notice is printed for each such value.

        Args:
            d: Mapping of setting names to values.

        Returns:
            CapelliniConfig built from the recognised entries of ``d``.

        Raises:
            TypeError: If ``d`` is not a mapping (for example a YAML file
                whose top level is a list or a scalar).
        """
        from collections.abc import Mapping

        if not isinstance(d, Mapping):
            raise TypeError(
                "CapelliniConfig.from_dict expects a mapping of settings, "
                f"got {type(d).__name__}"
            )

        known = set(cls.__dataclass_fields__)  # type: ignore[attr-defined]
        translated: dict[str, Any] = {}
        for k, v in d.items():
            if k in cls._LEGACY_DROP:
                continue
            new_k = cls._LEGACY_KEY_MAP.get(k, k)
            if new_k not in known:
                continue
            if isinstance(v, str) and cls._F_STRING_RE.match(v):
                print(
                    f"[capellini] config: '{new_k}' looks like an "
                    f"un-interpolated Python f-string ({v!r}); ignoring it "
                    f"and letting the package auto-derive the path."
                )
                v = ""
            translated[new_k] = v
        return cls(**translated)

    @classmethod
    def from_yaml(cls, path: str | Path) -> "CapelliniConfig":
        """Load config from a YAML file.

        Args:
            path: Path to the YAML configuration file.

        Returns:
            CapelliniConfig populated from the YAML. An empty file yields
            the same result as ``from_dict({})``.

        Raises:
            TypeError: If the top level of the YAML document is not a mapping.
        """
        # Read as UTF-8 explicitly so the result does not depend on the
        # platform's default locale encoding.
        with open(path, "r", encoding="utf-8") as fh:
            data = yaml.safe_load(fh) or {}
        return cls.from_dict(data)

    def to_yaml(self, path: str | Path) -> None:
        """Serialize config to a YAML file.

        Parent directories are created if missing.

        Args:
            path: Destination YAML file path.
        """
        import dataclasses

        data = dataclasses.asdict(self)
        Path(path).parent.mkdir(parents=True, exist_ok=True)
        # ``allow_unicode=True`` emits non-ASCII characters verbatim, so the
        # file handle must be UTF-8 or writing can fail on other locales.
        with open(path, "w", encoding="utf-8") as fh:
            yaml.dump(
                data,
                fh,
                default_flow_style=False,
                allow_unicode=True,
                sort_keys=False,
            )

    def virus_fasta_path(self) -> Path:
        """Return ``input_fasta_folder / virus_fasta_name`` as a ``Path``.

        The path is joined as-is; it is not resolved or made absolute.
        """
        return Path(self.input_fasta_folder) / self.virus_fasta_name