# Source code for capellini.config

"""Configuration dataclass for the CAPELLINI pipeline."""

from __future__ import annotations

import os
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any

import yaml


[docs] @dataclass class CapelliniConfig: """All settings for the CAPELLINI pipeline, mirroring the notebook Settings section. Required fields (no defaults) must be provided explicitly or via from_yaml/from_dict. Derived path fields are computed in __post_init__ when left as empty strings. """ # ── Global ──────────────────────────────────────────────────────────────── base: str = "" download_path: str = "" # Derived folder paths (computed from base in __post_init__ if empty) input_fasta_folder: str = "" dada2_folder: str = "" mmseq_folder: str = "" sp_folder: str = "" procs_folder: str = "" enhanced_networks_folder: str = "" # Reference paths silva_ref_path: str = "" silva_taxmap_path: str = "" full_ncbi_taxonomy_path: str = "" virus_fasta_name: str = "" metadata_path: str = "" bacterial_raw_fasta_folder: str = "" species_level: bool = False fresh_start: bool = False ref_removal: bool = True # Force-regenerate bundled references (off by default — bundled files are reused) regenerate_16S_reference: bool = False regenerate_spacers_collection: bool = False # External download URLs (fixed) genes_reference_url: str = ( "http://progenomes3.embl.de/data/repGenomes/" "progenomes3.genes.representatives.fasta.bz2" ) bacContigs_reference_url: str = ( "http://progenomes3.embl.de/data/repGenomes/" "progenomes3.contigs.representatives.fasta.bz2" ) protein_reference_url: str = ( "http://progenomes3.embl.de/data/repGenomes/" "progenomes3.proteins.representatives.fasta.bz2" ) # ── DADA2 ───────────────────────────────────────────────────────────────── direction: str = "forward" bacteria_fasta_name: str = "16S_DADA2_bacteria.fasta" fasta_generation: bool = True # ── MMSeqs2 ─────────────────────────────────────────────────────────────── isolate_ref_16S: bool = True mapping_saving: bool = True min_bitscore: int = 50 max_matches: int = 20 add_taxonomy: bool = True extend_taxonomy: bool = True # ── SpacePHARER ─────────────────────────────────────────────────────────── min_n_spacers: 
int = 3 min_length: int = 23 max_length: int = 47 fdr: float = 0.05 keep_spacers_collection: bool = True remove_decomp_fasta: bool = True # ── ProCs ───────────────────────────────────────────────────────────────── proteins_extraction_path: str = "" clustering_path: str = "" matrix_type: str = "count" save_single_bacgenome_collection: bool = False keep_coords: bool = False filter_1bac_1vir: bool = False remove_collections: bool = False batch_size: int = 1500 # ── Network ─────────────────────────────────────────────────────────────── OUTPUT_ROOT: str = "" OVERWRITE: bool = False VERBOSE: bool = True RUN_COMMON_ABUNDANCE: bool = True RUN_SHRINKAGE_CORRELATIONS: bool = True RUN_RAW_CRISPR_NETWORKS: bool = True RUN_SMOOTH_CRISPR: bool = True RUN_XSTAR: bool = True PREVALENCE: float = 0.10 KEEP_COLUMN: str = "keep_for_analysis" BACTERIA_TAXONOMY_RANK: str = "target_taxids" BACTERIAL_RANKS: list = field( default_factory=lambda: ["Phylum", "Class", "Order", "Family", "progenomes_taxid_genus"] ) BACTERIAL_WEIGHTS: list = field(default_factory=lambda: [1, 2, 3, 6, 8]) CRISPR_SMOOTH_ALPHA: float = 0.95 TRANSPOSE_RAW_CRISPR_AFTER_LOAD: bool = True PSEUDOCOUNT: float = 1e-6 LAM: float = 0.5 N_STEPS: int = 1 PRESERVE_SCALE: bool = False STUDY: str = "default" # Network input file paths virus_abundance_raw: str = "" bacteria_otu: str = "" bacteria_taxonomy: str = "" phage_host_predictions: str = "" tax_bac_for_smoothing: str = "" tax_vir: str = "" viral_ranks: list = field( default_factory=lambda: ["lev8", "lev7", "lev6", "lev5", "lev4", "lev3", "lev2", "lev1", "lev0"] ) viral_weights: list = field(default_factory=lambda: [1, 1, 2, 3, 4, 6, 8, 10, 12]) aggregate_viral_rank: str = "lev0" def __post_init__(self) -> None: if self.base: if not self.input_fasta_folder: self.input_fasta_folder = self.base + "/Inputs/Fasta Collection" if not self.dada2_folder: self.dada2_folder = self.base + "/DADA2 output" if not self.mmseq_folder: self.mmseq_folder = self.base + "/MMSeqs2 Output" if 
not self.sp_folder: self.sp_folder = self.base + "/SpacePHARER output" if not self.procs_folder: self.procs_folder = self.base + "/Procs Estimations" if not self.enhanced_networks_folder: self.enhanced_networks_folder = self.base + "/Enhanced Networks" if self.download_path: if not self.full_ncbi_taxonomy_path: self.full_ncbi_taxonomy_path = os.path.join(self.download_path, "names.dmp") if self.procs_folder: if not self.proteins_extraction_path: self.proteins_extraction_path = self.procs_folder + "/Targets Proteins Extraction/" if not self.clustering_path: self.clustering_path = self.procs_folder + "/Clustering" if self.enhanced_networks_folder and not self.OUTPUT_ROOT: self.OUTPUT_ROOT = self.enhanced_networks_folder if self.dada2_folder: suffix = {"forward": "F", "reverse": "R", "paired": "P"}.get(self.direction, "F") if not self.bacteria_taxonomy: self.bacteria_taxonomy = ( f"{self.dada2_folder}/taxonomy_table_{suffix}_progenomeLikeNCBIIDs.csv" ) if not self.bacteria_otu: self.bacteria_otu = f"{self.dada2_folder}/OTU_table_{suffix}.csv" if self.sp_folder and not self.phage_host_predictions: self.phage_host_predictions = f"{self.sp_folder}/output/phage_host_predictions.tsv" if self.dada2_folder and not self.tax_bac_for_smoothing: suffix = {"forward": "F", "reverse": "R", "paired": "P"}.get(self.direction, "F") self.tax_bac_for_smoothing = ( f"{self.dada2_folder}/taxonomy_table_{suffix}_progenomeLikeNCBIIDs.csv" ) # ── Class methods ─────────────────────────────────────────────────────────
[docs] @classmethod def default(cls) -> "CapelliniConfig": """Return a config with all default values (paths will be empty until base is set).""" return cls()
[docs] @classmethod def from_dict(cls, d: dict[str, Any]) -> "CapelliniConfig": """Load config from a plain Python dict.""" known = {f.name for f in cls.__dataclass_fields__.values()} # type: ignore[attr-defined] filtered = {k: v for k, v in d.items() if k in known} return cls(**filtered)
[docs] @classmethod def from_yaml(cls, path: str | Path) -> "CapelliniConfig": """Load config from a YAML file. Args: path: Path to the YAML configuration file. Returns: CapelliniConfig populated from the YAML. """ with open(path, "r") as fh: data = yaml.safe_load(fh) or {} return cls.from_dict(data)
[docs] def to_yaml(self, path: str | Path) -> None: """Serialize config to a YAML file. Args: path: Destination YAML file path. """ import dataclasses data = dataclasses.asdict(self) Path(path).parent.mkdir(parents=True, exist_ok=True) with open(path, "w") as fh: yaml.dump(data, fh, default_flow_style=False, allow_unicode=True, sort_keys=False)
[docs] def virus_fasta_path(self) -> Path: """Resolved absolute path to the virus FASTA file.""" return Path(self.input_fasta_folder) / self.virus_fasta_name