Source code for capellini.config

"""Configuration dataclass for the CAPELLINI pipeline."""

from __future__ import annotations

import os
import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any

import yaml



[docs]
@dataclass
class CapelliniConfig:
    """Every setting of the CAPELLINI pipeline, grouped like the notebook Settings section.

    Each field carries a default, so ``CapelliniConfig()`` is always
    constructible; values are overridden through keyword arguments or through
    the ``from_dict`` / ``from_yaml`` constructors. Path fields that are left
    empty are filled in by ``__post_init__`` from ``base``, ``download_path``
    and the folder fields. A value supplied explicitly is never overwritten.
    """

    # ── Global ────────────────────────────────────────────────────────────────
    # ``base`` seeds the per-stage folders, ``download_path`` seeds the
    # reference file paths (see ``__post_init__``).
    base: str = ""
    download_path: str = ""

    # Per-stage folders; each one defaults to a fixed sub-directory of ``base``
    input_fasta_folder: str = ""
    dada2_folder: str = ""
    mmseq_folder: str = ""
    sp_folder: str = ""
    procs_folder: str = ""
    enhanced_networks_folder: str = ""

    # Reference paths (the first four default to files inside ``download_path``)
    silva_ref_path: str = ""
    silva_taxmap_path: str = ""
    full_ncbi_taxonomy_path: str = ""
    ncbi_accessory_path: str = ""
    virus_fasta_name: str = ""
    metadata_path: str = ""
    bacterial_raw_fasta_folder: str = ""

    species_level: bool = False
    fresh_start: bool = False
    ref_removal: bool = True

    # Force-regenerate bundled references (off by default — bundled files are reused)
    regenerate_16S_reference: bool = False
    regenerate_spacers_collection: bool = False

    # External download URLs (fixed; mirror the Resources block of the notebook)
    # SILVA 138.1
    silva_ref_url: str = (
        "https://zenodo.org/records/4587955/files/"
        "silva_nr99_v138.1_train_set.fa.gz"
    )
    silva_taxmap_url: str = (
        "https://www.arb-silva.de/fileadmin/silva_databases/release_138_1/"
        "Exports/taxonomy/tax_slv_ssu_138.1.txt.gz"
    )
    # NCBI taxonomy
    full_ncbi_taxonomy_url: str = ""  # filled in later when the asset is published
    # NCBI taxonomy accessory (taxdmp ZIP, names.dmp is extracted from it)
    ncbi_taxdmp_url: str = "https://ftp.ncbi.nih.gov/pub/taxonomy/taxdmp.zip"
    # proGenomes3
    genes_reference_url: str = (
        "http://progenomes3.embl.de/data/repGenomes/"
        "progenomes3.genes.representatives.fasta.bz2"
    )
    bacContigs_reference_url: str = (
        "http://progenomes3.embl.de/data/repGenomes/"
        "progenomes3.contigs.representatives.fasta.bz2"
    )
    protein_reference_url: str = (
        "http://progenomes3.embl.de/data/repGenomes/"
        "progenomes3.proteins.representatives.fasta.bz2"
    )

    # ── DADA2 ─────────────────────────────────────────────────────────────────
    # ``direction`` selects the F / R / P suffix of the derived DADA2 tables.
    direction: str = "forward"
    bacteria_fasta_name: str = "16S_DADA2_bacteria.fasta"
    fasta_generation: bool = True

    # ── MMSeqs2 ───────────────────────────────────────────────────────────────
    isolate_ref_16S: bool = True
    mapping_saving: bool = True
    min_bitscore: int = 50
    max_matches: int = 20
    add_taxonomy: bool = True
    extend_taxonomy: bool = True

    # ── SpacePHARER ───────────────────────────────────────────────────────────
    min_n_spacers: int = 3
    min_length: int = 23
    max_length: int = 47
    fdr: float = 0.05
    keep_spacers_collection: bool = True
    remove_decomp_fasta: bool = True

    # ── ProCs ─────────────────────────────────────────────────────────────────
    # The two paths default to sub-directories of ``procs_folder``.
    proteins_extraction_path: str = ""
    clustering_path: str = ""
    matrix_type: str = "count"
    save_single_bacgenome_collection: bool = False
    keep_coords: bool = False
    filter_1bac_1vir: bool = False
    remove_collections: bool = False
    batch_size: int = 1500

    # ── Network ───────────────────────────────────────────────────────────────
    # ``output_root`` defaults to ``enhanced_networks_folder``.
    output_root: str = ""
    overwrite: bool = False
    verbose: bool = True
    run_common_abundance: bool = True
    run_shrinkage_correlations: bool = True
    run_raw_crispr_networks: bool = True
    run_smooth_crispr: bool = True
    run_xstar: bool = True
    prevalence: float = 0.10
    keep_column: str = "keep_for_analysis"
    bacteria_taxonomy_rank: str = "target_taxids"
    # Mutable defaults go through ``default_factory`` so instances never share a list
    bacterial_ranks: list = field(
        default_factory=lambda: ["Phylum", "Class", "Order", "Family", "progenomes_taxid_genus"]
    )
    bacterial_weights: list = field(default_factory=lambda: [1, 2, 3, 6, 8])
    crispr_smooth_alpha: float = 0.95
    transpose_raw_crispr_after_load: bool = True
    pseudocount: float = 1e-6
    lam: float = 0.5
    n_steps: int = 1
    preserve_scale: bool = False

    # Network input file paths (several are derived from the DADA2 / SpacePHARER folders)
    virus_abundance_raw: str = ""
    bacteria_otu: str = ""
    bacteria_taxonomy: str = ""
    phage_host_predictions: str = ""
    tax_bac_for_smoothing: str = ""
    tax_vir: str = ""
    viral_ranks: list = field(
        default_factory=lambda: ["lev8", "lev7", "lev6", "lev5", "lev4", "lev3", "lev2", "lev1", "lev0"]
    )
    viral_weights: list = field(default_factory=lambda: [1, 1, 2, 3, 4, 6, 8, 10, 12])
    aggregate_viral_rank: str = "lev0"

    def __post_init__(self) -> None:
        """Derive every path field that is still empty after ``__init__``.

        Derivation runs in dependency order: stage folders from ``base``,
        reference files from ``download_path``, then the ProCs, network and
        DADA2 / SpacePHARER file paths from the (possibly just derived)
        folders. A group is skipped entirely when its source value is empty.
        """

        def fill_missing(targets, build) -> None:
            # ``build`` is only invoked for fields that are still falsy, so
            # explicitly provided values are left alone.
            for attr, piece in targets:
                if not getattr(self, attr):
                    setattr(self, attr, build(piece))

        if self.base:
            fill_missing(
                (
                    ("input_fasta_folder", "/Inputs/Fasta Collection"),
                    ("dada2_folder", "/DADA2 output"),
                    ("mmseq_folder", "/MMSeqs2 Output"),
                    ("sp_folder", "/SpacePHARER output"),
                    ("procs_folder", "/Procs Estimations"),
                    ("enhanced_networks_folder", "/Enhanced Networks"),
                ),
                lambda subdir: self.base + subdir,
            )

        if self.download_path:
            fill_missing(
                (
                    ("full_ncbi_taxonomy_path", "ncbi_taxonomy_all_taxids.tsv"),
                    ("ncbi_accessory_path", "names.dmp"),
                    ("silva_ref_path", "silva_nr99_v138.1_train_set.fa.gz"),
                    ("silva_taxmap_path", "tax_slv_ssu_138.1.txt"),
                ),
                lambda filename: os.path.join(self.download_path, filename),
            )

        if self.procs_folder:
            fill_missing(
                (
                    ("proteins_extraction_path", "/Targets Proteins Extraction/"),
                    ("clustering_path", "/Clustering"),
                ),
                lambda tail: self.procs_folder + tail,
            )

        if not self.output_root and self.enhanced_networks_folder:
            self.output_root = self.enhanced_networks_folder

        if self.dada2_folder:
            # Any direction other than the three known ones falls back to "F"
            letter = {"forward": "F", "reverse": "R", "paired": "P"}.get(self.direction, "F")
            fill_missing(
                (
                    ("bacteria_taxonomy", "taxonomy_table_{}_progenomeLikeNCBIIDs.csv"),
                    ("bacteria_otu", "OTU_table_{}.csv"),
                    ("tax_bac_for_smoothing", "taxonomy_table_{}_progenomeLikeNCBIIDs.csv"),
                ),
                lambda template: f"{self.dada2_folder}/{template.format(letter)}",
            )

        if self.sp_folder and not self.phage_host_predictions:
            self.phage_host_predictions = f"{self.sp_folder}/output/phage_host_predictions.tsv"

    # ── Class methods ─────────────────────────────────────────────────────────


[docs]
    @classmethod
    def default(cls) -> "CapelliniConfig":
        """Return a config with all default values (paths will be empty until base is set)."""
        return cls()


    # Legacy → current key map for the network section. Old YAMLs that still
    # use UPPER_CASE keys keep working transparently: ``from_dict`` looks each
    # incoming key up here and keeps the key unchanged when it is not listed.
    # Every legacy name below is the upper-cased form of its replacement.
    # Keys named in ``_LEGACY_DROP`` (currently only ``STUDY``) are silently
    # discarded (no per-study subfolder anymore).
    # Neither name carries a type annotation, so ``@dataclass`` leaves both as
    # plain class attributes instead of turning them into config fields.
    _LEGACY_KEY_MAP = {
        "OUTPUT_ROOT": "output_root",
        "OVERWRITE": "overwrite",
        "VERBOSE": "verbose",
        "RUN_COMMON_ABUNDANCE": "run_common_abundance",
        "RUN_SHRINKAGE_CORRELATIONS": "run_shrinkage_correlations",
        "RUN_RAW_CRISPR_NETWORKS": "run_raw_crispr_networks",
        "RUN_SMOOTH_CRISPR": "run_smooth_crispr",
        "RUN_XSTAR": "run_xstar",
        "PREVALENCE": "prevalence",
        "KEEP_COLUMN": "keep_column",
        "BACTERIA_TAXONOMY_RANK": "bacteria_taxonomy_rank",
        "BACTERIAL_RANKS": "bacterial_ranks",
        "BACTERIAL_WEIGHTS": "bacterial_weights",
        "CRISPR_SMOOTH_ALPHA": "crispr_smooth_alpha",
        "TRANSPOSE_RAW_CRISPR_AFTER_LOAD": "transpose_raw_crispr_after_load",
        "PSEUDOCOUNT": "pseudocount",
        "LAM": "lam",
        "N_STEPS": "n_steps",
        "PRESERVE_SCALE": "preserve_scale",
    }
    _LEGACY_DROP = {"STUDY"}

    # YAML doesn't interpret Python f-string syntax: a value like
    # ``f"{dada2_folder}/OTU_table_F.csv"`` is loaded as a literal string and
    # would crash downstream. Detect those and treat them as empty so the
    # ``__post_init__`` auto-derivation kicks in.
    # The pattern accepts: optional whitespace, a lowercase ``f``, a single or
    # double quote, any run of non-newline characters, a single or double
    # quote, optional whitespace. The opening and closing quote are not
    # required to be of the same kind, and an upper-case ``F`` prefix is not
    # recognised.
    _F_STRING_RE = re.compile(r"^\s*f['\"].*['\"]\s*$")


[docs]
    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> "CapelliniConfig":
        """Load config from a plain Python dict.

        - Old UPPER_CASE network keys are auto-translated to the new
          lowercase names.
        - ``STUDY`` is silently dropped.
        - YAML values that look like un-interpolated Python f-strings
          (``f"{dada2_folder}/..."``) are normalised to empty so the
          path-derivation logic in ``__post_init__`` can fill them in.
        """
        known = {f.name for f in cls.__dataclass_fields__.values()}  # type: ignore[attr-defined]
        translated: dict[str, Any] = {}
        for k, v in d.items():
            if k in cls._LEGACY_DROP:
                continue
            new_k = cls._LEGACY_KEY_MAP.get(k, k)
            if new_k not in known:
                continue
            if isinstance(v, str) and cls._F_STRING_RE.match(v):
                print(
                    f"[capellini] config: '{new_k}' looks like an "
                    f"un-interpolated Python f-string ({v!r}); ignoring it "
                    f"and letting the package auto-derive the path."
                )
                v = ""
            translated[new_k] = v
        return cls(**translated)



[docs]
    @classmethod
    def from_yaml(cls, path: str | Path) -> "CapelliniConfig":
        """Load config from a YAML file.

        The file is read as UTF-8 regardless of the platform's default
        encoding. An empty document produces a config made of defaults.

        Args:
            path: Path to the YAML configuration file.

        Returns:
            CapelliniConfig populated from the YAML.

        Raises:
            TypeError: If the top-level YAML node is not a mapping.
        """
        # Explicit encoding: the locale default is not UTF-8 on every platform.
        with open(path, "r", encoding="utf-8") as fh:
            data = yaml.safe_load(fh) or {}
        if not isinstance(data, dict):
            # Fail here with a clear message instead of deep inside from_dict
            raise TypeError(
                f"{path}: expected a YAML mapping at the top level, "
                f"got {type(data).__name__}"
            )
        return cls.from_dict(data)



[docs]
    def to_yaml(self, path: str | Path) -> None:
        """Serialize config to a YAML file.

        Every field is written, in declaration order, as block-style YAML.
        Missing parent directories are created. The file is written as UTF-8.

        Args:
            path: Destination YAML file path.
        """
        import dataclasses

        data = dataclasses.asdict(self)
        target = Path(path)
        target.parent.mkdir(parents=True, exist_ok=True)
        # allow_unicode=True emits non-ASCII characters verbatim, so the file
        # must be opened as UTF-8 rather than with the locale's default codec.
        with open(target, "w", encoding="utf-8") as fh:
            yaml.dump(data, fh, default_flow_style=False, allow_unicode=True, sort_keys=False)



[docs]
    def virus_fasta_path(self) -> Path:
        """Resolved absolute path to the virus FASTA file."""
        return Path(self.input_fasta_folder) / self.virus_fasta_name