Source code for capellini.utils.io

"""I/O helpers: file reading, writing, downloading, subprocess execution."""

from __future__ import annotations

import gzip
import logging
import shutil
import subprocess
import urllib.request
from pathlib import Path
from typing import IO

logger = logging.getLogger(__name__)


def download_if_missing(url: str, dest: str | Path, *, label: str | None = None) -> Path:
    """Download ``url`` to ``dest`` if ``dest`` is missing.

    Mirrors the simple ``urllib.request.urlretrieve(url, dest)`` pattern used
    in the notebook checkpoints — no automatic decompression.

    The payload is written to a sibling ``<dest name>.part`` file and renamed
    to ``dest`` only after the transfer finished. An interrupted or failed
    download therefore never leaves a truncated ``dest`` behind that a later
    call would mistake for a complete file and skip.

    Args:
        url: Source URL. Empty ⇒ raises ValueError when the file is missing.
        dest: Destination path. Parent directories are created if needed.
        label: Optional human-readable label used in log/print messages.

    Returns:
        Path to the downloaded (or already-present) file.

    Raises:
        ValueError: If ``dest`` does not exist and ``url`` is empty.
    """
    dest = Path(dest)
    name = label or dest.name

    if dest.exists():
        print(f"{name} found - skipping download")
        return dest

    if not url:
        raise ValueError(
            f"Cannot download {name}: no URL configured "
            f"(target path: {dest})"
        )

    dest.parent.mkdir(parents=True, exist_ok=True)
    print(f" • Download of {name} - this could take a while")

    # Stage the transfer next to the target so the final rename stays on the
    # same filesystem.
    partial = dest.with_name(dest.name + ".part")
    try:
        urllib.request.urlretrieve(url, str(partial))
        partial.replace(dest)
    except BaseException:
        # Also covers KeyboardInterrupt: drop the incomplete file, then let
        # the original exception propagate unchanged.
        if partial.exists():
            partial.unlink()
        raise
    return dest
def open_maybe_gzip(path: str | Path, mode: str = "rt", encoding: str = "utf-8") -> IO:
    """Open a file, transparently decompressing if it ends with .gz.

    Text and binary modes behave the same for plain and gzip files:

    * A mode containing ``"b"`` opens the file in binary mode; ``encoding``
      is ignored because neither ``open`` nor ``gzip.open`` accepts it there.
    * Any other mode is treated as text. ``"t"`` is appended when missing,
      because ``gzip.open`` interprets a bare ``"r"``/``"w"``/``"a"`` as
      binary whereas the built-in ``open`` interprets it as text.

    Args:
        path: Path to the file.
        mode: File open mode.
        encoding: Text encoding (text modes only).

    Returns:
        File-like object. Text streams are opened with ``errors="replace"``.
    """
    p = Path(path)
    opener = gzip.open if p.suffix == ".gz" else open

    if "b" in mode:
        # Binary streams take no text-layer arguments.
        return opener(p, mode)

    if "t" not in mode:
        # Make text mode explicit so gzip.open agrees with the built-in open.
        mode += "t"
    return opener(p, mode, encoding=encoding, errors="replace")
def read_table(path: str | Path, index_col: int = 0, **kwargs):
    """Load a tabular file (CSV, TSV or Excel, optionally gzipped) as a DataFrame.

    The reader is picked from the lower-cased, concatenated file suffixes:

    * ``.xlsx`` / ``.xls``            -> ``pandas.read_excel``
    * ``.tsv`` / ``.tsv.gz``          -> ``pandas.read_csv`` with ``sep="\\t"``
    * ``.csv`` / ``.csv.gz`` / ``.gz`` -> ``pandas.read_csv``

    Args:
        path: Path to the tabular file.
        index_col: Column to use as the row index.
        **kwargs: Additional keyword arguments forwarded to pandas.

    Returns:
        pd.DataFrame with the file contents.

    Raises:
        ValueError: If the suffix matches none of the supported types.
    """
    # Imported lazily so the module can be imported without pandas.
    import pandas as pd

    path = Path(path)
    joined_suffixes = "".join(path.suffixes).lower()

    # Order matters: Excel first, then TSV, then the CSV / bare-gzip fallback.
    if joined_suffixes.endswith((".xlsx", ".xls")):
        reader, reader_args = pd.read_excel, {}
    elif joined_suffixes.endswith((".tsv", ".tsv.gz")):
        reader, reader_args = pd.read_csv, {"sep": "\t"}
    elif joined_suffixes.endswith((".csv", ".csv.gz", ".gz")):
        reader, reader_args = pd.read_csv, {}
    else:
        raise ValueError(f"Unsupported file type: {path}")

    return reader(path, **reader_args, index_col=index_col, **kwargs)
def write_df(df, path: str | Path, *, overwrite: bool = True, verbose: bool = False, **to_csv_kwargs) -> Path:
    """Save a DataFrame as CSV, making sure the target directory exists.

    Args:
        df: DataFrame to write.
        path: Destination file path.
        overwrite: When False, an already-existing file is left untouched.
        verbose: Print a message on skip or save.
        **to_csv_kwargs: Forwarded to DataFrame.to_csv.

    Returns:
        Path to the destination file (whether it was written or skipped).
    """
    path = Path(path)
    # The parent directory is created even when the write ends up skipped.
    path.parent.mkdir(parents=True, exist_ok=True)

    should_write = overwrite or not path.exists()
    if should_write:
        df.to_csv(path, **to_csv_kwargs)
        if verbose:
            print("saved:", path, getattr(df, "shape", ""))
    elif verbose:
        print("skip existing:", path)

    return path
def sh(cmd: str, desc: str = "") -> subprocess.CompletedProcess:
    """Execute ``cmd`` through the shell, echoing it and its output.

    The optional description and the command line are printed first. After a
    successful run, non-empty stdout is printed, and stderr is printed under
    a "tool messages" header when it contains more than whitespace.

    Args:
        cmd: Shell command string.
        desc: Human-readable description printed before execution.

    Returns:
        CompletedProcess result with captured text stdout/stderr.

    Raises:
        RuntimeError: If the command exits with a non-zero return code; the
            message includes the exit code and the full stdout and stderr.
            The original CalledProcessError is attached as the cause.
    """
    if desc:
        print(desc)
    print(f"Executing command: {cmd}")

    try:
        completed = subprocess.run(
            cmd,
            shell=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as e:
        raise RuntimeError(
            f"Command failed with code {e.returncode}\n"
            f"STDOUT:\n{e.stdout}\nSTDERR:\n{e.stderr}"
        ) from e

    captured_out, captured_err = completed.stdout, completed.stderr
    if captured_out:
        print(captured_out)
    if captured_err.strip():
        # Many tools log progress to stderr even on success.
        print("STDERR (tool messages):\n" + captured_err)
    return completed