"""I/O helpers: file reading, writing, downloading, subprocess execution."""
from __future__ import annotations
import gzip
import logging
import shutil
import subprocess
import urllib.request
from pathlib import Path
from typing import IO
logger = logging.getLogger(__name__)
[docs]
def download_if_missing(url: str, dest: str | Path, *, label: str | None = None) -> Path:
    """Download ``url`` to ``dest`` if ``dest`` is missing.

    Mirrors the simple ``urllib.request.urlretrieve(url, dest)`` pattern used
    in the notebook checkpoints — no automatic decompression.

    The payload is first written to a sibling ``<dest name>.part`` file and
    only moved onto ``dest`` once the transfer has finished. A failed or
    interrupted download therefore never leaves a truncated ``dest`` behind
    that a later call would mistake for a complete file.

    Args:
        url: Source URL. Empty ⇒ raises ValueError when the file is missing.
        dest: Destination path. Parent directories are created if needed.
        label: Optional human-readable label used in the printed messages.

    Returns:
        Path to the downloaded (or already-present) file.

    Raises:
        ValueError: If ``dest`` is missing and ``url`` is empty.
    """
    dest = Path(dest)
    name = label or dest.name
    if dest.exists():
        print(f"{name} found - skipping download")
        return dest
    if not url:
        raise ValueError(
            f"Cannot download {name}: no URL configured "
            f"(target path: {dest})"
        )
    dest.parent.mkdir(parents=True, exist_ok=True)
    print(f" • Download of {name} - this could take a while")

    # Stage the download next to the destination so the final move stays on
    # the same filesystem.
    partial = dest.with_name(dest.name + ".part")
    try:
        urllib.request.urlretrieve(url, str(partial))
        partial.replace(dest)
    except BaseException:
        # Also covers KeyboardInterrupt: drop the incomplete file, then let
        # the original exception propagate unchanged.
        partial.unlink(missing_ok=True)
        raise
    return dest
[docs]
def open_maybe_gzip(path: str | Path, mode: str = "rt", encoding: str = "utf-8") -> IO:
    """Open a file, transparently (de)compressing if it ends with .gz.

    Text modes decode with ``encoding`` and replace undecodable bytes
    (``errors="replace"``). Binary modes (any mode containing ``"b"``) are
    opened without ``encoding``/``errors``, since neither ``open`` nor
    ``gzip.open`` accepts them in binary mode.

    Args:
        path: Path to the file. A ``.gz`` suffix (case-insensitive) selects
            gzip handling.
        mode: File open mode. Modes without ``"b"`` are treated as text for
            both plain and gzip files, so ``"r"`` behaves like ``"rt"``.
        encoding: Text encoding; ignored in binary mode.

    Returns:
        File-like object.
    """
    p = Path(path)
    is_gzip = p.suffix.lower() == ".gz"

    if "b" in mode:
        # Binary mode takes no text-layer arguments.
        return gzip.open(p, mode) if is_gzip else open(p, mode)

    if is_gzip:
        # gzip.open treats a mode without "t" as binary, unlike builtin open,
        # so make the text intent explicit.
        text_mode = mode if "t" in mode else mode + "t"
        return gzip.open(p, text_mode, encoding=encoding, errors="replace")
    return open(p, mode, encoding=encoding, errors="replace")
[docs]
def read_table(path: str | Path, index_col: int = 0, **kwargs):
"""Read a CSV, TSV, or Excel file into a DataFrame, including gzip-compressed variants.
Args:
path: Path to the tabular file.
index_col: Column to use as the row index.
**kwargs: Additional keyword arguments forwarded to pandas.
Returns:
pd.DataFrame with the file contents.
"""
import pandas as pd
path = Path(path)
suffixes = "".join(path.suffixes).lower()
if suffixes.endswith(".xlsx") or suffixes.endswith(".xls"):
return pd.read_excel(path, index_col=index_col, **kwargs)
if suffixes.endswith(".tsv") or suffixes.endswith(".tsv.gz"):
return pd.read_csv(path, sep="\t", index_col=index_col, **kwargs)
if suffixes.endswith(".csv") or suffixes.endswith(".csv.gz") or suffixes.endswith(".gz"):
return pd.read_csv(path, index_col=index_col, **kwargs)
raise ValueError(f"Unsupported file type: {path}")
[docs]
def write_df(df, path: str | Path, *, overwrite: bool = True, verbose: bool = False, **to_csv_kwargs) -> Path:
"""Write a DataFrame to CSV, creating parent directories as needed.
Args:
df: DataFrame to write.
path: Destination file path.
overwrite: Skip if file exists and overwrite is False.
verbose: Print a message on skip or save.
**to_csv_kwargs: Forwarded to DataFrame.to_csv.
Returns:
Path to the written file.
"""
path = Path(path)
path.parent.mkdir(parents=True, exist_ok=True)
if path.exists() and not overwrite:
if verbose:
print("skip existing:", path)
return path
df.to_csv(path, **to_csv_kwargs)
if verbose:
print("saved:", path, getattr(df, "shape", ""))
return path
[docs]
def sh(cmd: str, desc: str = "") -> subprocess.CompletedProcess:
    """Execute ``cmd`` through the shell and echo what happened.

    The optional description and the command itself are printed first. The
    command's output is captured as text; on success, non-empty stdout and
    non-blank stderr are printed.

    Args:
        cmd: Shell command string (run with ``shell=True``).
        desc: Human-readable description printed before execution.

    Returns:
        CompletedProcess result.

    Raises:
        RuntimeError: If the command exits with a non-zero return code,
            including full stdout and stderr in the message. The original
            ``CalledProcessError`` is attached as the cause.
    """
    if desc:
        print(desc)
    print(f"Executing command: {cmd}")

    try:
        result = subprocess.run(
            cmd,
            shell=True,
            capture_output=True,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as failure:
        raise RuntimeError(
            f"Command failed with code {failure.returncode}\n"
            f"STDOUT:\n{failure.stdout}\nSTDERR:\n{failure.stderr}"
        ) from failure

    captured_out, captured_err = result.stdout, result.stderr
    if captured_out:
        print(captured_out)
    # Tools often write progress to stderr even on success; show it if present.
    if captured_err.strip():
        print("STDERR (tool messages):\n" + captured_err)
    return result