Source code for idtrack._external_mappers._backend_pybiomart

#!/usr/bin/env python3
"""Ensembl BioMart backend for ID mapping.

This module provides the map_with_pybiomart() function for querying
Ensembl BioMart to convert biological identifiers. Supports historical
Ensembl releases via archive hosts.
"""

# Kemal Inecik
# k.inecik@gmail.com


from __future__ import annotations

import math
import re
import time
import typing as _t
from urllib.parse import urlparse

import pandas as pd
from tqdm import tqdm

from idtrack._external_mappers._constants import (
    _BM_ATTR_CANDIDATES,
    _BM_FILTER_CANDIDATES,
    _ENSEMBL_ARCHIVE_BY_RELEASE,
    _ENSEMBL_INPUT_DB,
    _ENSEMBL_SPECIAL_RELEASE_HOSTS,
)
from idtrack._external_mappers._utils import (
    _chunker,
    _empty_result,
    _ensure_all_inputs,
    _json,
    _suppress_stdout_stderr,
    _unique_not_null,
    canonical_db,
    canonical_species,
    logger,
    raise_missing_dependency,
    strip_version,
)



[docs]
def _ensembl_archive_host_for_release(
    release: int | str | None,
) -> str | None:
    """Resolve an Ensembl release or key to an archive host.

    Examples include an integer release (e.g. ``104``) or a special string key
    (e.g. ``"GRCh37"``) mapping to hosts like ``"may2021.archive.ensembl.org"``.

    Args:
        release: Ensembl release number or key (e.g. ``104``, ``"v104"``, ``"GRCh37"``), or ``None``.

    Returns:
        Archive host for the requested release, or ``None`` if unknown.
    """
    if release is None:
        return None

    # String releases can be things like "104", "v104", "GRCh37", ...
    if isinstance(release, str):
        s = release.strip()
        if not s:
            return None

        key = s.lower()
        # Special non‑numeric keys first
        if key in _ENSEMBL_SPECIAL_RELEASE_HOSTS:
            return _ENSEMBL_SPECIAL_RELEASE_HOSTS[key]

        # Strip a leading "v" or "r" if present (e.g. "v104")
        m = re.match(r"^[vr]?(\d+)$", key)
        if not m:
            return None
        try:
            rel_int = int(m.group(1))
        except ValueError:
            return None
    else:
        try:
            rel_int = int(release)
        except (TypeError, ValueError):
            return None

    return _ENSEMBL_ARCHIVE_BY_RELEASE.get(rel_int)




[docs]
def _biomart_dataset_for_species(species: str, explicit: str | None = None) -> str:
    """Return the Ensembl BioMart dataset name for the given species."""
    if explicit:
        return explicit
    s = canonical_species(species)
    return f"{s}_gene_ensembl"




[docs]
def _normalize_biomart_host(host: str | None) -> str:
    """Normalize an Ensembl BioMart host for pybiomart.

    Examples of valid outputs:
        "http://www.ensembl.org"
        "http://nov2020.archive.ensembl.org"
        "http://grch37.ensembl.org"

    Args:
        host: Hostname or URL (scheme optional). If ``None``, defaults to ``"http://www.ensembl.org"``.

    Returns:
        str: Normalized base URL suitable for pybiomart.
    """
    if not host:
        return "http://www.ensembl.org"

    host = host.strip()
    parsed = urlparse(host if "://" in host else "http://" + host)
    netloc = parsed.netloc or parsed.path.split("/")[0]
    return "http://" + netloc




[docs]
def _bm_list_attribute_names(ds) -> list[str]:
    """Return a list of attribute names for a pybiomart Dataset."""
    try:
        attrs = ds.list_attributes()
    except (AttributeError, TypeError, RuntimeError):
        # Different pybiomart versions may not have list_attributes()
        attrs = getattr(ds, "attributes", None)

    if attrs is None:
        return []

    try:
        # biomaRt-style DataFrame
        if hasattr(attrs, "columns"):
            if "name" in attrs.columns:
                return [str(x) for x in attrs["name"].tolist()]
            else:
                return [str(x) for x in attrs.iloc[:, 0].tolist()]
        # dict or list-like
        if isinstance(attrs, dict):
            return [str(k) for k in attrs.keys()]
        return [str(x) for x in list(attrs)]
    except (KeyError, IndexError, TypeError, AttributeError):
        try:
            return list(attrs)
        except (TypeError, ValueError):
            return []




[docs]
def _bm_list_filter_names(ds) -> list[str]:
    """Return a list of filter names for a pybiomart Dataset."""
    try:
        filts = ds.list_filters()
    except (AttributeError, TypeError, RuntimeError):
        # Different pybiomart versions may not have list_filters()
        filts = getattr(ds, "filters", None)

    if filts is None:
        return []

    try:
        if hasattr(filts, "columns"):
            if "name" in filts.columns:
                return [str(x) for x in filts["name"].tolist()]
            else:
                return [str(x) for x in filts.iloc[:, 0].tolist()]
        if isinstance(filts, dict):
            return [str(k) for k in filts.keys()]
        return [str(x) for x in list(filts)]
    except (KeyError, IndexError, TypeError, AttributeError):
        try:
            return list(filts)
        except (TypeError, ValueError):
            return []




[docs]
def _bm_pick_attribute(canonical_db_name: str, available_attrs: list[str]) -> str:
    """Choose a BioMart attribute name for a canonical DB.

    The helper first tries explicit candidates from :py:data:`_BM_ATTR_CANDIDATES` and
    falls back to fuzzy matching on common substrings.

    Args:
        canonical_db_name: Canonical database key (see :py:func:`~idtrack._external_mappers._utils.canonical_db`).
        available_attrs: Attribute names provided by the BioMart dataset.

    Returns:
        str: Selected attribute name.

    Raises:
        RuntimeError: If no compatible attribute is available on the dataset.
    """
    cdb = canonical_db(canonical_db_name)
    attrs = list(dict.fromkeys(available_attrs))  # dedupe, preserve order
    attr_set = set(attrs)

    # 1) Try explicit candidates in preferred order
    candidates = _BM_ATTR_CANDIDATES.get(cdb, [])
    for cand in candidates:
        if cand in attr_set:
            return cand

    # 2) Fuzzy fallback based on typical naming patterns
    if cdb.startswith("ensembl_"):
        pattern = "ensembl_" + cdb.split("_", 1)[1]
    elif cdb == "hgnc_symbol":
        pattern = "external_gene"
    elif cdb == "hgnc_id":
        pattern = "hgnc"
    elif cdb == "entrez_gene":
        pattern = "entrez"
    elif cdb == "uniprot":
        pattern = "uniprot"
    elif cdb.startswith("refseq_"):
        pattern = "refseq"
    else:
        pattern = cdb

    prefix_hits = [a for a in attrs if a.startswith(pattern)]
    if prefix_hits:
        return prefix_hits[0]

    contains_hits = [a for a in attrs if pattern in a]
    if contains_hits:
        return contains_hits[0]

    raise RuntimeError(
        f"pybiomart: dataset does not provide any attribute compatible with "
        f"{cdb!r}; inspect `dataset.list_attributes()` for valid names."
    )




[docs]
def _bm_pick_filter(
    canonical_db_name: str,
    attr_name: str,
    available_filters: list[str],
) -> str:
    """Choose a BioMart filter name.

    The selection depends on the canonical database and chosen attribute.

    Args:
        canonical_db_name: Canonical database key for the input IDs.
        attr_name: Attribute name chosen for the input IDs.
        available_filters: Filter names provided by the BioMart dataset.

    Returns:
        str: Selected filter name.

    Raises:
        RuntimeError: If no compatible filter is available on the dataset.
    """
    cdb = canonical_db(canonical_db_name)
    filt_list = list(dict.fromkeys(available_filters))  # dedupe, preserve order
    filt_set = set(filt_list)

    candidates: list[str] = []

    # 1) DB-specific candidates
    candidates.extend(_BM_FILTER_CANDIDATES.get(cdb, []))

    # 2) The attribute name itself
    candidates.append(attr_name)

    # 3) Some small variations
    if attr_name.endswith("_id"):
        candidates.append(attr_name[:-3])
    if attr_name.endswith("_accession"):
        candidates.append(attr_name[:-10])

    # Deduplicate candidates but preserve order
    seen: set[str] = set()
    ordered_candidates: list[str] = []
    for c in candidates:
        if c and c not in seen:
            seen.add(c)
            ordered_candidates.append(c)

    for c in ordered_candidates:
        if c in filt_set:
            return c

    # 4) Fuzzy search
    if cdb == "hgnc_symbol":
        pattern = "external_gene"
    elif cdb.startswith("ensembl_"):
        pattern = "ensembl_" + cdb.split("_", 1)[1]
    elif cdb == "entrez_gene":
        pattern = "entrez"
    elif cdb == "uniprot":
        pattern = "uniprot"
    elif cdb.startswith("refseq_"):
        pattern = "refseq"
    else:
        pattern = cdb

    prefix_hits = [f for f in filt_list if f.startswith(pattern)]
    if prefix_hits:
        return prefix_hits[0]

    contains_hits = [f for f in filt_list if pattern in f]
    if contains_hits:
        return contains_hits[0]

    raise RuntimeError(
        f"pybiomart: dataset does not provide any filter compatible with "
        f"{cdb!r}; inspect `dataset.list_filters()` for valid names."
    )




[docs]
def map_with_pybiomart(
    ids: _t.Iterable[str],
    input_db: str,
    output_db: str,
    *,
    species: str = "hsapiens",
    chunk_size: int = 1000,
    pause: float = 0.2,
    strip_versions: bool = True,
    release: str | int | None = None,
    show_progress: bool = True,
    suppress_method_verbosity: bool = True,
) -> pd.DataFrame:
    """Map identifiers using Ensembl BioMart via pybiomart.

    Note: BioMart can only filter by Ensembl IDs (gene, transcript, protein).
    Other ID types can be used as output_db but not input_db.

    Args:
        ids: Input Ensembl identifiers to map.
        input_db: Source database type. Must be one of ``"ensembl_gene"``, ``"ensembl_transcript"``,
            or ``"ensembl_protein"``.
        output_db: Target database type (e.g. ``"hgnc_symbol"``, ``"uniprot"``, ``"entrez_gene"``).
        species: Species code (e.g. ``"hsapiens"``, ``"mmusculus"``, ``"sscrofa"``).
        chunk_size: Number of IDs per BioMart query.
        pause: Pause in seconds between queries.
        strip_versions: Strip version suffixes from Ensembl/RefSeq IDs.
        release: Ensembl release number (e.g. ``104``) or special key (e.g. ``"grch37"``). If ``None``, uses
            the current Ensembl release.
        show_progress: Display progress bar.
        suppress_method_verbosity: Suppress stdout/stderr from pybiomart.

    Returns:
        pd.DataFrame: Standardized mapping DataFrame.

    Raises:
        RuntimeError: If the BioMart connection fails or required dataset metadata cannot be retrieved.
        ValueError: If ``input_db`` is not an Ensembl type.
    """
    try:
        from pybiomart import Dataset  # type: ignore
    except ImportError as e:
        raise_missing_dependency("pybiomart", feature="pybiomart ID mapping backend", original_error=e)

    inp = canonical_db(input_db)
    outp = canonical_db(output_db)

    # Early, explicit check: BioMart can only *filter* by Ensembl IDs.
    # It can still *return* HGNC/UniProt/etc. as attributes, but input_db
    # must be one of the Ensembl IDs.
    if inp not in _ENSEMBL_INPUT_DB:
        allowed_str = ", ".join(sorted(_ENSEMBL_INPUT_DB))
        raise ValueError(
            f"pybiomart input_db must be one of {{{allowed_str}}}, got {inp!r}. "
            "BioMart cannot filter by HGNC/UniProt directly; keep them as output_db "
            "or use method='mygene'/'gprofiler' for those inputs."
        )

    clean_ids = [strip_version(i) if strip_versions else str(i) for i in ids]
    uniq_ids = _unique_not_null(clean_ids)

    if not uniq_ids:
        return _ensure_all_inputs(_empty_result(), clean_ids, inp, outp, "pybiomart", release_used=None)

    # Resolve Ensembl archive host solely from `release` (if provided).
    raw_host: str | None = None

    resolved_release_host: str | None = None

    if release is not None:
        resolved_release_host = _ensembl_archive_host_for_release(release)
        if resolved_release_host:
            raw_host = resolved_release_host
        else:
            logger.warning(
                "pybiomart: no known archive host for Ensembl release %r; falling back to www.ensembl.org",
                release,
            )

    if raw_host is None:
        raw_host = "http://www.ensembl.org"

    host = _normalize_biomart_host(raw_host)

    dataset_name = _biomart_dataset_for_species(species)

    try:
        ds = Dataset(name=dataset_name, host=host)
    except Exception as e:
        raise RuntimeError(
            f"pybiomart: failed to connect to Ensembl BioMart " f"(dataset={dataset_name!r}, host={host!r}): {e}"
        ) from e

    # Discover attributes and filters that actually exist for this dataset
    attr_names = _bm_list_attribute_names(ds)
    filter_names = _bm_list_filter_names(ds)

    if not attr_names:
        raise RuntimeError(f"pybiomart: could not retrieve attributes for dataset {dataset_name!r}")
    if not filter_names:
        raise RuntimeError(f"pybiomart: could not retrieve filters for dataset {dataset_name!r}")

    # Choose valid attribute + filter names for the requested mapping
    in_attr = _bm_pick_attribute(inp, attr_names)
    out_attr = _bm_pick_attribute(outp, attr_names)
    filter_name = _bm_pick_filter(inp, in_attr, filter_names)

    logger.debug(
        "pybiomart: using dataset=%r host=%r in_attr=%r out_attr=%r filter=%r",
        dataset_name,
        host,
        in_attr,
        out_attr,
        filter_name,
    )

    frames: list[pd.DataFrame] = []
    n_chunks = math.ceil(len(uniq_ids) / chunk_size)
    with tqdm(
        total=len(uniq_ids),
        desc="pybiomart",
        mininterval=0.25,
        disable=not show_progress,
        ncols=100,
        unit="ids",
    ) as progress:
        for i, chunk in enumerate(_chunker(uniq_ids, chunk_size), start=1):
            logger.debug(
                "pybiomart: querying chunk %d/%d (n=%d)",
                i,
                n_chunks,
                len(chunk),
            )
            try:
                with _suppress_stdout_stderr(suppress_method_verbosity):
                    df = ds.query(
                        attributes=[in_attr, out_attr],
                        filters={filter_name: chunk},
                        use_attr_names=True,
                    )

                if df is None or df.empty:
                    frames.append(
                        pd.DataFrame(
                            {
                                "input_id": chunk,
                                "output_id": [None] * len(chunk),
                            }
                        )
                    )
                else:
                    df = df.rename(columns={in_attr: "input_id", out_attr: "output_id"})
                    keep_cols = [c for c in ("input_id", "output_id") if c in df.columns]
                    if not keep_cols:
                        frames.append(
                            pd.DataFrame(
                                {
                                    "input_id": chunk,
                                    "output_id": [None] * len(chunk),
                                }
                            )
                        )
                    else:
                        frames.append(df[keep_cols].drop_duplicates())

            except Exception as e:
                logger.warning(f"pybiomart chunk failed: {e}")
                meta = {"error": str(e)}
                frames.append(
                    pd.DataFrame(
                        {
                            "input_id": chunk,
                            "output_id": [None] * len(chunk),
                            "metadata_json": [_json(meta)] * len(chunk),
                        }
                    )
                )

            progress.update(len(chunk))
            time.sleep(pause)

    if not frames:
        return _ensure_all_inputs(_empty_result(), clean_ids, inp, outp, "pybiomart", release_used=host)

    out = pd.concat(frames, ignore_index=True)

    out["input_db"] = inp
    out["output_db"] = outp
    out["method"] = "pybiomart"
    out["release_used"] = host
    if "metadata_json" not in out.columns:
        out["metadata_json"] = _json({})

    out = _ensure_all_inputs(out, clean_ids, inp, outp, "pybiomart", release_used=host)
    out = out.drop_duplicates(["input_id", "output_id", "input_db", "output_db", "method", "release_used"])

    return out[
        [
            "input_id",
            "input_db",
            "mapping",
            "output_id",
            "output_db",
            "method",
            "release_used",
            "metadata_json",
        ]
    ]