Source code for idtrack._external_mappers._backend_pybiomart

#!/usr/bin/env python3
"""Ensembl BioMart backend for ID mapping.

This module provides the map_with_pybiomart() function for querying
Ensembl BioMart to convert biological identifiers. Supports historical
Ensembl releases via archive hosts.
"""

# Kemal Inecik
# k.inecik@gmail.com


from __future__ import annotations

import math
import re
import time
import typing as _t
from urllib.parse import urlparse

import pandas as pd
from tqdm import tqdm

from idtrack._external_mappers._constants import (
    _BM_ATTR_CANDIDATES,
    _BM_FILTER_CANDIDATES,
    _ENSEMBL_ARCHIVE_BY_RELEASE,
    _ENSEMBL_INPUT_DB,
    _ENSEMBL_SPECIAL_RELEASE_HOSTS,
)
from idtrack._external_mappers._utils import (
    _chunker,
    _empty_result,
    _ensure_all_inputs,
    _json,
    _suppress_stdout_stderr,
    _unique_not_null,
    canonical_db,
    canonical_species,
    logger,
    raise_missing_dependency,
    strip_version,
)


def _ensembl_archive_host_for_release(
    release: int | str | None,
) -> str | None:
    """Look up the Ensembl archive host that serves a given release.

    ``release`` may be an integer release (e.g. ``104``), a numeric string
    with an optional ``v``/``r`` prefix (e.g. ``"104"``, ``"v104"``), or a
    special non-numeric key (e.g. ``"GRCh37"``). String input is stripped
    and lower-cased before lookup.

    Args:
        release: Ensembl release number or key, or ``None``.

    Returns:
        The archive host registered for the release, or ``None`` when the
        input is ``None``, blank, unparsable, or not present in the lookup
        tables.
    """
    if release is None:
        return None

    # Non-string input: anything ``int()`` accepts is treated as a release number.
    if not isinstance(release, str):
        try:
            number = int(release)
        except (TypeError, ValueError):
            return None
        return _ENSEMBL_ARCHIVE_BY_RELEASE.get(number)

    token = release.strip().lower()
    if not token:
        return None

    # Special non-numeric keys take precedence over numeric parsing.
    if token in _ENSEMBL_SPECIAL_RELEASE_HOSTS:
        return _ENSEMBL_SPECIAL_RELEASE_HOSTS[token]

    # Accept an optional leading "v" or "r" in front of the digits.
    numeric = re.match(r"^[vr]?(\d+)$", token)
    if numeric is None:
        return None
    try:
        number = int(numeric.group(1))
    except ValueError:
        return None
    return _ENSEMBL_ARCHIVE_BY_RELEASE.get(number)
def _biomart_dataset_for_species(species: str, explicit: str | None = None) -> str:
    """Return the Ensembl BioMart dataset name to query for a species.

    Args:
        species: Species name; passed through ``canonical_species`` when no
            explicit dataset is supplied.
        explicit: Dataset name to use verbatim. Any truthy value wins over
            ``species``.

    Returns:
        str: ``explicit`` when given, otherwise ``"<canonical species>_gene_ensembl"``.
    """
    if not explicit:
        return "{}_gene_ensembl".format(canonical_species(species))
    return explicit
def _normalize_biomart_host(host: str | None) -> str:
    """Normalize an Ensembl BioMart host for pybiomart.

    Examples of valid outputs:
        "http://www.ensembl.org"
        "http://nov2020.archive.ensembl.org"
        "http://grch37.ensembl.org"

    Args:
        host: Hostname or URL (scheme optional). ``None``, an empty string,
            a whitespace-only string, or a value from which no host part can
            be extracted (e.g. ``"http://"``) all resolve to the default
            ``"http://www.ensembl.org"``.

    Returns:
        str: Normalized base URL (always ``http://`` + network location,
        with any path, query, or original scheme discarded).
    """
    default_host = "http://www.ensembl.org"
    if not host:
        return default_host

    # Strip *before* testing for emptiness so that "   " is treated like "".
    cleaned = host.strip()
    if not cleaned:
        return default_host

    # urlparse only fills ``netloc`` when a scheme separator is present.
    parsed = urlparse(cleaned if "://" in cleaned else "http://" + cleaned)
    netloc = parsed.netloc or parsed.path.split("/")[0]
    if not netloc:
        # Nothing usable was found; returning a bare "http://" would only
        # defer the failure to connection time.
        return default_host
    return "http://" + netloc
def _bm_list_attribute_names(ds) -> list[str]:
    """Collect the attribute names exposed by a pybiomart ``Dataset``.

    ``ds.list_attributes()`` is tried first; if that call raises
    ``AttributeError``, ``TypeError`` or ``RuntimeError`` the ``attributes``
    member is used instead. The result may be DataFrame-like (anything with
    ``columns``), a dict (keys are the names), or another iterable.

    Args:
        ds: Dataset-like object.

    Returns:
        list[str]: Attribute names, or an empty list when none can be read.
    """
    try:
        listing = ds.list_attributes()
    except (AttributeError, TypeError, RuntimeError):
        # Different pybiomart versions may not have list_attributes()
        listing = getattr(ds, "attributes", None)

    if listing is None:
        return []

    try:
        if hasattr(listing, "columns"):
            # Tabular result: prefer the "name" column, else the first column.
            column = listing["name"] if "name" in listing.columns else listing.iloc[:, 0]
            return list(map(str, column.tolist()))
        entries = listing.keys() if isinstance(listing, dict) else list(listing)
        return list(map(str, entries))
    except (KeyError, IndexError, TypeError, AttributeError):
        # Last resort: hand back the raw items without string conversion.
        try:
            return list(listing)
        except (TypeError, ValueError):
            return []
def _bm_list_filter_names(ds) -> list[str]:
    """Collect the filter names exposed by a pybiomart ``Dataset``.

    ``ds.list_filters()`` is tried first; if that call raises
    ``AttributeError``, ``TypeError`` or ``RuntimeError`` the ``filters``
    member is used instead. The result may be DataFrame-like (anything with
    ``columns``), a dict (keys are the names), or another iterable.

    Args:
        ds: Dataset-like object.

    Returns:
        list[str]: Filter names, or an empty list when none can be read.
    """
    try:
        listing = ds.list_filters()
    except (AttributeError, TypeError, RuntimeError):
        # Different pybiomart versions may not have list_filters()
        listing = getattr(ds, "filters", None)

    if listing is None:
        return []

    try:
        if hasattr(listing, "columns"):
            # Tabular result: prefer the "name" column, else the first column.
            column = listing["name"] if "name" in listing.columns else listing.iloc[:, 0]
            return list(map(str, column.tolist()))
        entries = listing.keys() if isinstance(listing, dict) else list(listing)
        return list(map(str, entries))
    except (KeyError, IndexError, TypeError, AttributeError):
        # Last resort: hand back the raw items without string conversion.
        try:
            return list(listing)
        except (TypeError, ValueError):
            return []
def _bm_pick_attribute(canonical_db_name: str, available_attrs: list[str]) -> str:
    """Select the BioMart attribute that represents a canonical DB.

    Explicit candidates from :py:data:`_BM_ATTR_CANDIDATES` are checked in
    their listed order. If none is present on the dataset, the first
    attribute starting with a DB-specific pattern is used, then the first
    attribute merely containing that pattern.

    Args:
        canonical_db_name: Canonical database key (see
            :py:func:`~idtrack._external_mappers._utils.canonical_db`).
        available_attrs: Attribute names provided by the BioMart dataset.

    Returns:
        str: Selected attribute name.

    Raises:
        RuntimeError: If no compatible attribute is available on the dataset.
    """
    db_key = canonical_db(canonical_db_name)
    ordered = list(dict.fromkeys(available_attrs))  # unique, original order kept
    known = set(ordered)

    # Step 1: explicit candidates, in preference order.
    for preferred in _BM_ATTR_CANDIDATES.get(db_key, []):
        if preferred in known:
            return preferred

    # Step 2: fuzzy fallback on a typical naming stem.
    stems = {
        "hgnc_symbol": "external_gene",
        "hgnc_id": "hgnc",
        "entrez_gene": "entrez",
        "uniprot": "uniprot",
    }
    if db_key in stems:
        stem = stems[db_key]
    elif db_key.startswith("refseq_"):
        stem = "refseq"
    else:
        # Covers "ensembl_*" too: "ensembl_" + the remainder after the first
        # underscore is just the key itself.
        stem = db_key

    # Prefix matches win over substring matches.
    matchers = (
        lambda name: name.startswith(stem),
        lambda name: stem in name,
    )
    for is_match in matchers:
        hits = [name for name in ordered if is_match(name)]
        if hits:
            return hits[0]

    raise RuntimeError(
        f"pybiomart: dataset does not provide any attribute compatible with "
        f"{db_key!r}; inspect `dataset.list_attributes()` for valid names."
    )
def _bm_pick_filter(
    canonical_db_name: str,
    attr_name: str,
    available_filters: list[str],
) -> str:
    """Select the BioMart filter used to restrict a query by input IDs.

    Candidates are tried in this order: entries from
    :py:data:`_BM_FILTER_CANDIDATES` for the DB, the attribute name itself,
    and the attribute name without a trailing ``_id`` / ``_accession``. If
    none exists on the dataset, the first filter starting with a DB-specific
    pattern is used, then the first filter merely containing it.

    Args:
        canonical_db_name: Canonical database key for the input IDs.
        attr_name: Attribute name chosen for the input IDs.
        available_filters: Filter names provided by the BioMart dataset.

    Returns:
        str: Selected filter name.

    Raises:
        RuntimeError: If no compatible filter is available on the dataset.
    """
    db_key = canonical_db(canonical_db_name)
    ordered = list(dict.fromkeys(available_filters))  # unique, original order kept
    known = set(ordered)

    # DB-specific candidates, then the attribute name, then suffix-less variants.
    proposals: list[str] = [*_BM_FILTER_CANDIDATES.get(db_key, []), attr_name]
    for suffix in ("_id", "_accession"):
        if attr_name.endswith(suffix):
            proposals.append(attr_name[: -len(suffix)])

    # dict.fromkeys drops repeats while keeping first-seen order; empty
    # proposals are skipped.
    for proposal in dict.fromkeys(proposals):
        if proposal and proposal in known:
            return proposal

    # Fuzzy search on a typical naming stem.
    stems = {
        "hgnc_symbol": "external_gene",
        "entrez_gene": "entrez",
        "uniprot": "uniprot",
    }
    if db_key in stems:
        stem = stems[db_key]
    elif db_key.startswith("refseq_"):
        stem = "refseq"
    else:
        # Covers "ensembl_*" too: "ensembl_" + the remainder after the first
        # underscore is just the key itself.
        stem = db_key

    # Prefix matches win over substring matches.
    matchers = (
        lambda name: name.startswith(stem),
        lambda name: stem in name,
    )
    for is_match in matchers:
        hits = [name for name in ordered if is_match(name)]
        if hits:
            return hits[0]

    raise RuntimeError(
        f"pybiomart: dataset does not provide any filter compatible with "
        f"{db_key!r}; inspect `dataset.list_filters()` for valid names."
    )
def _bm_unmapped_frame(chunk: _t.Sequence[str], error: str | None = None) -> pd.DataFrame:
    """Build placeholder rows (``output_id`` = ``None``) for every ID in *chunk*.

    When *error* is given, each row also carries a ``metadata_json`` cell
    holding ``{"error": error}`` serialized with ``_json``.
    """
    columns: dict[str, _t.Any] = {
        "input_id": chunk,
        "output_id": [None] * len(chunk),
    }
    if error is not None:
        columns["metadata_json"] = [_json({"error": error})] * len(chunk)
    return pd.DataFrame(columns)


def map_with_pybiomart(
    ids: _t.Iterable[str],
    input_db: str,
    output_db: str,
    *,
    species: str = "hsapiens",
    chunk_size: int = 1000,
    pause: float = 0.2,
    strip_versions: bool = True,
    release: str | int | None = None,
    show_progress: bool = True,
    suppress_method_verbosity: bool = True,
) -> pd.DataFrame:
    """Map identifiers using Ensembl BioMart via pybiomart.

    Note:
        BioMart can only filter by Ensembl IDs (gene, transcript, protein).
        Other ID types can be used as output_db but not input_db.

    A failing chunk does not abort the run: the failure is logged, and the
    chunk's IDs are emitted as unmapped rows whose ``metadata_json`` records
    the error message.

    Args:
        ids: Input Ensembl identifiers to map.
        input_db: Source database type. Must be one of ``"ensembl_gene"``,
            ``"ensembl_transcript"``, or ``"ensembl_protein"``.
        output_db: Target database type (e.g. ``"hgnc_symbol"``, ``"uniprot"``,
            ``"entrez_gene"``).
        species: Species code (e.g. ``"hsapiens"``, ``"mmusculus"``, ``"sscrofa"``).
        chunk_size: Number of IDs per BioMart query. Must be at least 1.
        pause: Pause in seconds between consecutive queries. No pause is made
            after the final query or when the value is not positive.
        strip_versions: Strip version suffixes from Ensembl/RefSeq IDs.
        release: Ensembl release number (e.g. ``104``) or special key
            (e.g. ``"grch37"``). If ``None``, uses the current Ensembl release.
            An unknown release logs a warning and falls back to the current one.
        show_progress: Display progress bar.
        suppress_method_verbosity: Suppress stdout/stderr from pybiomart.

    Returns:
        pd.DataFrame: Standardized mapping DataFrame. ``release_used`` holds
        the BioMart host URL that was queried.

    Raises:
        RuntimeError: If the BioMart connection fails or required dataset
            metadata cannot be retrieved.
        ValueError: If ``input_db`` is not an Ensembl type, or if
            ``chunk_size`` is smaller than 1 (checked once there is at least
            one ID to query).
    """
    try:
        from pybiomart import Dataset  # type: ignore
    except ImportError as e:
        raise_missing_dependency("pybiomart", feature="pybiomart ID mapping backend", original_error=e)

    inp = canonical_db(input_db)
    outp = canonical_db(output_db)

    # Early, explicit check: BioMart can only *filter* by Ensembl IDs.
    # It can still *return* HGNC/UniProt/etc. as attributes, but input_db
    # must be one of the Ensembl IDs.
    if inp not in _ENSEMBL_INPUT_DB:
        allowed_str = ", ".join(sorted(_ENSEMBL_INPUT_DB))
        raise ValueError(
            f"pybiomart input_db must be one of {{{allowed_str}}}, got {inp!r}. "
            "BioMart cannot filter by HGNC/UniProt directly; keep them as output_db "
            "or use method='mygene'/'gprofiler' for those inputs."
        )

    clean_ids = [strip_version(i) if strip_versions else str(i) for i in ids]
    uniq_ids = _unique_not_null(clean_ids)
    if not uniq_ids:
        return _ensure_all_inputs(_empty_result(), clean_ids, inp, outp, "pybiomart", release_used=None)

    # Validate before any network traffic; a zero chunk size would otherwise
    # surface as a ZeroDivisionError only after connecting to BioMart.
    if chunk_size < 1:
        raise ValueError(f"pybiomart chunk_size must be a positive integer, got {chunk_size!r}")

    # Resolve Ensembl archive host solely from `release` (if provided).
    archive_host: str | None = None
    if release is not None:
        archive_host = _ensembl_archive_host_for_release(release)
        if not archive_host:
            logger.warning(
                "pybiomart: no known archive host for Ensembl release %r; falling back to www.ensembl.org",
                release,
            )
    host = _normalize_biomart_host(archive_host or "http://www.ensembl.org")

    dataset_name = _biomart_dataset_for_species(species)

    try:
        ds = Dataset(name=dataset_name, host=host)
    except Exception as e:
        raise RuntimeError(
            f"pybiomart: failed to connect to Ensembl BioMart " f"(dataset={dataset_name!r}, host={host!r}): {e}"
        ) from e

    # Discover attributes and filters that actually exist for this dataset
    attr_names = _bm_list_attribute_names(ds)
    filter_names = _bm_list_filter_names(ds)

    if not attr_names:
        raise RuntimeError(f"pybiomart: could not retrieve attributes for dataset {dataset_name!r}")
    if not filter_names:
        raise RuntimeError(f"pybiomart: could not retrieve filters for dataset {dataset_name!r}")

    # Choose valid attribute + filter names for the requested mapping
    in_attr = _bm_pick_attribute(inp, attr_names)
    out_attr = _bm_pick_attribute(outp, attr_names)
    filter_name = _bm_pick_filter(inp, in_attr, filter_names)

    logger.debug(
        "pybiomart: using dataset=%r host=%r in_attr=%r out_attr=%r filter=%r",
        dataset_name,
        host,
        in_attr,
        out_attr,
        filter_name,
    )

    frames: list[pd.DataFrame] = []
    n_chunks = math.ceil(len(uniq_ids) / chunk_size)

    with tqdm(
        total=len(uniq_ids),
        desc="pybiomart",
        mininterval=0.25,
        disable=not show_progress,
        ncols=100,
        unit="ids",
    ) as progress:
        for i, chunk in enumerate(_chunker(uniq_ids, chunk_size), start=1):
            logger.debug(
                "pybiomart: querying chunk %d/%d (n=%d)",
                i,
                n_chunks,
                len(chunk),
            )
            try:
                with _suppress_stdout_stderr(suppress_method_verbosity):
                    df = ds.query(
                        attributes=[in_attr, out_attr],
                        filters={filter_name: chunk},
                        use_attr_names=True,
                    )
                if df is None or df.empty:
                    frames.append(_bm_unmapped_frame(chunk))
                else:
                    df = df.rename(columns={in_attr: "input_id", out_attr: "output_id"})
                    keep_cols = [c for c in ("input_id", "output_id") if c in df.columns]
                    if not keep_cols:
                        frames.append(_bm_unmapped_frame(chunk))
                    else:
                        frames.append(df[keep_cols].drop_duplicates())
            except Exception as e:
                # Deliberate best-effort: one failing chunk must not discard
                # the results of the others, so record the error and go on.
                logger.warning("pybiomart chunk failed: %s", e)
                frames.append(_bm_unmapped_frame(chunk, error=str(e)))
            progress.update(len(chunk))
            # The pause only throttles consecutive requests, so it is skipped
            # once the last expected chunk has been processed.
            if pause > 0 and i < n_chunks:
                time.sleep(pause)

    if not frames:
        return _ensure_all_inputs(_empty_result(), clean_ids, inp, outp, "pybiomart", release_used=host)

    out = pd.concat(frames, ignore_index=True)
    out["input_db"] = inp
    out["output_db"] = outp
    out["method"] = "pybiomart"
    out["release_used"] = host

    # Only failed chunks carry "metadata_json". After concat, rows coming
    # from successful chunks hold NaN in that column, so fill those gaps
    # instead of only handling the case where the column is absent.
    # NOTE(review): assumes `_json` returns a scalar JSON string — confirm.
    empty_meta = _json({})
    if "metadata_json" in out.columns:
        out.loc[out["metadata_json"].isna(), "metadata_json"] = empty_meta
    else:
        out["metadata_json"] = empty_meta

    out = _ensure_all_inputs(out, clean_ids, inp, outp, "pybiomart", release_used=host)
    out = out.drop_duplicates(["input_id", "output_id", "input_db", "output_db", "method", "release_used"])
    return out[
        [
            "input_id",
            "input_db",
            "mapping",
            "output_id",
            "output_db",
            "method",
            "release_used",
            "metadata_json",
        ]
    ]