From 26cba7227ab03b11e01ee32002474aa75d8aa2f9 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Wed, 14 Jan 2026 07:11:32 -0800 Subject: [PATCH 1/3] Use ClinGen to map genomic accession-based variants to genome This change reduces likelihood of errors when mapping accession-based transcript variants to genomic positions. Keep alignment process for now so that we can report the transcript and gene for these variants. Eventually, just the transcript id could be used to access this information. --- src/dcd_mapping/vrs_map.py | 57 ++++++++++++++++++++++++++++++++------ 1 file changed, 48 insertions(+), 9 deletions(-) diff --git a/src/dcd_mapping/vrs_map.py b/src/dcd_mapping/vrs_map.py index db81d57..3f3e400 100644 --- a/src/dcd_mapping/vrs_map.py +++ b/src/dcd_mapping/vrs_map.py @@ -4,6 +4,7 @@ from collections.abc import Iterable from itertools import cycle +import requests from Bio.Seq import Seq from bioutils.accessions import infer_namespace from cool_seq_tool.schemas import AnnotationLayer, Strand @@ -22,6 +23,7 @@ from dcd_mapping.exceptions import ( MissingSequenceIdError, + ResourceAcquisitionError, UnsupportedReferenceSequenceNameSpaceError, UnsupportedReferenceSequencePrefixError, ) @@ -47,6 +49,8 @@ _logger = logging.getLogger(__name__) +CLINGEN_API_URL = "https://reg.genome.network/allele" + def _hgvs_variant_is_valid(hgvs_string: str) -> bool: return not hgvs_string.endswith((".=", ")", "X")) @@ -85,6 +89,29 @@ def is_intronic_variant(variant: Variant) -> bool: return False +def fetch_clingen_genomic_hgvs(hgvs: str) -> str | None: + """Fetch the genomic HGVS string from ClinGen. + + :param hgvs: The HGVS string to fetch + :return: The genomic HGVS string on GRCh38, or None if not found + """ + response = requests.get(f"{CLINGEN_API_URL}?hgvs={hgvs}", timeout=30) + try: + response.raise_for_status() + except requests.HTTPError as e: + msg = f"Received HTTPError from {CLINGEN_API_URL} for HGVS {hgvs}" + _logger.error(msg) + raise ResourceAcquisitionError(msg) from e + if response.status_code == 200: + data = response.json() + for allele in data.get("genomicAlleles", []): + if allele.get("referenceGenome") == "GRCh38": + for coordinates in allele.get("hgvs", []): + if coordinates.startswith("NC_"): + return coordinates + return None + + def _create_pre_mapped_hgvs_strings( raw_description: str, layer: AnnotationLayer, @@ -155,6 +182,7 @@ def _create_post_mapped_hgvs_strings( layer: AnnotationLayer, tx: TxSelectResult | None = None, alignment: AlignmentResult | None = None, + accession_id: str | None = None, ) -> list[str]: """Generate a list of (post-mapped) HGVS strings from one long string containing many valid HGVS substrings. @@ -166,13 +194,14 @@ def _create_post_mapped_hgvs_strings( :param layer: An enum denoting the targeted annotation layer of these HGVS strings :param tx: A TxSelectResult object defining the transcript we are mapping to (or None) :param alignment: An AlignmentResult object defining the alignment we are mapping to (or None) + :param accession_id: An accession id describing the reference sequence (or None). Only used for accession-based variants. :return: A list of HGVS strings relative to the `tx` or `alignment` """ if layer is AnnotationLayer.PROTEIN and tx is None: msg = f"Transcript result must be provided for {layer} annotations (Transcript was `{tx}`)." raise ValueError(msg) - if layer is AnnotationLayer.GENOMIC and alignment is None: - msg = f"Alignment result must be provided for {layer} annotations (Alignment was `{alignment}`)." + if layer is AnnotationLayer.GENOMIC and alignment is None and accession_id is None: + msg = f"Alignment result or accession ID must be provided for {layer} annotations (Alignment was `{alignment}` and Accession ID was `{accession_id}`)." raise ValueError(msg) raw_variants = _parse_raw_variant_str(raw_description) @@ -196,12 +225,22 @@ def _create_post_mapped_hgvs_strings( variant = _adjust_protein_variant_to_ref(variant, tx) hgvs_strings.append(tx.np + ":" + str(variant)) elif layer is AnnotationLayer.GENOMIC: - assert alignment # noqa: S101. mypy help - - variant = _adjust_genomic_variant_to_ref(variant, alignment) - hgvs_strings.append( - get_chromosome_identifier(alignment.chrom) + ":" + str(variant) - ) + if accession_id: + pre_mapped_hgvs = accession_id + ":" + str(variant) + # use ClinGen to align accession-based variants to genomic reference + genomic_hgvs = fetch_clingen_genomic_hgvs(pre_mapped_hgvs) + if genomic_hgvs: + hgvs_strings.append(genomic_hgvs) + else: + msg = f"Could not fetch genomic HGVS on GRCh38 for accession-based variant: {pre_mapped_hgvs}" + raise ValueError(msg) + else: + assert alignment # noqa: S101. mypy help + + variant = _adjust_genomic_variant_to_ref(variant, alignment) + hgvs_strings.append( + get_chromosome_identifier(alignment.chrom) + ":" + str(variant) + ) else: msg = ( f"Could not generate HGVS strings for invalid AnnotationLayer: {layer}" @@ -539,7 +578,7 @@ def _map_genomic( post_mapped_hgvs_strings = _create_post_mapped_hgvs_strings( row.hgvs_nt, AnnotationLayer.GENOMIC, - alignment=align_result, + accession_id=sequence_id, ) post_mapped_genomic = _construct_vrs_allele( post_mapped_hgvs_strings, From ec55365ce3cb6bab4be126d6837cbc825903ccd7 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Tue, 3 Feb 2026 14:28:15 -0800 Subject: [PATCH 2/3] Add ClinGen API URL as environment variable --- settings/.env.dev | 6 ++++++ src/dcd_mapping/vrs_map.py | 7 ++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/settings/.env.dev b/settings/.env.dev index 359fc2a..7258d2f 100644 --- a/settings/.env.dev +++ b/settings/.env.dev @@ -31,3 +31,9 @@ MAVEDB_API_KEY= #################################################################################################### SEQREPO_ROOT_DIR=/usr/local/share/seqrepo/2024-12-20 + +#################################################################################################### +# Environment variables for ClinGen +#################################################################################################### + +CLINGEN_API_URL=https://reg.genome.network/allele diff --git a/src/dcd_mapping/vrs_map.py b/src/dcd_mapping/vrs_map.py index 3f3e400..d2edb87 100644 --- a/src/dcd_mapping/vrs_map.py +++ b/src/dcd_mapping/vrs_map.py @@ -1,6 +1,7 @@ """Map transcripts to VRS objects.""" import logging +import os from collections.abc import Iterable from itertools import cycle @@ -49,7 +50,7 @@ _logger = logging.getLogger(__name__) -CLINGEN_API_URL = "https://reg.genome.network/allele" +CLINGEN_API_URL = os.environ.get("CLINGEN_API_URL", "https://reg.genome.network/allele") def _hgvs_variant_is_valid(hgvs_string: str) -> bool: @@ -95,6 +96,10 @@ def fetch_clingen_genomic_hgvs(hgvs: str) -> str | None: :param hgvs: The HGVS string to fetch :return: The genomic HGVS string on GRCh38, or None if not found """ + if CLINGEN_API_URL is None: + msg = "CLINGEN_API_URL environment variable is not set and default is unavailable." + _logger.error(msg) + raise ValueError(msg) response = requests.get(f"{CLINGEN_API_URL}?hgvs={hgvs}", timeout=30) try: response.raise_for_status() From e3552f17856e5356caf4ae678e004f81f562ebc4 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Wed, 4 Feb 2026 08:58:28 -0800 Subject: [PATCH 3/3] Make requests to ClinGen with retries upon failure --- src/dcd_mapping/resource_utils.py | 37 +++++++++++++++++++++++++++++++ src/dcd_mapping/vrs_map.py | 11 ++------- 2 files changed, 39 insertions(+), 9 deletions(-) diff --git a/src/dcd_mapping/resource_utils.py b/src/dcd_mapping/resource_utils.py index f84caf1..5547440 100644 --- a/src/dcd_mapping/resource_utils.py +++ b/src/dcd_mapping/resource_utils.py @@ -1,11 +1,15 @@ """Provide basic utilities for fetching and storing external data.""" +import logging import os +import time from pathlib import Path import click import requests from tqdm import tqdm +_logger = logging.getLogger(__name__) + MAVEDB_API_KEY = os.environ.get("MAVEDB_API_KEY") MAVEDB_BASE_URL = os.environ.get("MAVEDB_BASE_URL") @@ -57,3 +61,36 @@ def http_download(url: str, out_path: Path, silent: bool = True) -> Path: if chunk: h.write(chunk) return out_path + + +def request_with_backoff( + method: str, url: str, backoff_limit: int = 5, backoff_wait: int = 10, **kwargs +) -> requests.Response: + """Make HTTP request with exponential backoff on failure. + This is a duplicate of the function with same name in MaveDB API codebase. + + :param method: HTTP method (e.g., 'GET', 'POST') + :param url: URL to make request to + :param backoff_limit: number of retry attempts + :param backoff_wait: initial wait time between retries (in seconds) + :param kwargs: additional keyword arguments to pass to the request + :return: Response object from the successful request + """ + attempt = 0 + while attempt <= backoff_limit: + msg = f"Attempt {attempt+1} of {backoff_limit} for {method} {url}" + _logger.debug(msg) + try: + response = requests.request(method=method, url=url, **kwargs) + response.raise_for_status() + return response + except requests.exceptions.RequestException as exc: + msg = f"Request to {url} failed on attempt {attempt+1}." + _logger.warning(msg, exc_info=exc) + backoff_time = backoff_wait * (2**attempt) + attempt += 1 + msg = f"Waiting {backoff_time} seconds before retrying." + _logger.info(msg) + time.sleep(backoff_time) + msg = f"Request to {url} failed after {backoff_limit} attempts." + raise requests.exceptions.RequestException(msg) diff --git a/src/dcd_mapping/vrs_map.py b/src/dcd_mapping/vrs_map.py index d2edb87..6ec02c4 100644 --- a/src/dcd_mapping/vrs_map.py +++ b/src/dcd_mapping/vrs_map.py @@ -5,7 +5,6 @@ from collections.abc import Iterable from itertools import cycle -import requests from Bio.Seq import Seq from bioutils.accessions import infer_namespace from cool_seq_tool.schemas import AnnotationLayer, Strand @@ -24,7 +23,6 @@ from dcd_mapping.exceptions import ( MissingSequenceIdError, - ResourceAcquisitionError, UnsupportedReferenceSequenceNameSpaceError, UnsupportedReferenceSequencePrefixError, ) @@ -34,6 +32,7 @@ get_seqrepo, translate_hgvs_to_vrs, ) +from dcd_mapping.resource_utils import request_with_backoff from dcd_mapping.schemas import ( AlignmentResult, MappedScore, @@ -100,13 +99,7 @@ def fetch_clingen_genomic_hgvs(hgvs: str) -> str | None: msg = "CLINGEN_API_URL environment variable is not set and default is unavailable." _logger.error(msg) raise ValueError(msg) - response = requests.get(f"{CLINGEN_API_URL}?hgvs={hgvs}", timeout=30) - try: - response.raise_for_status() - except requests.HTTPError as e: - msg = f"Received HTTPError from {CLINGEN_API_URL} for HGVS {hgvs}" - _logger.error(msg) - raise ResourceAcquisitionError(msg) from e + response = request_with_backoff("GET", f"{CLINGEN_API_URL}?hgvs={hgvs}", timeout=30) if response.status_code == 200: data = response.json() for allele in data.get("genomicAlleles", []):