diff --git a/settings/.env.dev b/settings/.env.dev index 49071b3..1bc266b 100644 --- a/settings/.env.dev +++ b/settings/.env.dev @@ -32,6 +32,12 @@ MAVEDB_API_KEY= SEQREPO_ROOT_DIR=/usr/local/share/seqrepo/2024-12-20 +#################################################################################################### +# Environment variables for ClinGen +#################################################################################################### + +CLINGEN_API_URL=https://reg.genome.network/allele + #################################################################################################### # Environment variables for ensembl #################################################################################################### diff --git a/src/dcd_mapping/resource_utils.py b/src/dcd_mapping/resource_utils.py index 6b610cf..e5b8ad3 100644 --- a/src/dcd_mapping/resource_utils.py +++ b/src/dcd_mapping/resource_utils.py @@ -1,4 +1,5 @@ """Provide basic utilities for fetching and storing external data.""" +import logging import os import time from pathlib import Path @@ -7,6 +8,8 @@ import requests from tqdm import tqdm +_logger = logging.getLogger(__name__) + MAVEDB_API_KEY = os.environ.get("MAVEDB_API_KEY") MAVEDB_BASE_URL = os.environ.get("MAVEDB_BASE_URL") ENSEMBL_API_URL = os.environ.get("ENSEMBL_API_URL", "https://rest.ensembl.org") # TODO diff --git a/src/dcd_mapping/vrs_map.py b/src/dcd_mapping/vrs_map.py index 7bf3d15..9c0b507 100644 --- a/src/dcd_mapping/vrs_map.py +++ b/src/dcd_mapping/vrs_map.py @@ -1,6 +1,7 @@ """Map transcripts to VRS objects.""" import logging +import os from collections.abc import Iterable from itertools import cycle @@ -31,7 +32,7 @@ get_seqrepo, translate_hgvs_to_vrs, ) -from dcd_mapping.resource_utils import CDOT_URL +from dcd_mapping.resource_utils import CDOT_URL, request_with_backoff from dcd_mapping.schemas import ( AlignmentResult, MappedScore, @@ -48,6 +49,8 @@ _logger = logging.getLogger(__name__) +CLINGEN_API_URL = os.environ.get("CLINGEN_API_URL", "https://reg.genome.network/allele") + def _hgvs_variant_is_valid(hgvs_string: str) -> bool: return not hgvs_string.endswith((".=", ")", "X")) @@ -86,6 +89,27 @@ def is_intronic_variant(variant: Variant) -> bool: return False +def fetch_clingen_genomic_hgvs(hgvs: str) -> str | None: + """Fetch the genomic HGVS string from ClinGen. + + :param hgvs: The HGVS string to fetch + :return: The genomic HGVS string on GRCh38, or None if not found + """ + if CLINGEN_API_URL is None: + msg = "CLINGEN_API_URL environment variable is not set and default is unavailable." + _logger.error(msg) + raise ValueError(msg) + response = request_with_backoff("GET", f"{CLINGEN_API_URL}?hgvs={hgvs}", timeout=30) + if response.status_code == 200: + data = response.json() + for allele in data.get("genomicAlleles", []): + if allele.get("referenceGenome") == "GRCh38": + for coordinates in allele.get("hgvs", []): + if coordinates.startswith("NC_"): + return coordinates + return None + + def _create_pre_mapped_hgvs_strings( raw_description: str, layer: AnnotationLayer, @@ -156,6 +180,7 @@ def _create_post_mapped_hgvs_strings( layer: AnnotationLayer, tx: TxSelectResult | None = None, alignment: AlignmentResult | None = None, + accession_id: str | None = None, ) -> list[str]: """Generate a list of (post-mapped) HGVS strings from one long string containing many valid HGVS substrings. @@ -167,13 +192,14 @@ def _create_post_mapped_hgvs_strings( :param layer: An enum denoting the targeted annotation layer of these HGVS strings :param tx: A TxSelectResult object defining the transcript we are mapping to (or None) :param alignment: An AlignmentResult object defining the alignment we are mapping to (or None) + :param accession_id: An accession id describing the reference sequence (or None). Only used for accession-based variants. :return: A list of HGVS strings relative to the `tx` or `alignment` """ if layer is AnnotationLayer.PROTEIN and tx is None: msg = f"Transcript result must be provided for {layer} annotations (Transcript was `{tx}`)." raise ValueError(msg) - if layer is AnnotationLayer.GENOMIC and alignment is None: - msg = f"Alignment result must be provided for {layer} annotations (Alignment was `{alignment}`)." + if layer is AnnotationLayer.GENOMIC and alignment is None and accession_id is None: + msg = f"Alignment result or accession ID must be provided for {layer} annotations (Alignment was `{alignment}` and Accession ID was `{accession_id}`)." raise ValueError(msg) raw_variants = _parse_raw_variant_str(raw_description) @@ -197,12 +223,22 @@ def _create_post_mapped_hgvs_strings( variant = _adjust_protein_variant_to_ref(variant, tx) hgvs_strings.append(tx.np + ":" + str(variant)) elif layer is AnnotationLayer.GENOMIC: - assert alignment # noqa: S101. mypy help - - variant = _adjust_genomic_variant_to_ref(variant, alignment) - hgvs_strings.append( - get_chromosome_identifier(alignment.chrom) + ":" + str(variant) - ) + if accession_id: + pre_mapped_hgvs = accession_id + ":" + str(variant) + # use ClinGen to align accession-based variants to genomic reference + genomic_hgvs = fetch_clingen_genomic_hgvs(pre_mapped_hgvs) + if genomic_hgvs: + hgvs_strings.append(genomic_hgvs) + else: + msg = f"Could not fetch genomic HGVS on GRCh38 for accession-based variant: {pre_mapped_hgvs}" + raise ValueError(msg) + else: + assert alignment # noqa: S101. mypy help + + variant = _adjust_genomic_variant_to_ref(variant, alignment) + hgvs_strings.append( + get_chromosome_identifier(alignment.chrom) + ":" + str(variant) + ) else: msg = ( f"Could not generate HGVS strings for invalid AnnotationLayer: {layer}" @@ -540,7 +576,7 @@ def _map_genomic( post_mapped_hgvs_strings = _create_post_mapped_hgvs_strings( row.hgvs_nt, AnnotationLayer.GENOMIC, - alignment=align_result, + accession_id=sequence_id, ) post_mapped_genomic = _construct_vrs_allele( post_mapped_hgvs_strings,