Source code for TCT.node_normalizer

"""
This is a wrapper around the Node Normalizer API.

API docs: https://nodenorm.transltr.io/docs
"""
import urllib.parse

import requests

from .translator_node import TranslatorNode


URL = 'https://nodenorm.transltr.io/'

def get_normalized_nodes(query: str | list[str], return_equivalent_identifiers: bool = False, **kwargs):
    """
    A wrapper around the `get_normalized_nodes` api endpoint.

    Given a CURIE or a list of CURIEs, this returns either a single
    TranslatorNode or a dict of CURIE ids to TranslatorNodes.

    Parameters
    ----------
    query : str or list of str
        Query CURIE, or a list of query CURIEs.
    return_equivalent_identifiers : bool
        Whether or not to return a list of equivalent identifiers along with
        the TranslatorNode. Default: False
    **kwargs
        Other arguments to `get_normalized_nodes` (e.g. `conflate` for
        gene-protein conflation, `drug_chemical_conflate` for drug-chemical
        conflation). They are forwarded as query-string parameters.

    Returns
    -------
    If query is a single CURIE, returns a single TranslatorNode.
    If query is a list of CURIEs, a dict of CURIE id to TranslatorNode for
    every CURIE in the query that the service could normalize. CURIEs the
    service reports as unknown (null entries) are left out of the dict.

    Raises
    ------
    LookupError
        If none of the queried CURIEs could be normalized, or if the single
        queried CURIE is not among the normalized results.
    requests.RequestException
        If the server answers with a status code other than 200.

    Examples
    --------
    >>> get_normalized_nodes('MESH:D014867', return_equivalent_identifiers=False)
    TranslatorNode(curie='CHEBI:15377', label='Water', types=['biolink:SmallMolecule', 'biolink:MolecularEntity', 'biolink:ChemicalEntity', 'biolink:PhysicalEssence', 'biolink:ChemicalOrDrugOrTreatment', 'biolink:ChemicalEntityOrGeneOrGeneProduct', 'biolink:ChemicalEntityOrProteinOrPolypeptide', 'biolink:NamedThing', 'biolink:PhysicalEssenceOrOccurrent'], synonyms=None, curie_synonyms=None)
    """
    path = urllib.parse.urljoin(URL, 'get_normalized_nodes')
    # Conflation flags not supplied through kwargs fall back to the service
    # defaults: true for gene-protein conflation, false for drug-chemical
    # conflation.
    response = requests.get(path, params={'curie': query, **kwargs})
    if response.status_code != 200:
        raise requests.RequestException('Response from server had error, code ' + str(response.status_code))

    result = response.json()
    normalized_dict = {}
    for curie, node in result.items():
        if not node:
            # The service maps CURIEs it does not know to null; indexing such
            # an entry would fail with an unhelpful TypeError, so skip it.
            continue
        node_id = node['id']
        n = TranslatorNode(node_id['identifier'])
        if 'label' in node_id:
            n.label = node_id['label']
        if 'type' in node:
            n.types = node['type']
        if return_equivalent_identifiers and 'equivalent_identifiers' in node:
            equivalents = node['equivalent_identifiers']
            # The two lists stay index-aligned: an identifier without a label
            # contributes None to `synonyms`.
            n.synonyms = [eq.get('label') for eq in equivalents]
            n.curie_synonyms = [eq['identifier'] for eq in equivalents]
        normalized_dict[curie] = n

    if not normalized_dict:
        raise LookupError('No matches found for the given input: ' + str(query))
    if isinstance(query, str):
        if query not in normalized_dict:
            raise LookupError('No matches found for the given input: ' + str(query))
        return normalized_dict[query]
    return normalized_dict
def ID_convert_to_preferred_name_nodeNormalizer(id_list):
    """
    Convert a list of CURIEs to their preferred names using NodeNorm.

    The CURIEs are sent to the NodeNorm `get_normalized_nodes` endpoint in
    batches. A CURIE that NodeNorm does not know, or that has no preferred
    label, is mapped to itself so that every input CURIE has an entry in the
    result.

    Parameters
    ----------
    id_list : list of str
        CURIEs to be converted. Any iterable of CURIEs is accepted; a single
        string is treated as one CURIE.

    Returns
    -------
    dict
        Dictionary mapping each input CURIE to its preferred name (or to the
        CURIE itself when no preferred name is available).

    Raises
    ------
    RuntimeError
        If a NodeNorm request does not succeed.

    Examples
    --------
    dic_id_map = ID_convert_to_preferred_name_nodeNormalizer(["NCBIGene:1234", "NCBIGene:5678"])
    """
    # To convert a CURIE to a preferred name, you don't need NameLookup at all -- NodeNorm can
    # do this by itself!
    NODENORM_BASE_URL = "https://nodenorm.transltr.io"  # Adjust this if you need NodeNorm TEST, CI or DEV.
    NODENORM_BATCH_LIMIT = 900                          # Adjust this if you start getting errors from NodeNorm.
    NODENORM_GENE_PROTEIN_CONFLATION = True             # Change to False if you don't want gene/protein conflation.
    NODENORM_DRUG_CHEMICAL_CONFLATION = False           # Change to True if you want drug/chemical conflation.

    # A bare string would otherwise be sliced and iterated character by
    # character; other iterables are materialized so they can be batched.
    if isinstance(id_list, str):
        id_list = [id_list]
    else:
        id_list = list(id_list)

    dic_id_map = {}
    unrecognized_ids = []

    # Split id_list into batches of at most NODENORM_BATCH_LIMIT entries.
    for index in range(0, len(id_list), NODENORM_BATCH_LIMIT):
        id_sublist = id_list[index:index + NODENORM_BATCH_LIMIT]

        # Query NodeNorm with https://nodenorm.transltr.io/docs#/default/get_normalized_node_handler_get_normalized_nodes_get
        response = requests.post(NODENORM_BASE_URL + '/get_normalized_nodes', json={
            "curies": id_sublist,
            "description": False,  # Change to True if you want descriptions from any identifiers we know about.
            "conflate": NODENORM_GENE_PROTEIN_CONFLATION,
            "drug_chemical_conflate": NODENORM_DRUG_CHEMICAL_CONFLATION,
        })
        if not response.ok:
            raise RuntimeError("Error: NodeNorm request failed with status code " + str(response.status_code))

        results = response.json()
        for curie in id_sublist:
            entry = results.get(curie)
            if not entry:
                # Missing or null entry: NodeNorm does not know this CURIE.
                unrecognized_ids.append(curie)
                dic_id_map[curie] = curie
                continue

            # Tolerate an entry whose 'id' is absent or null.
            label = (entry.get('id') or {}).get('label')
            if not label:
                print(curie + ": no preferred name")
                label = curie
            dic_id_map[curie] = label

    # Report all unknown identifiers once, after every batch has been handled.
    if unrecognized_ids:
        print("NodeNorm does not know about these identifiers: " + ",".join(unrecognized_ids))
    return dic_id_map