Source code for TCT.node_normalizer

"""
This is a wrapper around the Node Normalizer API.

API docs: https://nodenorm.transltr.io/docs
"""
import urllib.parse

import requests

from .translator_node import TranslatorNode


# Base URL of the Node Normalizer service; get_normalized_nodes joins its
# endpoint path onto this with urllib.parse.urljoin.
URL = 'https://nodenorm.transltr.io/'


[docs]
def get_normalized_nodes(query: str | list[str],
        return_equivalent_identifiers:bool=False,
        **kwargs):
    """
    A wrapper around the `get_normalized_nodes` api endpoint. Given a CURIE or a list of CURIEs, this returns either a single TranslatorNode or a dict of CURIE ids to TranslatorNodes.

    Parameters
    ----------
    query : str or list of str
        Query CURIE, or a list of query CURIEs
    return_equivalent_identifiers : bool
        Whether or not to return a list of equivalent identifiers along with the TranslatorNode. Default: False
    **kwargs
        Other arguments to `get_normalized_nodes` (e.g. `conflate` for gene-protein conflation, `drug_chemical_conflate` for drug-chemical conflation)

    Returns
    -------
    If query is a single CURIE, returns a single TranslatorNode.

    If query is a list of CURIEs, a dict of CURIE id to TranslatorNode for every CURIE in the query that the service could normalize. CURIEs for which the service returned no entry (or an empty one) are left out of the dict.

    Raises
    ------
    LookupError
        If the single query CURIE could not be normalized, or if none of the CURIEs in a list query could be normalized.
    requests.RequestException
        If the server responds with a status code other than 200.

    Examples
    --------
    >>> get_normalized_nodes('MESH:D014867', return_equivalent_identifiers=False)
    TranslatorNode(curie='CHEBI:15377', label='Water', types=['biolink:SmallMolecule', 'biolink:MolecularEntity', 'biolink:ChemicalEntity', 'biolink:PhysicalEssence', 'biolink:ChemicalOrDrugOrTreatment', 'biolink:ChemicalEntityOrGeneOrGeneProduct', 'biolink:ChemicalEntityOrProteinOrPolypeptide', 'biolink:NamedThing', 'biolink:PhysicalEssenceOrOccurrent'], synonyms=None, curie_synonyms=None)
    """
    path = urllib.parse.urljoin(URL, 'get_normalized_nodes')
    # Conflation options are only sent when the caller passes them in kwargs;
    # otherwise the service-side defaults apply.
    response = requests.get(path, params={'curie': query, **kwargs})
    if response.status_code != 200:
        raise requests.RequestException('Response from server had error, code ' + str(response.status_code))

    result = response.json()
    normalized_dict = {}
    for k, node in result.items():
        if not node:
            # An empty/null entry means the service has nothing for this
            # CURIE; indexing into it would raise TypeError, so skip it.
            continue
        n = TranslatorNode(node['id']['identifier'])
        if 'label' in node['id']:
            n.label = node['id']['label']
        if 'type' in node:
            n.types = node['type']
        if return_equivalent_identifiers and 'equivalent_identifiers' in node:
            equivalents = node['equivalent_identifiers']
            # The two lists stay index-aligned: an identifier without a
            # label contributes None to `synonyms`.
            n.synonyms = [eq.get('label') for eq in equivalents]
            n.curie_synonyms = [eq['identifier'] for eq in equivalents]
        normalized_dict[k] = n

    not_found = LookupError('No matches found for the given input: ' + str(query))
    if isinstance(query, str):
        if query not in normalized_dict:
            raise not_found
        return normalized_dict[query]
    if not normalized_dict:
        raise not_found
    return normalized_dict



[docs]
def ID_convert_to_preferred_name_nodeNormalizer(id_list) -> dict[str, str]:
    '''
    Convert a list of CURIEs to their preferred names using NodeNorm.

    The CURIEs are POSTed to the NodeNorm `get_normalized_nodes` endpoint in
    batches. A CURIE that NodeNorm does not know, or that has no preferred
    label, is mapped to itself, and a message about it is printed.

    Arg:
        id_list: list (or any other iterable) of CURIEs to be converted;
            a single CURIE string is treated as a one-element list
    Returns:
        dic_id_map: dictionary mapping CURIEs to their preferred names
    Raises:
        RuntimeError: if a NodeNorm request does not succeed
    Example:
        dic_id_map = ID_convert_to_preferred_name_nodeNormalizer(["NCBIGene:1234", "NCBIGene:5678"])
    '''
    # To convert a CURIE to a preferred name, you don't need NameLookup at all -- NodeNorm can
    # do this by itself!
    NODENORM_BASE_URL = "https://nodenorm.transltr.io"  # Adjust this if you need NodeNorm TEST, CI or DEV.
    NODENORM_BATCH_LIMIT = 900                          # Adjust this if you start getting errors from NodeNorm.
    NODENORM_GENE_PROTEIN_CONFLATION = True             # Change to False if you don't want gene/protein conflation.
    NODENORM_DRUG_CHEMICAL_CONFLATION = False           # Change to True if you want drug/chemical conflation.

    # Batching needs len() and slicing, so materialise the input as a list.
    # A bare string would otherwise be split into single characters.
    if isinstance(id_list, str):
        id_list = [id_list]
    else:
        id_list = list(id_list)

    dic_id_map = {}
    unrecognized_ids = []

    # split id_list into batches of at most NODENORM_BATCH_LIMIT entries
    for index in range(0, len(id_list), NODENORM_BATCH_LIMIT):
        id_sublist = id_list[index:index + NODENORM_BATCH_LIMIT]

        # Query NodeNorm via a POST to its get_normalized_nodes endpoint
        # (API docs: https://nodenorm.transltr.io/docs).
        response = requests.post(NODENORM_BASE_URL + '/get_normalized_nodes', json={
            "curies": id_sublist,
            "description": False,   # Change to True if you want descriptions from any identifiers we know about.
            "conflate": NODENORM_GENE_PROTEIN_CONFLATION,
            "drug_chemical_conflate": NODENORM_DRUG_CHEMICAL_CONFLATION,
        })
        if not response.ok:
            raise RuntimeError("Error: NodeNorm request failed with status code " + str(response.status_code))

        results = response.json()
        for curie in id_sublist:
            node = results.get(curie)
            if not node:
                # Missing or empty entry: NodeNorm has nothing for this CURIE.
                unrecognized_ids.append(curie)
                dic_id_map[curie] = curie
                continue

            label = (node.get('id') or {}).get('label')
            if label:
                dic_id_map[curie] = label
            else:
                # Known to NodeNorm but unlabeled: fall back to the CURIE.
                print(curie + ": no preferred name")
                dic_id_map[curie] = curie

    if unrecognized_ids:
        print("NodeNorm does not know about these identifiers: " + ",".join(unrecognized_ids))

    return dic_id_map
Source code for TCT.node_normalizer

Translator Component Toolkit

Navigation

Related Topics