• This is the python script use for get huge PubMed data from NCBI API.
    Document is here. https://www.ncbi.nlm.nih.gov/books/NBK25501/
  • You have to read below term before use this script.
    https://www.nlm.nih.gov/databases/download/terms_and_conditions.html

What's Output?¶

Thinking about outputimage but no concrete image.

Import Library¶

In [1]:
# conda install numpy
# conda install matplotlib
# conda install plotly pandas
# conda install requests
# conda install -c conda-forge scikit-learn
# conda install -c conda-forge tqdm
# conda install -c conda-forge ipywidgets
# conda install -c conda-forge nbformat
# ipython kernel install --user --name=pyPubTator
# conda install -c conda-forge nodejs
# (base) conda install -c plotly plotly
# (base) conda install jupyterlab ipywidgets
# (base) conda install -c conda-forge nodejs
# (base) jupyter labextension install jupyterlab-plotly
## UserWarning: ValueError: The extension "jupyterlab-plotly" 
## does not yet support the current version of JupyterLab.
## on 2023/10/09

import math
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import requests
import urllib.parse
import uuid
import xml.etree.ElementTree as ET
from collections import OrderedDict
from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
# from tqdm import tqdm_notebook as tqdm
from tqdm.notebook import tqdm
plotly.offline.init_notebook_mode(connected=False)

Parameters¶

In [2]:
iterCountTrial = 20 # just for trial
In [3]:
# const
BASEURL_INFO = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi'
BASEURL_SRCH = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
BASEURL_FTCH = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
In [4]:
# const (PubTator)
BASEURL_PUBTATOR = 'https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/pubtator'
BASEURL_BIOCXML = 'https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/biocxml'
BASEURL_BIOCJSON = 'https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/biocjson'
# https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/pubtator?pmids=28483577&concepts=gene
# https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/biocxml?pmcids=PMC6207735
# https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/biocjson?pmids=28483577,28483578,28483579
BATCH_NUM_PUBTATOR    = 100
In [5]:
# parameters
TERM         = 'cancer'
In [6]:
DATE_TYPE    = 'pdat'       # Type of date used to limit a search. The allowed values vary between Entrez databases, but common values are 'mdat' (modification date), 'pdat' (publication date) and 'edat' (Entrez date). Generally an Entrez database will have only two allowed values for datetype.
MIN_DATE     = '2018/01/01' # yyyy/mm/dd
MAX_DATE     = '2019/12/31' # yyyy/mm/dd
In [7]:
SOURCE_DB    = 'pubmed'
SEP          = '|'
BATCH_NUM    = 1000
In [8]:
# seed
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

functions¶

In [9]:
'''
make query function

base_url: base_url
params: parameter dictionary
        ex) {key1: value1, key2: value2}
'''
def mkquery(base_url, params):
    base_url += '?'
    for key, value in zip(params.keys(), params.values()):
        base_url += '{key}={value}&'.format(key=key, value=value)
    url = base_url[0:len(base_url) - 1]
    # print('request url is: ' + url)
    return url

def mkqueryprint(base_url, params, i):
    base_url += '?'
    for key, value in zip(params.keys(), params.values()):
        base_url += '{key}={value}&'.format(key=key, value=value)
    url = base_url[0:len(base_url) - 1]
    print('request url ' + str(i) + ' is: ' + url)
    return url

'''
getXmlFromURL
(mkquery wrapper)

base_url: base_url
params: parameter dictionary
        ex) {key1: value1, key2: value2}
'''
def getXmlFromURL(base_url, params):
    response = requests.get(mkquery(base_url, params))
    return ET.fromstring(response.text)

'''
getTextFromNode

root: Xml root node
path: XPath
fill: fill na string
mode: 0 = text, 1 = attribute
attrib: attribute name
'''
def getTextFromNode(root, path, fill='', mode=0, attrib='attribute'):
    if (root.find(path) == None):
        return fill
    else:
        if mode == 0:
            return root.find(path).text
        if mode == 1:
            return root.find(path).get(attrib)
    

# example
rootXml = getXmlFromURL(BASEURL_INFO, {'db': SOURCE_DB})

Statistic¶

In [10]:
# Info API
rootXml = getXmlFromURL(BASEURL_INFO, {'db': SOURCE_DB})
print(rootXml.find('DbInfo').find('Count').text)
print(rootXml.find('DbInfo').find('LastUpdate').text)
36298248
2023/10/08 20:23

Get Total Count¶

In [11]:
# get xml
rootXml = getXmlFromURL(BASEURL_SRCH, {
    'db': SOURCE_DB,
    'term': TERM,
    'usehistory': 'y',
    'datetype': DATE_TYPE,
    'mindate': MIN_DATE,
    'maxdate': MAX_DATE})
In [12]:
# get PMIDs
IdList = rootXml.find('IdList')
len(IdList)
PMIDs = []
PMIDsOneLine = ''
for child in IdList.iter():
    # 特定要素の抽出
    if child.tag == 'Id':
        PMID = child.text
        # print(PMID)
        PMIDs.append(PMID)
        # PMIDsOneLine += PMID + ","

PMIDsOneLine = ",".join(PMIDs)

PMIDsOneLine
#print('PMIDs: ', Idlist)
Out[12]:
'37318346,36753623,36753205,36753200,36753199,36753182,36452791,36075667,35779889,35779886,35779885,35779883,35590431,35483769,35462651,35242282,35100937,34980445,34980443,34980441'
In [13]:
# get querykey and webenv
Count = rootXml.find('Count').text
QueryKey = rootXml.find('QueryKey').text
WebEnv = urllib.parse.quote(rootXml.find('WebEnv').text)

print('total Count: ', Count)
print('QueryKey   : ', QueryKey)
print('WebEnv     : ', WebEnv)
total Count:  421386
QueryKey   :  1
WebEnv     :  MCID_6523bbb9d8c193677b66260e

Get All details¶

In [14]:
articleDics = []
authorArticleDics = []
authorAffiliationDics = []

def pushData(rootXml):
    for article in rootXml.iter('PubmedArticle'):
        # get article info
        articleDic = {
            'PMID'                    : getTextFromNode(article, 'MedlineCitation/PMID', ''),
            'JournalTitle'            : getTextFromNode(article, 'MedlineCitation/Article/Journal/Title', ''),
            'Title'                   : getTextFromNode(article, 'MedlineCitation/Article/ArticleTitle', ''),
            'doi'                     : getTextFromNode(article, 'MedlineCitation/Article/ELocationID[@EIdType="doi"]', ''),
            'Abstract'                : getTextFromNode(article, 'MedlineCitation/Article/Abstract/AbstractText', ''),
        #    if you want to get data in flat(denormalized), uncomment below. but it's difficult to use for analytics.
        #    'Authors'                 : SEP.join([author.find('ForeName').text + ' ' +  author.find('LastName').text if author.find('CollectiveName') == None else author.find('CollectiveName').text for author in article.findall('MedlineCitation/Article/AuthorList/')]),
        #    'AuthorIdentifiers'       : SEP.join([getTextFromNode(author, 'Identifier', 'None') for author in article.findall('MedlineCitation/Article/AuthorList/')]),
        #    'AuthorIdentifierSources' : SEP.join([getTextFromNode(author, 'Identifier', 'None', 1, 'Source') for author in article.findall('MedlineCitation/Article/AuthorList/')]),
            'Language'                : getTextFromNode(article, 'MedlineCitation/Article/Language', ''),
            'Year_A'                  : getTextFromNode(article, 'MedlineCitation/Article/ArticleDate/Year', ''),
            'Month_A'                 : getTextFromNode(article, 'MedlineCitation/Article/ArticleDate/Month', ''),
            'Day_A'                   : getTextFromNode(article, 'MedlineCitation/Article/ArticleDate/Day', ''),
            'Year_PM'                 : getTextFromNode(article, 'PubmedData/History/PubMedPubDate[@PubStatus="pubmed"]/Year', ''),
            'Month_PM'                : getTextFromNode(article, 'PubmedData/History/PubMedPubDate[@PubStatus="pubmed"]/Month', ''),
            'Day_PM'                  : getTextFromNode(article, 'PubmedData/History/PubMedPubDate[@PubStatus="pubmed"]/Day', ''),
            'Status'                  : getTextFromNode(article, './PubmedData/PublicationStatus', ''),
            'MeSH'                    : SEP.join([getTextFromNode(mesh, 'DescriptorName') for mesh in article.findall('MedlineCitation/MeshHeadingList/')]),
            'MeSH_UI'                 : SEP.join([getTextFromNode(mesh, 'DescriptorName', '', 1, 'UI') for mesh in article.findall('MedlineCitation/MeshHeadingList/')]),
            'Keyword'                 : SEP.join([keyword.text if keyword.text != None else ''  for keyword in article.findall('MedlineCitation/KeywordList/')])
        }
        articleDics.append(OrderedDict(articleDic))

        if article.find('MedlineCitation/MeshHeadingList/MeshHeading/') != None:
            tmp = article

        # get author info
        for author in article.findall('MedlineCitation/Article/AuthorList/'):

            # publish author ID
            # * It's only random id. not use for identify author. if you want to identify author, you can use identifier.
            authorId = str(uuid.uuid4())

            # author article
            authorArticleDic = {
                'authorId'         : authorId,
                'PMID'             : getTextFromNode(article, 'MedlineCitation/PMID', ''),
                'name'             : getTextFromNode(author, 'ForeName') + ' ' +  getTextFromNode(author,'LastName') if author.find('CollectiveName') == None else author.find('CollectiveName').text,
                'identifier'       : getTextFromNode(author, 'Identifier', '') ,
                'identifierSource' : getTextFromNode(author, 'Identifier', '', 1, 'Source')
            }
            authorArticleDics.append(OrderedDict(authorArticleDic))

            # author affiliation(author: affiliation = 1 : n)
            if author.find('./AffiliationInfo') != None:
                for affiliation in author.findall('./AffiliationInfo'):
                    authorAffiliationDic = {
                        'authorId'          : authorId,
                        'affiliation'       : getTextFromNode(affiliation, 'Affiliation', '') ,
                    }
                    authorAffiliationDics.append(OrderedDict(authorAffiliationDic))
In [15]:
annotation = 'MESH:D013629'
MeshId = annotation[5:]
#https://id.nlm.nih.gov/mesh/lookup/label?resource=D013629
MeshUrl = 'https://id.nlm.nih.gov/mesh/lookup/label?resource=' + MeshId
MeshResponse = requests.get(MeshUrl)
Mesh = MeshResponse.text[2:-2]

# https://nlmpubs.nlm.nih.gov/projects/mesh/MESH_FILES/xmlmesh/
MeshUrl = 'desc2023.xml'
MeshXml = ET.parse('desc2023.xml')
DicsMeshUidName = []
for MeshRecord in MeshXml.iter('DescriptorRecord'):
    MeshUidName = {
        'MeshUids' : getTextFromNode(MeshRecord, 'DescriptorUI', ''), 
        'MeshName' : getTextFromNode(MeshRecord, 'DescriptorName/String', '')
    }
    DicsMeshUidName.append(OrderedDict(MeshUidName))
# MeshUidName
df_Mesh = pd.DataFrame(DicsMeshUidName)
df_Mesh = df_Mesh.set_index('MeshUids')
df_Mesh.head(5)
Out[15]:
MeshName
MeshUids
D000001 Calcimycin
D000002 Temefos
D000003 Abattoirs
D000004 Abbreviations as Topic
D000005 Abdomen
In [16]:
# import gzip

# HomoloGeneXml = ET.parse(gzip.open('homologene.xml.gz'))
# DicsHomoloGene = []
# root = HomoloGeneXml.getroot()

# for child in root:
#    print(child.tag, child.attrib)
    

#for MeshRecord in HomoloGeneXml.iter('DescriptorRecord'):
#    MeshUidName = {
#        'MeshUids' : getTextFromNode(MeshRecord, 'DescriptorUI', ''), 
#        'MeshName' : getTextFromNode(MeshRecord, 'DescriptorName/String', '')
#    }
#    DicsMeshUidName.append(OrderedDict(MeshUidName))
# MeshUidName
#df_Mesh = pd.DataFrame(DicsMeshUidName)
#df_Mesh = df_Mesh.set_index('MeshUids')
#df_Mesh.head(5)

# (cmd) python -m gzip -d Homo_sapiens.gene_info.gz
# https://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia/
df_gene = pd.read_table('Homo_sapiens.gene_info.gz', index_col=0)
df_gene = df_gene.set_index('GeneID')
df_gene.head(5)
Out[16]:
Symbol LocusTag Synonyms dbXrefs chromosome map_location description type_of_gene Symbol_from_nomenclature_authority Full_name_from_nomenclature_authority Nomenclature_status Other_designations Modification_date Feature_type
GeneID
1 A1BG - A1B|ABG|GAB|HYST2477 MIM:138670|HGNC:HGNC:5|Ensembl:ENSG00000121410... 19 19q13.43 alpha-1-B glycoprotein protein-coding A1BG alpha-1-B glycoprotein O alpha-1B-glycoprotein|HEL-S-163pA|epididymis s... 20230621 -
2 A2M - A2MD|CPAMD5|FWP007|S863-7 MIM:103950|HGNC:HGNC:7|Ensembl:ENSG00000175899... 12 12p13.31 alpha-2-macroglobulin protein-coding A2M alpha-2-macroglobulin O alpha-2-macroglobulin|C3 and PZP-like alpha-2-... 20230801 -
3 A2MP1 - A2MP HGNC:HGNC:8|Ensembl:ENSG00000291190|AllianceGe... 12 12p13.31 alpha-2-macroglobulin pseudogene 1 pseudo A2MP1 alpha-2-macroglobulin pseudogene 1 O pregnancy-zone protein pseudogene 20230329 -
9 NAT1 - AAC1|MNAT|NAT-1|NATI MIM:108345|HGNC:HGNC:7645|Ensembl:ENSG00000171... 8 8p22 N-acetyltransferase 1 protein-coding NAT1 N-acetyltransferase 1 O arylamine N-acetyltransferase 1|N-acetyltransf... 20230724 -
10 NAT2 - AAC2|NAT-2|PNAT MIM:612182|HGNC:HGNC:7646|Ensembl:ENSG00000156... 8 8p22 N-acetyltransferase 2 protein-coding NAT2 N-acetyltransferase 2 O arylamine N-acetyltransferase 2|N-acetyltransf... 20230806 -
In [17]:
# https://ftp.ncbi.nih.gov/pub/HomoloGene/current/build_inputs/taxid_taxname
df_Taxonomy = pd.read_table('taxid_taxname.txt', names=['TaxID', 'TaxName'])
df_Taxonomy = df_Taxonomy.set_index('TaxID')
df_Taxonomy.head(5)
Out[17]:
TaxName
TaxID
10090 Mus musculus
10116 Rattus norvegicus
28985 Kluyveromyces lactis
318829 Magnaporthe oryzae
33169 Eremothecium gossypii
In [18]:
# https://ftp.expasy.org/databases/cellosaurus/
CellLineUrl = 'cellosaurus.xml'
CellLineXml = ET.parse('cellosaurus.xml')
DicsCellLine = []
# i = 0
# root = CellLineXml.getroot()
# 
# for child in root:
#     print(child.tag, child.attrib)
    
for cellline in CellLineXml.findall('cell-line-list/cell-line'):
    DicCellLine = {
        'Accession' : getTextFromNode(cellline, 'accession-list/accession[@type="primary"]', ''), 
        'CellName' : getTextFromNode(cellline, 'name-list/name[@type="identifier"]', '')
    }
    DicsCellLine.append(OrderedDict(DicCellLine))

df_CellLine = pd.DataFrame(DicsCellLine)
df_CellLine = df_CellLine.set_index('Accession')
df_CellLine.head(5)
Out[18]:
CellName
Accession
CVCL_B0T9 #132 PC3-1-SC-E8
CVCL_B0T8 #132 PL12 SC-D1
CVCL_E548 #15310-LN
CVCL_KA96 #16-15
CVCL_IW91 #40a
In [19]:
print(len(CellLineXml.findall('cell-line-list/cell-line')))
# print(i)
df_CellLine.head(5)
146062
Out[19]:
CellName
Accession
CVCL_B0T9 #132 PC3-1-SC-E8
CVCL_B0T8 #132 PL12 SC-D1
CVCL_E548 #15310-LN
CVCL_KA96 #16-15
CVCL_IW91 #40a
In [20]:
articleDicsPubTator = []
# authorArticleDics = []
# authorAffiliationDics = []

def getPubTatorGene(annotation, df_Gene):
    if annotation == '':
        gene = ''
    else:
        gene = df_Gene[annotation][Symbol]
    return gene

def getMESH(annotation, df_Mesh):
    if getTextFromNode(annotation, 'infon[@key="type"]', '') == 'MESH':
        # MESH:D013629
        MeshId = getTextFromNode(annotation, 'infon[@key="identifier"]', '')
        MeshId = MeshId[5:]
        Mesh = df_Mesh[MeshId][MeshName]
    else:
        Mesh = ''    
    return Mesh
    
def getCellLine(annotaion, df_CellLine):
    if getTextFromNode(annotation, 'infon[@key="type"]', '') == 'CellLine':
        CellLine = df_CellLine[getTextFromNode(annotation, 'infon[@key="identifier"]', '')][CellName]
    else:
        CellLine = ''
    return CellLine

def getMutation(aanotation):
    test
    return Mutation

def getSpecies(annotation, df_Taxonomy):
    if annotation == '':
        Species = ''
    else:
        Species = df_Taxonomy[annotation][TaxName]
    return Species

def getTextAnnotaion(annotation):
    pidentifier = getTextFromNode(annotation, 'infon[@key="identifier"]', '')
    ptype = getTextFromNode(annotation, 'infon[@key="type"]', '')
    # ptype: Disease, Chemical, Gene, Species, CellLine
    #if ptype == 
    
def pushDataPubTator(rootXml):
    for document in rootXml.iter('document'):
        # get article info
        for annotation in document.findall('passage/annotation/infon'):
            if annotation.attrib['key'] == 'identifier':
                PubTatorIdentifierTmp = annotation.text
            if annotation.attrib['key'] == 'type':
                PubTatorTypeTmp = annotation.text
        articleDicPubTator = {
            'PMID'                    : getTextFromNode(document, 'id', ''),
            'PubTatorIdentifier'      : SEP.join([getTextFromNode(annotation, 'infon[@key="identifier"]', '')
                                                  for annotation in document.findall('passage/annotation')]),
            'PubTatorType'            : SEP.join([getTextFromNode(annotation, 'infon[@key="type"]', '')
                                                  for annotation in document.findall('passage/annotation')]),
            'PubTatorHomoloGene'      : SEP.join([getTextFromNode(annotation, 'infon[@key="NCBI Homologene"]', '')
                                                  for annotation in document.findall('passage/annotation')]),
            'PubTatorLocation'        : SEP.join([getTextFromNode(annotation, 'location', '', 1, 'offset')
                                                  for annotation in document.findall('passage/annotation')]),
            'PubTatorText'            : SEP.join([getTextFromNode(annotation, 'text', '')
                                                  for annotation in document.findall('passage/annotation')]),
        }
        articleDicsPubTator.append(OrderedDict(articleDicPubTator))
In [21]:
# ceil
iterCount = math.ceil(int(Count) / BATCH_NUM)
iterCount = iterCountTrial # just for trial
# get all data
for i in tqdm(range(iterCount)):
    rootXml = getXmlFromURL(BASEURL_FTCH, {
        'db': SOURCE_DB,
        'query_key': QueryKey,
        'WebEnv': WebEnv,
        'retstart': i * BATCH_NUM,
        'retmax': BATCH_NUM,
        'retmode': 'xml'})
    
    pushData(rootXml)
  0%|          | 0/20 [00:00<?, ?it/s]
In [22]:
# article
df_article = pd.DataFrame(articleDics)
# ceil
iterCount = math.ceil(int(Count) / BATCH_NUM)
iterCount = 100 # just for trial
printCount = 20
# get all data
for i in tqdm(range(iterCount)):
    retstart = i * BATCH_NUM_PUBTATOR
    retend =  (i + 1) * BATCH_NUM_PUBTATOR
    ret_df_article = df_article.iloc[retstart:retend]
    ret_df_article_pmids  = ret_df_article['PMID']
    PMIDsOneLine = ",".join(ret_df_article_pmids)
    rootXmlPubTator = getXmlFromURL(BASEURL_BIOCXML, {'pmids': PMIDsOneLine})
    if i % printCount == 0:
        mkqueryprint(BASEURL_BIOCXML, {'pmids': PMIDsOneLine}, i)
    pushDataPubTator(rootXmlPubTator)
  0%|          | 0/100 [00:00<?, ?it/s]
request url 0 is: https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/biocxml?pmids=37318346,36753623,36753205,36753200,36753199,36753182,36452791,36075667,35779889,35779886,35779885,35779883,35590431,35483769,35462651,35242282,35100937,34980445,34980443,34980441,34980438,34754911,34754909,34754903,34590506,34539049,34539046,34493369,34426479,34414848,34414847,34306915,34243914,34243910,34239389,34190022,34177378,34175032,34175030,34096432,34082642,34059235,34027418,34026405,34026404,34024926,33998520,33998482,33998468,33994729,33994726,33994724,33994723,33969772,33927493,33911342,33907724,33899924,33884377,33870087,33869785,33867720,33867704,33854408,33838042,33816203,33790488,33790481,33777636,33776557,33747610,33737982,33730117,33730115,33726891,33692833,33692603,33688113,33619573,33616513,33608435,33603623,33584050,33584040,33569282,33569278,33563372,33518805,33505620,33505137,33505135,33505127,33501156,33487634,33469503,33464845,33463296,33463284,33463270,33463251
request url 20 is: https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/biocxml?pmids=32156962,32156961,32156960,32156959,32156958,32156957,32156956,32156955,32156954,32156953,32156952,32156951,32156950,32156949,32156948,32156947,32156946,32156945,32156944,32156943,32156942,32156941,32156940,32156939,32156938,32156937,32156936,32156935,32156934,32156933,32156932,32156931,32156930,32156929,32156928,32156927,32156926,32156925,32156924,32156923,32156922,32156921,32156920,32156919,32156918,32156917,32156916,32156915,32156914,32156913,32156912,32156911,32156910,32156909,32156908,32156907,32156906,32156905,32156904,32156903,32156902,32156901,32156900,32156899,32156898,32156897,32156896,32156895,32156894,32156893,32156892,32156891,32156890,32156889,32156888,32156887,32156886,32156885,32156884,32156883,32156882,32156881,32156880,32156879,32156878,32156877,32156876,32156875,32156874,32156873,32156872,32156871,32156870,32156869,32156868,32156867,32156866,32156865,32156864,32156863
request url 40 is: https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/biocxml?pmids=32001161,32001154,32001147,32001012,32000916,32000914,32000913,32000912,32000911,32000910,32000909,32000908,32000907,32000906,32000905,32000904,32000903,32000902,32000901,32000900,32000899,32000898,32000897,32000896,32000895,32000894,32000893,32000892,32000891,32000481,32000329,32000075,32000032,31999669,31999590,31999584,31999577,31999576,31999573,31999571,31999555,31999544,31998864,31998827,31998820,31998818,31998817,31998816,31998813,31998812,31998772,31998754,31998749,31998733,31998724,31998722,31998721,31998718,31998707,31998704,31998654,31998653,31998652,31998651,31998650,31998649,31998648,31998647,31998646,31998645,31998644,31998643,31998642,31998641,31998640,31998639,31998638,31998637,31998636,31998635,31998634,31998633,31998631,31998630,31998629,31998627,31998622,31998621,31998620,31998618,31998543,31998542,31998541,31998538,31998537,31998495,31998494,31998486,31998480,31998477
request url 60 is: https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/biocxml?pmids=31926773,31926772,31926768,31926762,31926733,31926732,31926730,31926722,31926715,31926713,31926701,31926698,31926696,31926682,31926681,31926680,31926667,31926648,31926628,31926624,31926622,31926619,31926609,31926587,31926585,31926584,31926582,31926580,31926576,31926575,31926566,31926501,31926493,31926475,31926469,31926441,31926403,31926384,31926377,31924585,31924584,31924574,31924526,31924513,31924512,31924481,31924480,31924432,31924412,31924365,31924363,31924355,31924335,31924315,31924055,31924044,31924042,31924041,31924033,31924031,31924029,31924026,31924025,31924023,31924012,31924010,31924007,31923999,31923997,31923988,31923987,31923986,31923975,31923965,31923958,31923957,31923953,31923949,31923941,31923940,31923934,31923860,31923859,31923858,31923847,31923822,31923791,31923786,31923762,31923747,31923745,31923744,31923732,31923687,31923684,31923676,31922704,31922703,31922508,31922503
request url 80 is: https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/biocxml?pmids=31891584,31891574,31891569,31891547,31891546,31891413,31891326,31891322,31891321,31891293,31891291,31891283,31891275,31891274,31891260,31891240,31891239,31891232,31891230,31891228,31891216,31891149,31891146,31891138,31891135,31891131,31891130,31891129,31891128,31891127,31891126,31891112,31891105,31891090,31891085,31891063,31891061,31891054,31891050,31891019,31891018,31891016,31890904,31890897,31890889,31890886,31890881,31890880,31890879,31890878,31890872,31890869,31890868,31890866,31890865,31890834,31890832,31890830,31890813,31890758,31890756,31890746,31890734,31890713,31890710,31890692,31890691,31890670,31890648,31890647,31890638,31890636,31890583,31890578,31890559,31890556,31890475,31890470,31890469,31890468,31890457,31890445,31890437,31890434,31890419,31890401,31890398,31890396,31890391,31890390,31890384,31890368,31890367,31890362,31890344,31890337,31890322,31890303,31890299,31890285
In [23]:
# documents
df_document = pd.DataFrame(articleDicsPubTator)
# df_document.iloc[0,2]
df_document.head(5)
Out[23]:
PMID PubTatorIdentifier PubTatorType PubTatorHomoloGene PubTatorLocation PubTatorText
0 33463251 MESH:D008545|MESH:D009369|MESH:D008063|CVCL_01... Disease|Disease|Chemical|CellLine|Disease|Spec... ||||||||||||||||||||||||||||||| 107|177|435|515|522|534|603|616|629|664|704|82... Malignant Melanoma|cancer|lipoic acid|B16F10|m...
1 33463270 7474|1499|MESH:D001943|MESH:D009369|MESH:D0019... Gene|Gene|Disease|Disease|Disease|Gene|Gene|Ge... 20720|1434||||20720|1434|20720|1434||20720|143... 0|4|45|138|154|281|285|407|411|435|510|514|573... Wnt|beta-Catenin|Osteogenesis for Breast Cance...
2 33463284 MESH:D004317|-|MESH:D008545|MESH:D004317|MESH:... Chemical|Chemical|Disease|Chemical|Chemical|Di... ||||||||||||||||||||||||||||| 10|29|149|165|178|233|270|401|688|747|758|768|... Doxorubicin|PEGylated Poly(Lactide-co-glycolid...
3 33463296 MESH:D001943|MESH:D001943|MESH:D009369|MESH:D0... Disease|Disease|Disease|Disease|Disease|Specie... |||||||| 20|156|243|391|560|587|830|934|1236 Antibody Drug Conjugates Targeting the Breast ...
4 33464845 MESH:D008670|MESH:D008670|84148|84148|MESH:D00... Chemical|Chemical|Gene|Gene|Disease|Gene|Disea... ||41676|41676||41676|||| 0|384|624|755|795|866|976|994|1196|1318 Metal|metal|MOF|MOF|tumors|MOF|breast cancer|l...
In [24]:
document = rootXmlPubTator.find('document')
for result in document.findall('passage/infon'):
    print(result.attrib)
    if result.attrib['key'] == 'journal':
        print(result.text)
{'key': 'journal'}
Dig Liver Dis. 2020 Feb;52(2):234. doi: 10.1016/j.dld.2019.12.003. Epub 2019 Dec 
{'key': 'year'}
{'key': 'type'}
{'key': 'authors'}
{'key': 'type'}

Store¶

In [25]:
# article
df_article = pd.DataFrame(articleDics)
df_article.head(5)
Out[25]:
PMID JournalTitle Title doi Abstract Language Year_A Month_A Day_A Year_PM Month_PM Day_PM Status MeSH MeSH_UI Keyword
0 37318346 British journal of neurosurgery Olfactory schwannomas - an enigmatic clinical ... 10.1080/02688697.2019.1661968 Olfactory Schwannomas (OS) are a rare, benign ... eng 2019 09 06 2023 6 15 ppublish Female|Humans|Aged|Neurilemmoma|Diagnosis, Dif... D005260|D006801|D000368|D009442|D003937|D012903 olfactory|schwannoma|skull base
1 36753623 Revista de salud publica (Bogota, Colombia) [Biological cancer therapies: an approach towa... 10.15446/rsap.V21n4.73686 Monoclonal antibodies are a useful tool for la... spa 2023 2 9 ppublish Humans|Neoplasms|Antibodies, Monoclonal D006801|D009369|D000911
2 36753205 Revista de salud publica (Bogota, Colombia) [Survival of young adults with oral squamous c... 10.15446/rsap.V21n5.76193 To analyze the profile and survival in young a... por 2023 2 9 ppublish Humans|Male|Young Adult|Female|Squamous Cell C... D006801|D008297|D055815|D005260|D000077195|D00...
3 36753200 Revista de salud publica (Bogota, Colombia) [Screening mammography coverage and Decennial ... 10.15446/rsap.V21n5.81275 Estimate the coverage of screening mammograms ... spa 2023 2 9 ppublish Humans|Female|Mammography|Breast Neoplasms|Col... D006801|D005260|D008327|D001943|D003105|D00840...
4 36753199 Revista de salud publica (Bogota, Colombia) [Exercise tolerance and fatigue in women survi... 10.15446/rsap.V21n5.81849 Breast cancer is the most common type of cance... spa 2023 2 9 ppublish Female|Humans|Breast Neoplasms|Exercise Tolera... D005260|D006801|D001943|D017079|D003430|D01774...
In [26]:
# author article
df_author = pd.DataFrame(authorArticleDics)
df_author.head(10)
Out[26]:
authorId PMID name identifier identifierSource
0 48c6e144-080f-4c1b-896f-ca0c57285856 37318346 Georgios F Hadjigeorgiou
1 5a92951d-637d-4c10-8950-632632f5b65e 37318346 Eleana M Strouthou
2 eabab1d4-bd9f-4bf4-b441-ac42618bb5a6 37318346 Dimitri Koulousakis
3 d2df269f-6ba6-4ce1-b762-c127464c2c00 37318346 Victor Patsouris
4 a80084a0-5e78-457d-b2e2-a8a5d2c81fe4 37318346 Frauke Neff
5 2fb1c66f-bb7b-4aff-b80f-1d2b4eabdb8e 37318346 Christianto B Lumenta
6 dfd80221-6582-40de-81a1-c14e3756251b 37318346 David B Schul
7 a671e821-beaa-48e8-ae6c-e9e5d6e4c735 36753623 Monica B Aranda
8 23a184a5-8a71-4578-9d3b-18e3384b31c4 36753623 Karina B Sabalette
9 4173a4e2-5211-42e7-a8ad-253eff820425 36753205 Marília de Matos Amorim
In [27]:
# author affiliation
df_affiliation = pd.DataFrame(authorAffiliationDics)
df_affiliation.head(10)
Out[27]:
authorId affiliation
0 48c6e144-080f-4c1b-896f-ca0c57285856 Department of Neurosugery, Klinikum Bogenhause...
1 48c6e144-080f-4c1b-896f-ca0c57285856 School of Medicine, European University of Cyp...
2 5a92951d-637d-4c10-8950-632632f5b65e School of Medicine, European University of Cyp...
3 eabab1d4-bd9f-4bf4-b441-ac42618bb5a6 Department of Neurosugery, Klinikum Bogenhause...
4 d2df269f-6ba6-4ce1-b762-c127464c2c00 Charite Medical Faculty Berlin, Berlin, Germany.
5 a80084a0-5e78-457d-b2e2-a8a5d2c81fe4 Department of Pathology, Klinikum Bogenhausen,...
6 2fb1c66f-bb7b-4aff-b80f-1d2b4eabdb8e Department of Neurosugery, Klinikum Bogenhause...
7 dfd80221-6582-40de-81a1-c14e3756251b Department of Neurosugery, Klinikum Bogenhause...
8 a671e821-beaa-48e8-ae6c-e9e5d6e4c735 MA: Contador Público. Ph. D. Gobierno y Cultur...
9 23a184a5-8a71-4578-9d3b-18e3384b31c4 KS: Lic. Biotecnología y Biología Molecular. U...
In [28]:
df_article.to_csv('pubmed_article.csv')
In [29]:
df_author.to_csv('pubmed_author.csv')
In [30]:
df_affiliation.to_csv('pubmed_affiliation.csv')

EDA¶

Reload the CSV so that you can resume the session at any time.

In [31]:
# reload
df_article = pd.read_csv('pubmed_article.csv', index_col=0)
df_author = pd.read_csv('pubmed_author.csv', index_col=0)
df_affiliation = pd.read_csv('pubmed_affiliation.csv', index_col=0)
In [32]:
print('number of article: ', len(df_article))
number of article:  9999

categorical feature¶

In [33]:
# data_canada = px.data.gapminder().query("country == 'Canada'")
# fig = px.bar(data_canada, x='year', y='pop')
# fig.show()
In [50]:
# Categorical Feature
for catCol in ['Language', 'Status']:
    df_article_count = df_article[catCol].value_counts()
    #df_article.head()
    fig = px.bar(df_article_count, x=df_article_count.index, y=df_article_count, height=400, width=800, title="{}'s unique count".format(catCol), labels={catCol: 'Count', 'index': catCol})
    fig.show()

publish distribution¶

In [35]:
# concat
# df_article['ArticlePublishDate'] = df_article['Year_A'].fillna(0).astype(int).astype(str).str.zfill(4) + '-' +  df_article['Month_A'].fillna(0).astype(int).astype(str).str.zfill(2) + '-' +  df_article['Day_A'].fillna(0).astype(int).astype(str).str.zfill(2)
df_article['ArticlePublishDate'] = df_article['Year_A'].fillna(0).astype(int).astype(str).str.zfill(4) + '-' +  df_article['Month_A'].fillna(0).astype(int).astype(str).str.zfill(2)
df_article['PubMedPublishDate'] = df_article['Year_PM'].fillna(0).astype(int).astype(str).str.zfill(4) + '-' +  df_article['Month_PM'].fillna(0).astype(int).astype(str).str.zfill(2)
In [51]:
for catCol in ['ArticlePublishDate', 'PubMedPublishDate']:
    df_article_article = df_article[df_article[catCol] != '0000-00'][catCol].value_counts().sort_index()
    #fig = px.line(df_article_article, x=df_article_article.index, y=catCol, height=600, width=1200, title="{}'s distribution".format(catCol), labels={catCol: 'Publish Count', 'index': catCol})
    fig = px.line(df_article_article, x=df_article_article.index, y=df_article_article, height=600, width=1200, title="{}'s distribution".format(catCol))
    fig.show()

what content 'not' english article includes?¶

In [37]:
df_article[df_article['Language'] != 'eng'].head(5)
Out[37]:
PMID JournalTitle Title doi Abstract Language Year_A Month_A Day_A Year_PM Month_PM Day_PM Status MeSH MeSH_UI Keyword ArticlePublishDate PubMedPublishDate
1 36753623 Revista de salud publica (Bogota, Colombia) [Biological cancer therapies: an approach towa... 10.15446/rsap.V21n4.73686 Monoclonal antibodies are a useful tool for la... spa NaN NaN NaN 2023 2 9 ppublish Humans|Neoplasms|Antibodies, Monoclonal D006801|D009369|D000911 NaN 0000-00 2023-02
2 36753205 Revista de salud publica (Bogota, Colombia) [Survival of young adults with oral squamous c... 10.15446/rsap.V21n5.76193 To analyze the profile and survival in young a... por NaN NaN NaN 2023 2 9 ppublish Humans|Male|Young Adult|Female|Squamous Cell C... D006801|D008297|D055815|D005260|D000077195|D00... NaN 0000-00 2023-02
3 36753200 Revista de salud publica (Bogota, Colombia) [Screening mammography coverage and Decennial ... 10.15446/rsap.V21n5.81275 Estimate the coverage of screening mammograms ... spa NaN NaN NaN 2023 2 9 ppublish Humans|Female|Mammography|Breast Neoplasms|Col... D006801|D005260|D008327|D001943|D003105|D00840... NaN 0000-00 2023-02
4 36753199 Revista de salud publica (Bogota, Colombia) [Exercise tolerance and fatigue in women survi... 10.15446/rsap.V21n5.81849 Breast cancer is the most common type of cance... spa NaN NaN NaN 2023 2 9 ppublish Female|Humans|Breast Neoplasms|Exercise Tolera... D005260|D006801|D001943|D017079|D003430|D01774... NaN 0000-00 2023-02
5 36753182 Revista de salud publica (Bogota, Colombia) [Sociodemographic and clinical characterizatio... 10.15446/rsap.V21n3.70678 To characterize socioeconomic, demographic, he... por NaN NaN NaN 2023 2 9 ppublish Humans|Male|Prospective Studies|Prostatic Neop... D006801|D008297|D011446|D011471|D007182|D00193... NaN 0000-00 2023-02

How many nan do articles include?¶

  • PMID is require column.
  • There are scarcely user identify ID(ORCID)
In [38]:
# False: not nan, True: is nan, values are percent
pd.merge(df_article, df_author, on='PMID', how='left').isnull().apply(lambda col: col.value_counts(), axis=0).fillna(0).astype(float).apply(lambda col: col/col.sum(), axis=0)
Out[38]:
PMID JournalTitle Title doi Abstract Language Year_A Month_A Day_A Year_PM ... Status MeSH MeSH_UI Keyword ArticlePublishDate PubMedPublishDate authorId name identifier identifierSource
False 1.0 1.0 0.994038 0.886236 0.916267 1.0 0.797636 0.797636 0.797636 1.0 ... 1.0 0.557208 0.557208 0.739556 1.0 1.0 0.999541 0.999541 0.070625 0.070625
True 0.0 0.0 0.005962 0.113764 0.083733 0.0 0.202364 0.202364 0.202364 0.0 ... 0.0 0.442792 0.442792 0.260444 0.0 0.0 0.000459 0.000459 0.929375 0.929375

2 rows × 22 columns

How many authors in each articles?¶

In [39]:
for catCol in ['authorId']:
    df_author_group = df_author.groupby('PMID', as_index=False).count()[catCol].value_counts().reset_index()
    fig = px.bar(df_author_group, x=df_author_group.index, y=catCol, height=600, width=1600, title="How many authors in each articles?".format(catCol), labels={catCol: 'Count', 'index': catCol})
    fig.show()

Is MeSH or Keyword NaN?¶

In [40]:
# False: not nan, True: is nan, values are percent
pd.DataFrame(pd.DataFrame(df_article['MeSH'].fillna('').astype(str) + df_article['Keyword'].fillna('').astype(str))[0] == '').apply(lambda col: col.value_counts(), axis=0).fillna(0).astype(float).apply(lambda col: col/col.sum(), axis=0)
Out[40]:
0
0
False 0.920692
True 0.079308

Clustering¶

In [41]:
df_article['allText'] = df_article['Title'].fillna('') + df_article['Abstract'].fillna('') + df_article['MeSH'].fillna('') + df_article['Keyword'].fillna('')
In [42]:
tfidf = TfidfVectorizer(
    min_df = 5,
    max_df = 0.95,
    max_features = 8000,
    stop_words = 'english'
)

tfidf.fit(df_article.allText)
text = tfidf.fit_transform(df_article.allText)
In [43]:
"""
Finding Optimal Clusters¶
https://www.kaggle.com/jbencina/clustering-documents-with-tfidf-and-kmeans
"""
def find_optimal_clusters(data, max_k):
    iters = range(10, max_k+1, 10)
    
    sse = []
    for k in iters:
        sse.append(MiniBatchKMeans(n_clusters=k, n_init=10, init_size=1024, batch_size=2048, random_state=20).fit(data).inertia_)
        print('Fit {} clusters'.format(k))
        
    f, ax = plt.subplots(1, 1)
    ax.plot(iters, sse, marker='o')
    ax.set_xlabel('Cluster Centers')
    ax.set_xticks(iters)
    ax.set_xticklabels(iters)
    ax.set_ylabel('SSE')
    ax.set_title('SSE by Cluster Center Plot')
    
find_optimal_clusters(text, 100)
Fit 10 clusters
Fit 20 clusters
Fit 30 clusters
Fit 40 clusters
Fit 50 clusters
Fit 60 clusters
Fit 70 clusters
Fit 80 clusters
Fit 90 clusters
Fit 100 clusters
No description has been provided for this image
In [44]:
# Clustring
clusters = MiniBatchKMeans(n_clusters=80, n_init=10, init_size=1024, batch_size=2048, random_state=20).fit_predict(text)
In [45]:
print(text.shape)
print(len(clusters))
(9999, 8000)
9999
In [46]:
# random sampling size
RANDOM_SAMPLING_SIZE = 3000

# random sampling
random_idx = np.random.choice(range(text.shape[0]), size=RANDOM_SAMPLING_SIZE, replace=False)

# t-sne(with random sampling pca)
# tsne = TSNE(random_state=RANDOM_STATE).fit_transform(PCA(n_components=50, random_state=RANDOM_STATE).fit_transform(text[random_idx,:].todense()))
tsne = TSNE(random_state=RANDOM_STATE).fit_transform(np.asarray(text[random_idx,:].todense()))

# random sampling df
df_article_tsne = df_article.iloc[random_idx]

# horizontal concat
df_article_tsne = df_article.iloc[random_idx].copy()
df_article_tsne['tsne_x'] = tsne[:, 0]
df_article_tsne['tsne_y'] = tsne[:, 1]
df_article_tsne['cluster'] = clusters[random_idx]
df_article_tsne.head()
Out[46]:
PMID JournalTitle Title doi Abstract Language Year_A Month_A Day_A Year_PM ... Status MeSH MeSH_UI Keyword ArticlePublishDate PubMedPublishDate allText tsne_x tsne_y cluster
5344 31939433 Journal of cancer research and therapeutics Risk factors for 30-day unplanned reoperation ... 10.4103/jcrt.JCRT_137_19 The purpose of this study was to investigate t... eng NaN NaN NaN 2020 ... ppublish Aged|Comorbidity|Female|Humans|Male|Middle Age... D000368|D015897|D005260|D006801|D008297|D00887... Complications|pancreatoduodenectomy|reoperatio... 0000-00 2020-01 Risk factors for 30-day unplanned reoperation ... -0.213256 0.419736 40
7444 31898683 Journal of cancer research and therapeutics Sister Mary Joseph's nodule in endometrial can... 10.4103/jcrt.JCRT_523_18 Sister Mary Joseph's nodule (SMJN) is an umbil... eng NaN NaN NaN 2020 ... ppublish Aged, 80 and over|Biopsy|Combined Modality The... D000369|D001706|D003131|D016889|D005260|D00680... Endometrial cancer|robotic surgery|umbilical m... 0000-00 2020-01 Sister Mary Joseph's nodule in endometrial can... 0.985034 3.328131 40
1731 32184851 Iranian journal of pharmaceutical research : IJPR A Comparison of Cytotoxic Effects of 10.22037/ijpr.2019.111977.13462 Natural products isolated from plant sources a... eng NaN NaN NaN 2020 ... ppublish NaN NaN Anticancer activity|Chronic Lymphocytic Leukem... 0000-00 2020-03 A Comparison of Cytotoxic Effects of Natural p... -2.783489 -5.879603 38
8719 31885567 Journal of oncology Exploring the Role of Breast Density on Cancer... 10.1155/2019/1781762 Our aim was to assess the role of breast densi... eng 2019.0 11.0 27.0 2019 ... epublish NaN NaN NaN 2019-11 2019-12 Exploring the Role of Breast Density on Cancer... -4.970211 1.333966 40
4521 31972942 The Science of the total environment Spatially resolved distribution, sources and h... 10.1016/j.scitotenv.2019.135805 This work reports the first assessment of cont... eng 2019.0 11.0 27.0 2020 ... ppublish Adult|Child|China|Cities|Dust|Environmental Mo... D000328|D002648|D002681|D002947|D004391|D00478... Contamination indices|Health risk assessment|R... 2019-11 2020-01 Spatially resolved distribution, sources and h... -1.170886 -0.506048 38

5 rows × 22 columns

In [47]:
# scatter visualize
fig = px.scatter(
    df_article_tsne, 
    x="tsne_x", 
    y="tsne_y", 
    color="cluster",
    height=1200,
    # size='petal_length', 
    color_continuous_scale=px.colors.sequential.Plasma,
    hover_data=['Title', 'PMID']
)
fig.update_layout(
    showlegend=False
) 
fig.show()
In [48]:
def get_top_keywords(data, clusters, labels, n_terms):
    df = pd.DataFrame(data.todense()).groupby(clusters).mean()
    clusterTexts = []
    
    for i,r in df.iterrows():
        top_keywords = ','.join([labels[t] for t in np.argsort(r)[-n_terms:]])
        clusterTexts.append(top_keywords)
#         print('\nCluster {}'.format(i))
#         print(top_keywords)
    return clusterTexts

clusterTexts = get_top_keywords(text, clusters, tfidf.get_feature_names_out(), 10)

find seeds¶

In [49]:
df_article['cluster'] = clusters
df_cluster = pd.DataFrame(df_article.groupby('cluster', as_index=False).count().sort_values('cluster')['PMID'].values, columns=['num_articles'])
df_cluster['keywords'] = clusterTexts
df_cluster.sort_values('num_articles')
Out[49]:
num_articles keywords
0 1 90,carcinoma,transcriptome,extended,accounting...
42 1 capacities,clinical,coding,variants,variant,ge...
43 1 arterial,tgf,rat,factor,hepatic,hypoxia,transf...
44 1 cancer,explain,various,meta,association,estima...
45 1 evaluations,tuberculous,attributed,negative,rr...
... ... ...
48 6 survival,metastatic,prostatic,mcrpc,bone,resis...
19 10 dissection,metastasis,nodes,axilla,biopsy,brea...
34 12 patients,inflammatory,prognostic,monocyte,plat...
38 3634 treatment,study,disease,care,diseases,patients...
40 6246 treatment,expression,carcinoma,patients,lung,c...

80 rows × 2 columns