# conda install numpy
# conda install matplotlib
# conda install plotly pandas
# conda install requests
# conda install -c conda-forge scikit-learn
# conda install -c conda-forge tqdm
# conda install -c conda-forge ipywidgets
# conda install -c conda-forge nbformat
# ipython kernel install --user --name=pyPubTator
# conda install -c conda-forge nodejs
# (base) conda install -c plotly plotly
# (base) conda install jupyterlab ipywidgets
# (base) conda install -c conda-forge nodejs
# (base) jupyter labextension install jupyterlab-plotly
## UserWarning: ValueError: The extension "jupyterlab-plotly" 
## does not yet support the current version of JupyterLab.
## on 2023/10/09

import math
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import requests
import urllib.parse
import uuid
import xml.etree.ElementTree as ET
from collections import OrderedDict
from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
# from tqdm import tqdm_notebook as tqdm
from tqdm.notebook import tqdm
plotly.offline.init_notebook_mode(connected=False)

# Number of efetch batches to download while experimenting; it overrides
# the batch count computed from the search result size.
iterCountTrial = 20 # just for trial

# NCBI E-utilities endpoints (einfo / esearch / efetch)
BASEURL_INFO = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi'
BASEURL_SRCH = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
BASEURL_FTCH = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'

# PubTator export endpoints, one per output format
BASEURL_PUBTATOR = 'https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/pubtator'
BASEURL_BIOCXML = 'https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/biocxml'
BASEURL_BIOCJSON = 'https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/biocjson'
# Example requests:
# https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/pubtator?pmids=28483577&concepts=gene
# https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/biocxml?pmcids=PMC6207735
# https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/biocjson?pmids=28483577,28483578,28483579
# Number of PMIDs sent to PubTator per request
BATCH_NUM_PUBTATOR    = 100

# Search parameters
TERM         = 'cancer'

# Type of date used to limit the search. The allowed values vary between
# Entrez databases; common values are 'mdat' (modification date),
# 'pdat' (publication date) and 'edat' (Entrez date). Generally an Entrez
# database will have only two allowed values for datetype.
DATE_TYPE    = 'pdat'
MIN_DATE     = '2018/01/01' # yyyy/mm/dd
MAX_DATE     = '2019/12/31' # yyyy/mm/dd

SOURCE_DB    = 'pubmed'
# Separator used when several values are flattened into one string
SEP          = '|'
# Number of records requested per efetch call (retmax)
BATCH_NUM    = 1000

# Seed NumPy's global RNG so the random sampling below is repeatable
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

def mkquery(base_url, params):
    """Build a request URL by appending ``params`` as a query string.

    Values are inserted verbatim (no percent-encoding is applied), so
    callers must pass values that are already URL-safe.

    base_url: endpoint URL without a query string
    params: parameter dictionary
            ex) {key1: value1, key2: value2}

    Returns ``base_url?key1=value1&key2=value2``. When ``params`` is empty
    the bare ``base_url`` is returned.
    """
    query = '&'.join('{key}={value}'.format(key=key, value=value)
                     for key, value in params.items())
    if not query:
        return base_url
    return base_url + '?' + query


def mkqueryprint(base_url, params, i):
    """Build the URL exactly like ``mkquery`` and also print it.

    base_url: endpoint URL without a query string
    params: parameter dictionary
    i: label (e.g. the batch index) shown in the printed message

    Returns the same URL string that ``mkquery`` returns.
    """
    url = mkquery(base_url, params)
    print('request url ' + str(i) + ' is: ' + url)
    return url

def getXmlFromURL(base_url, params, timeout=60):
    """Send a GET request built with ``mkquery`` and parse the body as XML.

    base_url: endpoint URL without a query string
    params: parameter dictionary
            ex) {key1: value1, key2: value2}
    timeout: seconds to wait for the server before giving up, so a
             stalled connection cannot hang the whole download loop

    Returns the root ``xml.etree.ElementTree.Element`` of the response.
    Raises ``requests.HTTPError`` on a 4xx/5xx response instead of trying
    to parse an error page as XML.
    """
    response = requests.get(mkquery(base_url, params), timeout=timeout)
    response.raise_for_status()
    # Parse the raw bytes so the XML declaration decides the encoding,
    # rather than the charset requests guesses for ``response.text``.
    return ET.fromstring(response.content)

def getTextFromNode(root, path, fill='', mode=0, attrib='attribute'):
    """Return the text or an attribute of the first element matching ``path``.

    root: Xml root node to search from
    path: XPath (ElementTree subset), relative to ``root``
    fill: value returned when nothing can be read
    mode: 0 = element text, 1 = attribute value
    attrib: attribute name (used only when mode is 1)

    ``fill`` is returned when no element matches, when the matched element
    has no text (mode 0), when the attribute is absent (mode 1), or when
    ``mode`` is neither 0 nor 1. The result is therefore never ``None``
    unless ``fill`` itself is ``None``, which keeps callers that join or
    concatenate the result safe.
    """
    node = root.find(path)
    if node is None:
        return fill
    if mode == 0:
        value = node.text
    elif mode == 1:
        value = node.get(attrib)
    else:
        value = None
    return fill if value is None else value
    

# Info API example: ask einfo for the database statistics once and print
# the record count and the last update time.
rootXml = getXmlFromURL(BASEURL_INFO, {'db': SOURCE_DB})
print(rootXml.find('DbInfo').find('Count').text)
print(rootXml.find('DbInfo').find('LastUpdate').text)

36298248
2023/10/08 20:23

# Run the search. 'usehistory': 'y' asks the server to keep the result set
# so that later calls can refer to it by QueryKey / WebEnv.
rootXml = getXmlFromURL(BASEURL_SRCH, {
    'db': SOURCE_DB,
    'term': TERM,
    'usehistory': 'y',
    'datetype': DATE_TYPE,
    'mindate': MIN_DATE,
    'maxdate': MAX_DATE})

# Collect the PMIDs listed in this response (extract only the <Id> elements).
# No 'retmax' is passed, so this is just the ids the server returns by
# default, not the whole result set.
IdList = rootXml.find('IdList')
PMIDs = [child.text for child in IdList.iter('Id')]
PMIDsOneLine = ",".join(PMIDs)

PMIDsOneLine
#print('PMIDs: ', Idlist)

'37318346,36753623,36753205,36753200,36753199,36753182,36452791,36075667,35779889,35779886,35779885,35779883,35590431,35483769,35462651,35242282,35100937,34980445,34980443,34980441'

# Read the total hit count and the history handles (QueryKey / WebEnv) from
# the search response; they are passed to efetch later.
Count = rootXml.find('Count').text
QueryKey = rootXml.find('QueryKey').text
# Percent-encode WebEnv here because the URL builder inserts values verbatim.
WebEnv = urllib.parse.quote(rootXml.find('WebEnv').text)

print('total Count: ', Count)
print('QueryKey   : ', QueryKey)
print('WebEnv     : ', WebEnv)

total Count:  421386
QueryKey   :  1
WebEnv     :  MCID_6523bbb9d8c193677b66260e

# Accumulators filled by pushData: one dict per article, per author
# occurrence, and per author affiliation.
articleDics = []
authorArticleDics = []
authorAffiliationDics = []

def pushData(rootXml):
    """Flatten every PubmedArticle under ``rootXml`` into the accumulators.

    rootXml: root element of an efetch response

    Appends to the module-level lists ``articleDics`` (one row per
    article), ``authorArticleDics`` (one row per author listed on an
    article) and ``authorAffiliationDics`` (one row per affiliation of such
    an author). Authors are stored in their own tables rather than as a
    flattened column on the article row. Returns nothing.
    """
    for article in rootXml.iter('PubmedArticle'):
        pmid = getTextFromNode(article, 'MedlineCitation/PMID', '')
        # A trailing '/' in an ElementTree path selects every child element,
        # i.e. these are the children of the respective list elements.
        meshHeadings = article.findall('MedlineCitation/MeshHeadingList/')
        keywords = article.findall('MedlineCitation/KeywordList/')
        authors = article.findall('MedlineCitation/Article/AuthorList/')

        # get article info
        articleDic = {
            'PMID'                    : pmid,
            'JournalTitle'            : getTextFromNode(article, 'MedlineCitation/Article/Journal/Title', ''),
            'Title'                   : getTextFromNode(article, 'MedlineCitation/Article/ArticleTitle', ''),
            'doi'                     : getTextFromNode(article, 'MedlineCitation/Article/ELocationID[@EIdType="doi"]', ''),
            'Abstract'                : getTextFromNode(article, 'MedlineCitation/Article/Abstract/AbstractText', ''),
            'Language'                : getTextFromNode(article, 'MedlineCitation/Article/Language', ''),
            'Year_A'                  : getTextFromNode(article, 'MedlineCitation/Article/ArticleDate/Year', ''),
            'Month_A'                 : getTextFromNode(article, 'MedlineCitation/Article/ArticleDate/Month', ''),
            'Day_A'                   : getTextFromNode(article, 'MedlineCitation/Article/ArticleDate/Day', ''),
            'Year_PM'                 : getTextFromNode(article, 'PubmedData/History/PubMedPubDate[@PubStatus="pubmed"]/Year', ''),
            'Month_PM'                : getTextFromNode(article, 'PubmedData/History/PubMedPubDate[@PubStatus="pubmed"]/Month', ''),
            'Day_PM'                  : getTextFromNode(article, 'PubmedData/History/PubMedPubDate[@PubStatus="pubmed"]/Day', ''),
            'Status'                  : getTextFromNode(article, './PubmedData/PublicationStatus', ''),
            # "or ''" guards the joins against elements that exist but are empty.
            'MeSH'                    : SEP.join([getTextFromNode(mesh, 'DescriptorName') or '' for mesh in meshHeadings]),
            'MeSH_UI'                 : SEP.join([getTextFromNode(mesh, 'DescriptorName', '', 1, 'UI') or '' for mesh in meshHeadings]),
            'Keyword'                 : SEP.join([keyword.text or '' for keyword in keywords])
        }
        articleDics.append(OrderedDict(articleDic))

        # get author info
        for author in authors:

            # publish author ID
            # * This is only a random id and does not identify a person across
            #   articles. Use the identifier column for that.
            authorId = str(uuid.uuid4())

            # Collective (group) authors carry a single name element.
            collective = author.find('CollectiveName')
            if collective is None:
                name = ((getTextFromNode(author, 'ForeName') or '')
                        + ' '
                        + (getTextFromNode(author, 'LastName') or ''))
            else:
                name = collective.text

            # author article
            authorArticleDic = {
                'authorId'         : authorId,
                'PMID'             : pmid,
                'name'             : name,
                'identifier'       : getTextFromNode(author, 'Identifier', ''),
                'identifierSource' : getTextFromNode(author, 'Identifier', '', 1, 'Source')
            }
            authorArticleDics.append(OrderedDict(authorArticleDic))

            # author affiliation(author: affiliation = 1 : n)
            for affiliation in author.findall('./AffiliationInfo'):
                authorAffiliationDic = {
                    'authorId'          : authorId,
                    'affiliation'       : getTextFromNode(affiliation, 'Affiliation', ''),
                }
                authorAffiliationDics.append(OrderedDict(authorAffiliationDic))

# Example: resolve one MeSH identifier to its label via the NLM lookup API.
annotation = 'MESH:D013629'
# Drop the 5-character 'MESH:' prefix to get the bare descriptor id.
MeshId = annotation[5:]
#https://id.nlm.nih.gov/mesh/lookup/label?resource=D013629
MeshUrl = 'https://id.nlm.nih.gov/mesh/lookup/label?resource=' + MeshId
MeshResponse = requests.get(MeshUrl)
# Strips the first and last two characters of the body. NOTE(review):
# presumably the '["' and '"]' of a one-element JSON list; confirm the
# response format.
Mesh = MeshResponse.text[2:-2]

# Build a MeSH descriptor id -> name table from the local descriptor file.
# Download source: https://nlmpubs.nlm.nih.gov/projects/mesh/MESH_FILES/xmlmesh/
MeshUrl = 'desc2023.xml'
MeshXml = ET.parse(MeshUrl)
DicsMeshUidName = [
    OrderedDict([
        ('MeshUids', getTextFromNode(MeshRecord, 'DescriptorUI', '')),
        ('MeshName', getTextFromNode(MeshRecord, 'DescriptorName/String', '')),
    ])
    for MeshRecord in MeshXml.iter('DescriptorRecord')
]
# Naming the columns keeps set_index working even if no record was found.
df_Mesh = pd.DataFrame(DicsMeshUidName, columns=['MeshUids', 'MeshName'])
df_Mesh = df_Mesh.set_index('MeshUids')
df_Mesh.head(5)

# import gzip

# HomoloGeneXml = ET.parse(gzip.open('homologene.xml.gz'))
# DicsHomoloGene = []
# root = HomoloGeneXml.getroot()

# for child in root:
#    print(child.tag, child.attrib)
    

#for MeshRecord in HomoloGeneXml.iter('DescriptorRecord'):
#    MeshUidName = {
#        'MeshUids' : getTextFromNode(MeshRecord, 'DescriptorUI', ''), 
#        'MeshName' : getTextFromNode(MeshRecord, 'DescriptorName/String', '')
#    }
#    DicsMeshUidName.append(OrderedDict(MeshUidName))
# MeshUidName
#df_Mesh = pd.DataFrame(DicsMeshUidName)
#df_Mesh = df_Mesh.set_index('MeshUids')
#df_Mesh.head(5)

# Gene table, re-indexed by GeneID.
# Download source: https://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia/
# To unpack the file manually: (cmd) python -m gzip -d Homo_sapiens.gene_info.gz
# The first file column is read as the index and is then replaced by GeneID.
df_gene = pd.read_table('Homo_sapiens.gene_info.gz', index_col=0)
df_gene = df_gene.set_index('GeneID')
df_gene.head(5)

# Taxonomy id -> name table; the column names are supplied here via `names`.
# https://ftp.ncbi.nih.gov/pub/HomoloGene/current/build_inputs/taxid_taxname
df_Taxonomy = pd.read_table('taxid_taxname.txt', names=['TaxID', 'TaxName'])
df_Taxonomy = df_Taxonomy.set_index('TaxID')
df_Taxonomy.head(5)

# Build a cell line accession -> name table from the local Cellosaurus file.
# Download source: https://ftp.expasy.org/databases/cellosaurus/
CellLineUrl = 'cellosaurus.xml'
CellLineXml = ET.parse(CellLineUrl)
# Query the document once and reuse the result for the table and the count.
CellLines = CellLineXml.findall('cell-line-list/cell-line')
DicsCellLine = [
    OrderedDict([
        ('Accession', getTextFromNode(cellline, 'accession-list/accession[@type="primary"]', '')),
        ('CellName', getTextFromNode(cellline, 'name-list/name[@type="identifier"]', '')),
    ])
    for cellline in CellLines
]

# Naming the columns keeps set_index working even if no cell line was found.
df_CellLine = pd.DataFrame(DicsCellLine, columns=['Accession', 'CellName'])
df_CellLine = df_CellLine.set_index('Accession')

print(len(CellLines))
df_CellLine.head(5)

146062

# Accumulator filled by pushDataPubTator: one dict per PubTator document.
articleDicsPubTator = []
# authorArticleDics = []
# authorAffiliationDics = []

def getPubTatorGene(annotation, df_Gene):
    """Look up the gene symbol for a gene identifier.

    annotation: gene identifier (string or int); '' means "no gene"
    df_Gene: DataFrame indexed by gene id with a 'Symbol' column

    Returns the symbol, or '' when ``annotation`` is empty or not present
    in the index.
    """
    if annotation == '':
        return ''
    key = annotation
    # Identifiers parsed from XML are strings while the table index may be
    # numeric, so retry the lookup with the integer form.
    if key not in df_Gene.index and str(key).isdigit():
        key = int(key)
    if key not in df_Gene.index:
        return ''
    return df_Gene.at[key, 'Symbol']

def getMESH(annotation, df_Mesh):
    """Return the MeSH descriptor name referenced by a PubTator annotation.

    annotation: <annotation> element with <infon key="identifier"> children
    df_Mesh: DataFrame indexed by descriptor id with a 'MeshName' column

    The identifier is expected in the form 'MESH:D013629'. Returns '' when
    the identifier does not carry the 'MESH:' prefix or the descriptor id
    is not present in ``df_Mesh``.
    """
    prefix = 'MESH:'
    identifier = annotation.findtext('infon[@key="identifier"]', default='') or ''
    # The annotation type names the concept category, so MeSH-backed
    # annotations are recognised by the identifier prefix instead.
    if not identifier.startswith(prefix):
        return ''
    MeshId = identifier[len(prefix):]
    if MeshId not in df_Mesh.index:
        return ''
    return df_Mesh.at[MeshId, 'MeshName']
    
def getCellLine(annotaion, df_CellLine):
    """Return the cell line name referenced by a PubTator annotation.

    annotaion: <annotation> element with <infon key="type"> and
               <infon key="identifier"> children (the parameter name keeps
               its original spelling so keyword callers still work)
    df_CellLine: DataFrame indexed by accession with a 'CellName' column

    Returns '' when the annotation type is not 'CellLine' or the
    identifier is not present in ``df_CellLine``.
    """
    if annotaion.findtext('infon[@key="type"]', default='') != 'CellLine':
        return ''
    accession = annotaion.findtext('infon[@key="identifier"]', default='')
    if accession not in df_CellLine.index:
        return ''
    return df_CellLine.at[accession, 'CellName']

def getMutation(aanotation):
    """Placeholder for resolving a mutation annotation.

    aanotation: PubTator annotation (currently unused)

    No lookup has been written yet. Raises NotImplementedError so that an
    accidental call fails with a clear message.
    """
    raise NotImplementedError('getMutation is not implemented yet')

def getSpecies(annotation, df_Taxonomy):
    """Look up the taxonomy name for a taxonomy identifier.

    annotation: taxonomy identifier (string or int); '' means "no species"
    df_Taxonomy: DataFrame indexed by taxonomy id with a 'TaxName' column

    Returns the name, or '' when ``annotation`` is empty or not present in
    the index.
    """
    if annotation == '':
        return ''
    key = annotation
    # Identifiers parsed from XML are strings while the table index may be
    # numeric, so retry the lookup with the integer form.
    if key not in df_Taxonomy.index and str(key).isdigit():
        key = int(key)
    if key not in df_Taxonomy.index:
        return ''
    return df_Taxonomy.at[key, 'TaxName']

def getTextAnnotaion(annotation):
    """Read the type and identifier infons of a PubTator annotation.

    Unfinished: both values are looked up but nothing is done with them
    yet, so the function always returns None.
    """
    # Types noted so far: Disease, Chemical, Gene, Species, CellLine
    infon_type = getTextFromNode(annotation, 'infon[@key="type"]', '')
    infon_identifier = getTextFromNode(annotation, 'infon[@key="identifier"]', '')
    return None
    
def _joinAnnotationField(annotations, path, mode=0, attrib='attribute'):
    """Join one field of every annotation into a single SEP-separated string."""
    # "or ''" guards the join against elements that exist but are empty.
    return SEP.join([getTextFromNode(annotation, path, '', mode, attrib) or ''
                     for annotation in annotations])


def pushDataPubTator(rootXml):
    """Flatten every document under ``rootXml`` into ``articleDicsPubTator``.

    rootXml: root element of a PubTator BioC XML response

    Appends one row per <document>. Each annotation field is stored as a
    SEP-joined string, and all of these strings list the annotations in the
    same order, so position n refers to the same annotation in every
    column. Returns nothing.
    """
    for document in rootXml.iter('document'):
        # Collect the annotations once and reuse them for every column.
        annotations = document.findall('passage/annotation')
        articleDicPubTator = {
            'PMID'                    : getTextFromNode(document, 'id', ''),
            'PubTatorIdentifier'      : _joinAnnotationField(annotations, 'infon[@key="identifier"]'),
            'PubTatorType'            : _joinAnnotationField(annotations, 'infon[@key="type"]'),
            'PubTatorHomoloGene'      : _joinAnnotationField(annotations, 'infon[@key="NCBI Homologene"]'),
            'PubTatorLocation'        : _joinAnnotationField(annotations, 'location', 1, 'offset'),
            'PubTatorText'            : _joinAnnotationField(annotations, 'text'),
        }
        articleDicsPubTator.append(OrderedDict(articleDicPubTator))

# Number of batches needed to download every hit (rounded up) ...
iterCount = math.ceil(int(Count) / BATCH_NUM)
# ... replaced by the trial cap, so only the first batches are downloaded.
iterCount = iterCountTrial # just for trial
# Download the records batch by batch from the stored search result
# (QueryKey / WebEnv) and flatten each response with pushData.
for i in tqdm(range(iterCount)):
    rootXml = getXmlFromURL(BASEURL_FTCH, {
        'db': SOURCE_DB,
        'query_key': QueryKey,
        'WebEnv': WebEnv,
        'retstart': i * BATCH_NUM,
        'retmax': BATCH_NUM,
        'retmode': 'xml'})
    
    pushData(rootXml)

  0%|          | 0/20 [00:00<?, ?it/s]

# article
df_article = pd.DataFrame(articleDics)
# One PubTator request per BATCH_NUM_PUBTATOR downloaded articles, capped at
# 100 requests for the trial run. Bounding the count by the number of
# downloaded articles avoids sending requests with an empty pmids list.
iterCount = min(100, math.ceil(len(df_article) / BATCH_NUM_PUBTATOR))
# Print the request URL for every printCount-th batch only.
printCount = 20
# get all data
for i in tqdm(range(iterCount)):
    retstart = i * BATCH_NUM_PUBTATOR
    retend = retstart + BATCH_NUM_PUBTATOR
    ret_df_article = df_article.iloc[retstart:retend]
    ret_df_article_pmids = ret_df_article['PMID']
    PMIDsOneLine = ",".join(ret_df_article_pmids)
    pubtatorParams = {'pmids': PMIDsOneLine}
    rootXmlPubTator = getXmlFromURL(BASEURL_BIOCXML, pubtatorParams)
    if i % printCount == 0:
        mkqueryprint(BASEURL_BIOCXML, pubtatorParams, i)
    pushDataPubTator(rootXmlPubTator)

  0%|          | 0/100 [00:00<?, ?it/s]

request url 0 is: https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/biocxml?pmids=37318346,36753623,36753205,36753200,36753199,36753182,36452791,36075667,35779889,35779886,35779885,35779883,35590431,35483769,35462651,35242282,35100937,34980445,34980443,34980441,34980438,34754911,34754909,34754903,34590506,34539049,34539046,34493369,34426479,34414848,34414847,34306915,34243914,34243910,34239389,34190022,34177378,34175032,34175030,34096432,34082642,34059235,34027418,34026405,34026404,34024926,33998520,33998482,33998468,33994729,33994726,33994724,33994723,33969772,33927493,33911342,33907724,33899924,33884377,33870087,33869785,33867720,33867704,33854408,33838042,33816203,33790488,33790481,33777636,33776557,33747610,33737982,33730117,33730115,33726891,33692833,33692603,33688113,33619573,33616513,33608435,33603623,33584050,33584040,33569282,33569278,33563372,33518805,33505620,33505137,33505135,33505127,33501156,33487634,33469503,33464845,33463296,33463284,33463270,33463251
request url 20 is: https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/biocxml?pmids=32156962,32156961,32156960,32156959,32156958,32156957,32156956,32156955,32156954,32156953,32156952,32156951,32156950,32156949,32156948,32156947,32156946,32156945,32156944,32156943,32156942,32156941,32156940,32156939,32156938,32156937,32156936,32156935,32156934,32156933,32156932,32156931,32156930,32156929,32156928,32156927,32156926,32156925,32156924,32156923,32156922,32156921,32156920,32156919,32156918,32156917,32156916,32156915,32156914,32156913,32156912,32156911,32156910,32156909,32156908,32156907,32156906,32156905,32156904,32156903,32156902,32156901,32156900,32156899,32156898,32156897,32156896,32156895,32156894,32156893,32156892,32156891,32156890,32156889,32156888,32156887,32156886,32156885,32156884,32156883,32156882,32156881,32156880,32156879,32156878,32156877,32156876,32156875,32156874,32156873,32156872,32156871,32156870,32156869,32156868,32156867,32156866,32156865,32156864,32156863
request url 40 is: https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/biocxml?pmids=32001161,32001154,32001147,32001012,32000916,32000914,32000913,32000912,32000911,32000910,32000909,32000908,32000907,32000906,32000905,32000904,32000903,32000902,32000901,32000900,32000899,32000898,32000897,32000896,32000895,32000894,32000893,32000892,32000891,32000481,32000329,32000075,32000032,31999669,31999590,31999584,31999577,31999576,31999573,31999571,31999555,31999544,31998864,31998827,31998820,31998818,31998817,31998816,31998813,31998812,31998772,31998754,31998749,31998733,31998724,31998722,31998721,31998718,31998707,31998704,31998654,31998653,31998652,31998651,31998650,31998649,31998648,31998647,31998646,31998645,31998644,31998643,31998642,31998641,31998640,31998639,31998638,31998637,31998636,31998635,31998634,31998633,31998631,31998630,31998629,31998627,31998622,31998621,31998620,31998618,31998543,31998542,31998541,31998538,31998537,31998495,31998494,31998486,31998480,31998477
request url 60 is: https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/biocxml?pmids=31926773,31926772,31926768,31926762,31926733,31926732,31926730,31926722,31926715,31926713,31926701,31926698,31926696,31926682,31926681,31926680,31926667,31926648,31926628,31926624,31926622,31926619,31926609,31926587,31926585,31926584,31926582,31926580,31926576,31926575,31926566,31926501,31926493,31926475,31926469,31926441,31926403,31926384,31926377,31924585,31924584,31924574,31924526,31924513,31924512,31924481,31924480,31924432,31924412,31924365,31924363,31924355,31924335,31924315,31924055,31924044,31924042,31924041,31924033,31924031,31924029,31924026,31924025,31924023,31924012,31924010,31924007,31923999,31923997,31923988,31923987,31923986,31923975,31923965,31923958,31923957,31923953,31923949,31923941,31923940,31923934,31923860,31923859,31923858,31923847,31923822,31923791,31923786,31923762,31923747,31923745,31923744,31923732,31923687,31923684,31923676,31922704,31922703,31922508,31922503
request url 80 is: https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/biocxml?pmids=31891584,31891574,31891569,31891547,31891546,31891413,31891326,31891322,31891321,31891293,31891291,31891283,31891275,31891274,31891260,31891240,31891239,31891232,31891230,31891228,31891216,31891149,31891146,31891138,31891135,31891131,31891130,31891129,31891128,31891127,31891126,31891112,31891105,31891090,31891085,31891063,31891061,31891054,31891050,31891019,31891018,31891016,31890904,31890897,31890889,31890886,31890881,31890880,31890879,31890878,31890872,31890869,31890868,31890866,31890865,31890834,31890832,31890830,31890813,31890758,31890756,31890746,31890734,31890713,31890710,31890692,31890691,31890670,31890648,31890647,31890638,31890636,31890583,31890578,31890559,31890556,31890475,31890470,31890469,31890468,31890457,31890445,31890437,31890434,31890419,31890401,31890398,31890396,31890391,31890390,31890384,31890368,31890367,31890362,31890344,31890337,31890322,31890303,31890299,31890285

# One row per PubTator document collected by pushDataPubTator
df_document = pd.DataFrame(articleDicsPubTator)
# df_document.iloc[0,2]
df_document.head(5)

# Inspect the passage-level infon elements of the first document in the
# last response: print each attribute dict, plus the text of 'journal'.
document = rootXmlPubTator.find('document')
for result in document.findall('passage/infon'):
    print(result.attrib)
    if result.attrib['key'] == 'journal':
        print(result.text)

{'key': 'journal'}
Dig Liver Dis. 2020 Feb;52(2):234. doi: 10.1016/j.dld.2019.12.003. Epub 2019 Dec 
{'key': 'year'}
{'key': 'type'}
{'key': 'authors'}
{'key': 'type'}

# Turn the three accumulators into DataFrames.
# article
df_article = pd.DataFrame(articleDics)
df_article.head(5)

# author article
df_author = pd.DataFrame(authorArticleDics)
df_author.head(10)

# author affiliation
df_affiliation = pd.DataFrame(authorAffiliationDics)
df_affiliation.head(10)

# Persist the tables so the analysis below can run without downloading again.
df_article.to_csv('pubmed_article.csv')

df_author.to_csv('pubmed_author.csv')

df_affiliation.to_csv('pubmed_affiliation.csv')

# Reload from disk; the first CSV column is the saved row index.
# NOTE(review): read_csv re-infers column types, so values that were
# strings before saving (e.g. PMID, empty fields) presumably come back as
# numbers / NaN - confirm before relying on dtypes.
df_article = pd.read_csv('pubmed_article.csv', index_col=0)
df_author = pd.read_csv('pubmed_author.csv', index_col=0)
df_affiliation = pd.read_csv('pubmed_affiliation.csv', index_col=0)

print('number of article: ', len(df_article))

number of article:  9999

# data_canada = px.data.gapminder().query("country == 'Canada'")
# fig = px.bar(data_canada, x='year', y='pop')
# fig.show()

# Categorical features: bar chart of how often each value occurs
for catCol in ['Language', 'Status']:
    df_article_count = df_article[catCol].value_counts()
    #df_article.head()
    fig = px.bar(df_article_count, x=df_article_count.index, y=df_article_count, height=400, width=800, title="{}'s unique count".format(catCol), labels={catCol: 'Count', 'index': catCol})
    fig.show()

# Build 'YYYY-MM' strings from the year / month columns; missing values
# become 0 and therefore yield '0000-00'.
# df_article['ArticlePublishDate'] = df_article['Year_A'].fillna(0).astype(int).astype(str).str.zfill(4) + '-' +  df_article['Month_A'].fillna(0).astype(int).astype(str).str.zfill(2) + '-' +  df_article['Day_A'].fillna(0).astype(int).astype(str).str.zfill(2)
df_article['ArticlePublishDate'] = df_article['Year_A'].fillna(0).astype(int).astype(str).str.zfill(4) + '-' +  df_article['Month_A'].fillna(0).astype(int).astype(str).str.zfill(2)
df_article['PubMedPublishDate'] = df_article['Year_PM'].fillna(0).astype(int).astype(str).str.zfill(4) + '-' +  df_article['Month_PM'].fillna(0).astype(int).astype(str).str.zfill(2)

# Articles per month, skipping rows without a date ('0000-00')
for catCol in ['ArticlePublishDate', 'PubMedPublishDate']:
    df_article_article = df_article[df_article[catCol] != '0000-00'][catCol].value_counts().sort_index()
    #fig = px.line(df_article_article, x=df_article_article.index, y=catCol, height=600, width=1200, title="{}'s distribution".format(catCol), labels={catCol: 'Publish Count', 'index': catCol})
    fig = px.line(df_article_article, x=df_article_article.index, y=df_article_article, height=600, width=1200, title="{}'s distribution".format(catCol))
    fig.show()

# Peek at the articles whose language is not 'eng'
df_article[df_article['Language'] != 'eng'].head(5)

# Share of missing values per column after joining authors onto articles.
# Rows: False = not NaN, True = NaN; values are fractions of all rows.
pd.merge(df_article, df_author, on='PMID', how='left').isnull().apply(lambda col: col.value_counts(), axis=0).fillna(0).astype(float).apply(lambda col: col/col.sum(), axis=0)

# Distribution of the number of authors per article.
# NOTE: the title string has no '{}' placeholder, so .format() leaves it as is.
for catCol in ['authorId']:
    df_author_group = df_author.groupby('PMID', as_index=False).count()[catCol].value_counts().reset_index()
    fig = px.bar(df_author_group, x=df_author_group.index, y=catCol, height=600, width=1600, title="How many authors in each articles?".format(catCol), labels={catCol: 'Count', 'index': catCol})
    fig.show()

# Share of articles that have neither MeSH terms nor keywords.
# Rows: False = has at least one, True = has neither; values are fractions.
pd.DataFrame(pd.DataFrame(df_article['MeSH'].fillna('').astype(str) + df_article['Keyword'].fillna('').astype(str))[0] == '').apply(lambda col: col.value_counts(), axis=0).fillna(0).astype(float).apply(lambda col: col/col.sum(), axis=0)

# Concatenate every text field into one document per article. The fields
# are separated by a space so that the last word of one field and the first
# word of the next are not fused into a single token.
df_article['allText'] = (
    df_article['Title'].fillna('') + ' '
    + df_article['Abstract'].fillna('') + ' '
    + df_article['MeSH'].fillna('') + ' '
    + df_article['Keyword'].fillna('')
)

tfidf = TfidfVectorizer(
    min_df = 5,
    max_df = 0.95,
    max_features = 8000,
    stop_words = 'english'
)

# fit_transform learns the vocabulary and vectorises in a single pass; a
# separate fit() beforehand would only repeat the same work.
text = tfidf.fit_transform(df_article.allText)

"""
Finding Optimal Clusters¶
https://www.kaggle.com/jbencina/clustering-documents-with-tfidf-and-kmeans
"""
def find_optimal_clusters(data, max_k, step=10, random_state=20):
    """Plot the k-means SSE (inertia) for a range of cluster counts.

    data: feature matrix to cluster
    max_k: largest number of clusters to try (inclusive)
    step: first cluster count and the increment between counts
    random_state: seed passed to MiniBatchKMeans

    Fits one MiniBatchKMeans per cluster count step, 2*step, ... up to
    max_k, prints a progress line per fit and draws SSE against the number
    of clusters on a new matplotlib figure. Returns nothing.
    """
    iters = list(range(step, max_k + 1, step))

    sse = []
    for k in iters:
        model = MiniBatchKMeans(n_clusters=k, n_init=10, init_size=1024,
                                batch_size=2048, random_state=random_state)
        sse.append(model.fit(data).inertia_)
        print('Fit {} clusters'.format(k))

    f, ax = plt.subplots(1, 1)
    ax.plot(iters, sse, marker='o')
    ax.set_xlabel('Cluster Centers')
    ax.set_xticks(iters)
    ax.set_xticklabels(iters)
    ax.set_ylabel('SSE')
    ax.set_title('SSE by Cluster Center Plot')

find_optimal_clusters(text, 100)

Fit 10 clusters
Fit 20 clusters
Fit 30 clusters
Fit 40 clusters
Fit 50 clusters
Fit 60 clusters
Fit 70 clusters
Fit 80 clusters
Fit 90 clusters
Fit 100 clusters

# Clustering: assign each article to one of 80 k-means clusters
clusters = MiniBatchKMeans(n_clusters=80, n_init=10, init_size=1024, batch_size=2048, random_state=20).fit_predict(text)

# Sanity check: one cluster label per row of the TF-IDF matrix
print(text.shape)
print(len(clusters))

(9999, 8000)
9999

# random sampling size
RANDOM_SAMPLING_SIZE = 3000

# Random sample of row positions without replacement; the size is capped at
# the number of rows so a small data set does not raise.
random_idx = np.random.choice(
    range(text.shape[0]),
    size=min(RANDOM_SAMPLING_SIZE, text.shape[0]),
    replace=False)

# t-SNE on the sampled rows, converted from sparse to a dense array
# (alternative with a PCA step first):
# tsne = TSNE(random_state=RANDOM_STATE).fit_transform(PCA(n_components=50, random_state=RANDOM_STATE).fit_transform(text[random_idx,:].todense()))
tsne = TSNE(random_state=RANDOM_STATE).fit_transform(text[random_idx, :].toarray())

# Sampled articles plus their t-SNE coordinates and cluster label; the
# copy keeps the column assignments from touching df_article.
df_article_tsne = df_article.iloc[random_idx].copy()
df_article_tsne['tsne_x'] = tsne[:, 0]
df_article_tsne['tsne_y'] = tsne[:, 1]
df_article_tsne['cluster'] = clusters[random_idx]
df_article_tsne.head()

# scatter visualize
fig = px.scatter(
    df_article_tsne,
    x="tsne_x",
    y="tsne_y",
    color="cluster",
    height=1200,
    # size='petal_length',
    color_continuous_scale=px.colors.sequential.Plasma,
    hover_data=['Title', 'PMID']
)
fig.update_layout(
    showlegend=False
)
fig.show()

def get_top_keywords(data, clusters, labels, n_terms):
    """Return the highest-weighted terms of each cluster as comma-joined strings.

    Rows of ``data`` are grouped by their entry in ``clusters`` and averaged
    column-wise; for every group the ``n_terms`` columns with the largest
    mean are looked up in ``labels``.

    Args:
        data: 2-D matrix with one row per document and one column per term.
            Inputs exposing ``toarray()`` (e.g. scipy sparse matrices) are
            densified; anything else is passed through ``np.asarray``.
        clusters: cluster label for each row of ``data``.
        labels: sequence mapping a column position to its term.
        n_terms: number of terms to keep per cluster. Values <= 0 yield an
            empty string for every cluster.

    Returns:
        list[str]: one string per distinct cluster label, in the order
        produced by ``DataFrame.groupby`` (sorted labels). Within a string
        the terms are in ascending order of mean weight, so the strongest
        term comes last.
    """
    dense = data.toarray() if hasattr(data, 'toarray') else np.asarray(data)
    cluster_means = pd.DataFrame(dense).groupby(clusters).mean().to_numpy()

    if n_terms <= 0:
        # Guard needed because the slice [-0:] selects *every* column rather
        # than none, which would return all terms for n_terms == 0.
        return ['' for _ in cluster_means]

    return [
        ','.join(labels[t] for t in np.argsort(row)[-n_terms:])
        for row in cluster_means
    ]

# Top-10 terms per cluster, taken from the fitted TF-IDF vocabulary.
clusterTexts = get_top_keywords(
    text,
    clusters,
    tfidf.get_feature_names_out(),
    10,
)

# Attach the cluster label to every article.
df_article['cluster'] = clusters

# One row per cluster (ordered by cluster label): number of articles with a
# non-null PMID, plus the cluster's keyword string.
df_cluster = pd.DataFrame({
    'num_articles': df_article.groupby('cluster')['PMID'].count().to_numpy(),
    'keywords': clusterTexts,
})
df_cluster.sort_values('num_articles')

	MeshName
MeshUids
D000001	Calcimycin
D000002	Temefos
D000003	Abattoirs
D000004	Abbreviations as Topic
D000005	Abdomen

	Symbol	LocusTag	Synonyms	dbXrefs	chromosome	map_location	description	type_of_gene	Symbol_from_nomenclature_authority	Full_name_from_nomenclature_authority	Nomenclature_status	Other_designations	Modification_date	Feature_type
GeneID
1	A1BG	-	A1B\|ABG\|GAB\|HYST2477	MIM:138670\|HGNC:HGNC:5\|Ensembl:ENSG00000121410...	19	19q13.43	alpha-1-B glycoprotein	protein-coding	A1BG	alpha-1-B glycoprotein	O	alpha-1B-glycoprotein\|HEL-S-163pA\|epididymis s...	20230621	-
2	A2M	-	A2MD\|CPAMD5\|FWP007\|S863-7	MIM:103950\|HGNC:HGNC:7\|Ensembl:ENSG00000175899...	12	12p13.31	alpha-2-macroglobulin	protein-coding	A2M	alpha-2-macroglobulin	O	alpha-2-macroglobulin\|C3 and PZP-like alpha-2-...	20230801	-
3	A2MP1	-	A2MP	HGNC:HGNC:8\|Ensembl:ENSG00000291190\|AllianceGe...	12	12p13.31	alpha-2-macroglobulin pseudogene 1	pseudo	A2MP1	alpha-2-macroglobulin pseudogene 1	O	pregnancy-zone protein pseudogene	20230329	-
9	NAT1	-	AAC1\|MNAT\|NAT-1\|NATI	MIM:108345\|HGNC:HGNC:7645\|Ensembl:ENSG00000171...	8	8p22	N-acetyltransferase 1	protein-coding	NAT1	N-acetyltransferase 1	O	arylamine N-acetyltransferase 1\|N-acetyltransf...	20230724	-
10	NAT2	-	AAC2\|NAT-2\|PNAT	MIM:612182\|HGNC:HGNC:7646\|Ensembl:ENSG00000156...	8	8p22	N-acetyltransferase 2	protein-coding	NAT2	N-acetyltransferase 2	O	arylamine N-acetyltransferase 2\|N-acetyltransf...	20230806	-

	TaxName
TaxID
10090	Mus musculus
10116	Rattus norvegicus
28985	Kluyveromyces lactis
318829	Magnaporthe oryzae
33169	Eremothecium gossypii

	CellName
Accession
CVCL_B0T9	#132 PC3-1-SC-E8
CVCL_B0T8	#132 PL12 SC-D1
CVCL_E548	#15310-LN
CVCL_KA96	#16-15
CVCL_IW91	#40a

	CellName
Accession
CVCL_B0T9	#132 PC3-1-SC-E8
CVCL_B0T8	#132 PL12 SC-D1
CVCL_E548	#15310-LN
CVCL_KA96	#16-15
CVCL_IW91	#40a

What Is the Output?¶

Import Library¶

Parameters¶

functions¶

Statistic¶

Get Total Count¶

Get All details¶

Store¶

EDA¶

categorical feature¶

publish distribution¶

What content do non-English articles include?¶

How many NaN values do the articles include?¶

How many authors does each article have?¶

Is MeSH or Keyword NaN?¶

Clustering¶

find seeds¶

	PMID	PubTatorIdentifier	PubTatorType	PubTatorHomoloGene	PubTatorLocation	PubTatorText
0	33463251	MESH:D008545\|MESH:D009369\|MESH:D008063\|CVCL_01...	Disease\|Disease\|Chemical\|CellLine\|Disease\|Spec...	\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|	107\|177\|435\|515\|522\|534\|603\|616\|629\|664\|704\|82...	Malignant Melanoma\|cancer\|lipoic acid\|B16F10\|m...
1	33463270	7474\|1499\|MESH:D001943\|MESH:D009369\|MESH:D0019...	Gene\|Gene\|Disease\|Disease\|Disease\|Gene\|Gene\|Ge...	20720\|1434\|\|\|\|20720\|1434\|20720\|1434\|\|20720\|143...	0\|4\|45\|138\|154\|281\|285\|407\|411\|435\|510\|514\|573...	Wnt\|beta-Catenin\|Osteogenesis for Breast Cance...
2	33463284	MESH:D004317\|-\|MESH:D008545\|MESH:D004317\|MESH:...	Chemical\|Chemical\|Disease\|Chemical\|Chemical\|Di...	\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|	10\|29\|149\|165\|178\|233\|270\|401\|688\|747\|758\|768\|...	Doxorubicin\|PEGylated Poly(Lactide-co-glycolid...
3	33463296	MESH:D001943\|MESH:D001943\|MESH:D009369\|MESH:D0...	Disease\|Disease\|Disease\|Disease\|Disease\|Specie...	\|\|\|\|\|\|\|\|	20\|156\|243\|391\|560\|587\|830\|934\|1236	Antibody Drug Conjugates Targeting the Breast ...
4	33464845	MESH:D008670\|MESH:D008670\|84148\|84148\|MESH:D00...	Chemical\|Chemical\|Gene\|Gene\|Disease\|Gene\|Disea...	\|\|41676\|41676\|\|41676\|\|\|\|	0\|384\|624\|755\|795\|866\|976\|994\|1196\|1318	Metal\|metal\|MOF\|MOF\|tumors\|MOF\|breast cancer\|l...

	PMID	JournalTitle	Title	doi	Abstract	Language	Year_A	Month_A	Day_A	Year_PM	Month_PM	Day_PM	Status	MeSH	MeSH_UI	Keyword
0	37318346	British journal of neurosurgery	Olfactory schwannomas - an enigmatic clinical ...	10.1080/02688697.2019.1661968	Olfactory Schwannomas (OS) are a rare, benign ...	eng	2019	09	06	2023	6	15	ppublish	Female\|Humans\|Aged\|Neurilemmoma\|Diagnosis, Dif...	D005260\|D006801\|D000368\|D009442\|D003937\|D012903	olfactory\|schwannoma\|skull base
1	36753623	Revista de salud publica (Bogota, Colombia)	[Biological cancer therapies: an approach towa...	10.15446/rsap.V21n4.73686	Monoclonal antibodies are a useful tool for la...	spa				2023	2	9	ppublish	Humans\|Neoplasms\|Antibodies, Monoclonal	D006801\|D009369\|D000911
2	36753205	Revista de salud publica (Bogota, Colombia)	[Survival of young adults with oral squamous c...	10.15446/rsap.V21n5.76193	To analyze the profile and survival in young a...	por				2023	2	9	ppublish	Humans\|Male\|Young Adult\|Female\|Squamous Cell C...	D006801\|D008297\|D055815\|D005260\|D000077195\|D00...
3	36753200	Revista de salud publica (Bogota, Colombia)	[Screening mammography coverage and Decennial ...	10.15446/rsap.V21n5.81275	Estimate the coverage of screening mammograms ...	spa				2023	2	9	ppublish	Humans\|Female\|Mammography\|Breast Neoplasms\|Col...	D006801\|D005260\|D008327\|D001943\|D003105\|D00840...
4	36753199	Revista de salud publica (Bogota, Colombia)	[Exercise tolerance and fatigue in women survi...	10.15446/rsap.V21n5.81849	Breast cancer is the most common type of cance...	spa				2023	2	9	ppublish	Female\|Humans\|Breast Neoplasms\|Exercise Tolera...	D005260\|D006801\|D001943\|D017079\|D003430\|D01774...

	authorId	PMID	name
0	48c6e144-080f-4c1b-896f-ca0c57285856	37318346	Georgios F Hadjigeorgiou
1	5a92951d-637d-4c10-8950-632632f5b65e	37318346	Eleana M Strouthou
2	eabab1d4-bd9f-4bf4-b441-ac42618bb5a6	37318346	Dimitri Koulousakis
3	d2df269f-6ba6-4ce1-b762-c127464c2c00	37318346	Victor Patsouris
4	a80084a0-5e78-457d-b2e2-a8a5d2c81fe4	37318346	Frauke Neff
5	2fb1c66f-bb7b-4aff-b80f-1d2b4eabdb8e	37318346	Christianto B Lumenta
6	dfd80221-6582-40de-81a1-c14e3756251b	37318346	David B Schul
7	a671e821-beaa-48e8-ae6c-e9e5d6e4c735	36753623	Monica B Aranda
8	23a184a5-8a71-4578-9d3b-18e3384b31c4	36753623	Karina B Sabalette
9	4173a4e2-5211-42e7-a8ad-253eff820425	36753205	Marília de Matos Amorim

	authorId	affiliation
0	48c6e144-080f-4c1b-896f-ca0c57285856	Department of Neurosugery, Klinikum Bogenhause...
1	48c6e144-080f-4c1b-896f-ca0c57285856	School of Medicine, European University of Cyp...
2	5a92951d-637d-4c10-8950-632632f5b65e	School of Medicine, European University of Cyp...
3	eabab1d4-bd9f-4bf4-b441-ac42618bb5a6	Department of Neurosugery, Klinikum Bogenhause...
4	d2df269f-6ba6-4ce1-b762-c127464c2c00	Charite Medical Faculty Berlin, Berlin, Germany.
5	a80084a0-5e78-457d-b2e2-a8a5d2c81fe4	Department of Pathology, Klinikum Bogenhausen,...
6	2fb1c66f-bb7b-4aff-b80f-1d2b4eabdb8e	Department of Neurosugery, Klinikum Bogenhause...
7	dfd80221-6582-40de-81a1-c14e3756251b	Department of Neurosugery, Klinikum Bogenhause...
8	a671e821-beaa-48e8-ae6c-e9e5d6e4c735	MA: Contador Público. Ph. D. Gobierno y Cultur...
9	23a184a5-8a71-4578-9d3b-18e3384b31c4	KS: Lic. Biotecnología y Biología Molecular. U...

	PMID	JournalTitle	Title	doi	Abstract	Language	Year_A	Month_A	Day_A	Year_PM	...	Status	MeSH	MeSH_UI	Keyword	ArticlePublishDate	PubMedPublishDate	authorId	name	identifier	identifierSource
False	1.0	1.0	0.994038	0.886236	0.916267	1.0	0.797636	0.797636	0.797636	1.0	...	1.0	0.557208	0.557208	0.739556	1.0	1.0	0.999541	0.999541	0.070625	0.070625
True	0.0	0.0	0.005962	0.113764	0.083733	0.0	0.202364	0.202364	0.202364	0.0	...	0.0	0.442792	0.442792	0.260444	0.0	0.0	0.000459	0.000459	0.929375	0.929375

	PMID	JournalTitle	Title	doi	Abstract	Language	Year_A	Month_A	Day_A	Year_PM	...	Status	MeSH	MeSH_UI	Keyword	ArticlePublishDate	PubMedPublishDate	allText	tsne_x	tsne_y	cluster
5344	31939433	Journal of cancer research and therapeutics	Risk factors for 30-day unplanned reoperation ...	10.4103/jcrt.JCRT_137_19	The purpose of this study was to investigate t...	eng	NaN	NaN	NaN	2020	...	ppublish	Aged\|Comorbidity\|Female\|Humans\|Male\|Middle Age...	D000368\|D015897\|D005260\|D006801\|D008297\|D00887...	Complications\|pancreatoduodenectomy\|reoperatio...	0000-00	2020-01	Risk factors for 30-day unplanned reoperation ...	-0.213256	0.419736	40
7444	31898683	Journal of cancer research and therapeutics	Sister Mary Joseph's nodule in endometrial can...	10.4103/jcrt.JCRT_523_18	Sister Mary Joseph's nodule (SMJN) is an umbil...	eng	NaN	NaN	NaN	2020	...	ppublish	Aged, 80 and over\|Biopsy\|Combined Modality The...	D000369\|D001706\|D003131\|D016889\|D005260\|D00680...	Endometrial cancer\|robotic surgery\|umbilical m...	0000-00	2020-01	Sister Mary Joseph's nodule in endometrial can...	0.985034	3.328131	40
1731	32184851	Iranian journal of pharmaceutical research : IJPR	A Comparison of Cytotoxic Effects of	10.22037/ijpr.2019.111977.13462	Natural products isolated from plant sources a...	eng	NaN	NaN	NaN	2020	...	ppublish	NaN	NaN	Anticancer activity\|Chronic Lymphocytic Leukem...	0000-00	2020-03	A Comparison of Cytotoxic Effects of Natural p...	-2.783489	-5.879603	38
8719	31885567	Journal of oncology	Exploring the Role of Breast Density on Cancer...	10.1155/2019/1781762	Our aim was to assess the role of breast densi...	eng	2019.0	11.0	27.0	2019	...	epublish	NaN	NaN	NaN	2019-11	2019-12	Exploring the Role of Breast Density on Cancer...	-4.970211	1.333966	40
4521	31972942	The Science of the total environment	Spatially resolved distribution, sources and h...	10.1016/j.scitotenv.2019.135805	This work reports the first assessment of cont...	eng	2019.0	11.0	27.0	2020	...	ppublish	Adult\|Child\|China\|Cities\|Dust\|Environmental Mo...	D000328\|D002648\|D002681\|D002947\|D004391\|D00478...	Contamination indices\|Health risk assessment\|R...	2019-11	2020-01	Spatially resolved distribution, sources and h...	-1.170886	-0.506048	38

	num_articles	keywords
0	1	90,carcinoma,transcriptome,extended,accounting...
42	1	capacities,clinical,coding,variants,variant,ge...
43	1	arterial,tgf,rat,factor,hepatic,hypoxia,transf...
44	1	cancer,explain,various,meta,association,estima...
45	1	evaluations,tuberculous,attributed,negative,rr...
...	...	...
48	6	survival,metastatic,prostatic,mcrpc,bone,resis...
19	10	dissection,metastasis,nodes,axilla,biopsy,brea...
34	12	patients,inflammatory,prognostic,monocyte,plat...
38	3634	treatment,study,disease,care,diseases,patients...
40	6246	treatment,expression,carcinoma,patients,lung,c...

	0
0
False	0.920692
True	0.079308