- This is the python script use for get huge PubMed data from NCBI API.
Document is here. https://www.ncbi.nlm.nih.gov/books/NBK25501/ - You have to read below term before use this script.
https://www.nlm.nih.gov/databases/download/terms_and_conditions.html
What's Output?¶
Thinking about outputimage but no concrete image.
Import Library¶
In [1]:
# conda install numpy
# conda install matplotlib
# conda install plotly pandas
# conda install requests
# conda install -c conda-forge scikit-learn
# conda install -c conda-forge tqdm
# conda install -c conda-forge ipywidgets
# conda install -c conda-forge nbformat
# ipython kernel install --user --name=pyPubTator
# conda install -c conda-forge nodejs
# (base) conda install -c plotly plotly
# (base) conda install jupyterlab ipywidgets
# (base) conda install -c conda-forge nodejs
# (base) jupyter labextension install jupyterlab-plotly
## UserWarning: ValueError: The extension "jupyterlab-plotly"
## does not yet support the current version of JupyterLab.
## on 2023/10/09
import math
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import requests
import urllib.parse
import uuid
import xml.etree.ElementTree as ET
from collections import OrderedDict
from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
# from tqdm import tqdm_notebook as tqdm
from tqdm.notebook import tqdm
plotly.offline.init_notebook_mode(connected=False)
Parameters¶
In [2]:
iterCountTrial = 20 # just for trial
In [3]:
# const
BASEURL_INFO = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi'
BASEURL_SRCH = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
BASEURL_FTCH = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
In [4]:
# const (PubTator)
BASEURL_PUBTATOR = 'https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/pubtator'
BASEURL_BIOCXML = 'https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/biocxml'
BASEURL_BIOCJSON = 'https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/biocjson'
# https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/pubtator?pmids=28483577&concepts=gene
# https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/biocxml?pmcids=PMC6207735
# https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/biocjson?pmids=28483577,28483578,28483579
BATCH_NUM_PUBTATOR = 100
In [5]:
# parameters
TERM = 'cancer'
In [6]:
DATE_TYPE = 'pdat' # Type of date used to limit a search. The allowed values vary between Entrez databases, but common values are 'mdat' (modification date), 'pdat' (publication date) and 'edat' (Entrez date). Generally an Entrez database will have only two allowed values for datetype.
MIN_DATE = '2018/01/01' # yyyy/mm/dd
MAX_DATE = '2019/12/31' # yyyy/mm/dd
In [7]:
SOURCE_DB = 'pubmed'
SEP = '|'
BATCH_NUM = 1000
In [8]:
# seed
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
functions¶
In [9]:
'''
make query function
base_url: base_url
params: parameter dictionary
ex) {key1: value1, key2: value2}
'''
def mkquery(base_url, params):
base_url += '?'
for key, value in zip(params.keys(), params.values()):
base_url += '{key}={value}&'.format(key=key, value=value)
url = base_url[0:len(base_url) - 1]
# print('request url is: ' + url)
return url
def mkqueryprint(base_url, params, i):
base_url += '?'
for key, value in zip(params.keys(), params.values()):
base_url += '{key}={value}&'.format(key=key, value=value)
url = base_url[0:len(base_url) - 1]
print('request url ' + str(i) + ' is: ' + url)
return url
'''
getXmlFromURL
(mkquery wrapper)
base_url: base_url
params: parameter dictionary
ex) {key1: value1, key2: value2}
'''
def getXmlFromURL(base_url, params):
response = requests.get(mkquery(base_url, params))
return ET.fromstring(response.text)
'''
getTextFromNode
root: Xml root node
path: XPath
fill: fill na string
mode: 0 = text, 1 = attribute
attrib: attribute name
'''
def getTextFromNode(root, path, fill='', mode=0, attrib='attribute'):
if (root.find(path) == None):
return fill
else:
if mode == 0:
return root.find(path).text
if mode == 1:
return root.find(path).get(attrib)
# example
rootXml = getXmlFromURL(BASEURL_INFO, {'db': SOURCE_DB})
Statistic¶
In [10]:
# Info API
rootXml = getXmlFromURL(BASEURL_INFO, {'db': SOURCE_DB})
print(rootXml.find('DbInfo').find('Count').text)
print(rootXml.find('DbInfo').find('LastUpdate').text)
36298248 2023/10/08 20:23
Get Total Count¶
In [11]:
# get xml
rootXml = getXmlFromURL(BASEURL_SRCH, {
'db': SOURCE_DB,
'term': TERM,
'usehistory': 'y',
'datetype': DATE_TYPE,
'mindate': MIN_DATE,
'maxdate': MAX_DATE})
In [12]:
# get PMIDs
IdList = rootXml.find('IdList')
len(IdList)
PMIDs = []
PMIDsOneLine = ''
for child in IdList.iter():
# 特定要素の抽出
if child.tag == 'Id':
PMID = child.text
# print(PMID)
PMIDs.append(PMID)
# PMIDsOneLine += PMID + ","
PMIDsOneLine = ",".join(PMIDs)
PMIDsOneLine
#print('PMIDs: ', Idlist)
Out[12]:
'37318346,36753623,36753205,36753200,36753199,36753182,36452791,36075667,35779889,35779886,35779885,35779883,35590431,35483769,35462651,35242282,35100937,34980445,34980443,34980441'
In [13]:
# get querykey and webenv
Count = rootXml.find('Count').text
QueryKey = rootXml.find('QueryKey').text
WebEnv = urllib.parse.quote(rootXml.find('WebEnv').text)
print('total Count: ', Count)
print('QueryKey : ', QueryKey)
print('WebEnv : ', WebEnv)
total Count: 421386 QueryKey : 1 WebEnv : MCID_6523bbb9d8c193677b66260e
Get All details¶
In [14]:
articleDics = []
authorArticleDics = []
authorAffiliationDics = []
def pushData(rootXml):
for article in rootXml.iter('PubmedArticle'):
# get article info
articleDic = {
'PMID' : getTextFromNode(article, 'MedlineCitation/PMID', ''),
'JournalTitle' : getTextFromNode(article, 'MedlineCitation/Article/Journal/Title', ''),
'Title' : getTextFromNode(article, 'MedlineCitation/Article/ArticleTitle', ''),
'doi' : getTextFromNode(article, 'MedlineCitation/Article/ELocationID[@EIdType="doi"]', ''),
'Abstract' : getTextFromNode(article, 'MedlineCitation/Article/Abstract/AbstractText', ''),
# if you want to get data in flat(denormalized), uncomment below. but it's difficult to use for analytics.
# 'Authors' : SEP.join([author.find('ForeName').text + ' ' + author.find('LastName').text if author.find('CollectiveName') == None else author.find('CollectiveName').text for author in article.findall('MedlineCitation/Article/AuthorList/')]),
# 'AuthorIdentifiers' : SEP.join([getTextFromNode(author, 'Identifier', 'None') for author in article.findall('MedlineCitation/Article/AuthorList/')]),
# 'AuthorIdentifierSources' : SEP.join([getTextFromNode(author, 'Identifier', 'None', 1, 'Source') for author in article.findall('MedlineCitation/Article/AuthorList/')]),
'Language' : getTextFromNode(article, 'MedlineCitation/Article/Language', ''),
'Year_A' : getTextFromNode(article, 'MedlineCitation/Article/ArticleDate/Year', ''),
'Month_A' : getTextFromNode(article, 'MedlineCitation/Article/ArticleDate/Month', ''),
'Day_A' : getTextFromNode(article, 'MedlineCitation/Article/ArticleDate/Day', ''),
'Year_PM' : getTextFromNode(article, 'PubmedData/History/PubMedPubDate[@PubStatus="pubmed"]/Year', ''),
'Month_PM' : getTextFromNode(article, 'PubmedData/History/PubMedPubDate[@PubStatus="pubmed"]/Month', ''),
'Day_PM' : getTextFromNode(article, 'PubmedData/History/PubMedPubDate[@PubStatus="pubmed"]/Day', ''),
'Status' : getTextFromNode(article, './PubmedData/PublicationStatus', ''),
'MeSH' : SEP.join([getTextFromNode(mesh, 'DescriptorName') for mesh in article.findall('MedlineCitation/MeshHeadingList/')]),
'MeSH_UI' : SEP.join([getTextFromNode(mesh, 'DescriptorName', '', 1, 'UI') for mesh in article.findall('MedlineCitation/MeshHeadingList/')]),
'Keyword' : SEP.join([keyword.text if keyword.text != None else '' for keyword in article.findall('MedlineCitation/KeywordList/')])
}
articleDics.append(OrderedDict(articleDic))
if article.find('MedlineCitation/MeshHeadingList/MeshHeading/') != None:
tmp = article
# get author info
for author in article.findall('MedlineCitation/Article/AuthorList/'):
# publish author ID
# * It's only random id. not use for identify author. if you want to identify author, you can use identifier.
authorId = str(uuid.uuid4())
# author article
authorArticleDic = {
'authorId' : authorId,
'PMID' : getTextFromNode(article, 'MedlineCitation/PMID', ''),
'name' : getTextFromNode(author, 'ForeName') + ' ' + getTextFromNode(author,'LastName') if author.find('CollectiveName') == None else author.find('CollectiveName').text,
'identifier' : getTextFromNode(author, 'Identifier', '') ,
'identifierSource' : getTextFromNode(author, 'Identifier', '', 1, 'Source')
}
authorArticleDics.append(OrderedDict(authorArticleDic))
# author affiliation(author: affiliation = 1 : n)
if author.find('./AffiliationInfo') != None:
for affiliation in author.findall('./AffiliationInfo'):
authorAffiliationDic = {
'authorId' : authorId,
'affiliation' : getTextFromNode(affiliation, 'Affiliation', '') ,
}
authorAffiliationDics.append(OrderedDict(authorAffiliationDic))
In [15]:
annotation = 'MESH:D013629'
MeshId = annotation[5:]
#https://id.nlm.nih.gov/mesh/lookup/label?resource=D013629
MeshUrl = 'https://id.nlm.nih.gov/mesh/lookup/label?resource=' + MeshId
MeshResponse = requests.get(MeshUrl)
Mesh = MeshResponse.text[2:-2]
# https://nlmpubs.nlm.nih.gov/projects/mesh/MESH_FILES/xmlmesh/
MeshUrl = 'desc2023.xml'
MeshXml = ET.parse('desc2023.xml')
DicsMeshUidName = []
for MeshRecord in MeshXml.iter('DescriptorRecord'):
MeshUidName = {
'MeshUids' : getTextFromNode(MeshRecord, 'DescriptorUI', ''),
'MeshName' : getTextFromNode(MeshRecord, 'DescriptorName/String', '')
}
DicsMeshUidName.append(OrderedDict(MeshUidName))
# MeshUidName
df_Mesh = pd.DataFrame(DicsMeshUidName)
df_Mesh = df_Mesh.set_index('MeshUids')
df_Mesh.head(5)
Out[15]:
MeshName | |
---|---|
MeshUids | |
D000001 | Calcimycin |
D000002 | Temefos |
D000003 | Abattoirs |
D000004 | Abbreviations as Topic |
D000005 | Abdomen |
In [16]:
# import gzip
# HomoloGeneXml = ET.parse(gzip.open('homologene.xml.gz'))
# DicsHomoloGene = []
# root = HomoloGeneXml.getroot()
# for child in root:
# print(child.tag, child.attrib)
#for MeshRecord in HomoloGeneXml.iter('DescriptorRecord'):
# MeshUidName = {
# 'MeshUids' : getTextFromNode(MeshRecord, 'DescriptorUI', ''),
# 'MeshName' : getTextFromNode(MeshRecord, 'DescriptorName/String', '')
# }
# DicsMeshUidName.append(OrderedDict(MeshUidName))
# MeshUidName
#df_Mesh = pd.DataFrame(DicsMeshUidName)
#df_Mesh = df_Mesh.set_index('MeshUids')
#df_Mesh.head(5)
# (cmd) python -m gzip -d Homo_sapiens.gene_info.gz
# https://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia/
df_gene = pd.read_table('Homo_sapiens.gene_info.gz', index_col=0)
df_gene = df_gene.set_index('GeneID')
df_gene.head(5)
Out[16]:
Symbol | LocusTag | Synonyms | dbXrefs | chromosome | map_location | description | type_of_gene | Symbol_from_nomenclature_authority | Full_name_from_nomenclature_authority | Nomenclature_status | Other_designations | Modification_date | Feature_type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
GeneID | ||||||||||||||
1 | A1BG | - | A1B|ABG|GAB|HYST2477 | MIM:138670|HGNC:HGNC:5|Ensembl:ENSG00000121410... | 19 | 19q13.43 | alpha-1-B glycoprotein | protein-coding | A1BG | alpha-1-B glycoprotein | O | alpha-1B-glycoprotein|HEL-S-163pA|epididymis s... | 20230621 | - |
2 | A2M | - | A2MD|CPAMD5|FWP007|S863-7 | MIM:103950|HGNC:HGNC:7|Ensembl:ENSG00000175899... | 12 | 12p13.31 | alpha-2-macroglobulin | protein-coding | A2M | alpha-2-macroglobulin | O | alpha-2-macroglobulin|C3 and PZP-like alpha-2-... | 20230801 | - |
3 | A2MP1 | - | A2MP | HGNC:HGNC:8|Ensembl:ENSG00000291190|AllianceGe... | 12 | 12p13.31 | alpha-2-macroglobulin pseudogene 1 | pseudo | A2MP1 | alpha-2-macroglobulin pseudogene 1 | O | pregnancy-zone protein pseudogene | 20230329 | - |
9 | NAT1 | - | AAC1|MNAT|NAT-1|NATI | MIM:108345|HGNC:HGNC:7645|Ensembl:ENSG00000171... | 8 | 8p22 | N-acetyltransferase 1 | protein-coding | NAT1 | N-acetyltransferase 1 | O | arylamine N-acetyltransferase 1|N-acetyltransf... | 20230724 | - |
10 | NAT2 | - | AAC2|NAT-2|PNAT | MIM:612182|HGNC:HGNC:7646|Ensembl:ENSG00000156... | 8 | 8p22 | N-acetyltransferase 2 | protein-coding | NAT2 | N-acetyltransferase 2 | O | arylamine N-acetyltransferase 2|N-acetyltransf... | 20230806 | - |
In [17]:
# https://ftp.ncbi.nih.gov/pub/HomoloGene/current/build_inputs/taxid_taxname
df_Taxonomy = pd.read_table('taxid_taxname.txt', names=['TaxID', 'TaxName'])
df_Taxonomy = df_Taxonomy.set_index('TaxID')
df_Taxonomy.head(5)
Out[17]:
TaxName | |
---|---|
TaxID | |
10090 | Mus musculus |
10116 | Rattus norvegicus |
28985 | Kluyveromyces lactis |
318829 | Magnaporthe oryzae |
33169 | Eremothecium gossypii |
In [18]:
# https://ftp.expasy.org/databases/cellosaurus/
CellLineUrl = 'cellosaurus.xml'
CellLineXml = ET.parse('cellosaurus.xml')
DicsCellLine = []
# i = 0
# root = CellLineXml.getroot()
#
# for child in root:
# print(child.tag, child.attrib)
for cellline in CellLineXml.findall('cell-line-list/cell-line'):
DicCellLine = {
'Accession' : getTextFromNode(cellline, 'accession-list/accession[@type="primary"]', ''),
'CellName' : getTextFromNode(cellline, 'name-list/name[@type="identifier"]', '')
}
DicsCellLine.append(OrderedDict(DicCellLine))
df_CellLine = pd.DataFrame(DicsCellLine)
df_CellLine = df_CellLine.set_index('Accession')
df_CellLine.head(5)
Out[18]:
CellName | |
---|---|
Accession | |
CVCL_B0T9 | #132 PC3-1-SC-E8 |
CVCL_B0T8 | #132 PL12 SC-D1 |
CVCL_E548 | #15310-LN |
CVCL_KA96 | #16-15 |
CVCL_IW91 | #40a |
In [19]:
print(len(CellLineXml.findall('cell-line-list/cell-line')))
# print(i)
df_CellLine.head(5)
146062
Out[19]:
CellName | |
---|---|
Accession | |
CVCL_B0T9 | #132 PC3-1-SC-E8 |
CVCL_B0T8 | #132 PL12 SC-D1 |
CVCL_E548 | #15310-LN |
CVCL_KA96 | #16-15 |
CVCL_IW91 | #40a |
In [20]:
articleDicsPubTator = []
# authorArticleDics = []
# authorAffiliationDics = []
def getPubTatorGene(annotation, df_Gene):
if annotation == '':
gene = ''
else:
gene = df_Gene[annotation][Symbol]
return gene
def getMESH(annotation, df_Mesh):
if getTextFromNode(annotation, 'infon[@key="type"]', '') == 'MESH':
# MESH:D013629
MeshId = getTextFromNode(annotation, 'infon[@key="identifier"]', '')
MeshId = MeshId[5:]
Mesh = df_Mesh[MeshId][MeshName]
else:
Mesh = ''
return Mesh
def getCellLine(annotaion, df_CellLine):
if getTextFromNode(annotation, 'infon[@key="type"]', '') == 'CellLine':
CellLine = df_CellLine[getTextFromNode(annotation, 'infon[@key="identifier"]', '')][CellName]
else:
CellLine = ''
return CellLine
def getMutation(aanotation):
test
return Mutation
def getSpecies(annotation, df_Taxonomy):
if annotation == '':
Species = ''
else:
Species = df_Taxonomy[annotation][TaxName]
return Species
def getTextAnnotaion(annotation):
pidentifier = getTextFromNode(annotation, 'infon[@key="identifier"]', '')
ptype = getTextFromNode(annotation, 'infon[@key="type"]', '')
# ptype: Disease, Chemical, Gene, Species, CellLine
#if ptype ==
def pushDataPubTator(rootXml):
for document in rootXml.iter('document'):
# get article info
for annotation in document.findall('passage/annotation/infon'):
if annotation.attrib['key'] == 'identifier':
PubTatorIdentifierTmp = annotation.text
if annotation.attrib['key'] == 'type':
PubTatorTypeTmp = annotation.text
articleDicPubTator = {
'PMID' : getTextFromNode(document, 'id', ''),
'PubTatorIdentifier' : SEP.join([getTextFromNode(annotation, 'infon[@key="identifier"]', '')
for annotation in document.findall('passage/annotation')]),
'PubTatorType' : SEP.join([getTextFromNode(annotation, 'infon[@key="type"]', '')
for annotation in document.findall('passage/annotation')]),
'PubTatorHomoloGene' : SEP.join([getTextFromNode(annotation, 'infon[@key="NCBI Homologene"]', '')
for annotation in document.findall('passage/annotation')]),
'PubTatorLocation' : SEP.join([getTextFromNode(annotation, 'location', '', 1, 'offset')
for annotation in document.findall('passage/annotation')]),
'PubTatorText' : SEP.join([getTextFromNode(annotation, 'text', '')
for annotation in document.findall('passage/annotation')]),
}
articleDicsPubTator.append(OrderedDict(articleDicPubTator))
In [21]:
# ceil
iterCount = math.ceil(int(Count) / BATCH_NUM)
iterCount = iterCountTrial # just for trial
# get all data
for i in tqdm(range(iterCount)):
rootXml = getXmlFromURL(BASEURL_FTCH, {
'db': SOURCE_DB,
'query_key': QueryKey,
'WebEnv': WebEnv,
'retstart': i * BATCH_NUM,
'retmax': BATCH_NUM,
'retmode': 'xml'})
pushData(rootXml)
0%| | 0/20 [00:00<?, ?it/s]
In [22]:
# article
df_article = pd.DataFrame(articleDics)
# ceil
iterCount = math.ceil(int(Count) / BATCH_NUM)
iterCount = 100 # just for trial
printCount = 20
# get all data
for i in tqdm(range(iterCount)):
retstart = i * BATCH_NUM_PUBTATOR
retend = (i + 1) * BATCH_NUM_PUBTATOR
ret_df_article = df_article.iloc[retstart:retend]
ret_df_article_pmids = ret_df_article['PMID']
PMIDsOneLine = ",".join(ret_df_article_pmids)
rootXmlPubTator = getXmlFromURL(BASEURL_BIOCXML, {'pmids': PMIDsOneLine})
if i % printCount == 0:
mkqueryprint(BASEURL_BIOCXML, {'pmids': PMIDsOneLine}, i)
pushDataPubTator(rootXmlPubTator)
0%| | 0/100 [00:00<?, ?it/s]
request url 0 is: https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/biocxml?pmids=37318346,36753623,36753205,36753200,36753199,36753182,36452791,36075667,35779889,35779886,35779885,35779883,35590431,35483769,35462651,35242282,35100937,34980445,34980443,34980441,34980438,34754911,34754909,34754903,34590506,34539049,34539046,34493369,34426479,34414848,34414847,34306915,34243914,34243910,34239389,34190022,34177378,34175032,34175030,34096432,34082642,34059235,34027418,34026405,34026404,34024926,33998520,33998482,33998468,33994729,33994726,33994724,33994723,33969772,33927493,33911342,33907724,33899924,33884377,33870087,33869785,33867720,33867704,33854408,33838042,33816203,33790488,33790481,33777636,33776557,33747610,33737982,33730117,33730115,33726891,33692833,33692603,33688113,33619573,33616513,33608435,33603623,33584050,33584040,33569282,33569278,33563372,33518805,33505620,33505137,33505135,33505127,33501156,33487634,33469503,33464845,33463296,33463284,33463270,33463251 request url 20 is: https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/biocxml?pmids=32156962,32156961,32156960,32156959,32156958,32156957,32156956,32156955,32156954,32156953,32156952,32156951,32156950,32156949,32156948,32156947,32156946,32156945,32156944,32156943,32156942,32156941,32156940,32156939,32156938,32156937,32156936,32156935,32156934,32156933,32156932,32156931,32156930,32156929,32156928,32156927,32156926,32156925,32156924,32156923,32156922,32156921,32156920,32156919,32156918,32156917,32156916,32156915,32156914,32156913,32156912,32156911,32156910,32156909,32156908,32156907,32156906,32156905,32156904,32156903,32156902,32156901,32156900,32156899,32156898,32156897,32156896,32156895,32156894,32156893,32156892,32156891,32156890,32156889,32156888,32156887,32156886,32156885,32156884,32156883,32156882,32156881,32156880,32156879,32156878,32156877,32156876,32156875,32156874,32156873,32156872,32156871,32156870,32156869,32156868,32156867,32156866,32156865,32156864,32156863 request url 40 is: https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/biocxml?pmids=32001161,32001154,32001147,32001012,32000916,32000914,32000913,32000912,32000911,32000910,32000909,32000908,32000907,32000906,32000905,32000904,32000903,32000902,32000901,32000900,32000899,32000898,32000897,32000896,32000895,32000894,32000893,32000892,32000891,32000481,32000329,32000075,32000032,31999669,31999590,31999584,31999577,31999576,31999573,31999571,31999555,31999544,31998864,31998827,31998820,31998818,31998817,31998816,31998813,31998812,31998772,31998754,31998749,31998733,31998724,31998722,31998721,31998718,31998707,31998704,31998654,31998653,31998652,31998651,31998650,31998649,31998648,31998647,31998646,31998645,31998644,31998643,31998642,31998641,31998640,31998639,31998638,31998637,31998636,31998635,31998634,31998633,31998631,31998630,31998629,31998627,31998622,31998621,31998620,31998618,31998543,31998542,31998541,31998538,31998537,31998495,31998494,31998486,31998480,31998477 request url 60 is: https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/biocxml?pmids=31926773,31926772,31926768,31926762,31926733,31926732,31926730,31926722,31926715,31926713,31926701,31926698,31926696,31926682,31926681,31926680,31926667,31926648,31926628,31926624,31926622,31926619,31926609,31926587,31926585,31926584,31926582,31926580,31926576,31926575,31926566,31926501,31926493,31926475,31926469,31926441,31926403,31926384,31926377,31924585,31924584,31924574,31924526,31924513,31924512,31924481,31924480,31924432,31924412,31924365,31924363,31924355,31924335,31924315,31924055,31924044,31924042,31924041,31924033,31924031,31924029,31924026,31924025,31924023,31924012,31924010,31924007,31923999,31923997,31923988,31923987,31923986,31923975,31923965,31923958,31923957,31923953,31923949,31923941,31923940,31923934,31923860,31923859,31923858,31923847,31923822,31923791,31923786,31923762,31923747,31923745,31923744,31923732,31923687,31923684,31923676,31922704,31922703,31922508,31922503 request url 80 is: https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/biocxml?pmids=31891584,31891574,31891569,31891547,31891546,31891413,31891326,31891322,31891321,31891293,31891291,31891283,31891275,31891274,31891260,31891240,31891239,31891232,31891230,31891228,31891216,31891149,31891146,31891138,31891135,31891131,31891130,31891129,31891128,31891127,31891126,31891112,31891105,31891090,31891085,31891063,31891061,31891054,31891050,31891019,31891018,31891016,31890904,31890897,31890889,31890886,31890881,31890880,31890879,31890878,31890872,31890869,31890868,31890866,31890865,31890834,31890832,31890830,31890813,31890758,31890756,31890746,31890734,31890713,31890710,31890692,31890691,31890670,31890648,31890647,31890638,31890636,31890583,31890578,31890559,31890556,31890475,31890470,31890469,31890468,31890457,31890445,31890437,31890434,31890419,31890401,31890398,31890396,31890391,31890390,31890384,31890368,31890367,31890362,31890344,31890337,31890322,31890303,31890299,31890285
In [23]:
# documents
df_document = pd.DataFrame(articleDicsPubTator)
# df_document.iloc[0,2]
df_document.head(5)
Out[23]:
PMID | PubTatorIdentifier | PubTatorType | PubTatorHomoloGene | PubTatorLocation | PubTatorText | |
---|---|---|---|---|---|---|
0 | 33463251 | MESH:D008545|MESH:D009369|MESH:D008063|CVCL_01... | Disease|Disease|Chemical|CellLine|Disease|Spec... | ||||||||||||||||||||||||||||||| | 107|177|435|515|522|534|603|616|629|664|704|82... | Malignant Melanoma|cancer|lipoic acid|B16F10|m... |
1 | 33463270 | 7474|1499|MESH:D001943|MESH:D009369|MESH:D0019... | Gene|Gene|Disease|Disease|Disease|Gene|Gene|Ge... | 20720|1434||||20720|1434|20720|1434||20720|143... | 0|4|45|138|154|281|285|407|411|435|510|514|573... | Wnt|beta-Catenin|Osteogenesis for Breast Cance... |
2 | 33463284 | MESH:D004317|-|MESH:D008545|MESH:D004317|MESH:... | Chemical|Chemical|Disease|Chemical|Chemical|Di... | ||||||||||||||||||||||||||||| | 10|29|149|165|178|233|270|401|688|747|758|768|... | Doxorubicin|PEGylated Poly(Lactide-co-glycolid... |
3 | 33463296 | MESH:D001943|MESH:D001943|MESH:D009369|MESH:D0... | Disease|Disease|Disease|Disease|Disease|Specie... | |||||||| | 20|156|243|391|560|587|830|934|1236 | Antibody Drug Conjugates Targeting the Breast ... |
4 | 33464845 | MESH:D008670|MESH:D008670|84148|84148|MESH:D00... | Chemical|Chemical|Gene|Gene|Disease|Gene|Disea... | ||41676|41676||41676|||| | 0|384|624|755|795|866|976|994|1196|1318 | Metal|metal|MOF|MOF|tumors|MOF|breast cancer|l... |
In [24]:
document = rootXmlPubTator.find('document')
for result in document.findall('passage/infon'):
print(result.attrib)
if result.attrib['key'] == 'journal':
print(result.text)
{'key': 'journal'} Dig Liver Dis. 2020 Feb;52(2):234. doi: 10.1016/j.dld.2019.12.003. Epub 2019 Dec {'key': 'year'} {'key': 'type'} {'key': 'authors'} {'key': 'type'}
Store¶
In [25]:
# article
df_article = pd.DataFrame(articleDics)
df_article.head(5)
Out[25]:
PMID | JournalTitle | Title | doi | Abstract | Language | Year_A | Month_A | Day_A | Year_PM | Month_PM | Day_PM | Status | MeSH | MeSH_UI | Keyword | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 37318346 | British journal of neurosurgery | Olfactory schwannomas - an enigmatic clinical ... | 10.1080/02688697.2019.1661968 | Olfactory Schwannomas (OS) are a rare, benign ... | eng | 2019 | 09 | 06 | 2023 | 6 | 15 | ppublish | Female|Humans|Aged|Neurilemmoma|Diagnosis, Dif... | D005260|D006801|D000368|D009442|D003937|D012903 | olfactory|schwannoma|skull base |
1 | 36753623 | Revista de salud publica (Bogota, Colombia) | [Biological cancer therapies: an approach towa... | 10.15446/rsap.V21n4.73686 | Monoclonal antibodies are a useful tool for la... | spa | 2023 | 2 | 9 | ppublish | Humans|Neoplasms|Antibodies, Monoclonal | D006801|D009369|D000911 | ||||
2 | 36753205 | Revista de salud publica (Bogota, Colombia) | [Survival of young adults with oral squamous c... | 10.15446/rsap.V21n5.76193 | To analyze the profile and survival in young a... | por | 2023 | 2 | 9 | ppublish | Humans|Male|Young Adult|Female|Squamous Cell C... | D006801|D008297|D055815|D005260|D000077195|D00... | ||||
3 | 36753200 | Revista de salud publica (Bogota, Colombia) | [Screening mammography coverage and Decennial ... | 10.15446/rsap.V21n5.81275 | Estimate the coverage of screening mammograms ... | spa | 2023 | 2 | 9 | ppublish | Humans|Female|Mammography|Breast Neoplasms|Col... | D006801|D005260|D008327|D001943|D003105|D00840... | ||||
4 | 36753199 | Revista de salud publica (Bogota, Colombia) | [Exercise tolerance and fatigue in women survi... | 10.15446/rsap.V21n5.81849 | Breast cancer is the most common type of cance... | spa | 2023 | 2 | 9 | ppublish | Female|Humans|Breast Neoplasms|Exercise Tolera... | D005260|D006801|D001943|D017079|D003430|D01774... |
In [26]:
# author article
df_author = pd.DataFrame(authorArticleDics)
df_author.head(10)
Out[26]:
authorId | PMID | name | identifier | identifierSource | |
---|---|---|---|---|---|
0 | 48c6e144-080f-4c1b-896f-ca0c57285856 | 37318346 | Georgios F Hadjigeorgiou | ||
1 | 5a92951d-637d-4c10-8950-632632f5b65e | 37318346 | Eleana M Strouthou | ||
2 | eabab1d4-bd9f-4bf4-b441-ac42618bb5a6 | 37318346 | Dimitri Koulousakis | ||
3 | d2df269f-6ba6-4ce1-b762-c127464c2c00 | 37318346 | Victor Patsouris | ||
4 | a80084a0-5e78-457d-b2e2-a8a5d2c81fe4 | 37318346 | Frauke Neff | ||
5 | 2fb1c66f-bb7b-4aff-b80f-1d2b4eabdb8e | 37318346 | Christianto B Lumenta | ||
6 | dfd80221-6582-40de-81a1-c14e3756251b | 37318346 | David B Schul | ||
7 | a671e821-beaa-48e8-ae6c-e9e5d6e4c735 | 36753623 | Monica B Aranda | ||
8 | 23a184a5-8a71-4578-9d3b-18e3384b31c4 | 36753623 | Karina B Sabalette | ||
9 | 4173a4e2-5211-42e7-a8ad-253eff820425 | 36753205 | Marília de Matos Amorim |
In [27]:
# author affiliation
df_affiliation = pd.DataFrame(authorAffiliationDics)
df_affiliation.head(10)
Out[27]:
authorId | affiliation | |
---|---|---|
0 | 48c6e144-080f-4c1b-896f-ca0c57285856 | Department of Neurosugery, Klinikum Bogenhause... |
1 | 48c6e144-080f-4c1b-896f-ca0c57285856 | School of Medicine, European University of Cyp... |
2 | 5a92951d-637d-4c10-8950-632632f5b65e | School of Medicine, European University of Cyp... |
3 | eabab1d4-bd9f-4bf4-b441-ac42618bb5a6 | Department of Neurosugery, Klinikum Bogenhause... |
4 | d2df269f-6ba6-4ce1-b762-c127464c2c00 | Charite Medical Faculty Berlin, Berlin, Germany. |
5 | a80084a0-5e78-457d-b2e2-a8a5d2c81fe4 | Department of Pathology, Klinikum Bogenhausen,... |
6 | 2fb1c66f-bb7b-4aff-b80f-1d2b4eabdb8e | Department of Neurosugery, Klinikum Bogenhause... |
7 | dfd80221-6582-40de-81a1-c14e3756251b | Department of Neurosugery, Klinikum Bogenhause... |
8 | a671e821-beaa-48e8-ae6c-e9e5d6e4c735 | MA: Contador Público. Ph. D. Gobierno y Cultur... |
9 | 23a184a5-8a71-4578-9d3b-18e3384b31c4 | KS: Lic. Biotecnología y Biología Molecular. U... |
In [28]:
df_article.to_csv('pubmed_article.csv')
In [29]:
df_author.to_csv('pubmed_author.csv')
In [30]:
df_affiliation.to_csv('pubmed_affiliation.csv')
EDA¶
Reload the CSV so that you can resume the session at any time.
In [31]:
# reload
df_article = pd.read_csv('pubmed_article.csv', index_col=0)
df_author = pd.read_csv('pubmed_author.csv', index_col=0)
df_affiliation = pd.read_csv('pubmed_affiliation.csv', index_col=0)
In [32]:
print('number of article: ', len(df_article))
number of article: 9999
categorical feature¶
In [33]:
# data_canada = px.data.gapminder().query("country == 'Canada'")
# fig = px.bar(data_canada, x='year', y='pop')
# fig.show()
In [50]:
# Categorical Feature
for catCol in ['Language', 'Status']:
df_article_count = df_article[catCol].value_counts()
#df_article.head()
fig = px.bar(df_article_count, x=df_article_count.index, y=df_article_count, height=400, width=800, title="{}'s unique count".format(catCol), labels={catCol: 'Count', 'index': catCol})
fig.show()
publish distribution¶
In [35]:
# concat
# df_article['ArticlePublishDate'] = df_article['Year_A'].fillna(0).astype(int).astype(str).str.zfill(4) + '-' + df_article['Month_A'].fillna(0).astype(int).astype(str).str.zfill(2) + '-' + df_article['Day_A'].fillna(0).astype(int).astype(str).str.zfill(2)
df_article['ArticlePublishDate'] = df_article['Year_A'].fillna(0).astype(int).astype(str).str.zfill(4) + '-' + df_article['Month_A'].fillna(0).astype(int).astype(str).str.zfill(2)
df_article['PubMedPublishDate'] = df_article['Year_PM'].fillna(0).astype(int).astype(str).str.zfill(4) + '-' + df_article['Month_PM'].fillna(0).astype(int).astype(str).str.zfill(2)
In [51]:
for catCol in ['ArticlePublishDate', 'PubMedPublishDate']:
df_article_article = df_article[df_article[catCol] != '0000-00'][catCol].value_counts().sort_index()
#fig = px.line(df_article_article, x=df_article_article.index, y=catCol, height=600, width=1200, title="{}'s distribution".format(catCol), labels={catCol: 'Publish Count', 'index': catCol})
fig = px.line(df_article_article, x=df_article_article.index, y=df_article_article, height=600, width=1200, title="{}'s distribution".format(catCol))
fig.show()
what content 'not' english article includes?¶
In [37]:
df_article[df_article['Language'] != 'eng'].head(5)
Out[37]:
PMID | JournalTitle | Title | doi | Abstract | Language | Year_A | Month_A | Day_A | Year_PM | Month_PM | Day_PM | Status | MeSH | MeSH_UI | Keyword | ArticlePublishDate | PubMedPublishDate | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 36753623 | Revista de salud publica (Bogota, Colombia) | [Biological cancer therapies: an approach towa... | 10.15446/rsap.V21n4.73686 | Monoclonal antibodies are a useful tool for la... | spa | NaN | NaN | NaN | 2023 | 2 | 9 | ppublish | Humans|Neoplasms|Antibodies, Monoclonal | D006801|D009369|D000911 | NaN | 0000-00 | 2023-02 |
2 | 36753205 | Revista de salud publica (Bogota, Colombia) | [Survival of young adults with oral squamous c... | 10.15446/rsap.V21n5.76193 | To analyze the profile and survival in young a... | por | NaN | NaN | NaN | 2023 | 2 | 9 | ppublish | Humans|Male|Young Adult|Female|Squamous Cell C... | D006801|D008297|D055815|D005260|D000077195|D00... | NaN | 0000-00 | 2023-02 |
3 | 36753200 | Revista de salud publica (Bogota, Colombia) | [Screening mammography coverage and Decennial ... | 10.15446/rsap.V21n5.81275 | Estimate the coverage of screening mammograms ... | spa | NaN | NaN | NaN | 2023 | 2 | 9 | ppublish | Humans|Female|Mammography|Breast Neoplasms|Col... | D006801|D005260|D008327|D001943|D003105|D00840... | NaN | 0000-00 | 2023-02 |
4 | 36753199 | Revista de salud publica (Bogota, Colombia) | [Exercise tolerance and fatigue in women survi... | 10.15446/rsap.V21n5.81849 | Breast cancer is the most common type of cance... | spa | NaN | NaN | NaN | 2023 | 2 | 9 | ppublish | Female|Humans|Breast Neoplasms|Exercise Tolera... | D005260|D006801|D001943|D017079|D003430|D01774... | NaN | 0000-00 | 2023-02 |
5 | 36753182 | Revista de salud publica (Bogota, Colombia) | [Sociodemographic and clinical characterizatio... | 10.15446/rsap.V21n3.70678 | To characterize socioeconomic, demographic, he... | por | NaN | NaN | NaN | 2023 | 2 | 9 | ppublish | Humans|Male|Prospective Studies|Prostatic Neop... | D006801|D008297|D011446|D011471|D007182|D00193... | NaN | 0000-00 | 2023-02 |
How many nan do articles include?¶
- PMID is require column.
- There are scarcely user identify ID(ORCID)
In [38]:
# False: not nan, True: is nan, values are percent
pd.merge(df_article, df_author, on='PMID', how='left').isnull().apply(lambda col: col.value_counts(), axis=0).fillna(0).astype(float).apply(lambda col: col/col.sum(), axis=0)
Out[38]:
PMID | JournalTitle | Title | doi | Abstract | Language | Year_A | Month_A | Day_A | Year_PM | ... | Status | MeSH | MeSH_UI | Keyword | ArticlePublishDate | PubMedPublishDate | authorId | name | identifier | identifierSource | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
False | 1.0 | 1.0 | 0.994038 | 0.886236 | 0.916267 | 1.0 | 0.797636 | 0.797636 | 0.797636 | 1.0 | ... | 1.0 | 0.557208 | 0.557208 | 0.739556 | 1.0 | 1.0 | 0.999541 | 0.999541 | 0.070625 | 0.070625 |
True | 0.0 | 0.0 | 0.005962 | 0.113764 | 0.083733 | 0.0 | 0.202364 | 0.202364 | 0.202364 | 0.0 | ... | 0.0 | 0.442792 | 0.442792 | 0.260444 | 0.0 | 0.0 | 0.000459 | 0.000459 | 0.929375 | 0.929375 |
2 rows × 22 columns
How many authors in each articles?¶
In [39]:
for catCol in ['authorId']:
df_author_group = df_author.groupby('PMID', as_index=False).count()[catCol].value_counts().reset_index()
fig = px.bar(df_author_group, x=df_author_group.index, y=catCol, height=600, width=1600, title="How many authors in each articles?".format(catCol), labels={catCol: 'Count', 'index': catCol})
fig.show()
Is MeSH or Keyword NaN?¶
In [40]:
# False: not nan, True: is nan, values are percent
pd.DataFrame(pd.DataFrame(df_article['MeSH'].fillna('').astype(str) + df_article['Keyword'].fillna('').astype(str))[0] == '').apply(lambda col: col.value_counts(), axis=0).fillna(0).astype(float).apply(lambda col: col/col.sum(), axis=0)
Out[40]:
0 | |
---|---|
0 | |
False | 0.920692 |
True | 0.079308 |
Clustering¶
In [41]:
df_article['allText'] = df_article['Title'].fillna('') + df_article['Abstract'].fillna('') + df_article['MeSH'].fillna('') + df_article['Keyword'].fillna('')
In [42]:
tfidf = TfidfVectorizer(
min_df = 5,
max_df = 0.95,
max_features = 8000,
stop_words = 'english'
)
tfidf.fit(df_article.allText)
text = tfidf.fit_transform(df_article.allText)
In [43]:
"""
Finding Optimal Clusters¶
https://www.kaggle.com/jbencina/clustering-documents-with-tfidf-and-kmeans
"""
def find_optimal_clusters(data, max_k):
iters = range(10, max_k+1, 10)
sse = []
for k in iters:
sse.append(MiniBatchKMeans(n_clusters=k, n_init=10, init_size=1024, batch_size=2048, random_state=20).fit(data).inertia_)
print('Fit {} clusters'.format(k))
f, ax = plt.subplots(1, 1)
ax.plot(iters, sse, marker='o')
ax.set_xlabel('Cluster Centers')
ax.set_xticks(iters)
ax.set_xticklabels(iters)
ax.set_ylabel('SSE')
ax.set_title('SSE by Cluster Center Plot')
find_optimal_clusters(text, 100)
Fit 10 clusters Fit 20 clusters Fit 30 clusters Fit 40 clusters Fit 50 clusters Fit 60 clusters Fit 70 clusters Fit 80 clusters Fit 90 clusters Fit 100 clusters
In [44]:
# Clustring
clusters = MiniBatchKMeans(n_clusters=80, n_init=10, init_size=1024, batch_size=2048, random_state=20).fit_predict(text)
In [45]:
print(text.shape)
print(len(clusters))
(9999, 8000) 9999
In [46]:
# random sampling size
RANDOM_SAMPLING_SIZE = 3000
# random sampling
random_idx = np.random.choice(range(text.shape[0]), size=RANDOM_SAMPLING_SIZE, replace=False)
# t-sne(with random sampling pca)
# tsne = TSNE(random_state=RANDOM_STATE).fit_transform(PCA(n_components=50, random_state=RANDOM_STATE).fit_transform(text[random_idx,:].todense()))
tsne = TSNE(random_state=RANDOM_STATE).fit_transform(np.asarray(text[random_idx,:].todense()))
# random sampling df
df_article_tsne = df_article.iloc[random_idx]
# horizontal concat
df_article_tsne = df_article.iloc[random_idx].copy()
df_article_tsne['tsne_x'] = tsne[:, 0]
df_article_tsne['tsne_y'] = tsne[:, 1]
df_article_tsne['cluster'] = clusters[random_idx]
df_article_tsne.head()
Out[46]:
PMID | JournalTitle | Title | doi | Abstract | Language | Year_A | Month_A | Day_A | Year_PM | ... | Status | MeSH | MeSH_UI | Keyword | ArticlePublishDate | PubMedPublishDate | allText | tsne_x | tsne_y | cluster | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
5344 | 31939433 | Journal of cancer research and therapeutics | Risk factors for 30-day unplanned reoperation ... | 10.4103/jcrt.JCRT_137_19 | The purpose of this study was to investigate t... | eng | NaN | NaN | NaN | 2020 | ... | ppublish | Aged|Comorbidity|Female|Humans|Male|Middle Age... | D000368|D015897|D005260|D006801|D008297|D00887... | Complications|pancreatoduodenectomy|reoperatio... | 0000-00 | 2020-01 | Risk factors for 30-day unplanned reoperation ... | -0.213256 | 0.419736 | 40 |
7444 | 31898683 | Journal of cancer research and therapeutics | Sister Mary Joseph's nodule in endometrial can... | 10.4103/jcrt.JCRT_523_18 | Sister Mary Joseph's nodule (SMJN) is an umbil... | eng | NaN | NaN | NaN | 2020 | ... | ppublish | Aged, 80 and over|Biopsy|Combined Modality The... | D000369|D001706|D003131|D016889|D005260|D00680... | Endometrial cancer|robotic surgery|umbilical m... | 0000-00 | 2020-01 | Sister Mary Joseph's nodule in endometrial can... | 0.985034 | 3.328131 | 40 |
1731 | 32184851 | Iranian journal of pharmaceutical research : IJPR | A Comparison of Cytotoxic Effects of | 10.22037/ijpr.2019.111977.13462 | Natural products isolated from plant sources a... | eng | NaN | NaN | NaN | 2020 | ... | ppublish | NaN | NaN | Anticancer activity|Chronic Lymphocytic Leukem... | 0000-00 | 2020-03 | A Comparison of Cytotoxic Effects of Natural p... | -2.783489 | -5.879603 | 38 |
8719 | 31885567 | Journal of oncology | Exploring the Role of Breast Density on Cancer... | 10.1155/2019/1781762 | Our aim was to assess the role of breast densi... | eng | 2019.0 | 11.0 | 27.0 | 2019 | ... | epublish | NaN | NaN | NaN | 2019-11 | 2019-12 | Exploring the Role of Breast Density on Cancer... | -4.970211 | 1.333966 | 40 |
4521 | 31972942 | The Science of the total environment | Spatially resolved distribution, sources and h... | 10.1016/j.scitotenv.2019.135805 | This work reports the first assessment of cont... | eng | 2019.0 | 11.0 | 27.0 | 2020 | ... | ppublish | Adult|Child|China|Cities|Dust|Environmental Mo... | D000328|D002648|D002681|D002947|D004391|D00478... | Contamination indices|Health risk assessment|R... | 2019-11 | 2020-01 | Spatially resolved distribution, sources and h... | -1.170886 | -0.506048 | 38 |
5 rows × 22 columns
In [47]:
# scatter visualize
fig = px.scatter(
df_article_tsne,
x="tsne_x",
y="tsne_y",
color="cluster",
height=1200,
# size='petal_length',
color_continuous_scale=px.colors.sequential.Plasma,
hover_data=['Title', 'PMID']
)
fig.update_layout(
showlegend=False
)
fig.show()
In [48]:
def get_top_keywords(data, clusters, labels, n_terms):
df = pd.DataFrame(data.todense()).groupby(clusters).mean()
clusterTexts = []
for i,r in df.iterrows():
top_keywords = ','.join([labels[t] for t in np.argsort(r)[-n_terms:]])
clusterTexts.append(top_keywords)
# print('\nCluster {}'.format(i))
# print(top_keywords)
return clusterTexts
clusterTexts = get_top_keywords(text, clusters, tfidf.get_feature_names_out(), 10)
find seeds¶
In [49]:
df_article['cluster'] = clusters
df_cluster = pd.DataFrame(df_article.groupby('cluster', as_index=False).count().sort_values('cluster')['PMID'].values, columns=['num_articles'])
df_cluster['keywords'] = clusterTexts
df_cluster.sort_values('num_articles')
Out[49]:
num_articles | keywords | |
---|---|---|
0 | 1 | 90,carcinoma,transcriptome,extended,accounting... |
42 | 1 | capacities,clinical,coding,variants,variant,ge... |
43 | 1 | arterial,tgf,rat,factor,hepatic,hypoxia,transf... |
44 | 1 | cancer,explain,various,meta,association,estima... |
45 | 1 | evaluations,tuberculous,attributed,negative,rr... |
... | ... | ... |
48 | 6 | survival,metastatic,prostatic,mcrpc,bone,resis... |
19 | 10 | dissection,metastasis,nodes,axilla,biopsy,brea... |
34 | 12 | patients,inflammatory,prognostic,monocyte,plat... |
38 | 3634 | treatment,study,disease,care,diseases,patients... |
40 | 6246 | treatment,expression,carcinoma,patients,lung,c... |
80 rows × 2 columns