# 使用 Entrez API 进行 PubMed 文献信息提取
PubMed 是一个常用的文献数据库,提供了大量生物医学领域的文献。当需要整理大量文献时,手动搜索可能会非常耗时。
我们可以使用 Entrez API 自动化这一过程。本文将介绍如何使用 Python 和 Entrez API 提取 PubMed 文献信息,并将结果保存为 CSV 文件。
# 环境准备
首先,我们需要安装 Biopython,这是一个用于生物信息学的强大工具包,包含了访问 Entrez API 的模块。
pip install biopython
此外,还需要一些其他的 Python 库:
import pandas as pd | |
from Bio import Entrez, Medline | |
import time | |
from urllib.error import HTTPError | |
import re | |
import ssl | |
import urllib3 |
为了避免 SSL 证书验证错误,我们可以禁用证书验证和警告(注意:这会使整个进程内的 HTTPS 连接都不再校验证书,存在安全风险,仅建议在确实遇到证书错误时临时使用):
# Silence urllib3's InsecureRequestWarning and turn off HTTPS certificate verification.
# NOTE(review): the assignment below replaces ssl's default HTTPS context factory for the
# whole process, so later HTTPS connections that rely on that default are not verified
# either. Keep it only as a workaround for certificate errors.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
ssl._create_default_https_context = ssl._create_unverified_context
接下来,设置提交给 NCBI 的联系邮箱(请替换为你自己的有效邮箱;仅设置邮箱无需注册账户,注册 NCBI 账户并申请 API key 可以提高请求频率上限):
# E-mail address handed to Bio.Entrez; the value here is a placeholder, replace it with your own.
Entrez.email = "your_email@example.com"
# 提取日期的正则表达式模式
为了处理日期信息,我们定义了一个正则表达式模式:
# Matches "<4-digit year> <English month abbreviation> <1-2 digit day>", e.g. "2020 Jan 15".
date_pattern = re.compile(r'\b\d{4} \b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\b \d{1,2}')
# 定义函数
# fetch_pubmed_data 函数
这个函数从 PubMed 获取文献信息并保存到 CSV 文件中。
由于 Entrez API 最多只能获取检索到的前 9999 篇(其实在网页上我们也只能看到检索到的文献的前 10000 篇),若使用检索词检索到的文章数量少于 10000 篇,则直接获取文献 PMID 列表;否则,按年份分段进行检索以获取所有文献的 PMID 列表。注意:代码中分段检索的年份范围由 start_year 和 end_year 指定(示例中为 2004–2024),发表年份不在此范围内的文献不会被检索到,请根据需要调整。
这种方法可以应付大多数的情况,即一年发文不超过 9999 篇的情况。但如果你的关键词检索结果显示有些年份的发文数量仍然大于 9999 篇,你可以自行修改此处的逻辑,如按照月份来重构检索词。
def fetch_pubmed_data(term, fname, start_year=2004, end_year=2024):
    """Fetch PubMed records matching a query and save them to a CSV file.

    The hit count is requested first. With fewer than 10000 hits all PMIDs
    are retrieved by a single ESearch call; otherwise the query is repeated
    once per publication year from ``start_year`` to ``end_year`` and the
    PMIDs are merged. Records are then downloaded with EFetch in batches of
    50, each record is combined with the link data returned by
    ``fetch_citation_data``, and the accumulated table is rewritten to
    ``fname`` after every batch so partial results survive an interruption.

    Args:
        term: PubMed search expression.
        fname: Path of the CSV file to write.
        start_year: First publication year queried in year-by-year mode.
        end_year: Last publication year (inclusive) queried in that mode.
            Records published outside this range are not retrieved.

    Raises:
        HTTPError: If EFetch fails with a status other than 429 or 500.
            ESearch calls are not retried, so their errors propagate as well.
    """

    def _search(query, retmax):
        # Run one ESearch request; the handle is closed even if parsing fails.
        handle = Entrez.esearch(db="pubmed", term=query, retmax=retmax)
        try:
            return Entrez.read(handle)
        finally:
            handle.close()

    total = int(_search(term, 0)["Count"])
    time.sleep(5)

    if total < 10000:
        pmid_list = list(_search(term, total)["IdList"])
    else:
        # Too many hits for one request: split the query by publication year.
        pmid_list = []
        for year in range(start_year, end_year + 1):
            subterm = term + f' AND (("{year}"[Date - Publication] : "{year}"[Date - Publication]))'
            year_count = int(_search(subterm, 0)["Count"])
            if year_count >= 10000:
                print(f"esearch: {year} has {year_count} hits, at or above the 10000 limit "
                      f"this script assumes; results for that year may be incomplete")
            if year_count:
                pmid_list += _search(subterm, year_count)["IdList"]
            time.sleep(5)

    # Drop duplicates and order from the highest PMID to the lowest.
    pmid_list = sorted(set(pmid_list), key=int, reverse=True)

    data = []
    for i in range(0, len(pmid_list), 50):
        batch = pmid_list[i:i + 50]
        # Reset per batch so a failed download can never reuse the previous batch's records.
        records_efetch = []
        for _ in range(5):
            try:
                handle_efetch = Entrez.efetch(db="pubmed", id=batch, rettype='medline', retmode="text")
                try:
                    records_efetch = list(Medline.parse(handle_efetch))
                finally:
                    handle_efetch.close()
                break
            except HTTPError as e:
                if e.code in [429, 500]:
                    print(f"efetch: HTTP错误 {e.code},5 秒后重试...")
                    time.sleep(5)
                    continue
                else:
                    print(f"efetch: HTTP错误: {e}")
                    raise
            except Exception as e:
                print(f"efetch: Error: {e}")
                time.sleep(5)
                continue
        else:
            # Every attempt failed: report it and move on instead of crashing.
            print(f"efetch: giving up on {len(batch)} PMIDs starting at index {i} after 5 attempts")

        for record_efetch in records_efetch:
            PMID = record_efetch.get('PMID', 'NA')
            print(PMID)
            linked, references = fetch_citation_data(PMID)
            # The publication date is extracted from the 'SO' (source) field.
            publication_date = record_efetch.get('SO', 'NA')
            match = date_pattern.search(publication_date)
            formatted_publication_date = match.group() if match else 'NA'
            record_dict = create_record_dict(record_efetch, formatted_publication_date, linked, references)
            data.append(record_dict)

        print(f'已成功获取 {len(data)}/{len(pmid_list)} 篇文献信息')
        # Checkpoint: rewrite the whole CSV after each batch.
        df = pd.DataFrame(data)
        df.to_csv(fname, index=False)
        time.sleep(5)
# fetch_citation_data 函数
这个函数获取文章的引用和参考文献信息:
def fetch_citation_data(PMID):
    """Look up the PubMed links of one article with ELink.

    The request is attempted up to five times. If every attempt fails with a
    retryable error, a message is printed and two empty lists are returned.

    Args:
        PMID: PubMed identifier of the article.

    Returns:
        A tuple ``(linked, references)``: the ids from the
        ``pubmed_pubmed_citedin`` link set and the ids from the
        ``pubmed_pubmed_refs`` link set.

    Raises:
        HTTPError: If ELink fails with a status other than 429 or 500.
    """
    linked = []
    references = []
    # Stays None when no attempt succeeds, so the parsing step below is skipped.
    record_elink = None
    for _ in range(5):
        try:
            handle_elink = Entrez.elink(db="pubmed", id=PMID, linkname="pubmed_pubmed_citedin,pubmed_pubmed_refs")
            try:
                record_elink = Entrez.read(handle_elink)
            finally:
                handle_elink.close()
            break
        except HTTPError as e:
            if e.code in [429, 500]:
                print(f"elink: HTTP错误 {e.code},5 秒后重试...")
                time.sleep(5)
                continue
            else:
                print(f"elink: HTTP错误: {e}")
                raise
        except Exception as e:
            print(f"elink: Error: {e}")
            time.sleep(5)
            continue
    else:
        print(f"elink: giving up on PMID {PMID} after 5 attempts; link data left empty")

    if record_elink and "LinkSetDb" in record_elink[0]:
        for linkset in record_elink[0]["LinkSetDb"]:
            if "Link" not in linkset:
                continue
            ids = [link["Id"] for link in linkset["Link"]]
            if linkset["LinkName"] == "pubmed_pubmed_citedin":
                linked.extend(ids)
            elif linkset["LinkName"] == "pubmed_pubmed_refs":
                references.extend(ids)
    return linked, references
# create_record_dict 函数
这个函数创建一个包含文章详细信息的字典:
def _extract_doi(record):
    """Return the identifier tagged '[doi]' in the record's LID or AID field, or 'NA'."""
    for tag in ('LID', 'AID'):
        value = record.get(tag)
        if not value:
            continue
        # Accept either one string or a list of strings for the field.
        chunks = [value] if isinstance(value, str) else value
        for chunk in chunks:
            tokens = chunk.split()
            for position, token in enumerate(tokens):
                # The identifier is the token directly before its '[doi]' marker.
                if token == '[doi]' and position > 0:
                    return tokens[position - 1]
    return 'NA'


def create_record_dict(record_efetch, publication_date, linked, references):
    """Build one output row (a dict) from a parsed MEDLINE record.

    Missing fields are reported as the string 'NA'.

    Args:
        record_efetch: Mapping from MEDLINE tags to values for one article.
        publication_date: Publication date string already extracted by the caller.
        linked: Ids of the articles that cite this one.
        references: Ids of the articles this one cites.

    Returns:
        A dict whose keys are the CSV column names, in output order.
    """
    return {
        'Title': record_efetch.get('TI', 'NA'),  # article title
        'Status': record_efetch.get('STAT', 'NA'),  # record status, e.g. 'PubMed-not-MEDLINE'
        'Last Revision Date': record_efetch.get('LR', 'NA'),  # date of last revision
        'ISSN': record_efetch.get('IS', 'NA'),  # ISSN
        'Type': record_efetch.get('PT', 'NA'),  # publication type
        # Only the first space-separated token of 'DP' (the year) is kept.
        'Year of Publication': record_efetch['DP'].split(' ')[0] if 'DP' in record_efetch else 'NA',
        'Date of Electronic Publication': record_efetch.get('DEP', 'NA'),  # electronic publication date
        'Publication Date': publication_date,  # publication date supplied by the caller
        'Place of Publication': record_efetch.get('PL', 'NA'),  # place of publication
        'F_Author': record_efetch.get('FAU', 'NA'),  # full author names
        'Author': record_efetch.get('AU', 'NA'),  # abbreviated author names
        'Affiliation': record_efetch.get('AD', 'NA'),  # affiliations
        'Abstract': record_efetch.get('AB', 'NA'),  # abstract
        'Language': record_efetch.get('LA', 'NA'),  # language
        'Keywords': record_efetch.get('OT', 'NA'),  # keywords
        'PMID': record_efetch.get('PMID', 'NA'),  # PubMed id
        'Medline Volume': record_efetch.get('VI', 'NA'),  # volume
        'Medline Issue': record_efetch.get('IP', 'NA'),  # issue
        'Medline Pagination': record_efetch.get('PG', 'NA'),  # pagination
        # LID can carry several identifiers (e.g. a '[pii]' entry before the '[doi]' one),
        # so the DOI is located by its marker rather than by position.
        'DOI': _extract_doi(record_efetch),
        'PMC': record_efetch.get('PMC', 'NA'),  # PubMed Central id
        # NOTE(review): 'PSTT' could not be confirmed as a MEDLINE tag; publication
        # history is usually 'PHST' - verify before relying on this column.
        'Processing History': record_efetch.get('PSTT', 'NA'),
        'Publication Status': record_efetch.get('PST', 'NA'),  # publication status
        'Journal Title Abbreviation': record_efetch.get('TA', 'NA'),  # abbreviated journal title
        'Journal Title': record_efetch.get('JT', 'NA'),  # full journal title
        'Journal ID': record_efetch.get('JID', 'NA'),  # journal id
        'Source': record_efetch.get('SO', 'NA'),  # source string
        'Grant List': record_efetch.get('GR', 'NA'),  # grant numbers
        'cited': len(linked),  # number of citing articles
        'cited_by': linked,  # ids of citing articles
        'References': len(references),  # number of references
        'References_PMID': references,  # ids of the references
    }
# 运行脚本
在主程序中,我们定义检索词列表和对应的文件名列表,然后调用 fetch_pubmed_data 函数:
if __name__ == "__main__":
    # Single query: one search expression and the CSV file that receives its results.
    term_list = ['((ophthalmology[Text Word])) AND (ophthalmology[Title/Abstract])']
    fname_list = ['./ophthalmology.csv']

    # Several queries, each saved to its own file (use these two lists instead):
    # term_list = ['''"Refractive Errors"[Text Word] AND "Refractive Errors"[Title/Abstract]''',
    #              '''"diabetic retinopathy"[Text Word] AND "diabetic retinopathy"[Title/Abstract]''',
    #              '''"cornea diseases" OR "uveitis" OR "visual function" OR "ocular cancer" OR "choroidal diseases" OR "strabismus"''']
    # fname_list = ['./result/Eye_RE.csv',
    #               './result/Eye_DR.csv',
    #               './result/Eye_Other.csv']

    # Queries and output files are paired by position.
    for query, csv_path in zip(term_list, fname_list):
        fetch_pubmed_data(query, csv_path)
运行脚本后,符合条件的文献信息将被提取并保存到对应的 CSV 文件中。
# 完整代码
import pandas as pd | |
from Bio import Entrez, Medline | |
import time | |
from urllib.error import HTTPError | |
import re | |
import ssl | |
import urllib3 | |
# Silence urllib3's InsecureRequestWarning and turn off HTTPS certificate verification.
# NOTE(review): the assignment below replaces ssl's default HTTPS context factory for the
# whole process, so later HTTPS connections that rely on that default are not verified
# either. Keep it only as a workaround for certificate errors.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
ssl._create_default_https_context = ssl._create_unverified_context
# E-mail address handed to Bio.Entrez; the value here is a placeholder, replace it with your own.
Entrez.email = "your_email@example.com"
# Matches "<4-digit year> <English month abbreviation> <1-2 digit day>", e.g. "2020 Jan 15".
date_pattern = re.compile(r'\b\d{4} \b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\b \d{1,2}')
def fetch_pubmed_data(term, fname, start_year=2004, end_year=2024):
    """Fetch PubMed records matching a query and save them to a CSV file.

    The hit count is requested first. With fewer than 10000 hits all PMIDs
    are retrieved by a single ESearch call; otherwise the query is repeated
    once per publication year from ``start_year`` to ``end_year`` and the
    PMIDs are merged. Records are then downloaded with EFetch in batches of
    50, each record is combined with the link data returned by
    ``fetch_citation_data``, and the accumulated table is rewritten to
    ``fname`` after every batch so partial results survive an interruption.

    Args:
        term: PubMed search expression.
        fname: Path of the CSV file to write.
        start_year: First publication year queried in year-by-year mode.
        end_year: Last publication year (inclusive) queried in that mode.
            Records published outside this range are not retrieved.

    Raises:
        HTTPError: If EFetch fails with a status other than 429 or 500.
            ESearch calls are not retried, so their errors propagate as well.
    """

    def _search(query, retmax):
        # Run one ESearch request; the handle is closed even if parsing fails.
        handle = Entrez.esearch(db="pubmed", term=query, retmax=retmax)
        try:
            return Entrez.read(handle)
        finally:
            handle.close()

    total = int(_search(term, 0)["Count"])
    time.sleep(5)

    if total < 10000:
        pmid_list = list(_search(term, total)["IdList"])
    else:
        # Too many hits for one request: split the query by publication year.
        pmid_list = []
        for year in range(start_year, end_year + 1):
            subterm = term + f' AND (("{year}"[Date - Publication] : "{year}"[Date - Publication]))'
            year_count = int(_search(subterm, 0)["Count"])
            if year_count >= 10000:
                print(f"esearch: {year} has {year_count} hits, at or above the 10000 limit "
                      f"this script assumes; results for that year may be incomplete")
            if year_count:
                pmid_list += _search(subterm, year_count)["IdList"]
            time.sleep(5)

    # Drop duplicates and order from the highest PMID to the lowest.
    pmid_list = sorted(set(pmid_list), key=int, reverse=True)

    data = []
    for i in range(0, len(pmid_list), 50):
        batch = pmid_list[i:i + 50]
        # Reset per batch so a failed download can never reuse the previous batch's records.
        records_efetch = []
        for _ in range(5):
            try:
                handle_efetch = Entrez.efetch(db="pubmed", id=batch, rettype='medline', retmode="text")
                try:
                    records_efetch = list(Medline.parse(handle_efetch))
                finally:
                    handle_efetch.close()
                break
            except HTTPError as e:
                if e.code in [429, 500]:
                    print(f"efetch: HTTP错误 {e.code},5 秒后重试...")
                    time.sleep(5)
                    continue
                else:
                    print(f"efetch: HTTP错误: {e}")
                    raise
            except Exception as e:
                print(f"efetch: Error: {e}")
                time.sleep(5)
                continue
        else:
            # Every attempt failed: report it and move on instead of crashing.
            print(f"efetch: giving up on {len(batch)} PMIDs starting at index {i} after 5 attempts")

        for record_efetch in records_efetch:
            PMID = record_efetch.get('PMID', 'NA')
            print(PMID)
            linked, references = fetch_citation_data(PMID)
            # The publication date is extracted from the 'SO' (source) field.
            publication_date = record_efetch.get('SO', 'NA')
            match = date_pattern.search(publication_date)
            formatted_publication_date = match.group() if match else 'NA'
            record_dict = create_record_dict(record_efetch, formatted_publication_date, linked, references)
            data.append(record_dict)

        print(f'已成功获取 {len(data)}/{len(pmid_list)} 篇文献信息')
        # Checkpoint: rewrite the whole CSV after each batch.
        df = pd.DataFrame(data)
        df.to_csv(fname, index=False)
        time.sleep(5)
def fetch_citation_data(PMID):
    """Look up the PubMed links of one article with ELink.

    The request is attempted up to five times. If every attempt fails with a
    retryable error, a message is printed and two empty lists are returned.

    Args:
        PMID: PubMed identifier of the article.

    Returns:
        A tuple ``(linked, references)``: the ids from the
        ``pubmed_pubmed_citedin`` link set and the ids from the
        ``pubmed_pubmed_refs`` link set.

    Raises:
        HTTPError: If ELink fails with a status other than 429 or 500.
    """
    linked = []
    references = []
    # Stays None when no attempt succeeds, so the parsing step below is skipped.
    record_elink = None
    for _ in range(5):
        try:
            handle_elink = Entrez.elink(db="pubmed", id=PMID, linkname="pubmed_pubmed_citedin,pubmed_pubmed_refs")
            try:
                record_elink = Entrez.read(handle_elink)
            finally:
                handle_elink.close()
            break
        except HTTPError as e:
            if e.code in [429, 500]:
                print(f"elink: HTTP错误 {e.code},5 秒后重试...")
                time.sleep(5)
                continue
            else:
                print(f"elink: HTTP错误: {e}")
                raise
        except Exception as e:
            print(f"elink: Error: {e}")
            time.sleep(5)
            continue
    else:
        print(f"elink: giving up on PMID {PMID} after 5 attempts; link data left empty")

    if record_elink and "LinkSetDb" in record_elink[0]:
        for linkset in record_elink[0]["LinkSetDb"]:
            if "Link" not in linkset:
                continue
            ids = [link["Id"] for link in linkset["Link"]]
            if linkset["LinkName"] == "pubmed_pubmed_citedin":
                linked.extend(ids)
            elif linkset["LinkName"] == "pubmed_pubmed_refs":
                references.extend(ids)
    return linked, references
def _extract_doi(record):
    """Return the identifier tagged '[doi]' in the record's LID or AID field, or 'NA'."""
    for tag in ('LID', 'AID'):
        value = record.get(tag)
        if not value:
            continue
        # Accept either one string or a list of strings for the field.
        chunks = [value] if isinstance(value, str) else value
        for chunk in chunks:
            tokens = chunk.split()
            for position, token in enumerate(tokens):
                # The identifier is the token directly before its '[doi]' marker.
                if token == '[doi]' and position > 0:
                    return tokens[position - 1]
    return 'NA'


def create_record_dict(record_efetch, publication_date, linked, references):
    """Build one output row (a dict) from a parsed MEDLINE record.

    Missing fields are reported as the string 'NA'.

    Args:
        record_efetch: Mapping from MEDLINE tags to values for one article.
        publication_date: Publication date string already extracted by the caller.
        linked: Ids of the articles that cite this one.
        references: Ids of the articles this one cites.

    Returns:
        A dict whose keys are the CSV column names, in output order.
    """
    return {
        'Title': record_efetch.get('TI', 'NA'),  # article title
        'Status': record_efetch.get('STAT', 'NA'),  # record status, e.g. 'PubMed-not-MEDLINE'
        'Last Revision Date': record_efetch.get('LR', 'NA'),  # date of last revision
        'ISSN': record_efetch.get('IS', 'NA'),  # ISSN
        'Type': record_efetch.get('PT', 'NA'),  # publication type
        # Only the first space-separated token of 'DP' (the year) is kept.
        'Year of Publication': record_efetch['DP'].split(' ')[0] if 'DP' in record_efetch else 'NA',
        'Date of Electronic Publication': record_efetch.get('DEP', 'NA'),  # electronic publication date
        'Publication Date': publication_date,  # publication date supplied by the caller
        'Place of Publication': record_efetch.get('PL', 'NA'),  # place of publication
        'F_Author': record_efetch.get('FAU', 'NA'),  # full author names
        'Author': record_efetch.get('AU', 'NA'),  # abbreviated author names
        'Affiliation': record_efetch.get('AD', 'NA'),  # affiliations
        'Abstract': record_efetch.get('AB', 'NA'),  # abstract
        'Language': record_efetch.get('LA', 'NA'),  # language
        'Keywords': record_efetch.get('OT', 'NA'),  # keywords
        'PMID': record_efetch.get('PMID', 'NA'),  # PubMed id
        'Medline Volume': record_efetch.get('VI', 'NA'),  # volume
        'Medline Issue': record_efetch.get('IP', 'NA'),  # issue
        'Medline Pagination': record_efetch.get('PG', 'NA'),  # pagination
        # LID can carry several identifiers (e.g. a '[pii]' entry before the '[doi]' one),
        # so the DOI is located by its marker rather than by position.
        'DOI': _extract_doi(record_efetch),
        'PMC': record_efetch.get('PMC', 'NA'),  # PubMed Central id
        # NOTE(review): 'PSTT' could not be confirmed as a MEDLINE tag; publication
        # history is usually 'PHST' - verify before relying on this column.
        'Processing History': record_efetch.get('PSTT', 'NA'),
        'Publication Status': record_efetch.get('PST', 'NA'),  # publication status
        'Journal Title Abbreviation': record_efetch.get('TA', 'NA'),  # abbreviated journal title
        'Journal Title': record_efetch.get('JT', 'NA'),  # full journal title
        'Journal ID': record_efetch.get('JID', 'NA'),  # journal id
        'Source': record_efetch.get('SO', 'NA'),  # source string
        'Grant List': record_efetch.get('GR', 'NA'),  # grant numbers
        'cited': len(linked),  # number of citing articles
        'cited_by': linked,  # ids of citing articles
        'References': len(references),  # number of references
        'References_PMID': references,  # ids of the references
    }
if __name__ == "__main__":
    # Single query: one search expression and the CSV file that receives its results.
    term_list = ['((ophthalmology[Text Word])) AND (ophthalmology[Title/Abstract])']
    fname_list = ['./ophthalmology.csv']

    # Several queries, each saved to its own file (use these two lists instead):
    # term_list = ['''"Refractive Errors"[Text Word] AND "Refractive Errors"[Title/Abstract]''',
    #              '''"diabetic retinopathy"[Text Word] AND "diabetic retinopathy"[Title/Abstract]''',
    #              '''"cornea diseases" OR "uveitis" OR "visual function" OR "ocular cancer" OR "choroidal diseases" OR "strabismus"''']
    # fname_list = ['./result/Eye_RE.csv',
    #               './result/Eye_DR.csv',
    #               './result/Eye_Other.csv']

    # Queries and output files are paired by position.
    for query, csv_path in zip(term_list, fname_list):
        fetch_pubmed_data(query, csv_path)