pymed

Pubmed_search_python

Posted by Yuan on October 4, 2021

To search the PubMed database with the pymed package, you need an NCBI account and an API key generated in the NCBI account settings. With these in place, I query PubMed with a list of article titles to retrieve the author information for each one. My code for this task:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
from pymed import PubMed
import pandas as pd
from difflib import SequenceMatcher
import json
import os
#Some Basic Settings
# PubMed client used for every query in this script; `tool` and `email` are
# placeholders to be replaced with your own tool name and NCBI account e-mail.
pubmed = PubMed(tool="PubMedSearcher", email="myNCBIAccountEmail")
# Placeholder: replace with the API key generated in your NCBI account settings.
my_api_key = 'MyAPI key'
# Add the key to the parameter dict held by the client
# (presumably sent along with each request - confirm against pymed).
pubmed.parameters.update({'api_key': my_api_key})
# NOTE(review): `_rateLimit` is a private pymed attribute; presumably it is the
# allowed number of requests per second, raised here because an API key is
# supplied - confirm against pymed and the NCBI usage guidelines.
pubmed._rateLimit = 10

# Append results one by one so that finished searches are already on disk and
# do not have to be repeated if the script fails part-way through.
def append_record(record, path='pub_files.txt'):
    '''
    Append one record to a file as a single line of JSON.

    record: JSON-serialisable object (the script passes one result dict).
    path:   file to append to; it is created if it does not exist.
    '''
    with open(path, 'a') as f:
        json.dump(record, f)
        # The file is opened in text mode, which already translates "\n" to
        # the platform line ending; writing os.linesep here would produce
        # "\r\r\n" on Windows.
        f.write("\n")

def getpubaus(authos_pub):
    '''
    Build a comma-separated author string from pubmed author dicts.

    authos_pub: iterable of dicts that each have the keys 'initials' and
                'lastname' (either value may be None).

    Returns one entry per author, joined with ",":
      * initials and last name present -> "<initials> <LASTNAME>"
        (last name upper-cased)
      * only the last name present     -> the last name as given
        (not upper-cased, as before)
      * no last name                   -> the author is skipped
    An empty input gives an empty string.
    '''
    aus = []
    for oneau in authos_pub:
        lastname = oneau['lastname']
        if lastname is None:
            # Nothing usable to print for this author. This also covers
            # "initials but no last name", which previously raised
            # AttributeError on None.upper().
            continue
        initials = oneau['initials']
        if initials is None:
            aus.append(lastname)
        else:
            aus.append(initials + " " + lastname.upper())
    return ",".join(aus)


## PUT YOUR SEARCH TERM HERE ##
# NOTE: `allcitations`, `aus_terms`, `title_terms` and `id_terms` are not
# defined in this listing. They must already exist and be indexable by the
# same row position `i` as `search_terms` (presumably columns of
# `allcitations` - confirm).
search_terms = allcitations['title_source'].values.tolist()
articleInfo = []
tip = 0  # not used anywhere in this listing

# Last row position to search (rows 0..MAX_SOURCE_INDEX inclusive).
# Set to None to process every row.
MAX_SOURCE_INDEX = 367
# A PubMed title counts as a 'High' match when its SequenceMatcher ratio
# against the source title is above this value.
TITLE_MATCH_THRESHOLD = 0.8


def _source_first_author_lastname(authors):
    '''
    Return the upper-cased last word of the first comma-separated author.

    The entry is stripped before it is split on spaces, so a space in front
    of the comma ("J Smith , K Lee") no longer yields an empty last name.
    '''
    first_author = str(authors).split(",")[0].strip()
    return first_author.split(" ")[-1].upper()


def _make_record(i, match_level, article=None):
    '''
    Build the output row for source row `i`.

    match_level: 'High', 'Low' or 'None'.
    article:     the matched PubMed article dict, or None when nothing
                 matched (the PubMed fields are then empty strings).
    '''
    if article is None:
        pubmed_id, title_pub, authors_pub = "", "", ""
    else:
        # pubmed_id may hold more than one id; only the text before the first
        # newline is kept (the original note describes the list as
        # comma-separated - TODO confirm the actual format).
        pubmed_id = article['pubmed_id'].partition('\n')[0]
        title_pub = article['title']
        authors_pub = getpubaus(article['authors'])
    return {u'index_source': i,
            u'pubmed_id': pubmed_id,
            u'title_pub': title_pub,
            u'title_source': title_terms[i],
            u'authors_pub': authors_pub,
            u'clusterid_google': id_terms[i],
            u'titleMatch': match_level
            }


for i, search_term in enumerate(search_terms):
    # Check the limit first so no work is done for rows that are skipped.
    if MAX_SOURCE_INDEX is not None and i > MAX_SOURCE_INDEX:
        break

    authors_source_last = _source_first_author_lastname(aus_terms[i])
    title_source = title_terms[i].strip().upper()

    print(f"Pubmed search for {i}th article:\n#######################")
    print(search_term)
    print("#######################")
    results = pubmed.query(search_term, max_results=20)
    # Results can be PubMedBookArticle or PubMedArticle objects; convert each
    # one to a plain dictionary with the available function.
    articleList = [article.toDict() for article in results]

    # Scan all results. A title match wins wherever it appears in the list;
    # the first first-author match is only kept as a fallback. (Previously
    # an earlier author-only hit ended the scan before a later title match
    # could be seen.)
    high_match = None
    low_match = None
    for article in articleList:
        # Author dicts look like:
        # {'lastname': 'Oliva', 'firstname': 'Rosario', 'initials': 'R', ...}
        authors_pubmed = article['authors']
        if not authors_pubmed:
            continue

        # A missing title is compared as an empty string instead of crashing.
        title_pubmed = article['title'] or ""
        ratio = SequenceMatcher(None, title_pubmed.upper(), title_source).ratio()
        if ratio > TITLE_MATCH_THRESHOLD:
            high_match = article
            break

        if low_match is None:
            lastau_pubmed = (authors_pubmed[0]['lastname'] or "").upper()
            # Two missing (empty) last names must not count as a match.
            if lastau_pubmed and lastau_pubmed == authors_source_last:
                low_match = article

    if high_match is not None:
        my_dict = _make_record(i, 'High', high_match)
        print("Find match for:")
        print(high_match['title'])
        print(my_dict['authors_pub'])
        print("-----------------------\n")
    elif low_match is not None:
        my_dict = _make_record(i, 'Low', low_match)
        print(f"Find potential match for:{title_source}")
        print(low_match['title'])
        print(my_dict['authors_pub'])
        print("-----------------------\n")
    else:
        my_dict = _make_record(i, 'None')

    # Keep the row in memory and also append it to disk right away.
    articleInfo.append(my_dict)
    append_record(my_dict)

#Print first 10 rows of dataframe
#print(articlesPD.head(10))
# Collect every per-row result into one table and save it.
articlesPD = pd.DataFrame.from_dict(articleInfo)
articlesPD.to_csv(r'pubResults2.csv', index=False, header=True)

# Merge the PubMed results back onto the source table, but only when every
# source row produced exactly one result row.
if allcitations.shape[0] == articlesPD.shape[0]:
    # Results are keyed by row position (index_source), while concat(axis=1)
    # aligns on index labels. Resetting the index makes the join positional
    # even if `allcitations` carries a non-default index.
    # Note: both tables contain a 'title_source' column, so it appears twice.
    allcitations_update = pd.concat(
        [allcitations.reset_index(drop=True), articlesPD], axis=1
    )
    allcitations_update.to_csv(r'all_pubmed.csv', index=False, header=True)
else:
    # Say so instead of silently writing nothing.
    print(
        f"Skipped all_pubmed.csv: {allcitations.shape[0]} source rows "
        f"but {articlesPD.shape[0]} result rows"
    )
#for au in articlesPD.loc[:,"authors_pub"]:
#    print(au)