1
2
3
4
5
6
7 """Provides code to access NCBI over the WWW.
8
9 The main Entrez web page is available at:
10 http://www.ncbi.nlm.nih.gov/Entrez/
11
12 A list of the Entrez utilities is available at:
13 http://www.ncbi.nlm.nih.gov/entrez/utils/utils_index.html
14
15 Variables:
16 email Set the Entrez email parameter (default is not set).
17 tool Set the Entrez tool parameter (default is biopython).
18
19 Functions:
20 efetch Retrieves records in the requested format from a list of one or
21 more primary IDs or from the user's environment
22 epost Posts a file containing a list of primary IDs for future use in
23 the user's environment to use with subsequent search strategies
24 esearch Searches and retrieves primary IDs (for use in EFetch, ELink,
25 and ESummary) and term translations and optionally retains
26 results for future use in the user's environment.
27 elink Checks for the existence of an external or Related Articles link
28 from a list of one or more primary IDs. Retrieves primary IDs
29 and relevancy scores for links to Entrez databases or Related
30 Articles; creates a hyperlink to the primary LinkOut provider
31 for a specific ID and database, or lists LinkOut URLs
32 and Attributes for multiple IDs.
33 einfo Provides field index term counts, last update, and available
34 links for each database.
35 esummary Retrieves document summaries from a list of primary IDs or from
36 the user's environment.
37 egquery Provides Entrez database counts in XML for a single search
38 using Global Query.
39 espell Retrieves spelling suggestions.
40
41 read Parses the XML results returned by any of the above functions.
42 Typical usage is:
43 >>> handle = Entrez.einfo() # or esearch, efetch, ...
44 >>> record = Entrez.read(handle)
45 where record is now a Python dictionary or list.
46
47 _open Internally used function.
48
49 """
50 import urllib, time, warnings
51 import os.path
52 from Bio import File
53
54
# Contact e-mail address sent as the "email" parameter of every request that
# does not supply its own.  None means "not set"; _open() then issues a
# UserWarning instead of adding the parameter.
email = None
# Client name sent as the "tool" parameter of every request that does not
# supply its own.
tool = "biopython"
57
58
59
def epost(db, **keywds):
    """Post a file of identifiers for future use.

    Posts a file containing a list of UIs for future use in the user's
    environment to use with subsequent search strategies.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/epost_help.html

    Arguments:
    db      Value sent as the "db" parameter of the request.
    keywds  Any further EPost parameters, given as keyword arguments.

    Return a handle to the results.

    Raises an IOError exception if there's a network error.
    """
    cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/epost.fcgi'
    variables = {'db': db}
    variables.update(keywds)
    # EPost is the only helper here that asks _open for an HTTP POST, so
    # the parameters travel in the request body instead of the URL.
    return _open(cgi, variables, post=True)
77
def efetch(db, **keywds):
    """Fetches Entrez results which are returned as a handle.

    EFetch retrieves records in the requested format from a list of one or
    more UIs or from user's environment.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/efetch_help.html

    Arguments:
    db      Value sent as the "db" parameter of the request.
    keywds  Any further EFetch parameters, given as keyword arguments.
            A "rettype" of "genbank" (matched case-insensitively) is
            replaced by "gp" when db is "protein" and by "gb" otherwise,
            and a DeprecationWarning is issued.

    Return a handle to the results.

    Raises an IOError exception if there's a network error.

    Short example:

    from Bio import Entrez
    handle = Entrez.efetch(db="nucleotide", id="57240072", rettype="gb")
    print handle.read()
    """
    # NOTE(review): the "def" line was missing from the reviewed listing;
    # the signature is rebuilt from the names the body uses -- confirm.
    for key in list(keywds):
        if key.lower() == "rettype" and keywds[key].lower() == "genbank":
            warnings.warn('As of Easter 2009, Entrez EFetch no longer '
                          'supports the unofficial return type "genbank", '
                          'use "gb" or "gp" instead.', DeprecationWarning)
            # Substitute the supported return type for the caller.
            if db.lower() == "protein":
                keywds[key] = "gp"
            else:
                keywds[key] = "gb"
    cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    variables = {'db': db}
    variables.update(keywds)
    return _open(cgi, variables)
110
def esearch(db, term, **keywds):
    """ESearch runs an Entrez search and returns a handle to the results.

    ESearch searches and retrieves primary IDs (for use in EFetch, ELink
    and ESummary) and term translations, and optionally retains results
    for future use in the user's environment.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/esearch_help.html

    Arguments:
    db      Value sent as the "db" parameter of the request.
    term    Value sent as the "term" parameter of the request.
    keywds  Any further ESearch parameters, given as keyword arguments.

    Return a handle to the results which are always in XML format.

    Raises an IOError exception if there's a network error.

    Short example:

    from Bio import Entrez
    handle = Entrez.esearch(db="nucleotide", retmax=10, term="Opuntia")
    record = Entrez.read(handle)
    print record["Count"]
    print record["IdList"]
    """
    # NOTE(review): the "def" line was missing from the reviewed listing;
    # the signature is rebuilt from the names the body uses -- confirm.
    cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    variables = {'db': db,
                 'term': term}
    variables.update(keywds)
    return _open(cgi, variables)
138
def elink(**keywds):
    """ELink checks for linked external articles and returns a handle.

    ELink checks for the existence of an external or Related Articles link
    from a list of one or more primary IDs;  retrieves IDs and relevancy
    scores for links to Entrez databases or Related Articles; creates a
    hyperlink to the primary LinkOut provider for a specific ID and
    database, or lists LinkOut URLs and attributes for multiple IDs.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/elink_help.html

    Arguments:
    keywds  The ELink parameters, given as keyword arguments.

    Return a handle to the results, by default in XML format.

    Raises an IOError exception if there's a network error.
    """
    # NOTE(review): the "def" line was missing from the reviewed listing;
    # the signature is rebuilt from the names the body uses -- confirm.
    cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi'
    variables = dict(keywds)
    return _open(cgi, variables)
159
def einfo(**keywds):
    """EInfo returns a summary of the Entrez databases as a results handle.

    EInfo provides field names, index term counts, last update, and
    available links for each Entrez database.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/einfo_help.html

    Arguments:
    keywds  The EInfo parameters, given as keyword arguments.

    Return a handle to the results, by default in XML format.

    Raises an IOError exception if there's a network error.

    Short example:

    from Bio import Entrez
    record = Entrez.read(Entrez.einfo())
    print record['DbList']
    """
    # NOTE(review): the "def" line was missing from the reviewed listing;
    # the signature is rebuilt from the names the body uses -- confirm.
    cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi'
    variables = dict(keywds)
    return _open(cgi, variables)
183
def esummary(**keywds):
    """ESummary retrieves document summaries as a results handle.

    ESummary retrieves document summaries from a list of primary IDs or
    from the user's environment.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/esummary_help.html

    Arguments:
    keywds  The ESummary parameters, given as keyword arguments.

    Return a handle to the results, by default in XML format.

    Raises an IOError exception if there's a network error.
    """
    # NOTE(review): the "def" line was missing from the reviewed listing;
    # the signature is rebuilt from the names the body uses -- confirm.
    cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'
    variables = dict(keywds)
    return _open(cgi, variables)
201
def egquery(**keywds):
    """EGQuery provides Entrez database counts for a global search.

    EGQuery provides Entrez database counts in XML for a single search
    using Global Query.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/egquery_help.html

    Arguments:
    keywds  The EGQuery parameters, given as keyword arguments.

    Return a handle to the results in XML format.

    Raises an IOError exception if there's a network error.
    """
    # NOTE(review): the "def" line was missing from the reviewed listing;
    # the signature is rebuilt from the names the body uses -- confirm.
    cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/egquery.fcgi'
    variables = dict(keywds)
    return _open(cgi, variables)
219
def espell(**keywds):
    """ESpell retrieves spelling suggestions, returned in a results handle.

    ESpell retrieves spelling suggestions, if available.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/espell_help.html

    Arguments:
    keywds  The ESpell parameters, given as keyword arguments.

    Return a handle to the results, by default in XML format.

    Raises an IOError exception if there's a network error.

    Short example:

    from Bio import Entrez
    record = Entrez.read(Entrez.espell(term="biopythooon"))
    print record["Query"]
    print record["CorrectedQuery"]
    """
    # NOTE(review): the "def" line was missing from the reviewed listing;
    # the signature is rebuilt from the names the body uses -- confirm.
    cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/espell.fcgi'
    variables = dict(keywds)
    return _open(cgi, variables)
243
def read(handle):
    """Parses an XML file from the NCBI Entrez Utilities into python objects.

    This function parses an XML file created by NCBI's Entrez Utilities,
    returning a multilevel data structure of Python lists and dictionaries.
    Most XML files returned by NCBI's Entrez Utilities can be parsed by
    this function, provided its DTD is available. Biopython includes the
    DTDs for most commonly used Entrez Utilities.

    Whereas the data structure seems to consist of generic Python lists,
    dictionaries, strings, and so on, each of these is actually a class
    derived from the base type. This allows us to store the attributes
    (if any) of each element in a dictionary my_element.attributes, and
    the tag name in my_element.tag.

    Arguments:
    handle  The handle to parse; it is passed unchanged to
            DataHandler.read().

    Return whatever DataHandler.read() produces for the handle.
    """
    # NOTE(review): the "def" line was missing from the reviewed listing;
    # the signature is rebuilt from the names the body uses -- confirm.
    # Imported here, on first use, rather than when the package is loaded.
    from Parser import DataHandler
    # The DTDs live in the "DTDs" directory inside this package.
    DTDs = os.path.join(__path__[0], "DTDs")
    handler = DataHandler(DTDs)
    return handler.read(handle)
264
271
def _open(cgi, params=None, post=False):
    """Helper function to build the URL and open a handle to it (PRIVATE).

    Open a handle to Entrez.  cgi is the URL for the cgi script to access.
    params is a dictionary with the options to pass to it.  Does some
    simple error checking, and will raise an IOError if it encounters one.

    This function also enforces the "up to three queries per second rule"
    to avoid abusing the NCBI servers.

    Arguments:
    cgi     URL of the E-utility script.
    params  Dictionary of request options, or None for no options.
            Entries whose value is None are left out.  The module-level
            "tool" and "email" settings are added when the dictionary does
            not already carry them.  The caller's dictionary is not
            modified.
    post    If true the options are sent as POST data, otherwise they
            are appended to the URL as a query string.

    Return a File.UndoHandle wrapping the response.  The first seven lines
    are read for error checking and then pushed back, so the caller still
    sees the complete response.

    Raises an IOError exception if the start of the response matches one
    of the known NCBI error pages or messages.
    """
    # Throttle: leave at least a third of a second between the start of
    # consecutive requests.  _open.previous holds the time at which the
    # previous request was allowed to proceed.
    delay = 0.333333334
    current = time.time()
    wait = _open.previous + delay - current
    if wait > 0:
        time.sleep(wait)
        _open.previous = current + wait
    else:
        _open.previous = current

    # Build a private copy without the None-valued options.  Working on a
    # copy keeps "tool"/"email" from leaking into the caller's dictionary
    # (or into a shared default argument) and avoids deleting entries
    # from a dictionary while iterating over it.
    if params is None:
        params = {}
    params = dict((key, value) for key, value in params.items()
                  if value is not None)

    # Tell Entrez which tool is making the request.
    if "tool" not in params:
        params["tool"] = tool
    # Tell Entrez who to contact, or remind the user to say so.
    if "email" not in params:
        if email is not None:
            params["email"] = email
        else:
            warnings.warn("""
Email address is not specified.

To make use of NCBI's E-utilities, NCBI strongly recommends you to specify
your email address with each request. From June 1, 2010, this will be
mandatory. As an example, if your email address is A.N.Other@example.com, you
can specify it as follows:
from Bio import Entrez
Entrez.email = 'A.N.Other@example.com'
In case of excessive usage of the E-utilities, NCBI will attempt to contact
a user at the email address provided before blocking access to the
E-utilities.""", UserWarning)

    # doseq=True expands sequence values into one key=value pair each.
    options = urllib.urlencode(params, doseq=True)
    if post:
        # HTTP POST: options go in the request body.
        handle = urllib.urlopen(cgi, data=options)
    else:
        # HTTP GET: options go in the query string.
        cgi += "?" + options
        handle = urllib.urlopen(cgi)

    # Wrap the handle so the lines we peek at can be pushed back.
    uhandle = File.UndoHandle(handle)

    # Peek at the first seven lines; push them back last-first so they
    # are read again in their original order.
    lines = [uhandle.readline() for _ in range(7)]
    for line in reversed(lines):
        uhandle.saveline(line)
    data = ''.join(lines)

    # Known error pages and messages.  The order of the tests is
    # significant: the first match decides the message.
    if "500 Proxy Error" in data:
        raise IOError("500 Proxy Error (NCBI busy?)")
    elif "502 Proxy Error" in data:
        raise IOError("502 Proxy Error (NCBI busy?)")
    elif "WWW Error 500 Diagnostic" in data:
        raise IOError("WWW Error 500 Diagnostic (NCBI busy?)")
    elif "<title>Service unavailable!</title>" in data:
        raise IOError("Service unavailable!")
    elif "<title>Bad Gateway!</title>" in data:
        raise IOError("Bad Gateway!")
    elif "<title>414 Request-URI Too Large</title>" in data \
            or "<h1>Request-URI Too Large</h1>" in data:
        raise IOError("Requested URL too long (try using EPost?)")
    elif data.startswith("Error:"):
        # Pass the server's own message on to the caller.
        raise IOError(data.strip())
    elif data.startswith("The resource is temporarily unavailable"):
        raise IOError("The resource is temporarily unavailable")
    elif data.startswith("download dataset is empty"):
        raise IOError("download dataset is empty")
    elif data[:5] == "ERROR":
        raise IOError("ERROR, possibly because id not available?")

    return uhandle

# Time at which the previous request was let through; 0 means the first
# call is never delayed.
_open.previous = 0
376