1
2
3
4
5
6 """Parser for XML results returned by NCBI's Entrez Utilities. This
7 parser is used by the read() function in Bio.Entrez, and is not intended
8 to be used directly.
9 """
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38 import os.path
39 from xml.parsers import expat
40
41
42
43
45
47
49
51
53
54
55
56
68
69
72 return "Failed to parse the XML data. Please make sure that the input data are in XML format."
73
74
77
78
79 return "Failed to parse the XML data. Please make sure that the input data are in XML format, and that the data are not corrupted."
80
81
83
def __init__(self, dtd_dir):
    """Initialise the parsing state and hook the callbacks into expat.

    dtd_dir is the directory that externalEntityRefHandler searches for
    local copies of the DTD files referenced by the XML data.
    """
    # Create the expat parser first. A namespace separator is passed and
    # parameter-entity parsing is always on.
    # NOTE(review): per the expat documentation these two settings enable
    # namespace processing and loading of the external DTD subset.
    self.parser = expat.ParserCreate(namespace_separator=" ")
    parser = self.parser
    parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS)

    # Route the expat events to the methods of this object.
    parser.XmlDeclHandler = self.xmlDeclHandler
    parser.StartElementHandler = self.startElementHandler
    parser.EndElementHandler = self.endElementHandler
    parser.CharacterDataHandler = self.characterDataHandler
    parser.ExternalEntityRefHandler = self.externalEntityRefHandler
    parser.StartNamespaceDeclHandler = self.startNamespaceDeclHandler

    # Where the local DTD files live.
    self.dtd_dir = dtd_dir
    # The callbacks raise NotXMLError as soon as this flag is false.
    self.valid = True

    # Elements that have been opened; startElementHandler pushes onto it.
    self.stack = []

    # Element names grouped by kind. elementDecl fills errors, strings,
    # lists, dictionaries, structures and items from the DTD.
    # (integers is not filled by the code visible here.)
    self.errors = []
    self.integers = []
    self.strings = []
    self.lists = []
    self.dictionaries = []
    self.structures = {}
    self.items = []
104
def read(self, handle):
    """Parse the complete XML document in handle and return the result.

    The whole handle is fed to the expat parser in one go; the callbacks
    build the record, which is returned as self.object.

    Raises CorruptedXMLError if expat fails while self.valid is true, and
    NotXMLError if expat fails while self.valid is false.
    """
    parser = self.parser
    try:
        parser.ParseFile(handle)
    except expat.ExpatError:
        # Translate the low-level expat failure into one of the two
        # parser-specific errors, depending on the self.valid flag.
        if not self.valid:
            raise NotXMLError
        raise CorruptedXMLError
    return self.object
120
def parse(self, handle):
    """Parse the XML in handle incrementally, yielding one record at a time.

    The data are read in blocks of 1024 characters and fed to expat. The
    outermost element on the stack must be a list; each time that list
    holds more than one entry, the entries before the last one are
    finished and are yielded (the last one may still be under
    construction). At the end of the input the parser is finalised and the
    records remaining in self.object are yielded.

    Raises CorruptedXMLError if the input ends while elements are still
    open, or if expat fails while self.valid is true; NotXMLError if expat
    fails while self.valid is false; ValueError if the outermost element
    is not a list.
    """
    BLOCK = 1024

    def feed(data, final):
        # Hand data to expat, translating expat failures into the
        # parser-specific errors based on the self.valid flag.
        try:
            self.parser.Parse(data, final)
        except expat.ExpatError:
            if self.valid:
                raise CorruptedXMLError
            raise NotXMLError

    while True:
        text = handle.read(BLOCK)
        if not text:
            # End of the input.
            if self.stack:
                # Some elements were never closed.
                raise CorruptedXMLError
            # Finalise the parser BEFORE touching self.object: for empty
            # or incomplete input this reports a proper parser error
            # instead of failing with AttributeError on self.object or
            # leaking a raw expat error.
            feed("", True)
            self.parser = None
            for record in self.object:
                yield record
            return

        feed(text, False)

        if not self.stack:
            # Nothing is open at the moment; read more data.
            continue

        records = self.stack[0]
        if not isinstance(records, list):
            raise ValueError("The XML file does not represent a list. Please use Entrez.read instead of Entrez.parse")
        # Every entry except the last one is complete.
        while len(records) > 1:
            record = records.pop(0)
            yield record
159
161
def xmlDeclHandler(self, version, encoding, standalone):
    """Expat callback for the XML declaration.

    The arguments supplied by expat are not used; the handler only sets
    self.valid to True. The other callbacks raise NotXMLError while
    self.valid is false.
    """
    self.valid = True
163
def startNamespaceDeclHandler(self, prefix, uri):
    """Expat callback for a namespace declaration.

    XML namespaces are not supported by this parser, so the handler
    always raises NotImplementedError.
    """
    raise NotImplementedError("The Bio.Entrez parser cannot handle XML data that make use of XML namespaces")
166
def startElementHandler(self, name, attrs):
    """Expat callback for an opening tag: create the matching container.

    The element name is looked up in the categories collected from the
    DTD (lists, dictionaries, structures, items, strings/errors/integers):

    - list, dictionary and structure elements get a new ListElement,
      DictionaryElement or StructureElement;
    - "Item" elements are typed by their "Type" attribute and renamed to
      their "Name" attribute (both attributes are removed from attrs);
    - string, error and integer elements only record their attributes in
      self.attributes; nothing is pushed on the stack for them;
    - any other element is represented by an empty string placeholder.

    The new object is attached to the element on top of the stack (by
    append, or by key if the parent has no append method) and then pushed
    on the stack itself.

    Raises NotXMLError if self.valid is false.
    """
    if not self.valid:
        raise NotXMLError
    # Start collecting character data for this element from scratch.
    self.content = ""
    if name in self.lists:
        element = ListElement()
    elif name in self.dictionaries:
        element = DictionaryElement()
    elif name in self.structures:
        element = StructureElement(self.structures[name])
    elif name in self.items:
        # From here on the item is known by its Name attribute.
        name = str(attrs["Name"])
        del attrs["Name"]
        itemtype = str(attrs["Type"])
        del attrs["Type"]
        if itemtype == "Structure":
            element = DictionaryElement()
        elif name in ("ArticleIds", "History"):
            element = StructureElement(["pubmed", "medline"])
        elif itemtype == "List":
            element = ListElement()
        else:
            element = StringElement()
        element.itemname = name
        element.itemtype = itemtype
    elif (name in self.strings
          or name in self.errors
          or name in self.integers):
        # Leaf element: only the attributes are remembered here.
        self.attributes = attrs
        return
    else:
        # Element not declared in any known category: placeholder.
        element = ""
    # The comparison is by value on purpose (unchanged behaviour).
    # NOTE(review): an element object that compares equal to "" also
    # skips the tag/attributes assignment - confirm this is intended.
    if element != "":
        element.tag = name
        if attrs:
            element.attributes = dict(attrs)
    if self.stack:
        current = self.stack[-1]
        try:
            current.append(element)
        except AttributeError:
            # The parent is not list-like; store the child by name.
            current[name] = element
    self.stack.append(element)
209
252
def characterDataHandler(self, content):
    """Expat callback for character data inside an element.

    The text is appended to self.content (expat may deliver the text of
    one element in several pieces). Raises NotXMLError if self.valid is
    false.
    """
    if self.valid:
        self.content += content
        return
    raise NotXMLError
257
def elementDecl(self, name, model):
    """Classify an element declared in a DTD.

    This callback function is called for each element declaration:
    <!ELEMENT       name          (...)>
    encountered in a DTD. The purpose of this function is to determine
    whether this element should be regarded as a string, integer, list,
    dictionary, structure, or error. The element name is stored in the
    matching collection (self.errors, self.items, self.strings,
    self.lists, self.dictionaries or self.structures).

    model is the content model tuple supplied by expat:
    (type, quantifier, name, children).

    Raises NotXMLError if self.valid is false.
    """
    if not self.valid:
        raise NotXMLError
    if name.upper() == "ERROR":
        self.errors.append(name)
        return
    # The generic <!ELEMENT Item (#PCDATA|Item)*> declaration: the actual
    # kind of each Item is decided per occurrence in startElementHandler.
    item_model = (expat.model.XML_CTYPE_MIXED,
                  expat.model.XML_CQUANT_REP,
                  None,
                  ((expat.model.XML_CTYPE_NAME,
                    expat.model.XML_CQUANT_NONE,
                    'Item',
                    ()),))
    if name == 'Item' and model == item_model:
        self.items.append(name)
        return

    # Strip wrapper groups: a sequence/choice that occurs at most once
    # and has a single child is equivalent to that child.
    while (model[0] in (expat.model.XML_CTYPE_SEQ,
                        expat.model.XML_CTYPE_CHOICE)
           and model[1] in (expat.model.XML_CQUANT_NONE,
                            expat.model.XML_CQUANT_OPT)
           and len(model[3]) == 1):
        model = model[3][0]

    # Mixed or empty content: treat as a string.
    if model[0] in (expat.model.XML_CTYPE_MIXED,
                    expat.model.XML_CTYPE_EMPTY):
        self.strings.append(name)
        return

    # A repeated sequence/choice: treat as a list.
    if (model[0] in (expat.model.XML_CTYPE_CHOICE,
                     expat.model.XML_CTYPE_SEQ)
            and model[1] in (expat.model.XML_CQUANT_PLUS,
                             expat.model.XML_CQUANT_REP)):
        self.lists.append(name)
        return

    # Otherwise sort the children into those that can occur at most once
    # (single) and those that can occur repeatedly (multiple).
    single = []
    multiple = []

    def count(model):
        # Walk one content-model node, filling single and multiple.
        quantifier, child_name, children = model[1:]
        if child_name is None:
            # A group: either all its children repeat, or each child is
            # examined on its own.
            if quantifier in (expat.model.XML_CQUANT_PLUS,
                              expat.model.XML_CQUANT_REP):
                for child in children:
                    multiple.append(child[2])
            else:
                for child in children:
                    count(child)
        elif child_name.upper() != "ERROR":
            # A named child; error elements are not counted.
            if quantifier in (expat.model.XML_CQUANT_NONE,
                              expat.model.XML_CQUANT_OPT):
                single.append(child_name)
            elif quantifier in (expat.model.XML_CQUANT_PLUS,
                                expat.model.XML_CQUANT_REP):
                multiple.append(child_name)

    count(model)
    if not single and len(multiple) == 1:
        # Exactly one repeating child and nothing else: a list.
        self.lists.append(name)
    elif not multiple:
        # Only children that occur at most once: a dictionary.
        self.dictionaries.append(name)
    else:
        # A mixture: remember which children can repeat.
        self.structures[name] = multiple
337
def externalEntityRefHandler(self, context, base, systemId, publicId):
    """Load a DTD from the local DTD directory and parse it.

    The purpose of this function is to load the DTD locally, instead
    of downloading it from the URL specified in the XML. The file name
    is taken from the last component of systemId and looked up in
    self.dtd_dir. If new DTDs appear, putting them in that directory will
    allow the parser to see them.

    The DTD is parsed with an external entity parser created from
    self.parser, with self.elementDecl installed as the element
    declaration handler.

    Returns 1 (which tells expat to continue parsing).
    Raises NotXMLError if self.valid is false, and RuntimeError with
    installation instructions if the DTD file cannot be opened.
    """
    if not self.valid:
        raise NotXMLError
    filename = os.path.basename(systemId)
    path = os.path.join(self.dtd_dir, filename)
    try:
        # Binary mode: expat's ParseFile expects bytes from read().
        handle = open(path, "rb")
    except IOError:
        message = """\
Unable to load DTD file %s.

Bio.Entrez uses NCBI's DTD files to parse XML files returned by NCBI Entrez.
Though most of NCBI's DTD files are included in the Biopython distribution,
sometimes you may find that a particular DTD file is missing. In such a
case, you can download the DTD file from NCBI and install it manually.

Usually, you can find missing DTD files at either
http://www.ncbi.nlm.nih.gov/dtd/
or
http://eutils.ncbi.nlm.nih.gov/entrez/query/DTD/
If you cannot find %s there, you may also try to search
for it with a search engine such as Google.

Please save %s in the directory
%s
in order for Bio.Entrez to find it.
Alternatively, you can save %s in the directory
Bio/Entrez/DTDs in the Biopython distribution, and reinstall Biopython.

Please also inform the Biopython developers by sending an email to
biopython-dev@biopython.org to inform us about this missing DTD, so that we
can include it with the next release of Biopython.
""" % (filename, filename, filename, self.dtd_dir, filename)
        raise RuntimeError(message)

    # Make sure the DTD file is closed again, also if parsing it fails.
    try:
        parser = self.parser.ExternalEntityParserCreate(context)
        parser.ElementDeclHandler = self.elementDecl
        parser.ParseFile(handle)
    finally:
        handle.close()
    return 1
383