Package Bio :: Package Entrez :: Module Parser
[hide private]
[frames] | no frames]

Source Code for Module Bio.Entrez.Parser

  1  # Copyright 2008 by Michiel de Hoon.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Parser for XML results returned by NCBI's Entrez Utilities. This 
  7  parser is used by the read() function in Bio.Entrez, and is not intended 
  8  to be used directly. 
  9  """ 
 10   
 11  # The question is how to represent an XML file as Python objects. Some 
 12  # XML files returned by NCBI look like lists, others look like dictionaries, 
 13  # and others look like a mix of lists and dictionaries. 
 14  # 
 15  # My approach is to classify each possible element in the XML as a plain 
 16  # string, an integer, a list, a dictionary, or a structure. The latter is a 
 17  # dictionary where the same key can occur multiple times; in Python, it is 
 18  # represented as a dictionary where that key occurs once, pointing to a list 
 19  # of values found in the XML file. 
 20  # 
 21  # The parser then goes through the XML and creates the appropriate Python 
 22  # object for each element. The different levels encountered in the XML are 
 23  # preserved on the Python side. So a subelement of a subelement of an element 
 24  # is a value in a dictionary that is stored in a list which is a value in 
 25  # some other dictionary (or a value in a list which itself belongs to a list 
 26  # which is a value in a dictionary, and so on). Attributes encountered in  
 27  # the XML are stored as a dictionary in a member .attributes of each element, 
 28  # and the tag name is saved in a member .tag. 
 29  # 
 30  # To decide which kind of Python object corresponds to each element in the 
 31  # XML, the parser analyzes the DTD referred to at the top of (almost) every 
 32  # XML file returned by the Entrez Utilities. This is preferred over a hand- 
 33  # written solution, since the number of DTDs is rather large and their 
 34  # contents may change over time. About half the code in this parser deals 
 35  # with parsing the DTD, and the other half with the XML itself. 
 36   
 37   
 38  import os.path 
 39  from xml.parsers import expat 
 40   
 41  # The following five classes are used to add a member .attributes to integers, 
 42  # strings, Unicode strings, lists, and dictionaries, respectively. 
 43   
class IntegerElement(int):
    """An int subclass whose instances accept extra members.

    Unlike a plain int, an instance can carry the .tag and .attributes
    members that the parser stores on each element.
    """
45
class StringElement(str):
    """A str subclass whose instances accept extra members.

    Unlike a plain str, an instance can carry the .tag and .attributes
    members that the parser stores on each element.
    """
47
# Python 2 has a separate ``unicode`` type; on Python 3 that name does not
# exist and ``str`` is already the Unicode text type.
try:
    _text_type = unicode
except NameError:
    _text_type = str


class UnicodeElement(_text_type):
    """A Unicode string subclass whose instances accept extra members.

    Unlike a plain Unicode string, an instance can carry the .tag and
    .attributes members that the parser stores on each element.
    """
49
class ListElement(list):
    """A list subclass whose instances accept extra members.

    Unlike a plain list, an instance can carry the .tag and .attributes
    members that the parser stores on each element.
    """
51
class DictionaryElement(dict):
    """A dict subclass whose instances accept extra members.

    Unlike a plain dict, an instance can carry the .tag and .attributes
    members that the parser stores on each element.
    """
# A StructureElement is like a dictionary, but some of its keys can have
# multiple values associated with them. These values are stored in a list
# under each key.
class StructureElement(dict):
    """Dictionary in which selected keys accumulate their values in a list.

    Every key named in ``keys`` starts out mapped to its own empty list;
    assigning to such a key appends to that list instead of replacing it.
    Any other key behaves like an ordinary dictionary key.
    """

    def __init__(self, keys):
        """Create the structure with an empty list under each of ``keys``."""
        # Passing the pairs to dict.__init__ bypasses our own __setitem__,
        # so each list key is bound directly to a fresh list.
        dict.__init__(self, ((name, []) for name in keys))
        self.listkeys = keys

    def __setitem__(self, key, value):
        """Append ``value`` for a list key; otherwise store it normally."""
        if key not in self.listkeys:
            dict.__setitem__(self, key, value)
            return
        self[key].append(value)
68 69
class NotXMLError(ValueError):
    """Raised when the input data do not appear to be XML at all."""

    def __str__(self):
        """Return a fixed, user-oriented description of the failure."""
        message = "Failed to parse the XML data. Please make sure that the input data are in XML format."
        return message
73 74
class CorruptedXMLError(ValueError):
    """Raised when the input cannot be parsed and is assumed to be XML."""

    def __str__(self):
        """Return a fixed, user-oriented description of the failure."""
        # This message can be changed once all XML data returned by EUtils
        # start with the XML declaration
        message = "Failed to parse the XML data. Please make sure that the input data are in XML format, and that the data are not corrupted."
        return message
80 81
class DataHandler:
    """Expat-based handler that turns Entrez XML into Python objects.

    While the DTD referenced by the XML is parsed, every declared element is
    classified (see elementDecl) as an error, string, list, dictionary,
    structure or ESummary "Item".  While the XML itself is parsed, those
    classifications decide which kind of object is created for each element.
    Partially built container objects are kept on ``self.stack``; the most
    recently completed container is available as ``self.object``.
    """

    def __init__(self, dtd_dir):
        """Create the expat parser and register this object's callbacks.

        dtd_dir -- directory in which externalEntityRefHandler looks for
                   the DTD files referenced by the XML.
        """
        self.stack = []
        self.errors = []
        self.integers = []
        self.strings = []
        self.lists = []
        self.dictionaries = []
        self.structures = {}
        self.items = []
        self.dtd_dir = dtd_dir
        self.valid = True
        # Set to False once EUtils always returns XML files starting with <?xml
        # Attributes of the string/integer/error element currently being
        # read; None while no such attributes are pending.
        self.attributes = None
        self.parser = expat.ParserCreate(namespace_separator=" ")
        self.parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS)
        self.parser.XmlDeclHandler = self.xmlDeclHandler
        self.parser.StartElementHandler = self.startElementHandler
        self.parser.EndElementHandler = self.endElementHandler
        self.parser.CharacterDataHandler = self.characterDataHandler
        self.parser.ExternalEntityRefHandler = self.externalEntityRefHandler
        self.parser.StartNamespaceDeclHandler = self.startNamespaceDeclHandler

    def _parse_error(self):
        """Return the exception describing why the input could not be parsed."""
        if self.valid:
            # We saw the initial <?xml declaration, so we can be sure that
            # we are parsing XML data. Most likely, the XML file is
            # corrupted.
            return CorruptedXMLError()
        # We have not seen the initial <?xml declaration, so probably
        # the input data is not in XML format.
        return NotXMLError()

    def _string_element(self, value):
        """Wrap text as a StringElement, or a UnicodeElement if that fails."""
        # Convert Unicode strings to plain strings if possible
        try:
            return StringElement(value)
        except UnicodeEncodeError:
            return UnicodeElement(value)

    def read(self, handle):
        """Set up the parser and let it parse the XML results.

        Parses all of ``handle`` in one go and returns the last completed
        container object.  Raises CorruptedXMLError or NotXMLError if expat
        cannot parse the data.
        """
        try:
            self.parser.ParseFile(handle)
        except expat.ExpatError:
            raise self._parse_error()
        return self.object

    def parse(self, handle):
        """Parse ``handle`` block by block, yielding finished records.

        The outermost XML element must correspond to a list; each of its
        entries is yielded once the following entry has been started, and
        the remaining entries are yielded at the end of the input.

        Raises CorruptedXMLError or NotXMLError if the data cannot be
        parsed, and ValueError if the outermost element is not a list.
        """
        BLOCK = 1024
        while True:
            # Read in another block of the file...
            text = handle.read(BLOCK)
            if not text:
                # We have reached the end of the XML file
                if self.stack:
                    # Input ended while elements were still open.
                    raise CorruptedXMLError
                try:
                    remaining = self.object
                except AttributeError:
                    # No element was ever completed (e.g. empty input);
                    # report this like any other unparsable input.
                    raise self._parse_error()
                for record in remaining:
                    yield record
                self.parser.Parse("", True)
                self.parser = None
                return

            try:
                self.parser.Parse(text, False)
            except expat.ExpatError:
                raise self._parse_error()

            if not self.stack:
                # Haven't read enough from the XML file yet
                continue

            records = self.stack[0]
            if not isinstance(records, list):
                raise ValueError("The XML file does not represent a list. Please use Entrez.read instead of Entrez.parse")
            while len(records) > 1:  # Then the top record is finished
                record = records.pop(0)
                yield record

    def xmlDeclHandler(self, version, encoding, standalone):
        """Record that an XML declaration was seen."""
        # The purpose of this method is to make sure that we are parsing XML.
        self.valid = True

    def startNamespaceDeclHandler(self, prefix, un):
        """Reject XML namespaces, which this parser does not support."""
        raise NotImplementedError("The Bio.Entrez parser cannot handle XML data that make use of XML namespaces")

    def startElementHandler(self, name, attrs):
        """Create the object for an opening tag and push it on the stack.

        String, integer and error elements create no object here; only
        their attributes are remembered until the closing tag is seen.
        """
        if not self.valid:
            raise NotXMLError
        self.content = ""
        if name in self.lists:
            element = ListElement()
        elif name in self.dictionaries:
            element = DictionaryElement()
        elif name in self.structures:
            element = StructureElement(self.structures[name])
        elif name in self.items:  # Only appears in ESummary
            name = str(attrs["Name"])  # convert from Unicode
            del attrs["Name"]
            itemtype = str(attrs["Type"])  # convert from Unicode
            del attrs["Type"]
            if itemtype == "Structure":
                element = DictionaryElement()
            elif name in ("ArticleIds", "History"):
                element = StructureElement(["pubmed", "medline"])
            elif itemtype == "List":
                element = ListElement()
            else:
                element = StringElement()
            element.itemname = name
            element.itemtype = itemtype
        elif (name in self.strings
              or name in self.errors
              or name in self.integers):
            self.attributes = attrs
            return
        else:
            # Element not found in DTD; this will not be stored in the record
            element = ""
        # An empty StringElement (placeholder for a simple Item) also compares
        # equal to "": its value is attached to the parent in
        # endElementHandler instead.
        if element != "":
            element.tag = name
            if attrs:
                element.attributes = dict(attrs)
            if self.stack:
                current = self.stack[-1]
                try:
                    current.append(element)
                except AttributeError:
                    current[name] = element
        self.stack.append(element)

    def endElementHandler(self, name):
        """Finish an element when its closing tag is seen.

        Container elements are popped from the stack and remembered as
        ``self.object``.  Text-valued elements are converted to the
        appropriate element type and stored in the enclosing container.
        A non-empty error element raises RuntimeError with its text.
        """
        if not self.valid:
            raise NotXMLError
        value = self.content
        if name in self.errors:
            if value == "":
                return
            else:
                raise RuntimeError(value)
        elif name in self.integers:
            value = IntegerElement(value)
        elif name in self.strings:
            value = self._string_element(value)
        elif name in self.items:
            self.object = self.stack.pop()
            if self.object.itemtype in ("List", "Structure"):
                return
            elif self.object.itemtype == "Integer" and value:
                value = IntegerElement(value)
            else:
                value = self._string_element(value)
            name = self.object.itemname
        else:
            self.object = self.stack.pop()
            return
        value.tag = name
        if self.attributes:
            value.attributes = dict(self.attributes)
            # Reset rather than delete, so that a later element that did not
            # set any attributes does not fail with AttributeError.
            self.attributes = None
        current = self.stack[-1]
        try:
            current.append(value)
        except AttributeError:
            current[name] = value

    def characterDataHandler(self, content):
        """Accumulate character data for the element being read."""
        if not self.valid:
            raise NotXMLError
        self.content += content

    def elementDecl(self, name, model):
        """This callback function is called for each element declaration:
        <!ELEMENT       name          (...)>
        encountered in a DTD. The purpose of this function is to determine
        whether this element should be regarded as a string, integer, list
        dictionary, structure, or error."""
        if not self.valid:
            raise NotXMLError
        if name.upper() == "ERROR":
            self.errors.append(name)
            return
        if name == 'Item' and model == (expat.model.XML_CTYPE_MIXED,
                                        expat.model.XML_CQUANT_REP,
                                        None, ((expat.model.XML_CTYPE_NAME,
                                                expat.model.XML_CQUANT_NONE,
                                                'Item',
                                                ()
                                                ),
                                               )
                                        ):
            # Special case. As far as I can tell, this only occurs in the
            # eSummary DTD.
            self.items.append(name)
            return
        # First, remove ignorable parentheses around declarations
        while (model[0] in (expat.model.XML_CTYPE_SEQ,
                            expat.model.XML_CTYPE_CHOICE)
               and model[1] in (expat.model.XML_CQUANT_NONE,
                                expat.model.XML_CQUANT_OPT)
               and len(model[3]) == 1):
            model = model[3][0]
        # PCDATA declarations correspond to strings
        if model[0] in (expat.model.XML_CTYPE_MIXED,
                        expat.model.XML_CTYPE_EMPTY):
            self.strings.append(name)
            return
        # List-type elements
        if (model[0] in (expat.model.XML_CTYPE_CHOICE,
                         expat.model.XML_CTYPE_SEQ) and
            model[1] in (expat.model.XML_CQUANT_PLUS,
                         expat.model.XML_CQUANT_REP)):
            self.lists.append(name)
            return
        # This is the tricky case. Check which keys can occur multiple
        # times. If only one key is possible, and it can occur multiple
        # times, then this is a list. If more than one key is possible,
        # but none of them can occur multiple times, then this is a
        # dictionary. Otherwise, this is a structure.
        # In 'single' and 'multiple', we keep track which keys can occur
        # only once, and which can occur multiple times.
        single = []
        multiple = []
        once = (expat.model.XML_CQUANT_NONE, expat.model.XML_CQUANT_OPT)
        repeated = (expat.model.XML_CQUANT_PLUS, expat.model.XML_CQUANT_REP)

        # The 'count' function is called recursively to make sure all the
        # children in this model are counted. Error keys are ignored;
        # they raise an exception in Python.
        def count(submodel):
            quantifier, key, children = submodel[1:]
            if key is None:
                if quantifier in repeated:
                    for child in children:
                        multiple.append(child[2])
                else:
                    for child in children:
                        count(child)
            elif key.upper() != "ERROR":
                if quantifier in once:
                    single.append(key)
                elif quantifier in repeated:
                    multiple.append(key)

        count(model)
        if not single and len(multiple) == 1:
            self.lists.append(name)
        elif not multiple:
            self.dictionaries.append(name)
        else:
            self.structures[name] = multiple

    def externalEntityRefHandler(self, context, base, systemId, publicId):
        """The purpose of this function is to load the DTD locally, instead
        of downloading it from the URL specified in the XML. Using the local
        DTD results in much faster parsing. Only the file name part of the
        system identifier is used; the file is looked up in self.dtd_dir.
        If the DTD is not found there, a RuntimeError explaining how to
        install it is raised. If new DTDs appear, putting them in
        Bio/Entrez/DTDs will allow the parser to see them."""
        if not self.valid:
            raise NotXMLError
        location, filename = os.path.split(systemId)
        path = os.path.join(self.dtd_dir, filename)
        try:
            # Binary mode: expat expects the raw bytes of the DTD.
            handle = open(path, "rb")
        except IOError:
            message = """\
Unable to load DTD file %s.

Bio.Entrez uses NCBI's DTD files to parse XML files returned by NCBI Entrez.
Though most of NCBI's DTD files are included in the Biopython distribution,
sometimes you may find that a particular DTD file is missing. In such a
case, you can download the DTD file from NCBI and install it manually.

Usually, you can find missing DTD files at either
http://www.ncbi.nlm.nih.gov/dtd/
or
http://eutils.ncbi.nlm.nih.gov/entrez/query/DTD/
If you cannot find %s there, you may also try to search
for it with a search engine such as Google.

Please save %s in the directory
%s
in order for Bio.Entrez to find it.
Alternatively, you can save %s in the directory
Bio/Entrez/DTDs in the Biopython distribution, and reinstall Biopython.

Please also inform the Biopython developers by sending an email to
biopython-dev@biopython.org to inform us about this missing DTD, so that we
can include it with the next release of Biopython.
""" % (filename, filename, filename, self.dtd_dir, filename)
            raise RuntimeError(message)

        try:
            parser = self.parser.ExternalEntityParserCreate(context)
            parser.ElementDeclHandler = self.elementDecl
            parser.ParseFile(handle)
        finally:
            # Always release the DTD file, even if parsing it fails.
            handle.close()
        return 1
383