Package Bio :: Package GenBank :: Module Scanner
[hide private]
[frames] | no frames]

Source Code for Module Bio.GenBank.Scanner

   1  # Copyright 2007-2010 by Peter Cock.  All rights reserved. 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license.  Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5  # 
   6  # This code is NOT intended for direct use.  It provides a basic scanner 
   7  # (for use with an event consumer such as Bio.GenBank._FeatureConsumer) 
   8  # to parse a GenBank or EMBL file (with their shared INSDC feature table). 
   9  # 
  10  # It is used by Bio.GenBank to parse GenBank files 
  11  # It is also used by Bio.SeqIO to parse GenBank and EMBL files 
  12  # 
  13  # Feature Table Documentation: 
  14  # http://www.insdc.org/files/feature_table.html 
  15  # http://www.ncbi.nlm.nih.gov/projects/collab/FT/index.html 
  16  # ftp://ftp.ncbi.nih.gov/genbank/docs/ 
  17  # 
  18  # 17-MAR-2009: added wgs, wgs_scafld for GenBank whole genome shotgun master records. 
  19  # These are GenBank files that summarize the content of a project, and provide lists of 
  20  # scaffold and contig files in the project. These will be in annotations['wgs'] and 
  21  # annotations['wgs_scafld']. These GenBank files do not have sequences. See 
  22  # http://groups.google.com/group/bionet.molbio.genbank/browse_thread/thread/51fb88bf39e7dc36 
  23  # http://is.gd/nNgk 
  24  # for more details of this format, and an example. 
  25  # Added by Ying Huang & Iddo Friedberg 
  26   
  27  import warnings 
  28  import os 
  29  from Bio.Seq import Seq 
  30  from Bio.SeqRecord import SeqRecord 
  31  from Bio.Alphabet import generic_alphabet, generic_protein 
  32   
class InsdcScanner:
    """Shared helpers for splitting a GenBank/EMBL flat file into sections.

    DDBJ, EMBL and GenBank together form the International Nucleotide
    Sequence Database Collaboration (INSDC), and all three use the same
    "Feature Table" layout in their plain text flat files.

    The header and sequence sections of an EMBL file are however laid out
    very differently from those produced by GenBank/DDBJ, so the layout
    constants below are placeholders which the format specific sub classes
    replace with sensible values.
    """

    # Prefix identifying the first line of a record
    # ("LOCUS" or "ID", space padded to HEADER_WIDTH characters):
    RECORD_START = "XXX"
    # Width of the line-type column in the header (12 or 5):
    HEADER_WIDTH = 3
    # Lines (right stripped) which open / close the feature table:
    FEATURE_START_MARKERS = ["XXX***FEATURES***XXX"]
    FEATURE_END_MARKERS = ["XXX***END FEATURES***XXX"]
    # Column at which feature locations/qualifiers start, and the prefix
    # of that width carried by continuation lines:
    FEATURE_QUALIFIER_INDENT = 0
    FEATURE_QUALIFIER_SPACER = ""
    # Line types which start the sequence section,
    # with right hand side spaces removed:
    SEQUENCE_HEADERS = ["XXX"]
def __init__(self, debug=0):
    """Check the class layout constants are consistent, then reset state.

    debug - stored as self.debug; the scanning methods print progress
            messages when it is non-zero.
    """
    # The record start prefix must fill the header column exactly.
    assert self.HEADER_WIDTH == len(self.RECORD_START)
    # Sequence headers are compared against right-stripped text.
    assert all(header == header.rstrip() for header in self.SEQUENCE_HEADERS)
    # The qualifier spacer must be exactly as wide as the qualifier indent.
    assert self.FEATURE_QUALIFIER_INDENT == len(self.FEATURE_QUALIFIER_SPACER)
    self.line = None
    self.debug = debug
59
def set_handle(self, handle):
    """Attach the handle to read from and clear the buffered line."""
    self.handle, self.line = handle, ""
63
def find_start(self):
    """Read lines until the ID/LOCUS line is found, and return that line.

    Any preamble (such as the header used by the NCBI on *.seq.gz
    archives), blank lines and "//" end of record markers before the
    record start are ignored.

    The record start line is also stored (unstripped) as self.line.
    Returns None if the end of the file is reached first.
    """
    verbose = self.debug > 1
    while True:
        # A line buffered in self.line takes priority over the handle.
        if self.line:
            candidate, self.line = self.line, ""
        else:
            candidate = self.handle.readline()

        if not candidate:
            if self.debug:
                print("End of file")
            return None

        if candidate[:self.HEADER_WIDTH] == self.RECORD_START:
            if verbose:
                print("Found the start of a record:\n" + candidate)
            self.line = candidate
            return candidate

        # Anything else is skipped; only the debug message differs.
        if verbose:
            skipped = candidate.rstrip()
            if skipped == "//":
                print("Skipping // marking end of last record")
            elif skipped == "":
                print("Skipping blank line before record")
            else:
                print("Skipping header line before record:\n" + skipped)
92
def parse_header(self):
    """Return the header as a list of strings (trailing whitespace removed).

    Assumes the ID/LOCUS line has just been read into self.line.

    Reading stops at the line which opens the feature table or the
    sequence section; that line (right stripped) is left in self.line and
    is not part of the returned list.

    Raises ValueError if the file ends, or a "//" end of record marker is
    found, before either of those.
    """
    assert self.line[:self.HEADER_WIDTH] == self.RECORD_START, \
        "Not at start of record"

    collected = []
    while True:
        raw = self.handle.readline()
        if not raw:
            raise ValueError("Premature end of line during sequence data")
        current = raw.rstrip()

        at_features = current in self.FEATURE_START_MARKERS
        if at_features:
            if self.debug:
                print("Found header table")
            break

        line_type = current[:self.HEADER_WIDTH].rstrip()
        if line_type in self.SEQUENCE_HEADERS:
            if self.debug:
                print("Found start of sequence")
            break

        if current == "//":
            raise ValueError("Premature end of sequence data marker '//' found")
        collected.append(current)

    self.line = current
    return collected
123
def parse_features(self, skip=False):
    """Return list of tuples for the features (if present)

    Each feature is returned as a tuple (key, location, qualifiers)
    where key and location are strings (e.g. "CDS" and
    "complement(join(490883..490885,1..879))") while qualifiers
    is a list of two string tuples (feature qualifier keys and values).
    The tuples are built by self.parse_feature().

    If skip is true the feature lines are read but not parsed, and an
    empty list is returned.

    Assumes you have already read to the start of the features table,
    i.e. self.line holds the feature table header. If it does not, an
    empty list is returned and nothing is read. On return self.line holds
    the line which ended the table.

    Raises ValueError if the file ends (or a "//" marker is found) inside
    the table, or if a feature line has no key or no location.
    """
    if self.line.rstrip() not in self.FEATURE_START_MARKERS:
        if self.debug:
            print("Didn't find any feature table")
        return []

    # Step over the table header, which may span several marker lines.
    while self.line.rstrip() in self.FEATURE_START_MARKERS:
        self.line = self.handle.readline()

    indent = self.FEATURE_QUALIFIER_INDENT
    spacer = self.FEATURE_QUALIFIER_SPACER

    features = []
    line = self.line
    while True:
        if not line:
            raise ValueError("Premature end of line during features table")
        if line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS:
            if self.debug:
                print("Found start of sequence")
            break
        line = line.rstrip()
        if line == "//":
            raise ValueError("Premature end of features table, marker '//' found")
        if line in self.FEATURE_END_MARKERS:
            if self.debug:
                print("Found end of features")
            line = self.handle.readline()
            break
        if line[2:indent].strip() == "":
            raise ValueError("Expected a feature qualifier in line '%s'" % line)

        if skip:
            # Discard this feature: its key line and all continuation lines.
            line = self.handle.readline()
            while line and line[:indent] == spacer:
                line = self.handle.readline()
            continue

        # Build up a list of the lines making up this feature.
        if len(line) <= indent:
            # The (right stripped) line stops before the location column;
            # report it like other malformed input, not as an IndexError.
            raise ValueError("Expected a feature location in line '%s'" % line)
        if line[indent] != " " and " " in line[indent:]:
            # The feature table design enforces a length limit on the
            # feature keys. Some third party files (e.g. IGMT's EMBL like
            # files) solve this by over indenting the location and
            # qualifiers.
            feature_key, line = line[2:].strip().split(None, 1)
            feature_lines = [line]
            warnings.warn("Overindented %s feature?" % feature_key)
        else:
            feature_key = line[2:indent].strip()
            feature_lines = [line[indent:]]

        line = self.handle.readline()
        # Continuation lines start with the spacer; blank lines in the
        # midst of a feature are tolerated. The explicit truth test on
        # line is essential: at end of file readline() returns "" forever,
        # and "".rstrip() == "" would otherwise keep this loop spinning.
        while line and (line[:indent] == spacer or line.rstrip() == ""):
            # Use strip to remove any harmless trailing white space AND
            # any leading white space (e.g. out of spec files with too
            # much indentation).
            feature_lines.append(line[indent:].strip())
            line = self.handle.readline()
        features.append(self.parse_feature(feature_key, feature_lines))

    self.line = line
    return features
187
def parse_feature(self, feature_key, lines):
    """Expects a feature as a list of strings, returns a tuple (key, location, qualifiers)

    For example given this GenBank feature:

         CDS             complement(join(490883..490885,1..879))
                         /locus_tag="NEQ001"
                         /note="conserved hypothetical [Methanococcus jannaschii];
                         COG1583:Uncharacterized ACR; IPR001472:Bipartite nuclear
                         localization signal; IPR002743: Protein of unknown
                         function DUF57"
                         /codon_start=1
                         /transl_table=11
                         /product="hypothetical protein"
                         /protein_id="NP_963295.1"
                         /db_xref="GI:41614797"
                         /db_xref="GeneID:2732620"
                         /translation="MRLLLELKALNSIDKKQLSNYLIQGFIYNILKNTEYSWLHNWKK
                         ...
                         LNSMGFGFVNTKKNSAR"

    Then should give input key="CDS" and the rest of the data as a list of strings
    lines=["complement(join(490883..490885,1..879))", ..., "LNSMGFGFVNTKKNSAR"]
    where the leading spaces and trailing newlines have been removed.

    Returns tuple containing: (key as string, location string, qualifiers as list)
    as follows for this example:

    key = "CDS", string
    location = "complement(join(490883..490885,1..879))", string
    qualifiers = list of string tuples:

    [('locus_tag', '"NEQ001"'),
     ('note', '"conserved hypothetical [Methanococcus jannaschii];\\nCOG1583:..."'),
     ('codon_start', '1'),
     ('transl_table', '11'),
     ('product', '"hypothetical protein"'),
     ('protein_id', '"NP_963295.1"'),
     ('db_xref', '"GI:41614797"'),
     ('db_xref', '"GeneID:2732620"'),
     ('translation', '"MRLLLELKALNSIDKKQLSNYLIQGFIYNILKNTEYSWLHNWKK\\nEKYFNFT..."')]

    In the above example, the "note" and "translation" were edited for compactness,
    and they would contain multiple new line characters (displayed above as \\n)

    If a qualifier is quoted (in this case, everything except codon_start and
    transl_table) then the quotes are NOT removed.

    A qualifier with no "=" (e.g. /pseudo) is returned with value None, and
    a qualifier with nothing after the "=" is returned with value "".

    Note that no whitespace is removed.

    Raises ValueError if the lines run out part way through the location or
    a quoted value, or if a continuation line follows a qualifier which has
    no value.
    """
    def problem():
        return ValueError("Problem with '%s' feature:\n%s"
                          % (feature_key, "\n".join(lines)))

    # Skip any blank lines
    iterator = iter([entry for entry in lines if entry])
    try:
        feature_location = next(iterator).strip()
        while feature_location[-1:] == ",":
            # Multiline location, still more to come!
            feature_location += next(iterator).strip()

        qualifiers = []
        for line in iterator:
            if line[0] == "/":
                # New qualifier
                i = line.find("=")
                if i == -1:
                    # Qualifier with no value, e.g. /pseudo
                    key = line[1:]
                    qualifiers.append((key, None))
                    continue
                key = line[1:i]
                value = line[i + 1:]
                if not value:
                    # "/key=" with nothing after it: keep the empty string
                    # rather than failing on value[0] below.
                    qualifiers.append((key, value))
                elif value[0] == '"':
                    # Quoted... DO NOT remove the quotes.
                    if value == '"':
                        # A lone opening quote is stored as is; any text
                        # that follows is appended by the continuation
                        # branch below.
                        if self.debug:
                            print("Quoted line %s:%s" % (key, value))
                    else:
                        # Keep reading until a line ends with the closing
                        # quote (collect then join, to avoid repeated
                        # string concatenation).
                        pieces = [value]
                        while pieces[-1][-1] != '"':
                            pieces.append(next(iterator))
                        value = "\n".join(pieces)
                    qualifiers.append((key, value))
                else:
                    # Unquoted
                    qualifiers.append((key, value))
            else:
                # Unquoted continuation
                assert len(qualifiers) > 0
                assert key == qualifiers[-1][0]
                if qualifiers[-1][1] is None:
                    # Nothing to continue: the previous qualifier had no
                    # value (previously this failed with a TypeError).
                    raise problem()
                qualifiers[-1] = (key, qualifiers[-1][1] + "\n" + line)
        return (feature_key, feature_location, qualifiers)
    except StopIteration:
        # Ran out of lines in the middle of a location or quoted value.
        raise problem()
290 311
def _feed_first_line(self, consumer, line):
    """Handle the LOCUS/ID line, passing data to the consumer.

    Does nothing here; this should be implemented by the EMBL / GenBank
    specific subclass.

    Used by the parse_records() and parse() methods.
    """
    return None
320
def _feed_header_lines(self, consumer, lines):
    """Handle the header lines (list of strings), passing data to the consumer.

    Does nothing here; this should be implemented by the EMBL / GenBank
    specific subclass.

    Used by the parse_records() and parse() methods.
    """
    return None
329 330
def _feed_feature_table(self, consumer, feature_tuples):
    """Handle the feature table (list of tuples), passing data to the consumer.

    Each tuple is (key, location string, qualifiers), with qualifiers a
    list of (name, value) pairs. The qualifier name is passed on wrapped
    in a one element list; a value of None (a qualifier without a value)
    is not passed on, and new lines within other values are turned into
    spaces.

    Used by the parse_records() and parse() methods.
    """
    consumer.start_feature_table()
    for key, location, qualifier_pairs in feature_tuples:
        consumer.feature_key(key)
        consumer.location(location)
        for name, value in qualifier_pairs:
            consumer.feature_qualifier_name([name])
            if value is None:
                continue
            single_line = " ".join(value.split("\n"))
            consumer.feature_qualifier_description(single_line)
344
def _feed_misc_lines(self, consumer, lines):
    """Handle any lines between features and sequence (list of strings), passing data to the consumer.

    Does nothing here; this should be implemented by the EMBL / GenBank
    specific subclass.

    Used by the parse_records() and parse() methods.
    """
    return None
353
def feed(self, handle, consumer, do_features=True):
    """Feed a set of data into the consumer.

    This method is intended for use with the "old" code in Bio.GenBank

    Arguments:
    handle - A handle with the information to parse.
    consumer - The consumer that should be informed of events.
    do_features - Boolean, should the features be parsed?
                  Skipping the features can be much faster.

    Return values:
    true  - Passed a record
    false - Did not find a record (consumer.data is set to None)
    """
    # Should work with both EMBL and GenBank files provided the
    # equivalent Bio.GenBank._FeatureConsumer methods are called...
    self.set_handle(handle)
    if not self.find_start():
        # Could not find (another) record
        consumer.data = None
        return False

    # The methods of this class cut the record into simple pieces; the
    # first line, the header lines and any misc lines after the features
    # are interpreted by the GenBank / EMBL specific derived classes.

    # First line and header:
    self._feed_first_line(consumer, self.line)
    header_lines = self.parse_header()
    self._feed_header_lines(consumer, header_lines)

    # Features (common to both EMBL and GenBank); when they are not
    # wanted the table is still read, but the data is ignored:
    feature_tuples = self.parse_features(skip=not do_features)
    if do_features:
        self._feed_feature_table(consumer, feature_tuples)

    # Footer and sequence
    misc_lines, sequence_string = self.parse_footer()
    self._feed_misc_lines(consumer, misc_lines)

    consumer.sequence(sequence_string)
    # Calls to consumer.base_number() do nothing anyway
    consumer.record_end("//")

    assert self.line == "//"

    # And we are done
    return True
403
def parse(self, handle, do_features=True):
    """Returns a SeqRecord (with SeqFeatures if do_features=True)

    The handle is fed into a fresh Bio.GenBank._FeatureConsumer; its data
    attribute is returned if feed() found a record, otherwise None.

    See also the method parse_records() for use on multi-record files.
    """
    from Bio.GenBank import _FeatureConsumer
    from Bio.GenBank.utils import FeatureValueCleaner

    cleaner = FeatureValueCleaner()
    consumer = _FeatureConsumer(use_fuzziness=1, feature_cleaner=cleaner)

    found_record = self.feed(handle, consumer, do_features)
    return consumer.data if found_record else None
419 420
def parse_records(self, handle, do_features=True):
    """Returns a SeqRecord object iterator

    Each record (from the ID/LOCUS line to the // line) becomes a SeqRecord

    The SeqRecord objects include SeqFeatures if do_features=True

    This method is intended for use in Bio.SeqIO
    """
    # This is a generator function: keep calling parse() on the same
    # handle until it reports that no further record was found.
    record = self.parse(handle, do_features)
    while record is not None:
        assert record.id is not None
        assert record.name != "<unknown name>"
        assert record.description != "<unknown description>"
        yield record
        record = self.parse(handle, do_features)
438
def parse_cds_features(self, handle,
                       alphabet=generic_protein,
                       tags2id=('protein_id','locus_tag','product')):
    """Returns SeqRecord object iterator

    Each CDS feature becomes a SeqRecord.

    alphabet - Used for any sequence found in a translation field.
    tags2id  - Tuple of three strings, the feature keys to use
               for the record id, name and description,

    The record's sequence comes from the translation qualifier, db_xref
    values go into the record's dbxrefs list, and every other qualifier
    is stored in the record's annotations (repeated qualifiers are joined
    with a space), together with the location string under 'raw_location'.

    This method is intended for use in Bio.SeqIO
    """
    self.set_handle(handle)
    while self.find_start():
        # Got an EMBL or GenBank record...
        self.parse_header()  # ignore header lines!
        feature_tuples = self.parse_features()
        # Ignore the footer and sequence: just read up to the "//" line
        # which ends the record (or to the end of the file).
        while True:
            line = self.handle.readline()
            if not line or line[:2] == "//":
                break
        self.line = line.rstrip()

        # Now go though those features...
        for key, location_string, qualifiers in feature_tuples:
            if key != "CDS":
                continue

            # SeqRecord objects cannot be created with annotations, they
            # must be added afterwards. So create an empty record and
            # then populate it:
            record = SeqRecord(seq=None)
            annotations = record.annotations

            # Should we add a location object to the annotations?
            # I *think* that only makes sense for SeqFeatures with their
            # sub features...
            annotations['raw_location'] = location_string.replace(' ', '')

            for qualifier_name, qualifier_data in qualifiers:
                # Slices (not indexing) so that an empty value is safe.
                if qualifier_data is not None \
                and qualifier_data[:1] == '"' and qualifier_data[-1:] == '"':
                    # Remove quotes
                    qualifier_data = qualifier_data[1:-1]

                if qualifier_name == "translation":
                    assert record.seq is None, "Multiple translations!"
                    record.seq = Seq(qualifier_data.replace("\n", ""), alphabet)
                elif qualifier_name == "db_xref":
                    # its a list, possibly empty.  Its safe to extend
                    record.dbxrefs.append(qualifier_data)
                else:
                    if qualifier_data is not None:
                        # Join wrapped lines, then squeeze each double
                        # space this leaves into a single space.
                        qualifier_data = qualifier_data.replace("\n", " ").replace("  ", " ")
                    existing = annotations.get(qualifier_name)
                    if existing is None:
                        # Not an addition to existing data, its the first
                        # bit (or all that came before had no value).
                        annotations[qualifier_name] = qualifier_data
                    elif qualifier_data is not None:
                        annotations[qualifier_name] = existing + " " + qualifier_data
                    # else: a repeated qualifier without a value adds
                    # nothing to the text already stored.

            # Fill in the ID, Name, Description from whichever of the
            # requested qualifiers this feature actually had.
            for attribute, tag in zip(("id", "name", "description"), tags2id):
                if tag in annotations:
                    setattr(record, attribute, annotations[tag])

            yield record
517
class EmblScanner(InsdcScanner):
    """For extracting chunks of information in EMBL files"""

    # "ID" padded with spaces to the full HEADER_WIDTH; the scanner's
    # __init__ asserts that len(RECORD_START) == HEADER_WIDTH.
    RECORD_START = "ID   "
    HEADER_WIDTH = 5
    # Feature table header, compared against right-stripped lines:
    # "Key" sits in the feature key column (5) and "Location/Qualifiers"
    # in the qualifier column (FEATURE_QUALIFIER_INDENT).
    FEATURE_START_MARKERS = ["FH   Key             Location/Qualifiers", "FH"]
    FEATURE_END_MARKERS = ["XX"]  # XX can also mark the end of many things!
    FEATURE_QUALIFIER_INDENT = 21
    # Prefix of every feature continuation line: "FT" then spaces up to
    # the qualifier column.
    FEATURE_QUALIFIER_SPACER = "FT" + " " * (FEATURE_QUALIFIER_INDENT - 2)
    SEQUENCE_HEADERS = ["SQ", "CO"]  # Remove trailing spaces
def _feed_first_line(self, consumer, line):
    """Pass the EMBL ID line to the handler for its layout.

    The layout is recognised from the number of semi colons after the
    line type: six for the style introduced in 2006, three for the older
    style. Anything else raises ValueError.
    """
    assert line[:self.HEADER_WIDTH].rstrip() == "ID"
    semi_colons = line[self.HEADER_WIDTH:].count(";")
    if semi_colons == 6:
        # Looks like the semi colon separated style introduced in 2006
        self._feed_first_line_new(consumer, line)
        return
    if semi_colons == 3:
        # Looks like the pre 2006 style
        self._feed_first_line_old(consumer, line)
        return
    raise ValueError('Did not recognise the ID line layout:\n' + line)
573
def _feed_first_line_old(self, consumer, line):
    """Handle an ID line in the style used before 2006.

    For example:

    ID   SC10H5 standard; DNA; PRO; 4870 BP.
    ID   BSUB9999   standard; circular DNA; PRO; 4214630 BP.
    """
    assert line[:self.HEADER_WIDTH].rstrip() == "ID"
    # The accession is separated from the rest by white space, and the
    # rest is semi colon separated.
    accession_and_rest = line[self.HEADER_WIDTH:].split(None, 1)
    raw_fields = [accession_and_rest[0]] + accession_and_rest[1].split(";")
    fields = [field.strip() for field in raw_fields]
    # The tokens represent:
    #    0. Primary accession number
    #    (space sep)
    #    1. ??? (e.g. standard)
    #    (semi-colon)
    #    2. Topology and/or Molecule type (e.g. 'circular DNA' or 'DNA')
    #    3. Taxonomic division (e.g. 'PRO')
    #    4. Sequence length (e.g. '4639675 BP.')
    consumer.locus(fields[0])  # Should we also call the accession consumer?
    consumer.residue_type(fields[2])
    consumer.data_file_division(fields[3])
    self._feed_seq_length(consumer, fields[4])
596
def _feed_first_line_new(self, consumer, line):
    """Handle an ID line in the style introduced in 2006.

    For example:

    ID   X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.
    ID   CD789012; SV 4; linear; genomic DNA; HTG; MAM; 500 BP.
    """
    assert line[:self.HEADER_WIDTH].rstrip() == "ID"
    fields = [data.strip() for data in line[self.HEADER_WIDTH:].strip().split(";")]
    assert len(fields) == 7
    # The tokens represent:
    #    0. Primary accession number
    #    1. Sequence version number
    #    2. Topology: 'circular' or 'linear'
    #    3. Molecule type (e.g. 'genomic DNA')
    #    4. Data class (e.g. 'STD')
    #    5. Taxonomic division (e.g. 'PRO')
    #    6. Sequence length (e.g. '4639675 BP.')
    accession, version, topology, molecule, _data_class, division, length = fields

    consumer.locus(accession)

    # Call the accession consumer now, to make sure we record
    # something as the record.id, in case there is no AC line
    consumer.accession(accession)

    # TODO - How to deal with the version field?  At the moment the consumer
    # will try and use this for the ID which isn't ideal for EMBL files.
    version_parts = version.split()
    looks_like_version = (len(version_parts) == 2
                          and version_parts[0] == "SV"
                          and version_parts[1].isdigit())
    if looks_like_version:
        consumer.version_suffix(version_parts[1])

    # Based on how the old GenBank parser worked, merge these two:
    consumer.residue_type(" ".join([topology, molecule]))  # TODO - Store as two fields?

    # TODO - What should we do with the data class?

    consumer.data_file_division(division)

    self._feed_seq_length(consumer, length)
637
def _feed_seq_length(self, consumer, text):
    """Pass the number from a length field such as '4870 BP.' to consumer.size().

    The text must be exactly two white space separated words, the second
    being BP or BP. (in any case).
    """
    words = text.split()
    assert len(words) == 2
    number, units = words
    assert units.upper() in ["BP", "BP."]
    consumer.size(number)
643
def _feed_header_lines(self, consumer, lines):
    """Pass the EMBL header lines (list of strings) to the consumer.

    Each non-empty line is split into its line type (the first
    HEADER_WIDTH characters) and its data. Some line types need their
    data reformatting for the GenBank based consumer; the rest are either
    mapped directly onto a consumer method or ignored.
    """
    width = self.HEADER_WIDTH
    # Line types whose stripped data goes unchanged to a consumer method.
    # Not listed: RN, RP, RT, RX, RA, DR and PR (handled individually
    # below), RC (reference comment... TODO) and XX (splitter).
    direct_methods = {
        'AC' : 'accession',
        'SV' : 'version', # SV line removed in June 2006, now part of ID line
        'DE' : 'definition',
        'RG' : 'consrtm', #optional consortium
        'RL' : 'journal',
        'OS' : 'organism',
        'OC' : 'taxonomy',
        'CC' : 'comment',
        }
    try:
        for line in [entry for entry in lines if entry]:
            line_type = line[:width].strip()
            data = line[width:].strip()

            if line_type == 'XX':
                continue

            if line_type == 'RN':
                # Reformat reference numbers for the GenBank based consumer
                # e.g. '[1]' becomes '1'
                if data[0] == "[" and data[-1] == "]":
                    data = data[1:-1]
                consumer.reference_num(data)
            elif line_type == 'RP':
                # Reformat reference numbers for the GenBank based consumer
                # e.g. '1-4639675' becomes '(bases 1 to 4639675)'
                # and '160-550, 904-1055' becomes '(bases 160 to 550; 904 to 1055)'
                ranges = [bases.replace("-", " to ").strip()
                          for bases in data.split(",")]
                consumer.reference_bases("(bases %s)" % "; ".join(ranges))
            elif line_type == 'RT':
                # Remove the enclosing quotes and trailing semi colon.
                # Note the title can be split over multiple lines.
                if data.startswith('"'):
                    data = data[1:]
                if data.endswith('";'):
                    data = data[:-2]
                consumer.title(data)
            elif line_type == 'RX':
                # EMBL support three reference types at the moment:
                # - PUBMED    PUBMED bibliographic database (NLM)
                # - DOI       Digital Object Identifier (International DOI Foundation)
                # - AGRICOLA  US National Agriculture Library (NAL) of the US Department
                #             of Agriculture (USDA)
                #
                # Format:
                # RX  resource_identifier; identifier.
                #
                # e.g.
                # RX   DOI; 10.1016/0024-3205(83)90010-3.
                # RX   PUBMED; 264242.
                #
                # Currently our reference object only supports PUBMED and MEDLINE
                # (as these were in GenBank files?).
                resource, identifier = data.split(";", 1)
                if identifier.endswith("."):
                    identifier = identifier[:-1]
                identifier = identifier.strip()
                if resource == "PUBMED":
                    consumer.pubmed_id(identifier)
                # TODO - Handle other reference types (here and in BioSQL bindings)
            elif line_type == 'CC':
                # Have to pass a list of strings for this one (not just a string)
                consumer.comment([data])
            elif line_type == 'DR':
                # Database Cross-reference, format:
                # DR   database_identifier; primary_identifier; secondary_identifier.
                #
                # e.g.
                # DR   MGI; 98599; Tcrb-V4.
                #
                # TODO - How should we store any secondary identifier?
                pieces = data.rstrip(".").split(";")
                # Turn it into "database_identifier:primary_identifier" to
                # mimic the GenBank parser. e.g. "MGI:98599"
                consumer.dblink("%s:%s" % (pieces[0].strip(), pieces[1].strip()))
            elif line_type == 'RA':
                # Remove trailing ; at end of authors list
                consumer.authors(data.rstrip(";"))
            elif line_type == 'PR':
                # Remove trailing ; at end of the project reference
                # In GenBank files this corresponds to the old PROJECT
                # line which is being replaced with the DBLINK line.
                consumer.project(data.rstrip(";"))
            elif line_type in direct_methods:
                # Its a semi-automatic entry!
                getattr(consumer, direct_methods[line_type])(data)
            elif self.debug:
                print("Ignoring EMBL header line:\n%s" % line)
    except StopIteration:
        raise ValueError("Problem with header")
754
def _feed_misc_lines(self, consumer, lines):
    """Pass any CO (contig) lines found between features and sequence to the consumer.

    A run of consecutive "CO   " lines is joined, with the line type and
    surrounding white space removed, and passed to
    consumer.contig_location(). A run ends at an empty line or at the end
    of the list; lines outside of a run are ignored.

    The lines argument is not modified.

    Raises ValueError if a run of CO lines is followed by any other
    non-empty line.
    """
    # TODO - Should we do something with the information on the SQ line(s)?
    contig_parts = None  # None while not inside a run of CO lines
    for line in lines:
        if contig_parts is None:
            if line.startswith("CO   "):
                contig_parts = [line[5:].strip()]
        elif not line:
            # An empty line ends the current run.
            consumer.contig_location("".join(contig_parts))
            contig_parts = None
        elif line.startswith("CO   "):
            # Don't need to preserve the whitespace here.
            contig_parts.append(line[5:].strip())
        else:
            raise ValueError('Expected CO (contig) continuation line, got:\n' + line)
    if contig_parts is not None:
        # The end of the list ends the run too; no sentinel is appended
        # to the caller's list to detect this.
        consumer.contig_location("".join(contig_parts))
777
class GenBankScanner(InsdcScanner):
    """For extracting chunks of information in GenBank files"""

    # "LOCUS" padded with spaces to the full HEADER_WIDTH; the scanner's
    # __init__ asserts that len(RECORD_START) == HEADER_WIDTH.
    RECORD_START = "LOCUS       "
    HEADER_WIDTH = 12
    # Feature table header, compared against right-stripped lines:
    # "Location/Qualifiers" sits in the qualifier column
    # (FEATURE_QUALIFIER_INDENT).
    FEATURE_START_MARKERS = ["FEATURES             Location/Qualifiers", "FEATURES"]
    FEATURE_END_MARKERS = []
    FEATURE_QUALIFIER_INDENT = 21
    # Feature continuation lines are indented to the qualifier column.
    FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT
    SEQUENCE_HEADERS = ["CONTIG", "ORIGIN", "BASE COUNT", "WGS"]  # trailing spaces removed
def _feed_first_line(self, consumer, line):
    """Handle the GenBank LOCUS line, passing data to the consumer.

    Four layouts are recognised, tried in this order:

    - the old fixed column layout (size units at columns 29:33),
    - the new fixed column layout (size units at columns 40:44),
    - a truncated line holding at most the locus name (as produced by
      some EMBOSS tools), where only the name is passed on,
    - a white space separated line whose fourth word is "aa" or "bp",
      where only the name and size are passed on (with a warning).

    Raises ValueError for any other layout. The fixed column layouts are
    validated with assert statements.
    """
    #####################################
    # LOCUS line                        #
    #####################################
    GENBANK_INDENT = self.HEADER_WIDTH
    size_units = [' bp ', ' aa ', ' rc ']

    def split_name_and_length(field):
        """Split the name/length columns into [name, length]."""
        # Squeeze runs of spaces so that a single space separates the two.
        while '  ' in field:
            field = field.replace('  ', ' ')
        parts = field.split(' ')
        assert len(parts) <= 2, \
            'Cannot parse the name and length in the LOCUS line:\n' + line
        assert len(parts) != 1, \
            'Name and length collide in the LOCUS line:\n' + line
        # Should be possible to split them based on position, if
        # a clear definition of the standard exists THAT AGREES with
        # existing files.
        return parts

    assert line[0:GENBANK_INDENT] == 'LOCUS       ', \
        'LOCUS line does not start correctly:\n' + line

    # Have to break up the locus line, and handle the different bits of it.
    # There are at least two different versions of the locus line...
    if line[29:33] in size_units:
        # Old...
        #
        #    Positions  Contents
        #    ---------  --------
        #    00:06      LOCUS
        #    06:12      spaces
        #    12:??      Locus name
        #    ??:??      space
        #    ??:29      Length of sequence, right-justified
        #    29:33      space, bp, space
        #    33:41      strand type
        #    41:42      space
        #    42:51      Blank (implies linear), linear or circular
        #    51:52      space
        #    52:55      The division code (e.g. BCT, VRL, INV)
        #    55:62      space
        #    62:73      Date, in the form dd-MMM-yyyy (e.g., 15-MAR-1991)
        #
        assert line[41:42] == ' ', \
            'LOCUS line does not contain space at position 42:\n' + line
        assert line[42:51].strip() in ['', 'linear', 'circular'], \
            'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line
        assert line[51:52] == ' ', \
            'LOCUS line does not contain space at position 52:\n' + line
        assert line[55:62] == '       ', \
            'LOCUS line does not contain spaces from position 56 to 62:\n' + line
        if line[62:73].strip():
            assert line[64:65] == '-', \
                'LOCUS line does not contain - at position 65 in date:\n' + line
            assert line[68:69] == '-', \
                'LOCUS line does not contain - at position 69 in date:\n' + line

        name_and_length = split_name_and_length(line[GENBANK_INDENT:29])
        consumer.locus(name_and_length[0])
        consumer.size(name_and_length[1])

        if line[33:51].strip() == "" and line[29:33] == ' aa ':
            # Amino acids -> protein (even if there is no residue type given)
            # We want to use a protein alphabet in this case, rather than a
            # generic one. Not sure if this is the best way to achieve this,
            # but it works because the scanner checks for this:
            consumer.residue_type("PROTEIN")
        else:
            consumer.residue_type(line[33:51].strip())

        consumer.data_file_division(line[52:55])
        if line[62:73].strip():
            consumer.date(line[62:73])
    elif line[40:44] in size_units:
        # New...
        #
        #    Positions  Contents
        #    ---------  --------
        #    00:06      LOCUS
        #    06:12      spaces
        #    12:??      Locus name
        #    ??:??      space
        #    ??:40      Length of sequence, right-justified
        #    40:44      space, bp, space
        #    44:47      Blank, ss-, ds-, ms-
        #    47:54      Blank, DNA, RNA, tRNA, mRNA, uRNA, snRNA, cDNA
        #    54:55      space
        #    55:63      Blank (implies linear), linear or circular
        #    63:64      space
        #    64:67      The division code (e.g. BCT, VRL, INV)
        #    67:68      space
        #    68:79      Date, in the form dd-MMM-yyyy (e.g., 15-MAR-1991)
        #
        assert line[44:47] in ['   ', 'ss-', 'ds-', 'ms-'], \
            'LOCUS line does not have valid strand type (Single stranded, ...):\n' + line
        assert line[47:54].strip() == "" \
            or line[47:54].strip().find('DNA') != -1 \
            or line[47:54].strip().find('RNA') != -1, \
            'LOCUS line does not contain valid sequence type (DNA, RNA, ...):\n' + line
        assert line[54:55] == ' ', \
            'LOCUS line does not contain space at position 55:\n' + line
        assert line[55:63].strip() in ['', 'linear', 'circular'], \
            'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line
        assert line[63:64] == ' ', \
            'LOCUS line does not contain space at position 64:\n' + line
        assert line[67:68] == ' ', \
            'LOCUS line does not contain space at position 68:\n' + line
        if line[68:79].strip():
            assert line[70:71] == '-', \
                'LOCUS line does not contain - at position 71 in date:\n' + line
            assert line[74:75] == '-', \
                'LOCUS line does not contain - at position 75 in date:\n' + line

        name_and_length = split_name_and_length(line[GENBANK_INDENT:40])
        consumer.locus(name_and_length[0])
        consumer.size(name_and_length[1])

        if line[44:54].strip() == "" and line[40:44] == ' aa ':
            # Amino acids -> protein (even if there is no residue type given)
            # We want to use a protein alphabet in this case, rather than a
            # generic one. Not sure if this is the best way to achieve this,
            # but it works because the scanner checks for this:
            consumer.residue_type(("PROTEIN " + line[54:63]).strip())
        else:
            consumer.residue_type(line[44:63].strip())

        consumer.data_file_division(line[64:67])
        if line[68:79].strip():
            consumer.date(line[68:79])
    elif line[GENBANK_INDENT:].strip().count(" ") == 0:
        # Truncated LOCUS line, as produced by some EMBOSS tools - see bug 1762
        #
        # e.g.
        #
        #    "LOCUS       U00096"
        #
        # rather than:
        #
        #    "LOCUS       U00096               4639675 bp    DNA     circular BCT"
        #
        #    Positions  Contents
        #    ---------  --------
        #    00:06      LOCUS
        #    06:12      spaces
        #    12:??      Locus name
        if line[GENBANK_INDENT:].strip() != "":
            consumer.locus(line[GENBANK_INDENT:].strip())
        else:
            # Must just have just "LOCUS       ", is this even legitimate?
            # We should be able to continue parsing... we need real world testcases!
            warnings.warn("Minimal LOCUS line found - is this correct?\n" + line)
    elif len(line.split()) >= 4 and line.split()[3] in ["aa", "bp"]:
        # Cope with EMBOSS seqret output where it seems the locus id can cause
        # the other fields to overflow.  We just IGNORE the other fields!
        consumer.locus(line.split()[1])
        consumer.size(line.split()[2])
        warnings.warn("Malformed LOCUS line found - is this correct?\n" + line)
    else:
        raise ValueError('Did not recognise the LOCUS line layout:\n' + line)
1004 1005
def _feed_header_lines(self, consumer, lines):
    """Send the GenBank header entries found in lines to the consumer.

    Each line starts with a keyword field self.HEADER_WIDTH characters
    wide; a line whose keyword field is blank continues the entry above.

    Arguments:
     - consumer - object whose methods (definition, accession, version,
       gi, organism, taxonomy, reference_num, reference_bases, comment,
       ...) are called with the text of the matching entries.
     - lines - list of header line strings.  Empty strings are dropped
       and the list passed in is not modified.

    Keywords without a handler are skipped (and reported when self.debug
    is set).  There is no LOCUS handling in this method.

    Raises ValueError("Problem in header") if the lines run out while an
    entry is still being read.
    """
    #Following dictionary maps GenBank lines to the associated
    #consumer methods - the special cases where one genbank line
    #triggers several consumer calls have to be handled individually.
    GENBANK_INDENT = self.HEADER_WIDTH
    GENBANK_SPACER = " "*GENBANK_INDENT
    consumer_dict = {
        'DEFINITION' : 'definition',
        'ACCESSION'  : 'accession',
        'NID'        : 'nid',
        'PID'        : 'pid',
        'DBSOURCE'   : 'db_source',
        'KEYWORDS'   : 'keywords',
        'SEGMENT'    : 'segment',
        'SOURCE'     : 'source',
        'AUTHORS'    : 'authors',
        'CONSRTM'    : 'consrtm',
        'PROJECT'    : 'project',
        'DBLINK'     : 'dblink',
        'TITLE'      : 'title',
        'JOURNAL'    : 'journal',
        'MEDLINE'    : 'medline_id',
        'PUBMED'     : 'pubmed_id',
        'REMARK'     : 'remark'}
    #The following need more than one consumer call, or special joining
    #of their continuation lines:
    #COMMENT   (comment)
    #VERSION   (version and gi)
    #REFERENCE (reference_num and reference_bases)
    #ORGANISM  (organism and taxonomy)
    #LOCUS (locus, size, residue_type, data_file_division and date) is
    #not handled by this method.
    lines = [entry for entry in lines if entry]
    lines.append("") #sentinel, never mistaken for a continuation line
    line_iter = iter(lines)

    def next_line():
        #Fetch the following line without naming the iterator's "next"
        #method (its spelling depends on the Python version).  Running
        #out of lines means an entry was cut short.
        for following in line_iter:
            return following
        raise ValueError("Problem in header")

    line = next_line()
    while line:
        line_type = line[:GENBANK_INDENT].strip()
        data = line[GENBANK_INDENT:].strip()

        if line_type == 'VERSION':
            #Need to call consumer.version(), and maybe also consumer.gi()
            #e.g.
            # VERSION     AC007323.5  GI:6587720
            while data.find('  ')!=-1:
                data = data.replace('  ',' ')
            if data.find(' GI:')==-1:
                consumer.version(data)
            else:
                parts = data.split(' GI:')
                if self.debug:
                    print("Version [" + parts[0] + "], gi [" + parts[1] + "]")
                consumer.version(parts[0])
                consumer.gi(parts[1])
            #Read in the next line!
            line = next_line()
        elif line_type == 'REFERENCE':
            if self.debug >1:
                print("Found reference [" + data + "]")
            #Need to call consumer.reference_num() and
            #consumer.reference_bases(), e.g.
            # REFERENCE   1  (bases 1 to 86436)
            #
            #Note that this can be multiline, see Bug 1968, e.g.
            #
            # REFERENCE   42 (bases 1517 to 1696; 3932 to 4112; 17880 to 17975; 21142 to
            #             28259)
            #
            #For such cases we will call the consumer once only.
            while True:
                line = next_line()
                if line[:GENBANK_INDENT] != GENBANK_SPACER:
                    #End of the reference, leave this text in "line"
                    break
                #Add this continuation to the data string
                data += " " + line[GENBANK_INDENT:]
                if self.debug >1:
                    print("Extended reference text [" + data + "]")
            #We now have all the reference line(s) stored in a string,
            #data, which we pass to the consumer
            while data.find('  ')!=-1:
                data = data.replace('  ',' ')
            if data.find(' ')==-1:
                if self.debug >2:
                    print('Reference number "' + data + '"')
                consumer.reference_num(data)
            else:
                number, bases = data.split(' ', 1)
                if self.debug >2:
                    print('Reference number "' + number + '", "' + bases + '"')
                consumer.reference_num(number)
                consumer.reference_bases(bases)
        elif line_type == 'ORGANISM':
            #Typically the first line is the organism, and subsequent lines
            #are the taxonomy lineage.  However, given longer and longer
            #species names (as more and more strains and sub strains get
            #sequenced) the organism name can now get wrapped onto multiple
            #lines.  The NCBI say we have to recognise the lineage line by
            #the presence of semi-colon delimited entries.  In the long term,
            #they are considering adding a new keyword (e.g. LINEAGE).
            #See Bug 2591 for details.
            organism_data = data
            lineage_data = ""
            while True:
                line = next_line()
                if line[0:GENBANK_INDENT] != GENBANK_SPACER:
                    #End of organism and taxonomy
                    break
                if lineage_data or ";" in line:
                    #Once the lineage has started, everything belongs to it
                    lineage_data += " " + line[GENBANK_INDENT:]
                else:
                    organism_data += " " + line[GENBANK_INDENT:].strip()
            consumer.organism(organism_data)
            if lineage_data.strip() == "" and self.debug > 1:
                print("Taxonomy line(s) missing or blank")
            consumer.taxonomy(lineage_data.strip())
        elif line_type == 'COMMENT':
            if self.debug > 1:
                print("Found comment")
            #This can be multiline, and should call consumer.comment() once
            #with a list where each entry is a line.
            comment_list = [data]
            while True:
                line = next_line()
                if line[0:GENBANK_INDENT] != GENBANK_SPACER:
                    #End of the comment
                    break
                data = line[GENBANK_INDENT:]
                comment_list.append(data)
                if self.debug > 2:
                    print("Comment continuation [" + data + "]")
            consumer.comment(comment_list)
        elif line_type in consumer_dict:
            #Its a semi-automatic entry!
            #Now, this may be a multi line entry...
            while True:
                line = next_line()
                if line[0:GENBANK_INDENT] != GENBANK_SPACER:
                    break
                data += ' ' + line[GENBANK_INDENT:]
            #We now have all the data for this entry:
            getattr(consumer, consumer_dict[line_type])(data)
        else:
            if self.debug:
                #The format string needs its %s, otherwise the % operator
                #raises TypeError instead of printing the ignored line.
                print("Ignoring GenBank header line:\n%s" % line)
            #Read in next line
            line = next_line()
1159
def _feed_misc_lines(self, consumer, lines):
    """Send the lines between the features and the sequence to the consumer.

    Recognised line types and the consumer method each one triggers:
     - BASE COUNT -> base_count (only if text follows the keyword)
     - ORIGIN     -> origin_name (only if text follows the keyword)
     - WGS        -> wgs
     - WGS_SCAFLD -> add_wgs_scafld
     - CONTIG     -> contig_location, called once with the text of the
       CONTIG line and of its continuation lines joined together
    Any other line is skipped.

    Arguments:
     - consumer - object providing the methods listed above.
     - lines - sequence of line strings; it is not modified.

    Raises ValueError if a CONTIG entry is followed by a non-empty line
    which is not a continuation line.
    """
    GENBANK_INDENT = self.HEADER_WIDTH
    GENBANK_SPACER = " "*GENBANK_INDENT
    #Iterate over a copy with an empty string added, which terminates a
    #CONTIG entry at the end of the lines.
    line_iter = iter(list(lines) + [""])
    for line in line_iter:
        #Each line is matched against one keyword only (elif), so the
        #text following a keyword is never re-read as another keyword.
        if line.startswith('BASE COUNT'):
            data = line[10:].strip()
            if data:
                if self.debug:
                    print("base_count = " + data)
                consumer.base_count(data)
        elif line.startswith("ORIGIN"):
            data = line[6:].strip()
            if data:
                if self.debug:
                    print("origin_name = " + data)
                consumer.origin_name(data)
        elif line.startswith("WGS "):
            consumer.wgs(line[3:].strip())
        elif line.startswith("WGS_SCAFLD"):
            consumer.add_wgs_scafld(line[10:].strip())
        elif line.startswith("CONTIG"):
            contig_location = line[6:].strip()
            #Consume the continuation lines from the same iterator.
            for line in line_iter:
                if not line:
                    break
                if line[:GENBANK_INDENT] != GENBANK_SPACER:
                    raise ValueError('Expected CONTIG continuation line, got:\n' + line)
                #Don't need to preserve the whitespace here.
                contig_location += line[GENBANK_INDENT:].rstrip()
            else:
                #Ran out of lines without meeting the terminating entry
                raise ValueError("Problem in misc lines before sequence")
            consumer.contig_location(contig_location)
1200 1201 if __name__ == "__main__": 1202 from StringIO import StringIO 1203 1204 gbk_example = \ 1205 """LOCUS SCU49845 5028 bp DNA PLN 21-JUN-1999 1206 DEFINITION Saccharomyces cerevisiae TCP1-beta gene, partial cds, and Axl2p 1207 (AXL2) and Rev7p (REV7) genes, complete cds. 1208 ACCESSION U49845 1209 VERSION U49845.1 GI:1293613 1210 KEYWORDS . 1211 SOURCE Saccharomyces cerevisiae (baker's yeast) 1212 ORGANISM Saccharomyces cerevisiae 1213 Eukaryota; Fungi; Ascomycota; Saccharomycotina; Saccharomycetes; 1214 Saccharomycetales; Saccharomycetaceae; Saccharomyces. 1215 REFERENCE 1 (bases 1 to 5028) 1216 AUTHORS Torpey,L.E., Gibbs,P.E., Nelson,J. and Lawrence,C.W. 1217 TITLE Cloning and sequence of REV7, a gene whose function is required for 1218 DNA damage-induced mutagenesis in Saccharomyces cerevisiae 1219 JOURNAL Yeast 10 (11), 1503-1509 (1994) 1220 PUBMED 7871890 1221 REFERENCE 2 (bases 1 to 5028) 1222 AUTHORS Roemer,T., Madden,K., Chang,J. and Snyder,M. 1223 TITLE Selection of axial growth sites in yeast requires Axl2p, a novel 1224 plasma membrane glycoprotein 1225 JOURNAL Genes Dev. 10 (7), 777-793 (1996) 1226 PUBMED 8846915 1227 REFERENCE 3 (bases 1 to 5028) 1228 AUTHORS Roemer,T. 1229 TITLE Direct Submission 1230 JOURNAL Submitted (22-FEB-1996) Terry Roemer, Biology, Yale University, New 1231 Haven, CT, USA 1232 FEATURES Location/Qualifiers 1233 source 1..5028 1234 /organism="Saccharomyces cerevisiae" 1235 /db_xref="taxon:4932" 1236 /chromosome="IX" 1237 /map="9" 1238 CDS <1..206 1239 /codon_start=3 1240 /product="TCP1-beta" 1241 /protein_id="AAA98665.1" 1242 /db_xref="GI:1293614" 1243 /translation="SSIYNGISTSGLDLNNGTIADMRQLGIVESYKLKRAVVSSASEA 1244 AEVLLRVDNIIRARPRTANRQHM" 1245 gene 687..3158 1246 /gene="AXL2" 1247 CDS 687..3158 1248 /gene="AXL2" 1249 /note="plasma membrane glycoprotein" 1250 /codon_start=1 1251 /function="required for axial budding pattern of S. 
1252 cerevisiae" 1253 /product="Axl2p" 1254 /protein_id="AAA98666.1" 1255 /db_xref="GI:1293615" 1256 /translation="MTQLQISLLLTATISLLHLVVATPYEAYPIGKQYPPVARVNESF 1257 TFQISNDTYKSSVDKTAQITYNCFDLPSWLSFDSSSRTFSGEPSSDLLSDANTTLYFN 1258 VILEGTDSADSTSLNNTYQFVVTNRPSISLSSDFNLLALLKNYGYTNGKNALKLDPNE 1259 VFNVTFDRSMFTNEESIVSYYGRSQLYNAPLPNWLFFDSGELKFTGTAPVINSAIAPE 1260 TSYSFVIIATDIEGFSAVEVEFELVIGAHQLTTSIQNSLIINVTDTGNVSYDLPLNYV 1261 YLDDDPISSDKLGSINLLDAPDWVALDNATISGSVPDELLGKNSNPANFSVSIYDTYG 1262 DVIYFNFEVVSTTDLFAISSLPNINATRGEWFSYYFLPSQFTDYVNTNVSLEFTNSSQ 1263 DHDWVKFQSSNLTLAGEVPKNFDKLSLGLKANQGSQSQELYFNIIGMDSKITHSNHSA 1264 NATSTRSSHHSTSTSSYTSSTYTAKISSTSAAATSSAPAALPAANKTSSHNKKAVAIA 1265 CGVAIPLGVILVALICFLIFWRRRRENPDDENLPHAISGPDLNNPANKPNQENATPLN 1266 NPFDDDASSYDDTSIARRLAALNTLKLDNHSATESDISSVDEKRDSLSGMNTYNDQFQ 1267 SQSKEELLAKPPVQPPESPFFDPQNRSSSVYMDSEPAVNKSWRYTGNLSPVSDIVRDS 1268 YGSQKTVDTEKLFDLEAPEKEKRTSRDVTMSSLDPWNSNISPSPVRKSVTPSPYNVTK 1269 HRNRHLQNIQDSQSGKNGITPTTMSTSSSDDFVPVKDGENFCWVHSMEPDRRPSKKRL 1270 VDFSNKSNVNVGQVKDIHGRIPEML" 1271 gene complement(3300..4037) 1272 /gene="REV7" 1273 CDS complement(3300..4037) 1274 /gene="REV7" 1275 /codon_start=1 1276 /product="Rev7p" 1277 /protein_id="AAA98667.1" 1278 /db_xref="GI:1293616" 1279 /translation="MNRWVEKWLRVYLKCYINLILFYRNVYPPQSFDYTTYQSFNLPQ 1280 FVPINRHPALIDYIEELILDVLSKLTHVYRFSICIINKKNDLCIEKYVLDFSELQHVD 1281 KDDQIITETEVFDEFRSSLNSLIMHLEKLPKVNDDTITFEAVINAIELELGHKLDRNR 1282 RVDSLEEKAEIERDSNWVKCQEDENLPDNNGFQPPKIKLTSLVGSDVGPLIIHQFSEK 1283 LISGDDKILNGVYSQYEEGESIFGSLF" 1284 ORIGIN 1285 1 gatcctccat atacaacggt atctccacct caggtttaga tctcaacaac ggaaccattg 1286 61 ccgacatgag acagttaggt atcgtcgaga gttacaagct aaaacgagca gtagtcagct 1287 121 ctgcatctga agccgctgaa gttctactaa gggtggataa catcatccgt gcaagaccaa 1288 181 gaaccgccaa tagacaacat atgtaacata tttaggatat acctcgaaaa taataaaccg 1289 241 ccacactgtc attattataa ttagaaacag aacgcaaaaa ttatccacta tataattcaa 1290 301 agacgcgaaa aaaaaagaac aacgcgtcat agaacttttg gcaattcgcg tcacaaataa 1291 361 attttggcaa 
cttatgtttc ctcttcgagc agtactcgag ccctgtctca agaatgtaat 1292 421 aatacccatc gtaggtatgg ttaaagatag catctccaca acctcaaagc tccttgccga 1293 481 gagtcgccct cctttgtcga gtaattttca cttttcatat gagaacttat tttcttattc 1294 541 tttactctca catcctgtag tgattgacac tgcaacagcc accatcacta gaagaacaga 1295 601 acaattactt aatagaaaaa ttatatcttc ctcgaaacga tttcctgctt ccaacatcta 1296 661 cgtatatcaa gaagcattca cttaccatga cacagcttca gatttcatta ttgctgacag 1297 721 ctactatatc actactccat ctagtagtgg ccacgcccta tgaggcatat cctatcggaa 1298 781 aacaataccc cccagtggca agagtcaatg aatcgtttac atttcaaatt tccaatgata 1299 841 cctataaatc gtctgtagac aagacagctc aaataacata caattgcttc gacttaccga 1300 901 gctggctttc gtttgactct agttctagaa cgttctcagg tgaaccttct tctgacttac 1301 961 tatctgatgc gaacaccacg ttgtatttca atgtaatact cgagggtacg gactctgccg 1302 1021 acagcacgtc tttgaacaat acataccaat ttgttgttac aaaccgtcca tccatctcgc 1303 1081 tatcgtcaga tttcaatcta ttggcgttgt taaaaaacta tggttatact aacggcaaaa 1304 1141 acgctctgaa actagatcct aatgaagtct tcaacgtgac ttttgaccgt tcaatgttca 1305 1201 ctaacgaaga atccattgtg tcgtattacg gacgttctca gttgtataat gcgccgttac 1306 1261 ccaattggct gttcttcgat tctggcgagt tgaagtttac tgggacggca ccggtgataa 1307 1321 actcggcgat tgctccagaa acaagctaca gttttgtcat catcgctaca gacattgaag 1308 1381 gattttctgc cgttgaggta gaattcgaat tagtcatcgg ggctcaccag ttaactacct 1309 1441 ctattcaaaa tagtttgata atcaacgtta ctgacacagg taacgtttca tatgacttac 1310 1501 ctctaaacta tgtttatctc gatgacgatc ctatttcttc tgataaattg ggttctataa 1311 1561 acttattgga tgctccagac tgggtggcat tagataatgc taccatttcc gggtctgtcc 1312 1621 cagatgaatt actcggtaag aactccaatc ctgccaattt ttctgtgtcc atttatgata 1313 1681 cttatggtga tgtgatttat ttcaacttcg aagttgtctc cacaacggat ttgtttgcca 1314 1741 ttagttctct tcccaatatt aacgctacaa ggggtgaatg gttctcctac tattttttgc 1315 1801 cttctcagtt tacagactac gtgaatacaa acgtttcatt agagtttact aattcaagcc 1316 1861 aagaccatga ctgggtgaaa ttccaatcat ctaatttaac attagctgga gaagtgccca 1317 1921 agaatttcga caagctttca ttaggtttga aagcgaacca 
aggttcacaa tctcaagagc 1318 1981 tatattttaa catcattggc atggattcaa agataactca ctcaaaccac agtgcgaatg 1319 2041 caacgtccac aagaagttct caccactcca cctcaacaag ttcttacaca tcttctactt 1320 2101 acactgcaaa aatttcttct acctccgctg ctgctacttc ttctgctcca gcagcgctgc 1321 2161 cagcagccaa taaaacttca tctcacaata aaaaagcagt agcaattgcg tgcggtgttg 1322 2221 ctatcccatt aggcgttatc ctagtagctc tcatttgctt cctaatattc tggagacgca 1323 2281 gaagggaaaa tccagacgat gaaaacttac cgcatgctat tagtggacct gatttgaata 1324 2341 atcctgcaaa taaaccaaat caagaaaacg ctacaccttt gaacaacccc tttgatgatg 1325 2401 atgcttcctc gtacgatgat acttcaatag caagaagatt ggctgctttg aacactttga 1326 2461 aattggataa ccactctgcc actgaatctg atatttccag cgtggatgaa aagagagatt 1327 2521 ctctatcagg tatgaataca tacaatgatc agttccaatc ccaaagtaaa gaagaattat 1328 2581 tagcaaaacc cccagtacag cctccagaga gcccgttctt tgacccacag aataggtctt 1329 2641 cttctgtgta tatggatagt gaaccagcag taaataaatc ctggcgatat actggcaacc 1330 2701 tgtcaccagt ctctgatatt gtcagagaca gttacggatc acaaaaaact gttgatacag 1331 2761 aaaaactttt cgatttagaa gcaccagaga aggaaaaacg tacgtcaagg gatgtcacta 1332 2821 tgtcttcact ggacccttgg aacagcaata ttagcccttc tcccgtaaga aaatcagtaa 1333 2881 caccatcacc atataacgta acgaagcatc gtaaccgcca cttacaaaat attcaagact 1334 2941 ctcaaagcgg taaaaacgga atcactccca caacaatgtc aacttcatct tctgacgatt 1335 3001 ttgttccggt taaagatggt gaaaattttt gctgggtcca tagcatggaa ccagacagaa 1336 3061 gaccaagtaa gaaaaggtta gtagattttt caaataagag taatgtcaat gttggtcaag 1337 3121 ttaaggacat tcacggacgc atcccagaaa tgctgtgatt atacgcaacg atattttgct 1338 3181 taattttatt ttcctgtttt attttttatt agtggtttac agatacccta tattttattt 1339 3241 agtttttata cttagagaca tttaatttta attccattct tcaaatttca tttttgcact 1340 3301 taaaacaaag atccaaaaat gctctcgccc tcttcatatt gagaatacac tccattcaaa 1341 3361 attttgtcgt caccgctgat taatttttca ctaaactgat gaataatcaa aggccccacg 1342 3421 tcagaaccga ctaaagaagt gagttttatt ttaggaggtt gaaaaccatt attgtctggt 1343 3481 aaattttcat cttcttgaca tttaacccag tttgaatccc tttcaatttc tgctttttcc 
1344 3541 tccaaactat cgaccctcct gtttctgtcc aacttatgtc ctagttccaa ttcgatcgca 1345 3601 ttaataactg cttcaaatgt tattgtgtca tcgttgactt taggtaattt ctccaaatgc 1346 3661 ataatcaaac tatttaagga agatcggaat tcgtcgaaca cttcagtttc cgtaatgatc 1347 3721 tgatcgtctt tatccacatg ttgtaattca ctaaaatcta aaacgtattt ttcaatgcat 1348 3781 aaatcgttct ttttattaat aatgcagatg gaaaatctgt aaacgtgcgt taatttagaa 1349 3841 agaacatcca gtataagttc ttctatatag tcaattaaag caggatgcct attaatggga 1350 3901 acgaactgcg gcaagttgaa tgactggtaa gtagtgtagt cgaatgactg aggtgggtat 1351 3961 acatttctat aaaataaaat caaattaatg tagcatttta agtataccct cagccacttc 1352 4021 tctacccatc tattcataaa gctgacgcaa cgattactat tttttttttc ttcttggatc 1353 4081 tcagtcgtcg caaaaacgta taccttcttt ttccgacctt ttttttagct ttctggaaaa 1354 4141 gtttatatta gttaaacagg gtctagtctt agtgtgaaag ctagtggttt cgattgactg 1355 4201 atattaagaa agtggaaatt aaattagtag tgtagacgta tatgcatatg tatttctcgc 1356 4261 ctgtttatgt ttctacgtac ttttgattta tagcaagggg aaaagaaata catactattt 1357 4321 tttggtaaag gtgaaagcat aatgtaaaag ctagaataaa atggacgaaa taaagagagg 1358 4381 cttagttcat cttttttcca aaaagcaccc aatgataata actaaaatga aaaggatttg 1359 4441 ccatctgtca gcaacatcag ttgtgtgagc aataataaaa tcatcacctc cgttgccttt 1360 4501 agcgcgtttg tcgtttgtat cttccgtaat tttagtctta tcaatgggaa tcataaattt 1361 4561 tccaatgaat tagcaatttc gtccaattct ttttgagctt cttcatattt gctttggaat 1362 4621 tcttcgcact tcttttccca ttcatctctt tcttcttcca aagcaacgat ccttctaccc 1363 4681 atttgctcag agttcaaatc ggcctctttc agtttatcca ttgcttcctt cagtttggct 1364 4741 tcactgtctt ctagctgttg ttctagatcc tggtttttct tggtgtagtt ctcattatta 1365 4801 gatctcaagt tattggagtc ttcagccaat tgctttgtat cagacaattg actctctaac 1366 4861 ttctccactt cactgtcgag ttgctcgttt ttagcggaca aagatttaat ctcgttttct 1367 4921 ttttcagtgt tagattgctc taattctttg agctgttctc tcagctcctc atatttttct 1368 4981 tgccatgact cagattctaa ttttaagcta ttcaatttct ctttgatc 1369 //""" 1370 1371 # GenBank format protein (aka GenPept) file from: 1372 # 
http://www.molecularevolution.org/resources/fileformats/ 1373 gbk_example2 = \ 1374 """LOCUS AAD51968 143 aa linear BCT 21-AUG-2001 1375 DEFINITION transcriptional regulator RovA [Yersinia enterocolitica]. 1376 ACCESSION AAD51968 1377 VERSION AAD51968.1 GI:5805369 1378 DBSOURCE locus AF171097 accession AF171097.1 1379 KEYWORDS . 1380 SOURCE Yersinia enterocolitica 1381 ORGANISM Yersinia enterocolitica 1382 Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacteriales; 1383 Enterobacteriaceae; Yersinia. 1384 REFERENCE 1 (residues 1 to 143) 1385 AUTHORS Revell,P.A. and Miller,V.L. 1386 TITLE A chromosomally encoded regulator is required for expression of the 1387 Yersinia enterocolitica inv gene and for virulence 1388 JOURNAL Mol. Microbiol. 35 (3), 677-685 (2000) 1389 MEDLINE 20138369 1390 PUBMED 10672189 1391 REFERENCE 2 (residues 1 to 143) 1392 AUTHORS Revell,P.A. and Miller,V.L. 1393 TITLE Direct Submission 1394 JOURNAL Submitted (22-JUL-1999) Molecular Microbiology, Washington 1395 University School of Medicine, Campus Box 8230, 660 South Euclid, 1396 St. Louis, MO 63110, USA 1397 COMMENT Method: conceptual translation. 1398 FEATURES Location/Qualifiers 1399 source 1..143 1400 /organism="Yersinia enterocolitica" 1401 /mol_type="unassigned DNA" 1402 /strain="JB580v" 1403 /serotype="O:8" 1404 /db_xref="taxon:630" 1405 Protein 1..143 1406 /product="transcriptional regulator RovA" 1407 /name="regulates inv expression" 1408 CDS 1..143 1409 /gene="rovA" 1410 /coded_by="AF171097.1:380..811" 1411 /note="regulator of virulence" 1412 /transl_table=11 1413 ORIGIN 1414 1 mestlgsdla rlvrvwrali dhrlkplelt qthwvtlhni nrlppeqsqi qlakaigieq 1415 61 pslvrtldql eekglitrht candrrakri klteqsspii eqvdgvicst rkeilggisp 1416 121 deiellsgli dklerniiql qsk 1417 // 1418 """ 1419 1420 embl_example="""ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP. 1421 XX 1422 AC X56734; S46826; 1423 XX 1424 DT 12-SEP-1991 (Rel. 29, Created) 1425 DT 25-NOV-2005 (Rel. 
85, Last updated, Version 11) 1426 XX 1427 DE Trifolium repens mRNA for non-cyanogenic beta-glucosidase 1428 XX 1429 KW beta-glucosidase. 1430 XX 1431 OS Trifolium repens (white clover) 1432 OC Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta; 1433 OC Spermatophyta; Magnoliophyta; eudicotyledons; core eudicotyledons; rosids; 1434 OC eurosids I; Fabales; Fabaceae; Papilionoideae; Trifolieae; Trifolium. 1435 XX 1436 RN [5] 1437 RP 1-1859 1438 RX PUBMED; 1907511. 1439 RA Oxtoby E., Dunn M.A., Pancoro A., Hughes M.A.; 1440 RT "Nucleotide and derived amino acid sequence of the cyanogenic 1441 RT beta-glucosidase (linamarase) from white clover (Trifolium repens L.)"; 1442 RL Plant Mol. Biol. 17(2):209-219(1991). 1443 XX 1444 RN [6] 1445 RP 1-1859 1446 RA Hughes M.A.; 1447 RT ; 1448 RL Submitted (19-NOV-1990) to the EMBL/GenBank/DDBJ databases. 1449 RL Hughes M.A., University of Newcastle Upon Tyne, Medical School, Newcastle 1450 RL Upon Tyne, NE2 4HH, UK 1451 XX 1452 FH Key Location/Qualifiers 1453 FH 1454 FT source 1..1859 1455 FT /organism="Trifolium repens" 1456 FT /mol_type="mRNA" 1457 FT /clone_lib="lambda gt10" 1458 FT /clone="TRE361" 1459 FT /tissue_type="leaves" 1460 FT /db_xref="taxon:3899" 1461 FT CDS 14..1495 1462 FT /product="beta-glucosidase" 1463 FT /EC_number="3.2.1.21" 1464 FT /note="non-cyanogenic" 1465 FT /db_xref="GOA:P26204" 1466 FT /db_xref="InterPro:IPR001360" 1467 FT /db_xref="InterPro:IPR013781" 1468 FT /db_xref="UniProtKB/Swiss-Prot:P26204" 1469 FT /protein_id="CAA40058.1" 1470 FT /translation="MDFIVAIFALFVISSFTITSTNAVEASTLLDIGNLSRSSFPRGFI 1471 FT FGAGSSAYQFEGAVNEGGRGPSIWDTFTHKYPEKIRDGSNADITVDQYHRYKEDVGIMK 1472 FT DQNMDSYRFSISWPRILPKGKLSGGINHEGIKYYNNLINELLANGIQPFVTLFHWDLPQ 1473 FT VLEDEYGGFLNSGVINDFRDYTDLCFKEFGDRVRYWSTLNEPWVFSNSGYALGTNAPGR 1474 FT CSASNVAKPGDSGTGPYIVTHNQILAHAEAVHVYKTKYQAYQKGKIGITLVSNWLMPLD 1475 FT DNSIPDIKAAERSLDFQFGLFMEQLTTGDYSKSMRRIVKNRLPKFSKFESSLVNGSFDF 1476 FT 
IGINYYSSSYISNAPSHGNAKPSYSTNPMTNISFEKHGIPLGPRAASIWIYVYPYMFIQ 1477 FT EDFEIFCYILKINITILQFSITENGMNEFNDATLPVEEALLNTYRIDYYYRHLYYIRSA 1478 FT IRAGSNVKGFYAWSFLDCNEWFAGFTVRFGLNFVD" 1479 FT mRNA 1..1859 1480 FT /experiment="experimental evidence, no additional details 1481 FT recorded" 1482 XX 1483 SQ Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other; 1484 aaacaaacca aatatggatt ttattgtagc catatttgct ctgtttgtta ttagctcatt 60 1485 cacaattact tccacaaatg cagttgaagc ttctactctt cttgacatag gtaacctgag 120 1486 tcggagcagt tttcctcgtg gcttcatctt tggtgctgga tcttcagcat accaatttga 180 1487 aggtgcagta aacgaaggcg gtagaggacc aagtatttgg gataccttca cccataaata 240 1488 tccagaaaaa ataagggatg gaagcaatgc agacatcacg gttgaccaat atcaccgcta 300 1489 caaggaagat gttgggatta tgaaggatca aaatatggat tcgtatagat tctcaatctc 360 1490 ttggccaaga atactcccaa agggaaagtt gagcggaggc ataaatcacg aaggaatcaa 420 1491 atattacaac aaccttatca acgaactatt ggctaacggt atacaaccat ttgtaactct 480 1492 ttttcattgg gatcttcccc aagtcttaga agatgagtat ggtggtttct taaactccgg 540 1493 tgtaataaat gattttcgag actatacgga tctttgcttc aaggaatttg gagatagagt 600 1494 gaggtattgg agtactctaa atgagccatg ggtgtttagc aattctggat atgcactagg 660 1495 aacaaatgca ccaggtcgat gttcggcctc caacgtggcc aagcctggtg attctggaac 720 1496 aggaccttat atagttacac acaatcaaat tcttgctcat gcagaagctg tacatgtgta 780 1497 taagactaaa taccaggcat atcaaaaggg aaagataggc ataacgttgg tatctaactg 840 1498 gttaatgcca cttgatgata atagcatacc agatataaag gctgccgaga gatcacttga 900 1499 cttccaattt ggattgttta tggaacaatt aacaacagga gattattcta agagcatgcg 960 1500 gcgtatagtt aaaaaccgat tacctaagtt ctcaaaattc gaatcaagcc tagtgaatgg 1020 1501 ttcatttgat tttattggta taaactatta ctcttctagt tatattagca atgccccttc 1080 1502 acatggcaat gccaaaccca gttactcaac aaatcctatg accaatattt catttgaaaa 1140 1503 acatgggata cccttaggtc caagggctgc ttcaatttgg atatatgttt atccatatat 1200 1504 gtttatccaa gaggacttcg agatcttttg ttacatatta aaaataaata taacaatcct 1260 1505 gcaattttca atcactgaaa atggtatgaa tgaattcaac gatgcaacac ttccagtaga 
1320 1506 agaagctctt ttgaatactt acagaattga ttactattac cgtcacttat actacattcg 1380 1507 ttctgcaatc agggctggct caaatgtgaa gggtttttac gcatggtcat ttttggactg 1440 1508 taatgaatgg tttgcaggct ttactgttcg ttttggatta aactttgtag attagaaaga 1500 1509 tggattaaaa aggtacccta agctttctgc ccaatggtac aagaactttc tcaaaagaaa 1560 1510 ctagctagta ttattaaaag aactttgtag tagattacag tacatcgttt gaagttgagt 1620 1511 tggtgcacct aattaaataa aagaggttac tcttaacata tttttaggcc attcgttgtg 1680 1512 aagttgttag gctgttattt ctattatact atgttgtagt aataagtgca ttgttgtacc 1740 1513 agaagctatg atcataacta taggttgatc cttcatgtat cagtttgatg ttgagaatac 1800 1514 tttgaattaa aagtcttttt ttattttttt aaaaaaaaaa aaaaaaaaaa aaaaaaaaa 1859 1515 // 1516 """ 1517 1518 print "GenBank CDS Iteration" 1519 print "=====================" 1520 1521 g = GenBankScanner() 1522 for record in g.parse_cds_features(StringIO(gbk_example)): 1523 print record 1524 1525 g = GenBankScanner() 1526 for record in g.parse_cds_features(StringIO(gbk_example2), 1527 tags2id=('gene','locus_tag','product')): 1528 print record 1529 1530 g = GenBankScanner() 1531 for record in g.parse_cds_features(StringIO(gbk_example + "\n" + gbk_example2), 1532 tags2id=('gene','locus_tag','product')): 1533 print record 1534 1535 print 1536 print "GenBank Iteration" 1537 print "=================" 1538 g = GenBankScanner() 1539 for record in g.parse_records(StringIO(gbk_example),do_features=False): 1540 print record.id, record.name, record.description 1541 print record.seq 1542 1543 g = GenBankScanner() 1544 for record in g.parse_records(StringIO(gbk_example),do_features=True): 1545 print record.id, record.name, record.description 1546 print record.seq 1547 1548 g = GenBankScanner() 1549 for record in g.parse_records(StringIO(gbk_example2),do_features=False): 1550 print record.id, record.name, record.description 1551 print record.seq 1552 1553 g = GenBankScanner() 1554 for record in g.parse_records(StringIO(gbk_example2),do_features=True): 1555 print record.id, record.name, 
record.description 1556 print record.seq 1557 1558 print 1559 print "EMBL CDS Iteration" 1560 print "==================" 1561 1562 e = EmblScanner() 1563 for record in e.parse_cds_features(StringIO(embl_example)): 1564 print record 1565 1566 print 1567 print "EMBL Iteration" 1568 print "==============" 1569 e = EmblScanner() 1570 for record in e.parse_records(StringIO(embl_example),do_features=True): 1571 print record.id, record.name, record.description 1572 print record.seq 1573