Package Bio :: Package GenBank
[hide private]
[frames] | no frames]

Source Code for Package Bio.GenBank

   1  # Copyright 2000 by Jeffrey Chang, Brad Chapman.  All rights reserved. 
   2  # Copyright 2006-2008 by Peter Cock.  All rights reserved. 
   3  # This code is part of the Biopython distribution and governed by its 
   4  # license.  Please see the LICENSE file that should have been included 
   5  # as part of this package. 
   6   
   7  """Code to work with GenBank formatted files. 
   8   
   9  Rather than using Bio.GenBank, you are now encouraged to use Bio.SeqIO with 
  10  the "genbank" or "embl" format names to parse GenBank or EMBL files into 
  11  SeqRecord and SeqFeature objects (see the Biopython tutorial for details). 
  12   
  13  Also, rather than using Bio.GenBank to search or download files from the NCBI, 
  14  you are now encouraged to use Bio.Entrez instead (again, see the Biopython 
  15  tutorial for details). 
  16   
  17  Currently the ONLY reason to use Bio.GenBank directly is for the RecordParser 
  18  which turns a GenBank file into GenBank-specific Record objects.  This is a 
  19  much closer representation to the raw file contents that the SeqRecord 
  20  alternative from the FeatureParser (used in Bio.SeqIO). 
  21   
  22  Classes: 
  23  Iterator              Iterate through a file of GenBank entries 
  24  ErrorFeatureParser    Catch errors caused during parsing. 
  25  FeatureParser         Parse GenBank data in SeqRecord and SeqFeature objects. 
  26  RecordParser          Parse GenBank data into a Record object. 
  27   
  28  Exceptions: 
  29  ParserFailureError    Exception indicating a failure in the parser (ie. 
  30                        scanner or consumer) 
  31  LocationParserError   Exception indiciating a problem with the spark based 
  32                        location parser. 
  33   
  34   
  35  17-MAR-2009: added wgs, wgs_scafld for GenBank whole genome shotgun master records. 
  36  These are GenBank files that summarize the content of a project, and provide lists of 
  37  scaffold and contig files in the project. These will be in annotations['wgs'] and 
  38  annotations['wgs_scafld']. These GenBank files do not have sequences. See 
  39  http://groups.google.com/group/bionet.molbio.genbank/browse_thread/thread/51fb88bf39e7dc36 
  40   
  41  http://is.gd/nNgk 
  42  for more details of this format, and an example. 
  43  Added by Ying Huang & Iddo Friedberg 
  44  """ 
  45  import cStringIO 
  46   
  47  # other Biopython stuff 
  48  from Bio import SeqFeature 
  49  from Bio.ParserSupport import AbstractConsumer 
  50  from Bio import Entrez 
  51   
  52  # other Bio.GenBank stuff 
  53  import LocationParser 
  54  from utils import FeatureValueCleaner 
  55  from Scanner import GenBankScanner 
  56   
  57  #Constants used to parse GenBank header lines 
  58  GENBANK_INDENT = 12 
  59  GENBANK_SPACER = " " * GENBANK_INDENT 
  60   
  61  #Constants for parsing GenBank feature lines 
  62  FEATURE_KEY_INDENT = 5 
  63  FEATURE_QUALIFIER_INDENT = 21 
  64  FEATURE_KEY_SPACER = " " * FEATURE_KEY_INDENT 
  65  FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT 
  66   
67 -class Iterator:
68 """Iterator interface to move over a file of GenBank entries one at a time. 69 """
70 - def __init__(self, handle, parser = None):
71 """Initialize the iterator. 72 73 Arguments: 74 o handle - A handle with GenBank entries to iterate through. 75 o parser - An optional parser to pass the entries through before 76 returning them. If None, then the raw entry will be returned. 77 """ 78 self.handle = handle 79 self._parser = parser
80
81 - def next(self):
82 """Return the next GenBank record from the handle. 83 84 Will return None if we ran out of records. 85 """ 86 if self._parser is None: 87 lines = [] 88 while True: 89 line = self.handle.readline() 90 if not line : return None #Premature end of file? 91 lines.append(line) 92 if line.rstrip() == "//" : break 93 return "".join(lines) 94 try: 95 return self._parser.parse(self.handle) 96 except StopIteration: 97 return None
98
99 - def __iter__(self):
100 return iter(self.next, None)
101
102 -class ParserFailureError(Exception):
103 """Failure caused by some kind of problem in the parser. 104 """ 105 pass
106
107 -class LocationParserError(Exception):
108 """Could not Properly parse out a location from a GenBank file. 109 """ 110 pass
111
112 -class FeatureParser:
113 """Parse GenBank files into Seq + Feature objects. 114 """
115 - def __init__(self, debug_level = 0, use_fuzziness = 1, 116 feature_cleaner = FeatureValueCleaner()):
117 """Initialize a GenBank parser and Feature consumer. 118 119 Arguments: 120 o debug_level - An optional argument that species the amount of 121 debugging information the parser should spit out. By default we have 122 no debugging info (the fastest way to do things), but if you want 123 you can set this as high as two and see exactly where a parse fails. 124 o use_fuzziness - Specify whether or not to use fuzzy representations. 125 The default is 1 (use fuzziness). 126 o feature_cleaner - A class which will be used to clean out the 127 values of features. This class must implement the function 128 clean_value. GenBank.utils has a "standard" cleaner class, which 129 is used by default. 130 """ 131 self._scanner = GenBankScanner(debug_level) 132 self.use_fuzziness = use_fuzziness 133 self._cleaner = feature_cleaner
134
135 - def parse(self, handle):
136 """Parse the specified handle. 137 """ 138 self._consumer = _FeatureConsumer(self.use_fuzziness, 139 self._cleaner) 140 self._scanner.feed(handle, self._consumer) 141 return self._consumer.data
142
143 -class RecordParser:
144 """Parse GenBank files into Record objects 145 """
146 - def __init__(self, debug_level = 0):
147 """Initialize the parser. 148 149 Arguments: 150 o debug_level - An optional argument that species the amount of 151 debugging information the parser should spit out. By default we have 152 no debugging info (the fastest way to do things), but if you want 153 you can set this as high as two and see exactly where a parse fails. 154 """ 155 self._scanner = GenBankScanner(debug_level)
156
157 - def parse(self, handle):
158 """Parse the specified handle into a GenBank record. 159 """ 160 self._consumer = _RecordConsumer() 161 self._scanner.feed(handle, self._consumer) 162 return self._consumer.data
163
164 -class _BaseGenBankConsumer(AbstractConsumer):
165 """Abstract GenBank consumer providing useful general functions. 166 167 This just helps to eliminate some duplication in things that most 168 GenBank consumers want to do. 169 """ 170 # Special keys in GenBank records that we should remove spaces from 171 # For instance, \translation keys have values which are proteins and 172 # should have spaces and newlines removed from them. This class 173 # attribute gives us more control over specific formatting problems. 174 remove_space_keys = ["translation"] 175
176 - def __init__(self):
177 pass
178
179 - def _split_keywords(self, keyword_string):
180 """Split a string of keywords into a nice clean list. 181 """ 182 # process the keywords into a python list 183 if keyword_string == "" or keyword_string == ".": 184 keywords = "" 185 elif keyword_string[-1] == '.': 186 keywords = keyword_string[:-1] 187 else: 188 keywords = keyword_string 189 keyword_list = keywords.split(';') 190 clean_keyword_list = [x.strip() for x in keyword_list] 191 return clean_keyword_list
192
193 - def _split_accessions(self, accession_string):
194 """Split a string of accession numbers into a list. 195 """ 196 # first replace all line feeds with spaces 197 # Also, EMBL style accessions are split with ';' 198 accession = accession_string.replace("\n", " ").replace(";"," ") 199 200 return [x.strip() for x in accession.split() if x.strip()]
201
202 - def _split_taxonomy(self, taxonomy_string):
203 """Split a string with taxonomy info into a list. 204 """ 205 if not taxonomy_string or taxonomy_string==".": 206 #Missing data, no taxonomy 207 return [] 208 209 if taxonomy_string[-1] == '.': 210 tax_info = taxonomy_string[:-1] 211 else: 212 tax_info = taxonomy_string 213 tax_list = tax_info.split(';') 214 new_tax_list = [] 215 for tax_item in tax_list: 216 new_items = tax_item.split("\n") 217 new_tax_list.extend(new_items) 218 while '' in new_tax_list: 219 new_tax_list.remove('') 220 clean_tax_list = [x.strip() for x in new_tax_list] 221 222 return clean_tax_list
223
224 - def _clean_location(self, location_string):
225 """Clean whitespace out of a location string. 226 227 The location parser isn't a fan of whitespace, so we clean it out 228 before feeding it into the parser. 229 """ 230 #Originally this imported string.whitespace and did a replace 231 #via a loop. It's simpler to just split on whitespace and rejoin 232 #the string - and this avoids importing string too. See Bug 2684. 233 return ''.join(location_string.split())
234
235 - def _remove_newlines(self, text):
236 """Remove any newlines in the passed text, returning the new string. 237 """ 238 # get rid of newlines in the qualifier value 239 newlines = ["\n", "\r"] 240 for ws in newlines: 241 text = text.replace(ws, "") 242 243 return text
244
245 - def _normalize_spaces(self, text):
246 """Replace multiple spaces in the passed text with single spaces. 247 """ 248 # get rid of excessive spaces 249 text_parts = text.split(" ") 250 text_parts = filter(None, text_parts) 251 return ' '.join(text_parts)
252
253 - def _remove_spaces(self, text):
254 """Remove all spaces from the passed text. 255 """ 256 return text.replace(" ", "")
257
258 - def _convert_to_python_numbers(self, start, end):
259 """Convert a start and end range to python notation. 260 261 In GenBank, starts and ends are defined in "biological" coordinates, 262 where 1 is the first base and [i, j] means to include both i and j. 263 264 In python, 0 is the first base and [i, j] means to include i, but 265 not j. 266 267 So, to convert "biological" to python coordinates, we need to 268 subtract 1 from the start, and leave the end and things should 269 be converted happily. 270 """ 271 new_start = start - 1 272 new_end = end 273 274 return new_start, new_end
275
276 -class _FeatureConsumer(_BaseGenBankConsumer):
277 """Create a SeqRecord object with Features to return. 278 279 Attributes: 280 o use_fuzziness - specify whether or not to parse with fuzziness in 281 feature locations. 282 o feature_cleaner - a class that will be used to provide specialized 283 cleaning-up of feature values. 284 """
285 - def __init__(self, use_fuzziness, feature_cleaner = None):
286 from Bio.SeqRecord import SeqRecord 287 _BaseGenBankConsumer.__init__(self) 288 self.data = SeqRecord(None, id = None) 289 self.data.id = None 290 self.data.description = "" 291 292 self._use_fuzziness = use_fuzziness 293 self._feature_cleaner = feature_cleaner 294 295 self._seq_type = '' 296 self._seq_data = [] 297 self._cur_reference = None 298 self._cur_feature = None 299 self._cur_qualifier_key = None 300 self._cur_qualifier_value = None 301 self._expected_size = None
302
303 - def locus(self, locus_name):
304 """Set the locus name is set as the name of the Sequence. 305 """ 306 self.data.name = locus_name
307
308 - def size(self, content):
309 """Record the sequence length.""" 310 self._expected_size = int(content)
311
312 - def residue_type(self, type):
313 """Record the sequence type so we can choose an appropriate alphabet. 314 """ 315 self._seq_type = type
316
317 - def data_file_division(self, division):
318 self.data.annotations['data_file_division'] = division
319
320 - def date(self, submit_date):
321 self.data.annotations['date'] = submit_date
322
323 - def definition(self, definition):
324 """Set the definition as the description of the sequence. 325 """ 326 if self.data.description: 327 #Append to any existing description 328 #e.g. EMBL files with two DE lines. 329 self.data.description += " " + definition 330 else: 331 self.data.description = definition
332
333 - def accession(self, acc_num):
334 """Set the accession number as the id of the sequence. 335 336 If we have multiple accession numbers, the first one passed is 337 used. 338 """ 339 new_acc_nums = self._split_accessions(acc_num) 340 341 #Also record them ALL in the annotations 342 try: 343 #On the off chance there was more than one accession line: 344 for acc in new_acc_nums: 345 #Prevent repeat entries 346 if acc not in self.data.annotations['accessions']: 347 self.data.annotations['accessions'].append(acc) 348 except KeyError: 349 self.data.annotations['accessions'] = new_acc_nums 350 351 # if we haven't set the id information yet, add the first acc num 352 if self.data.id is None: 353 if len(new_acc_nums) > 0: 354 #self.data.id = new_acc_nums[0] 355 #Use the FIRST accession as the ID, not the first on this line! 356 self.data.id = self.data.annotations['accessions'][0]
357
358 - def wgs(self, content):
359 self.data.annotations['wgs'] = content.split('-')
360
361 - def add_wgs_scafld(self, content):
362 self.data.annotations.setdefault('wgs_scafld',[]).append(content.split('-'))
363
364 - def nid(self, content):
365 self.data.annotations['nid'] = content
366
367 - def pid(self, content):
368 self.data.annotations['pid'] = content
369
370 - def version(self, version_id):
371 #Want to use the versioned accession as the record.id 372 #This comes from the VERSION line in GenBank files, or the 373 #obsolete SV line in EMBL. For the new EMBL files we need 374 #both the version suffix from the ID line and the accession 375 #from the AC line. 376 if version_id.count(".")==1 and version_id.split(".")[1].isdigit(): 377 self.accession(version_id.split(".")[0]) 378 self.version_suffix(version_id.split(".")[1]) 379 else: 380 #For backwards compatibility... 381 self.data.id = version_id
382
383 - def project(self, content):
384 """Handle the information from the PROJECT line as a list of projects. 385 386 e.g. 387 PROJECT GenomeProject:28471 388 389 or: 390 PROJECT GenomeProject:13543 GenomeProject:99999 391 392 This is stored as dbxrefs in the SeqRecord to be consistent with the 393 projected switch of this line to DBLINK in future GenBank versions. 394 Note the NCBI plan to replace "GenomeProject:28471" with the shorter 395 "Project:28471" as part of this transition. 396 """ 397 content = content.replace("GenomeProject:", "Project:") 398 self.data.dbxrefs.extend([p for p in content.split() if p])
399 425
426 - def version_suffix(self, version):
427 """Set the version to overwrite the id. 428 429 Since the verison provides the same information as the accession 430 number, plus some extra info, we set this as the id if we have 431 a version. 432 """ 433 #e.g. GenBank line: 434 #VERSION U49845.1 GI:1293613 435 #or the obsolete EMBL line: 436 #SV U49845.1 437 #Scanner calls consumer.version("U49845.1") 438 #which then calls consumer.version_suffix(1) 439 # 440 #e.g. EMBL new line: 441 #ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP. 442 #Scanner calls consumer.version_suffix(1) 443 assert version.isdigit() 444 self.data.annotations['sequence_version'] = int(version)
445
446 - def db_source(self, content):
447 self.data.annotations['db_source'] = content.rstrip()
448
449 - def gi(self, content):
450 self.data.annotations['gi'] = content
451
452 - def keywords(self, content):
453 self.data.annotations['keywords'] = self._split_keywords(content)
454
455 - def segment(self, content):
456 self.data.annotations['segment'] = content
457
458 - def source(self, content):
459 #Note that some software (e.g. VectorNTI) may produce an empty 460 #source (rather than using a dot/period as might be expected). 461 if content == "": 462 source_info = "" 463 elif content[-1] == '.': 464 source_info = content[:-1] 465 else: 466 source_info = content 467 self.data.annotations['source'] = source_info
468
469 - def organism(self, content):
470 self.data.annotations['organism'] = content
471
472 - def taxonomy(self, content):
473 """Records (another line of) the taxonomy lineage. 474 """ 475 lineage = self._split_taxonomy(content) 476 try: 477 self.data.annotations['taxonomy'].extend(lineage) 478 except KeyError: 479 self.data.annotations['taxonomy'] = lineage
480
481 - def reference_num(self, content):
482 """Signal the beginning of a new reference object. 483 """ 484 # if we have a current reference that hasn't been added to 485 # the list of references, add it. 486 if self._cur_reference is not None: 487 self.data.annotations['references'].append(self._cur_reference) 488 else: 489 self.data.annotations['references'] = [] 490 491 self._cur_reference = SeqFeature.Reference()
492
493 - def reference_bases(self, content):
494 """Attempt to determine the sequence region the reference entails. 495 496 Possible types of information we may have to deal with: 497 498 (bases 1 to 86436) 499 (sites) 500 (bases 1 to 105654; 110423 to 111122) 501 1 (residues 1 to 182) 502 """ 503 # first remove the parentheses or other junk 504 ref_base_info = content[1:-1] 505 506 all_locations = [] 507 # parse if we've got 'bases' and 'to' 508 if ref_base_info.find('bases') != -1 and \ 509 ref_base_info.find('to') != -1: 510 # get rid of the beginning 'bases' 511 ref_base_info = ref_base_info[5:] 512 locations = self._split_reference_locations(ref_base_info) 513 all_locations.extend(locations) 514 elif (ref_base_info.find("residues") >= 0 and 515 ref_base_info.find("to") >= 0): 516 residues_start = ref_base_info.find("residues") 517 # get only the information after "residues" 518 ref_base_info = ref_base_info[(residues_start + len("residues ")):] 519 locations = self._split_reference_locations(ref_base_info) 520 all_locations.extend(locations) 521 522 # make sure if we are not finding information then we have 523 # the string 'sites' or the string 'bases' 524 elif (ref_base_info == 'sites' or 525 ref_base_info.strip() == 'bases'): 526 pass 527 # otherwise raise an error 528 else: 529 raise ValueError("Could not parse base info %s in record %s" % 530 (ref_base_info, self.data.id)) 531 532 self._cur_reference.location = all_locations
533
534 - def _split_reference_locations(self, location_string):
535 """Get reference locations out of a string of reference information 536 537 The passed string should be of the form: 538 539 1 to 20; 20 to 100 540 541 This splits the information out and returns a list of location objects 542 based on the reference locations. 543 """ 544 # split possibly multiple locations using the ';' 545 all_base_info = location_string.split(';') 546 547 new_locations = [] 548 for base_info in all_base_info: 549 start, end = base_info.split('to') 550 new_start, new_end = \ 551 self._convert_to_python_numbers(int(start.strip()), 552 int(end.strip())) 553 this_location = SeqFeature.FeatureLocation(new_start, new_end) 554 new_locations.append(this_location) 555 return new_locations
556
557 - def authors(self, content):
558 if self._cur_reference.authors: 559 self._cur_reference.authors += ' ' + content 560 else: 561 self._cur_reference.authors = content
562
563 - def consrtm(self, content):
564 if self._cur_reference.consrtm: 565 self._cur_reference.consrtm += ' ' + content 566 else: 567 self._cur_reference.consrtm = content
568
569 - def title(self, content):
570 if self._cur_reference is None: 571 import warnings 572 warnings.warn("GenBank TITLE line without REFERENCE line.") 573 elif self._cur_reference.title: 574 self._cur_reference.title += ' ' + content 575 else: 576 self._cur_reference.title = content
577
578 - def journal(self, content):
579 if self._cur_reference.journal: 580 self._cur_reference.journal += ' ' + content 581 else: 582 self._cur_reference.journal = content
583
584 - def medline_id(self, content):
585 self._cur_reference.medline_id = content
586
587 - def pubmed_id(self, content):
588 self._cur_reference.pubmed_id = content
589
590 - def remark(self, content):
591 """Deal with a reference comment.""" 592 if self._cur_reference.comment: 593 self._cur_reference.comment += ' ' + content 594 else: 595 self._cur_reference.comment = content
596
597 - def comment(self, content):
598 try: 599 self.data.annotations['comment'] += "\n" + "\n".join(content) 600 except KeyError: 601 self.data.annotations['comment'] = "\n".join(content)
602
603 - def features_line(self, content):
604 """Get ready for the feature table when we reach the FEATURE line. 605 """ 606 self.start_feature_table()
607
608 - def start_feature_table(self):
609 """Indicate we've got to the start of the feature table. 610 """ 611 # make sure we've added on our last reference object 612 if self._cur_reference is not None: 613 self.data.annotations['references'].append(self._cur_reference) 614 self._cur_reference = None
615
616 - def _add_feature(self):
617 """Utility function to add a feature to the SeqRecord. 618 619 This does all of the appropriate checking to make sure we haven't 620 left any info behind, and that we are only adding info if it 621 exists. 622 """ 623 if self._cur_feature: 624 # if we have a left over qualifier, add it to the qualifiers 625 # on the current feature 626 self._add_qualifier() 627 628 self._cur_qualifier_key = '' 629 self._cur_qualifier_value = '' 630 self.data.features.append(self._cur_feature)
631
632 - def feature_key(self, content):
633 # if we already have a feature, add it on 634 self._add_feature() 635 636 # start a new feature 637 self._cur_feature = SeqFeature.SeqFeature() 638 self._cur_feature.type = content 639 640 # assume positive strand to start with if we have DNA or cDNA 641 # (labelled as mRNA). The complement in the location will 642 # change this later if something is on the reverse strand 643 if self._seq_type.find("DNA") >= 0 or self._seq_type.find("mRNA") >= 0: 644 self._cur_feature.strand = 1
645
646 - def location(self, content):
647 """Parse out location information from the location string. 648 649 This uses a comprehensive but slow spark based parser to do the 650 parsing, and then translates the results of the parse into appropriate 651 Location objects. 652 """ 653 # --- first preprocess the location for the spark parser 654 655 # we need to clean up newlines and other whitespace inside 656 # the location before feeding it to the parser. 657 # locations should have no whitespace whatsoever based on the 658 # grammer 659 location_line = self._clean_location(content) 660 661 # Older records have junk like replace(266,"c") in the 662 # location line. Newer records just replace this with 663 # the number 266 and have the information in a more reasonable 664 # place. So we'll just grab out the number and feed this to the 665 # parser. We shouldn't really be losing any info this way. 666 if location_line.find('replace') != -1: 667 comma_pos = location_line.find(',') 668 location_line = location_line[8:comma_pos] 669 670 # feed everything into the scanner and parser 671 try: 672 parse_info = \ 673 LocationParser.parse(LocationParser.scan(location_line)) 674 # spark raises SystemExit errors when parsing fails 675 except SystemExit: 676 raise LocationParserError(location_line) 677 678 # print "parse_info:", repr(parse_info) 679 680 # add the parser information the current feature 681 self._set_location_info(parse_info, self._cur_feature)
682
683 - def _set_function(self, function, cur_feature):
684 """Set the location information based on a function. 685 686 This handles all of the location functions like 'join', 'complement' 687 and 'order'. 688 689 Arguments: 690 o function - A LocationParser.Function object specifying the 691 function we are acting on. 692 o cur_feature - The feature to add information to. 693 """ 694 assert isinstance(function, LocationParser.Function), \ 695 "Expected a Function object, got %s" % function 696 697 if function.name == "complement": 698 # mark the current feature as being on the opposite strand 699 cur_feature.strand = -1 700 # recursively deal with whatever is left inside the complement 701 for inner_info in function.args: 702 self._set_location_info(inner_info, cur_feature) 703 # deal with functions that have multipe internal segments that 704 # are connected somehow. 705 # join and order are current documented functions. 706 # one-of is something I ran across in old files. Treating it 707 # as a sub sequence feature seems appropriate to me. 708 # bond is some piece of junk I found in RefSeq files. I have 709 # no idea how to interpret it, so I jam it in here 710 elif (function.name == "join" or function.name == "order" or 711 function.name == "one-of" or function.name == "bond"): 712 self._set_ordering_info(function, cur_feature) 713 elif (function.name == "gap"): 714 assert len(function.args) == 1, \ 715 "Unexpected number of arguments in gap %s" % function.args 716 # make the cur information location a gap object 717 position = self._get_position(function.args[0].local_location) 718 cur_feature.location = SeqFeature.PositionGap(position) 719 else: 720 raise ValueError("Unexpected function name: %s" % function.name)
721
722 - def _set_ordering_info(self, function, cur_feature):
723 """Parse a join or order and all of the information in it. 724 725 This deals with functions that order a bunch of locations, 726 specifically 'join' and 'order'. The inner locations are 727 added as subfeatures of the top level feature 728 """ 729 # for each inner element, create a sub SeqFeature within the 730 # current feature, then get the information for this feature 731 cur_feature.location_operator = function.name 732 for inner_element in function.args: 733 new_sub_feature = SeqFeature.SeqFeature() 734 # inherit the type from the parent 735 new_sub_feature.type = cur_feature.type 736 # add the join or order info to the location_operator 737 new_sub_feature.location_operator = function.name 738 # inherit references and strand from the parent feature 739 new_sub_feature.ref = cur_feature.ref 740 new_sub_feature.ref_db = cur_feature.ref_db 741 new_sub_feature.strand = cur_feature.strand 742 743 # set the information for the inner element 744 self._set_location_info(inner_element, new_sub_feature) 745 746 # now add the feature to the sub_features 747 cur_feature.sub_features.append(new_sub_feature) 748 749 # set the location of the top -- this should be a combination of 750 # the start position of the first sub_feature and the end position 751 # of the last sub_feature 752 753 # these positions are already converted to python coordinates 754 # (when the sub_features were added) so they don't need to 755 # be converted again 756 feature_start = cur_feature.sub_features[0].location.start 757 feature_end = cur_feature.sub_features[-1].location.end 758 cur_feature.location = SeqFeature.FeatureLocation(feature_start, 759 feature_end) 760 # Historically a join on the reverse strand has been represented 761 # in Biopython with both the parent SeqFeature and its children 762 # (the exons for a CDS) all given a strand of -1. Likewise, for 763 # a join feature on the forward strand they all have strand +1. 764 # However, we must also consider evil mixed strand examples like 765 # this, join(complement(69611..69724),139856..140087,140625..140650) 766 strands = set(sf.strand for sf in cur_feature.sub_features) 767 if len(strands)==1: 768 cur_feature.strand = cur_feature.sub_features[0].strand 769 else: 770 cur_feature.strand = None # i.e. mixed strands
771
772 - def _set_location_info(self, parse_info, cur_feature):
773 """Set the location information for a feature from the parse info. 774 775 Arguments: 776 o parse_info - The classes generated by the LocationParser. 777 o cur_feature - The feature to add the information to. 778 """ 779 # base case -- we are out of information 780 if parse_info is None: 781 return 782 # parse a location -- this is another base_case -- we assume 783 # we have no information after a single location 784 elif isinstance(parse_info, LocationParser.AbsoluteLocation): 785 self._set_location(parse_info, cur_feature) 786 return 787 # parse any of the functions (join, complement, etc) 788 elif isinstance(parse_info, LocationParser.Function): 789 self._set_function(parse_info, cur_feature) 790 # otherwise we are stuck and should raise an error 791 else: 792 raise ValueError("Could not parse location info: %s" 793 % parse_info)
794
795 - def _set_location(self, location, cur_feature):
796 """Set the location information for a feature. 797 798 Arguments: 799 o location - An AbsoluteLocation object specifying the info 800 about the location. 801 o cur_feature - The feature to add the information to. 802 """ 803 # check to see if we have a cross reference to another accession 804 # ie. U05344.1:514..741 805 if location.path is not None: 806 cur_feature.ref = location.path.accession 807 cur_feature.ref_db = location.path.database 808 # now get the actual location information 809 cur_feature.location = self._get_location(location.local_location)
810
811 - def _get_location(self, range_info):
812 """Return a (possibly fuzzy) location from a Range object. 813 814 Arguments: 815 o range_info - A location range (ie. something like 67..100). This 816 may also be a single position (ie 27). 817 818 This returns a FeatureLocation object. 819 If parser.use_fuzziness is set at one, the positions for the 820 end points will possibly be fuzzy. 821 """ 822 if isinstance(range_info, LocationParser.Between) \ 823 and range_info.low.val+1 == range_info.high.val: 824 #A between location like "67^68" (one based counting) is a 825 #special case (note it has zero length). In python slice 826 #notation this is 67:67, a zero length slice. See Bug 2622 827 pos = self._get_position(range_info.low) 828 return SeqFeature.FeatureLocation(pos, pos) 829 #NOTE - We can imagine between locations like "2^4", but this 830 #is just "3". Similarly, "2^5" is just "3..4" 831 # check if we just have a single base 832 elif not(isinstance(range_info, LocationParser.Range)): 833 #A single base like "785" becomes [784:785] in python 834 s_pos = self._get_position(range_info) 835 # move the single position back one to be consistent with how 836 # python indexes numbers (starting at 0) 837 s_pos.position = s_pos.position - 1 838 e_pos = self._get_position(range_info) 839 return SeqFeature.FeatureLocation(s_pos, e_pos) 840 # otherwise we need to get both sides of the range 841 else: 842 # get *Position objects for the start and end 843 start_pos = self._get_position(range_info.low) 844 end_pos = self._get_position(range_info.high) 845 846 start_pos.position, end_pos.position = \ 847 self._convert_to_python_numbers(start_pos.position, 848 end_pos.position) 849 #If the start location is a one-of position, we also need to 850 #adjust their positions to use python counting. 851 if isinstance(start_pos, SeqFeature.OneOfPosition): 852 for p in start_pos.position_choices: 853 p.position -= 1 854 855 return SeqFeature.FeatureLocation(start_pos, end_pos)
856
857 - def _get_position(self, position):
858 """Return a (possibly fuzzy) position for a single coordinate. 859 860 Arguments: 861 o position - This is a LocationParser.* object that specifies 862 a single coordinate. We will examine the object to determine 863 the fuzziness of the position. 864 865 This is used with _get_location to parse out a location of any 866 end_point of arbitrary fuzziness. 867 """ 868 # case 1 -- just a normal number 869 if (isinstance(position, LocationParser.Integer)): 870 final_pos = SeqFeature.ExactPosition(position.val) 871 # case 2 -- we've got a > sign 872 elif isinstance(position, LocationParser.LowBound): 873 final_pos = SeqFeature.AfterPosition(position.base.val) 874 # case 3 -- we've got a < sign 875 elif isinstance(position, LocationParser.HighBound): 876 final_pos = SeqFeature.BeforePosition(position.base.val) 877 # case 4 -- we've got 100^101 878 # Is the extension is zero in this example? 879 elif isinstance(position, LocationParser.Between): 880 #NOTE - We don't *expect* this code to get called! 881 #We only except between locations like 3^4 (consecutive) 882 #which are handled in _get_location. We don't expect 883 #non consecutive variants like "2^5" as this is just "3..4". 884 #Similarly there is no reason to expect composite locations 885 #like "(3^4)..6" which should just be "4..6". 886 final_pos = SeqFeature.BetweenPosition(position.low.val, 887 position.high.val-position.low.val) 888 # case 5 -- we've got (100.101) 889 elif isinstance(position, LocationParser.TwoBound): 890 final_pos = SeqFeature.WithinPosition(position.low.val, 891 position.high.val-position.low.val) 892 # case 6 -- we've got a one-of(100, 110) location 893 elif isinstance(position, LocationParser.Function) and \ 894 position.name == "one-of": 895 # first convert all of the arguments to positions 896 position_choices = [] 897 for arg in position.args: 898 # we only handle AbsoluteLocations with no path 899 # right now. Not sure if other cases will pop up 900 assert isinstance(arg, LocationParser.AbsoluteLocation), \ 901 "Unhandled Location type %r" % arg 902 assert arg.path is None, "Unhandled path in location" 903 position = self._get_position(arg.local_location) 904 position_choices.append(position) 905 final_pos = SeqFeature.OneOfPosition(position_choices) 906 # if it is none of these cases we've got a problem! 907 else: 908 raise ValueError("Unexpected LocationParser object %r" % 909 position) 910 911 # if we are using fuzziness return what we've got 912 if self._use_fuzziness: 913 return final_pos 914 # otherwise return an ExactPosition equivalent 915 else: 916 return SeqFeature.ExactPosition(final_pos.location)
917
918 - def _add_qualifier(self):
919 """Add a qualifier to the current feature without loss of info. 920 921 If there are multiple qualifier keys with the same name we 922 would lose some info in the dictionary, so we append a unique 923 number to the end of the name in case of conflicts. 924 """ 925 # if we've got a key from before, add it to the dictionary of 926 # qualifiers 927 if self._cur_qualifier_key: 928 key = self._cur_qualifier_key 929 value = "".join(self._cur_qualifier_value) 930 if self._feature_cleaner is not None: 931 value = self._feature_cleaner.clean_value(key, value) 932 # if the qualifier name exists, append the value 933 if key in self._cur_feature.qualifiers: 934 self._cur_feature.qualifiers[key].append(value) 935 # otherwise start a new list of the key with its values 936 else: 937 self._cur_feature.qualifiers[key] = [value]
938
939 - def feature_qualifier_name(self, content_list):
940 """When we get a qualifier key, use it as a dictionary key. 941 942 We receive a list of keys, since you can have valueless keys such as 943 /pseudo which would be passed in with the next key (since no other 944 tags separate them in the file) 945 """ 946 for content in content_list: 947 # add a qualifier if we've got one 948 self._add_qualifier() 949 950 # remove the / and = from the qualifier if they're present 951 qual_key = content.replace('/', '') 952 qual_key = qual_key.replace('=', '') 953 qual_key = qual_key.strip() 954 955 self._cur_qualifier_key = qual_key 956 self._cur_qualifier_value = []
957
958 - def feature_qualifier_description(self, content):
959 # get rid of the quotes surrounding the qualifier if we've got 'em 960 qual_value = content.replace('"', '') 961 962 self._cur_qualifier_value.append(qual_value)
963
964 - def contig_location(self, content):
965 """Deal with CONTIG information.""" 966 #Historically this was stored as a SeqFeature object, but it was 967 #stored under record.annotations["contig"] and not under 968 #record.features with the other SeqFeature objects. 969 # 970 #The CONTIG location line can include additional tokens like 971 #Gap(), Gap(100) or Gap(unk100) which are not used in the feature 972 #location lines, so storing it using SeqFeature based location 973 #objects is difficult. 974 # 975 #We now store this a string, which means for BioSQL we are now in 976 #much better agreement with how BioPerl records the CONTIG line 977 #in the database. 978 # 979 #NOTE - This code assumes the scanner will return all the CONTIG 980 #lines already combined into one long string! 981 self.data.annotations["contig"] = content
982
983 - def origin_name(self, content):
984 pass
985
986 - def base_count(self, content):
987 pass
988
989 - def base_number(self, content):
990 pass
991
992 - def sequence(self, content):
993 """Add up sequence information as we get it. 994 995 To try and make things speedier, this puts all of the strings 996 into a list of strings, and then uses string.join later to put 997 them together. Supposedly, this is a big time savings 998 """ 999 new_seq = content.replace(' ', '') 1000 new_seq = new_seq.upper() 1001 1002 self._seq_data.append(new_seq)
1003
1004 - def record_end(self, content):
1005 """Clean up when we've finished the record. 1006 """ 1007 from Bio import Alphabet 1008 from Bio.Alphabet import IUPAC 1009 from Bio.Seq import Seq, UnknownSeq 1010 1011 #Try and append the version number to the accession for the full id 1012 if self.data.id is None: 1013 assert 'accessions' not in self.data.annotations, \ 1014 self.data.annotations['accessions'] 1015 self.data.id = self.data.name #Good fall back? 1016 elif self.data.id.count('.') == 0: 1017 try: 1018 self.data.id+='.%i' % self.data.annotations['sequence_version'] 1019 except KeyError: 1020 pass 1021 1022 # add the last feature in the table which hasn't been added yet 1023 self._add_feature() 1024 1025 # add the sequence information 1026 # first, determine the alphabet 1027 # we default to an generic alphabet if we don't have a 1028 # seq type or have strange sequence information. 1029 seq_alphabet = Alphabet.generic_alphabet 1030 1031 # now set the sequence 1032 sequence = "".join(self._seq_data) 1033 1034 if self._expected_size is not None \ 1035 and len(sequence) != 0 \ 1036 and self._expected_size != len(sequence): 1037 import warnings 1038 warnings.warn("Expected sequence length %i, found %i (%s)." \ 1039 % (self._expected_size, len(sequence), self.data.id)) 1040 1041 if self._seq_type: 1042 # mRNA is really also DNA, since it is actually cDNA 1043 if self._seq_type.find('DNA') != -1 or \ 1044 self._seq_type.find('mRNA') != -1: 1045 seq_alphabet = IUPAC.ambiguous_dna 1046 # are there ever really RNA sequences in GenBank? 1047 elif self._seq_type.find('RNA') != -1: 1048 #Even for data which was from RNA, the sequence string 1049 #is usually given as DNA (T not U). Bug 2408 1050 if "T" in sequence and "U" not in sequence: 1051 seq_alphabet = IUPAC.ambiguous_dna 1052 else: 1053 seq_alphabet = IUPAC.ambiguous_rna 1054 elif self._seq_type.find('PROTEIN') != -1: 1055 seq_alphabet = IUPAC.protein # or extended protein? 1056 # work around ugly GenBank records which have circular or 1057 # linear but no indication of sequence type 1058 elif self._seq_type in ["circular", "linear"]: 1059 pass 1060 # we have a bug if we get here 1061 else: 1062 raise ValueError("Could not determine alphabet for seq_type %s" 1063 % self._seq_type) 1064 1065 if not sequence and self.__expected_size: 1066 self.data.seq = UnknownSeq(self._expected_size, seq_alphabet) 1067 else: 1068 self.data.seq = Seq(sequence, seq_alphabet)
1069
1070 -class _RecordConsumer(_BaseGenBankConsumer):
1071 """Create a GenBank Record object from scanner generated information. 1072 """
1073 - def __init__(self):
1074 _BaseGenBankConsumer.__init__(self) 1075 import Record 1076 self.data = Record.Record() 1077 1078 self._seq_data = [] 1079 self._cur_reference = None 1080 self._cur_feature = None 1081 self._cur_qualifier = None
1082
1083 - def wgs(self, content):
1084 self.data.wgs = content.split('-')
1085
1086 - def add_wgs_scafld(self, content):
1087 self.data.wgs_scafld.append(content.split('-'))
1088
1089 - def locus(self, content):
1090 self.data.locus = content
1091
1092 - def size(self, content):
1093 self.data.size = content
1094
1095 - def residue_type(self, content):
1096 self.data.residue_type = content
1097
1098 - def data_file_division(self, content):
1099 self.data.data_file_division = content
1100
1101 - def date(self, content):
1102 self.data.date = content
1103
1104 - def definition(self, content):
1105 self.data.definition = content
1106
1107 - def accession(self, content):
1108 for acc in self._split_accessions(content): 1109 if acc not in self.data.accession: 1110 self.data.accession.append(acc)
1111
1112 - def nid(self, content):
1113 self.data.nid = content
1114
1115 - def pid(self, content):
1116 self.data.pid = content
1117
1118 - def version(self, content):
1119 self.data.version = content
1120
1121 - def db_source(self, content):
1122 self.data.db_source = content.rstrip()
1123
1124 - def gi(self, content):
1125 self.data.gi = content
1126
1127 - def keywords(self, content):
1128 self.data.keywords = self._split_keywords(content)
1129
1130 - def project(self, content):
1131 self.data.projects.extend([p for p in content.split() if p])
1132 1135
1136 - def segment(self, content):
1137 self.data.segment = content
1138
1139 - def source(self, content):
1140 self.data.source = content
1141
1142 - def organism(self, content):
1143 self.data.organism = content
1144
1145 - def taxonomy(self, content):
1146 self.data.taxonomy = self._split_taxonomy(content)
1147
1148 - def reference_num(self, content):
1149 """Grab the reference number and signal the start of a new reference. 1150 """ 1151 # check if we have a reference to add 1152 if self._cur_reference is not None: 1153 self.data.references.append(self._cur_reference) 1154 1155 import Record 1156 self._cur_reference = Record.Reference() 1157 self._cur_reference.number = content
1158
1159 - def reference_bases(self, content):
1160 self._cur_reference.bases = content
1161
1162 - def authors(self, content):
1163 self._cur_reference.authors = content
1164
1165 - def consrtm(self, content):
1166 self._cur_reference.consrtm = content
1167
1168 - def title(self, content):
1169 if self._cur_reference is None: 1170 import warnings 1171 warnings.warn("GenBank TITLE line without REFERENCE line.") 1172 return 1173 self._cur_reference.title = content
1174
1175 - def journal(self, content):
1176 self._cur_reference.journal = content
1177
1178 - def medline_id(self, content):
1179 self._cur_reference.medline_id = content
1180
1181 - def pubmed_id(self, content):
1182 self._cur_reference.pubmed_id = content
1183
1184 - def remark(self, content):
1185 self._cur_reference.remark = content
1186
1187 - def comment(self, content):
1188 self.data.comment += "\n".join(content)
1189
1190 - def primary_ref_line(self,content):
1191 """Data for the PRIMARY line""" 1192 self.data.primary.append(content)
1193
1194 - def primary(self,content):
1195 pass
1196
1197 - def features_line(self, content):
1198 """Get ready for the feature table when we reach the FEATURE line. 1199 """ 1200 self.start_feature_table()
1201
1202 - def start_feature_table(self):
1203 """Signal the start of the feature table. 1204 """ 1205 # we need to add on the last reference 1206 if self._cur_reference is not None: 1207 self.data.references.append(self._cur_reference)
1208
1209 - def feature_key(self, content):
1210 """Grab the key of the feature and signal the start of a new feature. 1211 """ 1212 # first add on feature information if we've got any 1213 self._add_feature() 1214 1215 import Record 1216 self._cur_feature = Record.Feature() 1217 self._cur_feature.key = content
1218
1219 - def _add_feature(self):
1220 """Utility function to add a feature to the Record. 1221 1222 This does all of the appropriate checking to make sure we haven't 1223 left any info behind, and that we are only adding info if it 1224 exists. 1225 """ 1226 if self._cur_feature is not None: 1227 # if we have a left over qualifier, add it to the qualifiers 1228 # on the current feature 1229 if self._cur_qualifier is not None: 1230 self._cur_feature.qualifiers.append(self._cur_qualifier) 1231 1232 self._cur_qualifier = None 1233 self.data.features.append(self._cur_feature)
1234
1235 - def location(self, content):
1236 self._cur_feature.location = self._clean_location(content)
1237
1238 - def feature_qualifier_name(self, content_list):
1239 """Deal with qualifier names 1240 1241 We receive a list of keys, since you can have valueless keys such as 1242 /pseudo which would be passed in with the next key (since no other 1243 tags separate them in the file) 1244 """ 1245 import Record 1246 for content in content_list: 1247 # the record parser keeps the /s -- add them if we don't have 'em 1248 if content.find("/") != 0: 1249 content = "/%s" % content 1250 # add on a qualifier if we've got one 1251 if self._cur_qualifier is not None: 1252 self._cur_feature.qualifiers.append(self._cur_qualifier) 1253 1254 self._cur_qualifier = Record.Qualifier() 1255 self._cur_qualifier.key = content
1256
1257 - def feature_qualifier_description(self, content):
1258 # if we have info then the qualifier key should have a ='s 1259 if self._cur_qualifier.key.find("=") == -1: 1260 self._cur_qualifier.key = "%s=" % self._cur_qualifier.key 1261 cur_content = self._remove_newlines(content) 1262 # remove all spaces from the value if it is a type where spaces 1263 # are not important 1264 for remove_space_key in self.__class__.remove_space_keys: 1265 if self._cur_qualifier.key.find(remove_space_key) >= 0: 1266 cur_content = self._remove_spaces(cur_content) 1267 self._cur_qualifier.value = self._normalize_spaces(cur_content)
1268
1269 - def base_count(self, content):
1270 self.data.base_counts = content
1271
1272 - def origin_name(self, content):
1273 self.data.origin = content
1274
1275 - def contig_location(self, content):
1276 """Signal that we have contig information to add to the record. 1277 """ 1278 self.data.contig = self._clean_location(content)
1279
1280 - def sequence(self, content):
1281 """Add sequence information to a list of sequence strings. 1282 1283 This removes spaces in the data and uppercases the sequence, and 1284 then adds it to a list of sequences. Later on we'll join this 1285 list together to make the final sequence. This is faster than 1286 adding on the new string every time. 1287 """ 1288 new_seq = content.replace(' ', '') 1289 self._seq_data.append(new_seq.upper())
1290
1291 - def record_end(self, content):
1292 """Signal the end of the record and do any necessary clean-up. 1293 """ 1294 # add together all of the sequence parts to create the 1295 # final sequence string 1296 self.data.sequence = "".join(self._seq_data) 1297 # add on the last feature 1298 self._add_feature()
1299