1
2
3
4
5
6
7 """Code to work with GenBank formatted files.
8
9 Rather than using Bio.GenBank, you are now encouraged to use Bio.SeqIO with
10 the "genbank" or "embl" format names to parse GenBank or EMBL files into
11 SeqRecord and SeqFeature objects (see the Biopython tutorial for details).
12
13 Also, rather than using Bio.GenBank to search or download files from the NCBI,
14 you are now encouraged to use Bio.Entrez instead (again, see the Biopython
15 tutorial for details).
16
17 Currently the ONLY reason to use Bio.GenBank directly is for the RecordParser
which turns a GenBank file into GenBank-specific Record objects.  This is a
much closer representation to the raw file contents than the SeqRecord
alternative from the FeatureParser (used in Bio.SeqIO).
21
22 Classes:
23 Iterator Iterate through a file of GenBank entries
24 ErrorFeatureParser Catch errors caused during parsing.
25 FeatureParser Parse GenBank data in SeqRecord and SeqFeature objects.
26 RecordParser Parse GenBank data into a Record object.
27
28 Exceptions:
29 ParserFailureError Exception indicating a failure in the parser (ie.
30 scanner or consumer)
   LocationParserError  Exception indicating a problem with the spark based
                        location parser.
33
34
35 17-MAR-2009: added wgs, wgs_scafld for GenBank whole genome shotgun master records.
36 These are GenBank files that summarize the content of a project, and provide lists of
37 scaffold and contig files in the project. These will be in annotations['wgs'] and
38 annotations['wgs_scafld']. These GenBank files do not have sequences. See
39 http://groups.google.com/group/bionet.molbio.genbank/browse_thread/thread/51fb88bf39e7dc36
40
41 http://is.gd/nNgk
42 for more details of this format, and an example.
43 Added by Ying Huang & Iddo Friedberg
44 """
45 import cStringIO
46
47
48 from Bio import SeqFeature
49 from Bio.ParserSupport import AbstractConsumer
50 from Bio import Entrez
51
52
53 import LocationParser
54 from utils import FeatureValueCleaner
55 from Scanner import GenBankScanner
56
57
58 GENBANK_INDENT = 12
59 GENBANK_SPACER = " " * GENBANK_INDENT
60
61
62 FEATURE_KEY_INDENT = 5
63 FEATURE_QUALIFIER_INDENT = 21
64 FEATURE_KEY_SPACER = " " * FEATURE_KEY_INDENT
65 FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT
66
68 """Iterator interface to move over a file of GenBank entries one at a time.
69 """
70 - def __init__(self, handle, parser = None):
71 """Initialize the iterator.
72
73 Arguments:
74 o handle - A handle with GenBank entries to iterate through.
75 o parser - An optional parser to pass the entries through before
76 returning them. If None, then the raw entry will be returned.
77 """
78 self.handle = handle
79 self._parser = parser
80
def next(self):
    """Return the next GenBank record from the handle.

    With a parser configured, the handle is passed to parser.parse() and
    its result is returned; a StopIteration raised by the parser is turned
    into a None return value.

    Without a parser, lines are read up to and including the "//" record
    terminator and returned as one string.  None is returned if the handle
    is exhausted before a terminator is seen.
    """
    if self._parser is not None:
        try:
            return self._parser.parse(self.handle)
        except StopIteration:
            return None

    raw_lines = []
    terminated = False
    while not terminated:
        line = self.handle.readline()
        if not line:
            # End of input before the "//" line: nothing is returned,
            # even if some lines had already been collected.
            return None
        raw_lines.append(line)
        terminated = line.rstrip() == "//"
    return "".join(raw_lines)
98
100 return iter(self.next, None)
101
103 """Failure caused by some kind of problem in the parser.
104 """
105 pass
106
108 """Could not Properly parse out a location from a GenBank file.
109 """
110 pass
111
113 """Parse GenBank files into Seq + Feature objects.
114 """
117 """Initialize a GenBank parser and Feature consumer.
118
119 Arguments:
120 o debug_level - An optional argument that species the amount of
121 debugging information the parser should spit out. By default we have
122 no debugging info (the fastest way to do things), but if you want
123 you can set this as high as two and see exactly where a parse fails.
124 o use_fuzziness - Specify whether or not to use fuzzy representations.
125 The default is 1 (use fuzziness).
126 o feature_cleaner - A class which will be used to clean out the
127 values of features. This class must implement the function
128 clean_value. GenBank.utils has a "standard" cleaner class, which
129 is used by default.
130 """
131 self._scanner = GenBankScanner(debug_level)
132 self.use_fuzziness = use_fuzziness
133 self._cleaner = feature_cleaner
134
def parse(self, handle):
    """Parse the specified handle and return the consumer's data.

    A fresh _FeatureConsumer is created for every call (configured with
    this parser's fuzziness setting and feature cleaner), fed by the
    scanner, and its resulting data attribute is returned.
    """
    consumer = _FeatureConsumer(self.use_fuzziness, self._cleaner)
    # Keep a reference on the parser, as the consumer of the last parse.
    self._consumer = consumer
    self._scanner.feed(handle, consumer)
    return consumer.data
142
144 """Parse GenBank files into Record objects
145 """
147 """Initialize the parser.
148
149 Arguments:
150 o debug_level - An optional argument that species the amount of
151 debugging information the parser should spit out. By default we have
152 no debugging info (the fastest way to do things), but if you want
153 you can set this as high as two and see exactly where a parse fails.
154 """
155 self._scanner = GenBankScanner(debug_level)
156
def parse(self, handle):
    """Parse the specified handle into a GenBank record.

    A fresh _RecordConsumer is created for every call, fed by the scanner,
    and its resulting data attribute is returned.
    """
    consumer = _RecordConsumer()
    # Keep a reference on the parser, as the consumer of the last parse.
    self._consumer = consumer
    self._scanner.feed(handle, consumer)
    return consumer.data
163
165 """Abstract GenBank consumer providing useful general functions.
166
167 This just helps to eliminate some duplication in things that most
168 GenBank consumers want to do.
169 """
170
171
172
173
174 remove_space_keys = ["translation"]
175
178
180 """Split a string of keywords into a nice clean list.
181 """
182
183 if keyword_string == "" or keyword_string == ".":
184 keywords = ""
185 elif keyword_string[-1] == '.':
186 keywords = keyword_string[:-1]
187 else:
188 keywords = keyword_string
189 keyword_list = keywords.split(';')
190 clean_keyword_list = [x.strip() for x in keyword_list]
191 return clean_keyword_list
192
194 """Split a string of accession numbers into a list.
195 """
196
197
198 accession = accession_string.replace("\n", " ").replace(";"," ")
199
200 return [x.strip() for x in accession.split() if x.strip()]
201
203 """Split a string with taxonomy info into a list.
204 """
205 if not taxonomy_string or taxonomy_string==".":
206
207 return []
208
209 if taxonomy_string[-1] == '.':
210 tax_info = taxonomy_string[:-1]
211 else:
212 tax_info = taxonomy_string
213 tax_list = tax_info.split(';')
214 new_tax_list = []
215 for tax_item in tax_list:
216 new_items = tax_item.split("\n")
217 new_tax_list.extend(new_items)
218 while '' in new_tax_list:
219 new_tax_list.remove('')
220 clean_tax_list = [x.strip() for x in new_tax_list]
221
222 return clean_tax_list
223
225 """Clean whitespace out of a location string.
226
227 The location parser isn't a fan of whitespace, so we clean it out
228 before feeding it into the parser.
229 """
230
231
232
233 return ''.join(location_string.split())
234
236 """Remove any newlines in the passed text, returning the new string.
237 """
238
239 newlines = ["\n", "\r"]
240 for ws in newlines:
241 text = text.replace(ws, "")
242
243 return text
244
246 """Replace multiple spaces in the passed text with single spaces.
247 """
248
249 text_parts = text.split(" ")
250 text_parts = filter(None, text_parts)
251 return ' '.join(text_parts)
252
254 """Remove all spaces from the passed text.
255 """
256 return text.replace(" ", "")
257
259 """Convert a start and end range to python notation.
260
261 In GenBank, starts and ends are defined in "biological" coordinates,
262 where 1 is the first base and [i, j] means to include both i and j.
263
264 In python, 0 is the first base and [i, j] means to include i, but
265 not j.
266
267 So, to convert "biological" to python coordinates, we need to
268 subtract 1 from the start, and leave the end and things should
269 be converted happily.
270 """
271 new_start = start - 1
272 new_end = end
273
274 return new_start, new_end
275
277 """Create a SeqRecord object with Features to return.
278
279 Attributes:
280 o use_fuzziness - specify whether or not to parse with fuzziness in
281 feature locations.
282 o feature_cleaner - a class that will be used to provide specialized
283 cleaning-up of feature values.
284 """
def __init__(self, use_fuzziness, feature_cleaner=None):
    """Create the empty SeqRecord and reset all parsing state.

    Arguments:
    o use_fuzziness - Whether feature positions keep their fuzziness.
    o feature_cleaner - Optional object whose clean_value(key, value) is
    applied to qualifier values; None disables cleaning.
    """
    from Bio.SeqRecord import SeqRecord
    _BaseGenBankConsumer.__init__(self)

    record = SeqRecord(None, id=None)
    # The id is explicitly reset to None so later callbacks can tell
    # whether an accession has been seen yet.
    record.id = None
    record.description = ""
    self.data = record

    self._use_fuzziness = use_fuzziness
    self._feature_cleaner = feature_cleaner

    # Sequence information gathered while parsing.
    self._seq_type = ''
    self._seq_data = []
    self._expected_size = None

    # The reference / feature / qualifier currently being built.
    self._cur_reference = None
    self._cur_feature = None
    self._cur_qualifier_key = None
    self._cur_qualifier_value = None
302
def locus(self, locus_name):
    """Use the locus name as the name of the sequence record.
    """
    record = self.data
    record.name = locus_name
307
def size(self, content):
    """Record the declared sequence length as an integer."""
    declared_length = int(content)
    self._expected_size = declared_length
311
313 """Record the sequence type so we can choose an appropriate alphabet.
314 """
315 self._seq_type = type
316
319
320 - def date(self, submit_date):
322
332
334 """Set the accession number as the id of the sequence.
335
336 If we have multiple accession numbers, the first one passed is
337 used.
338 """
339 new_acc_nums = self._split_accessions(acc_num)
340
341
342 try:
343
344 for acc in new_acc_nums:
345
346 if acc not in self.data.annotations['accessions']:
347 self.data.annotations['accessions'].append(acc)
348 except KeyError:
349 self.data.annotations['accessions'] = new_acc_nums
350
351
352 if self.data.id is None:
353 if len(new_acc_nums) > 0:
354
355
356 self.data.id = self.data.annotations['accessions'][0]
357
358 - def wgs(self, content):
360
363
364 - def nid(self, content):
366
367 - def pid(self, content):
369
382
384 """Handle the information from the PROJECT line as a list of projects.
385
386 e.g.
387 PROJECT GenomeProject:28471
388
389 or:
390 PROJECT GenomeProject:13543 GenomeProject:99999
391
392 This is stored as dbxrefs in the SeqRecord to be consistent with the
393 projected switch of this line to DBLINK in future GenBank versions.
394 Note the NCBI plan to replace "GenomeProject:28471" with the shorter
395 "Project:28471" as part of this transition.
396 """
397 content = content.replace("GenomeProject:", "Project:")
398 self.data.dbxrefs.extend([p for p in content.split() if p])
399
401 """Store DBLINK cross references as dbxrefs in our record object.
402
403 This line type is expected to replace the PROJECT line in 2009. e.g.
404
405 During transition:
406
407 PROJECT GenomeProject:28471
408 DBLINK Project:28471
409 Trace Assembly Archive:123456
410
411 Once the project line is dropped:
412
413 DBLINK Project:28471
414 Trace Assembly Archive:123456
415
416 Note GenomeProject -> Project.
417
418 We'll have to see some real examples to be sure, but based on the
419 above example we can expect one reference per line.
420 """
421
422
423 if content.strip() not in self.data.dbxrefs:
424 self.data.dbxrefs.append(content.strip())
425
427 """Set the version to overwrite the id.
428
429 Since the verison provides the same information as the accession
430 number, plus some extra info, we set this as the id if we have
431 a version.
432 """
433
434
435
436
437
438
439
440
441
442
443 assert version.isdigit()
444 self.data.annotations['sequence_version'] = int(version)
445
448
449 - def gi(self, content):
451
454
457
459
460
461 if content == "":
462 source_info = ""
463 elif content[-1] == '.':
464 source_info = content[:-1]
465 else:
466 source_info = content
467 self.data.annotations['source'] = source_info
468
471
480
482 """Signal the beginning of a new reference object.
483 """
484
485
486 if self._cur_reference is not None:
487 self.data.annotations['references'].append(self._cur_reference)
488 else:
489 self.data.annotations['references'] = []
490
491 self._cur_reference = SeqFeature.Reference()
492
494 """Attempt to determine the sequence region the reference entails.
495
496 Possible types of information we may have to deal with:
497
498 (bases 1 to 86436)
499 (sites)
500 (bases 1 to 105654; 110423 to 111122)
501 1 (residues 1 to 182)
502 """
503
504 ref_base_info = content[1:-1]
505
506 all_locations = []
507
508 if ref_base_info.find('bases') != -1 and \
509 ref_base_info.find('to') != -1:
510
511 ref_base_info = ref_base_info[5:]
512 locations = self._split_reference_locations(ref_base_info)
513 all_locations.extend(locations)
514 elif (ref_base_info.find("residues") >= 0 and
515 ref_base_info.find("to") >= 0):
516 residues_start = ref_base_info.find("residues")
517
518 ref_base_info = ref_base_info[(residues_start + len("residues ")):]
519 locations = self._split_reference_locations(ref_base_info)
520 all_locations.extend(locations)
521
522
523
524 elif (ref_base_info == 'sites' or
525 ref_base_info.strip() == 'bases'):
526 pass
527
528 else:
529 raise ValueError("Could not parse base info %s in record %s" %
530 (ref_base_info, self.data.id))
531
532 self._cur_reference.location = all_locations
533
def _split_reference_locations(self, location_string):
    """Get reference locations out of a string of reference information.

    The passed string should be of the form:

        1 to 20; 20 to 100

    Each ';'-separated piece is split on 'to', converted to python
    coordinates and wrapped in a SeqFeature.FeatureLocation.  The list of
    location objects is returned in the order they appear.
    """
    locations = []
    for span in location_string.split(';'):
        first, last = span.split('to')
        py_start, py_end = self._convert_to_python_numbers(
            int(first.strip()), int(last.strip()))
        locations.append(SeqFeature.FeatureLocation(py_start, py_end))
    return locations
556
558 if self._cur_reference.authors:
559 self._cur_reference.authors += ' ' + content
560 else:
561 self._cur_reference.authors = content
562
564 if self._cur_reference.consrtm:
565 self._cur_reference.consrtm += ' ' + content
566 else:
567 self._cur_reference.consrtm = content
568
def title(self, content):
    """Store (or extend) the title of the current reference.

    If no reference is in progress a warning is issued and the content is
    discarded.  Otherwise the content becomes the title, or is appended to
    an existing title separated by a single space.
    """
    reference = self._cur_reference
    if reference is None:
        import warnings
        warnings.warn("GenBank TITLE line without REFERENCE line.")
        return
    if reference.title:
        reference.title = reference.title + ' ' + content
    else:
        reference.title = content
577
579 if self._cur_reference.journal:
580 self._cur_reference.journal += ' ' + content
581 else:
582 self._cur_reference.journal = content
583
586
589
591 """Deal with a reference comment."""
592 if self._cur_reference.comment:
593 self._cur_reference.comment += ' ' + content
594 else:
595 self._cur_reference.comment = content
596
602
604 """Get ready for the feature table when we reach the FEATURE line.
605 """
606 self.start_feature_table()
607
609 """Indicate we've got to the start of the feature table.
610 """
611
612 if self._cur_reference is not None:
613 self.data.annotations['references'].append(self._cur_reference)
614 self._cur_reference = None
615
617 """Utility function to add a feature to the SeqRecord.
618
619 This does all of the appropriate checking to make sure we haven't
620 left any info behind, and that we are only adding info if it
621 exists.
622 """
623 if self._cur_feature:
624
625
626 self._add_qualifier()
627
628 self._cur_qualifier_key = ''
629 self._cur_qualifier_value = ''
630 self.data.features.append(self._cur_feature)
631
645
647 """Parse out location information from the location string.
648
649 This uses a comprehensive but slow spark based parser to do the
650 parsing, and then translates the results of the parse into appropriate
651 Location objects.
652 """
653
654
655
656
657
658
659 location_line = self._clean_location(content)
660
661
662
663
664
665
666 if location_line.find('replace') != -1:
667 comma_pos = location_line.find(',')
668 location_line = location_line[8:comma_pos]
669
670
671 try:
672 parse_info = \
673 LocationParser.parse(LocationParser.scan(location_line))
674
675 except SystemExit:
676 raise LocationParserError(location_line)
677
678
679
680
681 self._set_location_info(parse_info, self._cur_feature)
682
684 """Set the location information based on a function.
685
686 This handles all of the location functions like 'join', 'complement'
687 and 'order'.
688
689 Arguments:
690 o function - A LocationParser.Function object specifying the
691 function we are acting on.
692 o cur_feature - The feature to add information to.
693 """
694 assert isinstance(function, LocationParser.Function), \
695 "Expected a Function object, got %s" % function
696
697 if function.name == "complement":
698
699 cur_feature.strand = -1
700
701 for inner_info in function.args:
702 self._set_location_info(inner_info, cur_feature)
703
704
705
706
707
708
709
710 elif (function.name == "join" or function.name == "order" or
711 function.name == "one-of" or function.name == "bond"):
712 self._set_ordering_info(function, cur_feature)
713 elif (function.name == "gap"):
714 assert len(function.args) == 1, \
715 "Unexpected number of arguments in gap %s" % function.args
716
717 position = self._get_position(function.args[0].local_location)
718 cur_feature.location = SeqFeature.PositionGap(position)
719 else:
720 raise ValueError("Unexpected function name: %s" % function.name)
721
723 """Parse a join or order and all of the information in it.
724
725 This deals with functions that order a bunch of locations,
726 specifically 'join' and 'order'. The inner locations are
727 added as subfeatures of the top level feature
728 """
729
730
731 cur_feature.location_operator = function.name
732 for inner_element in function.args:
733 new_sub_feature = SeqFeature.SeqFeature()
734
735 new_sub_feature.type = cur_feature.type
736
737 new_sub_feature.location_operator = function.name
738
739 new_sub_feature.ref = cur_feature.ref
740 new_sub_feature.ref_db = cur_feature.ref_db
741 new_sub_feature.strand = cur_feature.strand
742
743
744 self._set_location_info(inner_element, new_sub_feature)
745
746
747 cur_feature.sub_features.append(new_sub_feature)
748
749
750
751
752
753
754
755
756 feature_start = cur_feature.sub_features[0].location.start
757 feature_end = cur_feature.sub_features[-1].location.end
758 cur_feature.location = SeqFeature.FeatureLocation(feature_start,
759 feature_end)
760
761
762
763
764
765
766 strands = set(sf.strand for sf in cur_feature.sub_features)
767 if len(strands)==1:
768 cur_feature.strand = cur_feature.sub_features[0].strand
769 else:
770 cur_feature.strand = None
771
773 """Set the location information for a feature from the parse info.
774
775 Arguments:
776 o parse_info - The classes generated by the LocationParser.
777 o cur_feature - The feature to add the information to.
778 """
779
780 if parse_info is None:
781 return
782
783
784 elif isinstance(parse_info, LocationParser.AbsoluteLocation):
785 self._set_location(parse_info, cur_feature)
786 return
787
788 elif isinstance(parse_info, LocationParser.Function):
789 self._set_function(parse_info, cur_feature)
790
791 else:
792 raise ValueError("Could not parse location info: %s"
793 % parse_info)
794
796 """Set the location information for a feature.
797
798 Arguments:
799 o location - An AbsoluteLocation object specifying the info
800 about the location.
801 o cur_feature - The feature to add the information to.
802 """
803
804
805 if location.path is not None:
806 cur_feature.ref = location.path.accession
807 cur_feature.ref_db = location.path.database
808
809 cur_feature.location = self._get_location(location.local_location)
810
812 """Return a (possibly fuzzy) location from a Range object.
813
814 Arguments:
815 o range_info - A location range (ie. something like 67..100). This
816 may also be a single position (ie 27).
817
818 This returns a FeatureLocation object.
819 If parser.use_fuzziness is set at one, the positions for the
820 end points will possibly be fuzzy.
821 """
822 if isinstance(range_info, LocationParser.Between) \
823 and range_info.low.val+1 == range_info.high.val:
824
825
826
827 pos = self._get_position(range_info.low)
828 return SeqFeature.FeatureLocation(pos, pos)
829
830
831
832 elif not(isinstance(range_info, LocationParser.Range)):
833
834 s_pos = self._get_position(range_info)
835
836
837 s_pos.position = s_pos.position - 1
838 e_pos = self._get_position(range_info)
839 return SeqFeature.FeatureLocation(s_pos, e_pos)
840
841 else:
842
843 start_pos = self._get_position(range_info.low)
844 end_pos = self._get_position(range_info.high)
845
846 start_pos.position, end_pos.position = \
847 self._convert_to_python_numbers(start_pos.position,
848 end_pos.position)
849
850
851 if isinstance(start_pos, SeqFeature.OneOfPosition):
852 for p in start_pos.position_choices:
853 p.position -= 1
854
855 return SeqFeature.FeatureLocation(start_pos, end_pos)
856
def _get_position(self, position):
    """Return a (possibly fuzzy) position for a single coordinate.

    Arguments:
    o position - This is a LocationParser.* object that specifies
    a single coordinate.  The type of the object determines which kind
    of SeqFeature position is built.

    This is used with _get_location to parse out a location of any
    end_point of arbitrary fuzziness.

    Returns the position object built for the coordinate, or, when
    fuzziness is switched off, an ExactPosition built from that object's
    position attribute.  Raises ValueError for an unrecognised
    LocationParser object.
    """
    if isinstance(position, LocationParser.Integer):
        final_pos = SeqFeature.ExactPosition(position.val)
    elif isinstance(position, LocationParser.LowBound):
        final_pos = SeqFeature.AfterPosition(position.base.val)
    elif isinstance(position, LocationParser.HighBound):
        final_pos = SeqFeature.BeforePosition(position.base.val)
    elif isinstance(position, LocationParser.Between):
        # Second argument is the distance from the low to the high value.
        final_pos = SeqFeature.BetweenPosition(
            position.low.val, position.high.val - position.low.val)
    elif isinstance(position, LocationParser.TwoBound):
        # Second argument is the distance from the low to the high value.
        final_pos = SeqFeature.WithinPosition(
            position.low.val, position.high.val - position.low.val)
    elif isinstance(position, LocationParser.Function) and \
    position.name == "one-of":
        position_choices = []
        for arg in position.args:
            assert isinstance(arg, LocationParser.AbsoluteLocation), \
                   "Unhandled Location type %r" % arg
            assert arg.path is None, "Unhandled path in location"
            # Use a separate name so the 'position' argument is not
            # rebound while its args are being iterated.
            choice = self._get_position(arg.local_location)
            position_choices.append(choice)
        final_pos = SeqFeature.OneOfPosition(position_choices)
    else:
        raise ValueError("Unexpected LocationParser object %r" %
                         position)

    if self._use_fuzziness:
        return final_pos

    # Fuzziness not wanted: collapse to an exact position.  The coordinate
    # is read from '.position', the attribute the rest of this class reads
    # and writes on these objects (the previous '.location' is not used
    # anywhere else).
    # NOTE(review): confirm against Bio.SeqFeature that every position
    # class built above exposes '.position'.
    return SeqFeature.ExactPosition(final_pos.position)
917
919 """Add a qualifier to the current feature without loss of info.
920
921 If there are multiple qualifier keys with the same name we
922 would lose some info in the dictionary, so we append a unique
923 number to the end of the name in case of conflicts.
924 """
925
926
927 if self._cur_qualifier_key:
928 key = self._cur_qualifier_key
929 value = "".join(self._cur_qualifier_value)
930 if self._feature_cleaner is not None:
931 value = self._feature_cleaner.clean_value(key, value)
932
933 if key in self._cur_feature.qualifiers:
934 self._cur_feature.qualifiers[key].append(value)
935
936 else:
937 self._cur_feature.qualifiers[key] = [value]
938
940 """When we get a qualifier key, use it as a dictionary key.
941
942 We receive a list of keys, since you can have valueless keys such as
943 /pseudo which would be passed in with the next key (since no other
944 tags separate them in the file)
945 """
946 for content in content_list:
947
948 self._add_qualifier()
949
950
951 qual_key = content.replace('/', '')
952 qual_key = qual_key.replace('=', '')
953 qual_key = qual_key.strip()
954
955 self._cur_qualifier_key = qual_key
956 self._cur_qualifier_value = []
957
959
960 qual_value = content.replace('"', '')
961
962 self._cur_qualifier_value.append(qual_value)
963
965 """Deal with CONTIG information."""
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981 self.data.annotations["contig"] = content
982
985
988
991
993 """Add up sequence information as we get it.
994
995 To try and make things speedier, this puts all of the strings
996 into a list of strings, and then uses string.join later to put
997 them together. Supposedly, this is a big time savings
998 """
999 new_seq = content.replace(' ', '')
1000 new_seq = new_seq.upper()
1001
1002 self._seq_data.append(new_seq)
1003
def record_end(self, content):
    """Clean up when we've finished the record.

    This finalises the record id, flushes the last feature, and builds the
    sequence object with an alphabet chosen from the recorded sequence
    type.  The content argument is not used.

    Raises ValueError if the recorded sequence type is not recognised.
    """
    from Bio import Alphabet
    from Bio.Alphabet import IUPAC
    from Bio.Seq import Seq, UnknownSeq

    # Settle the record id: fall back to the record name when no id was
    # set, otherwise append the sequence version if the id has none yet.
    if self.data.id is None:
        assert 'accessions' not in self.data.annotations, \
               self.data.annotations['accessions']
        self.data.id = self.data.name
    elif self.data.id.count('.') == 0:
        try:
            self.data.id += '.%i' % self.data.annotations['sequence_version']
        except KeyError:
            # No version was recorded; leave the id as it is.
            pass

    # Make sure the feature still in progress is stored.
    self._add_feature()

    # Generic alphabet unless the sequence type says otherwise.
    seq_alphabet = Alphabet.generic_alphabet

    sequence = "".join(self._seq_data)

    # Only complain about a length mismatch when a sequence is present.
    if self._expected_size is not None \
    and len(sequence) != 0 \
    and self._expected_size != len(sequence):
        import warnings
        warnings.warn("Expected sequence length %i, found %i (%s)." \
                      % (self._expected_size, len(sequence), self.data.id))

    if self._seq_type:
        if 'DNA' in self._seq_type or 'mRNA' in self._seq_type:
            seq_alphabet = IUPAC.ambiguous_dna
        elif 'RNA' in self._seq_type:
            # A sequence typed as RNA but written with T and no U is
            # given the DNA alphabet.
            if "T" in sequence and "U" not in sequence:
                seq_alphabet = IUPAC.ambiguous_dna
            else:
                seq_alphabet = IUPAC.ambiguous_rna
        elif 'PROTEIN' in self._seq_type:
            seq_alphabet = IUPAC.protein
        elif self._seq_type in ["circular", "linear"]:
            # Only the topology was recorded; keep the generic alphabet.
            pass
        else:
            raise ValueError("Could not determine alphabet for seq_type %s"
                             % self._seq_type)

    # Use the single-underscore attribute set elsewhere in this class; a
    # double-underscore name here would be name-mangled and never found.
    if not sequence and self._expected_size:
        # No sequence data, but a declared length: keep the length.
        self.data.seq = UnknownSeq(self._expected_size, seq_alphabet)
    else:
        self.data.seq = Seq(sequence, seq_alphabet)
1069
1071 """Create a GenBank Record object from scanner generated information.
1072 """
1082
1083 - def wgs(self, content):
1085
1088
1089 - def locus(self, content):
1091
1092 - def size(self, content):
1094
1097
1100
1101 - def date(self, content):
1103
1106
1111
1112 - def nid(self, content):
1114
1115 - def pid(self, content):
1117
1120
1123
1124 - def gi(self, content):
1126
1129
1132
1135
1138
1141
1144
1147
1149 """Grab the reference number and signal the start of a new reference.
1150 """
1151
1152 if self._cur_reference is not None:
1153 self.data.references.append(self._cur_reference)
1154
1155 import Record
1156 self._cur_reference = Record.Reference()
1157 self._cur_reference.number = content
1158
1160 self._cur_reference.bases = content
1161
1163 self._cur_reference.authors = content
1164
1166 self._cur_reference.consrtm = content
1167
def title(self, content):
    """Set the title of the current reference.

    If no reference is in progress a warning is issued and the content is
    discarded; otherwise the content replaces the reference's title.
    """
    reference = self._cur_reference
    if reference is not None:
        reference.title = content
    else:
        import warnings
        warnings.warn("GenBank TITLE line without REFERENCE line.")
1174
1176 self._cur_reference.journal = content
1177
1180
1183
1185 self._cur_reference.remark = content
1186
1189
1193
1196
1198 """Get ready for the feature table when we reach the FEATURE line.
1199 """
1200 self.start_feature_table()
1201
1203 """Signal the start of the feature table.
1204 """
1205
1206 if self._cur_reference is not None:
1207 self.data.references.append(self._cur_reference)
1208
1210 """Grab the key of the feature and signal the start of a new feature.
1211 """
1212
1213 self._add_feature()
1214
1215 import Record
1216 self._cur_feature = Record.Feature()
1217 self._cur_feature.key = content
1218
1220 """Utility function to add a feature to the Record.
1221
1222 This does all of the appropriate checking to make sure we haven't
1223 left any info behind, and that we are only adding info if it
1224 exists.
1225 """
1226 if self._cur_feature is not None:
1227
1228
1229 if self._cur_qualifier is not None:
1230 self._cur_feature.qualifiers.append(self._cur_qualifier)
1231
1232 self._cur_qualifier = None
1233 self.data.features.append(self._cur_feature)
1234
1237
1239 """Deal with qualifier names
1240
1241 We receive a list of keys, since you can have valueless keys such as
1242 /pseudo which would be passed in with the next key (since no other
1243 tags separate them in the file)
1244 """
1245 import Record
1246 for content in content_list:
1247
1248 if content.find("/") != 0:
1249 content = "/%s" % content
1250
1251 if self._cur_qualifier is not None:
1252 self._cur_feature.qualifiers.append(self._cur_qualifier)
1253
1254 self._cur_qualifier = Record.Qualifier()
1255 self._cur_qualifier.key = content
1256
1268
1270 self.data.base_counts = content
1271
1273 self.data.origin = content
1274
1276 """Signal that we have contig information to add to the record.
1277 """
1278 self.data.contig = self._clean_location(content)
1279
1281 """Add sequence information to a list of sequence strings.
1282
1283 This removes spaces in the data and uppercases the sequence, and
1284 then adds it to a list of sequences. Later on we'll join this
1285 list together to make the final sequence. This is faster than
1286 adding on the new string every time.
1287 """
1288 new_seq = content.replace(' ', '')
1289 self._seq_data.append(new_seq.upper())
1290
1292 """Signal the end of the record and do any necessary clean-up.
1293 """
1294
1295
1296 self.data.sequence = "".join(self._seq_data)
1297
1298 self._add_feature()
1299