1
2
3
4
5
6
7 """Bio.SeqIO support for the "genbank" and "embl" file formats.
8
9 You are expected to use this module via the Bio.SeqIO functions.
10 Note that internally this module calls Bio.GenBank to do the actual
11 parsing of both GenBank and EMBL files.
12
13 See also:
14
15 International Nucleotide Sequence Database Collaboration
16 http://www.insdc.org/
17
18 GenBank
19 http://www.ncbi.nlm.nih.gov/Genbank/
20
21 EMBL Nucleotide Sequence Database
22 http://www.ebi.ac.uk/embl/
23
24 DDBJ (DNA Data Bank of Japan)
25 http://www.ddbj.nig.ac.jp/
26 """
27
28 from Bio.Seq import UnknownSeq
29 from Bio.GenBank.Scanner import GenBankScanner, EmblScanner
30 from Bio import Alphabet
31 from Interfaces import SequentialSequenceWriter
32 from Bio import SeqFeature
33
34
35
36
37
38
39
40
41
43 """Breaks up a Genbank file into SeqRecord objects.
44
45 Every section from the LOCUS line to the terminating // becomes
46 a single SeqRecord with associated annotation and features.
47
48 Note that for genomes or chromosomes, there is typically only
49 one record."""
50
51 return GenBankScanner(debug=0).parse_records(handle)
52
54 """Breaks up an EMBL file into SeqRecord objects.
55
56 Every section from the LOCUS line to the terminating // becomes
57 a single SeqRecord with associated annotation and features.
58
59 Note that for genomes or chromosomes, there is typically only
60 one record."""
61
62 return EmblScanner(debug=0).parse_records(handle)
63
65 """Breaks up a Genbank file into SeqRecord objects for each CDS feature.
66
67 Every section from the LOCUS line to the terminating // can contain
68 many CDS features. These are returned as with the stated amino acid
69 translation sequence (if given).
70 """
71
72 return GenBankScanner(debug=0).parse_cds_features(handle, alphabet)
73
75 """Breaks up a EMBL file into SeqRecord objects for each CDS feature.
76
77 Every section from the LOCUS line to the terminating // can contain
78 many CDS features. These are returned as with the stated amino acid
79 translation sequence (if given).
80 """
81
82 return EmblScanner(debug=0).parse_cds_features(handle, alphabet)
83
85 """Build a GenBank/EMBL position string (PRIVATE).
86
87 Use offset=1 to add one to convert a start position from python counting.
88 """
89 if isinstance(pos, SeqFeature.ExactPosition):
90 return "%i" % (pos.position+offset)
91 elif isinstance(pos, SeqFeature.WithinPosition):
92 return "(%i.%i)" % (pos.position + offset,
93 pos.position + pos.extension + offset)
94 elif isinstance(pos, SeqFeature.BetweenPosition):
95 return "(%i^%i)" % (pos.position + offset,
96 pos.position + pos.extension + offset)
97 elif isinstance(pos, SeqFeature.BeforePosition):
98 return "<%i" % (pos.position + offset)
99 elif isinstance(pos, SeqFeature.AfterPosition):
100 return ">%i" % (pos.position + offset)
101 elif isinstance(pos, SeqFeature.OneOfPosition):
102 return "one-of(%s)" \
103 % ",".join([_insdc_feature_position_string(p,offset) \
104 for p in pos.position_choices])
105 elif isinstance(pos, SeqFeature.AbstractPosition):
106 raise NotImplementedError("Please report this as a bug in Biopython.")
107 else:
108 raise ValueError("Expected a SeqFeature position object.")
109
110
136
138 """Build a GenBank/EMBL location string from a SeqFeature (PRIVATE).
139
140 There is a choice of how to show joins on the reverse complement strand,
141 GenBank used "complement(join(1,10),(20,100))" while EMBL used to use
142 "join(complement(20,100),complement(1,10))" instead (but appears to have
143 now adopted the GenBank convention). Notice that the order of the entries
144 is reversed! This function therefore uses the first form. In this situation
145 we expect the parent feature and the two children to all be marked as
146 strand == -1, and in the order 0:10 then 19:100.
147
148 Also need to consider dual-strand examples like these from the Arabidopsis
149 thaliana chloroplast NC_000932: join(complement(69611..69724),139856..140650)
150 gene ArthCp047, GeneID:844801 or its CDS (protein NP_051038.1 GI:7525057)
151 which is further complicated by a splice:
152 join(complement(69611..69724),139856..140087,140625..140650)
153
154 For mixed this mixed strand feature, the parent SeqFeature should have
155 no strand (either 0 or None) while the child features should have either
156 strand +1 or -1 as appropriate, and be listed in the order given here.
157 """
158
159 if not feature.sub_features:
160
161
162
163
164 location = _insdc_location_string_ignoring_strand_and_subfeatures(feature)
165 if feature.strand == -1:
166 location = "complement(%s)" % location
167 return location
168
169 if feature.strand == -1:
170 for f in feature.sub_features:
171 assert f.strand == -1
172 return "complement(%s(%s))" \
173 % (feature.location_operator,
174 ",".join(_insdc_location_string_ignoring_strand_and_subfeatures(f) \
175 for f in feature.sub_features))
176
177
178
179
180 assert feature.location_operator != ""
181 return "%s(%s)" % (feature.location_operator,
182 ",".join([_insdc_feature_location_string(f) \
183 for f in feature.sub_features]))
184
185
187 """Base class for GenBank and EMBL writers (PRIVATE)."""
188 MAX_WIDTH = 80
189 QUALIFIER_INDENT = 21
190 QUALIFIER_INDENT_STR = " "*QUALIFIER_INDENT
191 QUALIFIER_INDENT_TMP = " %s "
192
194 if not value:
195 self.handle.write("%s/%s\n" % (self.QUALIFIER_INDENT_STR, key))
196 return
197
198
199 if quote is None:
200
201 if isinstance(value, int) or isinstance(value, long):
202 quote = False
203 else:
204 quote = True
205 if quote:
206 line = '%s/%s="%s"' % (self.QUALIFIER_INDENT_STR, key, value)
207 else:
208 line = '%s/%s=%s' % (self.QUALIFIER_INDENT_STR, key, value)
209 if len(line) <= self.MAX_WIDTH:
210 self.handle.write(line+"\n")
211 return
212 while line.lstrip():
213 if len(line) <= self.MAX_WIDTH:
214 self.handle.write(line+"\n")
215 return
216
217 for index in range(min(len(line)-1, self.MAX_WIDTH),
218 self.QUALIFIER_INDENT+1,-1):
219 if line[index] == " " : break
220 if line[index] != " ":
221
222 index = self.MAX_WIDTH
223 assert index <= self.MAX_WIDTH
224 self.handle.write(line[:index] + "\n")
225 line = self.QUALIFIER_INDENT_STR + line[index:].lstrip()
226
241
260
262 """Get an annotation dictionary entry (as a string).
263
264 Some entries are lists, in which case if just_first=True the first entry
265 is returned. If just_first=False (default) this verifies there is only
266 one entry before returning it."""
267 try:
268 answer = record.annotations[key]
269 except KeyError:
270 return default
271 if isinstance(answer, list):
272 if not just_first : assert len(answer) == 1
273 return str(answer[0])
274 else:
275 return str(answer)
276
278 "Returns a list of strings."""
279
280 text = text.strip()
281 if len(text) < max_len:
282 return [text]
283
284 words = text.split()
285 assert max([len(w) for w in words]) < max_len, \
286 "Your description cannot be broken into nice lines!:\n%s" \
287 % repr(text)
288 text = ""
289 while words and len(text) + 1 + len(words[0]) < max_len:
290 text += " " + words.pop(0)
291 text = text.strip()
292 assert len(text) < max_len
293 answer = [text]
294 while words:
295 text = ""
296 while words and len(text) + 1 + len(words[0]) < max_len:
297 text += " " + words.pop(0)
298 text = text.strip()
299 assert len(text) < max_len
300 answer.append(text)
301 assert not words
302 return answer
303
305 "Returns a list of strings, splits on commas."""
306
307
308
309 contig = record.annotations.get("contig", "")
310 if isinstance(contig, list) or isinstance(contig, tuple):
311 contig = "".join(contig)
312 contig = self.clean(contig)
313 i = 0
314 answer = []
315 while contig:
316 if len(contig) > max_len:
317
318 pos = contig[:max_len-1].rfind(",")
319 if pos == -1:
320 raise ValueError("Could not break up CONTIG")
321 text, contig = contig[:pos+1], contig[pos+1:]
322 else:
323 text, contig = contig, ""
324 answer.append(text)
325 return answer
326
328 HEADER_WIDTH = 12
329 QUALIFIER_INDENT = 21
330
338
348
357
359 default = "01-JAN-1980"
360 try :
361 date = record.annotations["date"]
362 except KeyError :
363 return default
364
365 if isinstance(date, list) and len(date)==1 :
366 date = date[0]
367
368 if not isinstance(date, str) or len(date) != 11 \
369 or date[2] != "-" or date[6] != "-" \
370 or not date[:2].isdigit() or not date[7:].isdigit() \
371 or int(date[:2]) > 31 \
372 or date[3:6] not in ["JAN", "FEB", "MAR", "APR", "MAY", "JUN",
373 "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"] :
374
375 return default
376 return date
377
379 try:
380 division = record.annotations["data_file_division"]
381 except KeyError:
382 division = "UNK"
383 if division in ["PRI", "ROD", "MAM", "VRT", "INV", "PLN", "BCT",
384 "VRL", "PHG", "SYN", "UNA", "EST", "PAT", "STS",
385 "GSS", "HTG", "HTC", "ENV", "CON"]:
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408 pass
409 else:
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430 embl_to_gbk = {"FUN":"PLN",
431 "HUM":"PRI",
432 "MUS":"ROD",
433 "PRO":"BCT",
434 "UNC":"UNK",
435 "XXX":"UNK",
436 }
437 try:
438 division = embl_to_gbk[division]
439 except KeyError:
440 division = "UNK"
441 assert len(division)==3
442 return division
443
445 """Write the LOCUS line."""
446
447 locus = record.name
448 if not locus or locus == "<unknown name>":
449 locus = record.id
450 if not locus or locus == "<unknown id>":
451 locus = self._get_annotation_str(record, "accession", just_first=True)
452 if len(locus) > 16:
453 raise ValueError("Locus identifier %s is too long" % repr(locus))
454
455 if len(record) > 99999999999:
456
457
458 raise ValueError("Sequence too long!")
459
460
461 a = Alphabet._get_base_alphabet(record.seq.alphabet)
462 if not isinstance(a, Alphabet.Alphabet):
463 raise TypeError("Invalid alphabet")
464 elif isinstance(a, Alphabet.ProteinAlphabet):
465 units = "aa"
466 elif isinstance(a, Alphabet.NucleotideAlphabet):
467 units = "bp"
468 else:
469
470
471 raise ValueError("Need a Nucleotide or Protein alphabet")
472
473
474
475 if isinstance(a, Alphabet.ProteinAlphabet):
476 mol_type = ""
477 elif isinstance(a, Alphabet.DNAAlphabet):
478 mol_type = "DNA"
479 elif isinstance(a, Alphabet.RNAAlphabet):
480 mol_type = "RNA"
481 else:
482
483
484 raise ValueError("Need a DNA, RNA or Protein alphabet")
485
486 division = self._get_data_division(record)
487
488 assert len(units) == 2
489 assert len(division) == 3
490
491
492 line = "LOCUS %s %s %s %s %s %s\n" \
493 % (locus.ljust(16),
494 str(len(record)).rjust(11),
495 units,
496 mol_type.ljust(6),
497 division,
498 self._get_date(record))
499 assert len(line) == 79+1, repr(line)
500
501 assert line[12:28].rstrip() == locus, \
502 'LOCUS line does not contain the locus at the expected position:\n' + line
503 assert line[28:29] == " "
504 assert line[29:40].lstrip() == str(len(record)), \
505 'LOCUS line does not contain the length at the expected position:\n' + line
506
507
508 assert line[40:44] in [' bp ', ' aa '] , \
509 'LOCUS line does not contain size units at expected position:\n' + line
510 assert line[44:47] in [' ', 'ss-', 'ds-', 'ms-'], \
511 'LOCUS line does not have valid strand type (Single stranded, ...):\n' + line
512 assert line[47:54].strip() == "" \
513 or line[47:54].strip().find('DNA') != -1 \
514 or line[47:54].strip().find('RNA') != -1, \
515 'LOCUS line does not contain valid sequence type (DNA, RNA, ...):\n' + line
516 assert line[54:55] == ' ', \
517 'LOCUS line does not contain space at position 55:\n' + line
518 assert line[55:63].strip() in ['', 'linear', 'circular'], \
519 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line
520 assert line[63:64] == ' ', \
521 'LOCUS line does not contain space at position 64:\n' + line
522 assert line[67:68] == ' ', \
523 'LOCUS line does not contain space at position 68:\n' + line
524 assert line[70:71] == '-', \
525 'LOCUS line does not contain - at position 71 in date:\n' + line
526 assert line[74:75] == '-', \
527 'LOCUS line does not contain - at position 75 in date:\n' + line
528
529 self.handle.write(line)
530
532 number = 0
533 for ref in record.annotations["references"]:
534 if not isinstance(ref, SeqFeature.Reference):
535 continue
536 number += 1
537 data = str(number)
538
539 if ref.location and len(ref.location)==1:
540 a = Alphabet._get_base_alphabet(record.seq.alphabet)
541 if isinstance(a, Alphabet.ProteinAlphabet):
542 units = "residues"
543 else:
544 units = "bases"
545 data += " (%s %i to %i)" % (units,
546 ref.location[0].nofuzzy_start+1,
547 ref.location[0].nofuzzy_end)
548 self._write_single_line("REFERENCE", data)
549 if ref.authors:
550
551 self._write_multi_line(" AUTHORS", ref.authors)
552 if ref.consrtm:
553
554 self._write_multi_line(" CONSRTM", ref.consrtm)
555 if ref.title:
556
557 self._write_multi_line(" TITLE", ref.title)
558 if ref.journal:
559
560
561 self._write_multi_line(" JOURNAL", ref.journal)
562 if ref.medline_id:
563
564
565
566 self._write_multi_line(" MEDLINE", ref.medline_id)
567 if ref.pubmed_id:
568
569 self._write_multi_line(" PUBMED", ref.pubmed_id)
570 if ref.comment:
571 self._write_multi_line(" REMARK", ref.comment)
572
573
591
598
624
626 """Write a single record to the output file."""
627 handle = self.handle
628 self._write_the_first_line(record)
629
630 accession = self._get_annotation_str(record, "accession",
631 record.id.split(".", 1)[0],
632 just_first=True)
633 acc_with_version = accession
634 if record.id.startswith(accession+"."):
635 try:
636 acc_with_version = "%s.%i" \
637 % (accession,
638 int(record.id.split(".", 1)[1]))
639 except ValueError:
640 pass
641 gi = self._get_annotation_str(record, "gi", just_first=True)
642
643 descr = record.description
644 if descr == "<unknown description>" : descr = "."
645 self._write_multi_line("DEFINITION", descr)
646
647 self._write_single_line("ACCESSION", accession)
648 if gi != ".":
649 self._write_single_line("VERSION", "%s GI:%s" \
650 % (acc_with_version, gi))
651 else:
652 self._write_single_line("VERSION", "%s" % (acc_with_version))
653
654
655
656
657 self._write_multi_entries("DBLINK", record.dbxrefs)
658
659 try:
660
661
662 keywords = "; ".join(record.annotations["keywords"])
663
664 if not keywords.endswith(".") :
665 keywords += "."
666 except KeyError:
667
668 keywords = "."
669 self._write_multi_line("KEYWORDS", keywords)
670
671 if "segment" in record.annotations:
672
673
674 segment = record.annotations["segment"]
675 if isinstance(segment, list):
676 assert len(segment)==1, segment
677 segment = segment[0]
678 self._write_single_line("SEGMENT", segment)
679
680 self._write_multi_line("SOURCE", \
681 self._get_annotation_str(record, "source"))
682
683 org = self._get_annotation_str(record, "organism")
684 if len(org) > self.MAX_WIDTH - self.HEADER_WIDTH:
685 org = org[:self.MAX_WIDTH - self.HEADER_WIDTH-4]+"..."
686 self._write_single_line(" ORGANISM", org)
687 try:
688
689
690 taxonomy = "; ".join(record.annotations["taxonomy"])
691
692 if not taxonomy.endswith(".") :
693 taxonomy += "."
694 except KeyError:
695 taxonomy = "."
696 self._write_multi_line("", taxonomy)
697
698 if "references" in record.annotations:
699 self._write_references(record)
700
701 if "comment" in record.annotations:
702 self._write_comment(record)
703
704 handle.write("FEATURES Location/Qualifiers\n")
705 for feature in record.features:
706 self._write_feature(feature)
707 self._write_sequence(record)
708 handle.write("//\n")
709
711 HEADER_WIDTH = 5
712 QUALIFIER_INDENT = 21
713 QUALIFIER_INDENT_STR = "FT" + " "*(QUALIFIER_INDENT-2)
714 QUALIFIER_INDENT_TMP = "FT %s "
715
721
768
770 assert len(tag)==2
771 line = tag+" "+text
772 assert len(line) <= self.MAX_WIDTH, line
773 self.handle.write(line+"\n")
774
780
782 """Write the ID and AC lines."""
783 if "." in record.id and record.id.rsplit(".", 1)[1].isdigit():
784 version = "SV " + record.id.rsplit(".", 1)[1]
785 accession = self._get_annotation_str(record, "accession",
786 record.id.rsplit(".", 1)[0],
787 just_first=True)
788 else :
789 version = ""
790 accession = self._get_annotation_str(record, "accession",
791 record.id,
792 just_first=True)
793
794 if ";" in accession :
795 raise ValueError("Cannot have semi-colon in EMBL accession, %s" \
796 % repr(accession))
797 if " " in accession :
798
799 raise ValueError("Cannot have spaces in EMBL accession, %s" \
800 % repr(accession))
801
802
803
804
805 a = Alphabet._get_base_alphabet(record.seq.alphabet)
806 if not isinstance(a, Alphabet.Alphabet):
807 raise TypeError("Invalid alphabet")
808 elif not isinstance(a, Alphabet.NucleotideAlphabet):
809 raise ValueError("Need a Nucleotide alphabet")
810 elif isinstance(a, Alphabet.DNAAlphabet):
811 mol_type = "DNA"
812 elif isinstance(a, Alphabet.RNAAlphabet):
813 mol_type = "RNA"
814 else:
815
816 raise ValueError("Need a DNA or RNA alphabet")
817
818
819 division = self._get_data_division(record)
820
821
822 handle = self.handle
823
824
825
826
827
828
829
830
831 self._write_single_line("ID", "%s; %s; ; %s; ; %s; %i BP." \
832 % (accession, version, mol_type,
833 division, len(record)))
834 handle.write("XX\n")
835 self._write_single_line("AC", accession+";")
836 handle.write("XX\n")
837
839 try:
840 division = record.annotations["data_file_division"]
841 except KeyError:
842 division = "UNC"
843 if division in ["PHG", "ENV", "FUN", "HUM", "INV", "MAM", "VRT",
844 "MUS", "PLN", "PRO", "ROD", "SYN", "TGN", "UNC",
845 "VRL", "XXX"]:
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866 pass
867 else:
868
869
870
871
872
873 gbk_to_embl = {"BCT":"PRO",
874 "UNK":"UNC",
875 }
876 try:
877 division = gbk_to_embl[division]
878 except KeyError:
879 division = "UNC"
880 assert len(division)==3
881 return division
882
912
932
975
976
977 if __name__ == "__main__":
978 print "Quick self test"
979 import os
980 from StringIO import StringIO
981
1005
1007 """Check two lists of SeqRecords agree, raises a ValueError if mismatch."""
1008 if len(old_list) != len(new_list):
1009 raise ValueError("%i vs %i records" % (len(old_list), len(new_list)))
1010 for old, new in zip(old_list, new_list):
1011 if not compare_record(old, new):
1012 return False
1013 return True
1014
1016 """Check two SeqFeatures agree."""
1017 if old.type != new.type:
1018 raise ValueError("Type %s versus %s" % (old.type, new.type))
1019 if old.location.nofuzzy_start != new.location.nofuzzy_start \
1020 or old.location.nofuzzy_end != new.location.nofuzzy_end:
1021 raise ValueError("%s versus %s:\n%s\nvs:\n%s" \
1022 % (old.location, new.location, str(old), str(new)))
1023 if old.strand != new.strand:
1024 raise ValueError("Different strand:\n%s\nvs:\n%s" % (str(old), str(new)))
1025 if old.location.start != new.location.start:
1026 raise ValueError("Start %s versus %s:\n%s\nvs:\n%s" \
1027 % (old.location.start, new.location.start, str(old), str(new)))
1028 if old.location.end != new.location.end:
1029 raise ValueError("End %s versus %s:\n%s\nvs:\n%s" \
1030 % (old.location.end, new.location.end, str(old), str(new)))
1031 if not ignore_sub_features:
1032 if len(old.sub_features) != len(new.sub_features):
1033 raise ValueError("Different sub features")
1034 for a, b in zip(old.sub_features, new.sub_features):
1035 if not compare_feature(a, b):
1036 return False
1037
1038
1039
1040 for key in set(old.qualifiers.keys()).intersection(new.qualifiers.keys()):
1041 if key in ["db_xref", "protein_id", "product", "note"]:
1042
1043 continue
1044 if old.qualifiers[key] != new.qualifiers[key]:
1045 raise ValueError("Qualifier mis-match for %s:\n%s\n%s" \
1046 % (key, old.qualifiers[key], new.qualifiers[key]))
1047 return True
1048
1050 """Check two lists of SeqFeatures agree, raises a ValueError if mismatch."""
1051 if len(old_list) != len(new_list):
1052 raise ValueError("%i vs %i features" % (len(old_list), len(new_list)))
1053 for old, new in zip(old_list, new_list):
1054
1055 if not compare_feature(old, new, ignore_sub_features):
1056 return False
1057 return True
1058
1066
1078
1079 for filename in os.listdir("../../Tests/GenBank"):
1080 if not filename.endswith(".gbk") and not filename.endswith(".gb"):
1081 continue
1082 print filename
1083
1084 handle = open("../../Tests/GenBank/%s" % filename)
1085 records = list(GenBankIterator(handle))
1086 handle.close()
1087
1088 check_genbank_writer(records)
1089 check_embl_writer(records)
1090
1091 for filename in os.listdir("../../Tests/EMBL"):
1092 if not filename.endswith(".embl"):
1093 continue
1094 print filename
1095
1096 handle = open("../../Tests/EMBL/%s" % filename)
1097 records = list(EmblIterator(handle))
1098 handle.close()
1099
1100 check_genbank_writer(records)
1101 check_embl_writer(records)
1102
1103 from Bio import SeqIO
1104 for filename in os.listdir("../../Tests/SwissProt"):
1105 if not filename.startswith("sp"):
1106 continue
1107 print filename
1108
1109 handle = open("../../Tests/SwissProt/%s" % filename)
1110 records = list(SeqIO.parse(handle, "swiss"))
1111 handle.close()
1112
1113 check_genbank_writer(records)
1114