1
2
3
4
5
6
7 """Bio.SeqIO support for the "ace" file format.
8
9 You are expected to use this module via the Bio.SeqIO functions.
10 See also the Bio.Sequencing.Ace module which offers more than just accessing
11 the contig consensus sequences in an ACE file as SeqRecord objects.
12 """
13
14 from Bio.Seq import Seq
15 from Bio.SeqRecord import SeqRecord
16 from Bio.Alphabet import generic_nucleotide, generic_dna, generic_rna, Gapped
17 from Bio.Sequencing import Ace
18
19
21 """Returns SeqRecord objects from an ACE file.
22
23 This uses the Bio.Sequencing.Ace module to do the hard work. Note that
24 by iterating over the file in a single pass, we are forced to ignore any
25 WA, CT, RT or WR footer tags.
26
27 Ace files include the base quality for each position, which are taken
28 to be PHRED style scores. Just as if you had read in a FASTQ or QUAL file
29 using PHRED scores using Bio.SeqIO, these are stored in the SeqRecord's
30 letter_annotations dictionary under the "phred_quality" key.
31
32 >>> from Bio import SeqIO
33 >>> handle = open("Ace/consed_sample.ace", "rU")
34 >>> for record in SeqIO.parse(handle, "ace"):
35 ... print record.id, record.seq[:10]+"...", len(record)
36 ... print max(record.letter_annotations["phred_quality"])
37 Contig1 agccccgggc... 1475
38 90
39
40 However, ACE files do not include a base quality for any gaps in the
41 consensus sequence, and these are represented in Biopython with a quality
42 of None. Using zero would be misleading as there may be very strong
43 evidence to support the gap in the consensus.
44
45 >>> from Bio import SeqIO
46 >>> handle = open("Ace/contig1.ace", "rU")
47 >>> for record in SeqIO.parse(handle, "ace"):
48 ... print record.id, "..." + record.seq[85:95]+"..."
49 ... print record.letter_annotations["phred_quality"][85:95]
50 ... print max(record.letter_annotations["phred_quality"])
51 Contig1 ...AGAGG-ATGC...
52 [57, 57, 54, 57, 57, None, 57, 72, 72, 72]
53 90
54 Contig2 ...GAATTACTAT...
55 [68, 68, 68, 68, 68, 68, 68, 68, 68, 68]
56 90
57
58 """
59 for ace_contig in Ace.parse(handle):
60
61 consensus_seq_str = ace_contig.sequence
62
63 if "U" in consensus_seq_str:
64 if "T" in consensus_seq_str:
65
66 alpha = generic_nucleotide
67 else:
68 alpha = generic_rna
69 else:
70 alpha = generic_dna
71
72 if "*" in consensus_seq_str:
73
74
75 assert "-" not in consensus_seq_str
76 consensus_seq = Seq(consensus_seq_str.replace("*","-"),
77 Gapped(alpha, gap_char="-"))
78 else:
79 consensus_seq = Seq(consensus_seq_str, alpha)
80
81
82
83
84
85
86
87
88 seq_record = SeqRecord(consensus_seq,
89 id = ace_contig.name,
90 name = ace_contig.name)
91
92
93
94
95
96 quals = []
97 i = 0
98 for base in consensus_seq:
99 if base == "-":
100 quals.append(None)
101 else:
102 quals.append(ace_contig.quality[i])
103 i += 1
104 assert i == len(ace_contig.quality)
105 seq_record.letter_annotations["phred_quality"] = quals
106
107 yield seq_record
108
109
111 """Run the Bio.SeqIO module's doctests.
112
113 This will try and locate the unit tests directory, and run the doctests
114 from there in order that the relative paths used in the examples work.
115 """
116 import doctest
117 import os
118 if os.path.isdir(os.path.join("..", "..", "Tests", "Ace")):
119 print "Runing doctests..."
120 cur_dir = os.path.abspath(os.curdir)
121 os.chdir(os.path.join("..", "..", "Tests"))
122 assert os.path.isfile("Ace/consed_sample.ace")
123 doctest.testmod()
124 os.chdir(cur_dir)
125 del cur_dir
126 print "Done"
127 elif os.path.isdir(os.path.join("Tests", "Ace")):
128 print "Runing doctests..."
129 cur_dir = os.path.abspath(os.curdir)
130 os.chdir(os.path.join("Tests"))
131 doctest.testmod()
132 os.chdir(cur_dir)
133 del cur_dir
134 print "Done"
135
136 if __name__ == "__main__":
137 _test()
138