1
2
3
4
5 """
6 AlignIO support for the "phylip" format used in Joe Felsenstein's PHYLIP tools.
7
8 You are expected to use this module via the Bio.AlignIO functions (or the
9 Bio.SeqIO functions if you want to work directly with the gapped sequences).
10
11 Note
12 ====
13 In TREE_PUZZLE (Schmidt et al. 2003) and PHYML (Guindon and Gascuel 2003)
14 a dot/period (".") in a sequence is interpreted as meaning the same
15 character as in the first sequence. The PHYLIP 3.6 documentation says:
16
17 "a period was also previously allowed but it is no longer allowed,
18 because it sometimes is used in different senses in other programs"
19
20 At the time of writing, we do nothing special with a dot/period.
21 """
22
23 from Bio.Alphabet import single_letter_alphabet
24 from Bio.Align import MultipleSeqAlignment
25 from Interfaces import AlignmentIterator, SequentialAlignmentWriter
26
28 """Phylip alignment writer."""
30 """Use this to write (another) single alignment to an open file.
31
32 This code will write interlaced alignments (when the sequences are
33 longer than 50 characters).
34
35 Note that record identifiers are strictly truncated at 10 characters.
36
37 For more information on the file format, please see:
38 http://evolution.genetics.washington.edu/phylip/doc/sequence.html
39 http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles
40 """
41 truncate=10
42 handle = self.handle
43
44 if len(alignment)==0:
45 raise ValueError("Must have at least one sequence")
46 length_of_seqs = alignment.get_alignment_length()
47 for record in alignment:
48 if length_of_seqs != len(record.seq):
49 raise ValueError("Sequences must all be the same length")
50 if length_of_seqs <= 0:
51 raise ValueError("Non-empty sequences are required")
52
53 if len(alignment) > len(set([r.id[:truncate] for r in alignment])):
54 raise ValueError("Repeated identifier, possibly due to truncation")
55
56
57
58
59
60
61
62 handle.write(" %i %s\n" % (len(alignment), length_of_seqs))
63 block=0
64 while True:
65 for record in alignment:
66 if block==0:
67
68 """
69 Quoting the PHYLIP version 3.6 documentation:
70
71 The name should be ten characters in length, filled out to
72 the full ten characters by blanks if shorter. Any printable
73 ASCII/ISO character is allowed in the name, except for
74 parentheses ("(" and ")"), square brackets ("[" and "]"),
75 colon (":"), semicolon (";") and comma (","). If you forget
76 to extend the names to ten characters in length by blanks,
77 the program [i.e. PHYLIP] will get out of synchronization
78 with the contents of the data file, and an error message will
79 result.
80
81 Note that Tab characters count as only one character in the
82 species names. Their inclusion can cause trouble.
83 """
84 name = record.id.strip()
85
86
87 for char in "[](),":
88 name = name.replace(char,"")
89 for char in ":;":
90 name = name.replace(char,"|")
91
92
93 handle.write(name[:truncate].ljust(truncate))
94 else:
95
96 handle.write(" "*truncate)
97
98 for chunk in range(0,5):
99 i = block*50 + chunk*10
100 seq_segment = record.seq.tostring()[i:i+10]
101
102
103 handle.write(" %s" % seq_segment)
104 if i+10 > length_of_seqs : break
105 handle.write("\n")
106 block=block+1
107 if block*50 > length_of_seqs : break
108 handle.write("\n")
109
111 """Reads a Phylip alignment file returning a MultipleSeqAlignment iterator.
112
113 Record identifiers are limited to at most 10 characters.
114
115 It only copes with interlaced phylip files! Sequential files won't work
116 where the sequences are split over multiple lines.
117
118 For more information on the file format, please see:
119 http://evolution.genetics.washington.edu/phylip/doc/sequence.html
120 http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles
121 """
122
124 line = line.strip()
125 parts = filter(None, line.split())
126 if len(parts)!=2:
127 return False
128 try:
129 number_of_seqs = int(parts[0])
130 length_of_seqs = int(parts[1])
131 return True
132 except ValueError:
133 return False
134
136 handle = self.handle
137
138 try:
139
140
141 line = self._header
142 del self._header
143 except AttributeError:
144 line = handle.readline()
145
146 if not line: return
147 line = line.strip()
148 parts = filter(None, line.split())
149 if len(parts)!=2:
150 raise ValueError("First line should have two integers")
151 try:
152 number_of_seqs = int(parts[0])
153 length_of_seqs = int(parts[1])
154 except ValueError:
155 raise ValueError("First line should have two integers")
156
157 assert self._is_header(line)
158
159 if self.records_per_alignment is not None \
160 and self.records_per_alignment != number_of_seqs:
161 raise ValueError("Found %i records in this alignment, told to expect %i" \
162 % (number_of_seqs, self.records_per_alignment))
163
164 ids = []
165 seqs = []
166
167
168
169 for i in range(0,number_of_seqs):
170 line = handle.readline().rstrip()
171 ids.append(line[:10].strip())
172 seqs.append([line[10:].strip().replace(" ","")])
173
174
175 line=""
176 while True:
177
178 while ""==line.strip():
179 line = handle.readline()
180 if not line : break
181 if not line : break
182
183 if self._is_header(line):
184
185 self._header = line
186 break
187
188
189 for i in range(0,number_of_seqs):
190 seqs[i].append(line.strip().replace(" ",""))
191 line = handle.readline()
192 if (not line) and i+1 < number_of_seqs:
193 raise ValueError("End of file mid-block")
194 if not line : break
195
196 alignment = MultipleSeqAlignment(self.alphabet)
197 for i in range(0,number_of_seqs):
198 seq = "".join(seqs[i])
199 if len(seq)!=length_of_seqs:
200 raise ValueError("Sequence %i length %i, expected length %i" \
201 % (i+1, len(seq), length_of_seqs))
202 alignment.add_sequence(ids[i], seq)
203
204 record = alignment[-1]
205 assert ids[i] == record.id or ids[i] == record.description
206 record.id = ids[i]
207 record.name = ids[i]
208 record.description = ids[i]
209 return alignment
210
211 if __name__=="__main__":
212 print "Running short mini-test"
213
214 phylip_text=""" 8 286
215 V_Harveyi_ --MKNWIKVA VAAIA--LSA A--------- ---------T VQAATEVKVG
216 B_subtilis MKMKKWTVLV VAALLAVLSA CG-------- ----NGNSSS KEDDNVLHVG
217 B_subtilis MKKALLALFM VVSIAALAAC GAGNDNQSKD NAKDGDLWAS IKKKGVLTVG
218 YA80_HAEIN MKKLLFTTAL LTGAIAFSTF ---------- -SHAGEIADR VEKTKTLLVG
219 FLIY_ECOLI MKLAHLGRQA LMGVMAVALV AG---MSVKS FADEG-LLNK VKERGTLLVG
220 E_coli_Gln --MKSVLKVS LAALTLAFAV S--------- ---------S HAADKKLVVA
221 Deinococcu -MKKSLLSLK LSGLLVPSVL ALS------- -LSACSSPSS TLNQGTLKIA
222 HISJ_E_COL MKKLVLSLSL VLAFSSATAA F--------- ---------- AAIPQNIRIG
223
224 MSGRYFPFTF VKQ--DKLQG FEVDMWDEIG KRNDYKIEYV TANFSGLFGL
225 ATGQSYPFAY KEN--GKLTG FDVEVMEAVA KKIDMKLDWK LLEFSGLMGE
226 TEGTYEPFTY HDKDTDKLTG YDVEVITEVA KRLGLKVDFK ETQWGSMFAG
227 TEGTYAPFTF HDK-SGKLTG FDVEVIRKVA EKLGLKVEFK ETQWDAMYAG
228 LEGTYPPFSF QGD-DGKLTG FEVEFAQQLA KHLGVEASLK PTKWDGMLAS
229 TDTAFVPFEF KQG--DKYVG FDVDLWAAIA KELKLDYELK PMDFSGIIPA
230 MEGTYPPFTS KNE-QGELVG FDVDIAKAVA QKLNLKPEFV LTEWSGILAG
231 TDPTYAPFES KNS-QGELVG FDIDLAKELC KRINTQCTFV ENPLDALIPS
232
233 LETGRIDTIS NQITMTDARK AKYLFADPYV VDG-AQITVR KGNDSIQGVE
234 LQTGKLDTIS NQVAVTDERK ETYNFTKPYA YAG-TQIVVK KDNTDIKSVD
235 LNSKRFDVVA NQVG-KTDRE DKYDFSDKYT TSR-AVVVTK KDNNDIKSEA
236 LNAKRFDVIA NQTNPSPERL KKYSFTTPYN YSG-GVIVTK SSDNSIKSFE
237 LDSKRIDVVI NQVTISDERK KKYDFSTPYT ISGIQALVKK GNEGTIKTAD
238 LQTKNVDLAL AGITITDERK KAIDFSDGYY KSG-LLVMVK ANNNDVKSVK
239 LQANKYDVIV NQVGITPERQ NSIGFSQPYA YSRPEIIVAK NNTFNPQSLA
240 LKAKKIDAIM SSLSITEKRQ QEIAFTDKLY AADSRLVVAK NSDIQP-TVE
241
242 DLAGKTVAVN LGSNFEQLLR DYDKDGKINI KTYDT--GIE HDVALGRADA
243 DLKGKTVAAV LGSNHAKNLE SKDPDKKINI KTYETQEGTL KDVAYGRVDA
244 DVKGKTSAQS LTSNYNKLAT N----AGAKV EGVEGMAQAL QMIQQARVDM
245 DLKGRKSAQS ATSNWGKDAK A----AGAQI LVVDGLAQSL ELIKQGRAEA
246 DLKGKKVGVG LGTNYEEWLR QNV--QGVDV RTYDDDPTKY QDLRVGRIDA
247 DLDGKVVAVK SGTGSVDYAK AN--IKTKDL RQFPNIDNAY MELGTNRADA
248 DLKGKRVGST LGSNYEKQLI DTG---DIKI VTYPGAPEIL ADLVAGRIDA
249 SLKGKRVGVL QGTTQETFGN EHWAPKGIEI VSYQGQDNIY SDLTAGRIDA
250
251 FIMDRLSALE -LIKKT-GLP LQLAGEPFET I-----QNAW PFVDNEKGRK
252 YVNSRTVLIA -QIKKT-GLP LKLAGDPIVY E-----QVAF PFAKDDAHDK
253 TYNDKLAVLN -YLKTSGNKN VKIAFETGEP Q-----STYF TFRKGS--GE
254 TINDKLAVLD -YFKQHPNSG LKIAYDRGDK T-----PTAF AFLQGE--DA
255 ILVDRLAALD -LVKKT-NDT LAVTGEAFSR Q-----ESGV ALRKGN--ED
256 VLHDTPNILY -FIKTAGNGQ FKAVGDSLEA Q-----QYGI AFPKGS--DE
257 AYNDRLVVNY -IINDQ-KLP VRGAGQIGDA A-----PVGI ALKKGN--SA
258 AFQDEVAASE GFLKQPVGKD YKFGGPSVKD EKLFGVGTGM GLRKED--NE
259
260 LQAEVNKALA EMRADGTVEK ISVKWFGADI TK----
261 LRKKVNKALD ELRKDGTLKK LSEKYFNEDI TVEQKH
262 VVDQVNKALK EMKEDGTLSK ISKKWFGEDV SK----
263 LITKFNQVLE ALRQDGTLKQ ISIEWFGYDI TQ----
264 LLKAVNDAIA EMQKDGTLQA LSEKWFGADV TK----
265 LRDKVNGALK TLRENGTYNE IYKKWFGTEP K-----
266 LKDQIDKALT EMRSDGTFEK ISQKWFGQDV GQP---
267 LREALNKAFA EMRADGTYEK LAKKYFDFDV YGG---
268 """
269
270 from cStringIO import StringIO
271 handle = StringIO(phylip_text)
272 count=0
273 for alignment in PhylipIterator(handle):
274 for record in alignment:
275 count=count+1
276 print record.id
277
278 assert count == 8
279
280 expected="""mkklvlslsl vlafssataa faaipqniri gtdptyapfe sknsqgelvg
281 fdidlakelc krintqctfv enpldalips lkakkidaim sslsitekrq qeiaftdkly
282 aadsrlvvak nsdiqptves lkgkrvgvlq gttqetfgne hwapkgieiv syqgqdniys
283 dltagridaafqdevaaseg flkqpvgkdy kfggpsvkde klfgvgtgmg lrkednelre
284 alnkafaemradgtyeklak kyfdfdvygg""".replace(" ","").replace("\n","").upper()
285 assert record.seq.tostring().replace("-","") == expected
286
287
288
289 phylip_text2="""5 60
290 Tax1 CCATCTCACGGTCGGTACGATACACCTGCTTTTGGCAG
291 Tax2 CCATCTCACGGTCAGTAAGATACACCTGCTTTTGGCGG
292 Tax3 CCATCTCCCGCTCAGTAAGATACCCCTGCTGTTGGCGG
293 Tax4 TCATCTCATGGTCAATAAGATACTCCTGCTTTTGGCGG
294 Tax5 CCATCTCACGGTCGGTAAGATACACCTGCTTTTGGCGG
295
296 GAAATGGTCAATATTACAAGGT
297 GAAATGGTCAACATTAAAAGAT
298 GAAATCGTCAATATTAAAAGGT
299 GAAATGGTCAATCTTAAAAGGT
300 GAAATGGTCAATATTAAAAGGT"""
301
302 phylip_text3="""5 60
303 Tax1 CCATCTCACGGTCGGTACGATACACCTGCTTTTGGCAGGAAATGGTCAATATTACAAGGT
304 Tax2 CCATCTCACGGTCAGTAAGATACACCTGCTTTTGGCGGGAAATGGTCAACATTAAAAGAT
305 Tax3 CCATCTCCCGCTCAGTAAGATACCCCTGCTGTTGGCGGGAAATCGTCAATATTAAAAGGT
306 Tax4 TCATCTCATGGTCAATAAGATACTCCTGCTTTTGGCGGGAAATGGTCAATCTTAAAAGGT
307 Tax5 CCATCTCACGGTCGGTAAGATACACCTGCTTTTGGCGGGAAATGGTCAATATTAAAAGGT"""
308
309 handle = StringIO(phylip_text2)
310 list2 = list(PhylipIterator(handle))
311 handle.close()
312 assert len(list2)==1
313 assert len(list2[0])==5
314
315 handle = StringIO(phylip_text3)
316 list3 = list(PhylipIterator(handle))
317 handle.close()
318 assert len(list3)==1
319 assert len(list3[0])==5
320
321 for i in range(0,5):
322 list2[0][i].id == list3[0][i].id
323 list2[0][i].seq.tostring() == list3[0][i].seq.tostring()
324
325
326
327
328 phylip_text4=""" 5 42
329 Turkey AAGCTNGGGC ATTTCAGGGT
330 Salmo gairAAGCCTTGGC AGTGCAGGGT
331 H. SapiensACCGGTTGGC CGTTCAGGGT
332 Chimp AAACCCTTGC CGTTACGCTT
333 Gorilla AAACCCTTGC CGGTACGCTT
334
335 GAGCCCGGGC AATACAGGGT AT
336 GAGCCGTGGC CGGGCACGGT AT
337 ACAGGTTGGC CGTTCAGGGT AA
338 AAACCGAGGC CGGGACACTC AT
339 AAACCATTGC CGGTACGCTT AA"""
340
341
342
343 phylip_text5=""" 5 42
344 Turkey AAGCTNGGGC ATTTCAGGGT
345 GAGCCCGGGC AATACAGGGT AT
346 Salmo gairAAGCCTTGGC AGTGCAGGGT
347 GAGCCGTGGC CGGGCACGGT AT
348 H. SapiensACCGGTTGGC CGTTCAGGGT
349 ACAGGTTGGC CGTTCAGGGT AA
350 Chimp AAACCCTTGC CGTTACGCTT
351 AAACCGAGGC CGGGACACTC AT
352 Gorilla AAACCCTTGC CGGTACGCTT
353 AAACCATTGC CGGTACGCTT AA"""
354
355 phylip_text5a=""" 5 42
356 Turkey AAGCTNGGGC ATTTCAGGGT GAGCCCGGGC AATACAGGGT AT
357 Salmo gairAAGCCTTGGC AGTGCAGGGT GAGCCGTGGC CGGGCACGGT AT
358 H. SapiensACCGGTTGGC CGTTCAGGGT ACAGGTTGGC CGTTCAGGGT AA
359 Chimp AAACCCTTGC CGTTACGCTT AAACCGAGGC CGGGACACTC AT
360 Gorilla AAACCCTTGC CGGTACGCTT AAACCATTGC CGGTACGCTT AA"""
361
362 handle = StringIO(phylip_text4)
363 list4 = list(PhylipIterator(handle))
364 handle.close()
365 assert len(list4)==1
366 assert len(list4[0])==5
367
368 handle = StringIO(phylip_text5)
369 try:
370 list5 = list(PhylipIterator(handle))
371 assert len(list5)==1
372 assert len(list5[0])==5
373 print "That should have failed..."
374 except ValueError:
375 print "Evil multiline non-interlaced example failed as expected"
376 handle.close()
377
378 handle = StringIO(phylip_text5a)
379 list5 = list(PhylipIterator(handle))
380 handle.close()
381 assert len(list5)==1
382 assert len(list4[0])==5
383
384 print "Concatenation"
385 handle = StringIO(phylip_text4 + "\n" + phylip_text4)
386 assert len(list(PhylipIterator(handle))) == 2
387
388 handle = StringIO(phylip_text3 + "\n" + phylip_text4 + "\n\n\n" + phylip_text)
389 assert len(list(PhylipIterator(handle))) == 3
390
391 print "OK"
392
393 print "Checking write/read"
394 handle = StringIO()
395 PhylipWriter(handle).write_file(list5)
396 handle.seek(0)
397 list6 = list(PhylipIterator(handle))
398 assert len(list5) == len(list6)
399 for a1,a2 in zip(list5, list6):
400 assert len(a1) == len(a2)
401 for r1, r2 in zip(a1, a2):
402 assert r1.id == r2.id
403 assert r1.seq.tostring() == r2.seq.tostring()
404 print "Done"
405