1
2
3
4
5
6 """Multiple sequence alignment input/output as alignment objects.
7
8 The Bio.AlignIO interface is deliberately very similar to Bio.SeqIO, and in
9 fact the two are connected internally. Both modules use the same set of file
10 format names (lower case strings). From the user's perspective, you can read
11 in a PHYLIP file containing one or more alignments using Bio.AlignIO, or you
12 can read in the sequences within these alignmenta using Bio.SeqIO.
13
14 Bio.AlignIO is also documented at U{http://biopython.org/wiki/AlignIO} and by
15 a whole chapter in our tutorial:
16 - U{http://biopython.org/DIST/docs/tutorial/Tutorial.html}
17 - U{http://biopython.org/DIST/docs/tutorial/Tutorial.pdf}
18
19 Input
20 =====
21 For the typical special case when your file or handle contains one and only
22 one alignment, use the function Bio.AlignIO.read(). This takes an input file
23 handle (or in recent versions of Biopython a filename as a string), format
24 string and optional number of sequences per alignment. It will return a single
25 MultipleSeqAlignment object (or raise an exception if there isn't just one
26 alignment):
27
28 >>> from Bio import AlignIO
29 >>> align = AlignIO.read("Phylip/interlaced.phy", "phylip")
30 >>> print align
31 SingleLetterAlphabet() alignment with 3 rows and 384 columns
32 -----MKVILLFVLAVFTVFVSS---------------RGIPPE...I-- CYS1_DICDI
33 MAHARVLLLALAVLATAAVAVASSSSFADSNPIRPVTDRAASTL...VAA ALEU_HORVU
34 ------MWATLPLLCAGAWLLGV--------PVCGAAELSVNSL...PLV CATH_HUMAN
35
36 For the general case, when the handle could contain any number of alignments,
37 use the function Bio.AlignIO.parse(...) which takes the same arguments, but
38 returns an iterator giving MultipleSeqAlignment objects (typically used in a
39 for loop). If you want random access to the alignments by number, turn this
40 into a list:
41
42 >>> from Bio import AlignIO
43 >>> alignments = list(AlignIO.parse("Emboss/needle.txt", "emboss"))
44 >>> print alignments[2]
45 SingleLetterAlphabet() alignment with 2 rows and 120 columns
46 -KILIVDDQYGIRILLNEVFNKEGYQTFQAANGLQALDIVTKER...--- ref_rec
47 LHIVVVDDDPGTCVYIESVFAELGHTCKSFVRPEAAEEYILTHP...HKE gi|94967506|receiver
48
49 Most alignment file formats can be concatenated so as to hold as many
50 different multiple sequence alignments as possible. One common example
51 is the output of the tool seqboot in the PHLYIP suite. Sometimes there
52 can be a file header and footer, as seen in the EMBOSS alignment output.
53
54 Output
55 ======
56 Use the function Bio.AlignIO.write(...), which takes a complete set of
57 Alignment objects (either as a list, or an iterator), an output file handle
58 (or filename in recent versions of Biopython) and of course the file format::
59
60 from Bio import AlignIO
61 alignments = ...
62 count = SeqIO.write(alignments, "example.faa", "fasta")
63
64 If using a handle make sure to close it to flush the data to the disk::
65
66 from Bio import AlignIO
67 alignments = ...
68 handle = open("example.faa", "w")
69 count = SeqIO.write(alignments, handle, "fasta")
70 handle.close()
71
72 In general, you are expected to call this function once (with all your
73 alignments) and then close the file handle. However, for file formats
74 like PHYLIP where multiple alignments are stored sequentially (with no file
75 header and footer), then multiple calls to the write function should work as
76 expected when using handles.
77
78 If you are using a filename, the repeated calls to the write functions will
79 overwrite the existing file each time.
80
81 Conversion
82 ==========
83 The Bio.AlignIO.convert(...) function allows an easy interface for simple
84 alignnment file format conversions. Additionally, it may use file format
85 specific optimisations so this should be the fastest way too.
86
87 In general however, you can combine the Bio.AlignIO.parse(...) function with
88 the Bio.AlignIO.write(...) function for sequence file conversion. Using
89 generator expressions provides a memory efficient way to perform filtering or
90 other extra operations as part of the process.
91
92 File Formats
93 ============
94 When specifying the file format, use lowercase strings. The same format
95 names are also used in Bio.SeqIO and include the following:
96
97 - clustal - Ouput from Clustal W or X, see also the module Bio.Clustalw
98 which can be used to run the command line tool from Biopython.
99 - emboss - EMBOSS tools' "pairs" and "simple" alignment formats.
100 - fasta - The generic sequence file format where each record starts with
101 an identifer line starting with a ">" character, followed by
102 lines of sequence.
103 - fasta-m10 - For the pairswise alignments output by Bill Pearson's FASTA
104 tools when used with the -m 10 command line option for machine
105 readable output.
106 - ig - The IntelliGenetics file format, apparently the same as the
107 MASE alignment format.
108 - nexus - Output from NEXUS, see also the module Bio.Nexus which can also
109 read any phylogenetic trees in these files.
110 - phylip - Used by the PHLIP tools.
111 - stockholm - A richly annotated alignment file format used by PFAM.
112
113 Note that while Bio.AlignIO can read all the above file formats, it cannot
114 write to all of them.
115
116 You can also use any file format supported by Bio.SeqIO, such as "fasta" or
117 "ig" (which are listed above), PROVIDED the sequences in your file are all the
118 same length.
119 """
120 __docformat__ = "epytext en"
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137 from StringIO import StringIO
138 from Bio.Seq import Seq
139 from Bio.SeqRecord import SeqRecord
140 from Bio.Align import MultipleSeqAlignment
141 from Bio.Align.Generic import Alignment
142 from Bio.Alphabet import Alphabet, AlphabetEncoder, _get_base_alphabet
143
144 import StockholmIO
145 import ClustalIO
146 import NexusIO
147 import PhylipIO
148 import EmbossIO
149 import FastaIO
150
151
152
153
154 _FormatToIterator = {
155 "clustal" : ClustalIO.ClustalIterator,
156 "emboss" : EmbossIO.EmbossIterator,
157 "fasta-m10" : FastaIO.FastaM10Iterator,
158 "nexus" : NexusIO.NexusIterator,
159 "phylip" : PhylipIO.PhylipIterator,
160 "stockholm" : StockholmIO.StockholmIterator,
161 }
162
163 _FormatToWriter = {
164
165 "nexus" : NexusIO.NexusWriter,
166 "phylip" : PhylipIO.PhylipWriter,
167 "stockholm" : StockholmIO.StockholmWriter,
168 "clustal" : ClustalIO.ClustalWriter,
169 }
170
171 -def write(alignments, handle, format):
172 """Write complete set of alignments to a file.
173
174 Arguments:
175 - alignments - A list (or iterator) of Alignment objects (ideally the
176 new MultipleSeqAlignment objects), or (if using Biopython
177 1.54 or later) a single alignment object.
178 - handle - File handle object to write to, or filename as string
179 (note older versions of Biopython only took a handle).
180 - format - lower case string describing the file format to write.
181
182 You should close the handle after calling this function.
183
184 Returns the number of alignments written (as an integer).
185 """
186 from Bio import SeqIO
187
188
189 if not isinstance(format, basestring):
190 raise TypeError("Need a string for the file format (lower case)")
191 if not format:
192 raise ValueError("Format required (lower case string)")
193 if format != format.lower():
194 raise ValueError("Format string '%s' should be lower case" % format)
195
196 if isinstance(alignments, Alignment):
197
198 alignments = [alignments]
199
200 if isinstance(handle, basestring):
201 handle = open(handle, "w")
202 handle_close = True
203 else:
204 handle_close = False
205
206
207 if format in _FormatToIterator:
208 writer_class = _FormatToWriter[format]
209 count = writer_class(handle).write_file(alignments)
210 elif format in SeqIO._FormatToWriter:
211
212
213 count = 0
214 for alignment in alignments:
215 if not isinstance(alignment, Alignment):
216 raise TypeError(\
217 "Expect a list or iterator of Alignment objects.")
218 SeqIO.write(alignment, handle, format)
219 count += 1
220 elif format in _FormatToIterator or format in SeqIO._FormatToIterator:
221 raise ValueError("Reading format '%s' is supported, but not writing" \
222 % format)
223 else:
224 raise ValueError("Unknown format '%s'" % format)
225
226 assert isinstance(count, int), "Internal error - the underlying %s " \
227 "writer should have returned the alignment count, not %s" \
228 % (format, repr(count))
229
230 if handle_close:
231 handle.close()
232
233 return count
234
235
237 """Uses Bio.SeqIO to create an MultipleSeqAlignment iterator (PRIVATE).
238
239 Arguments:
240 - handle - handle to the file.
241 - format - string describing the file format.
242 - alphabet - optional Alphabet object, useful when the sequence type
243 cannot be automatically inferred from the file itself
244 (e.g. fasta, phylip, clustal)
245 - seq_count - Optional integer, number of sequences expected in each
246 alignment. Recommended for fasta format files.
247
248 If count is omitted (default) then all the sequences in the file are
249 combined into a single MultipleSeqAlignment.
250 """
251 from Bio import SeqIO
252 assert format in SeqIO._FormatToIterator
253
254 if seq_count:
255
256 seq_record_iterator = SeqIO.parse(handle, format, alphabet)
257
258 records = []
259 for record in seq_record_iterator:
260 records.append(record)
261 if len(records) == seq_count:
262 yield MultipleSeqAlignment(records, alphabet)
263 records = []
264 if len(records) > 0:
265 raise ValueError("Check seq_count argument, not enough sequences?")
266 else:
267
268
269 records = list(SeqIO.parse(handle, format, alphabet))
270 if records:
271 yield MultipleSeqAlignment(records, alphabet)
272 else:
273
274 pass
275
295
296 -def parse(handle, format, seq_count=None, alphabet=None):
297 """Iterate over an alignment file as MultipleSeqAlignment objects.
298
299 Arguments:
300 - handle - handle to the file, or the filename as a string
301 (note older verions of Biopython only took a handle).
302 - format - string describing the file format.
303 - alphabet - optional Alphabet object, useful when the sequence type
304 cannot be automatically inferred from the file itself
305 (e.g. fasta, phylip, clustal)
306 - seq_count - Optional integer, number of sequences expected in each
307 alignment. Recommended for fasta format files.
308
309 If you have the file name in a string 'filename', use:
310
311 >>> from Bio import AlignIO
312 >>> filename = "Emboss/needle.txt"
313 >>> format = "emboss"
314 >>> for alignment in AlignIO.parse(filename, format):
315 ... print "Alignment of length", alignment.get_alignment_length()
316 Alignment of length 124
317 Alignment of length 119
318 Alignment of length 120
319 Alignment of length 118
320 Alignment of length 125
321
322 If you have a string 'data' containing the file contents, use:
323
324 from Bio import AlignIO
325 from StringIO import StringIO
326 my_iterator = AlignIO.parse(StringIO(data), format)
327
328 Use the Bio.AlignIO.read() function when you expect a single record only.
329 """
330 from Bio import SeqIO
331
332 if isinstance(handle, basestring):
333 handle = open(handle, "rU")
334
335
336
337 if not isinstance(format, basestring):
338 raise TypeError("Need a string for the file format (lower case)")
339 if not format:
340 raise ValueError("Format required (lower case string)")
341 if format != format.lower():
342 raise ValueError("Format string '%s' should be lower case" % format)
343 if alphabet is not None and not (isinstance(alphabet, Alphabet) or \
344 isinstance(alphabet, AlphabetEncoder)):
345 raise ValueError("Invalid alphabet, %s" % repr(alphabet))
346 if seq_count is not None and not isinstance(seq_count, int):
347 raise TypeError("Need integer for seq_count (sequences per alignment)")
348
349
350 if format in _FormatToIterator:
351 iterator_generator = _FormatToIterator[format]
352 if alphabet is None :
353 return iterator_generator(handle, seq_count)
354 try:
355
356 return iterator_generator(handle, seq_count, alphabet=alphabet)
357 except TypeError:
358
359 return _force_alphabet(iterator_generator(handle, seq_count),
360 alphabet)
361
362 elif format in SeqIO._FormatToIterator:
363
364 return _SeqIO_to_alignment_iterator(handle, format,
365 alphabet=alphabet,
366 seq_count=seq_count)
367 else:
368 raise ValueError("Unknown format '%s'" % format)
369
370 -def read(handle, format, seq_count=None, alphabet=None):
371 """Turns an alignment file into a single MultipleSeqAlignment object.
372
373 Arguments:
374 - handle - handle to the file, or the filename as a string
375 (note older verions of Biopython only took a handle).
376 - format - string describing the file format.
377 - alphabet - optional Alphabet object, useful when the sequence type
378 cannot be automatically inferred from the file itself
379 (e.g. fasta, phylip, clustal)
380 - seq_count - Optional integer, number of sequences expected in each
381 alignment. Recommended for fasta format files.
382
383 If the handle contains no alignments, or more than one alignment,
384 an exception is raised. For example, using a PFAM/Stockholm file
385 containing one alignment:
386
387 >>> from Bio import AlignIO
388 >>> filename = "Clustalw/protein.aln"
389 >>> format = "clustal"
390 >>> alignment = AlignIO.read(filename, format)
391 >>> print "Alignment of length", alignment.get_alignment_length()
392 Alignment of length 411
393
394 If however you want the first alignment from a file containing
395 multiple alignments this function would raise an exception.
396
397 >>> from Bio import AlignIO
398 >>> filename = "Emboss/needle.txt"
399 >>> format = "emboss"
400 >>> alignment = AlignIO.read(filename, format)
401 Traceback (most recent call last):
402 ...
403 ValueError: More than one record found in handle
404
405 Instead use:
406
407 >>> from Bio import AlignIO
408 >>> filename = "Emboss/needle.txt"
409 >>> format = "emboss"
410 >>> alignment = AlignIO.parse(filename, format).next()
411 >>> print "First alignment has length", alignment.get_alignment_length()
412 First alignment has length 124
413
414 You must use the Bio.AlignIO.parse() function if you want to read multiple
415 records from the handle.
416 """
417 iterator = parse(handle, format, seq_count, alphabet)
418 try:
419 first = iterator.next()
420 except StopIteration:
421 first = None
422 if first is None:
423 raise ValueError("No records found in handle")
424 try:
425 second = iterator.next()
426 except StopIteration:
427 second = None
428 if second is not None:
429 raise ValueError("More than one record found in handle")
430 if seq_count:
431 assert len(first)==seq_count
432 return first
433
434 -def convert(in_file, in_format, out_file, out_format, alphabet=None):
435 """Convert between two alignment files, returns number of alignments.
436
437 - in_file - an input handle or filename
438 - in_format - input file format, lower case string
439 - output - an output handle or filename
440 - out_file - output file format, lower case string
441 - alphabet - optional alphabet to assume
442
443 NOTE - If you provide an output filename, it will be opened which will
444 overwrite any existing file without warning. This may happen if even the
445 conversion is aborted (e.g. an invalid out_format name is given).
446 """
447
448
449 if isinstance(in_file, basestring):
450 in_handle = open(in_file, "rU")
451 in_close = True
452 else:
453 in_handle = in_file
454 in_close = False
455
456 alignments = parse(in_handle, in_format, None, alphabet)
457
458 if isinstance(out_file, basestring):
459 out_handle = open(out_file, "w")
460 out_close = True
461 else:
462 out_handle = out_file
463 out_close = False
464
465
466 count = write(alignments, out_handle, out_format)
467
468 if in_close:
469 in_handle.close()
470 if out_close:
471 out_handle.close()
472 return count
473
475 """Run the Bio.AlignIO module's doctests.
476
477 This will try and locate the unit tests directory, and run the doctests
478 from there in order that the relative paths used in the examples work.
479 """
480 import doctest
481 import os
482 if os.path.isdir(os.path.join("..", "..", "Tests")):
483 print "Runing doctests..."
484 cur_dir = os.path.abspath(os.curdir)
485 os.chdir(os.path.join("..", "..", "Tests"))
486 doctest.testmod()
487 os.chdir(cur_dir)
488 del cur_dir
489 print "Done"
490 elif os.path.isdir(os.path.join("Tests", "Fasta")):
491 print "Runing doctests..."
492 cur_dir = os.path.abspath(os.curdir)
493 os.chdir(os.path.join("Tests"))
494 doctest.testmod()
495 os.chdir(cur_dir)
496 del cur_dir
497 print "Done"
498
499 if __name__ == "__main__":
500 _test()
501