1
2
3
4
5
6
7
8 """
9 Contains classes to deal with generic sequence alignment stuff not
10 specific to a particular program or format.
11
12 classes:
13 o Alignment
14 """
15
16
17 from Bio.Seq import Seq
18 from Bio.SeqRecord import SeqRecord
19 from Bio import Alphabet
20
22 """Represent a set of alignments (OBSOLETE?).
23
24 This is a base class to represent alignments, which can be subclassed
25 to deal with an alignment in a specific format.
26
27 With the introduction of the MultipleSeqAlignment class in Bio.Align,
28 this base class is effectively obsolete and will likely be deprecated and
29 later removed in future releases of Biopython.
30 """
32 """Initialize a new Alignment object.
33
34 Arguments:
35 o alphabet - The alphabet to use for the sequence objects that are
36 created. This alphabet must be a gapped type.
37
38 e.g.
39 >>> from Bio.Alphabet import IUPAC, Gapped
40 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
41 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG")
42 >>> align.add_sequence("Beta", "ACT-CTAGCTAG")
43 >>> align.add_sequence("Gamma", "ACTGCTAGATAG")
44 >>> print align
45 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns
46 ACTGCTAGCTAG Alpha
47 ACT-CTAGCTAG Beta
48 ACTGCTAGATAG Gamma
49 """
50 if not (isinstance(alphabet, Alphabet.Alphabet) \
51 or isinstance(alphabet, Alphabet.AlphabetEncoder)):
52 raise ValueError("Invalid alphabet argument")
53 self._alphabet = alphabet
54
55 self._records = []
56
58 """Returns a truncated string representation of a SeqRecord (PRIVATE).
59
60 This is a PRIVATE function used by the __str__ method.
61 """
62 if len(record.seq) <= 50:
63 return "%s %s" % (record.seq, record.id)
64 else:
65 return "%s...%s %s" \
66 % (record.seq[:44], record.seq[-3:], record.id)
67
69 """Returns a multi-line string summary of the alignment.
70
71 This output is intended to be readable, but large alignments are
72 shown truncated. A maximum of 20 rows (sequences) and 50 columns
73 are shown, with the record identifiers. This should fit nicely on a
74 single screen. e.g.
75
76 >>> from Bio.Alphabet import IUPAC, Gapped
77 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
78 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG")
79 >>> align.add_sequence("Beta", "ACT-CTAGCTAG")
80 >>> align.add_sequence("Gamma", "ACTGCTAGATAG")
81 >>> print align
82 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns
83 ACTGCTAGCTAG Alpha
84 ACT-CTAGCTAG Beta
85 ACTGCTAGATAG Gamma
86
87 See also the alignment's format method.
88 """
89 rows = len(self._records)
90 lines = ["%s alignment with %i rows and %i columns" \
91 % (str(self._alphabet), rows, self.get_alignment_length())]
92 if rows <= 20:
93 lines.extend([self._str_line(rec) for rec in self._records])
94 else:
95 lines.extend([self._str_line(rec) for rec in self._records[:18]])
96 lines.append("...")
97 lines.append(self._str_line(self._records[-1]))
98 return "\n".join(lines)
99
101 """Returns a representation of the object for debugging.
102
103 The representation cannot be used with eval() to recreate the object,
104 which is usually possible with simple python ojects. For example:
105
106 <Bio.Align.Generic.Alignment instance (2 records of length 14,
107 SingleLetterAlphabet()) at a3c184c>
108
109 The hex string is the memory address of the object, see help(id).
110 This provides a simple way to visually distinguish alignments of
111 the same size.
112 """
113
114
115 return "<%s instance (%i records of length %i, %s) at %x>" % \
116 (self.__class__, len(self._records),
117 self.get_alignment_length(), repr(self._alphabet), id(self))
118
119
120
121
122
157
158
175
177 """Return all of the sequences involved in the alignment (DEPRECATED).
178
179 The return value is a list of SeqRecord objects.
180
181 This method is deprecated, as the Alignment object itself now offers
182 much of the functionality of a list of SeqRecord objects (e.g.
183 iteration or slicing to create a sub-alignment). Instead use the
184 Python builtin function list, i.e. my_list = list(my_align)
185 """
186 import warnings
187 warnings.warn("This method is deprecated, since the alignment object"
188 "now acts more like a list. Instead of calling "
189 "align.get_all_seqs() you can use list(align)",
190 DeprecationWarning)
191 return self._records
192
194 """Iterate over alignment rows as SeqRecord objects.
195
196 e.g.
197 >>> from Bio.Alphabet import IUPAC, Gapped
198 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
199 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG")
200 >>> align.add_sequence("Beta", "ACT-CTAGCTAG")
201 >>> align.add_sequence("Gamma", "ACTGCTAGATAG")
202 >>> for record in align:
203 ... print record.id
204 ... print record.seq
205 Alpha
206 ACTGCTAGCTAG
207 Beta
208 ACT-CTAGCTAG
209 Gamma
210 ACTGCTAGATAG
211 """
212 return iter(self._records)
213
215 """Retrieve a sequence by row number (OBSOLETE).
216
217 Returns:
218 o A Seq object for the requested sequence.
219
220 Raises:
221 o IndexError - If the specified number is out of range.
222
223 NOTE: This is a legacy method. In new code where you need to access
224 the rows of the alignment (i.e. the sequences) consider iterating
225 over them or accessing them as SeqRecord objects. e.g.
226
227 for record in alignment:
228 print record.id
229 print record.seq
230 first_record = alignment[0]
231 last_record = alignment[-1]
232 """
233 return self._records[number].seq
234
236 """Returns the number of sequences in the alignment.
237
238 Use len(alignment) to get the number of sequences (i.e. the number of
239 rows), and alignment.get_alignment_length() to get the length of the
240 longest sequence (i.e. the number of columns).
241
242 This is easy to remember if you think of the alignment as being like a
243 list of SeqRecord objects.
244 """
245 return len(self._records)
246
248 """Return the maximum length of the alignment.
249
250 All objects in the alignment should (hopefully) have the same
251 length. This function will go through and find this length
252 by finding the maximum length of sequences in the alignment.
253
254 >>> from Bio.Alphabet import IUPAC, Gapped
255 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
256 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG")
257 >>> align.add_sequence("Beta", "ACT-CTAGCTAG")
258 >>> align.add_sequence("Gamma", "ACTGCTAGATAG")
259 >>> align.get_alignment_length()
260 12
261
262 If you want to know the number of sequences in the alignment,
263 use len(align) instead:
264
265 >>> len(align)
266 3
267
268 """
269 max_length = 0
270
271 for record in self._records:
272 if len(record.seq) > max_length:
273 max_length = len(record.seq)
274
275 return max_length
276
277 - def add_sequence(self, descriptor, sequence, start = None, end = None,
278 weight = 1.0):
279 """Add a sequence to the alignment.
280
281 This doesn't do any kind of alignment, it just adds in the sequence
282 object, which is assumed to be prealigned with the existing
283 sequences.
284
285 Arguments:
286 o descriptor - The descriptive id of the sequence being added.
287 This will be used as the resulting SeqRecord's
288 .id property (and, for historical compatibility,
289 also the .description property)
290 o sequence - A string with sequence info.
291 o start - You can explicitly set the start point of the sequence.
292 This is useful (at least) for BLAST alignments, which can just
293 be partial alignments of sequences.
294 o end - Specify the end of the sequence, which is important
295 for the same reason as the start.
296 o weight - The weight to place on the sequence in the alignment.
297 By default, all sequences have the same weight. (0.0 => no weight,
298 1.0 => highest weight)
299 """
300 new_seq = Seq(sequence, self._alphabet)
301
302
303
304
305
306
307 new_record = SeqRecord(new_seq,
308 id = descriptor,
309 description = descriptor)
310
311
312
313
314
315
316
317
318 if start:
319 new_record.annotations['start'] = start
320 if end:
321 new_record.annotations['end'] = end
322
323
324 new_record.annotations['weight'] = weight
325
326 self._records.append(new_record)
327
329 """Returns a string containing a given column.
330
331 e.g.
332 >>> from Bio.Alphabet import IUPAC, Gapped
333 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
334 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG")
335 >>> align.add_sequence("Beta", "ACT-CTAGCTAG")
336 >>> align.add_sequence("Gamma", "ACTGCTAGATAG")
337 >>> align.get_column(0)
338 'AAA'
339 >>> align.get_column(3)
340 'G-G'
341 """
342
343 col_str = ''
344 assert col >= 0 and col <= self.get_alignment_length()
345 for rec in self._records:
346 col_str += rec.seq[col]
347 return col_str
348
350 """Access part of the alignment.
351
352 We'll use the following example alignment here for illustration:
353
354 >>> from Bio.Alphabet import IUPAC, Gapped
355 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
356 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG")
357 >>> align.add_sequence("Beta", "ACT-CTAGCTAG")
358 >>> align.add_sequence("Gamma", "ACTGCTAGATAG")
359 >>> align.add_sequence("Delta", "ACTGCTTGCTAG")
360 >>> align.add_sequence("Epsilon","ACTGCTTGATAG")
361
362 You can access a row of the alignment as a SeqRecord using an integer
363 index (think of the alignment as a list of SeqRecord objects here):
364
365 >>> first_record = align[0]
366 >>> print first_record.id, first_record.seq
367 Alpha ACTGCTAGCTAG
368 >>> last_record = align[-1]
369 >>> print last_record.id, last_record.seq
370 Epsilon ACTGCTTGATAG
371
372 You can also access use python's slice notation to create a sub-alignment
373 containing only some of the SeqRecord objects:
374
375 >>> sub_alignment = align[2:5]
376 >>> print sub_alignment
377 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns
378 ACTGCTAGATAG Gamma
379 ACTGCTTGCTAG Delta
380 ACTGCTTGATAG Epsilon
381
382 This includes support for a step, i.e. align[start:end:step], which
383 can be used to select every second sequence:
384
385 >>> sub_alignment = align[::2]
386 >>> print sub_alignment
387 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns
388 ACTGCTAGCTAG Alpha
389 ACTGCTAGATAG Gamma
390 ACTGCTTGATAG Epsilon
391
392 Or to get a copy of the alignment with the rows in reverse order:
393
394 >>> rev_alignment = align[::-1]
395 >>> print rev_alignment
396 Gapped(IUPACUnambiguousDNA(), '-') alignment with 5 rows and 12 columns
397 ACTGCTTGATAG Epsilon
398 ACTGCTTGCTAG Delta
399 ACTGCTAGATAG Gamma
400 ACT-CTAGCTAG Beta
401 ACTGCTAGCTAG Alpha
402
403 Right now, these are the ONLY indexing operations supported. The use of
404 a second column based index is under discussion for a future update.
405 """
406 if isinstance(index, int):
407
408
409 return self._records[index]
410 elif isinstance(index, slice):
411
412
413
414
415 sub_align = Alignment(self._alphabet)
416 sub_align._records = self._records[index]
417 return sub_align
418 elif len(index)==2:
419 raise TypeError("Row and Column indexing is not currently supported,"\
420 +"but may be in future.")
421 else:
422 raise TypeError("Invalid index type.")
423
425 """Run the Bio.Align.Generic module's doctests."""
426 print "Running doctests..."
427 import doctest
428 doctest.testmod()
429 print "Done"
430
431 if __name__ == "__main__":
432 _test()
433