1
2
3
4
5
6 """This package implements pairwise sequence alignment using a dynamic
7 programming algorithm.
8
9 This provides functions to get global and local alignments between two
10 sequences. A global alignment finds the best concordance between all
11 characters in two sequences. A local alignment finds just the
12 subsequences that align the best.
13
14 When doing alignments, you can specify the match score and gap
15 penalties. The match score indicates the compatibility between an
16 alignment of two characters in the sequences. Highly compatible
17 characters should be given positive scores, and incompatible ones
18 should be given negative scores or 0. The gap penalties should be
19 negative.
20
21 The names of the alignment functions in this module follow the
22 convention
23 <alignment type>XX
24 where <alignment type> is either "global" or "local" and XX is a 2
25 character code indicating the parameters it takes. The first
26 character indicates the parameters for matches (and mismatches), and
27 the second indicates the parameters for gap penalties.
28
29 The match parameters are
30 CODE DESCRIPTION
31 x No parameters. Identical characters have score of 1, otherwise 0.
32 m A match score is the score of identical chars, otherwise mismatch score.
33 d A dictionary returns the score of any pair of characters.
34 c A callback function returns scores.
35
36 The gap penalty parameters are
37 CODE DESCRIPTION
38 x No gap penalties.
39 s Same open and extend gap penalties for both sequences.
40 d The sequences have different open and extend gap penalties.
41 c A callback function returns the gap penalties.
42
43 All the different alignment functions are contained in an object
44 "align". For example:
45
46 >>> from Bio import pairwise2
47 >>> alignments = pairwise2.align.globalxx("ACCGT", "ACG")
48
49 will return a list of the alignments between the two strings. The
50 parameters of the alignment function depends on the function called.
51 Some examples:
52
53 >>> pairwise2.align.globalxx("ACCGT", "ACG")
54 # Find the best global alignment between the two sequences.
55 # Identical characters are given 1 point. No points are deducted
56 # for mismatches or gaps.
57
58 >>> pairwise2.align.localxx("ACCGT", "ACG")
59 # Same thing as before, but with a local alignment.
60
61 >>> pairwise2.align.globalmx("ACCGT", "ACG", 2, -1)
62 # Do a global alignment. Identical characters are given 2 points,
63 # 1 point is deducted for each non-identical character.
64
65 >>> pairwise2.align.globalms("ACCGT", "ACG", 2, -1, -.5, -.1)
66 # Same as above, except now 0.5 points are deducted when opening a
67 # gap, and 0.1 points are deducted when extending it.
68
69
70 To see a description of the parameters for a function, please look at
71 the docstring for the function.
72
73 >>> print newalign.align.localds.__doc__
74 localds(sequenceA, sequenceB, match_dict, open, extend) -> alignments
75
76 """
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98 from types import *
99
100 MAX_ALIGNMENTS = 1000
101
103 """This class provides functions that do alignments."""
104
106 """This class is callable impersonates an alignment function.
107 The constructor takes the name of the function. This class
108 will decode the name of the function to figure out how to
109 interpret the parameters.
110
111 """
112
113 match2args = {
114 'x' : ([], ''),
115 'm' : (['match', 'mismatch'],
116 """match is the score to given to identical characters. mismatch is
117 the score given to non-identical ones."""),
118 'd' : (['match_dict'],
119 """match_dict is a dictionary where the keys are tuples of pairs of
120 characters and the values are the scores, e.g. ("A", "C") : 2.5."""),
121 'c' : (['match_fn'],
122 """match_fn is a callback function that takes two characters and
123 returns the score between them."""),
124 }
125
126 penalty2args = {
127 'x' : ([], ''),
128 's' : (['open', 'extend'],
129 """open and extend are the gap penalties when a gap is opened and
130 extended. They should be negative."""),
131 'd' : (['openA', 'extendA', 'openB', 'extendB'],
132 """openA and extendA are the gap penalties for sequenceA, and openB
133 and extendB for sequeneB. The penalties should be negative."""),
134 'c' : (['gap_A_fn', 'gap_B_fn'],
135 """gap_A_fn and gap_B_fn are callback functions that takes 1) the
136 index where the gap is opened, and 2) the length of the gap. They
137 should return a gap penalty."""),
138 }
139
141
142
143 if name.startswith("global"):
144 if len(name) != 8:
145 raise AttributeError("function should be globalXX")
146 elif name.startswith("local"):
147 if len(name) != 7:
148 raise AttributeError("function should be localXX")
149 else:
150 raise AttributeError(name)
151 align_type, match_type, penalty_type = \
152 name[:-2], name[-2], name[-1]
153 try:
154 match_args, match_doc = self.match2args[match_type]
155 except KeyError, x:
156 raise AttributeError("unknown match type %r" % match_type)
157 try:
158 penalty_args, penalty_doc = self.penalty2args[penalty_type]
159 except KeyError, x:
160 raise AttributeError("unknown penalty type %r" % penalty_type)
161
162
163 param_names = ['sequenceA', 'sequenceB']
164 param_names.extend(match_args)
165 param_names.extend(penalty_args)
166 self.function_name = name
167 self.align_type = align_type
168 self.param_names = param_names
169
170 self.__name__ = self.function_name
171
172 doc = "%s(%s) -> alignments\n" % (
173 self.__name__, ', '.join(self.param_names))
174 if match_doc:
175 doc += "\n%s\n" % match_doc
176 if penalty_doc:
177 doc += "\n%s\n" % penalty_doc
178 doc += (
179 """\nalignments is a list of tuples (seqA, seqB, score, begin, end).
180 seqA and seqB are strings showing the alignment between the
181 sequences. score is the score of the alignment. begin and end
182 are indexes into seqA and seqB that indicate the where the
183 alignment occurs.
184 """)
185 self.__doc__ = doc
186
187 - def decode(self, *args, **keywds):
188
189
190
191 keywds = keywds.copy()
192 if len(args) != len(self.param_names):
193 raise TypeError("%s takes exactly %d argument (%d given)" \
194 % (self.function_name, len(self.param_names), len(args)))
195 i = 0
196 while i < len(self.param_names):
197 if self.param_names[i] in [
198 'sequenceA', 'sequenceB',
199 'gap_A_fn', 'gap_B_fn', 'match_fn']:
200 keywds[self.param_names[i]] = args[i]
201 i += 1
202 elif self.param_names[i] == 'match':
203 assert self.param_names[i+1] == 'mismatch'
204 match, mismatch = args[i], args[i+1]
205 keywds['match_fn'] = identity_match(match, mismatch)
206 i += 2
207 elif self.param_names[i] == 'match_dict':
208 keywds['match_fn'] = dictionary_match(args[i])
209 i += 1
210 elif self.param_names[i] == 'open':
211 assert self.param_names[i+1] == 'extend'
212 open, extend = args[i], args[i+1]
213 pe = keywds.get('penalize_extend_when_opening', 0)
214 keywds['gap_A_fn'] = affine_penalty(open, extend, pe)
215 keywds['gap_B_fn'] = affine_penalty(open, extend, pe)
216 i += 2
217 elif self.param_names[i] == 'openA':
218 assert self.param_names[i+3] == 'extendB'
219 openA, extendA, openB, extendB = args[i:i+4]
220 pe = keywds.get('penalize_extend_when_opening', 0)
221 keywds['gap_A_fn'] = affine_penalty(openA, extendA, pe)
222 keywds['gap_B_fn'] = affine_penalty(openB, extendB, pe)
223 i += 4
224 else:
225 raise ValueError("unknown parameter %r" \
226 % self.param_names[i])
227
228
229
230 pe = keywds.get('penalize_extend_when_opening', 0)
231 default_params = [
232 ('match_fn', identity_match(1, 0)),
233 ('gap_A_fn', affine_penalty(0, 0, pe)),
234 ('gap_B_fn', affine_penalty(0, 0, pe)),
235 ('penalize_extend_when_opening', 0),
236 ('penalize_end_gaps', self.align_type == 'global'),
237 ('align_globally', self.align_type == 'global'),
238 ('gap_char', '-'),
239 ('force_generic', 0),
240 ('score_only', 0),
241 ('one_alignment_only', 0)
242 ]
243 for name, default in default_params:
244 keywds[name] = keywds.get(name, default)
245 return keywds
246
248 keywds = self.decode(*args, **keywds)
249 return _align(**keywds)
250
252 return self.alignment_function(attr)
253 align = align()
254
255
256 -def _align(sequenceA, sequenceB, match_fn, gap_A_fn, gap_B_fn,
257 penalize_extend_when_opening, penalize_end_gaps,
258 align_globally, gap_char, force_generic, score_only,
259 one_alignment_only):
260 if not sequenceA or not sequenceB:
261 return []
262
263 if (not force_generic) and \
264 type(gap_A_fn) is InstanceType and \
265 gap_A_fn.__class__ is affine_penalty and \
266 type(gap_B_fn) is InstanceType and \
267 gap_B_fn.__class__ is affine_penalty:
268 open_A, extend_A = gap_A_fn.open, gap_A_fn.extend
269 open_B, extend_B = gap_B_fn.open, gap_B_fn.extend
270 x = _make_score_matrix_fast(
271 sequenceA, sequenceB, match_fn, open_A, extend_A, open_B, extend_B,
272 penalize_extend_when_opening, penalize_end_gaps, align_globally,
273 score_only)
274 else:
275 x = _make_score_matrix_generic(
276 sequenceA, sequenceB, match_fn, gap_A_fn, gap_B_fn,
277 penalize_extend_when_opening, penalize_end_gaps, align_globally,
278 score_only)
279 score_matrix, trace_matrix = x
280
281
282
283
284
285
286 starts = _find_start(
287 score_matrix, sequenceA, sequenceB,
288 gap_A_fn, gap_B_fn, penalize_end_gaps, align_globally)
289
290 best_score = max([x[0] for x in starts])
291
292
293 if score_only:
294 return best_score
295
296 tolerance = 0
297
298
299 i = 0
300 while i < len(starts):
301 score, pos = starts[i]
302 if rint(abs(score-best_score)) > rint(tolerance):
303 del starts[i]
304 else:
305 i += 1
306
307
308 x = _recover_alignments(
309 sequenceA, sequenceB, starts, score_matrix, trace_matrix,
310 align_globally, penalize_end_gaps, gap_char, one_alignment_only)
311 return x
312
313 -def _make_score_matrix_generic(
314 sequenceA, sequenceB, match_fn, gap_A_fn, gap_B_fn,
315 penalize_extend_when_opening, penalize_end_gaps, align_globally,
316 score_only):
317
318
319
320
321
322
323 lenA, lenB = len(sequenceA), len(sequenceB)
324 score_matrix, trace_matrix = [], []
325 for i in range(lenA):
326 score_matrix.append([None] * lenB)
327 trace_matrix.append([[None]] * lenB)
328
329
330
331
332 for i in range(lenA):
333
334
335
336 score = match_fn(sequenceA[i], sequenceB[0])
337 if penalize_end_gaps:
338 score += gap_B_fn(0, i)
339 score_matrix[i][0] = score
340 for i in range(1, lenB):
341 score = match_fn(sequenceA[0], sequenceB[i])
342 if penalize_end_gaps:
343 score += gap_A_fn(0, i)
344 score_matrix[0][i] = score
345
346
347
348
349
350
351
352
353 for row in range(1, lenA):
354 for col in range(1, lenB):
355
356
357 best_score = score_matrix[row-1][col-1]
358 best_score_rint = rint(best_score)
359 best_indexes = [(row-1, col-1)]
360
361
362
363
364
365
366 for i in range(0, col-1):
367 score = score_matrix[row-1][i] + gap_A_fn(i, col-1-i)
368 score_rint = rint(score)
369 if score_rint == best_score_rint:
370 best_score, best_score_rint = score, score_rint
371 best_indexes.append((row-1, i))
372 elif score_rint > best_score_rint:
373 best_score, best_score_rint = score, score_rint
374 best_indexes = [(row-1, i)]
375
376
377 for i in range(0, row-1):
378 score = score_matrix[i][col-1] + gap_B_fn(i, row-1-i)
379 score_rint = rint(score)
380 if score_rint == best_score_rint:
381 best_score, best_score_rint = score, score_rint
382 best_indexes.append((i, col-1))
383 elif score_rint > best_score_rint:
384 best_score, best_score_rint = score, score_rint
385 best_indexes = [(i, col-1)]
386
387 score_matrix[row][col] = best_score + \
388 match_fn(sequenceA[row], sequenceB[col])
389 if not align_globally and score_matrix[row][col] < 0:
390 score_matrix[row][col] = 0
391 trace_matrix[row][col] = best_indexes
392 return score_matrix, trace_matrix
393
394 -def _make_score_matrix_fast(
395 sequenceA, sequenceB, match_fn, open_A, extend_A, open_B, extend_B,
396 penalize_extend_when_opening, penalize_end_gaps,
397 align_globally, score_only):
398 first_A_gap = calc_affine_penalty(1, open_A, extend_A,
399 penalize_extend_when_opening)
400 first_B_gap = calc_affine_penalty(1, open_B, extend_B,
401 penalize_extend_when_opening)
402
403
404
405
406 lenA, lenB = len(sequenceA), len(sequenceB)
407 score_matrix, trace_matrix = [], []
408 for i in range(lenA):
409 score_matrix.append([None] * lenB)
410 trace_matrix.append([[None]] * lenB)
411
412
413
414
415 for i in range(lenA):
416
417
418
419 score = match_fn(sequenceA[i], sequenceB[0])
420 if penalize_end_gaps:
421 score += calc_affine_penalty(
422 i, open_B, extend_B, penalize_extend_when_opening)
423 score_matrix[i][0] = score
424 for i in range(1, lenB):
425 score = match_fn(sequenceA[0], sequenceB[i])
426 if penalize_end_gaps:
427 score += calc_affine_penalty(
428 i, open_A, extend_A, penalize_extend_when_opening)
429 score_matrix[0][i] = score
430
431
432
433
434
435
436
437
438
439
440
441
442
443 row_cache_score, row_cache_index = [None]*(lenA-1), [None]*(lenA-1)
444
445 col_cache_score, col_cache_index = [None]*(lenB-1), [None]*(lenB-1)
446
447 for i in range(lenA-1):
448
449
450 row_cache_score[i] = score_matrix[i][0] + first_A_gap
451 row_cache_index[i] = [(i, 0)]
452 for i in range(lenB-1):
453 col_cache_score[i] = score_matrix[0][i] + first_B_gap
454 col_cache_index[i] = [(0, i)]
455
456
457 for row in range(1, lenA):
458 for col in range(1, lenB):
459
460
461 nogap_score = score_matrix[row-1][col-1]
462
463
464
465 if col > 1:
466 row_score = row_cache_score[row-1]
467 else:
468 row_score = nogap_score - 1
469
470
471 if row > 1:
472 col_score = col_cache_score[col-1]
473 else:
474 col_score = nogap_score - 1
475
476 best_score = max(nogap_score, row_score, col_score)
477 best_score_rint = rint(best_score)
478 best_index = []
479 if best_score_rint == rint(nogap_score):
480 best_index.append((row-1, col-1))
481 if best_score_rint == rint(row_score):
482 best_index.extend(row_cache_index[row-1])
483 if best_score_rint == rint(col_score):
484 best_index.extend(col_cache_index[col-1])
485
486
487 score = best_score + match_fn(sequenceA[row], sequenceB[col])
488 if not align_globally and score < 0:
489 score_matrix[row][col] = 0
490 else:
491 score_matrix[row][col] = score
492 trace_matrix[row][col] = best_index
493
494
495
496
497
498
499 open_score = score_matrix[row-1][col-1] + first_B_gap
500 extend_score = col_cache_score[col-1] + extend_B
501 open_score_rint, extend_score_rint = \
502 rint(open_score), rint(extend_score)
503 if open_score_rint > extend_score_rint:
504 col_cache_score[col-1] = open_score
505 col_cache_index[col-1] = [(row-1, col-1)]
506 elif extend_score_rint > open_score_rint:
507 col_cache_score[col-1] = extend_score
508 else:
509 col_cache_score[col-1] = open_score
510 if (row-1, col-1) not in col_cache_index[col-1]:
511 col_cache_index[col-1] = col_cache_index[col-1] + \
512 [(row-1, col-1)]
513
514
515 open_score = score_matrix[row-1][col-1] + first_A_gap
516 extend_score = row_cache_score[row-1] + extend_A
517 open_score_rint, extend_score_rint = \
518 rint(open_score), rint(extend_score)
519 if open_score_rint > extend_score_rint:
520 row_cache_score[row-1] = open_score
521 row_cache_index[row-1] = [(row-1, col-1)]
522 elif extend_score_rint > open_score_rint:
523 row_cache_score[row-1] = extend_score
524 else:
525 row_cache_score[row-1] = open_score
526 if (row-1, col-1) not in row_cache_index[row-1]:
527 row_cache_index[row-1] = row_cache_index[row-1] + \
528 [(row-1, col-1)]
529
530 return score_matrix, trace_matrix
531
532 -def _recover_alignments(sequenceA, sequenceB, starts,
533 score_matrix, trace_matrix, align_globally,
534 penalize_end_gaps, gap_char, one_alignment_only):
535
536
537
538 lenA, lenB = len(sequenceA), len(sequenceB)
539 tracebacks = []
540 in_process = []
541
542
543
544
545
546
547
548
549
550
551
552 for score, (row, col) in starts:
553 if align_globally:
554 begin, end = None, None
555 else:
556 begin, end = None, -max(lenA-row, lenB-col)+1
557 if not end:
558 end = None
559
560
561
562 in_process.append(
563 (sequenceA[0:0], sequenceB[0:0], score, begin, end,
564 (lenA, lenB), (row, col)))
565 if one_alignment_only:
566 break
567 while in_process and len(tracebacks) < MAX_ALIGNMENTS:
568 seqA, seqB, score, begin, end, prev_pos, next_pos = in_process.pop()
569 prevA, prevB = prev_pos
570 if next_pos is None:
571 prevlen = len(seqA)
572
573 seqA = sequenceA[:prevA] + seqA
574 seqB = sequenceB[:prevB] + seqB
575
576 seqA, seqB = _lpad_until_equal(seqA, seqB, gap_char)
577
578
579 if begin is None:
580 if align_globally:
581 begin = 0
582 else:
583 begin = len(seqA) - prevlen
584 tracebacks.append((seqA, seqB, score, begin, end))
585 else:
586 nextA, nextB = next_pos
587 nseqA, nseqB = prevA-nextA, prevB-nextB
588 maxseq = max(nseqA, nseqB)
589 ngapA, ngapB = maxseq-nseqA, maxseq-nseqB
590 seqA = sequenceA[nextA:nextA+nseqA] + gap_char*ngapA + seqA
591 seqB = sequenceB[nextB:nextB+nseqB] + gap_char*ngapB + seqB
592 prev_pos = next_pos
593
594 if not align_globally and score_matrix[nextA][nextB] <= 0:
595 begin = max(prevA, prevB)
596 in_process.append(
597 (seqA, seqB, score, begin, end, prev_pos, None))
598 else:
599 for next_pos in trace_matrix[nextA][nextB]:
600 in_process.append(
601 (seqA, seqB, score, begin, end, prev_pos, next_pos))
602 if one_alignment_only:
603 break
604
605 return _clean_alignments(tracebacks)
606
607 -def _find_start(score_matrix, sequenceA, sequenceB, gap_A_fn, gap_B_fn,
608 penalize_end_gaps, align_globally):
609
610
611 if align_globally:
612 if penalize_end_gaps:
613 starts = _find_global_start(
614 sequenceA, sequenceB, score_matrix, gap_A_fn, gap_B_fn, 1)
615 else:
616 starts = _find_global_start(
617 sequenceA, sequenceB, score_matrix, None, None, 0)
618 else:
619 starts = _find_local_start(score_matrix)
620 return starts
621
622 -def _find_global_start(sequenceA, sequenceB,
623 score_matrix, gap_A_fn, gap_B_fn, penalize_end_gaps):
642
644
645 positions = []
646 nrows, ncols = len(score_matrix), len(score_matrix[0])
647 for row in range(nrows):
648 for col in range(ncols):
649 score = score_matrix[row][col]
650 positions.append((score, (row, col)))
651 return positions
652
654
655
656
657 unique_alignments = []
658 for align in alignments :
659 if align not in unique_alignments :
660 unique_alignments.append(align)
661 i = 0
662 while i < len(unique_alignments):
663 seqA, seqB, score, begin, end = unique_alignments[i]
664
665 if end is None:
666 end = len(seqA)
667 elif end < 0:
668 end = end + len(seqA)
669
670 if begin >= end:
671 del unique_alignments[i]
672 continue
673 unique_alignments[i] = seqA, seqB, score, begin, end
674 i += 1
675 return unique_alignments
676
678
679 ls1, ls2 = len(s1), len(s2)
680 if ls1 < ls2:
681 s1 = _pad(s1, char, ls2-ls1)
682 elif ls2 < ls1:
683 s2 = _pad(s2, char, ls1-ls2)
684 return s1, s2
685
687
688
689 ls1, ls2 = len(s1), len(s2)
690 if ls1 < ls2:
691 s1 = _lpad(s1, char, ls2-ls1)
692 elif ls2 < ls1:
693 s2 = _lpad(s2, char, ls1-ls2)
694 return s1, s2
695
696 -def _pad(s, char, n):
697
698 return s + char*n
699
701
702 return char*n + s
703
704 _PRECISION = 1000
706 return int(x * precision + 0.5)
707
709 """identity_match([match][, mismatch]) -> match_fn
710
711 Create a match function for use in an alignment. match and
712 mismatch are the scores to give when two residues are equal or
713 unequal. By default, match is 1 and mismatch is 0.
714
715 """
716 - def __init__(self, match=1, mismatch=0):
717 self.match = match
718 self.mismatch = mismatch
720 if charA == charB:
721 return self.match
722 return self.mismatch
723
725 """dictionary_match(score_dict[, symmetric]) -> match_fn
726
727 Create a match function for use in an alignment. score_dict is a
728 dictionary where the keys are tuples (residue 1, residue 2) and
729 the values are the match scores between those residues. symmetric
730 is a flag that indicates whether the scores are symmetric. If
731 true, then if (res 1, res 2) doesn't exist, I will use the score
732 at (res 2, res 1).
733
734 """
735 - def __init__(self, score_dict, symmetric=1):
736 self.score_dict = score_dict
737 self.symmetric = symmetric
739 if self.symmetric and (charA, charB) not in self.score_dict:
740
741
742 charB, charA = charA, charB
743 return self.score_dict[(charA, charB)]
744
746 """affine_penalty(open, extend[, penalize_extend_when_opening]) -> gap_fn
747
748 Create a gap function for use in an alignment.
749
750 """
751 - def __init__(self, open, extend, penalize_extend_when_opening=0):
752 if open > 0 or extend > 0:
753 raise ValueError("Gap penalties should be non-positive.")
754 self.open, self.extend = open, extend
755 self.penalize_extend_when_opening = penalize_extend_when_opening
759
761 if length <= 0:
762 return 0
763 penalty = open + extend * length
764 if not penalize_extend_when_opening:
765 penalty -= extend
766 return penalty
767
769 """print_matrix(matrix)
770
771 Print out a matrix. For debugging purposes.
772
773 """
774
775 matrixT = [[] for x in range(len(matrix[0]))]
776 for i in range(len(matrix)):
777 for j in range(len(matrix[i])):
778 matrixT[j].append(len(str(matrix[i][j])))
779 ndigits = map(max, matrixT)
780 for i in range(len(matrix)):
781 for j in range(len(matrix[i])):
782 n = ndigits[j]
783 print "%*s " % (n, matrix[i][j]),
784 print
785
798
799
800
801
802 try:
803 import cpairwise2
804 except ImportError:
805 pass
806 else:
807 import sys
808 this_module = sys.modules[__name__]
809 for name in cpairwise2.__dict__.keys():
810 if not name.startswith("__"):
811 this_module.__dict__[name] = cpairwise2.__dict__[name]
812