1 """Extract information from alignment objects.
2
3 In order to try and avoid huge alignment objects with tons of functions,
4 functions which return summary type information about alignments should
5 be put into classes in this module.
6
7 classes:
8 o SummaryInfo
9 o PSSM
10 """
11
12
13 import math
14 import sys
15
16
17 from Bio import Alphabet
18 from Bio.Alphabet import IUPAC
19 from Bio.Seq import Seq
20 from Bio.SubsMat import FreqTable
21
22
23
24 Protein20Random = 0.05
25 Nucleotide4Random = 0.25
27 """Calculate summary info about the alignment.
28
29 This class should be used to caclculate information summarizing the
30 results of an alignment. This may either be straight consensus info
31 or more complicated things.
32 """
34 """Initialize with the alignment to calculate information on.
35 ic_vector attribute. A dictionary. Keys: column numbers. Values:
36 """
37 self.alignment = alignment
38 self.ic_vector = {}
39
40 - def dumb_consensus(self, threshold = .7, ambiguous = "X",
41 consensus_alpha = None, require_multiple = 0):
42 """Output a fast consensus sequence of the alignment.
43
44 This doesn't do anything fancy at all. It will just go through the
45 sequence residue by residue and count up the number of each type
46 of residue (ie. A or G or T or C for DNA) in all sequences in the
47 alignment. If the percentage of the most common residue type is
48 greater then the passed threshold, then we will add that residue type,
49 otherwise an ambiguous character will be added.
50
51 This could be made a lot fancier (ie. to take a substitution matrix
52 into account), but it just meant for a quick and dirty consensus.
53
54 Arguments:
55 o threshold - The threshold value that is required to add a particular
56 atom.
57 o ambiguous - The ambiguous character to be added when the threshold is
58 not reached.
59 o consensus_alpha - The alphabet to return for the consensus sequence.
60 If this is None, then we will try to guess the alphabet.
61 o require_multiple - If set as 1, this will require that more than
62 1 sequence be part of an alignment to put it in the consensus (ie.
63 not just 1 sequence and gaps).
64 """
65
66 consensus = ''
67
68
69 con_len = self.alignment.get_alignment_length()
70
71
72 for n in range(con_len):
73
74 atom_dict = {}
75 num_atoms = 0
76
77 for record in self.alignment._records:
78
79
80 if n < len(record.seq):
81 if record.seq[n] != '-' and record.seq[n] != '.':
82 if record.seq[n] not in atom_dict.keys():
83 atom_dict[record.seq[n]] = 1
84 else:
85 atom_dict[record.seq[n]] += 1
86
87 num_atoms = num_atoms + 1
88
89 max_atoms = []
90 max_size = 0
91
92 for atom in atom_dict.keys():
93 if atom_dict[atom] > max_size:
94 max_atoms = [atom]
95 max_size = atom_dict[atom]
96 elif atom_dict[atom] == max_size:
97 max_atoms.append(atom)
98
99 if require_multiple and num_atoms == 1:
100 consensus += ambiguous
101 elif (len(max_atoms) == 1) and ((float(max_size)/float(num_atoms))
102 >= threshold):
103 consensus += max_atoms[0]
104 else:
105 consensus += ambiguous
106
107
108 if consensus_alpha is None:
109 consensus_alpha = self._guess_consensus_alphabet(ambiguous)
110
111 return Seq(consensus, consensus_alpha)
112
113 - def gap_consensus(self, threshold = .7, ambiguous = "X",
114 consensus_alpha = None, require_multiple = 0):
115 """Same as dumb_consensus(), but allows gap on the output.
116
117 Things to do: Let the user define that with only one gap, the result
118 character in consensus is gap. Let the user select gap character, now
119 it takes the same is input.
120 """
121
122 consensus = ''
123
124
125 con_len = self.alignment.get_alignment_length()
126
127
128 for n in range(con_len):
129
130 atom_dict = {}
131 num_atoms = 0
132
133 for record in self.alignment._records:
134
135
136 if n < len(record.seq):
137 if record.seq[n] not in atom_dict.keys():
138 atom_dict[record.seq[n]] = 1
139 else:
140 atom_dict[record.seq[n]] += 1
141
142 num_atoms += 1
143
144 max_atoms = []
145 max_size = 0
146
147 for atom in atom_dict.keys():
148 if atom_dict[atom] > max_size:
149 max_atoms = [atom]
150 max_size = atom_dict[atom]
151 elif atom_dict[atom] == max_size:
152 max_atoms.append(atom)
153
154 if require_multiple and num_atoms == 1:
155 consensus += ambiguous
156 elif (len(max_atoms) == 1) and ((float(max_size)/float(num_atoms))
157 >= threshold):
158 consensus += max_atoms[0]
159 else:
160 consensus += ambiguous
161
162
163 if consensus_alpha is None:
164
165 consensus_alpha = self._guess_consensus_alphabet(ambiguous)
166
167 return Seq(consensus, consensus_alpha)
168
210
212 """Generate a replacement dictionary to plug into a substitution matrix
213
214 This should look at an alignment, and be able to generate the number
215 of substitutions of different residues for each other in the
216 aligned object.
217
218 Will then return a dictionary with this information:
219 {('A', 'C') : 10, ('C', 'A') : 12, ('G', 'C') : 15 ....}
220
221 This also treats weighted sequences. The following example shows how
222 we calculate the replacement dictionary. Given the following
223 multiple sequence alignments:
224
225 GTATC 0.5
226 AT--C 0.8
227 CTGTC 1.0
228
229 For the first column we have:
230 ('A', 'G') : 0.5 * 0.8 = 0.4
231 ('C', 'G') : 0.5 * 1.0 = 0.5
232 ('A', 'C') : 0.8 * 1.0 = 0.8
233
234 We then continue this for all of the columns in the alignment, summing
235 the information for each substitution in each column, until we end
236 up with the replacement dictionary.
237
238 Arguments:
239 o skip_chars - A list of characters to skip when creating the dictionary.
240 For instance, you might have Xs (screened stuff) or Ns, and not want
241 to include the ambiguity characters in the dictionary.
242 """
243
244 rep_dict, skip_items = self._get_base_replacements(skip_chars)
245
246
247 for rec_num1 in range(len(self.alignment._records)):
248
249
250 for rec_num2 in range(rec_num1 + 1, len(self.alignment._records)):
251
252
253 rep_dict = self._pair_replacement(
254 self.alignment._records[rec_num1].seq,
255 self.alignment._records[rec_num2].seq,
256 self.alignment._records[rec_num1].annotations.get('weight',1.0),
257 self.alignment._records[rec_num2].annotations.get('weight',1.0),
258 rep_dict, skip_items)
259
260 return rep_dict
261
262 - def _pair_replacement(self, seq1, seq2, weight1, weight2,
263 start_dict, ignore_chars):
264 """Compare two sequences and generate info on the replacements seen.
265
266 Arguments:
267 o seq1, seq2 - The two sequences to compare.
268 o weight1, weight2 - The relative weights of seq1 and seq2.
269 o start_dict - The dictionary containing the starting replacement
270 info that we will modify.
271 o ignore_chars - A list of characters to ignore when calculating
272 replacements (ie. '-').
273
274 Returns:
275 o A replacment dictionary which is modified from initial_dict with
276 the information from the sequence comparison.
277 """
278
279 for residue_num in range(len(seq1)):
280 residue1 = seq1[residue_num]
281 try:
282 residue2 = seq2[residue_num]
283
284
285 except IndexError:
286 return start_dict
287
288
289 if (residue1 not in ignore_chars) and (residue2 not in ignore_chars):
290 try:
291
292
293 start_dict[(residue1, residue2)] += weight1 * weight2
294
295
296 except KeyError:
297 raise ValueError("Residues %s, %s not found in alphabet %s"
298 % (residue1, residue2,
299 self.alignment._alphabet))
300
301 return start_dict
302
303
305 """Returns a string containing the expected letters in the alignment."""
306 all_letters = self.alignment._alphabet.letters
307 if all_letters is None \
308 or (isinstance(self.alignment._alphabet, Alphabet.Gapped) \
309 and all_letters == self.alignment._alphabet.gap_char):
310
311
312
313 set_letters = set()
314 for record in self.alignment:
315
316
317 set_letters = set_letters.union(record.seq)
318 list_letters = list(set_letters)
319 list_letters.sort()
320 all_letters = "".join(list_letters)
321 return all_letters
322
324 """Get a zeroed dictonary of all possible letter combinations.
325
326 This looks at the type of alphabet and gets the letters for it.
327 It then creates a dictionary with all possible combinations of these
328 letters as keys (ie. ('A', 'G')) and sets the values as zero.
329
330 Returns:
331 o The base dictionary created
332 o A list of alphabet items to skip when filling the dictionary.Right
333 now the only thing I can imagine in this list is gap characters, but
334 maybe X's or something else might be useful later. This will also
335 include any characters that are specified to be skipped.
336 """
337 base_dictionary = {}
338 all_letters = self._get_all_letters()
339
340
341
342 if isinstance(self.alignment._alphabet, Alphabet.Gapped):
343 skip_items.append(self.alignment._alphabet.gap_char)
344 all_letters = all_letters.replace(self.alignment._alphabet.gap_char,'')
345
346
347 for first_letter in all_letters:
348 for second_letter in all_letters:
349 if (first_letter not in skip_items and
350 second_letter not in skip_items):
351 base_dictionary[(first_letter, second_letter)] = 0
352
353 return base_dictionary, skip_items
354
355
358 """Create a position specific score matrix object for the alignment.
359
360 This creates a position specific score matrix (pssm) which is an
361 alternative method to look at a consensus sequence.
362
363 Arguments:
364 o chars_to_ignore - A listing of all characters not to include in
365 the pssm. If the alignment alphabet declares a gap character,
366 then it will be excluded automatically.
367 o axis_seq - An optional argument specifying the sequence to
368 put on the axis of the PSSM. This should be a Seq object. If nothing
369 is specified, the consensus sequence, calculated with default
370 parameters, will be used.
371
372 Returns:
373 o A PSSM (position specific score matrix) object.
374 """
375
376 all_letters = self._get_all_letters()
377 assert all_letters
378
379 if not isinstance(chars_to_ignore, list):
380 raise TypeError("chars_to_ignore should be a list.")
381
382
383 if isinstance(self.alignment._alphabet, Alphabet.Gapped):
384 chars_to_ignore.append(self.alignment._alphabet.gap_char)
385
386 for char in chars_to_ignore:
387 all_letters = all_letters.replace(char, '')
388
389 if axis_seq:
390 left_seq = axis_seq
391 assert len(axis_seq) == self.alignment.get_alignment_length()
392 else:
393 left_seq = self.dumb_consensus()
394
395 pssm_info = []
396
397 for residue_num in range(len(left_seq)):
398 score_dict = self._get_base_letters(all_letters)
399 for record in self.alignment._records:
400 try:
401 this_residue = record.seq[residue_num]
402
403
404 except IndexError:
405 this_residue = None
406
407 if this_residue and this_residue not in chars_to_ignore:
408 weight = record.annotations.get('weight', 1.0)
409 try:
410 score_dict[this_residue] += weight
411
412 except KeyError:
413 raise ValueError("Residue %s not found in alphabet %s"
414 % (this_residue,
415 self.alignment._alphabet))
416
417 pssm_info.append((left_seq[residue_num],
418 score_dict))
419
420
421 return PSSM(pssm_info)
422
424 """Create a zeroed dictionary with all of the specified letters.
425 """
426 base_info = {}
427 for letter in letters:
428 base_info[letter] = 0
429
430 return base_info
431
432 - def information_content(self, start = 0,
433 end = None,
434 e_freq_table = None, log_base = 2,
435 chars_to_ignore = []):
436 """Calculate the information content for each residue along an alignment.
437
438 Arguments:
439 o start, end - The starting an ending points to calculate the
440 information content. These points should be relative to the first
441 sequence in the alignment, starting at zero (ie. even if the 'real'
442 first position in the seq is 203 in the initial sequence, for
443 the info content, we need to use zero). This defaults to the entire
444 length of the first sequence.
445 o e_freq_table - A FreqTable object specifying the expected frequencies
446 for each letter in the alphabet we are using (e.g. {'G' : 0.4,
447 'C' : 0.4, 'T' : 0.1, 'A' : 0.1}). Gap characters should not be
448 included, since these should not have expected frequencies.
449 o log_base - The base of the logathrim to use in calculating the
450 information content. This defaults to 2 so the info is in bits.
451 o chars_to_ignore - A listing of characterw which should be ignored
452 in calculating the info content.
453
454 Returns:
455 o A number representing the info content for the specified region.
456
457 Please see the Biopython manual for more information on how information
458 content is calculated.
459 """
460
461 if end is None:
462 end = len(self.alignment._records[0].seq)
463
464 if start < 0 or end > len(self.alignment._records[0].seq):
465 raise ValueError \
466 ("Start (%s) and end (%s) are not in the range %s to %s"
467 % (start, end, 0, len(self.alignment._records[0].seq)))
468
469 random_expected = None
470 if not e_freq_table:
471
472 base_alpha = Alphabet._get_base_alphabet(self.alignment._alphabet)
473 if isinstance(base_alpha, Alphabet.ProteinAlphabet):
474 random_expected = Protein20Random
475 elif isinstance(base_alpha, Alphabet.NucleotideAlphabet):
476 random_expected = Nucleotide4Random
477 else:
478 errstr = "Error in alphabet: not Nucleotide or Protein, "
479 errstr += "supply expected frequencies"
480 raise ValueError(errstr)
481 del base_alpha
482 elif not isinstance(e_freq_table, FreqTable.FreqTable):
483 raise ValueError("e_freq_table should be a FreqTable object")
484
485
486
487 all_letters = self._get_all_letters()
488 for char in chars_to_ignore:
489 all_letters = all_letters.replace(char, '')
490
491 info_content = {}
492 for residue_num in range(start, end):
493 freq_dict = self._get_letter_freqs(residue_num,
494 self.alignment._records,
495 all_letters, chars_to_ignore)
496
497 column_score = self._get_column_info_content(freq_dict,
498 e_freq_table,
499 log_base,
500 random_expected)
501
502 info_content[residue_num] = column_score
503
504 total_info = 0
505 for column_info in info_content.values():
506 total_info += column_info
507
508 for i in info_content.keys():
509 self.ic_vector[i] = info_content[i]
510 return total_info
511
513 """Determine the frequency of specific letters in the alignment.
514
515 Arguments:
516 o residue_num - The number of the column we are getting frequencies
517 from.
518 o all_records - All of the SeqRecords in the alignment.
519 o letters - The letters we are interested in getting the frequency
520 for.
521 o to_ignore - Letters we are specifically supposed to ignore.
522
523 This will calculate the frequencies of each of the specified letters
524 in the alignment at the given frequency, and return this as a
525 dictionary where the keys are the letters and the values are the
526 frequencies.
527 """
528 freq_info = self._get_base_letters(letters)
529
530 total_count = 0
531
532 for record in all_records:
533 try:
534 if record.seq[residue_num] not in to_ignore:
535 weight = record.annotations.get('weight',1.0)
536 freq_info[record.seq[residue_num]] += weight
537 total_count += weight
538
539 except KeyError:
540 raise ValueError("Residue %s not found in alphabet %s"
541 % (record.seq[residue_num],
542 self.alignment._alphabet))
543
544 if total_count == 0:
545
546 for letter in freq_info.keys():
547 assert freq_info[letter] == 0
548
549 else:
550
551 for letter in freq_info.keys():
552 freq_info[letter] = freq_info[letter] / total_count
553
554 return freq_info
555
556 - def _get_column_info_content(self, obs_freq, e_freq_table, log_base,
557 random_expected):
558 """Calculate the information content for a column.
559
560 Arguments:
561 o obs_freq - The frequencies observed for each letter in the column.
562 o e_freq_table - An optional argument specifying the expected
563 frequencies for each letter. This is a SubsMat.FreqTable instance.
564 o log_base - The base of the logathrim to use in calculating the
565 info content.
566 """
567 try:
568 gap_char = self.alignment._alphabet.gap_char
569 except AttributeError:
570
571
572 gap_char = "-"
573
574 if e_freq_table:
575 if not isinstance(e_freq_table, FreqTable.FreqTable):
576 raise ValueError("e_freq_table should be a FreqTable object")
577
578 for key in obs_freq.keys():
579 if (key != gap_char and key not in e_freq_table):
580 raise ValueError("Expected frequency letters %s "
581 "do not match observed %s" \
582 % (e_freq_table.keys(),
583 obs_freq.keys() - [gap_char]))
584
585 total_info = 0.0
586
587 for letter in obs_freq.keys():
588 inner_log = 0.0
589
590
591
592 if letter != gap_char:
593 if e_freq_table:
594 inner_log = obs_freq[letter] / e_freq_table[letter]
595 else:
596 inner_log = obs_freq[letter] / random_expected
597
598
599 if inner_log > 0:
600 letter_info = (obs_freq[letter] *
601 math.log(inner_log) / math.log(log_base))
602 total_info += letter_info
603 return total_info
604
607
609 """Represent a position specific score matrix.
610
611 This class is meant to make it easy to access the info within a PSSM
612 and also make it easy to print out the information in a nice table.
613
614 Let's say you had an alignment like this:
615 GTATC
616 AT--C
617 CTGTC
618
619 The position specific score matrix (when printed) looks like:
620
621 G A T C
622 G 1 1 0 1
623 T 0 0 3 0
624 A 1 1 0 0
625 T 0 0 2 0
626 C 0 0 0 3
627
628 You can access a single element of the PSSM using the following:
629
630 your_pssm[sequence_number][residue_count_name]
631
632 For instance, to get the 'T' residue for the second element in the
633 above alignment you would need to do:
634
635 your_pssm[1]['T']
636 """
638 """Initialize with pssm data to represent.
639
640 The pssm passed should be a list with the following structure:
641
642 list[0] - The letter of the residue being represented (for instance,
643 from the example above, the first few list[0]s would be GTAT...
644 list[1] - A dictionary with the letter substitutions and counts.
645 """
646 self.pssm = pssm
647
649 return self.pssm[pos][1]
650
652 out = " "
653 all_residues = self.pssm[0][1].keys()
654 all_residues.sort()
655
656
657 for res in all_residues:
658 out += " %s" % res
659 out += "\n"
660
661
662 for item in self.pssm:
663 out += "%s " % item[0]
664 for res in all_residues:
665 out += " %.1f" % item[1][res]
666
667 out += "\n"
668 return out
669
671 """Return the residue letter at the specified position.
672 """
673 return self.pssm[pos][0]
674
675
676 -def print_info_content(summary_info,fout=None,rep_record=0):
677 """ Three column output: position, aa in representative sequence,
678 ic_vector value"""
679 fout = fout or sys.stdout
680 if not summary_info.ic_vector:
681 summary_info.information_content()
682 rep_sequence = summary_info.alignment._records[rep_record].seq
683 positions = summary_info.ic_vector.keys()
684 positions.sort()
685 for pos in positions:
686 fout.write("%d %s %.3f\n" % (pos, rep_sequence[pos],
687 summary_info.ic_vector[pos]))
688
689 if __name__ == "__main__":
690 print "Quick test"
691 from Bio import AlignIO
692 from Bio.Align.Generic import Alignment
693
694 filename = "../../Tests/GFF/multi.fna"
695 format = "fasta"
696 expected = FreqTable.FreqTable({"A":0.25,"G":0.25,"T":0.25,"C":0.25},
697 FreqTable.FREQ,
698 IUPAC.unambiguous_dna)
699
700 alignment = AlignIO.read(open(filename), format)
701 for record in alignment:
702 print record.seq.tostring()
703 print "="*alignment.get_alignment_length()
704
705 summary = SummaryInfo(alignment)
706 consensus = summary.dumb_consensus(ambiguous="N")
707 print consensus
708 consensus = summary.gap_consensus(ambiguous="N")
709 print consensus
710 print
711 print summary.pos_specific_score_matrix(chars_to_ignore=['-'],
712 axis_seq=consensus)
713 print
714
715
716 print summary.information_content(e_freq_table=expected,
717 chars_to_ignore=['-'])
718 print
719 print "Trying a protein sequence with gaps and stops"
720
721 alpha = Alphabet.HasStopCodon(Alphabet.Gapped(Alphabet.generic_protein, "-"), "*")
722 a = Alignment(alpha)
723 a.add_sequence("ID001", "MHQAIFIYQIGYP*LKSGYIQSIRSPEYDNW-")
724 a.add_sequence("ID002", "MH--IFIYQIGYAYLKSGYIQSIRSPEY-NW*")
725 a.add_sequence("ID003", "MHQAIFIYQIGYPYLKSGYIQSIRSPEYDNW*")
726 print a
727 print "="*a.get_alignment_length()
728
729 s = SummaryInfo(a)
730 c = s.dumb_consensus(ambiguous="X")
731 print c
732 c = s.gap_consensus(ambiguous="X")
733 print c
734 print
735 print s.pos_specific_score_matrix(chars_to_ignore=['-', '*'], axis_seq=c)
736
737 print s.information_content(chars_to_ignore=['-', '*'])
738
739
740 print "Done"
741