Package Bio :: Package SeqUtils :: Module lcc
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqUtils.lcc

  1  # Copyright 2003, 2007 by Sebastian Bassi. sbassi@genesdigitales.com 
  2  # All rights reserved.  This code is part of the Biopython  
  3  # distribution and governed by its license. 
  4  # Please see the LICENSE file that should have been included as part 
  5  # of this package. 
  6   
  7  import math 
  8   
9 -def lcc_mult(seq,wsize):
10 """Local Composition Complexity (LCC) values over sliding window. 11 12 Returns a list of floats, the LCC values for a sliding window over 13 the sequence. 14 15 seq - an unambiguous DNA sequence (a string or Seq object) 16 wsize - window size, integer 17 18 The result is the same as applying lcc_simp multiple times, but this 19 version is optimized for speed. The optimization works by using the 20 value of previous window as a base to compute the next one.""" 21 l2=math.log(2) 22 tamseq=len(seq) 23 try: 24 #Assume its a string 25 upper = seq.upper() 26 except AttributeError: 27 #Should be a Seq object then 28 upper = seq.tostring().upper() 29 compone=[0] 30 lccsal=[0] 31 for i in range(wsize): 32 compone.append(((i+1)/float(wsize))* 33 ((math.log((i+1)/float(wsize)))/l2)) 34 window=seq[0:wsize] 35 cant_a=window.count('A') 36 cant_c=window.count('C') 37 cant_t=window.count('T') 38 cant_g=window.count('G') 39 term_a=compone[cant_a] 40 term_c=compone[cant_c] 41 term_t=compone[cant_t] 42 term_g=compone[cant_g] 43 lccsal.append(-(term_a+term_c+term_t+term_g)) 44 tail=seq[0] 45 for x in range (tamseq-wsize): 46 window=upper[x+1:wsize+x+1] 47 if tail==window[-1]: 48 lccsal.append(lccsal[-1]) 49 elif tail=='A': 50 cant_a=cant_a-1 51 if window.endswith('C'): 52 cant_c=cant_c+1 53 term_a=compone[cant_a] 54 term_c=compone[cant_c] 55 lccsal.append(-(term_a+term_c+term_t+term_g)) 56 elif window.endswith('T'): 57 cant_t=cant_t+1 58 term_a=compone[cant_a] 59 term_t=compone[cant_t] 60 lccsal.append(-(term_a+term_c+term_t+term_g)) 61 elif window.endswith('G'): 62 cant_g=cant_g+1 63 term_a=compone[cant_a] 64 term_g=compone[cant_g] 65 lccsal.append(-(term_a+term_c+term_t+term_g)) 66 elif tail=='C': 67 cant_c=cant_c-1 68 if window.endswith('A'): 69 cant_a=cant_a+1 70 term_a=compone[cant_a] 71 term_c=compone[cant_c] 72 lccsal.append(-(term_a+term_c+term_t+term_g)) 73 elif window.endswith('T'): 74 cant_t=cant_t+1 75 term_c=compone[cant_c] 76 term_t=compone[cant_t] 77 lccsal.append(-(term_a+term_c+term_t+term_g)) 78 elif window.endswith('G'): 79 cant_g=cant_g+1 80 term_c=compone[cant_c] 81 term_g=compone[cant_g] 82 lccsal.append(-(term_a+term_c+term_t+term_g)) 83 elif tail=='T': 84 cant_t=cant_t-1 85 if window.endswith('A'): 86 cant_a=cant_a+1 87 term_a=compone[cant_a] 88 term_t=compone[cant_t] 89 lccsal.append(-(term_a+term_c+term_t+term_g)) 90 elif window.endswith('C'): 91 cant_c=cant_c+1 92 term_c=compone[cant_c] 93 term_t=compone[cant_t] 94 lccsal.append(-(term_a+term_c+term_t+term_g)) 95 elif window.endswith('G'): 96 cant_g=cant_g+1 97 term_t=compone[cant_t] 98 term_g=compone[cant_g] 99 lccsal.append(-(term_a+term_c+term_t+term_g)) 100 elif tail=='G': 101 cant_g=cant_g-1 102 if window.endswith('A'): 103 cant_a=cant_a+1 104 term_a=compone[cant_a] 105 term_g=compone[cant_g] 106 lccsal.append(-(term_a+term_c+term_t+term_g)) 107 elif window.endswith('C'): 108 cant_c=cant_c+1 109 term_c=compone[cant_c] 110 term_g=compone[cant_g] 111 lccsal.append(-(term_a+term_c+term_t+term_g)) 112 elif window.endswith('T'): 113 cant_t=cant_t+1 114 term_t=compone[cant_t] 115 term_g=compone[cant_g] 116 lccsal.append(-(term_a+term_c+term_t+term_g)) 117 tail=window[0] 118 return lccsal
119
120 -def lcc_simp(seq):
121 """Local Composition Complexity (LCC) for a sequence. 122 123 seq - an unambiguous DNA sequence (a string or Seq object) 124 125 Returns the Local Composition Complexity (LCC) value for the entire 126 sequence (as a float). 127 128 Reference: 129 Andrzej K Konopka (2005) Sequence Complexity and Composition 130 DOI: 10.1038/npg.els.0005260 131 """ 132 wsize=len(seq) 133 try: 134 #Assume its a string 135 upper = seq.upper() 136 except AttributeError: 137 #Should be a Seq object then 138 upper = seq.tostring().upper() 139 l2=math.log(2) 140 if 'A' not in seq: 141 term_a=0 142 # Check to avoid calculating the log of 0. 143 else: 144 term_a=((upper.count('A'))/float(wsize))*((math.log((upper.count('A')) 145 /float(wsize)))/l2) 146 if 'C' not in seq: 147 term_c=0 148 else: 149 term_c=((upper.count('C'))/float(wsize))*((math.log((upper.count('C')) 150 /float(wsize)))/l2) 151 if 'T' not in seq: 152 term_t=0 153 else: 154 term_t=((upper.count('T'))/float(wsize))*((math.log((upper.count('T')) 155 /float(wsize)))/l2) 156 if 'G' not in seq: 157 term_g=0 158 else: 159 term_g=((upper.count('G'))/float(wsize))*((math.log((upper.count('G')) 160 /float(wsize)))/l2) 161 lccsal=-(term_a+term_c+term_t+term_g) 162 return lccsal
163