1
2
3
4
5
6
7
8
9
10 """Miscellaneous functions for dealing with sequences."""
11
12 import re, time
13 from Bio import SeqIO
14 from Bio.Seq import Seq
15 from Bio import Alphabet
16 from Bio.Alphabet import IUPAC
17 from Bio.Data import IUPACData, CodonTable
18
19
20
21
22
23
24
26 """Reverse the sequence. Works on string sequences (DEPRECATED).
27
28 This function is now deprecated, instead use the string's built in slice
29 method with a step of minus one:
30
31 >>> "ACGGT"[::-1]
32 'TGGCA'
33 """
34 import warnings
35 warnings.warn("Bio.SeqUtils.reverse() is deprecated, use the string's "
36 "slice method with a step of minus one instead.",
37 DeprecationWarning)
38 r = list(seq)
39 r.reverse()
40 return ''.join(r)
41
43 """Calculates G+C content, returns the percentage (float between 0 and 100).
44
45 Copes mixed case seuqneces, and with the ambiguous nucleotide S (G or C)
46 when counting the G and C content. The percentage is calculated against
47 the full length, e.g.:
48
49 >>> from Bio.SeqUtils import GC
50 >>> GC("ACTGN")
51 40.0
52
53 Note that this will return zero for an empty sequence.
54 """
55 try:
56 gc = sum(map(seq.count,['G','C','g','c','S','s']))
57 return gc*100.0/len(seq)
58 except ZeroDivisionError:
59 return 0.0
60
61
63 """Calculates total G+C content plus first, second and third positions.
64
65 Returns a tuple of four floats (percentages between 0 and 100) for the
66 entire sequence, and the three codon positions. e.g.
67
68 >>> from Bio.SeqUtils import GC123
69 >>> GC123("ACTGTN")
70 (40.0, 50.0, 50.0, 0.0)
71
72 Copes with mixed case sequences, but does NOT deal with ambiguous
73 nucleotides.
74 """
75 d= {}
76 for nt in ['A','T','G','C']:
77 d[nt] = [0,0,0]
78
79 for i in range(0,len(seq),3):
80 codon = seq[i:i+3]
81 if len(codon) <3: codon += ' '
82 for pos in range(0,3):
83 for nt in ['A','T','G','C']:
84 if codon[pos] == nt or codon[pos] == nt.lower():
85 d[nt][pos] += 1
86 gc = {}
87 gcall = 0
88 nall = 0
89 for i in range(0,3):
90 try:
91 n = d['G'][i] + d['C'][i] +d['T'][i] + d['A'][i]
92 gc[i] = (d['G'][i] + d['C'][i])*100.0/n
93 except:
94 gc[i] = 0
95
96 gcall = gcall + d['G'][i] + d['C'][i]
97 nall = nall + n
98
99 gcall = 100.0*gcall/nall
100 return gcall, gc[0], gc[1], gc[2]
101
103 """Calculates GC skew (G-C)/(G+C) for multuple windows along the sequence.
104
105 Returns a list of ratios (floats), controlled by the length of the sequence
106 and the size of the window.
107
108 Does NOT look at any ambiguous nucleotides.
109 """
110
111 values = []
112 for i in range(0, len(seq), window):
113 s = seq[i: i + window]
114 g = s.count('G') + s.count('g')
115 c = s.count('C') + s.count('c')
116 skew = (g-c)/float(g+c)
117 values.append(skew)
118 return values
119
120 from math import pi, sin, cos, log
121 -def xGC_skew(seq, window = 1000, zoom = 100,
122 r = 300, px = 100, py = 100):
123 """Calculates and plots normal and accumulated GC skew (GRAPHICS !!!)."""
124 from Tkinter import Scrollbar, Canvas, BOTTOM, BOTH, ALL, \
125 VERTICAL, HORIZONTAL, RIGHT, LEFT, X, Y
126 yscroll = Scrollbar(orient = VERTICAL)
127 xscroll = Scrollbar(orient = HORIZONTAL)
128 canvas = Canvas(yscrollcommand = yscroll.set,
129 xscrollcommand = xscroll.set, background = 'white')
130 win = canvas.winfo_toplevel()
131 win.geometry('700x700')
132
133 yscroll.config(command = canvas.yview)
134 xscroll.config(command = canvas.xview)
135 yscroll.pack(side = RIGHT, fill = Y)
136 xscroll.pack(side = BOTTOM, fill = X)
137 canvas.pack(fill=BOTH, side = LEFT, expand = 1)
138 canvas.update()
139
140 X0, Y0 = r + px, r + py
141 x1, x2, y1, y2 = X0 - r, X0 + r, Y0 -r, Y0 + r
142
143 ty = Y0
144 canvas.create_text(X0, ty, text = '%s...%s (%d nt)' % (seq[:7], seq[-7:], len(seq)))
145 ty +=20
146 canvas.create_text(X0, ty, text = 'GC %3.2f%%' % (GC(seq)))
147 ty +=20
148 canvas.create_text(X0, ty, text = 'GC Skew', fill = 'blue')
149 ty +=20
150 canvas.create_text(X0, ty, text = 'Accumulated GC Skew', fill = 'magenta')
151 ty +=20
152 canvas.create_oval(x1,y1, x2, y2)
153
154 acc = 0
155 start = 0
156 for gc in GC_skew(seq, window):
157 r1 = r
158 acc+=gc
159
160 alpha = pi - (2*pi*start)/len(seq)
161 r2 = r1 - gc*zoom
162 x1 = X0 + r1 * sin(alpha)
163 y1 = Y0 + r1 * cos(alpha)
164 x2 = X0 + r2 * sin(alpha)
165 y2 = Y0 + r2 * cos(alpha)
166 canvas.create_line(x1,y1,x2,y2, fill = 'blue')
167
168 r1 = r - 50
169 r2 = r1 - acc
170 x1 = X0 + r1 * sin(alpha)
171 y1 = Y0 + r1 * cos(alpha)
172 x2 = X0 + r2 * sin(alpha)
173 y2 = Y0 + r2 * cos(alpha)
174 canvas.create_line(x1,y1,x2,y2, fill = 'magenta')
175
176 canvas.update()
177 start += window
178
179 canvas.configure(scrollregion = canvas.bbox(ALL))
180
186
188 """Search for a DNA subseq in sequence.
189
190 use ambiguous values (like N = A or T or C or G, R = A or G etc.)
191 searches only on forward strand
192 """
193 pattern = ''
194 for nt in subseq:
195 value = IUPACData.ambiguous_dna_values[nt]
196 if len(value) == 1:
197 pattern += value
198 else:
199 pattern += '[%s]' % value
200
201 pos = -1
202 result = [pattern]
203 l = len(seq)
204 while True:
205 pos+=1
206 s = seq[pos:]
207 m = re.search(pattern, s)
208 if not m: break
209 pos += int(m.start(0))
210 result.append(pos)
211 return result
212
213
214
215
216
217
218
219
220
221
222
223 -class ProteinX(Alphabet.ProteinAlphabet):
226
227
228 proteinX = ProteinX()
229
232 self._table = table
233 import warnings
234 warnings.warn("Function Bio.SeqUtils.makeTableX() and related classes ProteinX "
235 "and MissingTable are deprecated.", DeprecationWarning)
236 - def get(self, codon, stop_symbol):
241
248
249
250
252 """Turn a one letter code protein sequence into one with three letter codes.
253
254 The single input argument 'seq' should be a protein sequence using single
255 letter codes, either as a python string or as a Seq or MutableSeq object.
256
257 This function returns the amino acid sequence as a string using the three
258 letter amino acid codes. Output follows the IUPAC standard (including
259 ambiguous characters B for "Asx", J for "Xle" and X for "Xaa", and also U
260 for "Sel" and O for "Pyl") plus "Ter" for a terminator given as an asterisk. Any unknown
261 character (including possible gap characters), is changed into 'Xaa'.
262
263 e.g.
264 >>> from Bio.SeqUtils import seq3
265 >>> seq3("MAIVMGRWKGAR*")
266 'MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer'
267
268 This function was inspired by BioPerl's seq3.
269 """
270 threecode = {'A':'Ala', 'B':'Asx', 'C':'Cys', 'D':'Asp',
271 'E':'Glu', 'F':'Phe', 'G':'Gly', 'H':'His',
272 'I':'Ile', 'K':'Lys', 'L':'Leu', 'M':'Met',
273 'N':'Asn', 'P':'Pro', 'Q':'Gln', 'R':'Arg',
274 'S':'Ser', 'T':'Thr', 'V':'Val', 'W':'Trp',
275 'Y':'Tyr', 'Z':'Glx', 'X':'Xaa', '*':'Ter',
276 'U':'Sel', 'O':'Pyl', 'J':'Xle',
277 }
278
279
280 return ''.join([threecode.get(aa,'Xaa') for aa in seq])
281
282
283
284
285
286
287
288
289
291 """Just an alias for six_frame_translations (OBSOLETE).
292
293 Use six_frame_translation directly, as this function may be deprecated
294 in a future release."""
295 return six_frame_translations(seq, genetic_code)
296
298 """Formatted string showing the 6 frame translations and GC content.
299
300 nice looking 6 frame translation with GC content - code from xbbtools
301 similar to DNA Striders six-frame translation
302
303 e.g.
304 from Bio.SeqUtils import six_frame_translations
305 print six_frame_translations("AUGGCCAUUGUAAUGGGCCGCUGA")
306 """
307 from Bio.Seq import reverse_complement, translate
308 anti = reverse_complement(seq)
309 comp = anti[::-1]
310 length = len(seq)
311 frames = {}
312 for i in range(0,3):
313 frames[i+1] = translate(seq[i:], genetic_code)
314 frames[-(i+1)] = reverse(translate(anti[i:], genetic_code))
315
316
317 if length > 20:
318 short = '%s ... %s' % (seq[:10], seq[-10:])
319 else:
320 short = seq
321
322 date = time.strftime('%y %b %d, %X', time.localtime(time.time()))
323 header = 'GC_Frame: %s, ' % date
324 for nt in ['a','t','g','c']:
325 header += '%s:%d ' % (nt, seq.count(nt.upper()))
326
327 header += '\nSequence: %s, %d nt, %0.2f %%GC\n\n\n' % (short.lower(),length, GC(seq))
328 res = header
329
330 for i in range(0,length,60):
331 subseq = seq[i:i+60]
332 csubseq = comp[i:i+60]
333 p = i/3
334 res = res + '%d/%d\n' % (i+1, i/3+1)
335 res = res + ' ' + ' '.join(map(None,frames[3][p:p+20])) + '\n'
336 res = res + ' ' + ' '.join(map(None,frames[2][p:p+20])) + '\n'
337 res = res + ' '.join(map(None,frames[1][p:p+20])) + '\n'
338
339 res = res + subseq.lower() + '%5d %%\n' % int(GC(subseq))
340 res = res + csubseq.lower() + '\n'
341
342 res = res + ' '.join(map(None,frames[-2][p:p+20])) +' \n'
343 res = res + ' ' + ' '.join(map(None,frames[-1][p:p+20])) + '\n'
344 res = res + ' ' + ' '.join(map(None,frames[-3][p:p+20])) + '\n\n'
345 return res
346
347
348
349
350
351
352
353
355 """Checks and changes the name/ID's to be unique identifiers by adding numbers (OBSOLETE).
356
357 file - a FASTA format filename to read in.
358
359 No return value, the output is written to screen.
360 """
361 dict = {}
362 txt = open(file).read()
363 entries = []
364 for entry in txt.split('>')[1:]:
365 name, seq= entry.split('\n',1)
366 name = name.split()[0].split(',')[0]
367
368 if name in dict:
369 n = 1
370 while 1:
371 n = n + 1
372 _name = name + str(n)
373 if _name not in dict:
374 name = _name
375 break
376
377 dict[name] = seq
378
379 for name, seq in dict.items():
380 print '>%s\n%s' % (name, seq)
381
383 """Simple FASTA reader, returning a list of string tuples.
384
385 The single argument 'file' should be the filename of a FASTA format file.
386 This function will open and read in the entire file, constructing a list
387 of all the records, each held as a tuple of strings (the sequence name or
388 title, and its sequence).
389
390 This function was originally intended for use on large files, where its
391 low overhead makes it very fast. However, because it returns the data as
392 a single in memory list, this can require a lot of RAM on large files.
393
394 You are generally encouraged to use Bio.SeqIO.parse(handle, "fasta") which
395 allows you to iterate over the records one by one (avoiding having all the
396 records in memory at once). Using Bio.SeqIO also makes it easy to switch
397 between different input file formats. However, please note that rather
398 than simple strings, Bio.SeqIO uses SeqRecord objects for each record.
399 """
400
401
402
403 handle = open(file)
404 txt = "\n" + handle.read()
405 handle.close()
406 entries = []
407 for entry in txt.split('\n>')[1:]:
408 name,seq= entry.split('\n',1)
409 seq = seq.replace('\n','').replace(' ','').upper()
410 entries.append((name, seq))
411 return entries
412
414 """Apply a function on each sequence in a multiple FASTA file (OBSOLETE).
415
416 file - filename of a FASTA format file
417 function - the function you wish to invoke on each record
418 *args - any extra arguments you want passed to the function
419
420 This function will iterate over each record in a FASTA file as SeqRecord
421 objects, calling your function with the record (and supplied args) as
422 arguments.
423
424 This function returns a list. For those records where your function
425 returns a value, this is taken as a sequence and used to construct a
426 FASTA format string. If your function never has a return value, this
427 means apply_on_multi_fasta will return an empty list.
428 """
429 try:
430 f = globals()[function]
431 except:
432 raise NotImplementedError("%s not implemented" % function)
433
434 handle = open(file, 'r')
435 records = SeqIO.parse(handle, "fasta")
436 results = []
437 for record in records:
438 arguments = [record.sequence]
439 for arg in args: arguments.append(arg)
440 result = f(*arguments)
441 if result:
442 results.append('>%s\n%s' % (record.name, result))
443 handle.close()
444 return results
445
447 """Apply a function on each sequence in a multiple FASTA file (OBSOLETE).
448
449 file - filename of a FASTA format file
450 function - the function you wish to invoke on each record
451 *args - any extra arguments you want passed to the function
452
453 This function will use quick_FASTA_reader to load every record in the
454 FASTA file into memory as a list of tuples. For each record, it will
455 call your supplied function with the record as a tuple of the name and
456 sequence as strings (plus any supplied args).
457
458 This function returns a list. For those records where your function
459 returns a value, this is taken as a sequence and used to construct a
460 FASTA format string. If your function never has a return value, this
461 means quicker_apply_on_multi_fasta will return an empty list.
462 """
463 try:
464 f = globals()[function]
465 except:
466 raise NotImplementedError("%s not implemented" % function)
467
468 entries = quick_FASTA_reader(file)
469 results = []
470 for name, seq in entries:
471 arguments = [seq]
472 for arg in args: arguments.append(arg)
473 result = f(*arguments)
474 if result:
475 results.append('>%s\n%s' % (name, result))
476 file.close()
477 return results
478
479
480
481
482
483
484
485
486 if __name__ == '__main__':
487 import sys, getopt
488
489 options = {'apply_on_multi_fasta':0,
490 'quick':0,
491 'uniq_ids':0,
492 }
493
494 optlist, args = getopt.getopt(sys.argv[1:], '', ['describe', 'apply_on_multi_fasta=',
495 'help', 'quick', 'uniq_ids', 'search='])
496 for arg in optlist:
497 if arg[0] in ['-h', '--help']:
498 pass
499 elif arg[0] in ['--describe']:
500
501 mol_funcs = [x[0] for x in locals().items() if type(x[1]) == type(GC)]
502 mol_funcs.sort()
503 print 'available functions:'
504 for f in mol_funcs: print '\t--%s' % f
505 print '\n\ne.g.\n./sequtils.py --apply_on_multi_fasta GC test.fas'
506
507 sys.exit(0)
508 elif arg[0] in ['--apply_on_multi_fasta']:
509 options['apply_on_multi_fasta'] = arg[1]
510 elif arg[0] in ['--search']:
511 options['search'] = arg[1]
512 else:
513 key = re.search('-*(.+)', arg[0]).group(1)
514 options[key] = 1
515
516
517 if options.get('apply_on_multi_fasta'):
518 file = args[0]
519 function = options['apply_on_multi_fasta']
520 arguments = []
521 if options.get('search'):
522 arguments = options['search']
523 if function == 'xGC_skew':
524 arguments = 1000
525 if options.get('quick'):
526 results = quicker_apply_on_multi_fasta(file, function, arguments)
527 else:
528 results = apply_on_multi_fasta(file, function, arguments)
529 for result in results: print result
530
531 elif options.get('uniq_ids'):
532 file = args[0]
533 fasta_uniqids(file)
534
535
536