Package Bio :: Module Seq
[hide private]
[frames] | no frames]

Source Code for Module Bio.Seq

   1  # Copyright 2000-2002 Brad Chapman. 
   2  # Copyright 2004-2005 by M de Hoon. 
   3  # Copyright 2007-2009 by Peter Cock. 
   4  # All rights reserved. 
   5  # This code is part of the Biopython distribution and governed by its 
   6  # license.  Please see the LICENSE file that should have been included 
   7  # as part of this package. 
   8  """Provides objects to represent biological sequences with alphabets. 
   9   
  10  See also U{http://biopython.org/wiki/Seq} and the chapter in our tutorial: 
  11   - U{http://biopython.org/DIST/docs/tutorial/Tutorial.html} 
  12   - U{http://biopython.org/DIST/docs/tutorial/Tutorial.pdf} 
  13  """ 
  14  __docformat__ ="epytext en" #Don't just use plain text in epydoc API pages! 
  15   
  16  import string #for maketrans only 
  17  import array 
  18  import sys 
  19   
  20  import Alphabet 
  21  from Alphabet import IUPAC 
  22  from Bio.SeqRecord import SeqRecord 
  23  from Data.IUPACData import ambiguous_dna_complement, ambiguous_rna_complement 
  24  from Bio.Data import CodonTable 
25 26 -def _maketrans(complement_mapping):
27 """Makes a python string translation table (PRIVATE). 28 29 Arguments: 30 - complement_mapping - a dictionary such as ambiguous_dna_complement 31 and ambiguous_rna_complement from Data.IUPACData. 32 33 Returns a translation table (a string of length 256) for use with the 34 python string's translate method to use in a (reverse) complement. 35 36 Compatible with lower case and upper case sequences. 37 38 For internal use only. 39 """ 40 before = ''.join(complement_mapping.keys()) 41 after = ''.join(complement_mapping.values()) 42 before = before + before.lower() 43 after = after + after.lower() 44 return string.maketrans(before, after)
45 46 _dna_complement_table = _maketrans(ambiguous_dna_complement) 47 _rna_complement_table = _maketrans(ambiguous_rna_complement)
48 49 -class Seq(object):
50 """A read-only sequence object (essentially a string with an alphabet). 51 52 Like normal python strings, our basic sequence object is immutable. 53 This prevents you from doing my_seq[5] = "A" for example, but does allow 54 Seq objects to be used as dictionary keys. 55 56 The Seq object provides a number of string like methods (such as count, 57 find, split and strip), which are alphabet aware where appropriate. 58 59 The Seq object also provides some biological methods, such as complement, 60 reverse_complement, transcribe, back_transcribe and translate (which are 61 not applicable to sequences with a protein alphabet). 62 """
63 - def __init__(self, data, alphabet = Alphabet.generic_alphabet):
64 """Create a Seq object. 65 66 Arguments: 67 - seq - Sequence, required (string) 68 - alphabet - Optional argument, an Alphabet object from Bio.Alphabet 69 70 You will typically use Bio.SeqIO to read in sequences from files as 71 SeqRecord objects, whose sequence will be exposed as a Seq object via 72 the seq property. 73 74 However, will often want to create your own Seq objects directly: 75 76 >>> from Bio.Seq import Seq 77 >>> from Bio.Alphabet import IUPAC 78 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 79 ... IUPAC.protein) 80 >>> my_seq 81 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 82 >>> print my_seq 83 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 84 """ 85 # Enforce string storage 86 assert (type(data) == type("") or # must use a string 87 type(data) == type(u"")) # but can be a unicode string 88 self._data = data 89 self.alphabet = alphabet # Seq API requirement
90 91 # A data property is/was a Seq API requirement 92 # Note this is read only since the Seq object is meant to be imutable 93 @property
94 - def data(self) :
95 """Sequence as a string (OBSOLETE/DEPRECATED). 96 97 This is a read only property provided for backwards compatility with 98 older versions of Biopython (as is the tostring() method). We now 99 encourage you to use str(my_seq) instead of my_seq.data or the method 100 my_seq.tostring(). 101 102 In recent releases of Biopython it was possible to change a Seq object 103 by updating its data property, but this triggered a deprecation warning. 104 Now the data property is read only, since Seq objects are meant to be 105 immutable: 106 107 >>> from Bio.Seq import Seq 108 >>> from Bio.Alphabet import generic_dna 109 >>> my_seq = Seq("ACGT", generic_dna) 110 >>> str(my_seq) == my_seq.tostring() == my_seq.data == "ACGT" 111 True 112 >>> my_seq.data = "AAAA" 113 Traceback (most recent call last): 114 ... 115 AttributeError: can't set attribute 116 """ 117 return str(self)
118
119 - def __repr__(self):
120 """Returns a (truncated) representation of the sequence for debugging.""" 121 if len(self) > 60: 122 #Shows the last three letters as it is often useful to see if there 123 #is a stop codon at the end of a sequence. 124 #Note total length is 54+3+3=60 125 return "%s('%s...%s', %s)" % (self.__class__.__name__, 126 str(self)[:54], str(self)[-3:], 127 repr(self.alphabet)) 128 else: 129 return "%s(%s, %s)" % (self.__class__.__name__, 130 repr(self.data), 131 repr(self.alphabet))
132 - def __str__(self):
133 """Returns the full sequence as a python string, use str(my_seq). 134 135 Note that Biopython 1.44 and earlier would give a truncated 136 version of repr(my_seq) for str(my_seq). If you are writing code 137 which need to be backwards compatible with old Biopython, you 138 should continue to use my_seq.tostring() rather than str(my_seq). 139 """ 140 return self._data
141
142 - def __cmp__(self, other):
143 """Compare the sequence to another sequence or a string (README). 144 145 Historically comparing Seq objects has done Python object comparison. 146 After considerable discussion (keeping in mind constraints of the 147 Python language, hashes and dictionary support) a future release of 148 Biopython will change this to use simple string comparison. The plan is 149 that comparing incompatible alphabets (e.g. DNA to RNA) will trigger a 150 warning. 151 152 This version of Biopython still does Python object comparison, but with 153 a warning about this future change. During this transition period, 154 please just do explicit comparisons: 155 156 >>> seq1 = Seq("ACGT") 157 >>> seq2 = Seq("ACGT") 158 >>> id(seq1) == id(seq2) 159 False 160 >>> str(seq1) == str(seq2) 161 True 162 163 Note - This method indirectly supports ==, < , etc. 164 """ 165 if hasattr(other, "alphabet"): 166 #other should be a Seq or a MutableSeq 167 import warnings 168 warnings.warn("In future comparing Seq objects will use string " 169 "comparison (not object comparison). Incompatible " 170 "alphabets will trigger a warning (not an exception). " 171 "In the interim please use id(seq1)==id(seq2) or " 172 "str(seq1)==str(seq2) to make your code explicit " 173 "and to avoid this warning.", FutureWarning) 174 return cmp(id(self), id(other))
175
176 - def __len__(self):
177 """Returns the length of the sequence, use len(my_seq).""" 178 return len(self._data) # Seq API requirement
179
180 - def __getitem__(self, index) : # Seq API requirement
181 """Returns a subsequence of single letter, use my_seq[index].""" 182 #Note since Python 2.0, __getslice__ is deprecated 183 #and __getitem__ is used instead. 184 #See http://docs.python.org/ref/sequence-methods.html 185 if isinstance(index, int): 186 #Return a single letter as a string 187 return self._data[index] 188 else: 189 #Return the (sub)sequence as another Seq object 190 return Seq(self._data[index], self.alphabet)
191
192 - def __add__(self, other):
193 """Add another sequence or string to this sequence. 194 195 If adding a string to a Seq, the alphabet is preserved: 196 197 >>> from Bio.Seq import Seq 198 >>> from Bio.Alphabet import generic_protein 199 >>> Seq("MELKI", generic_protein) + "LV" 200 Seq('MELKILV', ProteinAlphabet()) 201 202 When adding two Seq (like) objects, the alphabets are important. 203 Consider this example: 204 205 >>> from Bio.Seq import Seq 206 >>> from Bio.Alphabet.IUPAC import unambiguous_dna, ambiguous_dna 207 >>> unamb_dna_seq = Seq("ACGT", unambiguous_dna) 208 >>> ambig_dna_seq = Seq("ACRGT", ambiguous_dna) 209 >>> unamb_dna_seq 210 Seq('ACGT', IUPACUnambiguousDNA()) 211 >>> ambig_dna_seq 212 Seq('ACRGT', IUPACAmbiguousDNA()) 213 214 If we add the ambiguous and unambiguous IUPAC DNA alphabets, we get 215 the more general ambiguous IUPAC DNA alphabet: 216 217 >>> unamb_dna_seq + ambig_dna_seq 218 Seq('ACGTACRGT', IUPACAmbiguousDNA()) 219 220 However, if the default generic alphabet is included, the result is 221 a generic alphabet: 222 223 >>> Seq("") + ambig_dna_seq 224 Seq('ACRGT', Alphabet()) 225 226 You can't add RNA and DNA sequences: 227 228 >>> from Bio.Alphabet import generic_dna, generic_rna 229 >>> Seq("ACGT", generic_dna) + Seq("ACGU", generic_rna) 230 Traceback (most recent call last): 231 ... 232 TypeError: Incompatable alphabets DNAAlphabet() and RNAAlphabet() 233 234 You can't add nucleotide and protein sequences: 235 236 >>> from Bio.Alphabet import generic_dna, generic_protein 237 >>> Seq("ACGT", generic_dna) + Seq("MELKI", generic_protein) 238 Traceback (most recent call last): 239 ... 240 TypeError: Incompatable alphabets DNAAlphabet() and ProteinAlphabet() 241 """ 242 if hasattr(other, "alphabet"): 243 #other should be a Seq or a MutableSeq 244 if not Alphabet._check_type_compatible([self.alphabet, 245 other.alphabet]): 246 raise TypeError("Incompatable alphabets %s and %s" \ 247 % (repr(self.alphabet), repr(other.alphabet))) 248 #They should be the same sequence type (or one of them is generic) 249 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 250 return self.__class__(str(self) + str(other), a) 251 elif isinstance(other, basestring): 252 #other is a plain string - use the current alphabet 253 return self.__class__(str(self) + other, self.alphabet) 254 elif isinstance(other, SeqRecord): 255 #Get the SeqRecord's __radd__ to handle this 256 return NotImplemented 257 else : 258 raise TypeError
259
260 - def __radd__(self, other):
261 """Adding a sequence on the left. 262 263 If adding a string to a Seq, the alphabet is preserved: 264 265 >>> from Bio.Seq import Seq 266 >>> from Bio.Alphabet import generic_protein 267 >>> "LV" + Seq("MELKI", generic_protein) 268 Seq('LVMELKI', ProteinAlphabet()) 269 270 Adding two Seq (like) objects is handled via the __add__ method. 271 """ 272 if hasattr(other, "alphabet"): 273 #other should be a Seq or a MutableSeq 274 if not Alphabet._check_type_compatible([self.alphabet, 275 other.alphabet]): 276 raise TypeError("Incompatable alphabets %s and %s" \ 277 % (repr(self.alphabet), repr(other.alphabet))) 278 #They should be the same sequence type (or one of them is generic) 279 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 280 return self.__class__(str(other) + str(self), a) 281 elif isinstance(other, basestring): 282 #other is a plain string - use the current alphabet 283 return self.__class__(other + str(self), self.alphabet) 284 else: 285 raise TypeError
286
287 - def tostring(self): # Seq API requirement
288 """Returns the full sequence as a python string (OBSOLETE). 289 290 Although not formally deprecated, you are now encouraged to use 291 str(my_seq) instead of my_seq.tostring().""" 292 return str(self) 293
294 - def tomutable(self): # Needed? Or use a function?
295 """Returns the full sequence as a MutableSeq object. 296 297 >>> from Bio.Seq import Seq 298 >>> from Bio.Alphabet import IUPAC 299 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAAL", 300 ... IUPAC.protein) 301 >>> my_seq 302 Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 303 >>> my_seq.tomutable() 304 MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 305 306 Note that the alphabet is preserved. 307 """ 308 return MutableSeq(str(self), self.alphabet) 309
310 - def _get_seq_str_and_check_alphabet(self, other_sequence):
311 """string/Seq/MutableSeq to string, checking alphabet (PRIVATE). 312 313 For a string argument, returns the string. 314 315 For a Seq or MutableSeq, it checks the alphabet is compatible 316 (raising an exception if it isn't), and then returns a string. 317 """ 318 try: 319 other_alpha = other_sequence.alphabet 320 except AttributeError: 321 #Assume other_sequence is a string 322 return other_sequence 323 324 #Other should be a Seq or a MutableSeq 325 if not Alphabet._check_type_compatible([self.alphabet, other_alpha]): 326 raise TypeError("Incompatable alphabets %s and %s" \ 327 % (repr(self.alphabet), repr(other_alpha))) 328 #Return as a string 329 return str(other_sequence)
330
331 - def count(self, sub, start=0, end=sys.maxint):
332 """Non-overlapping count method, like that of a python string. 333 334 This behaves like the python string method of the same name, 335 which does a non-overlapping count! 336 337 Returns an integer, the number of occurrences of substring 338 argument sub in the (sub)sequence given by [start:end]. 339 Optional arguments start and end are interpreted as in slice 340 notation. 341 342 Arguments: 343 - sub - a string or another Seq object to look for 344 - start - optional integer, slice start 345 - end - optional integer, slice end 346 347 e.g. 348 349 >>> from Bio.Seq import Seq 350 >>> my_seq = Seq("AAAATGA") 351 >>> print my_seq.count("A") 352 5 353 >>> print my_seq.count("ATG") 354 1 355 >>> print my_seq.count(Seq("AT")) 356 1 357 >>> print my_seq.count("AT", 2, -1) 358 1 359 360 HOWEVER, please note because python strings and Seq objects (and 361 MutableSeq objects) do a non-overlapping search, this may not give 362 the answer you expect: 363 364 >>> "AAAA".count("AA") 365 2 366 >>> print Seq("AAAA").count("AA") 367 2 368 369 A non-overlapping search would give the answer as three! 370 """ 371 #If it has one, check the alphabet: 372 sub_str = self._get_seq_str_and_check_alphabet(sub) 373 return str(self).count(sub_str, start, end)
374
375 - def __contains__(self, char):
376 """Implements the 'in' keyword, like a python string. 377 378 e.g. 379 380 >>> from Bio.Seq import Seq 381 >>> from Bio.Alphabet import generic_dna, generic_rna, generic_protein 382 >>> my_dna = Seq("ATATGAAATTTGAAAA", generic_dna) 383 >>> "AAA" in my_dna 384 True 385 >>> Seq("AAA") in my_dna 386 True 387 >>> Seq("AAA", generic_dna) in my_dna 388 True 389 390 Like other Seq methods, this will raise a type error if another Seq 391 (or Seq like) object with an incompatible alphabet is used: 392 393 >>> Seq("AAA", generic_rna) in my_dna 394 Traceback (most recent call last): 395 ... 396 TypeError: Incompatable alphabets DNAAlphabet() and RNAAlphabet() 397 >>> Seq("AAA", generic_protein) in my_dna 398 Traceback (most recent call last): 399 ... 400 TypeError: Incompatable alphabets DNAAlphabet() and ProteinAlphabet() 401 """ 402 #If it has one, check the alphabet: 403 sub_str = self._get_seq_str_and_check_alphabet(char) 404 return sub_str in str(self)
405
406 - def find(self, sub, start=0, end=sys.maxint):
407 """Find method, like that of a python string. 408 409 This behaves like the python string method of the same name. 410 411 Returns an integer, the index of the first occurrence of substring 412 argument sub in the (sub)sequence given by [start:end]. 413 414 Arguments: 415 - sub - a string or another Seq object to look for 416 - start - optional integer, slice start 417 - end - optional integer, slice end 418 419 Returns -1 if the subsequence is NOT found. 420 421 e.g. Locating the first typical start codon, AUG, in an RNA sequence: 422 423 >>> from Bio.Seq import Seq 424 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 425 >>> my_rna.find("AUG") 426 3 427 """ 428 #If it has one, check the alphabet: 429 sub_str = self._get_seq_str_and_check_alphabet(sub) 430 return str(self).find(sub_str, start, end)
431
432 - def rfind(self, sub, start=0, end=sys.maxint):
433 """Find from right method, like that of a python string. 434 435 This behaves like the python string method of the same name. 436 437 Returns an integer, the index of the last (right most) occurrence of 438 substring argument sub in the (sub)sequence given by [start:end]. 439 440 Arguments: 441 - sub - a string or another Seq object to look for 442 - start - optional integer, slice start 443 - end - optional integer, slice end 444 445 Returns -1 if the subsequence is NOT found. 446 447 e.g. Locating the last typical start codon, AUG, in an RNA sequence: 448 449 >>> from Bio.Seq import Seq 450 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 451 >>> my_rna.rfind("AUG") 452 15 453 """ 454 #If it has one, check the alphabet: 455 sub_str = self._get_seq_str_and_check_alphabet(sub) 456 return str(self).rfind(sub_str, start, end)
457
458 - def startswith(self, prefix, start=0, end=sys.maxint):
459 """Does the Seq start with the given prefix? Returns True/False. 460 461 This behaves like the python string method of the same name. 462 463 Return True if the sequence starts with the specified prefix 464 (a string or another Seq object), False otherwise. 465 With optional start, test sequence beginning at that position. 466 With optional end, stop comparing sequence at that position. 467 prefix can also be a tuple of strings to try. e.g. 468 469 >>> from Bio.Seq import Seq 470 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 471 >>> my_rna.startswith("GUC") 472 True 473 >>> my_rna.startswith("AUG") 474 False 475 >>> my_rna.startswith("AUG", 3) 476 True 477 >>> my_rna.startswith(("UCC","UCA","UCG"),1) 478 True 479 """ 480 #If it has one, check the alphabet: 481 if isinstance(prefix, tuple): 482 #TODO - Once we drop support for Python 2.4, instead of this 483 #loop offload to the string method (requires Python 2.5+). 484 #Check all the alphabets first... 485 prefix_strings = [self._get_seq_str_and_check_alphabet(p) \ 486 for p in prefix] 487 for prefix_str in prefix_strings: 488 if str(self).startswith(prefix_str, start, end): 489 return True 490 return False 491 else: 492 prefix_str = self._get_seq_str_and_check_alphabet(prefix) 493 return str(self).startswith(prefix_str, start, end)
494
495 - def endswith(self, suffix, start=0, end=sys.maxint):
496 """Does the Seq end with the given suffix? Returns True/False. 497 498 This behaves like the python string method of the same name. 499 500 Return True if the sequence ends with the specified suffix 501 (a string or another Seq object), False otherwise. 502 With optional start, test sequence beginning at that position. 503 With optional end, stop comparing sequence at that position. 504 suffix can also be a tuple of strings to try. e.g. 505 506 >>> from Bio.Seq import Seq 507 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 508 >>> my_rna.endswith("UUG") 509 True 510 >>> my_rna.endswith("AUG") 511 False 512 >>> my_rna.endswith("AUG", 0, 18) 513 True 514 >>> my_rna.endswith(("UCC","UCA","UUG")) 515 True 516 """ 517 #If it has one, check the alphabet: 518 if isinstance(suffix, tuple): 519 #TODO - Once we drop support for Python 2.4, instead of this 520 #loop offload to the string method (requires Python 2.5+). 521 #Check all the alphabets first... 522 suffix_strings = [self._get_seq_str_and_check_alphabet(p) \ 523 for p in suffix] 524 for suffix_str in suffix_strings: 525 if str(self).endswith(suffix_str, start, end): 526 return True 527 return False 528 else: 529 suffix_str = self._get_seq_str_and_check_alphabet(suffix) 530 return str(self).endswith(suffix_str, start, end)
531 532
533 - def split(self, sep=None, maxsplit=-1):
534 """Split method, like that of a python string. 535 536 This behaves like the python string method of the same name. 537 538 Return a list of the 'words' in the string (as Seq objects), 539 using sep as the delimiter string. If maxsplit is given, at 540 most maxsplit splits are done. If maxsplit is ommited, all 541 splits are made. 542 543 Following the python string method, sep will by default be any 544 white space (tabs, spaces, newlines) but this is unlikely to 545 apply to biological sequences. 546 547 e.g. 548 549 >>> from Bio.Seq import Seq 550 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 551 >>> my_aa = my_rna.translate() 552 >>> my_aa 553 Seq('VMAIVMGR*KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*')) 554 >>> my_aa.split("*") 555 [Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*'))] 556 >>> my_aa.split("*",1) 557 [Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*'))] 558 559 See also the rsplit method: 560 561 >>> my_aa.rsplit("*",1) 562 [Seq('VMAIVMGR*KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*'))] 563 """ 564 #If it has one, check the alphabet: 565 sep_str = self._get_seq_str_and_check_alphabet(sep) 566 #TODO - If the sep is the defined stop symbol, or gap char, 567 #should we adjust the alphabet? 568 return [Seq(part, self.alphabet) \ 569 for part in str(self).split(sep_str, maxsplit)]
570
571 - def rsplit(self, sep=None, maxsplit=-1):
572 """Right split method, like that of a python string. 573 574 This behaves like the python string method of the same name. 575 576 Return a list of the 'words' in the string (as Seq objects), 577 using sep as the delimiter string. If maxsplit is given, at 578 most maxsplit splits are done COUNTING FROM THE RIGHT. 579 If maxsplit is ommited, all splits are made. 580 581 Following the python string method, sep will by default be any 582 white space (tabs, spaces, newlines) but this is unlikely to 583 apply to biological sequences. 584 585 e.g. print my_seq.rsplit("*",1) 586 587 See also the split method. 588 """ 589 #If it has one, check the alphabet: 590 sep_str = self._get_seq_str_and_check_alphabet(sep) 591 return [Seq(part, self.alphabet) \ 592 for part in str(self).rsplit(sep_str, maxsplit)]
593
594 - def strip(self, chars=None):
595 """Returns a new Seq object with leading and trailing ends stripped. 596 597 This behaves like the python string method of the same name. 598 599 Optional argument chars defines which characters to remove. If 600 ommitted or None (default) then as for the python string method, 601 this defaults to removing any white space. 602 603 e.g. print my_seq.strip("-") 604 605 See also the lstrip and rstrip methods. 606 """ 607 #If it has one, check the alphabet: 608 strip_str = self._get_seq_str_and_check_alphabet(chars) 609 return Seq(str(self).strip(strip_str), self.alphabet)
610
611 - def lstrip(self, chars=None):
612 """Returns a new Seq object with leading (left) end stripped. 613 614 This behaves like the python string method of the same name. 615 616 Optional argument chars defines which characters to remove. If 617 ommitted or None (default) then as for the python string method, 618 this defaults to removing any white space. 619 620 e.g. print my_seq.lstrip("-") 621 622 See also the strip and rstrip methods. 623 """ 624 #If it has one, check the alphabet: 625 strip_str = self._get_seq_str_and_check_alphabet(chars) 626 return Seq(str(self).lstrip(strip_str), self.alphabet)
627
628 - def rstrip(self, chars=None):
629 """Returns a new Seq object with trailing (right) end stripped. 630 631 This behaves like the python string method of the same name. 632 633 Optional argument chars defines which characters to remove. If 634 ommitted or None (default) then as for the python string method, 635 this defaults to removing any white space. 636 637 e.g. Removing a nucleotide sequence's polyadenylation (poly-A tail): 638 639 >>> from Bio.Alphabet import IUPAC 640 >>> from Bio.Seq import Seq 641 >>> my_seq = Seq("CGGTACGCTTATGTCACGTAGAAAAAA", IUPAC.unambiguous_dna) 642 >>> my_seq 643 Seq('CGGTACGCTTATGTCACGTAGAAAAAA', IUPACUnambiguousDNA()) 644 >>> my_seq.rstrip("A") 645 Seq('CGGTACGCTTATGTCACGTAG', IUPACUnambiguousDNA()) 646 647 See also the strip and lstrip methods. 648 """ 649 #If it has one, check the alphabet: 650 strip_str = self._get_seq_str_and_check_alphabet(chars) 651 return Seq(str(self).rstrip(strip_str), self.alphabet)
652
653 - def upper(self):
654 """Returns an upper case copy of the sequence. 655 656 >>> from Bio.Alphabet import HasStopCodon, generic_protein 657 >>> from Bio.Seq import Seq 658 >>> my_seq = Seq("VHLTPeeK*", HasStopCodon(generic_protein)) 659 >>> my_seq 660 Seq('VHLTPeeK*', HasStopCodon(ProteinAlphabet(), '*')) 661 >>> my_seq.lower() 662 Seq('vhltpeek*', HasStopCodon(ProteinAlphabet(), '*')) 663 >>> my_seq.upper() 664 Seq('VHLTPEEK*', HasStopCodon(ProteinAlphabet(), '*')) 665 666 This will adjust the alphabet if required. See also the lower method. 667 """ 668 return Seq(str(self).upper(), self.alphabet._upper())
669
670 - def lower(self):
671 """Returns a lower case copy of the sequence. 672 673 This will adjust the alphabet if required. Note that the IUPAC alphabets 674 are upper case only, and thus a generic alphabet must be substituted. 675 676 >>> from Bio.Alphabet import Gapped, generic_dna 677 >>> from Bio.Alphabet import IUPAC 678 >>> from Bio.Seq import Seq 679 >>> my_seq = Seq("CGGTACGCTTATGTCACGTAG*AAAAAA", Gapped(IUPAC.unambiguous_dna, "*")) 680 >>> my_seq 681 Seq('CGGTACGCTTATGTCACGTAG*AAAAAA', Gapped(IUPACUnambiguousDNA(), '*')) 682 >>> my_seq.lower() 683 Seq('cggtacgcttatgtcacgtag*aaaaaa', Gapped(DNAAlphabet(), '*')) 684 685 See also the upper method. 686 """ 687 return Seq(str(self).lower(), self.alphabet._lower())
688
689 - def complement(self):
690 """Returns the complement sequence. New Seq object. 691 692 >>> from Bio.Seq import Seq 693 >>> from Bio.Alphabet import IUPAC 694 >>> my_dna = Seq("CCCCCGATAG", IUPAC.unambiguous_dna) 695 >>> my_dna 696 Seq('CCCCCGATAG', IUPACUnambiguousDNA()) 697 >>> my_dna.complement() 698 Seq('GGGGGCTATC', IUPACUnambiguousDNA()) 699 700 You can of course used mixed case sequences, 701 702 >>> from Bio.Seq import Seq 703 >>> from Bio.Alphabet import generic_dna 704 >>> my_dna = Seq("CCCCCgatA-GD", generic_dna) 705 >>> my_dna 706 Seq('CCCCCgatA-GD', DNAAlphabet()) 707 >>> my_dna.complement() 708 Seq('GGGGGctaT-CH', DNAAlphabet()) 709 710 Note in the above example, ambiguous character D denotes 711 G, A or T so its complement is H (for C, T or A). 712 713 Trying to complement a protein sequence raises an exception. 714 715 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 716 >>> my_protein.complement() 717 Traceback (most recent call last): 718 ... 719 ValueError: Proteins do not have complements! 720 """ 721 base = Alphabet._get_base_alphabet(self.alphabet) 722 if isinstance(base, Alphabet.ProteinAlphabet): 723 raise ValueError("Proteins do not have complements!") 724 if isinstance(base, Alphabet.DNAAlphabet): 725 ttable = _dna_complement_table 726 elif isinstance(base, Alphabet.RNAAlphabet): 727 ttable = _rna_complement_table 728 elif ('U' in self._data or 'u' in self._data) \ 729 and ('T' in self._data or 't' in self._data): 730 #TODO - Handle this cleanly? 731 raise ValueError("Mixed RNA/DNA found") 732 elif 'U' in self._data or 'u' in self._data: 733 ttable = _rna_complement_table 734 else: 735 ttable = _dna_complement_table 736 #Much faster on really long sequences than the previous loop based one. 737 #thx to Michael Palmer, University of Waterloo 738 return Seq(str(self).translate(ttable), self.alphabet)
739
740 - def reverse_complement(self):
741 """Returns the reverse complement sequence. New Seq object. 742 743 >>> from Bio.Seq import Seq 744 >>> from Bio.Alphabet import IUPAC 745 >>> my_dna = Seq("CCCCCGATAGNR", IUPAC.ambiguous_dna) 746 >>> my_dna 747 Seq('CCCCCGATAGNR', IUPACAmbiguousDNA()) 748 >>> my_dna.reverse_complement() 749 Seq('YNCTATCGGGGG', IUPACAmbiguousDNA()) 750 751 Note in the above example, since R = G or A, its complement 752 is Y (which denotes C or T). 753 754 You can of course used mixed case sequences, 755 756 >>> from Bio.Seq import Seq 757 >>> from Bio.Alphabet import generic_dna 758 >>> my_dna = Seq("CCCCCgatA-G", generic_dna) 759 >>> my_dna 760 Seq('CCCCCgatA-G', DNAAlphabet()) 761 >>> my_dna.reverse_complement() 762 Seq('C-TatcGGGGG', DNAAlphabet()) 763 764 Trying to complement a protein sequence raises an exception: 765 766 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 767 >>> my_protein.reverse_complement() 768 Traceback (most recent call last): 769 ... 770 ValueError: Proteins do not have complements! 771 """ 772 #Use -1 stride/step to reverse the complement 773 return self.complement()[::-1]
774
775 - def transcribe(self):
776 """Returns the RNA sequence from a DNA sequence. New Seq object. 777 778 >>> from Bio.Seq import Seq 779 >>> from Bio.Alphabet import IUPAC 780 >>> coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG", 781 ... IUPAC.unambiguous_dna) 782 >>> coding_dna 783 Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA()) 784 >>> coding_dna.transcribe() 785 Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA()) 786 787 Trying to transcribe a protein or RNA sequence raises an exception: 788 789 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 790 >>> my_protein.transcribe() 791 Traceback (most recent call last): 792 ... 793 ValueError: Proteins cannot be transcribed! 794 """ 795 base = Alphabet._get_base_alphabet(self.alphabet) 796 if isinstance(base, Alphabet.ProteinAlphabet): 797 raise ValueError("Proteins cannot be transcribed!") 798 if isinstance(base, Alphabet.RNAAlphabet): 799 raise ValueError("RNA cannot be transcribed!") 800 801 if self.alphabet==IUPAC.unambiguous_dna: 802 alphabet = IUPAC.unambiguous_rna 803 elif self.alphabet==IUPAC.ambiguous_dna: 804 alphabet = IUPAC.ambiguous_rna 805 else: 806 alphabet = Alphabet.generic_rna 807 return Seq(str(self).replace('T','U').replace('t','u'), alphabet)
808
809 - def back_transcribe(self):
810 """Returns the DNA sequence from an RNA sequence. New Seq object. 811 812 >>> from Bio.Seq import Seq 813 >>> from Bio.Alphabet import IUPAC 814 >>> messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG", 815 ... IUPAC.unambiguous_rna) 816 >>> messenger_rna 817 Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA()) 818 >>> messenger_rna.back_transcribe() 819 Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA()) 820 821 Trying to back-transcribe a protein or DNA sequence raises an 822 exception: 823 824 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 825 >>> my_protein.back_transcribe() 826 Traceback (most recent call last): 827 ... 828 ValueError: Proteins cannot be back transcribed! 829 """ 830 base = Alphabet._get_base_alphabet(self.alphabet) 831 if isinstance(base, Alphabet.ProteinAlphabet): 832 raise ValueError("Proteins cannot be back transcribed!") 833 if isinstance(base, Alphabet.DNAAlphabet): 834 raise ValueError("DNA cannot be back transcribed!") 835 836 if self.alphabet==IUPAC.unambiguous_rna: 837 alphabet = IUPAC.unambiguous_dna 838 elif self.alphabet==IUPAC.ambiguous_rna: 839 alphabet = IUPAC.ambiguous_dna 840 else: 841 alphabet = Alphabet.generic_dna 842 return Seq(str(self).replace("U", "T").replace("u", "t"), alphabet)
843
844 - def translate(self, table="Standard", stop_symbol="*", to_stop=False, 845 cds=False):
846 """Turns a nucleotide sequence into a protein sequence. New Seq object. 847 848 This method will translate DNA or RNA sequences, and those with a 849 nucleotide or generic alphabet. Trying to translate a protein 850 sequence raises an exception. 851 852 Arguments: 853 - table - Which codon table to use? This can be either a name 854 (string) or an NCBI identifier (integer). This defaults 855 to the "Standard" table. 856 - stop_symbol - Single character string, what to use for terminators. 857 This defaults to the asterisk, "*". 858 - to_stop - Boolean, defaults to False meaning do a full translation 859 continuing on past any stop codons (translated as the 860 specified stop_symbol). If True, translation is 861 terminated at the first in frame stop codon (and the 862 stop_symbol is not appended to the returned protein 863 sequence). 864 - cds - Boolean, indicates this is a complete CDS. If True, 865 this checks the sequence starts with a valid alternative start 866 codon (which will be translated as methionine, M), that the 867 sequence length is a multiple of three, and that there is a 868 single in frame stop codon at the end (this will be excluded 869 from the protein sequence, regardless of the to_stop option). 870 If these tests fail, an exception is raised. 871 872 e.g. Using the standard table: 873 874 >>> coding_dna = Seq("GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG") 875 >>> coding_dna.translate() 876 Seq('VAIVMGR*KGAR*', HasStopCodon(ExtendedIUPACProtein(), '*')) 877 >>> coding_dna.translate(stop_symbol="@") 878 Seq('VAIVMGR@KGAR@', HasStopCodon(ExtendedIUPACProtein(), '@')) 879 >>> coding_dna.translate(to_stop=True) 880 Seq('VAIVMGR', ExtendedIUPACProtein()) 881 882 Now using NCBI table 2, where TGA is not a stop codon: 883 884 >>> coding_dna.translate(table=2) 885 Seq('VAIVMGRWKGAR*', HasStopCodon(ExtendedIUPACProtein(), '*')) 886 >>> coding_dna.translate(table=2, to_stop=True) 887 Seq('VAIVMGRWKGAR', ExtendedIUPACProtein()) 888 889 In fact, GTG is an alternative start codon under NCBI table 2, meaning 890 this sequence could be a complete CDS: 891 892 >>> coding_dna.translate(table=2, cds=True) 893 Seq('MAIVMGRWKGAR', ExtendedIUPACProtein()) 894 895 It isn't a valid CDS under NCBI table 1, due to both the start codon and 896 also the in frame stop codons: 897 898 >>> coding_dna.translate(table=1, cds=True) 899 Traceback (most recent call last): 900 ... 901 TranslationError: First codon 'GTG' is not a start codon 902 903 If the sequence has no in-frame stop codon, then the to_stop argument 904 has no effect: 905 906 >>> coding_dna2 = Seq("TTGGCCATTGTAATGGGCCGC") 907 >>> coding_dna2.translate() 908 Seq('LAIVMGR', ExtendedIUPACProtein()) 909 >>> coding_dna2.translate(to_stop=True) 910 Seq('LAIVMGR', ExtendedIUPACProtein()) 911 912 NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid 913 or a stop codon. These are translated as "X". Any invalid codon 914 (e.g. "TA?" or "T-A") will throw a TranslationError. 915 916 NOTE - Does NOT support gapped sequences. 917 918 NOTE - This does NOT behave like the python string's translate 919 method. For that use str(my_seq).translate(...) instead. 920 """ 921 try: 922 table_id = int(table) 923 except ValueError: 924 table_id = None 925 if isinstance(table, str) and len(table)==256: 926 raise ValueError("The Seq object translate method DOES NOT take " \ 927 + "a 256 character string mapping table like " \ 928 + "the python string object's translate method. " \ 929 + "Use str(my_seq).translate(...) instead.") 930 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 931 Alphabet.ProteinAlphabet): 932 raise ValueError("Proteins cannot be translated!") 933 if self.alphabet==IUPAC.unambiguous_dna: 934 #Will use standard IUPAC protein alphabet, no need for X 935 if table_id is None: 936 codon_table = CodonTable.unambiguous_dna_by_name[table] 937 else: 938 codon_table = CodonTable.unambiguous_dna_by_id[table_id] 939 elif self.alphabet==IUPAC.unambiguous_rna: 940 #Will use standard IUPAC protein alphabet, no need for X 941 if table_id is None: 942 codon_table = CodonTable.unambiguous_rna_by_name[table] 943 else: 944 codon_table = CodonTable.unambiguous_rna_by_id[table_id] 945 else: 946 #This will use the extend IUPAC protein alphabet with X etc. 947 #The same table can be used for RNA or DNA (we use this for 948 #translating strings). 949 if table_id is None: 950 codon_table = CodonTable.ambiguous_generic_by_name[table] 951 else: 952 codon_table = CodonTable.ambiguous_generic_by_id[table_id] 953 protein = _translate_str(str(self), codon_table, \ 954 stop_symbol, to_stop, cds) 955 if stop_symbol in protein: 956 alphabet = Alphabet.HasStopCodon(codon_table.protein_alphabet, 957 stop_symbol = stop_symbol) 958 else: 959 alphabet = codon_table.protein_alphabet 960 return Seq(protein, alphabet)
961
962 - def ungap(self, gap=None):
963 """Return a copy of the sequence without the gap character(s). 964 965 The gap character can be specified in two ways - either as an explicit 966 argument, or via the sequence's alphabet. For example: 967 968 >>> from Bio.Seq import Seq 969 >>> from Bio.Alphabet import generic_dna 970 >>> my_dna = Seq("-ATA--TGAAAT-TTGAAAA", generic_dna) 971 >>> my_dna 972 Seq('-ATA--TGAAAT-TTGAAAA', DNAAlphabet()) 973 >>> my_dna.ungap("-") 974 Seq('ATATGAAATTTGAAAA', DNAAlphabet()) 975 976 If the gap character is not given as an argument, it will be taken from 977 the sequence's alphabet (if defined). Notice that the returned sequence's 978 alphabet is adjusted since it no longer requires a gapped alphabet: 979 980 >>> from Bio.Seq import Seq 981 >>> from Bio.Alphabet import IUPAC, Gapped, HasStopCodon 982 >>> my_pro = Seq("MVVLE=AD*", HasStopCodon(Gapped(IUPAC.protein, "="))) 983 >>> my_pro 984 Seq('MVVLE=AD*', HasStopCodon(Gapped(IUPACProtein(), '='), '*')) 985 >>> my_pro.ungap() 986 Seq('MVVLEAD*', HasStopCodon(IUPACProtein(), '*')) 987 988 Or, with a simpler gapped DNA example: 989 990 >>> from Bio.Seq import Seq 991 >>> from Bio.Alphabet import IUPAC, Gapped 992 >>> my_seq = Seq("CGGGTAG=AAAAAA", Gapped(IUPAC.unambiguous_dna, "=")) 993 >>> my_seq 994 Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '=')) 995 >>> my_seq.ungap() 996 Seq('CGGGTAGAAAAAA', IUPACUnambiguousDNA()) 997 998 As long as it is consistent with the alphabet, although it is redundant, 999 you can still supply the gap character as an argument to this method: 1000 1001 >>> my_seq 1002 Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '=')) 1003 >>> my_seq.ungap("=") 1004 Seq('CGGGTAGAAAAAA', IUPACUnambiguousDNA()) 1005 1006 However, if the gap character given as the argument disagrees with that 1007 declared in the alphabet, an exception is raised: 1008 1009 >>> my_seq 1010 Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '=')) 1011 >>> my_seq.ungap("-") 1012 Traceback (most recent call last): 1013 ... 1014 ValueError: Gap '-' does not match '=' from alphabet 1015 1016 Finally, if a gap character is not supplied, and the alphabet does not 1017 define one, an exception is raised: 1018 1019 >>> from Bio.Seq import Seq 1020 >>> from Bio.Alphabet import generic_dna 1021 >>> my_dna = Seq("ATA--TGAAAT-TTGAAAA", generic_dna) 1022 >>> my_dna 1023 Seq('ATA--TGAAAT-TTGAAAA', DNAAlphabet()) 1024 >>> my_dna.ungap() 1025 Traceback (most recent call last): 1026 ... 1027 ValueError: Gap character not given and not defined in alphabet 1028 1029 """ 1030 if hasattr(self.alphabet, "gap_char"): 1031 if not gap: 1032 gap = self.alphabet.gap_char 1033 elif gap != self.alphabet.gap_char: 1034 raise ValueError("Gap %s does not match %s from alphabet" \ 1035 % (repr(gap), repr(self.alphabet.gap_char))) 1036 alpha = Alphabet._ungap(self.alphabet) 1037 elif not gap: 1038 raise ValueError("Gap character not given and not defined in alphabet") 1039 else: 1040 alpha = self.alphabet #modify! 1041 if len(gap)!=1 or not isinstance(gap, str): 1042 raise ValueError("Unexpected gap character, %s" % repr(gap)) 1043 return Seq(str(self).replace(gap, ""), alpha)
1044
1045 -class UnknownSeq(Seq):
1046 """A read-only sequence object of known length but unknown contents. 1047 1048 If you have an unknown sequence, you can represent this with a normal 1049 Seq object, for example: 1050 1051 >>> my_seq = Seq("N"*5) 1052 >>> my_seq 1053 Seq('NNNNN', Alphabet()) 1054 >>> len(my_seq) 1055 5 1056 >>> print my_seq 1057 NNNNN 1058 1059 However, this is rather wasteful of memory (especially for large 1060 sequences), which is where this class is most usefull: 1061 1062 >>> unk_five = UnknownSeq(5) 1063 >>> unk_five 1064 UnknownSeq(5, alphabet = Alphabet(), character = '?') 1065 >>> len(unk_five) 1066 5 1067 >>> print(unk_five) 1068 ????? 1069 1070 You can add unknown sequence together, provided their alphabets and 1071 characters are compatible, and get another memory saving UnknownSeq: 1072 1073 >>> unk_four = UnknownSeq(4) 1074 >>> unk_four 1075 UnknownSeq(4, alphabet = Alphabet(), character = '?') 1076 >>> unk_four + unk_five 1077 UnknownSeq(9, alphabet = Alphabet(), character = '?') 1078 1079 If the alphabet or characters don't match up, the addition gives an 1080 ordinary Seq object: 1081 1082 >>> unk_nnnn = UnknownSeq(4, character = "N") 1083 >>> unk_nnnn 1084 UnknownSeq(4, alphabet = Alphabet(), character = 'N') 1085 >>> unk_nnnn + unk_four 1086 Seq('NNNN????', Alphabet()) 1087 1088 Combining with a real Seq gives a new Seq object: 1089 1090 >>> known_seq = Seq("ACGT") 1091 >>> unk_four + known_seq 1092 Seq('????ACGT', Alphabet()) 1093 >>> known_seq + unk_four 1094 Seq('ACGT????', Alphabet()) 1095 """
1096 - def __init__(self, length, alphabet = Alphabet.generic_alphabet, character = None):
1097 """Create a new UnknownSeq object. 1098 1099 If character is ommited, it is determed from the alphabet, "N" for 1100 nucleotides, "X" for proteins, and "?" otherwise. 1101 """ 1102 self._length = int(length) 1103 if self._length < 0: 1104 #TODO - Block zero length UnknownSeq? You can just use a Seq! 1105 raise ValueError("Length must not be negative.") 1106 self.alphabet = alphabet 1107 if character: 1108 if len(character) != 1: 1109 raise ValueError("character argument should be a single letter string.") 1110 self._character = character 1111 else: 1112 base = Alphabet._get_base_alphabet(alphabet) 1113 #TODO? Check the case of the letters in the alphabet? 1114 #We may have to use "n" instead of "N" etc. 1115 if isinstance(base, Alphabet.NucleotideAlphabet): 1116 self._character = "N" 1117 elif isinstance(base, Alphabet.ProteinAlphabet): 1118 self._character = "X" 1119 else: 1120 self._character = "?"
1121
1122 - def __len__(self):
1123 """Returns the stated length of the unknown sequence.""" 1124 return self._length
1125
1126 - def __str__(self):
1127 """Returns the unknown sequence as full string of the given length.""" 1128 return self._character * self._length
1129
1130 - def __repr__(self):
1131 return "UnknownSeq(%i, alphabet = %s, character = %s)" \ 1132 % (self._length, repr(self.alphabet), repr(self._character))
1133
1134 - def __add__(self, other):
1135 """Add another sequence or string to this sequence. 1136 1137 Adding two UnknownSeq objects returns another UnknownSeq object 1138 provided the character is the same and the alphabets are compatible. 1139 1140 >>> from Bio.Seq import UnknownSeq 1141 >>> from Bio.Alphabet import generic_protein 1142 >>> UnknownSeq(10, generic_protein) + UnknownSeq(5, generic_protein) 1143 UnknownSeq(15, alphabet = ProteinAlphabet(), character = 'X') 1144 1145 If the characters differ, an UnknownSeq object cannot be used, so a 1146 Seq object is returned: 1147 1148 >>> from Bio.Seq import UnknownSeq 1149 >>> from Bio.Alphabet import generic_protein 1150 >>> UnknownSeq(10, generic_protein) + UnknownSeq(5, generic_protein, 1151 ... character="x") 1152 Seq('XXXXXXXXXXxxxxx', ProteinAlphabet()) 1153 1154 If adding a string to an UnknownSeq, a new Seq is returned with the 1155 same alphabet: 1156 1157 >>> from Bio.Seq import UnknownSeq 1158 >>> from Bio.Alphabet import generic_protein 1159 >>> UnknownSeq(5, generic_protein) + "LV" 1160 Seq('XXXXXLV', ProteinAlphabet()) 1161 """ 1162 if isinstance(other, UnknownSeq) \ 1163 and other._character == self._character: 1164 #TODO - Check the alphabets match 1165 return UnknownSeq(len(self)+len(other), 1166 self.alphabet, self._character) 1167 #Offload to the base class... 1168 return Seq(str(self), self.alphabet) + other
1169
1170 - def __radd__(self, other):
1171 #If other is an UnknownSeq, then __add__ would be called. 1172 #Offload to the base class... 1173 return other + Seq(str(self), self.alphabet)
1174
1175 - def __getitem__(self, index):
1176 if isinstance(index, int): 1177 #TODO - Check the bounds without wasting memory 1178 return str(self)[index] 1179 else: 1180 #TODO - Work out the length without wasting memory 1181 return UnknownSeq(len(("#"*self._length)[index]), 1182 self.alphabet, self._character)
1183
1184 - def count(self, sub, start=0, end=sys.maxint):
1185 """Non-overlapping count method, like that of a python string. 1186 1187 This behaves like the python string (and Seq object) method of the 1188 same name, which does a non-overlapping count! 1189 1190 Returns an integer, the number of occurrences of substring 1191 argument sub in the (sub)sequence given by [start:end]. 1192 Optional arguments start and end are interpreted as in slice 1193 notation. 1194 1195 Arguments: 1196 - sub - a string or another Seq object to look for 1197 - start - optional integer, slice start 1198 - end - optional integer, slice end 1199 1200 >>> "NNNN".count("N") 1201 4 1202 >>> Seq("NNNN").count("N") 1203 4 1204 >>> UnknownSeq(4, character="N").count("N") 1205 4 1206 >>> UnknownSeq(4, character="N").count("A") 1207 0 1208 >>> UnknownSeq(4, character="N").count("AA") 1209 0 1210 1211 HOWEVER, please note because that python strings and Seq objects (and 1212 MutableSeq objects) do a non-overlapping search, this may not give 1213 the answer you expect: 1214 1215 >>> UnknownSeq(4, character="N").count("NN") 1216 2 1217 >>> UnknownSeq(4, character="N").count("NNN") 1218 1 1219 """ 1220 sub_str = self._get_seq_str_and_check_alphabet(sub) 1221 if len(sub_str) == 1: 1222 if str(sub_str) == self._character: 1223 if start==0 and end >= self._length: 1224 return self._length 1225 else: 1226 #This could be done more cleverly... 1227 return str(self).count(sub_str, start, end) 1228 else: 1229 return 0 1230 else: 1231 if set(sub_str) == set(self._character): 1232 if start==0 and end >= self._length: 1233 return self._length // len(sub_str) 1234 else: 1235 #This could be done more cleverly... 1236 return str(self).count(sub_str, start, end) 1237 else: 1238 return 0
1239
1240 - def complement(self):
1241 """The complement of an unknown nucleotide equals itself. 1242 1243 >>> my_nuc = UnknownSeq(8) 1244 >>> my_nuc 1245 UnknownSeq(8, alphabet = Alphabet(), character = '?') 1246 >>> print my_nuc 1247 ???????? 1248 >>> my_nuc.complement() 1249 UnknownSeq(8, alphabet = Alphabet(), character = '?') 1250 >>> print my_nuc.complement() 1251 ???????? 1252 """ 1253 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1254 Alphabet.ProteinAlphabet): 1255 raise ValueError("Proteins do not have complements!") 1256 return self
1257
1258 - def reverse_complement(self):
1259 """The reverse complement of an unknown nucleotide equals itself. 1260 1261 >>> my_nuc = UnknownSeq(10) 1262 >>> my_nuc 1263 UnknownSeq(10, alphabet = Alphabet(), character = '?') 1264 >>> print my_nuc 1265 ?????????? 1266 >>> my_nuc.reverse_complement() 1267 UnknownSeq(10, alphabet = Alphabet(), character = '?') 1268 >>> print my_nuc.reverse_complement() 1269 ?????????? 1270 """ 1271 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1272 Alphabet.ProteinAlphabet): 1273 raise ValueError("Proteins do not have complements!") 1274 return self
1275
1276 - def transcribe(self):
1277 """Returns unknown RNA sequence from an unknown DNA sequence. 1278 1279 >>> my_dna = UnknownSeq(10, character="N") 1280 >>> my_dna 1281 UnknownSeq(10, alphabet = Alphabet(), character = 'N') 1282 >>> print my_dna 1283 NNNNNNNNNN 1284 >>> my_rna = my_dna.transcribe() 1285 >>> my_rna 1286 UnknownSeq(10, alphabet = RNAAlphabet(), character = 'N') 1287 >>> print my_rna 1288 NNNNNNNNNN 1289 """ 1290 #Offload the alphabet stuff 1291 s = Seq(self._character, self.alphabet).transcribe() 1292 return UnknownSeq(self._length, s.alphabet, self._character)
1293
1294 - def back_transcribe(self):
1295 """Returns unknown DNA sequence from an unknown RNA sequence. 1296 1297 >>> my_rna = UnknownSeq(20, character="N") 1298 >>> my_rna 1299 UnknownSeq(20, alphabet = Alphabet(), character = 'N') 1300 >>> print my_rna 1301 NNNNNNNNNNNNNNNNNNNN 1302 >>> my_dna = my_rna.back_transcribe() 1303 >>> my_dna 1304 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N') 1305 >>> print my_dna 1306 NNNNNNNNNNNNNNNNNNNN 1307 """ 1308 #Offload the alphabet stuff 1309 s = Seq(self._character, self.alphabet).back_transcribe() 1310 return UnknownSeq(self._length, s.alphabet, self._character)
1311
1312 - def upper(self):
1313 """Returns an upper case copy of the sequence. 1314 1315 >>> from Bio.Alphabet import generic_dna 1316 >>> from Bio.Seq import UnknownSeq 1317 >>> my_seq = UnknownSeq(20, generic_dna, character="n") 1318 >>> my_seq 1319 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'n') 1320 >>> print my_seq 1321 nnnnnnnnnnnnnnnnnnnn 1322 >>> my_seq.upper() 1323 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N') 1324 >>> print my_seq.upper() 1325 NNNNNNNNNNNNNNNNNNNN 1326 1327 This will adjust the alphabet if required. See also the lower method. 1328 """ 1329 return UnknownSeq(self._length, self.alphabet._upper(), self._character.upper())
1330
1331 - def lower(self):
1332 """Returns a lower case copy of the sequence. 1333 1334 This will adjust the alphabet if required: 1335 1336 >>> from Bio.Alphabet import IUPAC 1337 >>> from Bio.Seq import UnknownSeq 1338 >>> my_seq = UnknownSeq(20, IUPAC.extended_protein) 1339 >>> my_seq 1340 UnknownSeq(20, alphabet = ExtendedIUPACProtein(), character = 'X') 1341 >>> print my_seq 1342 XXXXXXXXXXXXXXXXXXXX 1343 >>> my_seq.lower() 1344 UnknownSeq(20, alphabet = ProteinAlphabet(), character = 'x') 1345 >>> print my_seq.lower() 1346 xxxxxxxxxxxxxxxxxxxx 1347 1348 See also the upper method. 1349 """ 1350 return UnknownSeq(self._length, self.alphabet._lower(), self._character.lower())
1351
1352 - def translate(self, **kwargs):
1353 """Translate an unknown nucleotide sequence into an unknown protein. 1354 1355 e.g. 1356 1357 >>> my_seq = UnknownSeq(11, character="N") 1358 >>> print my_seq 1359 NNNNNNNNNNN 1360 >>> my_protein = my_seq.translate() 1361 >>> my_protein 1362 UnknownSeq(3, alphabet = ProteinAlphabet(), character = 'X') 1363 >>> print my_protein 1364 XXX 1365 1366 In comparison, using a normal Seq object: 1367 1368 >>> my_seq = Seq("NNNNNNNNNNN") 1369 >>> print my_seq 1370 NNNNNNNNNNN 1371 >>> my_protein = my_seq.translate() 1372 >>> my_protein 1373 Seq('XXX', ExtendedIUPACProtein()) 1374 >>> print my_protein 1375 XXX 1376 1377 """ 1378 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1379 Alphabet.ProteinAlphabet): 1380 raise ValueError("Proteins cannot be translated!") 1381 return UnknownSeq(self._length//3, Alphabet.generic_protein, "X")
1382
1383 - def ungap(self, gap=None):
1384 """Return a copy of the sequence without the gap character(s). 1385 1386 The gap character can be specified in two ways - either as an explicit 1387 argument, or via the sequence's alphabet. For example: 1388 1389 >>> from Bio.Seq import UnknownSeq 1390 >>> from Bio.Alphabet import Gapped, generic_dna 1391 >>> my_dna = UnknownSeq(20, Gapped(generic_dna,"-")) 1392 >>> my_dna 1393 UnknownSeq(20, alphabet = Gapped(DNAAlphabet(), '-'), character = 'N') 1394 >>> my_dna.ungap() 1395 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N') 1396 >>> my_dna.ungap("-") 1397 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N') 1398 1399 If the UnknownSeq is using the gap character, then an empty Seq is 1400 returned: 1401 1402 >>> my_gap = UnknownSeq(20, Gapped(generic_dna,"-"), character="-") 1403 >>> my_gap 1404 UnknownSeq(20, alphabet = Gapped(DNAAlphabet(), '-'), character = '-') 1405 >>> my_gap.ungap() 1406 Seq('', DNAAlphabet()) 1407 >>> my_gap.ungap("-") 1408 Seq('', DNAAlphabet()) 1409 1410 Notice that the returned sequence's alphabet is adjusted to remove any 1411 explicit gap character declaration. 1412 """ 1413 #Offload the alphabet stuff 1414 s = Seq(self._character, self.alphabet).ungap() 1415 if s : 1416 return UnknownSeq(self._length, s.alphabet, self._character) 1417 else : 1418 return Seq("", s.alphabet)
1419
1420 -class MutableSeq(object):
1421 """An editable sequence object (with an alphabet). 1422 1423 Unlike normal python strings and our basic sequence object (the Seq class) 1424 which are immuatable, the MutableSeq lets you edit the sequence in place. 1425 However, this means you cannot use a MutableSeq object as a dictionary key. 1426 1427 >>> from Bio.Seq import MutableSeq 1428 >>> from Bio.Alphabet import generic_dna 1429 >>> my_seq = MutableSeq("ACTCGTCGTCG", generic_dna) 1430 >>> my_seq 1431 MutableSeq('ACTCGTCGTCG', DNAAlphabet()) 1432 >>> my_seq[5] 1433 'T' 1434 >>> my_seq[5] = "A" 1435 >>> my_seq 1436 MutableSeq('ACTCGACGTCG', DNAAlphabet()) 1437 >>> my_seq[5] 1438 'A' 1439 >>> my_seq[5:8] = "NNN" 1440 >>> my_seq 1441 MutableSeq('ACTCGNNNTCG', DNAAlphabet()) 1442 >>> len(my_seq) 1443 11 1444 1445 Note that the MutableSeq object does not support as many string-like 1446 or biological methods as the Seq object. 1447 """
1448 - def __init__(self, data, alphabet = Alphabet.generic_alphabet):
1449 if type(data) == type(""): 1450 self.data = array.array("c", data) 1451 else: 1452 self.data = data # assumes the input is an array 1453 self.alphabet = alphabet
1454
1455 - def __repr__(self):
1456 """Returns a (truncated) representation of the sequence for debugging.""" 1457 if len(self) > 60: 1458 #Shows the last three letters as it is often useful to see if there 1459 #is a stop codon at the end of a sequence. 1460 #Note total length is 54+3+3=60 1461 return "%s('%s...%s', %s)" % (self.__class__.__name__, 1462 str(self[:54]), str(self[-3:]), 1463 repr(self.alphabet)) 1464 else: 1465 return "%s('%s', %s)" % (self.__class__.__name__, 1466 str(self), 1467 repr(self.alphabet))
1468
1469 - def __str__(self):
1470 """Returns the full sequence as a python string. 1471 1472 Note that Biopython 1.44 and earlier would give a truncated 1473 version of repr(my_seq) for str(my_seq). If you are writing code 1474 which needs to be backwards compatible with old Biopython, you 1475 should continue to use my_seq.tostring() rather than str(my_seq). 1476 """ 1477 #See test_GAQueens.py for an historic usage of a non-string alphabet! 1478 return "".join(self.data)
1479
1480 - def __cmp__(self, other):
1481 """Compare the sequence to another sequence or a string (README). 1482 1483 Currently if compared to another sequence the alphabets must be 1484 compatible. Comparing DNA to RNA, or Nucleotide to Protein will raise 1485 an exception. Otherwise only the sequence itself is compared, not the 1486 precise alphabet. 1487 1488 A future release of Biopython will change this (and the Seq object etc) 1489 to use simple string comparison. The plan is that comparing sequences 1490 with incompatible alphabets (e.g. DNA to RNA) will trigger a warning 1491 but not an exception. 1492 1493 During this transition period, please just do explicit comparisons: 1494 1495 >>> seq1 = MutableSeq("ACGT") 1496 >>> seq2 = MutableSeq("ACGT") 1497 >>> id(seq1) == id(seq2) 1498 False 1499 >>> str(seq1) == str(seq2) 1500 True 1501 1502 This method indirectly supports ==, < , etc. 1503 """ 1504 if hasattr(other, "alphabet"): 1505 #other should be a Seq or a MutableSeq 1506 import warnings 1507 warnings.warn("In future comparing incompatible alphabets will " 1508 "only trigger a warning (not an exception). In " 1509 "the interim please use id(seq1)==id(seq2) or " 1510 "str(seq1)==str(seq2) to make your code explicit " 1511 "and to avoid this warning.", FutureWarning) 1512 if not Alphabet._check_type_compatible([self.alphabet, 1513 other.alphabet]): 1514 raise TypeError("Incompatable alphabets %s and %s" \ 1515 % (repr(self.alphabet), repr(other.alphabet))) 1516 #They should be the same sequence type (or one of them is generic) 1517 if isinstance(other, MutableSeq): 1518 #See test_GAQueens.py for an historic usage of a non-string 1519 #alphabet! Comparing the arrays supports this. 1520 return cmp(self.data, other.data) 1521 else: 1522 return cmp(str(self), str(other)) 1523 elif isinstance(other, basestring): 1524 return cmp(str(self), other) 1525 else: 1526 raise TypeError
1527
1528 - def __len__(self): return len(self.data)
1529
1530 - def __getitem__(self, index):
1531 #Note since Python 2.0, __getslice__ is deprecated 1532 #and __getitem__ is used instead. 1533 #See http://docs.python.org/ref/sequence-methods.html 1534 if isinstance(index, int): 1535 #Return a single letter as a string 1536 return self.data[index] 1537 else: 1538 #Return the (sub)sequence as another Seq object 1539 return MutableSeq(self.data[index], self.alphabet)
1540
1541 - def __setitem__(self, index, value):
1542 #Note since Python 2.0, __setslice__ is deprecated 1543 #and __setitem__ is used instead. 1544 #See http://docs.python.org/ref/sequence-methods.html 1545 if isinstance(index, int): 1546 #Replacing a single letter with a new string 1547 self.data[index] = value 1548 else: 1549 #Replacing a sub-sequence 1550 if isinstance(value, MutableSeq): 1551 self.data[index] = value.data 1552 elif isinstance(value, type(self.data)): 1553 self.data[index] = value 1554 else: 1555 self.data[index] = array.array("c", str(value))
1556
1557 - def __delitem__(self, index):
1558 #Note since Python 2.0, __delslice__ is deprecated 1559 #and __delitem__ is used instead. 1560 #See http://docs.python.org/ref/sequence-methods.html 1561 1562 #Could be deleting a single letter, or a slice 1563 del self.data[index]
1564
1565 - def __add__(self, other):
1566 """Add another sequence or string to this sequence. 1567 1568 Returns a new MutableSeq object.""" 1569 if hasattr(other, "alphabet"): 1570 #other should be a Seq or a MutableSeq 1571 if not Alphabet._check_type_compatible([self.alphabet, 1572 other.alphabet]): 1573 raise TypeError("Incompatable alphabets %s and %s" \ 1574 % (repr(self.alphabet), repr(other.alphabet))) 1575 #They should be the same sequence type (or one of them is generic) 1576 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 1577 if isinstance(other, MutableSeq): 1578 #See test_GAQueens.py for an historic usage of a non-string 1579 #alphabet! Adding the arrays should support this. 1580 return self.__class__(self.data + other.data, a) 1581 else: 1582 return self.__class__(str(self) + str(other), a) 1583 elif isinstance(other, basestring): 1584 #other is a plain string - use the current alphabet 1585 return self.__class__(str(self) + str(other), self.alphabet) 1586 else: 1587 raise TypeError
1588
1589 - def __radd__(self, other):
1590 if hasattr(other, "alphabet"): 1591 #other should be a Seq or a MutableSeq 1592 if not Alphabet._check_type_compatible([self.alphabet, 1593 other.alphabet]): 1594 raise TypeError("Incompatable alphabets %s and %s" \ 1595 % (repr(self.alphabet), repr(other.alphabet))) 1596 #They should be the same sequence type (or one of them is generic) 1597 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 1598 if isinstance(other, MutableSeq): 1599 #See test_GAQueens.py for an historic usage of a non-string 1600 #alphabet! Adding the arrays should support this. 1601 return self.__class__(other.data + self.data, a) 1602 else: 1603 return self.__class__(str(other) + str(self), a) 1604 elif isinstance(other, basestring): 1605 #other is a plain string - use the current alphabet 1606 return self.__class__(str(other) + str(self), self.alphabet) 1607 else: 1608 raise TypeError
1609
1610 - def append(self, c):
1611 self.data.append(c)
1612
1613 - def insert(self, i, c):
1614 self.data.insert(i, c)
1615
1616 - def pop(self, i = (-1)):
1617 c = self.data[i] 1618 del self.data[i] 1619 return c
1620
1621 - def remove(self, item):
1622 for i in range(len(self.data)): 1623 if self.data[i] == item: 1624 del self.data[i] 1625 return 1626 raise ValueError("MutableSeq.remove(x): x not in list")
1627
1628 - def count(self, sub, start=0, end=sys.maxint):
1629 """Non-overlapping count method, like that of a python string. 1630 1631 This behaves like the python string method of the same name, 1632 which does a non-overlapping count! 1633 1634 Returns an integer, the number of occurrences of substring 1635 argument sub in the (sub)sequence given by [start:end]. 1636 Optional arguments start and end are interpreted as in slice 1637 notation. 1638 1639 Arguments: 1640 - sub - a string or another Seq object to look for 1641 - start - optional integer, slice start 1642 - end - optional integer, slice end 1643 1644 e.g. 1645 1646 >>> from Bio.Seq import MutableSeq 1647 >>> my_mseq = MutableSeq("AAAATGA") 1648 >>> print my_mseq.count("A") 1649 5 1650 >>> print my_mseq.count("ATG") 1651 1 1652 >>> print my_mseq.count(Seq("AT")) 1653 1 1654 >>> print my_mseq.count("AT", 2, -1) 1655 1 1656 1657 HOWEVER, please note because that python strings, Seq objects and 1658 MutableSeq objects do a non-overlapping search, this may not give 1659 the answer you expect: 1660 1661 >>> "AAAA".count("AA") 1662 2 1663 >>> print MutableSeq("AAAA").count("AA") 1664 2 1665 1666 A non-overlapping search would give the answer as three! 1667 """ 1668 try: 1669 #TODO - Should we check the alphabet? 1670 search = sub.tostring() 1671 except AttributeError: 1672 search = sub 1673 1674 if not isinstance(search, basestring): 1675 raise TypeError("expected a string, Seq or MutableSeq") 1676 1677 if len(search) == 1: 1678 #Try and be efficient and work directly from the array. 1679 count = 0 1680 for c in self.data[start:end]: 1681 if c == search: count += 1 1682 return count 1683 else: 1684 #TODO - Can we do this more efficiently? 1685 return self.tostring().count(search, start, end)
1686
1687 - def index(self, item):
1688 for i in range(len(self.data)): 1689 if self.data[i] == item: 1690 return i 1691 raise ValueError("MutableSeq.index(x): x not in list")
1692
1693 - def reverse(self):
1694 """Modify the mutable sequence to reverse itself. 1695 1696 No return value. 1697 """ 1698 self.data.reverse()
1699
1700 - def complement(self):
1701 """Modify the mutable sequence to take on its complement. 1702 1703 Trying to complement a protein sequence raises an exception. 1704 1705 No return value. 1706 """ 1707 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1708 Alphabet.ProteinAlphabet): 1709 raise ValueError("Proteins do not have complements!") 1710 if self.alphabet in (IUPAC.ambiguous_dna, IUPAC.unambiguous_dna): 1711 d = ambiguous_dna_complement 1712 elif self.alphabet in (IUPAC.ambiguous_rna, IUPAC.unambiguous_rna): 1713 d = ambiguous_rna_complement 1714 elif 'U' in self.data and 'T' in self.data: 1715 #TODO - Handle this cleanly? 1716 raise ValueError("Mixed RNA/DNA found") 1717 elif 'U' in self.data: 1718 d = ambiguous_rna_complement 1719 else: 1720 d = ambiguous_dna_complement 1721 c = dict([(x.lower(), y.lower()) for x,y in d.iteritems()]) 1722 d.update(c) 1723 self.data = map(lambda c: d[c], self.data) 1724 self.data = array.array('c', self.data)
1725
1726 - def reverse_complement(self):
1727 """Modify the mutable sequence to take on its reverse complement. 1728 1729 Trying to reverse complement a protein sequence raises an exception. 1730 1731 No return value. 1732 """ 1733 self.complement() 1734 self.data.reverse()
1735 1736 ## Sorting a sequence makes no sense. 1737 # def sort(self, *args): self.data.sort(*args) 1738
1739 - def extend(self, other):
1740 if isinstance(other, MutableSeq): 1741 for c in other.data: 1742 self.data.append(c) 1743 else: 1744 for c in other: 1745 self.data.append(c)
1746
1747 - def tostring(self):
1748 """Returns the full sequence as a python string. 1749 1750 Although not formally deprecated, you are now encouraged to use 1751 str(my_seq) instead of my_seq.tostring(). 1752 1753 Because str(my_seq) will give you the full sequence as a python string, 1754 there is often no need to make an explicit conversion. For example, 1755 1756 print "ID={%s}, sequence={%s}" % (my_name, my_seq) 1757 1758 On Biopython 1.44 or older you would have to have done this: 1759 1760 print "ID={%s}, sequence={%s}" % (my_name, my_seq.tostring()) 1761 """ 1762 return "".join(self.data)
1763
1764 - def toseq(self):
1765 """Returns the full sequence as a new immutable Seq object. 1766 1767 >>> from Bio.Seq import Seq 1768 >>> from Bio.Alphabet import IUPAC 1769 >>> my_mseq = MutableSeq("MKQHKAMIVALIVICITAVVAAL", \ 1770 IUPAC.protein) 1771 >>> my_mseq 1772 MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 1773 >>> my_mseq.toseq() 1774 Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 1775 1776 Note that the alphabet is preserved. 1777 """ 1778 return Seq("".join(self.data), self.alphabet)
1779
1780 # The transcribe, backward_transcribe, and translate functions are 1781 # user-friendly versions of the corresponding functions in Bio.Transcribe 1782 # and Bio.Translate. The functions work both on Seq objects, and on strings. 1783 1784 -def transcribe(dna):
1785 """Transcribes a DNA sequence into RNA. 1786 1787 If given a string, returns a new string object. 1788 1789 Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet. 1790 1791 Trying to transcribe a protein or RNA sequence raises an exception. 1792 1793 e.g. 1794 1795 >>> transcribe("ACTGN") 1796 'ACUGN' 1797 """ 1798 if isinstance(dna, Seq): 1799 return dna.transcribe() 1800 elif isinstance(dna, MutableSeq): 1801 return dna.toseq().transcribe() 1802 else: 1803 return dna.replace('T','U').replace('t','u')
1804
1805 -def back_transcribe(rna):
1806 """Back-transcribes an RNA sequence into DNA. 1807 1808 If given a string, returns a new string object. 1809 1810 Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet. 1811 1812 Trying to transcribe a protein or DNA sequence raises an exception. 1813 1814 e.g. 1815 1816 >>> back_transcribe("ACUGN") 1817 'ACTGN' 1818 """ 1819 if isinstance(rna, Seq): 1820 return rna.back_transcribe() 1821 elif isinstance(rna, MutableSeq): 1822 return rna.toseq().back_transcribe() 1823 else: 1824 return rna.replace('U','T').replace('u','t')
1825
1826 -def _translate_str(sequence, table, stop_symbol="*", to_stop=False, 1827 cds=False, pos_stop="X"):
1828 """Helper function to translate a nucleotide string (PRIVATE). 1829 1830 Arguments: 1831 - sequence - a string 1832 - table - a CodonTable object (NOT a table name or id number) 1833 - stop_symbol - a single character string, what to use for terminators. 1834 - to_stop - boolean, should translation terminate at the first 1835 in frame stop codon? If there is no in-frame stop codon 1836 then translation continues to the end. 1837 - pos_stop - a single character string for a possible stop codon 1838 (e.g. TAN or NNN) 1839 - cds - Boolean, indicates this is a complete CDS. If True, this 1840 checks the sequence starts with a valid alternative start 1841 codon (which will be translated as methionine, M), that the 1842 sequence length is a multiple of three, and that there is a 1843 single in frame stop codon at the end (this will be excluded 1844 from the protein sequence, regardless of the to_stop option). 1845 If these tests fail, an exception is raised. 1846 1847 Returns a string. 1848 1849 e.g. 1850 1851 >>> from Bio.Data import CodonTable 1852 >>> table = CodonTable.ambiguous_dna_by_id[1] 1853 >>> _translate_str("AAA", table) 1854 'K' 1855 >>> _translate_str("TAR", table) 1856 '*' 1857 >>> _translate_str("TAN", table) 1858 'X' 1859 >>> _translate_str("TAN", table, pos_stop="@") 1860 '@' 1861 >>> _translate_str("TA?", table) 1862 Traceback (most recent call last): 1863 ... 1864 TranslationError: Codon 'TA?' is invalid 1865 >>> _translate_str("ATGCCCTAG", table, cds=True) 1866 'MP' 1867 >>> _translate_str("AAACCCTAG", table, cds=True) 1868 Traceback (most recent call last): 1869 ... 1870 TranslationError: First codon 'AAA' is not a start codon 1871 >>> _translate_str("ATGCCCTAGCCCTAG", table, cds=True) 1872 Traceback (most recent call last): 1873 ... 1874 TranslationError: Extra in frame stop codon found. 1875 """ 1876 sequence = sequence.upper() 1877 amino_acids = [] 1878 forward_table = table.forward_table 1879 stop_codons = table.stop_codons 1880 if table.nucleotide_alphabet.letters is not None: 1881 valid_letters = set(table.nucleotide_alphabet.letters.upper()) 1882 else: 1883 #Assume the worst case, ambiguous DNA or RNA: 1884 valid_letters = set(IUPAC.ambiguous_dna.letters.upper() + \ 1885 IUPAC.ambiguous_rna.letters.upper()) 1886 if cds: 1887 if str(sequence[:3]).upper() not in table.start_codons: 1888 raise CodonTable.TranslationError(\ 1889 "First codon '%s' is not a start codon" % sequence[:3]) 1890 if len(sequence) % 3 != 0: 1891 raise CodonTable.TranslationError(\ 1892 "Sequence length %i is not a multiple of three" % len(sequence)) 1893 if str(sequence[-3:]).upper() not in stop_codons: 1894 raise CodonTable.TranslationError(\ 1895 "Final codon '%s' is not a stop codon" % sequence[-3:]) 1896 #Don't translate the stop symbol, and manually translate the M 1897 sequence = sequence[3:-3] 1898 amino_acids = ["M"] 1899 n = len(sequence) 1900 for i in xrange(0,n-n%3,3): 1901 codon = sequence[i:i+3] 1902 try: 1903 amino_acids.append(forward_table[codon]) 1904 except (KeyError, CodonTable.TranslationError): 1905 #Todo? Treat "---" as a special case (gapped translation) 1906 if codon in table.stop_codons: 1907 if cds: 1908 raise CodonTable.TranslationError(\ 1909 "Extra in frame stop codon found.") 1910 if to_stop : break 1911 amino_acids.append(stop_symbol) 1912 elif valid_letters.issuperset(set(codon)): 1913 #Possible stop codon (e.g. NNN or TAN) 1914 amino_acids.append(pos_stop) 1915 else: 1916 raise CodonTable.TranslationError(\ 1917 "Codon '%s' is invalid" % codon) 1918 return "".join(amino_acids)
1919
1920 -def translate(sequence, table="Standard", stop_symbol="*", to_stop=False, 1921 cds=False):
1922 """Translate a nucleotide sequence into amino acids. 1923 1924 If given a string, returns a new string object. Given a Seq or 1925 MutableSeq, returns a Seq object with a protein alphabet. 1926 1927 Arguments: 1928 - table - Which codon table to use? This can be either a name 1929 (string) or an NCBI identifier (integer). Defaults 1930 to the "Standard" table. 1931 - stop_symbol - Single character string, what to use for any 1932 terminators, defaults to the asterisk, "*". 1933 - to_stop - Boolean, defaults to False meaning do a full 1934 translation continuing on past any stop codons 1935 (translated as the specified stop_symbol). If 1936 True, translation is terminated at the first in 1937 frame stop codon (and the stop_symbol is not 1938 appended to the returned protein sequence). 1939 - cds - Boolean, indicates this is a complete CDS. If True, this 1940 checks the sequence starts with a valid alternative start 1941 codon (which will be translated as methionine, M), that the 1942 sequence length is a multiple of three, and that there is a 1943 single in frame stop codon at the end (this will be excluded 1944 from the protein sequence, regardless of the to_stop option). 1945 If these tests fail, an exception is raised. 1946 1947 A simple string example using the default (standard) genetic code: 1948 1949 >>> coding_dna = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG" 1950 >>> translate(coding_dna) 1951 'VAIVMGR*KGAR*' 1952 >>> translate(coding_dna, stop_symbol="@") 1953 'VAIVMGR@KGAR@' 1954 >>> translate(coding_dna, to_stop=True) 1955 'VAIVMGR' 1956 1957 Now using NCBI table 2, where TGA is not a stop codon: 1958 1959 >>> translate(coding_dna, table=2) 1960 'VAIVMGRWKGAR*' 1961 >>> translate(coding_dna, table=2, to_stop=True) 1962 'VAIVMGRWKGAR' 1963 1964 In fact this example uses an alternative start codon valid under NCBI table 2, 1965 GTG, which means this example is a complete valid CDS which when translated 1966 should really start with methionine (not valine): 1967 1968 >>> translate(coding_dna, table=2, cds=True) 1969 'MAIVMGRWKGAR' 1970 1971 Note that if the sequence has no in-frame stop codon, then the to_stop 1972 argument has no effect: 1973 1974 >>> coding_dna2 = "GTGGCCATTGTAATGGGCCGC" 1975 >>> translate(coding_dna2) 1976 'VAIVMGR' 1977 >>> translate(coding_dna2, to_stop=True) 1978 'VAIVMGR' 1979 1980 NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid 1981 or a stop codon. These are translated as "X". Any invalid codon 1982 (e.g. "TA?" or "T-A") will throw a TranslationError. 1983 1984 NOTE - Does NOT support gapped sequences. 1985 1986 It will however translate either DNA or RNA. 1987 """ 1988 if isinstance(sequence, Seq): 1989 return sequence.translate(table, stop_symbol, to_stop, cds) 1990 elif isinstance(sequence, MutableSeq): 1991 #Return a Seq object 1992 return sequence.toseq().translate(table, stop_symbol, to_stop, cds) 1993 else: 1994 #Assume its a string, return a string 1995 try: 1996 codon_table = CodonTable.ambiguous_generic_by_id[int(table)] 1997 except ValueError: 1998 codon_table = CodonTable.ambiguous_generic_by_name[table] 1999 return _translate_str(sequence, codon_table, stop_symbol, to_stop, cds)
2000
2001 -def reverse_complement(sequence):
2002 """Returns the reverse complement sequence of a nucleotide string. 2003 2004 If given a string, returns a new string object. 2005 Given a Seq or a MutableSeq, returns a new Seq object with the same alphabet. 2006 2007 Supports unambiguous and ambiguous nucleotide sequences. 2008 2009 e.g. 2010 2011 >>> reverse_complement("ACTG-NH") 2012 'DN-CAGT' 2013 """ 2014 if isinstance(sequence, Seq): 2015 #Return a Seq 2016 return sequence.reverse_complement() 2017 elif isinstance(sequence, MutableSeq): 2018 #Return a Seq 2019 #Don't use the MutableSeq reverse_complement method as it is 'in place'. 2020 return sequence.toseq().reverse_complement() 2021 2022 #Assume its a string. 2023 #In order to avoid some code duplication, the old code would turn the string 2024 #into a Seq, use the reverse_complement method, and convert back to a string. 2025 #This worked, but is over five times slower on short sequences! 2026 if ('U' in sequence or 'u' in sequence) \ 2027 and ('T' in sequence or 't' in sequence): 2028 raise ValueError("Mixed RNA/DNA found") 2029 elif 'U' in sequence or 'u' in sequence: 2030 ttable = _rna_complement_table 2031 else: 2032 ttable = _dna_complement_table 2033 return sequence.translate(ttable)[::-1]
2034
2035 -def _test():
2036 """Run the Bio.Seq module's doctests.""" 2037 print "Runing doctests..." 2038 import doctest 2039 doctest.testmod() 2040 print "Done"
2041 2042 if __name__ == "__main__": 2043 _test() 2044