Package Bio :: Package SeqUtils :: Module CheckSum
[hide private]
[frames | no frames]

Source Code for Module Bio.SeqUtils.CheckSum

  1  # Copyright 2002 by Yves Bastide and Brad Chapman. 
  2  # Copyright 2007 by Sebastian Bassi 
  3  # All rights reserved. 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7   
  8  """Functions to calculate assorted sequence checksums.""" 
  9   
 10  # crc32, crc64, gcg, and seguid 
 11  # crc64 is adapted from BioPerl 
 12   
 13  from binascii import crc32 as _crc32 
 14   
def crc32(seq):
    """Return the CRC32 checksum for a sequence (string or Seq object).

    seq -- a string, a bytes-like object, or any object offering a
           ``tostring()`` method (such as a Seq object).

    The value is whatever ``binascii.crc32`` returns for the sequence
    data.  Text is encoded as UTF-8 first, because ``binascii.crc32``
    only accepts bytes-like input on Python 3.
    """
    try:
        # Assume it's a Seq object
        seq = seq.tostring()
    except AttributeError:
        # Assume it's already a string
        pass
    # On Python 2 ``bytes`` is ``str``, so plain strings pass through
    # untouched; objects without ``encode`` (bytearray, ...) do as well.
    if not isinstance(seq, bytes) and hasattr(seq, "encode"):
        seq = seq.encode("utf-8")
    return _crc32(seq)
23
24 -def _init_table_h():
25 _table_h = [] 26 for i in range(256): 27 l = i 28 part_h = 0 29 for j in range(8): 30 rflag = l & 1 31 l >>= 1 32 if part_h & 1: l |= (1L << 31) 33 part_h >>= 1L 34 if rflag: part_h ^= 0xd8000000L 35 _table_h.append(part_h) 36 return _table_h
# Initialisation: build the crc64 lookup table once, at import time.
_table_h = _init_table_h()
def crc64(s):
    """Return the CRC64 checksum for a sequence (string or Seq object).

    s -- an iterable of single characters (string, Seq object) or of
         integer byte values (bytes on Python 3).

    The checksum is kept as two 32-bit halves and returned as a string:
    "CRC-" followed by the high and the low half, each formatted as
    eight upper-case hexadecimal digits.
    """
    crcl = 0
    crch = 0
    for c in s:
        # Iterating bytes on Python 3 yields ints; strings yield
        # one-character strings that still need ord().
        code = c if isinstance(c, int) else ord(c)
        # The low byte of the high half moves into the top of the low half.
        shr = (crch & 0xFF) << 24
        temp1h = crch >> 8
        temp1l = (crcl >> 8) | shr
        idx = (crcl ^ code) & 0xFF
        crch = temp1h ^ _table_h[idx]
        crcl = temp1l

    return "CRC-%08X%08X" % (crch, crcl)
54 55
def gcg(seq):
    """Return the GCG checksum (int) for a sequence (string or Seq object).

    Given a nucleotide or amino-acid sequence (or any string),
    returns the GCG checksum (int). Checksum used by GCG program.
    seq type = str.
    Based on BioPerl GCG_checksum. Adapted by Sebastian Bassi
    with the help of John Lenton, Pablo Ziliani, and Gabriel Genellina.
    All sequences are converted to uppercase.
    """
    try:
        # Assume it's a Seq object
        seq = seq.tostring()
    except AttributeError:
        # Assume it's already a string
        pass
    checksum = 0
    for position, char in enumerate(seq):
        # Weights run 1..57 and then start over at 1.
        weight = position % 57 + 1
        checksum += weight * ord(char.upper())
    return checksum % 10000
73
def seguid(seq):
    """Return the SEGUID (string) for a sequence (string or Seq object).

    Given a nucleotide or amino-acid sequence (or any string),
    returns the SEGUID string (A SEquence Globally Unique IDentifier).
    seq type = str.
    For more information about SEGUID, see:
    http://bioinformatics.anl.gov/seguid/
    DOI: 10.1002/pmic.200600032

    The sequence is upper-cased, hashed with SHA-1, and the digest is
    base64 encoded with the trailing "=" padding removed.
    """
    import base64
    import hashlib

    try:
        # Assume it's a Seq object
        seq = seq.tostring()
    except AttributeError:
        # Assume it's already a string
        pass
    seq = seq.upper()
    # hashlib only accepts bytes on Python 3.  On Python 2 ``bytes`` is
    # ``str``, so plain strings pass through untouched.
    if not isinstance(seq, bytes) and hasattr(seq, "encode"):
        seq = seq.encode("utf-8")
    encoded = base64.b64encode(hashlib.sha1(seq).digest())
    # b64encode returns bytes on Python 3; hand back a native string.
    if not isinstance(encoded, str):
        encoded = encoded.decode("ascii")
    return encoded.rstrip("=")
if __name__ == "__main__":
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("Quick self test")

    str_light_chain_one = ("QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGK"
                           "APKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADY"
                           "YCSSYAGSSTLVFGGGTKLTVL")

    str_light_chain_two = ("QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGK"
                           "APKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADY"
                           "YCCSYAGSSTWVFGGGTKLTVL")

    # The two chains differ in two residues, yet they are expected to
    # produce the same CRC64 value.
    assert crc64(str_light_chain_one) == crc64(str_light_chain_two)
    assert 'CRC-44CAAD88706CC153' == crc64(str_light_chain_one)

    # Their SEGUIDs, on the other hand, are expected to differ.
    assert 'BpBeDdcNUYNsdk46JoJdw7Pd3BI' == seguid(str_light_chain_one)
    assert 'X5XEaayob1nZLOc7eVT9qyczarY' == seguid(str_light_chain_two)

    print("Done")