Package Bio :: Package Restriction :: Package _Update :: Module RestrictionCompiler
[hide private]
[frames] | no frames]

Source Code for Module Bio.Restriction._Update.RestrictionCompiler

  1  #!/usr/bin/env python 
  2  # 
  3  #      Restriction Analysis Libraries. 
  4  #      Copyright (C) 2004. Frederic Sohm. 
  5  # 
  6  # This code is part of the Biopython distribution and governed by its 
  7  # license.  Please see the LICENSE file that should have been included 
  8  # as part of this package. 
  9  # 
 10  #   this script is used to produce the dictionary which will contains the data 
 11  #   about the restriction enzymes from the Emboss/Rebase data files 
 12  #   namely 
 13  #   emboss_e.### (description of the sites), 
 14  #   emboss_r.### (origin, methylation, references) 
 15  #   emboss_s.### (suppliers) 
 16  #   where ### is a number of three digits : 1 for the year two for the month 
 17  # 
 18  #   very dirty implementation but it does the job, so... 
 19  #   Not very quick either but you are not supposed to use it frequently. 
 20  # 
 21  #   The results are stored in 
 22  #   path/to/site-packages/Bio/Restriction/Restriction_Dictionary.py 
 23  #   the file contains two dictionary: 
 24  #   'rest_dict' which contains the data for the enzymes 
 25  #   and 
 26  #   'suppliers' which map the name of the suppliers to their abbreviation. 
 27  # 
 28   
 29  """Convert a serie of Rebase files into a Restriction_Dictionary.py module. 
 30   
 31  The Rebase files are in the emboss format: 
 32   
 33      emboss_e.###    -> contains informations about the restriction sites. 
 34      emboss_r.###    -> contains general informations about the enzymes. 
 35      emboss_s.###    -> contains informations about the suppliers. 
 36       
 37  ### is a 3 digit number. The first digit is the year and the two last the month. 
 38  """ 
 39   
 40  import sre 
 41  import os 
 42  import itertools 
 43  import time 
 44  import sys 
 45  import site 
 46  import shutil 
 47   
 48  from Bio.Seq import Seq 
 49  from Bio.Alphabet import IUPAC 
 50   
 51  import Bio.Restriction.Restriction 
 52  from Bio.Restriction.Restriction import AbstractCut, RestrictionType, NoCut, OneCut,\ 
 53  TwoCuts, Meth_Dep, Meth_Undep, Palindromic, NonPalindromic, Unknown, Blunt,\ 
 54  Ov5, Ov3, NotDefined, Defined, Ambiguous, Commercially_available, Not_available  
 55   
 56  import Bio.Restriction.RanaConfig as config 
 57  from Bio.Restriction._Update.Update import RebaseUpdate 
 58  from Bio.Restriction.Restriction import * 
 59  from Bio.Restriction.DNAUtils import antiparallel 
 60   
 61  DNA=Seq 
 62  dna_alphabet = {'A':'A', 'C':'C', 'G':'G', 'T':'T', 
 63                  'R':'AG', 'Y':'CT', 'W':'AT', 'S':'CG', 'M':'AC', 'K':'GT', 
 64                  'H':'ACT', 'B':'CGT', 'V':'ACG', 'D':'AGT', 
 65                  'N':'ACGT', 
 66                  'a': 'a', 'c': 'c', 'g': 'g', 't': 't', 
 67                  'r':'ag', 'y':'ct', 'w':'at', 's':'cg', 'm':'ac', 'k':'gt', 
 68                  'h':'act', 'b':'cgt', 'v':'acg', 'd':'agt', 
 69                  'n':'acgt'} 
 70   
 71   
 72  complement_alphabet = {'A':'T', 'T':'A', 'C':'G', 'G':'C','R':'Y', 'Y':'R', 
 73                         'W':'W', 'S':'S', 'M':'K', 'K':'M', 'H':'D', 'D':'H', 
 74                         'B':'V', 'V':'B', 'N':'N','a':'t', 'c':'g', 'g':'c', 
 75                         't':'a', 'r':'y', 'y':'r', 'w':'w', 's':'s','m':'k', 
 76                         'k':'m', 'h':'d', 'd':'h', 'b':'v', 'v':'b', 'n':'n'} 
 77  enzymedict = {} 
 78  suppliersdict = {} 
 79  classdict = {} 
 80  typedict = {} 
 81   
 82   
83 -class OverhangError(ValueError):
84 """Exception for dealing with overhang.""" 85 pass
86
87 -def BaseExpand(base):
88 """BaseExpand(base) -> string. 89 90 given a degenerated base, returns its meaning in IUPAC alphabet. 91 92 i.e: 93 b= 'A' -> 'A' 94 b= 'N' -> 'ACGT' 95 etc...""" 96 base = base.upper() 97 return dna_alphabet[base]
98
99 -def regex(site):
100 """regex(site) -> string. 101 102 Construct a regular expression from a DNA sequence. 103 i.e.: 104 site = 'ABCGN' -> 'A[CGT]CG.'""" 105 reg_ex = site 106 for base in reg_ex: 107 if base in ('A', 'T', 'C', 'G', 'a', 'c', 'g', 't'): 108 pass 109 if base in ('N', 'n'): 110 reg_ex = '.'.join(reg_ex.split('N')) 111 reg_ex = '.'.join(reg_ex.split('n')) 112 if base in ('R', 'Y', 'W', 'M', 'S', 'K', 'H', 'D', 'B', 'V'): 113 expand = '['+ str(BaseExpand(base))+']' 114 reg_ex = expand.join(reg_ex.split(base)) 115 return reg_ex
116
117 -def Antiparallel(sequence):
118 """Antiparallel(sequence) -> string. 119 120 returns a string which represents the reverse complementary strand of 121 a DNA sequence.""" 122 return antiparallel(sequence.tostring())
123
124 -def is_palindrom(sequence):
125 """is_palindrom(sequence) -> bool. 126 127 True is the sequence is a palindrom. 128 sequence is a DNA object.""" 129 return sequence == DNA(Antiparallel(sequence))
130
131 -def LocalTime():
132 """LocalTime() -> string. 133 134 LocalTime calculate the extension for emboss file for the current year and 135 month.""" 136 t = time.gmtime() 137 year = str(t.tm_year)[-1] 138 month = str(t.tm_mon) 139 if len(month) == 1 : month = '0'+month 140 return year+month
141 142
143 -class newenzyme(object):
144 """construct the attributes of the enzyme corresponding to 'name'."""
145 - def __init__(cls, name):
146 cls.opt_temp = 37 147 cls.inact_temp = 65 148 cls.substrat = 'DNA' 149 target = enzymedict[name] 150 cls.site = target[0] 151 cls.size = target[1] 152 cls.suppl = tuple(target[9]) 153 cls.freq = target[11] 154 cls.ovhg = target[13] 155 cls.ovhgseq = target[14] 156 cls.bases = () 157 # 158 # Is the site palindromic? 159 # Important for the way the DNA is search for the site. 160 # Palindromic sites needs to be looked for only over 1 strand. 161 # Non Palindromic needs to be search for on the reverse complement 162 # as well. 163 # 164 if target[10] : cls.bases += ('Palindromic',) 165 else : cls.bases += ('NonPalindromic',) 166 # 167 # Number of cut the enzyme produce. 168 # 0 => unknown, the enzyme has not been fully characterised. 169 # 2 => 1 cut, (because one cut is realised by cutting 2 strands 170 # 4 => 2 cuts, same logic. 171 # A little bit confusing but it is the way EMBOSS/Rebase works. 172 # 173 if not target[2]: 174 # 175 # => undefined enzymes, nothing to be done. 176 # 177 cls.bases += ('NoCut','Unknown', 'NotDefined') 178 cls.fst5 = None 179 cls.fst3 = None 180 cls.scd5 = None 181 cls.scd3 = None 182 cls.ovhg = None 183 cls.ovhgseq = None 184 else: 185 # 186 # we will need to calculate the overhang. 187 # 188 if target[2] == 2: 189 cls.bases += ('OneCut',) 190 cls.fst5 = target[4] 191 cls.fst3 = target[5] 192 cls.scd5 = None 193 cls.scd3 = None 194 else: 195 cls.bases += ('TwoCuts',) 196 cls.fst5 = target[4] 197 cls.fst3 = target[5] 198 cls.scd5 = target[6] 199 cls.scd3 = target[7] 200 # 201 # Now, prepare the overhangs which will be added to the DNA 202 # after the cut. 203 # Undefined enzymes will not be allowed to catalyse, 204 # they are not available commercially anyway. 205 # I assumed that if an enzyme cut twice the overhang will be of 206 # the same kind. The only exception is HaeIV. I do not deal 207 # with that at the moment (ie I don't include it, 208 # need to be fixed). 209 # They generally cut outside their recognition site and 210 # therefore the overhang is undetermined and dependent of 211 # the DNA sequence upon which the enzyme act. 212 # 213 if target[3]: 214 # 215 # rebase field for blunt: blunt == 1, other == 0. 216 # The enzyme is blunt. No overhang. 217 # 218 cls.bases += ('Blunt', 'Defined') 219 cls.ovhg = 0 220 elif isinstance(cls.ovhg, int): 221 # 222 # => overhang is sequence dependent 223 # 224 if cls.ovhg > 0: 225 # 226 # 3' overhang, ambiguous site (outside recognition site 227 # or site containing ambiguous bases (N, W, R,...) 228 # 229 cls.bases += ('Ov3', 'Ambiguous') 230 elif cls.ovhg < 0: 231 # 232 # 5' overhang, ambiguous site (outside recognition site 233 # or site containing ambiguous bases (N, W, R,...) 234 # 235 cls.bases += ('Ov5', 'Ambiguous') 236 else: 237 # 238 # cls.ovhg is a string => overhang is constant 239 # 240 if cls.fst5 - (cls.fst3 + cls.size) < 0: 241 cls.bases += ('Ov5', 'Defined') 242 cls.ovhg = - len(cls.ovhg) 243 else: 244 cls.bases += ('Ov3', 'Defined') 245 cls.ovhg = + len(cls.ovhg) 246 # 247 # Next class : sensibility to methylation. 248 # Set by EmbossMixer from emboss_r.txt file 249 # Not really methylation dependent at the moment, stands rather for 250 # 'is the site methylable?'. 251 # Proper methylation sensibility has yet to be implemented. 252 # But the class is there for further development. 253 # 254 if target[8]: 255 cls.bases += ('Meth_Dep', ) 256 cls.compsite = target[12] 257 else: 258 cls.bases += ('Meth_Undep',) 259 cls.compsite = target[12] 260 # 261 # Next class will allow to select enzymes in function of their 262 # suppliers. Not essential but can be useful. 263 # 264 if cls.suppl: 265 cls.bases += ('Commercially_available', ) 266 else: 267 cls.bases += ('Not_available', ) 268 cls.bases += ('AbstractCut', 'RestrictionType') 269 cls.__name__ = name 270 cls.results = None 271 cls.dna = None 272 cls.__bases__ = cls.bases 273 cls.charac = (cls.fst5, cls.fst3, cls.scd5, cls.scd3, cls.site) 274 if not target[2] and cls.suppl: 275 supp = ', '.join([suppliersdict[s][0] for s in cls.suppl]) 276 print 'WARNING : It seems that %s is both commercially available\ 277 \n\tand its characteristics are unknown. \ 278 \n\tThis seems counter-intuitive.\ 279 \n\tThere is certainly an error either in ranacompiler or\ 280 \n\tin this REBASE release.\ 281 \n\tThe supplier is : %s.' % (name, supp) 282 return
283 284
285 -class TypeCompiler(object):
286 287 """Build the different types possible for Restriction Enzymes""" 288
289 - def __init__(self):
290 """TypeCompiler() -> new TypeCompiler instance.""" 291 pass
292
293 - def buildtype(self):
294 """TC.buildtype() -> generator. 295 296 build the new types that will be needed for constructing the 297 restriction enzymes.""" 298 baT = (AbstractCut, RestrictionType) 299 cuT = (NoCut, OneCut, TwoCuts) 300 meT = (Meth_Dep, Meth_Undep) 301 paT = (Palindromic, NonPalindromic) 302 ovT = (Unknown, Blunt, Ov5, Ov3) 303 deT = (NotDefined, Defined, Ambiguous) 304 coT = (Commercially_available, Not_available) 305 All = (baT, cuT, meT, paT, ovT, deT, coT) 306 # 307 # Now build the types. Only the most obvious are left out. 308 # Modified even the most obvious are not so obvious. 309 # emboss_*.403 AspCNI is unknown and commercially available. 310 # So now do not remove the most obvious. 311 # 312 types = [(p,c,o,d,m,co,baT[0],baT[1]) 313 for p in paT for c in cuT for o in ovT 314 for d in deT for m in meT for co in coT] 315 n= 1 316 for ty in types: 317 dct = {} 318 for t in ty: 319 dct.update(t.__dict__) 320 # 321 # here we need to customize the dictionary. 322 # i.e. types deriving from OneCut have always scd5 and scd3 323 # equal to None. No need therefore to store that in a specific 324 # enzyme of this type. but it then need to be in the type. 325 # 326 dct['results'] = [] 327 dct['substrat'] = 'DNA' 328 dct['dna'] = None 329 if t == NoCut: 330 dct.update({'fst5':None,'fst3':None, 331 'scd5':None,'scd3':None, 332 'ovhg':None,'ovhgseq':None}) 333 elif t == OneCut: 334 dct.update({'scd5':None, 'scd3':None}) 335 class klass(type): 336 def __new__(cls): 337 return type.__new__(cls, 'type%i'%n,ty,dct)
338 def __init__(cls): 339 super(klass, cls).__init__('type%i'%n,ty,dct)
340 yield klass() 341 n+=1 342 343 start = '\n\ 344 #!/usr/bin/env python\n\ 345 #\n\ 346 # Restriction Analysis Libraries.\n\ 347 # Copyright (C) 2004. Frederic Sohm.\n\ 348 #\n\ 349 # This code is part of the Biopython distribution and governed by its\n\ 350 # license. Please see the LICENSE file that should have been included\n\ 351 # as part of this package.\n\ 352 #\n\ 353 # This file is automatically generated - do not edit it by hand! Instead,\n\ 354 # use the tool Scripts/Restriction/ranacompiler.py which in turn uses\n\ 355 # Bio/Restriction/_Update/RestrictionCompiler.py\n\ 356 #\n\ 357 # The following dictionaries used to be defined in one go, but that does\n\ 358 # not work on Jython due to JVM limitations. Therefore we break this up\n\ 359 # into steps, using temporary functions to avoid the JVM limits.\n\ 360 \n\n' 361
362 -class DictionaryBuilder(object):
363
364 - def __init__(self, e_mail='', ftp_proxy=''):
365 """DictionaryBuilder([e_mail[, ftp_proxy]) -> DictionaryBuilder instance. 366 367 If the emboss files used for the construction need to be updated this 368 class will download them if the ftp connection is correctly set. 369 either in RanaConfig.py or given at run time. 370 371 e_mail is the e-mail address used as password for the anonymous 372 ftp connection. 373 374 proxy is the ftp_proxy to use if any.""" 375 self.rebase_pass = e_mail or config.Rebase_password 376 self.proxy = ftp_proxy or config.ftp_proxy
377
378 - def build_dict(self):
379 """DB.build_dict() -> None. 380 381 Construct the dictionary and build the files containing the new 382 dictionaries.""" 383 # 384 # first parse the emboss files. 385 # 386 emboss_e, emboss_r, emboss_s = self.lastrebasefile() 387 # 388 # the results will be stored into enzymedict. 389 # 390 self.information_mixer(emboss_r, emboss_e, emboss_s) 391 emboss_r.close() 392 emboss_e.close() 393 emboss_s.close() 394 # 395 # we build all the possible type 396 # 397 tdct = {} 398 for klass in TypeCompiler().buildtype(): 399 exec klass.__name__ +'= klass' 400 exec "tdct['"+klass.__name__+"'] = klass" 401 402 # 403 # Now we build the enzymes from enzymedict 404 # and store them in a dictionary. 405 # The type we will need will also be stored. 406 # 407 408 for name in enzymedict: 409 # 410 # the class attributes first: 411 # 412 cls = newenzyme(name) 413 # 414 # Now select the right type for the enzyme. 415 # 416 bases = cls.bases 417 clsbases = tuple([eval(x) for x in bases]) 418 typestuff = '' 419 for n, t in tdct.iteritems(): 420 # 421 # if the bases are the same. it is the right type. 422 # create the enzyme and remember the type 423 # 424 if t.__bases__ == clsbases: 425 typestuff = t 426 typename = t.__name__ 427 continue 428 # 429 # now we build the dictionaries. 430 # 431 dct = dict(cls.__dict__) 432 del dct['bases'] 433 del dct['__bases__'] 434 del dct['__name__']# no need to keep that, it's already in the type. 435 classdict[name] = dct 436 437 commonattr = ['fst5', 'fst3', 'scd5', 'scd3', 'substrat', 438 'ovhg', 'ovhgseq','results', 'dna'] 439 if typename in typedict: 440 typedict[typename][1].append(name) 441 else: 442 enzlst= [] 443 tydct = dict(typestuff.__dict__) 444 tydct = dict([(k,v) for k,v in tydct.iteritems() if k in commonattr]) 445 enzlst.append(name) 446 typedict[typename] = (bases, enzlst) 447 for letter in cls.__dict__['suppl']: 448 supplier = suppliersdict[letter] 449 suppliersdict[letter][1].append(name) 450 if not classdict or not suppliersdict or not typedict: 451 print 'One of the new dictionaries is empty.' 452 print 'Check the integrity of the emboss file before continuing.' 453 print 'Update aborted.' 454 sys.exit() 455 # 456 # How many enzymes this time? 457 # 458 print '\nThe new database contains %i enzymes.\n' % len(classdict) 459 # 460 # the dictionaries are done. Build the file 461 # 462 #update = config.updatefolder 463 464 update = os.getcwd() 465 results = open(os.path.join(update, 'Restriction_Dictionary.py'), 'w') 466 print 'Writing the dictionary containing the new Restriction classes.\t', 467 results.write(start) 468 results.write('rest_dict = {}\n') 469 for name in sorted(classdict) : 470 results.write("def _temp():\n") 471 results.write(" return {\n") 472 for key, value in classdict[name].iteritems() : 473 results.write(" %s : %s,\n" % (repr(key), repr(value))) 474 results.write(" }\n") 475 results.write("rest_dict[%s] = _temp()\n" % repr(name)) 476 results.write("\n") 477 print 'OK.\n' 478 print 'Writing the dictionary containing the suppliers datas.\t\t', 479 results.write('suppliers = {}\n') 480 for name in sorted(suppliersdict) : 481 results.write("def _temp():\n") 482 results.write(" return (\n") 483 for value in suppliersdict[name] : 484 results.write(" %s,\n" % repr(value)) 485 results.write(" )\n") 486 results.write("suppliers[%s] = _temp()\n" % repr(name)) 487 results.write("\n") 488 print 'OK.\n' 489 print 'Writing the dictionary containing the Restriction types.\t', 490 results.write('typedict = {}\n') 491 for name in sorted(typedict) : 492 results.write("def _temp():\n") 493 results.write(" return (\n") 494 for value in typedict[name] : 495 results.write(" %s,\n" % repr(value)) 496 results.write(" )\n") 497 results.write("typedict[%s] = _temp()\n" % repr(name)) 498 results.write("\n") 499 #I had wanted to do "del _temp" at each stage (just for clarity), but 500 #that pushed the code size just over the Jython JVM limit. We include 501 #one the final "del _temp" to clean up the namespace. 502 results.write("del _temp\n") 503 results.write("\n") 504 print 'OK.\n' 505 results.close() 506 return
507
508 - def install_dict(self):
509 """DB.install_dict() -> None. 510 511 Install the newly created dictionary in the site-packages folder. 512 513 May need super user privilege on some architectures.""" 514 print '\n ' +'*'*78 + ' \n' 515 print '\n\t\tInstalling Restriction_Dictionary.py' 516 try: 517 import Bio.Restriction.Restriction_Dictionary as rd 518 except ImportError: 519 print '\ 520 \n Unable to locate the previous Restriction_Dictionary.py module\ 521 \n Aborting installation.' 522 sys.exit() 523 # 524 # first save the old file in Updates 525 # 526 old = os.path.join(os.path.split(rd.__file__)[0], 527 'Restriction_Dictionary.py') 528 #update_folder = config.updatefolder 529 update_folder = os.getcwd() 530 shutil.copyfile(old, os.path.join(update_folder, 531 'Restriction_Dictionary.old')) 532 # 533 # Now test and install. 534 # 535 new = os.path.join(update_folder, 'Restriction_Dictionary.py') 536 try: 537 execfile(new) 538 print '\ 539 \n\tThe new file seems ok. Proceeding with the installation.' 540 except SyntaxError: 541 print '\ 542 \n The new dictionary file is corrupted. Aborting the installation.' 543 return 544 try: 545 shutil.copyfile(new, old) 546 print'\n\t Everything ok. If you need it a version of the old\ 547 \n\t dictionary have been saved in the Updates folder under\ 548 \n\t the name Restriction_Dictionary.old.' 549 print '\n ' +'*'*78 + ' \n' 550 except IOError: 551 print '\n ' +'*'*78 + ' \n' 552 print '\ 553 \n\t WARNING : Impossible to install the new dictionary.\ 554 \n\t Are you sure you have write permission to the folder :\n\ 555 \n\t %s ?\n\n' % os.path.split(old)[0] 556 return self.no_install() 557 return
558
559 - def no_install(self):
560 """BD.no_install() -> None. 561 562 build the new dictionary but do not install the dictionary.""" 563 print '\n ' +'*'*78 + '\n' 564 #update = config.updatefolder 565 try: 566 import Bio.Restriction.Restriction_Dictionary as rd 567 except ImportError: 568 print '\ 569 \n Unable to locate the previous Restriction_Dictionary.py module\ 570 \n Aborting installation.' 571 sys.exit() 572 # 573 # first save the old file in Updates 574 # 575 old = os.path.join(os.path.split(rd.__file__)[0], 576 'Restriction_Dictionary.py') 577 update = os.getcwd() 578 shutil.copyfile(old, os.path.join(update, 'Restriction_Dictionary.old')) 579 places = update, os.path.split(Bio.Restriction.Restriction.__file__)[0] 580 print "\t\tCompilation of the new dictionary : OK.\ 581 \n\t\tInstallation : No.\n\ 582 \n You will find the newly created 'Restriction_Dictionary.py' file\ 583 \n in the folder : \n\ 584 \n\t%s\n\ 585 \n Make a copy of 'Restriction_Dictionary.py' and place it with \ 586 \n the other Restriction libraries.\n\ 587 \n note : \ 588 \n This folder should be :\n\ 589 \n\t%s\n" % places 590 print '\n ' +'*'*78 + '\n' 591 return
592 593
594 - def lastrebasefile(self):
595 """BD.lastrebasefile() -> None. 596 597 Check the emboss files are up to date and download them if they are not. 598 """ 599 embossnames = ('emboss_e', 'emboss_r', 'emboss_s') 600 # 601 # first check if we have the last update: 602 # 603 emboss_now = ['.'.join((x,LocalTime())) for x in embossnames] 604 update_needed = False 605 #dircontent = os.listdir(config.Rebase) # local database content 606 dircontent = os.listdir(os.getcwd()) 607 base = os.getcwd() # added for biopython current directory 608 for name in emboss_now: 609 if name in dircontent: 610 pass 611 else: 612 update_needed = True 613 614 if not update_needed: 615 # 616 # nothing to be done 617 # 618 print '\n Using the files : %s'% ', '.join(emboss_now) 619 return tuple([open(os.path.join(base, n)) for n in emboss_now]) 620 else: 621 # 622 # may be download the files. 623 # 624 print '\n The rebase files are more than one month old.\ 625 \n Would you like to update them before proceeding?(y/n)' 626 r = raw_input(' update [n] >>> ') 627 if r in ['y', 'yes', 'Y', 'Yes']: 628 updt = RebaseUpdate(self.rebase_pass, self.proxy) 629 updt.openRebase() 630 updt.getfiles() 631 updt.close() 632 print '\n Update complete. Creating the dictionaries.\n' 633 print '\n Using the files : %s'% ', '.join(emboss_now) 634 return tuple([open(os.path.join(base, n)) for n in emboss_now]) 635 else: 636 # 637 # we will use the last files found without updating. 638 # But first we check we have some file to use. 639 # 640 class NotFoundError(Exception): 641 pass
642 for name in embossnames: 643 try: 644 for file in dircontent: 645 if file.startswith(name): 646 break 647 else: 648 pass 649 raise NotFoundError 650 except NotFoundError: 651 print "\nNo %s file found. Upgrade is impossible.\n"%name 652 sys.exit() 653 continue 654 pass 655 # 656 # now find the last file. 657 # 658 last = [0] 659 for file in dircontent: 660 fs = file.split('.') 661 try: 662 if fs[0] in embossnames and int(fs[1]) > int(last[-1]): 663 if last[0] : last.append(fs[1]) 664 else : last[0] = fs[1] 665 else: 666 continue 667 except ValueError: 668 continue 669 last.sort() 670 last = last[::-1] 671 if int(last[-1]) < 100 : last[0], last[-1] = last[-1], last[0] 672 for number in last: 673 files = [(name, name+'.%s'%number) for name in embossnames] 674 strmess = '\nLast EMBOSS files found are :\n' 675 try: 676 for name,file in files: 677 if os.path.isfile(os.path.join(base, file)): 678 strmess += '\t%s.\n'%file 679 else: 680 raise ValueError 681 print strmess 682 emboss_e = open(os.path.join(base, 'emboss_e.%s'%number),'r') 683 emboss_r = open(os.path.join(base, 'emboss_r.%s'%number),'r') 684 emboss_s = open(os.path.join(base, 'emboss_s.%s'%number),'r') 685 return emboss_e, emboss_r, emboss_s 686 except ValueError: 687 continue
688
689 - def parseline(self, line):
690 line = [line[0]]+[line[1].upper()]+[int(i) for i in line[2:9]]+line[9:] 691 name = line[0].replace("-","_") 692 site = line[1] # sequence of the recognition site 693 dna = DNA(site) 694 size = line[2] # size of the recognition site 695 # 696 # Calculate the overhang. 697 # 698 fst5 = line[5] # first site sense strand 699 fst3 = line[6] # first site antisense strand 700 scd5 = line[7] # second site sense strand 701 scd3 = line[8] # second site antisense strand 702 703 # 704 # the overhang is the difference between the two cut 705 # 706 ovhg1 = fst5 - fst3 707 ovhg2 = scd5 - scd3 708 709 # 710 # 0 has the meaning 'do not cut' in rebase. So we get short of 1 711 # for the negative numbers so we add 1 to negative sites for now. 712 # We will deal with the record later. 713 # 714 715 if fst5 < 0 : fst5 += 1 716 if fst3 < 0 : fst3 += 1 717 if scd5 < 0 : scd5 += 1 718 if scd3 < 0 : scd3 += 1 719 720 if ovhg2 != 0 and ovhg1 != ovhg2: 721 # 722 # different length of the overhang of the first and second cut 723 # it's a pain to deal with and at the moment it concerns only 724 # one enzyme which is not commercially available (HaeIV). 725 # So we don't deal with it but we check the progression 726 # of the affair. 727 # Should HaeIV become commercially available or other similar 728 # new enzymes be added, this might be modified. 729 # 730 print '\ 731 \nWARNING : %s cut twice with different overhang length each time.\ 732 \n\tUnable to deal with this behaviour. \ 733 \n\tThis enzyme will not be included in the database. Sorry.' %name 734 print '\tChecking :', 735 raise OverhangError 736 if 0 <= fst5 <= size and 0 <= fst3 <= size: 737 # 738 # cut inside recognition site 739 # 740 if fst5 < fst3: 741 # 742 # 5' overhang 743 # 744 ovhg1 = ovhgseq = site[fst5:fst3] 745 elif fst5 > fst3: 746 # 747 # 3' overhang 748 # 749 ovhg1 = ovhgseq = site[fst3:fst5] 750 else: 751 # 752 # blunt 753 # 754 ovhg1 = ovhgseq = '' 755 for base in 'NRYWMSKHDBV': 756 if base in ovhg1: 757 # 758 # site and overhang degenerated 759 # 760 ovhgseq = ovhg1 761 if fst5 < fst3 : ovhg1 = - len(ovhg1) 762 else : ovhg1 = len(ovhg1) 763 break 764 else: 765 continue 766 elif 0 <= fst5 <= size: 767 # 768 # 5' cut inside the site 3' outside 769 # 770 if fst5 < fst3: 771 # 772 # 3' cut after the site 773 # 774 ovhgseq = site[fst5:] + (fst3 - size) * 'N' 775 elif fst5 > fst3: 776 # 777 # 3' cut before the site 778 # 779 ovhgseq = abs(fst3) * 'N' + site[:fst5] 780 else: 781 # 782 # blunt outside 783 # 784 ovhg1 = ovhgseq = '' 785 elif 0 <= fst3 <= size: 786 # 787 # 3' cut inside the site, 5' outside 788 # 789 if fst5 < fst3: 790 # 791 # 5' cut before the site 792 # 793 ovhgseq = abs(fst5) * 'N' + site[:fst3] 794 elif fst5 > fst3: 795 # 796 # 5' cut after the site 797 # 798 ovhgseq = site[fst3:] + (fst5 - size) * 'N' 799 else: 800 # 801 # should not happend 802 # 803 raise ValueError('Error in #1') 804 elif fst3 < 0 and size < fst5: 805 # 806 # 3' overhang. site is included. 807 # 808 ovhgseq = abs(fst3)*'N' + site + (fst5-size)*'N' 809 elif fst5 < 0 and size <fst3: 810 # 811 # 5' overhang. site is included. 812 # 813 ovhgseq = abs(fst5)*'N' + site + (fst3-size)*'N' 814 else: 815 # 816 # 5' and 3' outside of the site 817 # 818 ovhgseq = 'N' * abs(ovhg1) 819 # 820 # Now line[5] to [8] are the location of the cut but we have to 821 # deal with the weird mathematics of biologists. 822 # 823 # EMBOSS sequence numbering give: 824 # DNA = 'a c g t A C G T' 825 # -1 1 2 3 4 826 # 827 # Biologists do not know about 0. Too much use of latin certainly. 828 # 829 # To compensate, we add 1 to the positions if they are negative. 830 # No need to modify 0 as it means no cut and will not been used. 831 # Positive numbers should be ok since our sequence starts 1. 832 # 833 # Moreover line[6] and line[8] represent cut on the reverse strand. 834 # They will be used for non palindromic sites and sre.finditer 835 # will detect the site in inverse orientation so we need to add the 836 # length of the site to compensate (+1 if they are negative). 837 # 838 for x in (5, 7): 839 if line[x] < 0 : line[x] += 1 840 for x in (6, 8): 841 if line[x] > 0 : line[x] -= size 842 elif line[x] < 0 : line[x] = line[x] - size + 1 843 # 844 # now is the site palindromic? 845 # produce the regular expression which correspond to the site. 846 # tag of the regex will be the name of the enzyme for palindromic 847 # enzymesband two tags for the other, the name for the sense sequence 848 # and the name with '_as' at the end for the antisense sequence. 849 # 850 rg = '' 851 if is_palindrom(dna): 852 line.append(True) 853 rg = ''.join(['(?P<', name, '>', regex(site.upper()), ')']) 854 else: 855 line.append(False) 856 sense = ''.join(['(?P<', name, '>', regex(site.upper()), ')']) 857 antisense = ''.join(['(?P<', name, '_as>', 858 regex(Antiparallel(dna)), ')']) 859 rg = sense + '|' + antisense 860 # 861 # exact frequency of the site. (ie freq(N) == 1, ...) 862 # 863 f = [4/len(dna_alphabet[l]) for l in site.upper()] 864 freq = reduce(lambda x, y : x*y, f) 865 line.append(freq) 866 # 867 # append regex and ovhg1, they have not been appended before not to 868 # break the factory class. simply to leazy to make the changes there. 869 # 870 line.append(rg) 871 line.append(ovhg1) 872 line.append(ovhgseq) 873 return line
874
875 - def removestart(self, file):
876 # 877 # remove the heading of the file. 878 # 879 return [l for l in itertools.dropwhile(lambda l:l.startswith('#'),file)]
880
881 - def getblock(self, file, index):
882 # 883 # emboss_r.txt, separation between blocks is // 884 # 885 take = itertools.takewhile 886 block = [l for l in take(lambda l :not l.startswith('//'),file[index:])] 887 index += len(block)+1 888 return block, index
889
890 - def get(self, block):
891 # 892 # take what we want from the block. 893 # Each block correspond to one enzyme. 894 # block[0] => enzyme name 895 # block[3] => methylation (position and type) 896 # block[5] => suppliers (as a string of single letter) 897 # 898 bl3 = block[3].strip() 899 if not bl3 : bl3 = False # site is not methylable 900 return (block[0].strip(), bl3, block[5].strip())
901
902 - def information_mixer(self, file1, file2, file3):
903 # 904 # Mix all the information from the 3 files and produce a coherent 905 # restriction record. 906 # 907 methfile = self.removestart(file1) 908 sitefile = self.removestart(file2) 909 supplier = self.removestart(file3) 910 911 i1, i2= 0, 0 912 try: 913 while True: 914 block, i1 = self.getblock(methfile, i1) 915 bl = self.get(block) 916 line = (sitefile[i2].strip()).split() 917 name = line[0] 918 if name == bl[0]: 919 line.append(bl[1]) # -> methylation 920 line.append(bl[2]) # -> suppliers 921 else: 922 bl = self.get(oldblock) 923 if line[0] == bl[0]: 924 line.append(bl[1]) 925 line.append(bl[2]) 926 i2 += 1 927 else: 928 raise TypeError 929 oldblock = block 930 i2 += 1 931 try: 932 line = self.parseline(line) 933 except OverhangError : # overhang error 934 n = name # do not include the enzyme 935 if not bl[2]: 936 print 'Anyway, %s is not commercially available.\n' %n 937 else: 938 print 'Unfortunately, %s is commercially available.\n'%n 939 940 continue 941 #Hyphens can't be used as a Python name, nor as a 942 #group name in a regular expression. 943 name = name.replace("-","_") 944 if name in enzymedict: 945 # 946 # deal with TaqII and its two sites. 947 # 948 print '\nWARNING :', 949 print name, 'has two different sites.\n' 950 other = line[0].replace("-","_") 951 dna = DNA(line[1]) 952 sense1 = regex(dna.tostring()) 953 antisense1 = regex(Antiparallel(dna)) 954 dna = DNA(enzymedict[other][0]) 955 sense2 = regex(dna.tostring()) 956 antisense2 = regex(Antiparallel(dna)) 957 sense = '(?P<'+other+'>'+sense1+'|'+sense2+')' 958 antisense = '(?P<'+other+'_as>'+antisense1+'|'+antisense2 + ')' 959 reg = sense + '|' + antisense 960 line[1] = line[1] + '|' + enzymedict[other][0] 961 line[-1] = reg 962 # 963 # the data to produce the enzyme class are then stored in 964 # enzymedict. 965 # 966 enzymedict[name] = line[1:] #element zero was the name 967 except IndexError: 968 pass 969 for i in supplier: 970 # 971 # construction of the list of suppliers. 972 # 973 t = i.strip().split(' ', 1) 974 suppliersdict[t[0]] = (t[1], []) 975 return
976