| Trees | Indices | Help |
|---|
|
|
1 #!/usr/bin/env python
2 #
3 # Restriction Analysis Libraries.
4 # Copyright (C) 2004. Frederic Sohm.
5 #
6 # This code is part of the Biopython distribution and governed by its
7 # license. Please see the LICENSE file that should have been included
8 # as part of this package.
9 #
10 # this script is used to produce the dictionary which will contains the data
11 # about the restriction enzymes from the Emboss/Rebase data files
12 # namely
13 # emboss_e.### (description of the sites),
14 # emboss_r.### (origin, methylation, references)
15 # emboss_s.### (suppliers)
16 # where ### is a number of three digits : 1 for the year two for the month
17 #
18 # very dirty implementation but it does the job, so...
19 # Not very quick either but you are not supposed to use it frequently.
20 #
21 # The results are stored in
22 # path/to/site-packages/Bio/Restriction/Restriction_Dictionary.py
23 # the file contains two dictionary :
24 # 'rest_dict' which contains the data for the enzymes
25 # and
26 # 'suppliers' which map the name of the suppliers to their abbreviation.
27 #
28
29 """Convert a serie of Rebase files into a Restriction_Dictionary.py module.
30
31 The Rebase files are in the emboss format :
32
33 emboss_e.### -> contains informations about the restriction sites.
34 emboss_r.### -> contains general informations about the enzymes.
35 emboss_s.### -> contains informations about the suppliers.
36
37 ### is a 3 digit number. The first digit is the year and the two last the month.
38 """
39
40 import sre
41 import os
42 import itertools
43 import pprint
44 import time
45 import sys
46 import site
47 import shutil
48
49 from Bio.Seq import Seq
50 from Bio.Alphabet import IUPAC
51
52 import Bio.Restriction.Restriction
53 from Bio.Restriction.Restriction import AbstractCut, RestrictionType, NoCut, OneCut,\
54 TwoCuts, Meth_Dep, Meth_Undep, Palindromic, NonPalindromic, Unknown, Blunt,\
55 Ov5, Ov3, NotDefined, Defined, Ambiguous, Commercially_available, Not_available
56
57 import Bio.Restriction.RanaConfig as config
58 from Bio.Restriction._Update.Update import RebaseUpdate
59 from Bio.Restriction.Restriction import *
60 from Bio.Restriction.DNAUtils import antiparallel
61
62 DNA=Seq
63 dna_alphabet = {'A':'A', 'C':'C', 'G':'G', 'T':'T',
64 'R':'AG', 'Y':'CT', 'W':'AT', 'S':'CG', 'M':'AC', 'K':'GT',
65 'H':'ACT', 'B':'CGT', 'V':'ACG', 'D':'AGT',
66 'N':'ACGT',
67 'a': 'a', 'c': 'c', 'g': 'g', 't': 't',
68 'r':'ag', 'y':'ct', 'w':'at', 's':'cg', 'm':'ac', 'k':'gt',
69 'h':'act', 'b':'cgt', 'v':'acg', 'd':'agt',
70 'n':'acgt'}
71
72
73 complement_alphabet = {'A':'T', 'T':'A', 'C':'G', 'G':'C','R':'Y', 'Y':'R',
74 'W':'W', 'S':'S', 'M':'K', 'K':'M', 'H':'D', 'D':'H',
75 'B':'V', 'V':'B', 'N':'N','a':'t', 'c':'g', 'g':'c',
76 't':'a', 'r':'y', 'y':'r', 'w':'w', 's':'s','m':'k',
77 'k':'m', 'h':'d', 'd':'h', 'b':'v', 'v':'b', 'n':'n'}
78 enzymedict = {}
79 suppliersdict = {}
80 classdict = {}
81 typedict = {}
82
83
87
89 """BaseExpand(base) -> string.
90
91 given a degenerated base, returns its meaning in IUPAC alphabet.
92
93 i.e:
94 b= 'A' -> 'A'
95 b= 'N' -> 'ACGT'
96 etc..."""
97 base = base.upper()
98 return dna_alphabet[base]
99
101 """regex(site) -> string.
102
103 Construct a regular expression from a DNA sequence.
104 i.e. :
105 site = 'ABCGN' -> 'A[CGT]CG.'"""
106 reg_ex = site
107 for base in reg_ex :
108 if base in ('A', 'T', 'C', 'G', 'a', 'c', 'g', 't') :
109 pass
110 if base in ('N', 'n') :
111 reg_ex = '.'.join(reg_ex.split('N'))
112 reg_ex = '.'.join(reg_ex.split('n'))
113 if base in ('R', 'Y', 'W', 'M', 'S', 'K', 'H', 'D', 'B', 'V') :
114 expand = '['+ str(BaseExpand(base))+']'
115 reg_ex = expand.join(reg_ex.split(base))
116 return reg_ex
117
119 """Antiparallel(sequence) -> string.
120
121 returns a string which represents the reverse complementary strand of
122 a DNA sequence."""
123 return antiparallel(sequence.tostring())
124
126 """is_palindrom(sequence) -> bool.
127
128 True is the sequence is a palindrom.
129 sequence is a DNA object."""
130 return sequence == DNA(Antiparallel(sequence))
131
133 """LocalTime() -> string.
134
135 LocalTime calculate the extension for emboss file for the current year and
136 month."""
137 t = time.gmtime()
138 year = str(t.tm_year)[-1]
139 month = str(t.tm_mon)
140 if len(month) == 1 : month = '0'+month
141 return year+month
142
143
145 """construct the attributes of the enzyme corresponding to 'name'."""
147 cls.opt_temp = 37
148 cls.inact_temp = 65
149 cls.substrat = 'DNA'
150 target = enzymedict[name]
151 cls.site = target[0]
152 cls.size = target[1]
153 cls.suppl = tuple(target[9])
154 cls.freq = target[11]
155 cls.ovhg = target[13]
156 cls.ovhgseq = target[14]
157 cls.bases = ()
158 #
159 # Is the site palindromic?
160 # Important for the way the DNA is search for the site.
161 # Palindromic sites needs to be looked for only over 1 strand.
162 # Non Palindromic needs to be search for on the reverse complement
163 # as well.
164 #
165 if target[10] : cls.bases += ('Palindromic',)
166 else : cls.bases += ('NonPalindromic',)
167 #
168 # Number of cut the enzyme produce.
169 # 0 => unknown, the enzyme has not been fully characterised.
170 # 2 => 1 cut, (because one cut is realised by cutting 2 strands
171 # 4 => 2 cuts, same logic.
172 # A little bit confusing but it is the way EMBOSS/Rebase works.
173 #
174 if not target[2] :
175 #
176 # => undefined enzymes, nothing to be done.
177 #
178 cls.bases += ('NoCut','Unknown', 'NotDefined')
179 cls.fst5 = None
180 cls.fst3 = None
181 cls.scd5 = None
182 cls.scd3 = None
183 cls.ovhg = None
184 cls.ovhgseq = None
185 else :
186 #
187 # we will need to calculate the overhang.
188 #
189 if target[2] == 2 :
190 cls.bases += ('OneCut',)
191 cls.fst5 = target[4]
192 cls.fst3 = target[5]
193 cls.scd5 = None
194 cls.scd3 = None
195 else :
196 cls.bases += ('TwoCuts',)
197 cls.fst5 = target[4]
198 cls.fst3 = target[5]
199 cls.scd5 = target[6]
200 cls.scd3 = target[7]
201 #
202 # Now, prepare the overhangs which will be added to the DNA
203 # after the cut.
204 # Undefined enzymes will not be allowed to catalyse,
205 # they are not available commercially anyway.
206 # I assumed that if an enzyme cut twice the overhang will be of
207 # the same kind. The only exception is HaeIV. I do not deal
208 # with that at the moment (ie I don't include it,
209 # need to be fixed).
210 # They generally cut outside their recognition site and
211 # therefore the overhang is undetermined and dependent of
212 # the DNA sequence upon which the enzyme act.
213 #
214 if target[3] :
215 #
216 # rebase field for blunt: blunt == 1, other == 0.
217 # The enzyme is blunt. No overhang.
218 #
219 cls.bases += ('Blunt', 'Defined')
220 cls.ovhg = 0
221 elif isinstance(cls.ovhg, int) :
222 #
223 # => overhang is sequence dependent
224 #
225 if cls.ovhg > 0 :
226 #
227 # 3' overhang, ambiguous site (outside recognition site
228 # or site containing ambiguous bases (N, W, R,...)
229 #
230 cls.bases += ('Ov3', 'Ambiguous')
231 elif cls.ovhg < 0 :
232 #
233 # 5' overhang, ambiguous site (outside recognition site
234 # or site containing ambiguous bases (N, W, R,...)
235 #
236 cls.bases += ('Ov5', 'Ambiguous')
237 else :
238 #
239 # cls.ovhg is a string => overhang is constant
240 #
241 if cls.fst5 - (cls.fst3 + cls.size) < 0 :
242 cls.bases += ('Ov5', 'Defined')
243 cls.ovhg = - len(cls.ovhg)
244 else :
245 cls.bases += ('Ov3', 'Defined')
246 cls.ovhg = + len(cls.ovhg)
247 #
248 # Next class : sensibility to methylation.
249 # Set by EmbossMixer from emboss_r.txt file
250 # Not really methylation dependent at the moment, stands rather for
251 # 'is the site methylable?'.
252 # Proper methylation sensibility has yet to be implemented.
253 # But the class is there for further development.
254 #
255 if target[8] :
256 cls.bases += ('Meth_Dep', )
257 cls.compsite = target[12]
258 else :
259 cls.bases += ('Meth_Undep',)
260 cls.compsite = target[12]
261 #
262 # Next class will allow to select enzymes in function of their
263 # suppliers. Not essential but can be useful.
264 #
265 if cls.suppl :
266 cls.bases += ('Commercially_available', )
267 else :
268 cls.bases += ('Not_available', )
269 cls.bases += ('AbstractCut', 'RestrictionType')
270 cls.__name__ = name
271 cls.results = None
272 cls.dna = None
273 cls.__bases__ = cls.bases
274 cls.charac = (cls.fst5, cls.fst3, cls.scd5, cls.scd3, cls.site)
275 if not target[2] and cls.suppl :
276 supp = ', '.join([suppliersdict[s][0] for s in cls.suppl])
277 print 'WARNING : It seems that %s is both commercially available\
278 \n\tand its characteristics are unknown. \
279 \n\tThis seems counter-intuitive.\
280 \n\tThere is certainly an error either in ranacompiler or\
281 \n\tin this REBASE release.\
282 \n\tThe supplier is : %s.' % (name, supp)
283 return
284
285
287
288 """Build the different types possible for Restriction Enzymes"""
289
293
295 """TC.buildtype() -> generator.
296
297 build the new types that will be needed for constructing the
298 restriction enzymes."""
299 baT = (AbstractCut, RestrictionType)
300 cuT = (NoCut, OneCut, TwoCuts)
301 meT = (Meth_Dep, Meth_Undep)
302 paT = (Palindromic, NonPalindromic)
303 ovT = (Unknown, Blunt, Ov5, Ov3)
304 deT = (NotDefined, Defined, Ambiguous)
305 coT = (Commercially_available, Not_available)
306 All = (baT, cuT, meT, paT, ovT, deT, coT)
307 #
308 # Now build the types. Only the most obvious are left out.
309 # Modified even the most obvious are not so obvious.
310 # emboss_*.403 AspCNI is unknown and commercially available.
311 # So now do not remove the most obvious.
312 #
313 types = [(p,c,o,d,m,co,baT[0],baT[1])
314 for p in paT for c in cuT for o in ovT
315 for d in deT for m in meT for co in coT]
316 n= 1
317 for ty in types :
318 dct = {}
319 for t in ty :
320 dct.update(t.__dict__)
321 #
322 # here we need to customize the dictionary.
323 # i.e. types deriving from OneCut have always scd5 and scd3
324 # equal to None. No need therefore to store that in a specific
325 # enzyme of this type. but it then need to be in the type.
326 #
327 dct['results'] = []
328 dct['substrat'] = 'DNA'
329 dct['dna'] = None
330 if t == NoCut :
331 dct.update({'fst5':None,'fst3':None,
332 'scd5':None,'scd3':None,
333 'ovhg':None,'ovhgseq':None})
334 elif t == OneCut :
335 dct.update({'scd5':None, 'scd3':None})
336 class klass(type) :
337 def __new__(cls) :
338 return type.__new__(cls, 'type%i'%n,ty,dct)
339 def __init__(cls) :
340 super(klass, cls).__init__('type%i'%n,ty,dct)
341 yield klass()
342 n+=1
343
344 start = '\n\
345 #!/usr/bin/env python\n\
346 #\n\
347 # Restriction Analysis Libraries.\n\
348 # Copyright (C) 2004. Frederic Sohm.\n\
349 #\n\
350 # This code is part of the Biopython distribution and governed by its\n\
351 # license. Please see the LICENSE file that should have been included\n\
352 # as part of this package.\n\
353 #\n\
354 #\n\
355 rest_dict = \\\n'
356
358
360 """DictionaryBuilder([e_mail[, ftp_proxy]) -> DictionaryBuilder instance.
361
362 If the emboss files used for the construction need to be updated this
363 class will download them if the ftp connection is correctly set.
364 either in RanaConfig.py or given at run time.
365
366 e_mail is the e-mail address used as password for the anonymous
367 ftp connection.
368
369 proxy is the ftp_proxy to use if any."""
370 self.rebase_pass = e_mail or config.Rebase_password
371 self.proxy = ftp_proxy or config.ftp_proxy
372
374 """DB.build_dict() -> None.
375
376 Construct the dictionary and build the files containing the new
377 dictionaries."""
378 #
379 # first parse the emboss files.
380 #
381 emboss_e, emboss_r, emboss_s = self.lastrebasefile()
382 #
383 # the results will be stored into enzymedict.
384 #
385 self.information_mixer(emboss_r, emboss_e, emboss_s)
386 emboss_r.close()
387 emboss_e.close()
388 emboss_s.close()
389 #
390 # we build all the possible type
391 #
392 tdct = {}
393 for klass in TypeCompiler().buildtype() :
394 exec klass.__name__ +'= klass'
395 exec "tdct['"+klass.__name__+"'] = klass"
396
397 #
398 # Now we build the enzymes from enzymedict
399 # and store them in a dictionary.
400 # The type we will need will also be stored.
401 #
402
403 for name in enzymedict :
404 #
405 # the class attributes first:
406 #
407 cls = newenzyme(name)
408 #
409 # Now select the right type for the enzyme.
410 #
411 bases = cls.bases
412 clsbases = tuple([eval(x) for x in bases])
413 typestuff = ''
414 for n, t in tdct.iteritems() :
415 #
416 # if the bases are the same. it is the right type.
417 # create the enzyme and remember the type
418 #
419 if t.__bases__ == clsbases :
420 typestuff = t
421 typename = t.__name__
422 continue
423 #
424 # now we build the dictionaries.
425 #
426 dct = dict(cls.__dict__)
427 del dct['bases']
428 del dct['__bases__']
429 del dct['__name__']# no need to keep that, it's already in the type.
430 classdict[name] = dct
431
432 commonattr = ['fst5', 'fst3', 'scd5', 'scd3', 'substrat',
433 'ovhg', 'ovhgseq','results', 'dna']
434 if typename in typedict :
435 typedict[typename][1].append(name)
436 else :
437 enzlst= []
438 tydct = dict(typestuff.__dict__)
439 tydct = dict([(k,v) for k,v in tydct.iteritems() if k in commonattr])
440 enzlst.append(name)
441 typedict[typename] = (bases, enzlst)
442 for letter in cls.__dict__['suppl'] :
443 supplier = suppliersdict[letter]
444 suppliersdict[letter][1].append(name)
445 if not classdict or not suppliersdict or not typedict :
446 print 'One of the new dictionaries is empty.'
447 print 'Check the integrity of the emboss file before continuing.'
448 print 'Update aborted.'
449 sys.exit()
450 #
451 # How many enzymes this time?
452 #
453 print '\nThe new database contains %i enzymes.\n' % len(classdict)
454 #
455 # the dictionaries are done. Build the file
456 #
457 #update = config.updatefolder
458
459 update = os.getcwd()
460 results = open(os.path.join(update, 'Restriction_Dictionary.py'), 'w')
461 print 'Writing the dictionary containing the new Restriction classes.\t',
462 results.write(start)
463 a = pprint.PrettyPrinter(1, 80, None, results)
464 a.pprint(classdict)
465 print 'OK.\n'
466 print 'Writing the dictionary containing the suppliers datas.\t\t',
467 results.write('suppliers = \\\n')
468 a.pprint(suppliersdict)
469 print 'OK.\n'
470 print 'Writing the dictionary containing the Restriction types.\t',
471 results.write('typedict = \\\n')
472 a.pprint(typedict)
473 print 'OK.\n'
474 results.close()
475 return
476
478 """DB.install_dict() -> None.
479
480 Install the newly created dictionary in the site-packages folder.
481
482 May need super user privilege on some architectures."""
483 print '\n ' +'*'*78 + ' \n'
484 print '\n\t\tInstalling Restriction_Dictionary.py'
485 try :
486 import Bio.Restriction.Restriction_Dictionary as rd
487 except ImportError :
488 print '\
489 \n Unable to locate the previous Restriction_Dictionary.py module\
490 \n Aborting installation.'
491 sys.exit()
492 #
493 # first save the old file in Updates
494 #
495 old = os.path.join(os.path.split(rd.__file__)[0],
496 'Restriction_Dictionary.py')
497 #update_folder = config.updatefolder
498 update_folder = os.getcwd()
499 shutil.copyfile(old, os.path.join(update_folder,
500 'Restriction_Dictionary.old'))
501 #
502 # Now test and install.
503 #
504 new = os.path.join(update_folder, 'Restriction_Dictionary.py')
505 try :
506 execfile(new)
507 print '\
508 \n\tThe new file seems ok. Proceeding with the installation.'
509 except SyntaxError :
510 print '\
511 \n The new dictionary file is corrupted. Aborting the installation.'
512 return
513 try :
514 shutil.copyfile(new, old)
515 print'\n\t Everything ok. If you need it a version of the old\
516 \n\t dictionary have been saved in the Updates folder under\
517 \n\t the name Restriction_Dictionary.old.'
518 print '\n ' +'*'*78 + ' \n'
519 except IOError :
520 print '\n ' +'*'*78 + ' \n'
521 print '\
522 \n\t WARNING : Impossible to install the new dictionary.\
523 \n\t Are you sure you have write permission to the folder :\n\
524 \n\t %s ?\n\n' % os.path.split(old)[0]
525 return self.no_install()
526 return
527
529 """BD.no_install() -> None.
530
531 build the new dictionary but do not install the dictionary."""
532 print '\n ' +'*'*78 + '\n'
533 #update = config.updatefolder
534 try :
535 import Bio.Restriction.Restriction_Dictionary as rd
536 except ImportError :
537 print '\
538 \n Unable to locate the previous Restriction_Dictionary.py module\
539 \n Aborting installation.'
540 sys.exit()
541 #
542 # first save the old file in Updates
543 #
544 old = os.path.join(os.path.split(rd.__file__)[0],
545 'Restriction_Dictionary.py')
546 update = os.getcwd()
547 shutil.copyfile(old, os.path.join(update, 'Restriction_Dictionary.old'))
548 places = update, os.path.split(Bio.Restriction.Restriction.__file__)[0]
549 print "\t\tCompilation of the new dictionary : OK.\
550 \n\t\tInstallation : No.\n\
551 \n You will find the newly created 'Restriction_Dictionary.py' file\
552 \n in the folder : \n\
553 \n\t%s\n\
554 \n Make a copy of 'Restriction_Dictionary.py' and place it with \
555 \n the other Restriction libraries.\n\
556 \n note : \
557 \n This folder should be :\n\
558 \n\t%s\n" % places
559 print '\n ' +'*'*78 + '\n'
560 return
561
562
564 """BD.lastrebasefile() -> None.
565
566 Check the emboss files are up to date and download them if they are not.
567 """
568 embossnames = ('emboss_e', 'emboss_r', 'emboss_s')
569 #
570 # first check if we have the last update :
571 #
572 emboss_now = ['.'.join((x,LocalTime())) for x in embossnames]
573 update_needed = False
574 #dircontent = os.listdir(config.Rebase) # local database content
575 dircontent = os.listdir(os.getcwd())
576 base = os.getcwd() # added for biopython current directory
577 for name in emboss_now :
578 if name in dircontent :
579 pass
580 else :
581 update_needed = True
582
583 if not update_needed :
584 #
585 # nothing to be done
586 #
587 print '\n Using the files : %s'% ', '.join(emboss_now)
588 return tuple([open(os.path.join(base, n)) for n in emboss_now])
589 else :
590 #
591 # may be download the files.
592 #
593 print '\n The rebase files are more than one month old.\
594 \n Would you like to update them before proceeding?(y/n)'
595 r = raw_input(' update [n] >>> ')
596 if r in ['y', 'yes', 'Y', 'Yes'] :
597 updt = RebaseUpdate(self.rebase_pass, self.proxy)
598 updt.openRebase()
599 updt.getfiles()
600 updt.close()
601 print '\n Update complete. Creating the dictionaries.\n'
602 print '\n Using the files : %s'% ', '.join(emboss_now)
603 return tuple([open(os.path.join(base, n)) for n in emboss_now])
604 else :
605 #
606 # we will use the last files found without updating.
607 # But first we check we have some file to use.
608 #
609 class NotFoundError(Exception) :
610 pass
611 for name in embossnames :
612 try :
613 for file in dircontent :
614 if file.startswith(name) :
615 break
616 else :
617 pass
618 raise NotFoundError
619 except NotFoundError :
620 print "\nNo %s file found. Upgrade is impossible.\n"%name
621 sys.exit()
622 continue
623 pass
624 #
625 # now find the last file.
626 #
627 last = [0]
628 for file in dircontent :
629 fs = file.split('.')
630 try :
631 if fs[0] in embossnames and int(fs[1]) > int(last[-1]) :
632 if last[0] : last.append(fs[1])
633 else : last[0] = fs[1]
634 else :
635 continue
636 except ValueError :
637 continue
638 last.sort()
639 last = last[::-1]
640 if int(last[-1]) < 100 : last[0], last[-1] = last[-1], last[0]
641 for number in last :
642 files = [(name, name+'.%s'%number) for name in embossnames]
643 strmess = '\nLast EMBOSS files found are :\n'
644 try :
645 for name,file in files :
646 if os.path.isfile(os.path.join(base, file)) :
647 strmess += '\t%s.\n'%file
648 else :
649 raise ValueError
650 print strmess
651 emboss_e = open(os.path.join(base, 'emboss_e.%s'%number),'r')
652 emboss_r = open(os.path.join(base, 'emboss_r.%s'%number),'r')
653 emboss_s = open(os.path.join(base, 'emboss_s.%s'%number),'r')
654 return emboss_e, emboss_r, emboss_s
655 except ValueError :
656 continue
657
659 line = [line[0]]+[line[1].upper()]+[int(i) for i in line[2:9]]+line[9:]
660 name = line[0]
661 site = line[1] # sequence of the recognition site
662 dna = DNA(site)
663 size = line[2] # size of the recognition site
664 #
665 # Calculate the overhang.
666 #
667 fst5 = line[5] # first site sense strand
668 fst3 = line[6] # first site antisense strand
669 scd5 = line[7] # second site sense strand
670 scd3 = line[8] # second site antisense strand
671
672 #
673 # the overhang is the difference between the two cut
674 #
675 ovhg1 = fst5 - fst3
676 ovhg2 = scd5 - scd3
677
678 #
679 # 0 has the meaning 'do not cut' in rebase. So we get short of 1
680 # for the negative numbers so we add 1 to negative sites for now.
681 # We will deal with the record later.
682 #
683
684 if fst5 < 0 : fst5 += 1
685 if fst3 < 0 : fst3 += 1
686 if scd5 < 0 : scd5 += 1
687 if scd3 < 0 : scd3 += 1
688
689 if ovhg2 != 0 and ovhg1 != ovhg2 :
690 #
691 # different length of the overhang of the first and second cut
692 # it's a pain to deal with and at the moment it concerns only
693 # one enzyme which is not commercially available (HaeIV).
694 # So we don't deal with it but we check the progression
695 # of the affair.
696 # Should HaeIV become commercially available or other similar
697 # new enzymes be added, this might be modified.
698 #
699 print '\
700 \nWARNING : %s cut twice with different overhang length each time.\
701 \n\tUnable to deal with this behaviour. \
702 \n\tThis enzyme will not be included in the database. Sorry.' %name
703 print '\tChecking :',
704 raise OverhangError
705 if 0 <= fst5 <= size and 0 <= fst3 <= size :
706 #
707 # cut inside recognition site
708 #
709 if fst5 < fst3 :
710 #
711 # 5' overhang
712 #
713 ovhg1 = ovhgseq = site[fst5:fst3]
714 elif fst5 > fst3 :
715 #
716 # 3' overhang
717 #
718 ovhg1 = ovhgseq = site[fst3:fst5]
719 else :
720 #
721 # blunt
722 #
723 ovhg1 = ovhgseq = ''
724 for base in 'NRYWMSKHDBV' :
725 if base in ovhg1 :
726 #
727 # site and overhang degenerated
728 #
729 ovhgseq = ovhg1
730 if fst5 < fst3 : ovhg1 = - len(ovhg1)
731 else : ovhg1 = len(ovhg1)
732 break
733 else :
734 continue
735 elif 0 <= fst5 <= size :
736 #
737 # 5' cut inside the site 3' outside
738 #
739 if fst5 < fst3 :
740 #
741 # 3' cut after the site
742 #
743 ovhgseq = site[fst5:] + (fst3 - size) * 'N'
744 elif fst5 > fst3 :
745 #
746 # 3' cut before the site
747 #
748 ovhgseq = abs(fst3) * 'N' + site[:fst5]
749 else :
750 #
751 # blunt outside
752 #
753 ovhg1 = ovhgseq = ''
754 elif 0 <= fst3 <= size :
755 #
756 # 3' cut inside the site, 5' outside
757 #
758 if fst5 < fst3 :
759 #
760 # 5' cut before the site
761 #
762 ovhgseq = abs(fst5) * 'N' + site[:fst3]
763 elif fst5 > fst3 :
764 #
765 # 5' cut after the site
766 #
767 ovhgseq = site[fst3:] + (fst5 - size) * 'N'
768 else :
769 #
770 # should not happend
771 #
772 raise ValueError('Error in #1')
773 elif fst3 < 0 and size < fst5 :
774 #
775 # 3' overhang. site is included.
776 #
777 ovhgseq = abs(fst3)*'N' + site + (fst5-size)*'N'
778 elif fst5 < 0 and size <fst3 :
779 #
780 # 5' overhang. site is included.
781 #
782 ovhgseq = abs(fst5)*'N' + site + (fst3-size)*'N'
783 else :
784 #
785 # 5' and 3' outside of the site
786 #
787 ovhgseq = 'N' * abs(ovhg1)
788 #
789 # Now line[5] to [8] are the location of the cut but we have to
790 # deal with the weird mathematics of biologists.
791 #
792 # EMBOSS sequence numbering give :
793 # DNA = 'a c g t A C G T'
794 # -1 1 2 3 4
795 #
796 # Biologists do not know about 0. Too much use of latin certainly.
797 #
798 # To compensate, we add 1 to the positions if they are negative.
799 # No need to modify 0 as it means no cut and will not been used.
800 # Positive numbers should be ok since our sequence starts 1.
801 #
802 # Moreover line[6] and line[8] represent cut on the reverse strand.
803 # They will be used for non palindromic sites and sre.finditer
804 # will detect the site in inverse orientation so we need to add the
805 # length of the site to compensate (+1 if they are negative).
806 #
807 for x in (5, 7) :
808 if line[x] < 0 : line[x] += 1
809 for x in (6, 8) :
810 if line[x] > 0 : line[x] -= size
811 elif line[x] < 0 : line[x] = line[x] - size + 1
812 #
813 # now is the site palindromic?
814 # produce the regular expression which correspond to the site.
815 # tag of the regex will be the name of the enzyme for palindromic
816 # enzymesband two tags for the other, the name for the sense sequence
817 # and the name with '_as' at the end for the antisense sequence.
818 #
819 rg = ''
820 if is_palindrom(dna) :
821 line.append(True)
822 rg = ''.join(['(?P<', name, '>', regex(site.upper()), ')'])
823 else :
824 line.append(False)
825 sense = ''.join(['(?P<', name, '>', regex(site.upper()), ')'])
826 antisense = ''.join(['(?P<', name, '_as>',
827 regex(Antiparallel(dna)), ')'])
828 rg = sense + '|' + antisense
829 #
830 # exact frequency of the site. (ie freq(N) == 1, ...)
831 #
832 f = [4/len(dna_alphabet[l]) for l in site.upper()]
833 freq = reduce(lambda x, y : x*y, f)
834 line.append(freq)
835 #
836 # append regex and ovhg1, they have not been appended before not to
837 # break the factory class. simply to leazy to make the changes there.
838 #
839 line.append(rg)
840 line.append(ovhg1)
841 line.append(ovhgseq)
842 return line
843
845 #
846 # remove the heading of the file.
847 #
848 return [l for l in itertools.dropwhile(lambda l:l.startswith('#'),file)]
849
851 #
852 # emboss_r.txt, separation between blocks is //
853 #
854 take = itertools.takewhile
855 block = [l for l in take(lambda l :not l.startswith('//'),file[index:])]
856 index += len(block)+1
857 return block, index
858
860 #
861 # take what we want from the block.
862 # Each block correspond to one enzyme.
863 # block[0] => enzyme name
864 # block[3] => methylation (position and type)
865 # block[5] => suppliers (as a string of single letter)
866 #
867 bl3 = block[3].strip()
868 if not bl3 : bl3 = False # site is not methylable
869 return (block[0].strip(), bl3, block[5].strip())
870
872 #
873 # Mix all the information from the 3 files and produce a coherent
874 # restriction record.
875 #
876 methfile = self.removestart(file1)
877 sitefile = self.removestart(file2)
878 supplier = self.removestart(file3)
879
880 i1, i2= 0, 0
881 try :
882 while True :
883 block, i1 = self.getblock(methfile, i1)
884 bl = self.get(block)
885 line = (sitefile[i2].strip()).split()
886 name = line[0]
887 if name == bl[0] :
888 line.append(bl[1]) # -> methylation
889 line.append(bl[2]) # -> suppliers
890 else :
891 bl = self.get(oldblock)
892 if line[0] == bl[0] :
893 line.append(bl[1])
894 line.append(bl[2])
895 i2 += 1
896 else :
897 raise TypeError
898 oldblock = block
899 i2 += 1
900 try :
901 line = self.parseline(line)
902 except OverhangError : # overhang error
903 n = name # do not include the enzyme
904 if not bl[2] :
905 print 'Anyway, %s is not commercially available.\n' %n
906 else :
907 print 'Unfortunately, %s is commercially available.\n'%n
908
909 continue
910 if name in enzymedict :
911 #
912 # deal with TaqII and its two sites.
913 #
914 print '\nWARNING :',
915 print name, 'has two different sites.\n'
916 dna = DNA(line[1])
917 sense1 = regex(dna.tostring())
918 antisense1 = regex(Antiparallel(dna))
919 dna = DNA(enzymedict[line[0]][0])
920 sense2 = regex(dna.tostring())
921 antisense2 = regex(Antiparallel(dna))
922 sense = '(?P<'+line[0]+'>'+sense1+'|'+sense2+')'
923 antisense = '(?P<'+line[0]+'_as>'+antisense1+'|'+antisense2 + ')'
924 reg = sense + '|' + antisense
925 line[1] = line[1] + '|' + enzymedict[line[0]][0]
926 line[-1] = reg
927 #
928 # the data to produce the enzyme class are then stored in
929 # enzymedict.
930 #
931 enzymedict[name] = line[1:]
932 except IndexError :
933 pass
934 for i in supplier :
935 #
936 # construction of the list of suppliers.
937 #
938 t = i.strip().split(' ', 1)
939 suppliersdict[t[0]] = (t[1], [])
940 return
941
| Trees | Indices | Help |
|---|
| Generated by Epydoc 3.0.1 on Sun May 3 17:15:12 2009 | http://epydoc.sourceforge.net |