| Trees | Indices | Help |
|---|
|
|
1 # Copyright 2000 by Andrew Dalke. All rights reserved.
2 # This code is part of the Biopython distribution and governed by its
3 # license. Please see the LICENSE file that should have been included
4 # as part of this package.
5
6 # The Prosite patterns are defined at http://www.expasy.ch/txt/prosuser.txt
7 #
8 # The PA (PAttern) lines contains the definition of a PROSITE pattern. The
9 # patterns are described using the following conventions:
10 #
11 # - The standard IUPAC one-letter codes for the amino acids are used.
12 # - The symbol `x' is used for a position where any amino acid is accepted.
13 # - Ambiguities are indicated by listing the acceptable amino acids for a
14 # given position, between square parentheses `[ ]'. For example: [ALT]
15 # stands for Ala or Leu or Thr.
16 # - Ambiguities are also indicated by listing between a pair of curly
17 # brackets `{ }' the amino acids that are not accepted at a given
18 # position. For example: {AM} stands for any amino acid except Ala and
19 # Met.
20 # - Each element in a pattern is separated from its neighbor by a `-'.
21 # - Repetition of an element of the pattern can be indicated by following
22 # that element with a numerical value or a numerical range between
23 # parenthesis. Examples: x(3) corresponds to x-x-x, x(2,4) corresponds to
24 # x-x or x-x-x or x-x-x-x.
25 # - When a pattern is restricted to either the N- or C-terminal of a
26 # sequence, that pattern either starts with a `<' symbol or respectively
27 # ends with a `>' symbol.
28 # - A period ends the pattern.
29 #
30 # That boils down to doing these conversions
31 #
32 # [] -> []
33 # {} -> [^ ]
34 # - ->
35 # () -> {}
36 # < -> ^
37 # > -> $
38 # x->X
39 # . ->
40
41 # Note:
42 # [G>] is a valid Prosite pattern, equivalent to "([G]|$)"
43
44 # I assume then that
45 # [>G] is equivalent to "(^|[G])"
46 # It is conceivable that [G>]-G-G is valid, meaning a "G" at the end
47 # of the sequence or followed by two more Gs. I did not implement
48 # this. I haven't gotten an answer to my query on either of these two
49 # non-documented possibilities.
50
51 import string, re
52 from Bio import Seq, Alphabet
53
54
55 # Syntactic conversion to two types of regular expressions
56
57 _prosite_trans = string.maketrans("abcdefghijklmnopqrstuvwxyzX}()<>",
58 "ABCDEFGHIJKLMNOPQRSTUVW.YZ.]{}^$")
59
60 # This does not verify that the pattern is correct - invalid patterns
61 # can be converted!
63 """convert a valid Prosite pattern into an re string"""
64 flg = (pattern[:2] == "[<")
65 s = pattern.replace("{", "[^")
66 s = s.translate(_prosite_trans, "-.")
67 # special case "[<" and ">]", if they exist
68 if flg:
69 i = s.index("]")
70 s = "(?:^|[" + s[2:i] + "])" + s[i+1:]
71 if s[-2:] == "$]":
72 i = s.rindex("[")
73 s = s[:i] + "(?:" + s[i:-2] + "]|$)"
74 elif s[-3:] == "$]$":
75 i = s.rindex("[")
76 s = s[:i] + "(?:" + s[i:-3] + "]|$)$"
77 return s
78
79
80 # This does not verify the pattern is correct - invalid patterns can
81 # be converted!
83 """convert a valid Prosite pattern into an re with groups for each term"""
84 flg = (pattern[:2] == "[<")
85 s = pattern.replace("{", "[^")
86 # Don't delete the "-" characters: use them to place the ()s
87 s = s.translate(_prosite_trans, ".")
88
89 # Get the [< and >] terms correct
90 if flg:
91 i = s.index("]")
92 s = "(?:^|[" + s[2:i] + "])" + s[i+1:]
93 if s[-2:] == "$]":
94 i = s.rindex("[")
95 s = s[:i] + "(?:" + s[i:-2] + "]|$)"
96 if s[-3:] == "$]$":
97 i = s.rindex("[")
98 s = s[:i] + "(?:" + s[i:-3] + "]|$)$"
99
100 # Watch out for unescaped < and > terms
101 if s[:1] == "^":
102 s = "^(" + s[1:]
103 else:
104 s = "(" + s
105 if s[-1:] == "$":
106 s = s[:-1] + ")$"
107 else:
108 s = s + ")"
109
110 return s.replace("-", ")(")
111
112
113
114 # Both the Prosite pattern and match result act like sequences.
117 prosite_alphabet = PrositeAlphabet()
118
120 if not verify_pattern(pattern):
121 raise TypeError("not a legal prosite pattern")
122 return Prosite(pattern = pattern)
123
125 alphabet = prosite_alphabet
126
127 # Don't like having two different types of input - not very pythonic
128 # However, it is faster since I can assume the input has already been
129 # verified (if it's a pattern).
131 assert (pattern is None and data is not None) ^ \
132 (pattern is not None and data is None), \
133 "one and only one of pattern and data can have a value"
134 if pattern is not None:
135 self.pattern = pattern
136 if data is not None:
137 self.data = data
138
149 # Lazy creation of these elements / cache results
150 if name == "re":
151 self.re = re.compile(prosite_to_re(self.pattern))
152 return self.re
153 elif name == "grouped_re":
154 self.grouped_re = re.compile(prosite_to_grouped_re(self.pattern))
155 return self.grouped_re
156 elif name == "data":
157 self.data = find_terms(self.pattern)
158 return self.data
159 elif name == "pattern":
160 self.pattern = str(self)
161 return self.pattern
162 raise AttributeError(name)
163
166
168 if endpos is not None:
169 m = self.grouped_re.search(buffer(seq.data), pos, endpos)
170 else:
171 m = self.grouped_re.search(buffer(seq.data), pos)
172 if m is None:
173 return None
174 return PrositeMatch(self, seq, m)
183
184 # I was thinking about adding sub, subn, findall, etc., but either
185 # you just want the string (in which case, use the ".re") or
186 # you could be changing to a different alphabet (eg, T->U).
187
188
189 # Elements of a Prosite pattern
191 - def __init__(self, letters, ignore, is_begin, is_end, \
192 min_count, max_count, can_begin, can_end):
193 self.letters = letters
194 self.ignore = ignore
195 self.is_begin = is_begin
196 self.is_end = is_end
197 self.min_count = min_count
198 self.max_count = max_count
199 self.can_begin = can_begin
200 self.can_end = can_end
202 return PrositeTerm(self.letters, self.ignore, self.is_begin,
203 self.is_end, self.min_count, self.max_count,
204 self.can_begin, self.can_end)
206 # Convert the term back into Prosite form
207 s = self.base_str()
208
209 if self.min_count == self.max_count:
210 if self.min_count == 1:
211 pass
212 else:
213 s = s + "(%d)" % self.min_count
214 else:
215 s = s + "(%d,%d)" % (self.min_count, self.max_count)
216 if self.is_end:
217 s = s + ">"
218 return s
219
221 # Convert the term back into Prosite form, without the repeat
222 # count fields.
223
224 if self.is_begin:
225 s = "<"
226 else:
227 s = ""
228 if self.ignore:
229 s = s + "{" + self.letters + "}"
230 elif len(self.letters) == 1 and \
231 (not self.can_begin and not self.can_end):
232 s = s + self.letters
233 else:
234 s = s + "["
235 if self.can_begin:
236 s = s + "<"
237 s = s + self.letters
238 if self.can_end:
239 s = s + ">"
240 s = s + "]"
241 return s
242
243 # Results of a Prosite match. Wrapper to the re.MatchObj, but returns
244 # Seq objects instead of strings. And lookee - it implements the Seq
245 # interface too!
248 self.prosite = prosite
249 self.seq = seq
250 self.match = match
251 self.pos = match.pos
252 self.endpos = match.pos
253
254 # for Seq.Seq initialization
255 self.data = match.group(0)
256 self.alphabet = seq.alphabet
257
262 return str(self.data)
268
270 """return a list of numbers mapping to items of the original pattern
271
272 For example, if the Prosite pattern is "[AP](2)-D." matched against
273 "PAD", then the mapping is [1, 1, 2], meaning the first character
274 of the match ("P") is from the first Prosite group ("[AP]"), as
275 is the second letter ("A"). The 3rd letter ("D") is mapped to
276 group 2 of the pattern.
277 """
278
279 vals = []
280 i = 0
281 start = self.start(0)
282 try:
283 while 1:
284 end = self.match.end(i+1)
285 while start < end:
286 vals.append(i)
287 start = start + 1
288 i = i + 1
289 except IndexError:
290 pass
291 return vals
292
294 """returns the specific Prosite pattern used to find this sequence
295
296 >>> p = Prosite.compile("[AP](2,3)-D.")
297 >>> m = p.search(Seq.Seq("PAD"))
298 >>> mapping = m.mapping()
299 >>> mapped = m.mapped_pattern()
300 >>> print str(m[1]), str(p[mapping[1]]), str(mapped[1])
301 P [AP](2,3) [AP]
302 >>> print str(mapped)
303 [AP]-[AP]-D.
304 >>>
305
306 Note that the original term includes the count, while the
307 mapped pattern does the expansion.
308
309 """
310 return pattern_mapping(self.prosite, self.mapping())
311
319 result = []
320 alphabet = self.alphabet
321 for g in self.match.groups(default):
322 result.append( Seq.Seq(g, alphabet) )
323 return tuple(result)
334
336 data = []
337 for i in mapping:
338 x = prosite[i].copy()
339 x.min_count = x.max_count = 1
340 data.append(x)
341 return Prosite(data=data)
342
343 prosite_term_re = re.compile(r"""
344 (?:
345 ([ABCDEFGHIKLMNPQRSTVWXYZx])| # a character OR
346 \[(<?)([ABCDEFGHIKLMNPQRSTVWXYZ]+)(>?)\]| # something in []s OR
347 \{([ABCDEFGHIKLMNPQRSTVWXYZ]+)\} # something in {}s
348 )(?:\((\d+)(,\d+)?\))? # optional count of the form "(i,j)", ",j" optional
349 $
350 """, re.VERBOSE)
351
352 # This does not verify the pattern is correct - invalid patterns can
353 # be converted!
355 if pattern[-1:] != ".":
356 raise TypeError("not a prosite pattern - needs a final '.'")
357 pattern = pattern[:-1]
358 terms = pattern.split("-")
359 result = []
360 i = 0
361 for term in terms:
362 can_begin = can_end = 0
363 # Starts with a "<"?
364 if term[:1] == "<":
365 term = term[1:]
366 is_begin = 1
367 else:
368 is_begin = 0
369
370 # Ends with a ">"?
371 if term[-1:] == ">":
372 term = term[:-1]
373 is_end = 1
374 else:
375 is_end = 0
376
377 match = prosite_term_re.match(term)
378 if match is None:
379 raise TypeError("not a Prosite term (%s)" % repr(term))
380 if match.group(1) is not None:
381 # Single letter
382 ignore = 0
383 letters = match.group(1)
384 elif match.group(3) is not None:
385 # Letters inside of "[]"s
386 ignore = 0
387 letters = match.group(3)
388 if match.group(2):
389 can_begin = 1
390 if i != 0:
391 raise TypeError("[<] only allowed for first term (%s)" \
392 % repr(term))
393
394 if match.group(4):
395 can_end = 1
396 if i != len(terms) - 1:
397 raise TypeError("[>] only allowed for last term (%s)" \
398 % repr(term))
399
400 elif match.group(5) is not None:
401 # Letters inside of "{}"s
402 ignore = 1
403 letters = match.group(5)
404 else:
405 raise TypeError("not a prosite term (%s)" % repr(term))
406
407 if match.group(6) is not None:
408 # there is a minimum number
409 min_count = int(match.group(6))
410 else:
411 # no min, so it's 1
412 min_count = 1
413 if match.group(7) is not None:
414 # there is a maximum number
415 max_count = int(match.group(7)[1:])
416 else:
417 # no max specified, so use the same as the min
418 max_count = min_count
419
420 result.append(PrositeTerm(letters, ignore, is_begin,
421 is_end, min_count, max_count,
422 can_begin, can_end))
423
424 i = i + 1
425 return result
426
427
428
429
430 prosite_re = re.compile(r"""
431 ^<? # starts with an optional "<"
432 (
433 [ABCDEFGHIKLMNPQRSTVWXYZx]| # a character OR
434 (\[<?[ABCDEFGHIKLMNPQRSTVWXYZ]+>?\])| # something in []s OR
435 \{[ABCDEFGHIKLMNPQRSTVWXYZ]+\} # something in {}s
436 )(\(\d+(,\d+)?\))? # optional count of the form "(i,j)" (",j" is optional)
437 (- # new terms seperated by a '-'
438 (
439 [ABCDEFGHIKLMNPQRSTVWXYZx]| # a character OR
440 \[[ABCDEFGHIKLMNPQRSTVWXYZ]+>?\]| # something in []s OR
441 \{[ABCDEFGHIKLMNPQRSTVWXYZ]+\} # something in {}s
442 )(\(\d+(,\d+)?\))? # optional count
443 )* # repeat until done
444 >? # pattern ends with an optional ">"
445 \.$ # description ends with a required "."
446 """, re.VERBOSE)
447
448 # This verifies the pattern is correct.
450 """returns 1 if the Prosite pattern is syntactically correct, else 0"""
451 x = prosite_re.match(pattern)
452 if x is None:
453 return 0
454 # check there's only one [< at the beginning, or >] at the end
455 if pattern.find("[<", 1) != -1:
456 return 0
457 if pattern.find(">]", 0, len(pattern)-2) != -1:
458 return 0
459 return 1
460
462 """verify the patterns from a Prosite file handle"""
463 pattern = ""
464 while 1:
465 line = infile.readline()
466 if not line:
467 break
468 if line[:2] != "PA":
469 continue
470
471 pattern = pattern + line[5:-1]
472 if line[-2] == ".":
473 try:
474 print "*" * 60
475 print pattern
476 p = compile(pattern)
477 print prosite_to_re(pattern)
478 print repr(p.re)
479 print prosite_to_grouped_re(pattern)
480 print repr(p.grouped_re)
481 terms = str(p)
482 if terms != pattern:
483 print "DIFFER", terms, pattern
484 except TypeError, msg:
485 print "PROBLEM", pattern, msg
486 pattern = ""
487
488 # Commented out by jchang 4/13/00.
489 # Specific to Andrew's test environment.
490 #if __name__ == "__main__":
491 # import os
492 # infile = os.popen("bzcat /home/dalke/ftps/prosite/prosite.dat.bz2 | grep ^PA")
493 # _verify_test(infile)
494
| Trees | Indices | Help |
|---|
| Generated by Epydoc 3.0.1 on Sun May 3 17:15:08 2009 | http://epydoc.sourceforge.net |