1
2
3
4
5
6
7 """Module for working with Prosite files from ExPASy (DEPRECATED).
8
9 Most of the functionality in this module has moved to Bio.ExPASy.Prosite;
10 please see
11
12 Bio.ExPASy.Prosite.read To read a Prosite file containing one entry.
13 Bio.ExPASy.Prosite.parse Iterates over entries in a Prosite file.
14 Bio.ExPASy.Prosite.Record Holds Prosite data.
15
16 For
17 scan_sequence_expasy Scan a sequence for occurrences of Prosite patterns.
18 _extract_pattern_hits Extract Prosite patterns from a web page.
19 PatternHit Holds data from a hit against a Prosite pattern.
20 please see the new module Bio.ExPASy.ScanProsite.
21
22 The other functions and classes in Bio.Prosite (including
23 Bio.Prosite.index_file and Bio.Prosite.Dictionary) are considered deprecated,
24 and were not moved to Bio.ExPASy.Prosite. If you use this functionality,
25 please contact the Biopython developers at biopython-dev@biopython.org to
26 avoid permanent removal of this module from Biopython.
27
28
29 This module provides code to work with the prosite dat file from
30 Prosite.
31 http://www.expasy.ch/prosite/
32
33 Tested with:
34 Release 15.0, July 1998
35 Release 16.0, July 1999
36 Release 17.0, Dec 2001
37 Release 19.0, Mar 2006
38
39
40 Functions:
41 parse Iterates over entries in a Prosite file.
42 scan_sequence_expasy Scan a sequence for occurrences of Prosite patterns.
43 index_file Index a Prosite file for a Dictionary.
44 _extract_record Extract Prosite data from a web page.
45 _extract_pattern_hits Extract Prosite patterns from a web page.
46
47
48 Classes:
49 Record Holds Prosite data.
50 PatternHit Holds data from a hit against a Prosite pattern.
51 Dictionary Accesses a Prosite file using a dictionary interface.
52 RecordParser Parses a Prosite record into a Record object.
53
54 _Scanner Scans Prosite-formatted data.
55 _RecordConsumer Consumes Prosite data to a Record object.
56
57 """
58
59 import warnings
# Emit the module-level deprecation notice at import time. Each fragment
# carries exactly one separating space (at its start) so the concatenated
# message contains no doubled spaces.
warnings.warn("Bio.Prosite is deprecated, and will be removed in a"
              " future release of Biopython. Most of the functionality"
              " is now provided by Bio.ExPASy.Prosite. If you want to"
              " continue to use Bio.Prosite, please get in contact"
              " via the mailing lists to avoid its permanent removal from"
              " Biopython.", DeprecationWarning)
66
67 from types import *
68 import re
69 import sgmllib
70 from Bio import File
71 from Bio import Index
72 from Bio.ParserSupport import *
73
74
75
76
90
105
107 """Holds information from a Prosite record.
108
109 Members:
110 name ID of the record. e.g. ADH_ZINC
111 type Type of entry. e.g. PATTERN, MATRIX, or RULE
112 accession e.g. PS00387
113 created Date the entry was created. (MMM-YYYY)
114 data_update Date the 'primary' data was last updated.
115 info_update Date data other than 'primary' data was last updated.
116 pdoc ID of the PROSITE DOCumentation.
117
118 description Free-format description.
119 pattern The PROSITE pattern. See docs.
120 matrix List of strings that describes a matrix entry.
121 rules List of rule definitions (from RU lines). (strings)
122 prorules List of prorules (from PR lines). (strings)
123
124 NUMERICAL RESULTS
125 nr_sp_release SwissProt release.
126 nr_sp_seqs Number of seqs in that release of Swiss-Prot. (int)
127 nr_total Number of hits in Swiss-Prot. tuple of (hits, seqs)
128 nr_positive True positives. tuple of (hits, seqs)
129 nr_unknown Could be positives. tuple of (hits, seqs)
130 nr_false_pos False positives. tuple of (hits, seqs)
131 nr_false_neg False negatives. (int)
132 nr_partial False negatives, because they are fragments. (int)
133
134 COMMENTS
135 cc_taxo_range Taxonomic range. See docs for format
136 cc_max_repeat Maximum number of repetitions in a protein
137 cc_site Interesting site. list of tuples (pattern pos, desc.)
138 cc_skip_flag Can this entry be ignored?
139 cc_matrix_type
140 cc_scaling_db
141 cc_author
142 cc_ft_key
143 cc_ft_desc
144 cc_version version number (introduced in release 19.0)
145
146 DATA BANK REFERENCES - The following are all
147 lists of tuples (swiss-prot accession,
148 swiss-prot name)
149 dr_positive
150 dr_false_neg
151 dr_false_pos
152 dr_potential Potential hits, but fingerprint region not yet available.
153 dr_unknown Could possibly belong
154
155 pdb_structs List of PDB entries.
156
157 """
159 self.name = ''
160 self.type = ''
161 self.accession = ''
162 self.created = ''
163 self.data_update = ''
164 self.info_update = ''
165 self.pdoc = ''
166
167 self.description = ''
168 self.pattern = ''
169 self.matrix = []
170 self.rules = []
171 self.prorules = []
172 self.postprocessing = []
173
174 self.nr_sp_release = ''
175 self.nr_sp_seqs = ''
176 self.nr_total = (None, None)
177 self.nr_positive = (None, None)
178 self.nr_unknown = (None, None)
179 self.nr_false_pos = (None, None)
180 self.nr_false_neg = None
181 self.nr_partial = None
182
183 self.cc_taxo_range = ''
184 self.cc_max_repeat = ''
185 self.cc_site = []
186 self.cc_skip_flag = ''
187
188 self.dr_positive = []
189 self.dr_false_neg = []
190 self.dr_false_pos = []
191 self.dr_potential = []
192 self.dr_unknown = []
193
194 self.pdb_structs = []
195
197 """Holds information from a hit against a Prosite pattern.
198
199 Members:
200 name ID of the record. e.g. ADH_ZINC
201 accession e.g. PS00387
202 pdoc ID of the PROSITE DOCumentation.
203 description Free-format description.
204 matches List of tuples (start, end, sequence) where
205 start and end are indexes of the match, and sequence is
206 the sequence matched.
207
208 """
216 lines = []
217 lines.append("%s %s %s" % (self.accession, self.pdoc, self.name))
218 lines.append(self.description)
219 lines.append('')
220 if len(self.matches) > 1:
221 lines.append("Number of matches: %s" % len(self.matches))
222 for i in range(len(self.matches)):
223 start, end, seq = self.matches[i]
224 range_str = "%d-%d" % (start, end)
225 if len(self.matches) > 1:
226 lines.append("%7d %10s %s" % (i+1, range_str, seq))
227 else:
228 lines.append("%7s %10s %s" % (' ', range_str, seq))
229 return "\n".join(lines)
230
231
233 """Accesses a Prosite file using a dictionary interface.
234
235 """
236 __filename_key = '__filename'
237
def __init__(self, indexname, parser=None):
    """__init__(self, indexname, parser=None)

    Open a Prosite Dictionary.

    indexname names the index backing the dictionary; it should have
    been produced by the index_file function.  The path of the indexed
    data file is read back from the index under the reserved filename
    key and that file is opened for reading.

    parser is an optional Parser object used to convert results into
    another form.  If it is None, the raw contents of the file will be
    returned.

    """
    self._index = Index.Index(indexname)
    # The index remembers which data file it was built from.
    data_path = self._index[Dictionary.__filename_key]
    self._handle = open(data_path)
    self._parser = parser
251
254
262
265
267 """Parses Prosite data into a Record object.
268
269 """
273
def parse(self, handle):
    """parse(self, handle) -> data held by the consumer

    Feed handle through self._scanner, sending the events to
    self._consumer, then return the consumer's data attribute.

    """
    scanner = self._scanner
    scanner.feed(handle, self._consumer)
    result = self._consumer.data
    return result
277
279 """Scans Prosite-formatted data.
280
281 Tested with:
282 Release 15.0, July 1998
283
284 """
def feed(self, handle, consumer):
    """feed(self, handle, consumer)

    Scan Prosite data.  handle is a file-like object containing
    prosite data; it is wrapped in a File.UndoHandle unless it already
    is one.  consumer is a Consumer object that receives events as the
    data is scanned.

    consumer.finished is reset to False on entry and scanning goes on
    until it becomes true or the input runs out.
    NOTE(review): nothing in this method sets consumer.finished back to
    true; presumably a consumer callback does -- confirm.

    Raises ValueError if a non-blank line starts with neither 'ID' nor
    'CC'.

    """
    uhandle = handle
    if not isinstance(uhandle, File.UndoHandle):
        uhandle = File.UndoHandle(uhandle)

    consumer.finished = False
    while not consumer.finished:
        # Peek only: the record/copyright scanners consume the line.
        line = uhandle.peekline()
        if not line:
            # End of input.
            return
        if is_blank_line(line):
            # Discard the blank line and look at the next one.
            uhandle.readline()
            continue

        tag = line[:2]
        if tag == 'ID':
            self._scan_record(uhandle, consumer)
        elif tag == 'CC':
            self._scan_copyrights(uhandle, consumer)
        else:
            raise ValueError("There doesn't appear to be a record")
313
315 consumer.start_copyrights()
316 self._scan_line('CC', uhandle, consumer.copyright, any_number=1)
317 self._scan_terminator(uhandle, consumer)
318 consumer.end_copyrights()
319
332
333 - def _scan_line(self, line_type, uhandle, event_fn,
334 exactly_one=None, one_or_more=None, any_number=None,
335 up_to_one=None):
353
356
359
362
365
368
371
372
373
374
375
376
377
378
379
380
381
382
383
384
388
391
395
398
402
406
410
413
416
417
418
419
420 _scan_fns = [
421 _scan_id,
422 _scan_ac,
423 _scan_dt,
424 _scan_de,
425 _scan_pa,
426 _scan_ma,
427 _scan_pp,
428 _scan_ru,
429 _scan_nr,
430 _scan_cc,
431
432
433
434
435
436 _scan_ma,
437 _scan_nr,
438 _scan_cc,
439
440 _scan_dr,
441 _scan_3d,
442 _scan_pr,
443 _scan_do,
444 _scan_terminator
445 ]
446
448 """Consumer that converts a Prosite record to a Record object.
449
450 Members:
451 data Record with Prosite data.
452
453 """
456
459
462
464 cols = line.split()
465 if len(cols) != 3:
466 raise ValueError("I don't understand identification line\n%s" \
467 % line)
468 self.data.name = self._chomp(cols[1])
469 self.data.type = self._chomp(cols[2])
470
472 cols = line.split()
473 if len(cols) != 2:
474 raise ValueError("I don't understand accession line\n%s" % line)
475 self.data.accession = self._chomp(cols[1])
476
def date(self, line):
    """date(self, line)

    Parse a date line and store its three dates on self.data as
    created, data_update and info_update.

    The line is upper-cased and split on whitespace before it is
    checked, so matching is case-insensitive and the stored dates are
    upper-case.  The expected layout is

        <code> <date> (CREATED); <date> (DATA UPDATE); <date> (INFO UPDATE).

    Raises ValueError if the line does not follow that layout,
    including when it has too few fields.

    """
    cols = line.upper().split()

    # Check the field count first so that a truncated line is reported
    # as a ValueError like any other malformed line, rather than
    # escaping as an IndexError from the comparisons below.
    # Only the first four characters of the eighth field are compared,
    # so both '(INFO' and '(INF' are accepted.
    if len(cols) < 9 or \
       cols[2] != '(CREATED);' or \
       cols[4] != '(DATA' or cols[5] != 'UPDATE);' or \
       cols[7][:4] != '(INF' or cols[8] != 'UPDATE).':
        raise ValueError("I don't understand date line\n%s" % line)

    self.data.created = cols[1]
    self.data.data_update = cols[3]
    self.data.info_update = cols[6]
490
493
496
499
500 - def postprocessing(self, line):
503
504 - def rule(self, line):
506
508 cols = self._clean(line).split(";")
509 for col in cols:
510 if not col:
511 continue
512 qual, data = [word.lstrip() for word in col.split("=")]
513 if qual == '/RELEASE':
514 release, seqs = data.split(",")
515 self.data.nr_sp_release = release
516 self.data.nr_sp_seqs = int(seqs)
517 elif qual == '/FALSE_NEG':
518 self.data.nr_false_neg = int(data)
519 elif qual == '/PARTIAL':
520 self.data.nr_partial = int(data)
521 elif qual in ['/TOTAL', '/POSITIVE', '/UNKNOWN', '/FALSE_POS']:
522 m = re.match(r'(\d+)\((\d+)\)', data)
523 if not m:
524 raise Exception("Broken data %s in comment line\n%s" \
525 % (repr(data), line))
526 hits = tuple(map(int, m.groups()))
527 if(qual == "/TOTAL"):
528 self.data.nr_total = hits
529 elif(qual == "/POSITIVE"):
530 self.data.nr_positive = hits
531 elif(qual == "/UNKNOWN"):
532 self.data.nr_unknown = hits
533 elif(qual == "/FALSE_POS"):
534 self.data.nr_false_pos = hits
535 else:
536 raise ValueError("Unknown qual %s in comment line\n%s" \
537 % (repr(qual), line))
538
580
599
604
609
612
615
616 - def _chomp(self, word, to_chomp='.,;'):
617
618 if word[-1] in to_chomp:
619 return word[:-1]
620 return word
621
622 - def _clean(self, line, rstrip=1):
623
624 if rstrip:
625 return line[5:].rstrip()
626 return line[5:]
627
629 """scan_sequence_expasy(seq=None, id=None, exclude_frequent=None) ->
630 list of PatternHit's
631
632 Search a sequence for occurrences of Prosite patterns. You can
633 specify either a sequence in seq or a SwissProt/trEMBL ID or accession
634 in id. Only one of those should be given. If exclude_frequent
635 is true, then the patterns with the high probability of occurring
636 will be excluded.
637
638 """
639 from Bio import ExPASy
640 if (seq and id) or not (seq or id):
641 raise ValueError("Please specify either a sequence or an id")
642 handle = ExPASy.scanprosite1(seq, id, exclude_frequent)
643 return _extract_pattern_hits(handle)
644
646 """_extract_pattern_hits(handle) -> list of PatternHit's
647
648 Extract hits from a web page. Raises a ValueError if there
649 was an error in the query.
650
651 """
652 class parser(sgmllib.SGMLParser):
653 def __init__(self):
654 sgmllib.SGMLParser.__init__(self)
655 self.hits = []
656 self.broken_message = 'Some error occurred'
657 self._in_pre = 0
658 self._current_hit = None
659 self._last_found = None
660 def handle_data(self, data):
661 if data.find('try again') >= 0:
662 self.broken_message = data
663 return
664 elif data == 'illegal':
665 self.broken_message = 'Sequence contains illegal characters'
666 return
667 if not self._in_pre:
668 return
669 elif not data.strip():
670 return
671 if self._last_found is None and data[:4] == 'PDOC':
672 self._current_hit.pdoc = data
673 self._last_found = 'pdoc'
674 elif self._last_found == 'pdoc':
675 if data[:2] != 'PS':
676 raise ValueError("Expected accession but got:\n%s" % data)
677 self._current_hit.accession = data
678 self._last_found = 'accession'
679 elif self._last_found == 'accession':
680 self._current_hit.name = data
681 self._last_found = 'name'
682 elif self._last_found == 'name':
683 self._current_hit.description = data
684 self._last_found = 'description'
685 elif self._last_found == 'description':
686 m = re.findall(r'(\d+)-(\d+) (\w+)', data)
687 for start, end, seq in m:
688 self._current_hit.matches.append(
689 (int(start), int(end), seq))
690
691 def do_hr(self, attrs):
692
693 if self._in_pre:
694 self._current_hit = PatternHit()
695 self.hits.append(self._current_hit)
696 self._last_found = None
697 def start_pre(self, attrs):
698 self._in_pre = 1
699 self.broken_message = None
700 def end_pre(self):
701 self._in_pre = 0
702 p = parser()
703 p.feed(handle.read())
704 if p.broken_message:
705 raise ValueError(p.broken_message)
706 return p.hits
707
708
709
710
def index_file(filename, indexname, rec2key=None):
    """index_file(filename, indexname, rec2key=None)

    Index a Prosite file.  filename is the name of the file.
    indexname is the name of the dictionary.  rec2key is an
    optional callback that takes a Record and generates a unique key
    (e.g. the accession number) for the record.  If not specified,
    the id name will be used.

    For every record the index stores a (start, length) tuple giving
    the record's offset and size in the file, as reported by the file
    position after each record is parsed.  The file name itself is
    stored under the Dictionary's reserved filename key.

    Raises ValueError if filename does not exist, and KeyError if a
    record produces an empty key or a key that is already in the index.

    """
    import os
    if not os.path.exists(filename):
        raise ValueError("%s does not exist" % filename)

    index = Index.Index(indexname, truncate=1)
    index[Dictionary._Dictionary__filename_key] = filename

    handle = open(filename)
    # Always release the data file, including when a bad key aborts
    # the indexing part-way through.
    try:
        # Plain integers are enough here: Python promotes to arbitrary
        # precision automatically, so no explicit long is required.
        end = 0
        for record in parse(handle):
            start = end
            end = handle.tell()
            length = end - start

            if rec2key is not None:
                key = rec2key(record)
            else:
                key = record.name

            if not key:
                raise KeyError("empty key was produced")
            elif key in index:
                raise KeyError("duplicate key %s found" % key)

            index[key] = start, length
    finally:
        handle.close()
747