Package Bio :: Package SwissProt :: Module SProt
[hide private]
[frames] | no frames]

Source Code for Module Bio.SwissProt.SProt

   1  # Copyright 1999 by Jeffrey Chang.  All rights reserved. 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license.  Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5   
   6  """ 
   7  This module provides code to work with the sprotXX.dat file from 
   8  Utilities for working with FASTA-formatted sequences (DEPRECATED). 
   9  http://www.expasy.ch/sprot/sprot-top.html 
  10   
  11  Please see Bio.SwissProt for alternatives for the functionality in this module. 
  12   
  13  Tested with: 
  14  Release 37, Release 38, Release 39 
  15   
  16  Limited testing with: 
  17  Release 51, 54 
  18   
  19   
  20  Classes: 
  21  Record             Holds SwissProt data. 
  22  Reference          Holds reference data from a SwissProt entry. 
  23  Iterator           Iterates over entries in a SwissProt file. 
  24  Dictionary         Accesses a SwissProt file using a dictionary interface. 
  25  RecordParser       Parses a SwissProt record into a Record object. 
  26  SequenceParser     Parses a SwissProt record into a SeqRecord object. 
  27   
  28  _Scanner           Scans SwissProt-formatted data. 
  29  _RecordConsumer    Consumes SwissProt data to a SProt.Record object. 
  30  _SequenceConsumer  Consumes SwissProt data to a SeqRecord object. 
  31   
  32   
  33  Functions: 
  34  index_file         Index a SwissProt file for a Dictionary. 
  35   
  36  """ 
  37  import warnings 
  38  warnings.warn("Bio.SwissProt.SProt is deprecated. Please use the functions Bio.SwissProt.parse or Bio.SwissProt.read if you want to get a SwissProt.Record, or Bio.SeqIO.parse or Bio.SeqIO.read if you want to get a SeqRecord. If these solutions do not work for you, please get in contact with the Biopython developers (biopython-dev@biopython.org).", 
  39                DeprecationWarning) 
  40   
  41  from types import * 
  42  import os 
  43  from Bio import File 
  44  from Bio import Index 
  45  from Bio import Alphabet 
  46  from Bio import Seq 
  47  from Bio import SeqRecord 
  48  from Bio.ParserSupport import * 
  49   
  50  # The parse(), read() functions can probably be simplified if we don't 
  51  # use the "parser = RecordParser(); parser.parse(handle)" approach. 
52 -def parse(handle):
53 from SProt import RecordParser 54 import cStringIO 55 parser = RecordParser() 56 text = "" 57 for line in handle: 58 text += line 59 if line[:2]=='//': 60 handle = cStringIO.StringIO(text) 61 record = parser.parse(handle) 62 text = "" 63 yield record
64
65 -def read(handle):
66 from SProt import RecordParser 67 parser = RecordParser() 68 try: 69 record = parser.parse(handle) 70 except ValueError, error: 71 if error.message.startswith("Line does not start with 'ID':"): 72 raise ValueError("No SwissProt record found") 73 else: 74 raise error 75 # We should have reached the end of the record by now 76 remainder = handle.read() 77 if remainder: 78 raise ValueError("More than one SwissProt record found") 79 return record
80 81 82 _CHOMP = " \n\r\t.,;" #whitespace and trailing punctuation 83
84 -class Record:
85 """Holds information from a SwissProt record. 86 87 Members: 88 entry_name Name of this entry, e.g. RL1_ECOLI. 89 data_class Either 'STANDARD' or 'PRELIMINARY'. 90 molecule_type Type of molecule, 'PRT', 91 sequence_length Number of residues. 92 93 accessions List of the accession numbers, e.g. ['P00321'] 94 created A tuple of (date, release). 95 sequence_update A tuple of (date, release). 96 annotation_update A tuple of (date, release). 97 98 description Free-format description. 99 gene_name Gene name. See userman.txt for description. 100 organism The source of the sequence. 101 organelle The origin of the sequence. 102 organism_classification The taxonomy classification. List of strings. 103 (http://www.ncbi.nlm.nih.gov/Taxonomy/) 104 taxonomy_id A list of NCBI taxonomy id's. 105 host_organism A list of NCBI taxonomy id's of the hosts of a virus, 106 if any. 107 references List of Reference objects. 108 comments List of strings. 109 cross_references List of tuples (db, id1[, id2][, id3]). See the docs. 110 keywords List of the keywords. 111 features List of tuples (key name, from, to, description). 112 from and to can be either integers for the residue 113 numbers, '<', '>', or '?' 114 115 seqinfo tuple of (length, molecular weight, CRC32 value) 116 sequence The sequence. 117 118 """
119 - def __init__(self):
120 self.entry_name = None 121 self.data_class = None 122 self.molecule_type = None 123 self.sequence_length = None 124 125 self.accessions = [] 126 self.created = None 127 self.sequence_update = None 128 self.annotation_update = None 129 130 self.description = '' 131 self.gene_name = '' 132 self.organism = '' 133 self.organelle = '' 134 self.organism_classification = [] 135 self.taxonomy_id = [] 136 self.host_organism = [] 137 self.references = [] 138 self.comments = [] 139 self.cross_references = [] 140 self.keywords = [] 141 self.features = [] 142 143 self.seqinfo = None 144 self.sequence = ''
145
146 -class Reference:
147 """Holds information from 1 references in a SwissProt entry. 148 149 Members: 150 number Number of reference in an entry. 151 positions Describes extent of work. list of strings. 152 comments Comments. List of (token, text). 153 references References. List of (dbname, identifier) 154 authors The authors of the work. 155 title Title of the work. 156 location A citation for the work. 157 158 """
159 - def __init__(self):
160 self.number = None 161 self.positions = [] 162 self.comments = [] 163 self.references = [] 164 self.authors = '' 165 self.title = '' 166 self.location = ''
167 168
169 -class Dictionary:
170 """Accesses a SwissProt file using a dictionary interface. 171 172 """ 173 __filename_key = '__filename' 174
175 - def __init__(self, indexname, parser=None):
176 """__init__(self, indexname, parser=None) 177 178 Open a SwissProt Dictionary. indexname is the name of the 179 index for the dictionary. The index should have been created 180 using the index_file function. parser is an optional Parser 181 object to change the results into another form. If set to None, 182 then the raw contents of the file will be returned. 183 184 """ 185 self._index = Index.Index(indexname) 186 self._handle = open(self._index[self.__filename_key]) 187 self._parser = parser
188
189 - def __len__(self):
190 return len(self._index)
191
192 - def __getitem__(self, key):
193 start, len = self._index[key] 194 self._handle.seek(start) 195 data = self._handle.read(len) 196 if self._parser is not None: 197 return self._parser.parse(File.StringHandle(data)) 198 return data
199
200 - def __getattr__(self, name):
201 return getattr(self._index, name)
202
203 - def keys(self):
204 # I only want to expose the keys for SwissProt. 205 k = self._index.keys() 206 k.remove(self.__filename_key) 207 return k
208 209
210 -class RecordParser(AbstractParser):
211 """Parses SwissProt data into a Record object. 212 213 """
214 - def __init__(self):
215 self._scanner = _Scanner() 216 self._consumer = _RecordConsumer()
217
218 - def parse(self, handle):
219 self._scanner.feed(handle, self._consumer) 220 return self._consumer.data
221
222 -class SequenceParser(AbstractParser):
223 """Parses SwissProt data into a standard SeqRecord object. 224 """
225 - def __init__(self, alphabet = Alphabet.generic_protein):
226 """Initialize a SequenceParser. 227 228 Arguments: 229 o alphabet - The alphabet to use for the generated Seq objects. If 230 not supplied this will default to the generic protein alphabet. 231 """ 232 self._scanner = _Scanner() 233 self._consumer = _SequenceConsumer(alphabet)
234
235 - def parse(self, handle):
236 self._scanner.feed(handle, self._consumer) 237 return self._consumer.data
238
239 -class _Scanner:
240 """Scans SwissProt-formatted data. 241 242 Tested with: 243 Release 37 244 Release 38 245 """ 246
247 - def feed(self, handle, consumer):
248 """feed(self, handle, consumer) 249 250 Feed in SwissProt data for scanning. handle is a file-like 251 object that contains swissprot data. consumer is a 252 Consumer object that will receive events as the report is scanned. 253 254 """ 255 if isinstance(handle, File.UndoHandle): 256 uhandle = handle 257 else: 258 uhandle = File.UndoHandle(handle) 259 self._scan_record(uhandle, consumer)
260
261 - def _skip_starstar(self, uhandle):
262 """Ignores any lines starting **""" 263 #See Bug 2353, some files from the EBI have extra lines 264 #starting "**" (two asterisks/stars), usually between the 265 #features and sequence but not all the time. They appear 266 #to be unofficial automated annotations. e.g. 267 #** 268 #** ################# INTERNAL SECTION ################## 269 #**HA SAM; Annotated by PicoHamap 1.88; MF_01138.1; 09-NOV-2003. 270 while "**" == uhandle.peekline()[:2]: 271 skip = uhandle.readline()
272 #print "Skipping line: %s" % skip.rstrip() 273
274 - def _scan_record(self, uhandle, consumer):
275 consumer.start_record() 276 for fn in self._scan_fns: 277 self._skip_starstar(uhandle) 278 fn(self, uhandle, consumer) 279 280 # In Release 38, ID N33_HUMAN has a DR buried within comments. 281 # Check for this and do more comments, if necessary. 282 # XXX handle this better 283 if fn is self._scan_dr.im_func: 284 self._scan_cc(uhandle, consumer) 285 self._scan_dr(uhandle, consumer) 286 consumer.end_record()
287
288 - def _scan_line(self, line_type, uhandle, event_fn, 289 exactly_one=None, one_or_more=None, any_number=None, 290 up_to_one=None):
291 # Callers must set exactly one of exactly_one, one_or_more, or 292 # any_number to a true value. I do not explicitly check to 293 # make sure this function is called correctly. 294 295 # This does not guarantee any parameter safety, but I 296 # like the readability. The other strategy I tried was have 297 # parameters min_lines, max_lines. 298 299 if exactly_one or one_or_more: 300 read_and_call(uhandle, event_fn, start=line_type) 301 if one_or_more or any_number: 302 while 1: 303 if not attempt_read_and_call(uhandle, event_fn, 304 start=line_type): 305 break 306 if up_to_one: 307 attempt_read_and_call(uhandle, event_fn, start=line_type)
308
309 - def _scan_id(self, uhandle, consumer):
310 self._scan_line('ID', uhandle, consumer.identification, exactly_one=1)
311
312 - def _scan_ac(self, uhandle, consumer):
313 # Until release 38, this used to match exactly_one. 314 # However, in release 39, 1A02_HUMAN has 2 AC lines, and the 315 # definition needed to be expanded. 316 self._scan_line('AC', uhandle, consumer.accession, any_number=1)
317
318 - def _scan_dt(self, uhandle, consumer):
319 self._scan_line('DT', uhandle, consumer.date, exactly_one=1) 320 self._scan_line('DT', uhandle, consumer.date, exactly_one=1) 321 # IPI doesn't necessarily contain the third line about annotations 322 self._scan_line('DT', uhandle, consumer.date, up_to_one=1)
323
324 - def _scan_de(self, uhandle, consumer):
325 # IPI can be missing a DE line 326 self._scan_line('DE', uhandle, consumer.description, any_number=1)
327
328 - def _scan_gn(self, uhandle, consumer):
329 self._scan_line('GN', uhandle, consumer.gene_name, any_number=1)
330
331 - def _scan_os(self, uhandle, consumer):
332 self._scan_line('OS', uhandle, consumer.organism_species, 333 one_or_more=1)
334
335 - def _scan_og(self, uhandle, consumer):
336 self._scan_line('OG', uhandle, consumer.organelle, any_number=1)
337
338 - def _scan_oc(self, uhandle, consumer):
339 self._scan_line('OC', uhandle, consumer.organism_classification, 340 one_or_more=1)
341
342 - def _scan_ox(self, uhandle, consumer):
343 self._scan_line('OX', uhandle, consumer.taxonomy_id, 344 any_number=1)
345
346 - def _scan_oh(self, uhandle, consumer):
347 # viral host organism. introduced after SwissProt 39. 348 self._scan_line('OH', uhandle, consumer.organism_host, any_number=1)
349
350 - def _scan_reference(self, uhandle, consumer):
351 while True: 352 if safe_peekline(uhandle)[:2] != 'RN': 353 break 354 self._scan_rn(uhandle, consumer) 355 self._scan_rp(uhandle, consumer) 356 self._scan_rc(uhandle, consumer) 357 self._scan_rx(uhandle, consumer) 358 # ws:2001-12-05 added, for record with RL before RA 359 self._scan_rl(uhandle, consumer) 360 self._scan_ra(uhandle, consumer) 361 #EBI copy of P72010 is missing the RT line, and has one 362 #of their ** lines in its place noting "** /NO TITLE." 363 #See also bug 2353 364 self._skip_starstar(uhandle) 365 self._scan_rt(uhandle, consumer) 366 self._scan_rl(uhandle, consumer)
367
368 - def _scan_rn(self, uhandle, consumer):
369 self._scan_line('RN', uhandle, consumer.reference_number, 370 exactly_one=1)
371
372 - def _scan_rp(self, uhandle, consumer):
373 self._scan_line('RP', uhandle, consumer.reference_position, 374 one_or_more=1)
375
376 - def _scan_rc(self, uhandle, consumer):
377 self._scan_line('RC', uhandle, consumer.reference_comment, 378 any_number=1)
379
380 - def _scan_rx(self, uhandle, consumer):
381 self._scan_line('RX', uhandle, consumer.reference_cross_reference, 382 any_number=1)
383
384 - def _scan_ra(self, uhandle, consumer):
385 # In UniProt release 1.12 of 6/21/04, there is a new RG 386 # (Reference Group) line, which references a group instead of 387 # an author. Each block must have at least 1 RA or RG line. 388 self._scan_line('RA', uhandle, consumer.reference_author, 389 any_number=1) 390 self._scan_line('RG', uhandle, consumer.reference_author, 391 any_number=1) 392 # PRKN_HUMAN has RG lines, then RA lines. The best solution 393 # is to write code that accepts either of the line types. 394 # This is the quick solution... 395 self._scan_line('RA', uhandle, consumer.reference_author, 396 any_number=1)
397
398 - def _scan_rt(self, uhandle, consumer):
399 self._scan_line('RT', uhandle, consumer.reference_title, 400 any_number=1)
401
402 - def _scan_rl(self, uhandle, consumer):
403 # This was one_or_more, but P82909 in TrEMBL 16.0 does not 404 # have one. 405 self._scan_line('RL', uhandle, consumer.reference_location, 406 any_number=1)
407
408 - def _scan_cc(self, uhandle, consumer):
409 self._scan_line('CC', uhandle, consumer.comment, any_number=1)
410
411 - def _scan_dr(self, uhandle, consumer):
412 self._scan_line('DR', uhandle, consumer.database_cross_reference, 413 any_number=1)
414
415 - def _scan_kw(self, uhandle, consumer):
416 self._scan_line('KW', uhandle, consumer.keyword, any_number=1)
417
418 - def _scan_ft(self, uhandle, consumer):
419 self._scan_line('FT', uhandle, consumer.feature_table, any_number=1)
420
421 - def _scan_pe(self, uhandle, consumer):
422 self._scan_line('PE', uhandle, consumer.protein_existence, any_number=1)
423
424 - def _scan_sq(self, uhandle, consumer):
425 self._scan_line('SQ', uhandle, consumer.sequence_header, exactly_one=1)
426
427 - def _scan_sequence_data(self, uhandle, consumer):
428 self._scan_line(' ', uhandle, consumer.sequence_data, one_or_more=1)
429
430 - def _scan_terminator(self, uhandle, consumer):
431 self._scan_line('//', uhandle, consumer.terminator, exactly_one=1)
432 433 _scan_fns = [ 434 _scan_id, 435 _scan_ac, 436 _scan_dt, 437 _scan_de, 438 _scan_gn, 439 _scan_os, 440 _scan_og, 441 _scan_oc, 442 _scan_ox, 443 _scan_oh, 444 _scan_reference, 445 _scan_cc, 446 _scan_dr, 447 _scan_pe, 448 _scan_kw, 449 _scan_ft, 450 _scan_sq, 451 _scan_sequence_data, 452 _scan_terminator 453 ]
454
455 -class _RecordConsumer(AbstractConsumer):
456 """Consumer that converts a SwissProt record to a Record object. 457 458 Members: 459 data Record with SwissProt data. 460 461 """
462 - def __init__(self):
463 self.data = None
464
465 - def __repr__(self):
466 return "Bio.SwissProt.SProt._RecordConsumer()"
467
468 - def start_record(self):
469 self.data = Record() 470 self._sequence_lines = []
471
472 - def end_record(self):
473 self._clean_record(self.data) 474 self.data.sequence = "".join(self._sequence_lines)
475
476 - def identification(self, line):
477 cols = line.split() 478 #Prior to release 51, included with MoleculeType: 479 #ID EntryName DataClass; MoleculeType; SequenceLength. 480 # 481 #Newer files lack the MoleculeType: 482 #ID EntryName DataClass; SequenceLength. 483 # 484 #Note that cols is split on white space, so the length 485 #should become two fields (number and units) 486 if len(cols) == 6: 487 self.data.entry_name = cols[1] 488 self.data.data_class = cols[2].rstrip(_CHOMP) # don't want ';' 489 self.data.molecule_type = cols[3].rstrip(_CHOMP) # don't want ';' 490 self.data.sequence_length = int(cols[4]) 491 elif len(cols) == 5: 492 self.data.entry_name = cols[1] 493 self.data.data_class = cols[2].rstrip(_CHOMP) # don't want ';' 494 self.data.molecule_type = None 495 self.data.sequence_length = int(cols[3]) 496 else: 497 #Should we print a warning an continue? 498 raise ValueError("ID line has unrecognised format:\n"+line) 499 500 # data class can be 'STANDARD' or 'PRELIMINARY' 501 # ws:2001-12-05 added IPI 502 # pjc:2006-11-02 added 'Reviewed' and 'Unreviewed' 503 if self.data.data_class not in ['STANDARD', 'PRELIMINARY', 'IPI', 504 'Reviewed', 'Unreviewed']: 505 raise ValueError("Unrecognized data class %s in line\n%s" % \ 506 (self.data.data_class, line)) 507 # molecule_type should be 'PRT' for PRoTein 508 # Note that has been removed in recent releases (set to None) 509 if self.data.molecule_type is not None \ 510 and self.data.molecule_type != 'PRT': 511 raise ValueError("Unrecognized molecule type %s in line\n%s" % \ 512 (self.data.molecule_type, line))
513
514 - def accession(self, line):
515 cols = line[5:].rstrip(_CHOMP).strip().split(';') 516 for ac in cols: 517 if ac.strip(): 518 #remove any leading or trailing white space 519 self.data.accessions.append(ac.strip())
520
521 - def date(self, line):
522 uprline = line.upper() 523 cols = line.rstrip().split() 524 525 if uprline.find('CREATED') >= 0 \ 526 or uprline.find('LAST SEQUENCE UPDATE') >= 0 \ 527 or uprline.find('LAST ANNOTATION UPDATE') >= 0: 528 # Old style DT line 529 # ================= 530 # e.g. 531 # DT 01-FEB-1995 (Rel. 31, Created) 532 # DT 01-FEB-1995 (Rel. 31, Last sequence update) 533 # DT 01-OCT-2000 (Rel. 40, Last annotation update) 534 # 535 # or: 536 # DT 08-JAN-2002 (IPI Human rel. 2.3, Created) 537 # ... 538 539 # find where the version information will be located 540 # This is needed for when you have cases like IPI where 541 # the release verison is in a different spot: 542 # DT 08-JAN-2002 (IPI Human rel. 2.3, Created) 543 uprcols = uprline.split() 544 rel_index = -1 545 for index in range(len(uprcols)): 546 if uprcols[index].find("REL.") >= 0: 547 rel_index = index 548 assert rel_index >= 0, \ 549 "Could not find Rel. in DT line: %s" % (line) 550 version_index = rel_index + 1 551 # get the version information 552 str_version = cols[version_index].rstrip(_CHOMP) 553 # no version number 554 if str_version == '': 555 version = 0 556 # dot versioned 557 elif str_version.find(".") >= 0: 558 version = str_version 559 # integer versioned 560 else: 561 version = int(str_version) 562 563 if uprline.find('CREATED') >= 0: 564 self.data.created = cols[1], version 565 elif uprline.find('LAST SEQUENCE UPDATE') >= 0: 566 self.data.sequence_update = cols[1], version 567 elif uprline.find( 'LAST ANNOTATION UPDATE') >= 0: 568 self.data.annotation_update = cols[1], version 569 else: 570 assert False, "Shouldn't reach this line!" 571 elif uprline.find('INTEGRATED INTO') >= 0 \ 572 or uprline.find('SEQUENCE VERSION') >= 0 \ 573 or uprline.find('ENTRY VERSION') >= 0: 574 # New style DT line 575 # ================= 576 # As of UniProt Knowledgebase release 7.0 (including 577 # Swiss-Prot release 49.0 and TrEMBL release 32.0) the 578 # format of the DT lines and the version information 579 # in them was changed - the release number was dropped. 580 # 581 # For more information see bug 1948 and 582 # http://ca.expasy.org/sprot/relnotes/sp_news.html#rel7.0 583 # 584 # e.g. 585 # DT 01-JAN-1998, integrated into UniProtKB/Swiss-Prot. 586 # DT 15-OCT-2001, sequence version 3. 587 # DT 01-APR-2004, entry version 14. 588 # 589 #This is a new style DT line... 590 591 # The date should be in string cols[1] 592 # Get the version number if there is one. 593 # For the three DT lines above: 0, 3, 14 594 try: 595 version = int(cols[-1]) 596 except ValueError: 597 version = 0 598 599 # Re-use the historical property names, even though 600 # the meaning has changed slighty: 601 if uprline.find("INTEGRATED") >= 0: 602 self.data.created = cols[1], version 603 elif uprline.find('SEQUENCE VERSION') >= 0: 604 self.data.sequence_update = cols[1], version 605 elif uprline.find( 'ENTRY VERSION') >= 0: 606 self.data.annotation_update = cols[1], version 607 else: 608 assert False, "Shouldn't reach this line!" 609 else: 610 raise ValueError("I don't understand the date line %s" % line)
611
612 - def description(self, line):
613 self.data.description += line[5:].strip() + " "
614
615 - def gene_name(self, line):
616 self.data.gene_name += line[5:]
617
618 - def organism_species(self, line):
619 self.data.organism += line[5:]
620
621 - def organelle(self, line):
622 self.data.organelle += line[5:]
623
624 - def organism_classification(self, line):
625 line = line[5:].rstrip(_CHOMP) 626 cols = line.split(';') 627 for col in cols: 628 self.data.organism_classification.append(col.lstrip())
629
630 - def taxonomy_id(self, line):
631 # The OX line is in the format: 632 # OX DESCRIPTION=ID[, ID]...; 633 # If there are too many id's to fit onto a line, then the ID's 634 # continue directly onto the next line, e.g. 635 # OX DESCRIPTION=ID[, ID]... 636 # OX ID[, ID]...; 637 # Currently, the description is always "NCBI_TaxID". 638 639 # To parse this, I need to check to see whether I'm at the 640 # first line. If I am, grab the description and make sure 641 # it's an NCBI ID. Then, grab all the id's. 642 line = line[5:].rstrip(_CHOMP) 643 index = line.find('=') 644 if index >= 0: 645 descr = line[:index] 646 assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr 647 ids = line[index+1:].split(',') 648 else: 649 ids = line.split(',') 650 self.data.taxonomy_id.extend([id.strip() for id in ids])
651
652 - def organism_host(self, line):
653 # Line type OH (Organism Host) for viral hosts 654 # same code as in taxonomy_id() 655 line = line[5:].rstrip(_CHOMP) 656 index = line.find('=') 657 if index >= 0: 658 descr = line[:index] 659 assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr 660 ids = line[index+1:].split(',') 661 else: 662 ids = line.split(',') 663 self.data.host_organism.extend([id.strip() for id in ids])
664
665 - def reference_number(self, line):
666 rn = line[5:].rstrip() 667 assert rn[0] == '[' and rn[-1] == ']', "Missing brackets %s" % rn 668 ref = Reference() 669 ref.number = int(rn[1:-1]) 670 self.data.references.append(ref)
671
672 - def reference_position(self, line):
673 assert self.data.references, "RP: missing RN" 674 self.data.references[-1].positions.append(line[5:].rstrip())
675
676 - def reference_comment(self, line):
677 assert self.data.references, "RC: missing RN" 678 cols = line[5:].rstrip().split( ';') 679 ref = self.data.references[-1] 680 for col in cols: 681 if not col: # last column will be the empty string 682 continue 683 # The token is everything before the first '=' character. 684 index = col.find('=') 685 token, text = col[:index], col[index+1:] 686 # According to the spec, there should only be 1 '=' 687 # character. However, there are too many exceptions to 688 # handle, so we'll ease up and allow anything after the 689 # first '='. 690 #if col == ' STRAIN=TISSUE=BRAIN': 691 # # from CSP_MOUSE, release 38 692 # token, text = "TISSUE", "BRAIN" 693 #elif col == ' STRAIN=NCIB 9816-4, AND STRAIN=G7 / ATCC 17485': 694 # # from NDOA_PSEPU, release 38 695 # token, text = "STRAIN", "NCIB 9816-4 AND G7 / ATCC 17485" 696 #elif col == ' STRAIN=ISOLATE=NO 27, ANNO 1987' or \ 697 # col == ' STRAIN=ISOLATE=NO 27 / ANNO 1987': 698 # # from NU3M_BALPH, release 38, release 39 699 # token, text = "STRAIN", "ISOLATE NO 27, ANNO 1987" 700 #else: 701 # token, text = string.split(col, '=') 702 ref.comments.append((token.lstrip(), text))
703
704 - def reference_cross_reference(self, line):
705 assert self.data.references, "RX: missing RN" 706 # The basic (older?) RX line is of the form: 707 # RX MEDLINE; 85132727. 708 # but there are variants of this that need to be dealt with (see below) 709 710 # CLD1_HUMAN in Release 39 and DADR_DIDMA in Release 33 711 # have extraneous information in the RX line. Check for 712 # this and chop it out of the line. 713 # (noticed by katel@worldpath.net) 714 ind = line.find('[NCBI, ExPASy, Israel, Japan]') 715 if ind >= 0: 716 line = line[:ind] 717 718 # RX lines can also be used of the form 719 # RX PubMed=9603189; 720 # reported by edvard@farmasi.uit.no 721 # and these can be more complicated like: 722 # RX MEDLINE=95385798; PubMed=7656980; 723 # RX PubMed=15060122; DOI=10.1136/jmg 2003.012781; 724 # We look for these cases first and deal with them 725 if line.find("=") != -1: 726 cols = line[2:].split("; ") 727 cols = [x.strip() for x in cols] 728 cols = [x for x in cols if x] 729 for col in cols: 730 x = col.split("=") 731 assert len(x) == 2, "I don't understand RX line %s" % line 732 key, value = x[0].rstrip(_CHOMP), x[1].rstrip(_CHOMP) 733 ref = self.data.references[-1].references 734 ref.append((key, value)) 735 # otherwise we assume we have the type 'RX MEDLINE; 85132727.' 736 else: 737 cols = line.split() 738 # normally we split into the three parts 739 assert len(cols) == 3, "I don't understand RX line %s" % line 740 self.data.references[-1].references.append( 741 (cols[1].rstrip(_CHOMP), cols[2].rstrip(_CHOMP)))
742
743 - def reference_author(self, line):
744 assert self.data.references, "RA: missing RN" 745 ref = self.data.references[-1] 746 ref.authors += line[5:]
747
748 - def reference_title(self, line):
749 assert self.data.references, "RT: missing RN" 750 ref = self.data.references[-1] 751 ref.title += line[5:]
752
753 - def reference_location(self, line):
754 assert self.data.references, "RL: missing RN" 755 ref = self.data.references[-1] 756 ref.location += line[5:]
757
758 - def comment(self, line):
759 if line[5:8] == '-!-': # Make a new comment 760 self.data.comments.append(line[9:]) 761 elif line[5:8] == ' ': # add to the previous comment 762 if not self.data.comments: 763 # TCMO_STRGA in Release 37 has comment with no topic 764 self.data.comments.append(line[9:]) 765 else: 766 self.data.comments[-1] += line[9:] 767 elif line[5:8] == '---': 768 # If there are no comments, and it's not the closing line, 769 # make a new comment. 770 if not self.data.comments or self.data.comments[-1][:3] != '---': 771 self.data.comments.append(line[5:]) 772 else: 773 self.data.comments[-1] += line[5:] 774 else: # copyright notice 775 self.data.comments[-1] += line[5:]
776
777 - def database_cross_reference(self, line):
778 # From CLD1_HUMAN, Release 39: 779 # DR EMBL; [snip]; -. [EMBL / GenBank / DDBJ] [CoDingSequence] 780 # DR PRODOM [Domain structure / List of seq. sharing at least 1 domai 781 # DR SWISS-2DPAGE; GET REGION ON 2D PAGE. 782 line = line[5:] 783 # Remove the comments at the end of the line 784 i = line.find('[') 785 if i >= 0: 786 line = line[:i] 787 cols = line.rstrip(_CHOMP).split(';') 788 cols = [col.lstrip() for col in cols] 789 self.data.cross_references.append(tuple(cols))
790
791 - def keyword(self, line):
792 cols = line[5:].rstrip(_CHOMP).split(';') 793 self.data.keywords.extend([c.lstrip() for c in cols])
794
795 - def feature_table(self, line):
796 line = line[5:] # get rid of junk in front 797 name = line[0:8].rstrip() 798 try: 799 from_res = int(line[9:15]) 800 except ValueError: 801 from_res = line[9:15].lstrip() 802 try: 803 to_res = int(line[16:22]) 804 except ValueError: 805 to_res = line[16:22].lstrip() 806 description = line[29:70].rstrip() 807 #if there is a feature_id (FTId), store it away 808 if line[29:35]==r"/FTId=": 809 ft_id = line[35:70].rstrip()[:-1] 810 else: 811 ft_id ="" 812 if not name: # is continuation of last one 813 assert not from_res and not to_res 814 name, from_res, to_res, old_description,old_ft_id = self.data.features[-1] 815 del self.data.features[-1] 816 description = "%s %s" % (old_description, description) 817 818 # special case -- VARSPLIC, reported by edvard@farmasi.uit.no 819 if name == "VARSPLIC": 820 description = self._fix_varsplic_sequences(description) 821 self.data.features.append((name, from_res, to_res, description,ft_id))
822
823 - def _fix_varsplic_sequences(self, description):
824 """Remove unwanted spaces in sequences. 825 826 During line carryover, the sequences in VARSPLIC can get mangled 827 with unwanted spaces like: 828 'DISSTKLQALPSHGLESIQT -> PCRATGWSPFRRSSPC LPTH' 829 We want to check for this case and correct it as it happens. 830 """ 831 descr_cols = description.split(" -> ") 832 if len(descr_cols) == 2: 833 first_seq = descr_cols[0] 834 second_seq = descr_cols[1] 835 extra_info = '' 836 # we might have more information at the end of the 837 # second sequence, which should be in parenthesis 838 extra_info_pos = second_seq.find(" (") 839 if extra_info_pos != -1: 840 extra_info = second_seq[extra_info_pos:] 841 second_seq = second_seq[:extra_info_pos] 842 843 # now clean spaces out of the first and second string 844 first_seq = first_seq.replace(" ", "") 845 second_seq = second_seq.replace(" ", "") 846 847 # reassemble the description 848 description = first_seq + " -> " + second_seq + extra_info 849 850 return description
851
852 - def protein_existence(self, line):
853 #TODO - Record this information? 854 pass
855
856 - def sequence_header(self, line):
857 cols = line.split() 858 assert len(cols) == 8, "I don't understand SQ line %s" % line 859 # Do more checking here? 860 self.data.seqinfo = int(cols[2]), int(cols[4]), cols[6]
861
862 - def sequence_data(self, line):
863 #It should be faster to make a list of strings, and join them at the end. 864 self._sequence_lines.append(line.replace(" ", "").rstrip())
865
866 - def terminator(self, line):
867 pass
868 869 #def _clean(self, line, rstrip=1): 870 # if rstrip: 871 # return string.rstrip(line[5:]) 872 # return line[5:] 873
874 - def _clean_record(self, rec):
875 # Remove trailing newlines 876 members = ['description', 'gene_name', 'organism', 'organelle'] 877 for m in members: 878 attr = getattr(rec, m) 879 setattr(rec, m, attr.rstrip()) 880 for ref in rec.references: 881 self._clean_references(ref)
882
883 - def _clean_references(self, ref):
884 # Remove trailing newlines 885 members = ['authors', 'title', 'location'] 886 for m in members: 887 attr = getattr(ref, m) 888 setattr(ref, m, attr.rstrip())
889
890 -class _SequenceConsumer(AbstractConsumer):
891 """Consumer that converts a SwissProt record to a SeqRecord object. 892 893 Members: 894 data Record with SwissProt data. 895 alphabet The alphabet the generated Seq objects will have. 896 """ 897 #TODO - Cope with references as done for GenBank
898 - def __init__(self, alphabet = Alphabet.generic_protein):
899 """Initialize a Sequence Consumer 900 901 Arguments: 902 o alphabet - The alphabet to use for the generated Seq objects. If 903 not supplied this will default to the generic protein alphabet. 904 """ 905 self.data = None 906 self.alphabet = alphabet
907
908 - def start_record(self):
909 seq = Seq.Seq("", self.alphabet) 910 self.data = SeqRecord.SeqRecord(seq) 911 self.data.description = "" 912 self.data.name = "" 913 self._current_ref = None 914 self._sequence_lines = []
915
916 - def end_record(self):
917 if self._current_ref is not None: 918 self.data.annotations['references'].append(self._current_ref) 919 self._current_ref = None 920 self.data.description = self.data.description.rstrip() 921 self.data.seq = Seq.Seq("".join(self._sequence_lines), self.alphabet) 922 self.data.annotations['organism'] = self.data.annotations['organism'].rstrip(_CHOMP)
923
924 - def identification(self, line):
925 cols = line.split() 926 self.data.name = cols[1]
927
928 - def accession(self, line):
929 #Note that files can and often do contain multiple AC lines. 930 ids = line[5:].strip().split(';') 931 #Remove any white space 932 ids = [x.strip() for x in ids if x.strip()] 933 934 #Use the first as the ID, but record them ALL in the annotations 935 try: 936 self.data.annotations['accessions'].extend(ids) 937 except KeyError: 938 self.data.annotations['accessions'] = ids 939 940 #Use the FIRST accession as the ID, not the first on this line! 941 self.data.id = self.data.annotations['accessions'][0]
942 #self.data.id = ids[0] 943
944 - def description(self, line):
945 self.data.description += line[5:].strip() + " "
946
947 - def sequence_data(self, line):
948 #It should be faster to make a list of strings, and join them at the end. 949 self._sequence_lines.append(line.replace(" ", "").rstrip())
950
951 - def gene_name(self, line):
952 #We already store the identification/accession as the records name/id 953 try: 954 self.data.annotations['gene_name'] += " " + line[5:].rstrip() 955 except KeyError: 956 self.data.annotations['gene_name'] = line[5:].rstrip()
957
958 - def comment(self, line):
959 #Try and agree with SeqRecord convention from the GenBank parser, 960 #which stores the comments as a long string with newlines 961 #with key 'comment' 962 #TODO - Follow SwissProt conventions more closely? 963 prefix = line[5:8] 964 text = line[9:].rstrip() 965 if prefix == '-!-': # Make a new comment 966 try: 967 self.data.annotations['comment'] += "\n" + text 968 except KeyError: 969 self.data.annotations['comment'] = text 970 elif prefix == ' ': 971 try: 972 # add to the previous comment 973 self.data.annotations['comment'] += " " + text 974 except KeyError: 975 # TCMO_STRGA in Release 37 has comment with no topic 976 self.data.annotations['comment'] = text
977
978 - def database_cross_reference(self, line):
979 #Format of the line is described in the manual dated 04-Dec-2007 as: 980 #DR DATABASE; PRIMARY; SECONDARY[; TERTIARY][; QUATERNARY]. 981 #However, some older files only seem to have a single identifier: 982 #DR DATABASE; PRIMARY. 983 # 984 #Also must cope with things like this from Tests/SwissProt/sp007, 985 #DR PRODOM [Domain structure / List of seq. sharing at least 1 domain] 986 # 987 #Store these in the dbxref list, but for consistency with 988 #the GenBank parser and with what BioSQL can cope with, 989 #store only DATABASE_IDENTIFIER:PRIMARY_IDENTIFIER 990 parts = [x.strip() for x in line[5:].strip(_CHOMP).split(";")] 991 if len(parts) > 1: 992 value = "%s:%s" % (parts[0], parts[1]) 993 #Avoid duplicate entries 994 if value not in self.data.dbxrefs: 995 self.data.dbxrefs.append(value)
996 #else: 997 #print "Bad DR line:\n%s" % line 998 999
1000 - def date(self, line):
1001 date_str = line.split()[1] 1002 uprline = line.upper() 1003 if uprline.find('CREATED') >= 0: 1004 #Try and agree with SeqRecord convention from the GenBank parser, 1005 #which stores the submitted date as 'date' 1006 self.data.annotations['date'] = date_str 1007 elif uprline.find('LAST SEQUENCE UPDATE') >= 0: 1008 #There is no existing convention from the GenBank SeqRecord parser 1009 self.data.annotations['date_last_sequence_update'] = date_str 1010 elif uprline.find('LAST ANNOTATION UPDATE') >= 0: 1011 #There is no existing convention from the GenBank SeqRecord parser 1012 self.data.annotations['date_last_annotation_update'] = date_str 1013 elif uprline.find('INTEGRATED INTO') >= 0: 1014 self.data.annotations['date'] = date_str.rstrip(",") 1015 elif uprline.find('SEQUENCE VERSION') >= 0: 1016 self.data.annotations['date_last_sequence_update'] = date_str.rstrip(",") 1017 elif uprline.find('ENTRY VERSION') >= 0: 1018 self.data.annotations['date_last_annotation_update'] = date_str.rstrip(",")
1019
1020 - def keyword(self, line):
1021 #Try and agree with SeqRecord convention from the GenBank parser, 1022 #which stores a list as 'keywords' 1023 cols = line[5:].rstrip(_CHOMP).split(';') 1024 cols = [c.strip() for c in cols] 1025 cols = filter(None, cols) 1026 try: 1027 #Extend any existing list of keywords 1028 self.data.annotations['keywords'].extend(cols) 1029 except KeyError: 1030 #Create the list of keywords 1031 self.data.annotations['keywords'] = cols
1032
1033 - def organism_species(self, line):
1034 #Try and agree with SeqRecord convention from the GenBank parser, 1035 #which stores the organism as a string with key 'organism' 1036 data = line[5:].rstrip() 1037 try: 1038 #Append to any existing data split over multiple lines 1039 self.data.annotations['organism'] += " " + data 1040 except KeyError: 1041 self.data.annotations['organism'] = data
1042
1043 - def organism_host(self, line):
1044 #There is no SeqRecord convention from the GenBank parser, 1045 data = line[5:].rstrip(_CHOMP) 1046 index = data.find('=') 1047 if index >= 0: 1048 descr = data[:index] 1049 assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr 1050 ids = data[index+1:].split(',') 1051 else: 1052 ids = data.split(',') 1053 1054 try: 1055 #Append to any existing data 1056 self.data.annotations['organism_host'].extend(ids) 1057 except KeyError: 1058 self.data.annotations['organism_host'] = ids
1059
1060 - def organism_classification(self, line):
1061 #Try and agree with SeqRecord convention from the GenBank parser, 1062 #which stores this taxonomy lineage ese as a list of strings with 1063 #key 'taxonomy'. 1064 #Note that 'ncbi_taxid' is used for the taxonomy ID (line OX) 1065 line = line[5:].rstrip(_CHOMP) 1066 cols = [col.strip() for col in line.split(';')] 1067 try: 1068 #Append to any existing data 1069 self.data.annotations['taxonomy'].extend(cols) 1070 except KeyError: 1071 self.data.annotations['taxonomy'] = cols
1072
1073 - def taxonomy_id(self, line):
1074 #Try and agree with SeqRecord convention expected in BioSQL 1075 #the NCBI taxon id with key 'ncbi_taxid'. 1076 #Note that 'taxonomy' is used for the taxonomy lineage 1077 #(held as a list of strings, line type OC) 1078 1079 line = line[5:].rstrip(_CHOMP) 1080 index = line.find('=') 1081 if index >= 0: 1082 descr = line[:index] 1083 assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr 1084 ids = line[index+1:].split(',') 1085 else: 1086 ids = line.split(',') 1087 1088 try: 1089 #Append to any existing data 1090 self.data.annotations['ncbi_taxid'].extend(ids) 1091 except KeyError: 1092 self.data.annotations['ncbi_taxid'] = ids
1093
1094 - def reference_number(self, line):
1095 """RN line, reference number (start of new reference).""" 1096 from Bio.SeqFeature import Reference 1097 # if we have a current reference that hasn't been added to 1098 # the list of references, add it. 1099 if self._current_ref is not None: 1100 self.data.annotations['references'].append(self._current_ref) 1101 else: 1102 self.data.annotations['references'] = [] 1103 1104 self._current_ref = Reference()
1105
1106 - def reference_position(self, line):
1107 """RP line, reference position.""" 1108 assert self._current_ref is not None, "RP: missing RN" 1109 #Should try and store this in self._current_ref.location 1110 #but the SwissProt locations don't match easily to the 1111 #format used in GenBank... 1112 pass
1113
1114 - def reference_cross_reference(self, line):
1115 """RX line, reference cross-references.""" 1116 assert self._current_ref is not None, "RX: missing RN" 1117 # The basic (older?) RX line is of the form: 1118 # RX MEDLINE; 85132727. 1119 # or more recently: 1120 # RX MEDLINE=95385798; PubMed=7656980; 1121 # RX PubMed=15060122; DOI=10.1136/jmg 2003.012781; 1122 # We look for these cases first and deal with them 1123 if line.find("=") != -1: 1124 cols = line[2:].split("; ") 1125 cols = [x.strip() for x in cols] 1126 cols = [x for x in cols if x] 1127 for col in cols: 1128 x = col.split("=") 1129 assert len(x) == 2, "I don't understand RX line %s" % line 1130 key, value = x[0].rstrip(_CHOMP), x[1].rstrip(_CHOMP) 1131 if key == "MEDLINE": 1132 self._current_ref.medline_id = value 1133 elif key == "PubMed": 1134 self._current_ref.pubmed_id = value 1135 else: 1136 #Sadly the SeqFeature.Reference object doesn't 1137 #support anything else (yet) 1138 pass 1139 # otherwise we assume we have the type 'RX MEDLINE; 85132727.' 1140 else: 1141 # CLD1_HUMAN in Release 39 and DADR_DIDMA in Release 33 1142 # have extraneous information in the RX line. Check for 1143 # this and chop it out of the line. 1144 # (noticed by katel@worldpath.net) 1145 ind = line.find('[NCBI, ExPASy, Israel, Japan]') 1146 if ind >= 0: 1147 line = line[:ind] 1148 cols = line.split() 1149 # normally we split into the three parts 1150 assert len(cols) == 3, "I don't understand RX line %s" % line 1151 key = cols[1].rstrip(_CHOMP) 1152 value = cols[2].rstrip(_CHOMP) 1153 if key == "MEDLINE": 1154 self._current_ref.medline_id = value 1155 elif key == "PubMed": 1156 self._current_ref.pubmed_id = value 1157 else: 1158 #Sadly the SeqFeature.Reference object doesn't 1159 #support anything else (yet) 1160 pass
1161
1162 - def reference_author(self, line):
1163 """RA line, reference author(s).""" 1164 assert self._current_ref is not None, "RA: missing RN" 1165 if self._current_ref.authors: 1166 self._current_ref.authors += " " 1167 self._current_ref.authors += line[5:].rstrip("\n")
1168
1169 - def reference_title(self, line):
1170 """RT line, reference title.""" 1171 assert self._current_ref is not None, "RT: missing RN" 1172 if self._current_ref.title: 1173 self._current_ref.title += " " 1174 self._current_ref.title += line[5:].rstrip("\n")
1175
1176 - def reference_location(self, line):
1177 """RL line, reference 'location' - journal, volume, pages, year.""" 1178 assert self._current_ref is not None, "RL: missing RN" 1179 if self._current_ref.journal: 1180 self._current_ref.journal += " " 1181 self._current_ref.journal += line[5:].rstrip("\n")
1182
1183 - def reference_comment(self, line):
1184 """RC line, reference comment.""" 1185 assert self._current_ref is not None, "RC: missing RN" 1186 #This has a key=value; structure... 1187 #Can we do a better job with the current Reference class? 1188 if self._current_ref.comment: 1189 self._current_ref.comment += " " 1190 self._current_ref.comment += line[5:].rstrip("\n")
1191
1192 -def index_file(filename, indexname, rec2key=None):
1193 """index_file(filename, indexname, rec2key=None) 1194 1195 Index a SwissProt file. filename is the name of the file. 1196 indexname is the name of the dictionary. rec2key is an 1197 optional callback that takes a Record and generates a unique key 1198 (e.g. the accession number) for the record. If not specified, 1199 the entry name will be used. 1200 1201 """ 1202 from Bio.SwissProt import parse 1203 if not os.path.exists(filename): 1204 raise ValueError("%s does not exist" % filename) 1205 1206 index = Index.Index(indexname, truncate=1) 1207 index[Dictionary._Dictionary__filename_key] = filename 1208 1209 handle = open(filename) 1210 records = parse(handle) 1211 end = 0L 1212 for record in records: 1213 start = end 1214 end = long(handle.tell()) 1215 length = end - start 1216 1217 if rec2key is not None: 1218 key = rec2key(record) 1219 else: 1220 key = record.entry_name 1221 1222 if not key: 1223 raise KeyError("empty sequence key was produced") 1224 elif key in index: 1225 raise KeyError("duplicate key %s found" % key) 1226 1227 index[key] = start, length
1228