Trees | Indices | Help |
---|
|
1 # Copyright 1999 by Jeffrey Chang. All rights reserved. 2 # This code is part of the Biopython distribution and governed by its 3 # license. Please see the LICENSE file that should have been included 4 # as part of this package. 5 6 """ 7 This module provides code to work with the sprotXX.dat file from 8 Utilities for working with FASTA-formatted sequences (DEPRECATED). 9 http://www.expasy.ch/sprot/sprot-top.html 10 11 Please see Bio.SwissProt for alternatives for the functionality in this module. 12 13 Tested with: 14 Release 37, Release 38, Release 39 15 16 Limited testing with: 17 Release 51, 54 18 19 20 Classes: 21 Record Holds SwissProt data. 22 Reference Holds reference data from a SwissProt entry. 23 Iterator Iterates over entries in a SwissProt file. 24 Dictionary Accesses a SwissProt file using a dictionary interface. 25 RecordParser Parses a SwissProt record into a Record object. 26 SequenceParser Parses a SwissProt record into a SeqRecord object. 27 28 _Scanner Scans SwissProt-formatted data. 29 _RecordConsumer Consumes SwissProt data to a SProt.Record object. 30 _SequenceConsumer Consumes SwissProt data to a SeqRecord object. 31 32 33 Functions: 34 index_file Index a SwissProt file for a Dictionary. 35 36 """ 37 import warnings 38 warnings.warn("Bio.SwissProt.SProt is deprecated. Please use the functions Bio.SwissProt.parse or Bio.SwissProt.read if you want to get a SwissProt.Record, or Bio.SeqIO.parse or Bio.SeqIO.read if you want to get a SeqRecord. If these solutions do not work for you, please get in contact with the Biopython developers (biopython-dev@biopython.org).", 39 DeprecationWarning) 40 41 from types import * 42 import os 43 from Bio import File 44 from Bio import Index 45 from Bio import Alphabet 46 from Bio import Seq 47 from Bio import SeqRecord 48 from Bio.ParserSupport import * 49 50 # The parse(), read() functions can probably be simplified if we don't 51 # use the "parser = RecordParser(); parser.parse(handle)" approach.53 from SProt import RecordParser 54 import cStringIO 55 parser = RecordParser() 56 text = "" 57 for line in handle: 58 text += line 59 if line[:2]=='//': 60 handle = cStringIO.StringIO(text) 61 record = parser.parse(handle) 62 text = "" 63 yield record6466 from SProt import RecordParser 67 parser = RecordParser() 68 try: 69 record = parser.parse(handle) 70 except ValueError, error: 71 if error.message.startswith("Line does not start with 'ID':"): 72 raise ValueError("No SwissProt record found") 73 else: 74 raise error 75 # We should have reached the end of the record by now 76 remainder = handle.read() 77 if remainder: 78 raise ValueError("More than one SwissProt record found") 79 return record80 81 82 _CHOMP = " \n\r\t.,;" #whitespace and trailing punctuation 8385 """Holds information from a SwissProt record. 86 87 Members: 88 entry_name Name of this entry, e.g. RL1_ECOLI. 89 data_class Either 'STANDARD' or 'PRELIMINARY'. 90 molecule_type Type of molecule, 'PRT', 91 sequence_length Number of residues. 92 93 accessions List of the accession numbers, e.g. ['P00321'] 94 created A tuple of (date, release). 95 sequence_update A tuple of (date, release). 96 annotation_update A tuple of (date, release). 97 98 description Free-format description. 99 gene_name Gene name. See userman.txt for description. 100 organism The source of the sequence. 101 organelle The origin of the sequence. 102 organism_classification The taxonomy classification. List of strings. 103 (http://www.ncbi.nlm.nih.gov/Taxonomy/) 104 taxonomy_id A list of NCBI taxonomy id's. 105 host_organism A list of NCBI taxonomy id's of the hosts of a virus, 106 if any. 107 references List of Reference objects. 108 comments List of strings. 109 cross_references List of tuples (db, id1[, id2][, id3]). See the docs. 110 keywords List of the keywords. 111 features List of tuples (key name, from, to, description). 112 from and to can be either integers for the residue 113 numbers, '<', '>', or '?' 114 115 seqinfo tuple of (length, molecular weight, CRC32 value) 116 sequence The sequence. 117 118 """145120 self.entry_name = None 121 self.data_class = None 122 self.molecule_type = None 123 self.sequence_length = None 124 125 self.accessions = [] 126 self.created = None 127 self.sequence_update = None 128 self.annotation_update = None 129 130 self.description = '' 131 self.gene_name = '' 132 self.organism = '' 133 self.organelle = '' 134 self.organism_classification = [] 135 self.taxonomy_id = [] 136 self.host_organism = [] 137 self.references = [] 138 self.comments = [] 139 self.cross_references = [] 140 self.keywords = [] 141 self.features = [] 142 143 self.seqinfo = None 144 self.sequence = ''147 """Holds information from 1 references in a SwissProt entry. 148 149 Members: 150 number Number of reference in an entry. 151 positions Describes extent of work. list of strings. 152 comments Comments. List of (token, text). 153 references References. List of (dbname, identifier) 154 authors The authors of the work. 155 title Title of the work. 156 location A citation for the work. 157 158 """167 168170 """Accesses a SwissProt file using a dictionary interface. 171 172 """ 173 __filename_key = '__filename' 174208 209 221176 """__init__(self, indexname, parser=None) 177 178 Open a SwissProt Dictionary. indexname is the name of the 179 index for the dictionary. The index should have been created 180 using the index_file function. parser is an optional Parser 181 object to change the results into another form. If set to None, 182 then the raw contents of the file will be returned. 183 184 """ 185 self._index = Index.Index(indexname) 186 self._handle = open(self._index[self.__filename_key]) 187 self._parser = parser188190 return len(self._index)191193 start, len = self._index[key] 194 self._handle.seek(start) 195 data = self._handle.read(len) 196 if self._parser is not None: 197 return self._parser.parse(File.StringHandle(data)) 198 return data199 202204 # I only want to expose the keys for SwissProt. 205 k = self._index.keys() 206 k.remove(self.__filename_key) 207 return k223 """Parses SwissProt data into a standard SeqRecord object. 224 """238226 """Initialize a SequenceParser. 227 228 Arguments: 229 o alphabet - The alphabet to use for the generated Seq objects. If 230 not supplied this will default to the generic protein alphabet. 231 """ 232 self._scanner = _Scanner() 233 self._consumer = _SequenceConsumer(alphabet)234240 """Scans SwissProt-formatted data. 241 242 Tested with: 243 Release 37 244 Release 38 245 """ 246454248 """feed(self, handle, consumer) 249 250 Feed in SwissProt data for scanning. handle is a file-like 251 object that contains swissprot data. consumer is a 252 Consumer object that will receive events as the report is scanned. 253 254 """ 255 if isinstance(handle, File.UndoHandle): 256 uhandle = handle 257 else: 258 uhandle = File.UndoHandle(handle) 259 self._scan_record(uhandle, consumer)260262 """Ignores any lines starting **""" 263 #See Bug 2353, some files from the EBI have extra lines 264 #starting "**" (two asterisks/stars), usually between the 265 #features and sequence but not all the time. They appear 266 #to be unofficial automated annotations. e.g. 267 #** 268 #** ################# INTERNAL SECTION ################## 269 #**HA SAM; Annotated by PicoHamap 1.88; MF_01138.1; 09-NOV-2003. 270 while "**" == uhandle.peekline()[:2]: 271 skip = uhandle.readline()272 #print "Skipping line: %s" % skip.rstrip() 273275 consumer.start_record() 276 for fn in self._scan_fns: 277 self._skip_starstar(uhandle) 278 fn(self, uhandle, consumer) 279 280 # In Release 38, ID N33_HUMAN has a DR buried within comments. 281 # Check for this and do more comments, if necessary. 282 # XXX handle this better 283 if fn is self._scan_dr.im_func: 284 self._scan_cc(uhandle, consumer) 285 self._scan_dr(uhandle, consumer) 286 consumer.end_record()287288 - def _scan_line(self, line_type, uhandle, event_fn, 289 exactly_one=None, one_or_more=None, any_number=None, 290 up_to_one=None):291 # Callers must set exactly one of exactly_one, one_or_more, or 292 # any_number to a true value. I do not explicitly check to 293 # make sure this function is called correctly. 294 295 # This does not guarantee any parameter safety, but I 296 # like the readability. The other strategy I tried was have 297 # parameters min_lines, max_lines. 298 299 if exactly_one or one_or_more: 300 read_and_call(uhandle, event_fn, start=line_type) 301 if one_or_more or any_number: 302 while 1: 303 if not attempt_read_and_call(uhandle, event_fn, 304 start=line_type): 305 break 306 if up_to_one: 307 attempt_read_and_call(uhandle, event_fn, start=line_type)308 311313 # Until release 38, this used to match exactly_one. 314 # However, in release 39, 1A02_HUMAN has 2 AC lines, and the 315 # definition needed to be expanded. 316 self._scan_line('AC', uhandle, consumer.accession, any_number=1)317319 self._scan_line('DT', uhandle, consumer.date, exactly_one=1) 320 self._scan_line('DT', uhandle, consumer.date, exactly_one=1) 321 # IPI doesn't necessarily contain the third line about annotations 322 self._scan_line('DT', uhandle, consumer.date, up_to_one=1)323325 # IPI can be missing a DE line 326 self._scan_line('DE', uhandle, consumer.description, any_number=1)327 330 334 337 341 345347 # viral host organism. introduced after SwissProt 39. 348 self._scan_line('OH', uhandle, consumer.organism_host, any_number=1)349351 while True: 352 if safe_peekline(uhandle)[:2] != 'RN': 353 break 354 self._scan_rn(uhandle, consumer) 355 self._scan_rp(uhandle, consumer) 356 self._scan_rc(uhandle, consumer) 357 self._scan_rx(uhandle, consumer) 358 # ws:2001-12-05 added, for record with RL before RA 359 self._scan_rl(uhandle, consumer) 360 self._scan_ra(uhandle, consumer) 361 #EBI copy of P72010 is missing the RT line, and has one 362 #of their ** lines in its place noting "** /NO TITLE." 363 #See also bug 2353 364 self._skip_starstar(uhandle) 365 self._scan_rt(uhandle, consumer) 366 self._scan_rl(uhandle, consumer)367 371 375 379 383385 # In UniProt release 1.12 of 6/21/04, there is a new RG 386 # (Reference Group) line, which references a group instead of 387 # an author. Each block must have at least 1 RA or RG line. 388 self._scan_line('RA', uhandle, consumer.reference_author, 389 any_number=1) 390 self._scan_line('RG', uhandle, consumer.reference_author, 391 any_number=1) 392 # PRKN_HUMAN has RG lines, then RA lines. The best solution 393 # is to write code that accepts either of the line types. 394 # This is the quick solution... 395 self._scan_line('RA', uhandle, consumer.reference_author, 396 any_number=1)397 401403 # This was one_or_more, but P82909 in TrEMBL 16.0 does not 404 # have one. 405 self._scan_line('RL', uhandle, consumer.reference_location, 406 any_number=1)407 410 414 417 420 423 426 429 432 433 _scan_fns = [ 434 _scan_id, 435 _scan_ac, 436 _scan_dt, 437 _scan_de, 438 _scan_gn, 439 _scan_os, 440 _scan_og, 441 _scan_oc, 442 _scan_ox, 443 _scan_oh, 444 _scan_reference, 445 _scan_cc, 446 _scan_dr, 447 _scan_pe, 448 _scan_kw, 449 _scan_ft, 450 _scan_sq, 451 _scan_sequence_data, 452 _scan_terminator 453 ]456 """Consumer that converts a SwissProt record to a Record object. 457 458 Members: 459 data Record with SwissProt data. 460 461 """889463 self.data = None464 467 471 475477 cols = line.split() 478 #Prior to release 51, included with MoleculeType: 479 #ID EntryName DataClass; MoleculeType; SequenceLength. 480 # 481 #Newer files lack the MoleculeType: 482 #ID EntryName DataClass; SequenceLength. 483 # 484 #Note that cols is split on white space, so the length 485 #should become two fields (number and units) 486 if len(cols) == 6: 487 self.data.entry_name = cols[1] 488 self.data.data_class = cols[2].rstrip(_CHOMP) # don't want ';' 489 self.data.molecule_type = cols[3].rstrip(_CHOMP) # don't want ';' 490 self.data.sequence_length = int(cols[4]) 491 elif len(cols) == 5: 492 self.data.entry_name = cols[1] 493 self.data.data_class = cols[2].rstrip(_CHOMP) # don't want ';' 494 self.data.molecule_type = None 495 self.data.sequence_length = int(cols[3]) 496 else: 497 #Should we print a warning an continue? 498 raise ValueError("ID line has unrecognised format:\n"+line) 499 500 # data class can be 'STANDARD' or 'PRELIMINARY' 501 # ws:2001-12-05 added IPI 502 # pjc:2006-11-02 added 'Reviewed' and 'Unreviewed' 503 if self.data.data_class not in ['STANDARD', 'PRELIMINARY', 'IPI', 504 'Reviewed', 'Unreviewed']: 505 raise ValueError("Unrecognized data class %s in line\n%s" % \ 506 (self.data.data_class, line)) 507 # molecule_type should be 'PRT' for PRoTein 508 # Note that has been removed in recent releases (set to None) 509 if self.data.molecule_type is not None \ 510 and self.data.molecule_type != 'PRT': 511 raise ValueError("Unrecognized molecule type %s in line\n%s" % \ 512 (self.data.molecule_type, line))513515 cols = line[5:].rstrip(_CHOMP).strip().split(';') 516 for ac in cols: 517 if ac.strip(): 518 #remove any leading or trailing white space 519 self.data.accessions.append(ac.strip())520522 uprline = line.upper() 523 cols = line.rstrip().split() 524 525 if uprline.find('CREATED') >= 0 \ 526 or uprline.find('LAST SEQUENCE UPDATE') >= 0 \ 527 or uprline.find('LAST ANNOTATION UPDATE') >= 0: 528 # Old style DT line 529 # ================= 530 # e.g. 531 # DT 01-FEB-1995 (Rel. 31, Created) 532 # DT 01-FEB-1995 (Rel. 31, Last sequence update) 533 # DT 01-OCT-2000 (Rel. 40, Last annotation update) 534 # 535 # or: 536 # DT 08-JAN-2002 (IPI Human rel. 2.3, Created) 537 # ... 538 539 # find where the version information will be located 540 # This is needed for when you have cases like IPI where 541 # the release verison is in a different spot: 542 # DT 08-JAN-2002 (IPI Human rel. 2.3, Created) 543 uprcols = uprline.split() 544 rel_index = -1 545 for index in range(len(uprcols)): 546 if uprcols[index].find("REL.") >= 0: 547 rel_index = index 548 assert rel_index >= 0, \ 549 "Could not find Rel. in DT line: %s" % (line) 550 version_index = rel_index + 1 551 # get the version information 552 str_version = cols[version_index].rstrip(_CHOMP) 553 # no version number 554 if str_version == '': 555 version = 0 556 # dot versioned 557 elif str_version.find(".") >= 0: 558 version = str_version 559 # integer versioned 560 else: 561 version = int(str_version) 562 563 if uprline.find('CREATED') >= 0: 564 self.data.created = cols[1], version 565 elif uprline.find('LAST SEQUENCE UPDATE') >= 0: 566 self.data.sequence_update = cols[1], version 567 elif uprline.find( 'LAST ANNOTATION UPDATE') >= 0: 568 self.data.annotation_update = cols[1], version 569 else: 570 assert False, "Shouldn't reach this line!" 571 elif uprline.find('INTEGRATED INTO') >= 0 \ 572 or uprline.find('SEQUENCE VERSION') >= 0 \ 573 or uprline.find('ENTRY VERSION') >= 0: 574 # New style DT line 575 # ================= 576 # As of UniProt Knowledgebase release 7.0 (including 577 # Swiss-Prot release 49.0 and TrEMBL release 32.0) the 578 # format of the DT lines and the version information 579 # in them was changed - the release number was dropped. 580 # 581 # For more information see bug 1948 and 582 # http://ca.expasy.org/sprot/relnotes/sp_news.html#rel7.0 583 # 584 # e.g. 585 # DT 01-JAN-1998, integrated into UniProtKB/Swiss-Prot. 586 # DT 15-OCT-2001, sequence version 3. 587 # DT 01-APR-2004, entry version 14. 588 # 589 #This is a new style DT line... 590 591 # The date should be in string cols[1] 592 # Get the version number if there is one. 593 # For the three DT lines above: 0, 3, 14 594 try: 595 version = int(cols[-1]) 596 except ValueError: 597 version = 0 598 599 # Re-use the historical property names, even though 600 # the meaning has changed slighty: 601 if uprline.find("INTEGRATED") >= 0: 602 self.data.created = cols[1], version 603 elif uprline.find('SEQUENCE VERSION') >= 0: 604 self.data.sequence_update = cols[1], version 605 elif uprline.find( 'ENTRY VERSION') >= 0: 606 self.data.annotation_update = cols[1], version 607 else: 608 assert False, "Shouldn't reach this line!" 609 else: 610 raise ValueError("I don't understand the date line %s" % line)611 614 617 620 623625 line = line[5:].rstrip(_CHOMP) 626 cols = line.split(';') 627 for col in cols: 628 self.data.organism_classification.append(col.lstrip())629631 # The OX line is in the format: 632 # OX DESCRIPTION=ID[, ID]...; 633 # If there are too many id's to fit onto a line, then the ID's 634 # continue directly onto the next line, e.g. 635 # OX DESCRIPTION=ID[, ID]... 636 # OX ID[, ID]...; 637 # Currently, the description is always "NCBI_TaxID". 638 639 # To parse this, I need to check to see whether I'm at the 640 # first line. If I am, grab the description and make sure 641 # it's an NCBI ID. Then, grab all the id's. 642 line = line[5:].rstrip(_CHOMP) 643 index = line.find('=') 644 if index >= 0: 645 descr = line[:index] 646 assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr 647 ids = line[index+1:].split(',') 648 else: 649 ids = line.split(',') 650 self.data.taxonomy_id.extend([id.strip() for id in ids])651653 # Line type OH (Organism Host) for viral hosts 654 # same code as in taxonomy_id() 655 line = line[5:].rstrip(_CHOMP) 656 index = line.find('=') 657 if index >= 0: 658 descr = line[:index] 659 assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr 660 ids = line[index+1:].split(',') 661 else: 662 ids = line.split(',') 663 self.data.host_organism.extend([id.strip() for id in ids])664666 rn = line[5:].rstrip() 667 assert rn[0] == '[' and rn[-1] == ']', "Missing brackets %s" % rn 668 ref = Reference() 669 ref.number = int(rn[1:-1]) 670 self.data.references.append(ref)671673 assert self.data.references, "RP: missing RN" 674 self.data.references[-1].positions.append(line[5:].rstrip())675677 assert self.data.references, "RC: missing RN" 678 cols = line[5:].rstrip().split( ';') 679 ref = self.data.references[-1] 680 for col in cols: 681 if not col: # last column will be the empty string 682 continue 683 # The token is everything before the first '=' character. 684 index = col.find('=') 685 token, text = col[:index], col[index+1:] 686 # According to the spec, there should only be 1 '=' 687 # character. However, there are too many exceptions to 688 # handle, so we'll ease up and allow anything after the 689 # first '='. 690 #if col == ' STRAIN=TISSUE=BRAIN': 691 # # from CSP_MOUSE, release 38 692 # token, text = "TISSUE", "BRAIN" 693 #elif col == ' STRAIN=NCIB 9816-4, AND STRAIN=G7 / ATCC 17485': 694 # # from NDOA_PSEPU, release 38 695 # token, text = "STRAIN", "NCIB 9816-4 AND G7 / ATCC 17485" 696 #elif col == ' STRAIN=ISOLATE=NO 27, ANNO 1987' or \ 697 # col == ' STRAIN=ISOLATE=NO 27 / ANNO 1987': 698 # # from NU3M_BALPH, release 38, release 39 699 # token, text = "STRAIN", "ISOLATE NO 27, ANNO 1987" 700 #else: 701 # token, text = string.split(col, '=') 702 ref.comments.append((token.lstrip(), text))703705 assert self.data.references, "RX: missing RN" 706 # The basic (older?) RX line is of the form: 707 # RX MEDLINE; 85132727. 708 # but there are variants of this that need to be dealt with (see below) 709 710 # CLD1_HUMAN in Release 39 and DADR_DIDMA in Release 33 711 # have extraneous information in the RX line. Check for 712 # this and chop it out of the line. 713 # (noticed by katel@worldpath.net) 714 ind = line.find('[NCBI, ExPASy, Israel, Japan]') 715 if ind >= 0: 716 line = line[:ind] 717 718 # RX lines can also be used of the form 719 # RX PubMed=9603189; 720 # reported by edvard@farmasi.uit.no 721 # and these can be more complicated like: 722 # RX MEDLINE=95385798; PubMed=7656980; 723 # RX PubMed=15060122; DOI=10.1136/jmg 2003.012781; 724 # We look for these cases first and deal with them 725 if line.find("=") != -1: 726 cols = line[2:].split("; ") 727 cols = [x.strip() for x in cols] 728 cols = [x for x in cols if x] 729 for col in cols: 730 x = col.split("=") 731 assert len(x) == 2, "I don't understand RX line %s" % line 732 key, value = x[0].rstrip(_CHOMP), x[1].rstrip(_CHOMP) 733 ref = self.data.references[-1].references 734 ref.append((key, value)) 735 # otherwise we assume we have the type 'RX MEDLINE; 85132727.' 736 else: 737 cols = line.split() 738 # normally we split into the three parts 739 assert len(cols) == 3, "I don't understand RX line %s" % line 740 self.data.references[-1].references.append( 741 (cols[1].rstrip(_CHOMP), cols[2].rstrip(_CHOMP)))742 747749 assert self.data.references, "RT: missing RN" 750 ref = self.data.references[-1] 751 ref.title += line[5:]752754 assert self.data.references, "RL: missing RN" 755 ref = self.data.references[-1] 756 ref.location += line[5:]757759 if line[5:8] == '-!-': # Make a new comment 760 self.data.comments.append(line[9:]) 761 elif line[5:8] == ' ': # add to the previous comment 762 if not self.data.comments: 763 # TCMO_STRGA in Release 37 has comment with no topic 764 self.data.comments.append(line[9:]) 765 else: 766 self.data.comments[-1] += line[9:] 767 elif line[5:8] == '---': 768 # If there are no comments, and it's not the closing line, 769 # make a new comment. 770 if not self.data.comments or self.data.comments[-1][:3] != '---': 771 self.data.comments.append(line[5:]) 772 else: 773 self.data.comments[-1] += line[5:] 774 else: # copyright notice 775 self.data.comments[-1] += line[5:]776778 # From CLD1_HUMAN, Release 39: 779 # DR EMBL; [snip]; -. [EMBL / GenBank / DDBJ] [CoDingSequence] 780 # DR PRODOM [Domain structure / List of seq. sharing at least 1 domai 781 # DR SWISS-2DPAGE; GET REGION ON 2D PAGE. 782 line = line[5:] 783 # Remove the comments at the end of the line 784 i = line.find('[') 785 if i >= 0: 786 line = line[:i] 787 cols = line.rstrip(_CHOMP).split(';') 788 cols = [col.lstrip() for col in cols] 789 self.data.cross_references.append(tuple(cols))790792 cols = line[5:].rstrip(_CHOMP).split(';') 793 self.data.keywords.extend([c.lstrip() for c in cols])794796 line = line[5:] # get rid of junk in front 797 name = line[0:8].rstrip() 798 try: 799 from_res = int(line[9:15]) 800 except ValueError: 801 from_res = line[9:15].lstrip() 802 try: 803 to_res = int(line[16:22]) 804 except ValueError: 805 to_res = line[16:22].lstrip() 806 description = line[29:70].rstrip() 807 #if there is a feature_id (FTId), store it away 808 if line[29:35]==r"/FTId=": 809 ft_id = line[35:70].rstrip()[:-1] 810 else: 811 ft_id ="" 812 if not name: # is continuation of last one 813 assert not from_res and not to_res 814 name, from_res, to_res, old_description,old_ft_id = self.data.features[-1] 815 del self.data.features[-1] 816 description = "%s %s" % (old_description, description) 817 818 # special case -- VARSPLIC, reported by edvard@farmasi.uit.no 819 if name == "VARSPLIC": 820 description = self._fix_varsplic_sequences(description) 821 self.data.features.append((name, from_res, to_res, description,ft_id))822824 """Remove unwanted spaces in sequences. 825 826 During line carryover, the sequences in VARSPLIC can get mangled 827 with unwanted spaces like: 828 'DISSTKLQALPSHGLESIQT -> PCRATGWSPFRRSSPC LPTH' 829 We want to check for this case and correct it as it happens. 830 """ 831 descr_cols = description.split(" -> ") 832 if len(descr_cols) == 2: 833 first_seq = descr_cols[0] 834 second_seq = descr_cols[1] 835 extra_info = '' 836 # we might have more information at the end of the 837 # second sequence, which should be in parenthesis 838 extra_info_pos = second_seq.find(" (") 839 if extra_info_pos != -1: 840 extra_info = second_seq[extra_info_pos:] 841 second_seq = second_seq[:extra_info_pos] 842 843 # now clean spaces out of the first and second string 844 first_seq = first_seq.replace(" ", "") 845 second_seq = second_seq.replace(" ", "") 846 847 # reassemble the description 848 description = first_seq + " -> " + second_seq + extra_info 849 850 return description851 855857 cols = line.split() 858 assert len(cols) == 8, "I don't understand SQ line %s" % line 859 # Do more checking here? 860 self.data.seqinfo = int(cols[2]), int(cols[4]), cols[6]861863 #It should be faster to make a list of strings, and join them at the end. 864 self._sequence_lines.append(line.replace(" ", "").rstrip())865 868 869 #def _clean(self, line, rstrip=1): 870 # if rstrip: 871 # return string.rstrip(line[5:]) 872 # return line[5:] 873875 # Remove trailing newlines 876 members = ['description', 'gene_name', 'organism', 'organelle'] 877 for m in members: 878 attr = getattr(rec, m) 879 setattr(rec, m, attr.rstrip()) 880 for ref in rec.references: 881 self._clean_references(ref)882884 # Remove trailing newlines 885 members = ['authors', 'title', 'location'] 886 for m in members: 887 attr = getattr(ref, m) 888 setattr(ref, m, attr.rstrip())891 """Consumer that converts a SwissProt record to a SeqRecord object. 892 893 Members: 894 data Record with SwissProt data. 895 alphabet The alphabet the generated Seq objects will have. 896 """ 897 #TODO - Cope with references as done for GenBank1191899 """Initialize a Sequence Consumer 900 901 Arguments: 902 o alphabet - The alphabet to use for the generated Seq objects. If 903 not supplied this will default to the generic protein alphabet. 904 """ 905 self.data = None 906 self.alphabet = alphabet907909 seq = Seq.Seq("", self.alphabet) 910 self.data = SeqRecord.SeqRecord(seq) 911 self.data.description = "" 912 self.data.name = "" 913 self._current_ref = None 914 self._sequence_lines = []915917 if self._current_ref is not None: 918 self.data.annotations['references'].append(self._current_ref) 919 self._current_ref = None 920 self.data.description = self.data.description.rstrip() 921 self.data.seq = Seq.Seq("".join(self._sequence_lines), self.alphabet) 922 self.data.annotations['organism'] = self.data.annotations['organism'].rstrip(_CHOMP)923 927929 #Note that files can and often do contain multiple AC lines. 930 ids = line[5:].strip().split(';') 931 #Remove any white space 932 ids = [x.strip() for x in ids if x.strip()] 933 934 #Use the first as the ID, but record them ALL in the annotations 935 try: 936 self.data.annotations['accessions'].extend(ids) 937 except KeyError: 938 self.data.annotations['accessions'] = ids 939 940 #Use the FIRST accession as the ID, not the first on this line! 941 self.data.id = self.data.annotations['accessions'][0]942 #self.data.id = ids[0] 943 946948 #It should be faster to make a list of strings, and join them at the end. 949 self._sequence_lines.append(line.replace(" ", "").rstrip())950952 #We already store the identification/accession as the records name/id 953 try: 954 self.data.annotations['gene_name'] += " " + line[5:].rstrip() 955 except KeyError: 956 self.data.annotations['gene_name'] = line[5:].rstrip()957959 #Try and agree with SeqRecord convention from the GenBank parser, 960 #which stores the comments as a long string with newlines 961 #with key 'comment' 962 #TODO - Follow SwissProt conventions more closely? 963 prefix = line[5:8] 964 text = line[9:].rstrip() 965 if prefix == '-!-': # Make a new comment 966 try: 967 self.data.annotations['comment'] += "\n" + text 968 except KeyError: 969 self.data.annotations['comment'] = text 970 elif prefix == ' ': 971 try: 972 # add to the previous comment 973 self.data.annotations['comment'] += " " + text 974 except KeyError: 975 # TCMO_STRGA in Release 37 has comment with no topic 976 self.data.annotations['comment'] = text977979 #Format of the line is described in the manual dated 04-Dec-2007 as: 980 #DR DATABASE; PRIMARY; SECONDARY[; TERTIARY][; QUATERNARY]. 981 #However, some older files only seem to have a single identifier: 982 #DR DATABASE; PRIMARY. 983 # 984 #Also must cope with things like this from Tests/SwissProt/sp007, 985 #DR PRODOM [Domain structure / List of seq. sharing at least 1 domain] 986 # 987 #Store these in the dbxref list, but for consistency with 988 #the GenBank parser and with what BioSQL can cope with, 989 #store only DATABASE_IDENTIFIER:PRIMARY_IDENTIFIER 990 parts = [x.strip() for x in line[5:].strip(_CHOMP).split(";")] 991 if len(parts) > 1: 992 value = "%s:%s" % (parts[0], parts[1]) 993 #Avoid duplicate entries 994 if value not in self.data.dbxrefs: 995 self.data.dbxrefs.append(value)996 #else: 997 #print "Bad DR line:\n%s" % line 998 9991001 date_str = line.split()[1] 1002 uprline = line.upper() 1003 if uprline.find('CREATED') >= 0: 1004 #Try and agree with SeqRecord convention from the GenBank parser, 1005 #which stores the submitted date as 'date' 1006 self.data.annotations['date'] = date_str 1007 elif uprline.find('LAST SEQUENCE UPDATE') >= 0: 1008 #There is no existing convention from the GenBank SeqRecord parser 1009 self.data.annotations['date_last_sequence_update'] = date_str 1010 elif uprline.find('LAST ANNOTATION UPDATE') >= 0: 1011 #There is no existing convention from the GenBank SeqRecord parser 1012 self.data.annotations['date_last_annotation_update'] = date_str 1013 elif uprline.find('INTEGRATED INTO') >= 0: 1014 self.data.annotations['date'] = date_str.rstrip(",") 1015 elif uprline.find('SEQUENCE VERSION') >= 0: 1016 self.data.annotations['date_last_sequence_update'] = date_str.rstrip(",") 1017 elif uprline.find('ENTRY VERSION') >= 0: 1018 self.data.annotations['date_last_annotation_update'] = date_str.rstrip(",")10191021 #Try and agree with SeqRecord convention from the GenBank parser, 1022 #which stores a list as 'keywords' 1023 cols = line[5:].rstrip(_CHOMP).split(';') 1024 cols = [c.strip() for c in cols] 1025 cols = filter(None, cols) 1026 try: 1027 #Extend any existing list of keywords 1028 self.data.annotations['keywords'].extend(cols) 1029 except KeyError: 1030 #Create the list of keywords 1031 self.data.annotations['keywords'] = cols10321034 #Try and agree with SeqRecord convention from the GenBank parser, 1035 #which stores the organism as a string with key 'organism' 1036 data = line[5:].rstrip() 1037 try: 1038 #Append to any existing data split over multiple lines 1039 self.data.annotations['organism'] += " " + data 1040 except KeyError: 1041 self.data.annotations['organism'] = data10421044 #There is no SeqRecord convention from the GenBank parser, 1045 data = line[5:].rstrip(_CHOMP) 1046 index = data.find('=') 1047 if index >= 0: 1048 descr = data[:index] 1049 assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr 1050 ids = data[index+1:].split(',') 1051 else: 1052 ids = data.split(',') 1053 1054 try: 1055 #Append to any existing data 1056 self.data.annotations['organism_host'].extend(ids) 1057 except KeyError: 1058 self.data.annotations['organism_host'] = ids10591061 #Try and agree with SeqRecord convention from the GenBank parser, 1062 #which stores this taxonomy lineage ese as a list of strings with 1063 #key 'taxonomy'. 1064 #Note that 'ncbi_taxid' is used for the taxonomy ID (line OX) 1065 line = line[5:].rstrip(_CHOMP) 1066 cols = [col.strip() for col in line.split(';')] 1067 try: 1068 #Append to any existing data 1069 self.data.annotations['taxonomy'].extend(cols) 1070 except KeyError: 1071 self.data.annotations['taxonomy'] = cols10721074 #Try and agree with SeqRecord convention expected in BioSQL 1075 #the NCBI taxon id with key 'ncbi_taxid'. 1076 #Note that 'taxonomy' is used for the taxonomy lineage 1077 #(held as a list of strings, line type OC) 1078 1079 line = line[5:].rstrip(_CHOMP) 1080 index = line.find('=') 1081 if index >= 0: 1082 descr = line[:index] 1083 assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr 1084 ids = line[index+1:].split(',') 1085 else: 1086 ids = line.split(',') 1087 1088 try: 1089 #Append to any existing data 1090 self.data.annotations['ncbi_taxid'].extend(ids) 1091 except KeyError: 1092 self.data.annotations['ncbi_taxid'] = ids10931095 """RN line, reference number (start of new reference).""" 1096 from Bio.SeqFeature import Reference 1097 # if we have a current reference that hasn't been added to 1098 # the list of references, add it. 1099 if self._current_ref is not None: 1100 self.data.annotations['references'].append(self._current_ref) 1101 else: 1102 self.data.annotations['references'] = [] 1103 1104 self._current_ref = Reference()11051107 """RP line, reference position.""" 1108 assert self._current_ref is not None, "RP: missing RN" 1109 #Should try and store this in self._current_ref.location 1110 #but the SwissProt locations don't match easily to the 1111 #format used in GenBank... 1112 pass11131115 """RX line, reference cross-references.""" 1116 assert self._current_ref is not None, "RX: missing RN" 1117 # The basic (older?) RX line is of the form: 1118 # RX MEDLINE; 85132727. 1119 # or more recently: 1120 # RX MEDLINE=95385798; PubMed=7656980; 1121 # RX PubMed=15060122; DOI=10.1136/jmg 2003.012781; 1122 # We look for these cases first and deal with them 1123 if line.find("=") != -1: 1124 cols = line[2:].split("; ") 1125 cols = [x.strip() for x in cols] 1126 cols = [x for x in cols if x] 1127 for col in cols: 1128 x = col.split("=") 1129 assert len(x) == 2, "I don't understand RX line %s" % line 1130 key, value = x[0].rstrip(_CHOMP), x[1].rstrip(_CHOMP) 1131 if key == "MEDLINE": 1132 self._current_ref.medline_id = value 1133 elif key == "PubMed": 1134 self._current_ref.pubmed_id = value 1135 else: 1136 #Sadly the SeqFeature.Reference object doesn't 1137 #support anything else (yet) 1138 pass 1139 # otherwise we assume we have the type 'RX MEDLINE; 85132727.' 1140 else: 1141 # CLD1_HUMAN in Release 39 and DADR_DIDMA in Release 33 1142 # have extraneous information in the RX line. Check for 1143 # this and chop it out of the line. 1144 # (noticed by katel@worldpath.net) 1145 ind = line.find('[NCBI, ExPASy, Israel, Japan]') 1146 if ind >= 0: 1147 line = line[:ind] 1148 cols = line.split() 1149 # normally we split into the three parts 1150 assert len(cols) == 3, "I don't understand RX line %s" % line 1151 key = cols[1].rstrip(_CHOMP) 1152 value = cols[2].rstrip(_CHOMP) 1153 if key == "MEDLINE": 1154 self._current_ref.medline_id = value 1155 elif key == "PubMed": 1156 self._current_ref.pubmed_id = value 1157 else: 1158 #Sadly the SeqFeature.Reference object doesn't 1159 #support anything else (yet) 1160 pass1161 11681170 """RT line, reference title.""" 1171 assert self._current_ref is not None, "RT: missing RN" 1172 if self._current_ref.title: 1173 self._current_ref.title += " " 1174 self._current_ref.title += line[5:].rstrip("\n")11751177 """RL line, reference 'location' - journal, volume, pages, year.""" 1178 assert self._current_ref is not None, "RL: missing RN" 1179 if self._current_ref.journal: 1180 self._current_ref.journal += " " 1181 self._current_ref.journal += line[5:].rstrip("\n")11821184 """RC line, reference comment.""" 1185 assert self._current_ref is not None, "RC: missing RN" 1186 #This has a key=value; structure... 1187 #Can we do a better job with the current Reference class? 1188 if self._current_ref.comment: 1189 self._current_ref.comment += " " 1190 self._current_ref.comment += line[5:].rstrip("\n")1193 """index_file(filename, indexname, rec2key=None) 1194 1195 Index a SwissProt file. filename is the name of the file. 1196 indexname is the name of the dictionary. rec2key is an 1197 optional callback that takes a Record and generates a unique key 1198 (e.g. the accession number) for the record. If not specified, 1199 the entry name will be used. 1200 1201 """ 1202 from Bio.SwissProt import parse 1203 if not os.path.exists(filename): 1204 raise ValueError("%s does not exist" % filename) 1205 1206 index = Index.Index(indexname, truncate=1) 1207 index[Dictionary._Dictionary__filename_key] = filename 1208 1209 handle = open(filename) 1210 records = parse(handle) 1211 end = 0L 1212 for record in records: 1213 start = end 1214 end = long(handle.tell()) 1215 length = end - start 1216 1217 if rec2key is not None: 1218 key = rec2key(record) 1219 else: 1220 key = record.entry_name 1221 1222 if not key: 1223 raise KeyError("empty sequence key was produced") 1224 elif key in index: 1225 raise KeyError("duplicate key %s found" % key) 1226 1227 index[key] = start, length1228
Trees | Indices | Help |
---|
Generated by Epydoc 3.0.1 on Wed Jul 14 01:51:01 2010 | http://epydoc.sourceforge.net |