Package Bio :: Package Blast :: Module NCBIStandalone
[hide private]
[frames] | no frames]

Source Code for Module Bio.Blast.NCBIStandalone

   1  # Copyright 1999-2000 by Jeffrey Chang.  All rights reserved. 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license.  Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5  # Patches by Mike Poidinger to support multiple databases. 
   6  # Updated by Peter Cock in 2007 to do a better job on BLAST 2.2.15 
   7   
   8  """Code for calling standalone BLAST and parsing plain text output (OBSOLETE). 
   9   
  10  Rather than parsing the human readable plain text BLAST output (which seems to 
  11  change with every update to BLAST), we and the NBCI recommend you parse the 
  12  XML output instead. The plain text parser in this module still works at the 
  13  time of writing, but is considered obsolete and updating it to cope with the 
  14  latest versions of BLAST is not a priority for us. 
  15   
  16  This module also provides code to work with the "legacy" standalone version of 
  17  NCBI BLAST, tools blastall, rpsblast and blastpgp via three helper functions of 
  18  the same name. These functions are very limited for dealing with the output as 
  19  files rather than handles, for which the wrappers in Bio.Blast.Applications are 
  20  prefered. Furthermore, the NCBI themselves regard these command line tools as 
  21  "legacy", and encourage using the new BLAST+ tools instead. Biopython has 
  22  wrappers for these under Bio.Blast.Applications (see the tutorial). 
  23   
  24  Classes: 
  25  LowQualityBlastError     Except that indicates low quality query sequences. 
  26  BlastParser              Parses output from blast. 
  27  BlastErrorParser         Parses output and tries to diagnose possible errors. 
  28  PSIBlastParser           Parses output from psi-blast. 
  29  Iterator                 Iterates over a file of blast results. 
  30   
  31  _Scanner                 Scans output from standalone BLAST. 
  32  _BlastConsumer           Consumes output from blast. 
  33  _PSIBlastConsumer        Consumes output from psi-blast. 
  34  _HeaderConsumer          Consumes header information. 
  35  _DescriptionConsumer     Consumes description information. 
  36  _AlignmentConsumer       Consumes alignment information. 
  37  _HSPConsumer             Consumes hsp information. 
  38  _DatabaseReportConsumer  Consumes database report information. 
  39  _ParametersConsumer      Consumes parameters information. 
  40   
  41  Functions: 
  42  blastall        Execute blastall (OBSOLETE). 
  43  blastpgp        Execute blastpgp (OBSOLETE). 
  44  rpsblast        Execute rpsblast (OBSOLETE). 
  45   
  46  For calling the BLAST command line tools, we encourage you to use the 
  47  command line wrappers in Bio.Blast.Applications - the three functions 
  48  blastall, blastpgp and rpsblast are considered to be obsolete now, and 
  49  are likely to be deprecated and then removed in future releases. 
  50  """ 
  51   
  52  import os 
  53  import re 
  54   
  55  from Bio import File 
  56  from Bio.ParserSupport import * 
  57  from Bio.Blast import Record 
  58  from Bio.Application import _escape_filename 
  59   
60 -class LowQualityBlastError(Exception):
61 """Error caused by running a low quality sequence through BLAST. 62 63 When low quality sequences (like GenBank entries containing only 64 stretches of a single nucleotide) are BLASTed, they will result in 65 BLAST generating an error and not being able to perform the BLAST. 66 search. This error should be raised for the BLAST reports produced 67 in this case. 68 """ 69 pass
70
71 -class ShortQueryBlastError(Exception):
72 """Error caused by running a short query sequence through BLAST. 73 74 If the query sequence is too short, BLAST outputs warnings and errors: 75 Searching[blastall] WARNING: [000.000] AT1G08320: SetUpBlastSearch failed. 76 [blastall] ERROR: [000.000] AT1G08320: Blast: 77 [blastall] ERROR: [000.000] AT1G08320: Blast: Query must be at least wordsize 78 done 79 80 This exception is raised when that condition is detected. 81 82 """ 83 pass
84 85
86 -class _Scanner:
87 """Scan BLAST output from blastall or blastpgp. 88 89 Tested with blastall and blastpgp v2.0.10, v2.0.11 90 91 Methods: 92 feed Feed data into the scanner. 93 94 """
95 - def feed(self, handle, consumer):
96 """S.feed(handle, consumer) 97 98 Feed in a BLAST report for scanning. handle is a file-like 99 object that contains the BLAST report. consumer is a Consumer 100 object that will receive events as the report is scanned. 101 102 """ 103 if isinstance(handle, File.UndoHandle): 104 uhandle = handle 105 else: 106 uhandle = File.UndoHandle(handle) 107 108 # Try to fast-forward to the beginning of the blast report. 109 read_and_call_until(uhandle, consumer.noevent, contains='BLAST') 110 # Now scan the BLAST report. 111 self._scan_header(uhandle, consumer) 112 self._scan_rounds(uhandle, consumer) 113 self._scan_database_report(uhandle, consumer) 114 self._scan_parameters(uhandle, consumer)
115
116 - def _scan_header(self, uhandle, consumer):
117 # BLASTP 2.0.10 [Aug-26-1999] 118 # 119 # 120 # Reference: Altschul, Stephen F., Thomas L. Madden, Alejandro A. Schaf 121 # Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), 122 # "Gapped BLAST and PSI-BLAST: a new generation of protein database sea 123 # programs", Nucleic Acids Res. 25:3389-3402. 124 # 125 # Query= test 126 # (140 letters) 127 # 128 # Database: sdqib40-1.35.seg.fa 129 # 1323 sequences; 223,339 total letters 130 # 131 # ======================================================== 132 # This next example is from the online version of Blast, 133 # note there are TWO references, an RID line, and also 134 # the database is BEFORE the query line. 135 # Note there possibleuse of non-ASCII in the author names. 136 # ======================================================== 137 # 138 # BLASTP 2.2.15 [Oct-15-2006] 139 # Reference: Altschul, Stephen F., Thomas L. Madden, Alejandro A. Sch??ffer, 140 # Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman 141 # (1997), "Gapped BLAST and PSI-BLAST: a new generation of 142 # protein database search programs", Nucleic Acids Res. 25:3389-3402. 143 # 144 # Reference: Sch??ffer, Alejandro A., L. Aravind, Thomas L. Madden, Sergei 145 # Shavirin, John L. Spouge, Yuri I. Wolf, Eugene V. Koonin, and 146 # Stephen F. Altschul (2001), "Improving the accuracy of PSI-BLAST 147 # protein database searches with composition-based statistics 148 # and other refinements", Nucleic Acids Res. 29:2994-3005. 149 # 150 # RID: 1166022616-19998-65316425856.BLASTQ1 151 # 152 # 153 # Database: All non-redundant GenBank CDS 154 # translations+PDB+SwissProt+PIR+PRF excluding environmental samples 155 # 4,254,166 sequences; 1,462,033,012 total letters 156 # Query= gi:16127998 157 # Length=428 158 # 159 160 consumer.start_header() 161 162 read_and_call(uhandle, consumer.version, contains='BLAST') 163 read_and_call_while(uhandle, consumer.noevent, blank=1) 164 165 # There might be a <pre> line, for qblast output. 166 attempt_read_and_call(uhandle, consumer.noevent, start="<pre>") 167 168 # Read the reference(s) 169 while attempt_read_and_call(uhandle, 170 consumer.reference, start='Reference'): 171 # References are normally multiline terminated by a blank line 172 # (or, based on the old code, the RID line) 173 while 1: 174 line = uhandle.readline() 175 if is_blank_line(line): 176 consumer.noevent(line) 177 break 178 elif line.startswith("RID"): 179 break 180 else: 181 #More of the reference 182 consumer.reference(line) 183 184 #Deal with the optional RID: ... 185 read_and_call_while(uhandle, consumer.noevent, blank=1) 186 attempt_read_and_call(uhandle, consumer.reference, start="RID:") 187 read_and_call_while(uhandle, consumer.noevent, blank=1) 188 189 # blastpgp may have a reference for compositional score matrix 190 # adjustment (see Bug 2502): 191 if attempt_read_and_call( 192 uhandle, consumer.reference, start="Reference"): 193 read_and_call_until(uhandle, consumer.reference, blank=1) 194 read_and_call_while(uhandle, consumer.noevent, blank=1) 195 196 # blastpgp has a Reference for composition-based statistics. 197 if attempt_read_and_call( 198 uhandle, consumer.reference, start="Reference"): 199 read_and_call_until(uhandle, consumer.reference, blank=1) 200 read_and_call_while(uhandle, consumer.noevent, blank=1) 201 202 line = uhandle.peekline() 203 assert line.strip() != "" 204 assert not line.startswith("RID:") 205 if line.startswith("Query="): 206 #This is an old style query then database... 207 208 # Read the Query lines and the following blank line. 209 read_and_call(uhandle, consumer.query_info, start='Query=') 210 read_and_call_until(uhandle, consumer.query_info, blank=1) 211 read_and_call_while(uhandle, consumer.noevent, blank=1) 212 213 # Read the database lines and the following blank line. 214 read_and_call_until(uhandle, consumer.database_info, end='total letters') 215 read_and_call(uhandle, consumer.database_info, contains='sequences') 216 read_and_call_while(uhandle, consumer.noevent, blank=1) 217 elif line.startswith("Database:"): 218 #This is a new style database then query... 219 read_and_call_until(uhandle, consumer.database_info, end='total letters') 220 read_and_call(uhandle, consumer.database_info, contains='sequences') 221 read_and_call_while(uhandle, consumer.noevent, blank=1) 222 223 # Read the Query lines and the following blank line. 224 # Or, on BLAST 2.2.22+ there is no blank link - need to spot 225 # the "... Score E" line instead. 226 read_and_call(uhandle, consumer.query_info, start='Query=') 227 #read_and_call_until(uhandle, consumer.query_info, blank=1) 228 while True: 229 line = uhandle.peekline() 230 if not line.strip() : break 231 if "Score E" in line : break 232 #It is more of the query (and its length) 233 read_and_call(uhandle, consumer.query_info) 234 read_and_call_while(uhandle, consumer.noevent, blank=1) 235 else: 236 raise ValueError("Invalid header?") 237 238 consumer.end_header()
239
240 - def _scan_rounds(self, uhandle, consumer):
241 # Scan a bunch of rounds. 242 # Each round begins with either a "Searching......" line 243 # or a 'Score E' line followed by descriptions and alignments. 244 # The email server doesn't give the "Searching....." line. 245 # If there is no 'Searching.....' line then you'll first see a 246 # 'Results from round' line 247 248 while not self._eof(uhandle): 249 line = safe_peekline(uhandle) 250 if (not line.startswith('Searching') and 251 not line.startswith('Results from round') and 252 re.search(r"Score +E", line) is None and 253 line.find('No hits found') == -1): 254 break 255 self._scan_descriptions(uhandle, consumer) 256 self._scan_alignments(uhandle, consumer)
257
258 - def _scan_descriptions(self, uhandle, consumer):
259 # Searching..................................................done 260 # Results from round 2 261 # 262 # 263 # Sc 264 # Sequences producing significant alignments: (b 265 # Sequences used in model and found again: 266 # 267 # d1tde_2 3.4.1.4.4 (119-244) Thioredoxin reductase [Escherichia ... 268 # d1tcob_ 1.31.1.5.16 Calcineurin regulatory subunit (B-chain) [B... 269 # d1symb_ 1.31.1.2.2 Calcyclin (S100) [RAT (RATTUS NORVEGICUS)] 270 # 271 # Sequences not found previously or not previously below threshold: 272 # 273 # d1osa__ 1.31.1.5.11 Calmodulin [Paramecium tetraurelia] 274 # d1aoza3 2.5.1.3.3 (339-552) Ascorbate oxidase [zucchini (Cucurb... 275 # 276 277 # If PSI-BLAST, may also have: 278 # 279 # CONVERGED! 280 281 consumer.start_descriptions() 282 283 # Read 'Searching' 284 # This line seems to be missing in BLASTN 2.1.2 (others?) 285 attempt_read_and_call(uhandle, consumer.noevent, start='Searching') 286 287 # blastpgp 2.0.10 from NCBI 9/19/99 for Solaris sometimes crashes here. 288 # If this happens, the handle will yield no more information. 289 if not uhandle.peekline(): 290 raise ValueError("Unexpected end of blast report. " + \ 291 "Looks suspiciously like a PSI-BLAST crash.") 292 293 # BLASTN 2.2.3 sometimes spews a bunch of warnings and errors here: 294 # Searching[blastall] WARNING: [000.000] AT1G08320: SetUpBlastSearch 295 # [blastall] ERROR: [000.000] AT1G08320: Blast: 296 # [blastall] ERROR: [000.000] AT1G08320: Blast: Query must be at leas 297 # done 298 # Reported by David Weisman. 299 # Check for these error lines and ignore them for now. Let 300 # the BlastErrorParser deal with them. 301 line = uhandle.peekline() 302 if line.find("ERROR:") != -1 or line.startswith("done"): 303 read_and_call_while(uhandle, consumer.noevent, contains="ERROR:") 304 read_and_call(uhandle, consumer.noevent, start="done") 305 306 # Check to see if this is PSI-BLAST. 307 # If it is, the 'Searching' line will be followed by: 308 # (version 2.0.10) 309 # Searching............................. 310 # Results from round 2 311 # or (version 2.0.11) 312 # Searching............................. 313 # 314 # 315 # Results from round 2 316 317 # Skip a bunch of blank lines. 318 read_and_call_while(uhandle, consumer.noevent, blank=1) 319 # Check for the results line if it's there. 320 if attempt_read_and_call(uhandle, consumer.round, start='Results'): 321 read_and_call_while(uhandle, consumer.noevent, blank=1) 322 323 # Three things can happen here: 324 # 1. line contains 'Score E' 325 # 2. line contains "No hits found" 326 # 3. no descriptions 327 # The first one begins a bunch of descriptions. The last two 328 # indicates that no descriptions follow, and we should go straight 329 # to the alignments. 330 if not attempt_read_and_call( 331 uhandle, consumer.description_header, 332 has_re=re.compile(r'Score +E')): 333 # Either case 2 or 3. Look for "No hits found". 334 attempt_read_and_call(uhandle, consumer.no_hits, 335 contains='No hits found') 336 try: 337 read_and_call_while(uhandle, consumer.noevent, blank=1) 338 except ValueError, err: 339 if str(err) != "Unexpected end of stream." : raise err 340 341 consumer.end_descriptions() 342 # Stop processing. 343 return 344 345 # Read the score header lines 346 read_and_call(uhandle, consumer.description_header, 347 start='Sequences producing') 348 349 # If PSI-BLAST, read the 'Sequences used in model' line. 350 attempt_read_and_call(uhandle, consumer.model_sequences, 351 start='Sequences used in model') 352 read_and_call_while(uhandle, consumer.noevent, blank=1) 353 354 # In BLAT, rather than a "No hits found" line, we just 355 # get no descriptions (and no alignments). This can be 356 # spotted because the next line is the database block: 357 if safe_peekline(uhandle).startswith(" Database:"): 358 consumer.end_descriptions() 359 # Stop processing. 360 return 361 362 # Read the descriptions and the following blank lines, making 363 # sure that there are descriptions. 364 if not uhandle.peekline().startswith('Sequences not found'): 365 read_and_call_until(uhandle, consumer.description, blank=1) 366 read_and_call_while(uhandle, consumer.noevent, blank=1) 367 368 # If PSI-BLAST, read the 'Sequences not found' line followed 369 # by more descriptions. However, I need to watch out for the 370 # case where there were no sequences not found previously, in 371 # which case there will be no more descriptions. 372 if attempt_read_and_call(uhandle, consumer.nonmodel_sequences, 373 start='Sequences not found'): 374 # Read the descriptions and the following blank lines. 375 read_and_call_while(uhandle, consumer.noevent, blank=1) 376 l = safe_peekline(uhandle) 377 # Brad -- added check for QUERY. On some PSI-BLAST outputs 378 # there will be a 'Sequences not found' line followed by no 379 # descriptions. Check for this case since the first thing you'll 380 # get is a blank line and then 'QUERY' 381 if not l.startswith('CONVERGED') and l[0] != '>' \ 382 and not l.startswith('QUERY'): 383 read_and_call_until(uhandle, consumer.description, blank=1) 384 read_and_call_while(uhandle, consumer.noevent, blank=1) 385 386 attempt_read_and_call(uhandle, consumer.converged, start='CONVERGED') 387 read_and_call_while(uhandle, consumer.noevent, blank=1) 388 389 consumer.end_descriptions()
390
391 - def _scan_alignments(self, uhandle, consumer):
392 if self._eof(uhandle) : return 393 394 # qblast inserts a helpful line here. 395 attempt_read_and_call(uhandle, consumer.noevent, start="ALIGNMENTS") 396 397 # First, check to see if I'm at the database report. 398 line = safe_peekline(uhandle) 399 if not line: 400 #EOF 401 return 402 elif line.startswith(' Database') or line.startswith("Lambda"): 403 return 404 elif line[0] == '>': 405 # XXX make a better check here between pairwise and masterslave 406 self._scan_pairwise_alignments(uhandle, consumer) 407 else: 408 # XXX put in a check to make sure I'm in a masterslave alignment 409 self._scan_masterslave_alignment(uhandle, consumer)
410
411 - def _scan_pairwise_alignments(self, uhandle, consumer):
412 while not self._eof(uhandle): 413 line = safe_peekline(uhandle) 414 if line[0] != '>': 415 break 416 self._scan_one_pairwise_alignment(uhandle, consumer)
417
418 - def _scan_one_pairwise_alignment(self, uhandle, consumer):
419 if self._eof(uhandle) : return 420 consumer.start_alignment() 421 422 self._scan_alignment_header(uhandle, consumer) 423 424 # Scan a bunch of score/alignment pairs. 425 while 1: 426 if self._eof(uhandle): 427 #Shouldn't have issued that _scan_alignment_header event... 428 break 429 line = safe_peekline(uhandle) 430 if not line.startswith(' Score'): 431 break 432 self._scan_hsp(uhandle, consumer) 433 consumer.end_alignment()
434
435 - def _scan_alignment_header(self, uhandle, consumer):
436 # >d1rip__ 2.24.7.1.1 Ribosomal S17 protein [Bacillus 437 # stearothermophilus] 438 # Length = 81 439 # 440 # Or, more recently with different white space: 441 # 442 # >gi|15799684|ref|NP_285696.1| threonine synthase ... 443 # gi|15829258|ref|NP_308031.1| threonine synthase 444 # ... 445 # Length=428 446 read_and_call(uhandle, consumer.title, start='>') 447 while 1: 448 line = safe_readline(uhandle) 449 if line.lstrip().startswith('Length =') \ 450 or line.lstrip().startswith('Length='): 451 consumer.length(line) 452 break 453 elif is_blank_line(line): 454 # Check to make sure I haven't missed the Length line 455 raise ValueError("I missed the Length in an alignment header") 456 consumer.title(line) 457 458 # Older versions of BLAST will have a line with some spaces. 459 # Version 2.0.14 (maybe 2.0.13?) and above print a true blank line. 460 if not attempt_read_and_call(uhandle, consumer.noevent, 461 start=' '): 462 read_and_call(uhandle, consumer.noevent, blank=1)
463
464 - def _scan_hsp(self, uhandle, consumer):
465 consumer.start_hsp() 466 self._scan_hsp_header(uhandle, consumer) 467 self._scan_hsp_alignment(uhandle, consumer) 468 consumer.end_hsp()
469
470 - def _scan_hsp_header(self, uhandle, consumer):
471 # Score = 22.7 bits (47), Expect = 2.5 472 # Identities = 10/36 (27%), Positives = 18/36 (49%) 473 # Strand = Plus / Plus 474 # Frame = +3 475 # 476 477 read_and_call(uhandle, consumer.score, start=' Score') 478 read_and_call(uhandle, consumer.identities, start=' Identities') 479 # BLASTN 480 attempt_read_and_call(uhandle, consumer.strand, start = ' Strand') 481 # BLASTX, TBLASTN, TBLASTX 482 attempt_read_and_call(uhandle, consumer.frame, start = ' Frame') 483 read_and_call(uhandle, consumer.noevent, blank=1)
484
485 - def _scan_hsp_alignment(self, uhandle, consumer):
486 # Query: 11 GRGVSACA-------TCDGFFYRNQKVAVIGGGNTAVEEALYLSNIASEVHLIHRRDGF 487 # GRGVS+ TC Y + + V GGG+ + EE L + I R+ 488 # Sbjct: 12 GRGVSSVVRRCIHKPTCKE--YAVKIIDVTGGGSFSAEEVQELREATLKEVDILRKVSG 489 # 490 # Query: 64 AEKILIKR 71 491 # I +K 492 # Sbjct: 70 PNIIQLKD 77 493 # 494 495 while 1: 496 # Blastn adds an extra line filled with spaces before Query 497 attempt_read_and_call(uhandle, consumer.noevent, start=' ') 498 read_and_call(uhandle, consumer.query, start='Query') 499 read_and_call(uhandle, consumer.align, start=' ') 500 read_and_call(uhandle, consumer.sbjct, start='Sbjct') 501 try: 502 read_and_call_while(uhandle, consumer.noevent, blank=1) 503 except ValueError, err: 504 if str(err) != "Unexpected end of stream." : raise err 505 # End of File (well, it looks like it with recent versions 506 # of BLAST for multiple queries after the Iterator class 507 # has broken up the whole file into chunks). 508 break 509 line = safe_peekline(uhandle) 510 # Alignment continues if I see a 'Query' or the spaces for Blastn. 511 if not (line.startswith('Query') or line.startswith(' ')): 512 break
513
514 - def _scan_masterslave_alignment(self, uhandle, consumer):
515 consumer.start_alignment() 516 while 1: 517 line = safe_readline(uhandle) 518 # Check to see whether I'm finished reading the alignment. 519 # This is indicated by 1) database section, 2) next psi-blast 520 # round, which can also be a 'Results from round' if no 521 # searching line is present 522 # patch by chapmanb 523 if line.startswith('Searching') or \ 524 line.startswith('Results from round'): 525 uhandle.saveline(line) 526 break 527 elif line.startswith(' Database'): 528 uhandle.saveline(line) 529 break 530 elif is_blank_line(line): 531 consumer.noevent(line) 532 else: 533 consumer.multalign(line) 534 read_and_call_while(uhandle, consumer.noevent, blank=1) 535 consumer.end_alignment()
536
537 - def _eof(self, uhandle):
538 try: 539 line = safe_peekline(uhandle) 540 except ValueError, err: 541 if str(err) != "Unexpected end of stream." : raise err 542 line = "" 543 return not line
544
545 - def _scan_database_report(self, uhandle, consumer):
546 # Database: sdqib40-1.35.seg.fa 547 # Posted date: Nov 1, 1999 4:25 PM 548 # Number of letters in database: 223,339 549 # Number of sequences in database: 1323 550 # 551 # Lambda K H 552 # 0.322 0.133 0.369 553 # 554 # Gapped 555 # Lambda K H 556 # 0.270 0.0470 0.230 557 # 558 ########################################## 559 # Or, more recently Blast 2.2.15 gives less blank lines 560 ########################################## 561 # Database: All non-redundant GenBank CDS translations+PDB+SwissProt+PIR+PRF excluding 562 # environmental samples 563 # Posted date: Dec 12, 2006 5:51 PM 564 # Number of letters in database: 667,088,753 565 # Number of sequences in database: 2,094,974 566 # Lambda K H 567 # 0.319 0.136 0.395 568 # Gapped 569 # Lambda K H 570 # 0.267 0.0410 0.140 571 572 if self._eof(uhandle) : return 573 574 consumer.start_database_report() 575 576 # Subset of the database(s) listed below 577 # Number of letters searched: 562,618,960 578 # Number of sequences searched: 228,924 579 if attempt_read_and_call(uhandle, consumer.noevent, start=" Subset"): 580 read_and_call(uhandle, consumer.noevent, contains="letters") 581 read_and_call(uhandle, consumer.noevent, contains="sequences") 582 read_and_call(uhandle, consumer.noevent, start=" ") 583 584 # Sameet Mehta reported seeing output from BLASTN 2.2.9 that 585 # was missing the "Database" stanza completely. 586 while attempt_read_and_call(uhandle, consumer.database, 587 start=' Database'): 588 # BLAT output ends abruptly here, without any of the other 589 # information. Check to see if this is the case. If so, 590 # then end the database report here gracefully. 591 if not uhandle.peekline().strip() \ 592 or uhandle.peekline().startswith("BLAST"): 593 consumer.end_database_report() 594 return 595 596 # Database can span multiple lines. 597 read_and_call_until(uhandle, consumer.database, start=' Posted') 598 read_and_call(uhandle, consumer.posted_date, start=' Posted') 599 read_and_call(uhandle, consumer.num_letters_in_database, 600 start=' Number of letters') 601 read_and_call(uhandle, consumer.num_sequences_in_database, 602 start=' Number of sequences') 603 #There may not be a line starting with spaces... 604 attempt_read_and_call(uhandle, consumer.noevent, start=' ') 605 606 line = safe_readline(uhandle) 607 uhandle.saveline(line) 608 if line.find('Lambda') != -1: 609 break 610 611 read_and_call(uhandle, consumer.noevent, start='Lambda') 612 read_and_call(uhandle, consumer.ka_params) 613 614 #This blank line is optional: 615 attempt_read_and_call(uhandle, consumer.noevent, blank=1) 616 617 # not BLASTP 618 attempt_read_and_call(uhandle, consumer.gapped, start='Gapped') 619 # not TBLASTX 620 if attempt_read_and_call(uhandle, consumer.noevent, start='Lambda'): 621 read_and_call(uhandle, consumer.ka_params_gap) 622 623 # Blast 2.2.4 can sometimes skip the whole parameter section. 624 # Thus, I need to be careful not to read past the end of the 625 # file. 626 try: 627 read_and_call_while(uhandle, consumer.noevent, blank=1) 628 except ValueError, x: 629 if str(x) != "Unexpected end of stream.": 630 raise 631 consumer.end_database_report()
632
633 - def _scan_parameters(self, uhandle, consumer):
634 # Matrix: BLOSUM62 635 # Gap Penalties: Existence: 11, Extension: 1 636 # Number of Hits to DB: 50604 637 # Number of Sequences: 1323 638 # Number of extensions: 1526 639 # Number of successful extensions: 6 640 # Number of sequences better than 10.0: 5 641 # Number of HSP's better than 10.0 without gapping: 5 642 # Number of HSP's successfully gapped in prelim test: 0 643 # Number of HSP's that attempted gapping in prelim test: 1 644 # Number of HSP's gapped (non-prelim): 5 645 # length of query: 140 646 # length of database: 223,339 647 # effective HSP length: 39 648 # effective length of query: 101 649 # effective length of database: 171,742 650 # effective search space: 17345942 651 # effective search space used: 17345942 652 # T: 11 653 # A: 40 654 # X1: 16 ( 7.4 bits) 655 # X2: 38 (14.8 bits) 656 # X3: 64 (24.9 bits) 657 # S1: 41 (21.9 bits) 658 # S2: 42 (20.8 bits) 659 ########################################## 660 # Or, more recently Blast(x) 2.2.15 gives 661 ########################################## 662 # Matrix: BLOSUM62 663 # Gap Penalties: Existence: 11, Extension: 1 664 # Number of Sequences: 4535438 665 # Number of Hits to DB: 2,588,844,100 666 # Number of extensions: 60427286 667 # Number of successful extensions: 126433 668 # Number of sequences better than 2.0: 30 669 # Number of HSP's gapped: 126387 670 # Number of HSP's successfully gapped: 35 671 # Length of query: 291 672 # Length of database: 1,573,298,872 673 # Length adjustment: 130 674 # Effective length of query: 161 675 # Effective length of database: 983,691,932 676 # Effective search space: 158374401052 677 # Effective search space used: 158374401052 678 # Neighboring words threshold: 12 679 # Window for multiple hits: 40 680 # X1: 16 ( 7.3 bits) 681 # X2: 38 (14.6 bits) 682 # X3: 64 (24.7 bits) 683 # S1: 41 (21.7 bits) 684 # S2: 32 (16.9 bits) 685 686 687 # Blast 2.2.4 can sometimes skip the whole parameter section. 688 # BLAT also skips the whole parameter section. 689 # Thus, check to make sure that the parameter section really 690 # exists. 691 if not uhandle.peekline().strip(): 692 return 693 694 # BLASTN 2.2.9 looks like it reverses the "Number of Hits" and 695 # "Number of Sequences" lines. 696 consumer.start_parameters() 697 698 # Matrix line may be missing in BLASTN 2.2.9 699 attempt_read_and_call(uhandle, consumer.matrix, start='Matrix') 700 # not TBLASTX 701 attempt_read_and_call(uhandle, consumer.gap_penalties, start='Gap') 702 703 attempt_read_and_call(uhandle, consumer.num_sequences, 704 start='Number of Sequences') 705 attempt_read_and_call(uhandle, consumer.num_hits, 706 start='Number of Hits') 707 attempt_read_and_call(uhandle, consumer.num_sequences, 708 start='Number of Sequences') 709 attempt_read_and_call(uhandle, consumer.num_extends, 710 start='Number of extensions') 711 attempt_read_and_call(uhandle, consumer.num_good_extends, 712 start='Number of successful') 713 714 attempt_read_and_call(uhandle, consumer.num_seqs_better_e, 715 start='Number of sequences') 716 717 # not BLASTN, TBLASTX 718 if attempt_read_and_call(uhandle, consumer.hsps_no_gap, 719 start="Number of HSP's better"): 720 # BLASTN 2.2.9 721 if attempt_read_and_call(uhandle, consumer.noevent, 722 start="Number of HSP's gapped:"): 723 read_and_call(uhandle, consumer.noevent, 724 start="Number of HSP's successfully") 725 #This is ommitted in 2.2.15 726 attempt_read_and_call(uhandle, consumer.noevent, 727 start="Number of extra gapped extensions") 728 else: 729 read_and_call(uhandle, consumer.hsps_prelim_gapped, 730 start="Number of HSP's successfully") 731 read_and_call(uhandle, consumer.hsps_prelim_gap_attempted, 732 start="Number of HSP's that") 733 read_and_call(uhandle, consumer.hsps_gapped, 734 start="Number of HSP's gapped") 735 #e.g. BLASTX 2.2.15 where the "better" line is missing 736 elif attempt_read_and_call(uhandle, consumer.noevent, 737 start="Number of HSP's gapped"): 738 read_and_call(uhandle, consumer.noevent, 739 start="Number of HSP's successfully") 740 741 # not in blastx 2.2.1 742 attempt_read_and_call(uhandle, consumer.query_length, 743 has_re=re.compile(r"[Ll]ength of query")) 744 # Not in BLASTX 2.2.22+ 745 attempt_read_and_call(uhandle, consumer.database_length, 746 has_re=re.compile(r"[Ll]ength of \s*[Dd]atabase")) 747 748 # BLASTN 2.2.9 749 attempt_read_and_call(uhandle, consumer.noevent, 750 start="Length adjustment") 751 attempt_read_and_call(uhandle, consumer.effective_hsp_length, 752 start='effective HSP') 753 # Not in blastx 2.2.1 754 attempt_read_and_call( 755 uhandle, consumer.effective_query_length, 756 has_re=re.compile(r'[Ee]ffective length of query')) 757 758 # This is not in BLASTP 2.2.15 759 attempt_read_and_call( 760 uhandle, consumer.effective_database_length, 761 has_re=re.compile(r'[Ee]ffective length of \s*[Dd]atabase')) 762 # Not in blastx 2.2.1, added a ':' to distinguish between 763 # this and the 'effective search space used' line 764 attempt_read_and_call( 765 uhandle, consumer.effective_search_space, 766 has_re=re.compile(r'[Ee]ffective search space:')) 767 # Does not appear in BLASTP 2.0.5 768 attempt_read_and_call( 769 uhandle, consumer.effective_search_space_used, 770 has_re=re.compile(r'[Ee]ffective search space used')) 771 772 # BLASTX, TBLASTN, TBLASTX 773 attempt_read_and_call(uhandle, consumer.frameshift, start='frameshift') 774 775 # not in BLASTN 2.2.9 776 attempt_read_and_call(uhandle, consumer.threshold, start='T') 777 # In BLASTX 2.2.15 replaced by: "Neighboring words threshold: 12" 778 attempt_read_and_call(uhandle, consumer.threshold, start='Neighboring words threshold') 779 780 # not in BLASTX 2.2.15 781 attempt_read_and_call(uhandle, consumer.window_size, start='A') 782 # get this instead: "Window for multiple hits: 40" 783 attempt_read_and_call(uhandle, consumer.window_size, start='Window for multiple hits') 784 785 # not in BLASTX 2.2.22+ 786 attempt_read_and_call(uhandle, consumer.dropoff_1st_pass, start='X1') 787 # not TBLASTN 788 attempt_read_and_call(uhandle, consumer.gap_x_dropoff, start='X2') 789 790 # not BLASTN, TBLASTX 791 attempt_read_and_call(uhandle, consumer.gap_x_dropoff_final, 792 start='X3') 793 794 # not TBLASTN 795 attempt_read_and_call(uhandle, consumer.gap_trigger, start='S1') 796 # not in blastx 2.2.1 797 # first we make sure we have additional lines to work with, if 798 # not then the file is done and we don't have a final S2 799 if not is_blank_line(uhandle.peekline(), allow_spaces=1): 800 read_and_call(uhandle, consumer.blast_cutoff, start='S2') 801 802 consumer.end_parameters()
803
804 -class BlastParser(AbstractParser):
805 """Parses BLAST data into a Record.Blast object. 806 807 """
808 - def __init__(self):
809 """__init__(self)""" 810 self._scanner = _Scanner() 811 self._consumer = _BlastConsumer()
812
813 - def parse(self, handle):
814 """parse(self, handle)""" 815 self._scanner.feed(handle, self._consumer) 816 return self._consumer.data
817
818 -class PSIBlastParser(AbstractParser):
819 """Parses BLAST data into a Record.PSIBlast object. 820 821 """
822 - def __init__(self):
823 """__init__(self)""" 824 self._scanner = _Scanner() 825 self._consumer = _PSIBlastConsumer()
826
827 - def parse(self, handle):
828 """parse(self, handle)""" 829 self._scanner.feed(handle, self._consumer) 830 return self._consumer.data
831
832 -class _HeaderConsumer:
833 - def start_header(self):
834 self._header = Record.Header()
835
836 - def version(self, line):
837 c = line.split() 838 self._header.application = c[0] 839 self._header.version = c[1] 840 if len(c) > 2: 841 #The date is missing in the new C++ output from blastx 2.2.22+ 842 #Just get "BLASTX 2.2.22+\n" and that's all. 843 self._header.date = c[2][1:-1]
844
845 - def reference(self, line):
846 if line.startswith('Reference: '): 847 self._header.reference = line[11:] 848 else: 849 self._header.reference = self._header.reference + line
850
851 - def query_info(self, line):
852 if line.startswith('Query= '): 853 self._header.query = line[7:].lstrip() 854 elif line.startswith('Length='): 855 #New style way to give the query length in BLAST 2.2.22+ (the C++ code) 856 self._header.query_letters = _safe_int(line[7:].strip()) 857 elif not line.startswith(' '): # continuation of query_info 858 self._header.query = "%s%s" % (self._header.query, line) 859 else: 860 #Hope it is the old style way to give the query length: 861 letters, = _re_search( 862 r"([0-9,]+) letters", line, 863 "I could not find the number of letters in line\n%s" % line) 864 self._header.query_letters = _safe_int(letters)
865
866 - def database_info(self, line):
867 line = line.rstrip() 868 if line.startswith('Database: '): 869 self._header.database = line[10:] 870 elif not line.endswith('total letters'): 871 if self._header.database: 872 #Need to include a space when merging multi line datase descr 873 self._header.database = self._header.database + " " + line.strip() 874 else: 875 self._header.database = line.strip() 876 else: 877 sequences, letters =_re_search( 878 r"([0-9,]+) sequences; ([0-9,-]+) total letters", line, 879 "I could not find the sequences and letters in line\n%s" %line) 880 self._header.database_sequences = _safe_int(sequences) 881 self._header.database_letters = _safe_int(letters)
882
883 - def end_header(self):
884 # Get rid of the trailing newlines 885 self._header.reference = self._header.reference.rstrip() 886 self._header.query = self._header.query.rstrip()
887
888 -class _DescriptionConsumer:
889 - def start_descriptions(self):
890 self._descriptions = [] 891 self._model_sequences = [] 892 self._nonmodel_sequences = [] 893 self._converged = 0 894 self._type = None 895 self._roundnum = None 896 897 self.__has_n = 0 # Does the description line contain an N value?
898
899 - def description_header(self, line):
900 if line.startswith('Sequences producing'): 901 cols = line.split() 902 if cols[-1] == 'N': 903 self.__has_n = 1
904
905 - def description(self, line):
906 dh = self._parse(line) 907 if self._type == 'model': 908 self._model_sequences.append(dh) 909 elif self._type == 'nonmodel': 910 self._nonmodel_sequences.append(dh) 911 else: 912 self._descriptions.append(dh)
913
914 - def model_sequences(self, line):
915 self._type = 'model'
916
917 - def nonmodel_sequences(self, line):
918 self._type = 'nonmodel'
919
920 - def converged(self, line):
921 self._converged = 1
922
923 - def no_hits(self, line):
924 pass
925
926 - def round(self, line):
927 if not line.startswith('Results from round'): 928 raise ValueError("I didn't understand the round line\n%s" % line) 929 self._roundnum = _safe_int(line[18:].strip())
930
931 - def end_descriptions(self):
932 pass
933
934 - def _parse(self, description_line):
935 line = description_line # for convenience 936 dh = Record.Description() 937 938 # I need to separate the score and p-value from the title. 939 # sp|P21297|FLBT_CAUCR FLBT PROTEIN [snip] 284 7e-77 940 # sp|P21297|FLBT_CAUCR FLBT PROTEIN [snip] 284 7e-77 1 941 # special cases to handle: 942 # - title must be preserved exactly (including whitespaces) 943 # - score could be equal to e-value (not likely, but what if??) 944 # - sometimes there's an "N" score of '1'. 945 cols = line.split() 946 if len(cols) < 3: 947 raise ValueError( \ 948 "Line does not appear to contain description:\n%s" % line) 949 if self.__has_n: 950 i = line.rfind(cols[-1]) # find start of N 951 i = line.rfind(cols[-2], 0, i) # find start of p-value 952 i = line.rfind(cols[-3], 0, i) # find start of score 953 else: 954 i = line.rfind(cols[-1]) # find start of p-value 955 i = line.rfind(cols[-2], 0, i) # find start of score 956 if self.__has_n: 957 dh.title, dh.score, dh.e, dh.num_alignments = \ 958 line[:i].rstrip(), cols[-3], cols[-2], cols[-1] 959 else: 960 dh.title, dh.score, dh.e, dh.num_alignments = \ 961 line[:i].rstrip(), cols[-2], cols[-1], 1 962 dh.num_alignments = _safe_int(dh.num_alignments) 963 dh.score = _safe_int(dh.score) 964 dh.e = _safe_float(dh.e) 965 return dh
966
967 -class _AlignmentConsumer:
968 # This is a little bit tricky. An alignment can either be a 969 # pairwise alignment or a multiple alignment. Since it's difficult 970 # to know a-priori which one the blast record will contain, I'm going 971 # to make one class that can parse both of them.
972 - def start_alignment(self):
973 self._alignment = Record.Alignment() 974 self._multiple_alignment = Record.MultipleAlignment()
975
976 - def title(self, line):
977 if self._alignment.title: 978 self._alignment.title += " " 979 self._alignment.title += line.strip()
980
981 - def length(self, line):
982 #e.g. "Length = 81" or more recently, "Length=428" 983 parts = line.replace(" ","").split("=") 984 assert len(parts)==2, "Unrecognised format length line" 985 self._alignment.length = parts[1] 986 self._alignment.length = _safe_int(self._alignment.length)
987
988 - def multalign(self, line):
989 # Standalone version uses 'QUERY', while WWW version uses blast_tmp. 990 if line.startswith('QUERY') or line.startswith('blast_tmp'): 991 # If this is the first line of the multiple alignment, 992 # then I need to figure out how the line is formatted. 993 994 # Format of line is: 995 # QUERY 1 acttg...gccagaggtggtttattcagtctccataagagaggggacaaacg 60 996 try: 997 name, start, seq, end = line.split() 998 except ValueError: 999 raise ValueError("I do not understand the line\n%s" % line) 1000 self._start_index = line.index(start, len(name)) 1001 self._seq_index = line.index(seq, 1002 self._start_index+len(start)) 1003 # subtract 1 for the space 1004 self._name_length = self._start_index - 1 1005 self._start_length = self._seq_index - self._start_index - 1 1006 self._seq_length = line.rfind(end) - self._seq_index - 1 1007 1008 #self._seq_index = line.index(seq) 1009 ## subtract 1 for the space 1010 #self._seq_length = line.rfind(end) - self._seq_index - 1 1011 #self._start_index = line.index(start) 1012 #self._start_length = self._seq_index - self._start_index - 1 1013 #self._name_length = self._start_index 1014 1015 # Extract the information from the line 1016 name = line[:self._name_length] 1017 name = name.rstrip() 1018 start = line[self._start_index:self._start_index+self._start_length] 1019 start = start.rstrip() 1020 if start: 1021 start = _safe_int(start) 1022 end = line[self._seq_index+self._seq_length:].rstrip() 1023 if end: 1024 end = _safe_int(end) 1025 seq = line[self._seq_index:self._seq_index+self._seq_length].rstrip() 1026 # right pad the sequence with spaces if necessary 1027 if len(seq) < self._seq_length: 1028 seq = seq + ' '*(self._seq_length-len(seq)) 1029 1030 # I need to make sure the sequence is aligned correctly with the query. 1031 # First, I will find the length of the query. Then, if necessary, 1032 # I will pad my current sequence with spaces so that they will line 1033 # up correctly. 1034 1035 # Two possible things can happen: 1036 # QUERY 1037 # 504 1038 # 1039 # QUERY 1040 # 403 1041 # 1042 # Sequence 504 will need padding at the end. Since I won't know 1043 # this until the end of the alignment, this will be handled in 1044 # end_alignment. 1045 # Sequence 403 will need padding before being added to the alignment. 1046 1047 align = self._multiple_alignment.alignment # for convenience 1048 align.append((name, start, seq, end))
1049 1050 # This is old code that tried to line up all the sequences 1051 # in a multiple alignment by using the sequence title's as 1052 # identifiers. The problem with this is that BLAST assigns 1053 # different HSP's from the same sequence the same id. Thus, 1054 # in one alignment block, there may be multiple sequences with 1055 # the same id. I'm not sure how to handle this, so I'm not 1056 # going to. 1057 1058 # # If the sequence is the query, then just add it. 1059 # if name == 'QUERY': 1060 # if len(align) == 0: 1061 # align.append((name, start, seq)) 1062 # else: 1063 # aname, astart, aseq = align[0] 1064 # if name != aname: 1065 # raise ValueError, "Query is not the first sequence" 1066 # aseq = aseq + seq 1067 # align[0] = aname, astart, aseq 1068 # else: 1069 # if len(align) == 0: 1070 # raise ValueError, "I could not find the query sequence" 1071 # qname, qstart, qseq = align[0] 1072 # 1073 # # Now find my sequence in the multiple alignment. 1074 # for i in range(1, len(align)): 1075 # aname, astart, aseq = align[i] 1076 # if name == aname: 1077 # index = i 1078 # break 1079 # else: 1080 # # If I couldn't find it, then add a new one. 1081 # align.append((None, None, None)) 1082 # index = len(align)-1 1083 # # Make sure to left-pad it. 1084 # aname, astart, aseq = name, start, ' '*(len(qseq)-len(seq)) 1085 # 1086 # if len(qseq) != len(aseq) + len(seq): 1087 # # If my sequences are shorter than the query sequence, 1088 # # then I will need to pad some spaces to make them line up. 1089 # # Since I've already right padded seq, that means aseq 1090 # # must be too short. 1091 # aseq = aseq + ' '*(len(qseq)-len(aseq)-len(seq)) 1092 # aseq = aseq + seq 1093 # if astart is None: 1094 # astart = start 1095 # align[index] = aname, astart, aseq 1096
1097 - def end_alignment(self):
1098 # Remove trailing newlines 1099 if self._alignment: 1100 self._alignment.title = self._alignment.title.rstrip() 1101 1102 # This code is also obsolete. See note above. 1103 # If there's a multiple alignment, I will need to make sure 1104 # all the sequences are aligned. That is, I may need to 1105 # right-pad the sequences. 1106 # if self._multiple_alignment is not None: 1107 # align = self._multiple_alignment.alignment 1108 # seqlen = None 1109 # for i in range(len(align)): 1110 # name, start, seq = align[i] 1111 # if seqlen is None: 1112 # seqlen = len(seq) 1113 # else: 1114 # if len(seq) < seqlen: 1115 # seq = seq + ' '*(seqlen - len(seq)) 1116 # align[i] = name, start, seq 1117 # elif len(seq) > seqlen: 1118 # raise ValueError, \ 1119 # "Sequence %s is longer than the query" % name 1120 1121 # Clean up some variables, if they exist. 1122 try: 1123 del self._seq_index 1124 del self._seq_length 1125 del self._start_index 1126 del self._start_length 1127 del self._name_length 1128 except AttributeError: 1129 pass
1130
1131 -class _HSPConsumer:
1132 - def start_hsp(self):
1133 self._hsp = Record.HSP()
1134
1135 - def score(self, line):
1136 self._hsp.bits, self._hsp.score = _re_search( 1137 r"Score =\s*([0-9.e+]+) bits \(([0-9]+)\)", line, 1138 "I could not find the score in line\n%s" % line) 1139 self._hsp.score = _safe_float(self._hsp.score) 1140 self._hsp.bits = _safe_float(self._hsp.bits) 1141 1142 x, y = _re_search( 1143 r"Expect\(?(\d*)\)? = +([0-9.e\-|\+]+)", line, 1144 "I could not find the expect in line\n%s" % line) 1145 if x: 1146 self._hsp.num_alignments = _safe_int(x) 1147 else: 1148 self._hsp.num_alignments = 1 1149 self._hsp.expect = _safe_float(y)
1150
1151 - def identities(self, line):
1152 x, y = _re_search( 1153 r"Identities = (\d+)\/(\d+)", line, 1154 "I could not find the identities in line\n%s" % line) 1155 self._hsp.identities = _safe_int(x), _safe_int(y) 1156 self._hsp.align_length = _safe_int(y) 1157 1158 if line.find('Positives') != -1: 1159 x, y = _re_search( 1160 r"Positives = (\d+)\/(\d+)", line, 1161 "I could not find the positives in line\n%s" % line) 1162 self._hsp.positives = _safe_int(x), _safe_int(y) 1163 assert self._hsp.align_length == _safe_int(y) 1164 1165 if line.find('Gaps') != -1: 1166 x, y = _re_search( 1167 r"Gaps = (\d+)\/(\d+)", line, 1168 "I could not find the gaps in line\n%s" % line) 1169 self._hsp.gaps = _safe_int(x), _safe_int(y) 1170 assert self._hsp.align_length == _safe_int(y)
1171 1172
1173 - def strand(self, line):
1174 self._hsp.strand = _re_search( 1175 r"Strand = (\w+) / (\w+)", line, 1176 "I could not find the strand in line\n%s" % line)
1177
1178 - def frame(self, line):
1179 # Frame can be in formats: 1180 # Frame = +1 1181 # Frame = +2 / +2 1182 if line.find('/') != -1: 1183 self._hsp.frame = _re_search( 1184 r"Frame = ([-+][123]) / ([-+][123])", line, 1185 "I could not find the frame in line\n%s" % line) 1186 else: 1187 self._hsp.frame = _re_search( 1188 r"Frame = ([-+][123])", line, 1189 "I could not find the frame in line\n%s" % line)
1190 1191 # Match a space, if one is available. Masahir Ishikawa found a 1192 # case where there's no space between the start and the sequence: 1193 # Query: 100tt 101 1194 # line below modified by Yair Benita, Sep 2004 1195 # Note that the colon is not always present. 2006 1196 _query_re = re.compile(r"Query(:?) \s*(\d+)\s*(.+) (\d+)")
1197 - def query(self, line):
1198 m = self._query_re.search(line) 1199 if m is None: 1200 raise ValueError("I could not find the query in line\n%s" % line) 1201 1202 # line below modified by Yair Benita, Sep 2004. 1203 # added the end attribute for the query 1204 colon, start, seq, end = m.groups() 1205 self._hsp.query = self._hsp.query + seq 1206 if self._hsp.query_start is None: 1207 self._hsp.query_start = _safe_int(start) 1208 1209 # line below added by Yair Benita, Sep 2004. 1210 # added the end attribute for the query 1211 self._hsp.query_end = _safe_int(end) 1212 1213 #Get index for sequence start (regular expression element 3) 1214 self._query_start_index = m.start(3) 1215 self._query_len = len(seq)
1216
1217 - def align(self, line):
1218 seq = line[self._query_start_index:].rstrip() 1219 if len(seq) < self._query_len: 1220 # Make sure the alignment is the same length as the query 1221 seq = seq + ' ' * (self._query_len-len(seq)) 1222 elif len(seq) < self._query_len: 1223 raise ValueError("Match is longer than the query in line\n%s" \ 1224 % line) 1225 self._hsp.match = self._hsp.match + seq
1226 1227 # To match how we do the query, cache the regular expression. 1228 # Note that the colon is not always present. 1229 _sbjct_re = re.compile(r"Sbjct(:?) \s*(\d+)\s*(.+) (\d+)")
1230 - def sbjct(self, line):
1231 m = self._sbjct_re.search(line) 1232 if m is None: 1233 raise ValueError("I could not find the sbjct in line\n%s" % line) 1234 colon, start, seq, end = m.groups() 1235 #mikep 26/9/00 1236 #On occasion, there is a blast hit with no subject match 1237 #so far, it only occurs with 1-line short "matches" 1238 #I have decided to let these pass as they appear 1239 if not seq.strip(): 1240 seq = ' ' * self._query_len 1241 self._hsp.sbjct = self._hsp.sbjct + seq 1242 if self._hsp.sbjct_start is None: 1243 self._hsp.sbjct_start = _safe_int(start) 1244 1245 self._hsp.sbjct_end = _safe_int(end) 1246 if len(seq) != self._query_len: 1247 raise ValueError( \ 1248 "QUERY and SBJCT sequence lengths don't match in line\n%s" \ 1249 % line) 1250 1251 del self._query_start_index # clean up unused variables 1252 del self._query_len
1253
1254 - def end_hsp(self):
1255 pass
1256
1257 -class _DatabaseReportConsumer:
1258
1259 - def start_database_report(self):
1260 self._dr = Record.DatabaseReport()
1261
1262 - def database(self, line):
1263 m = re.search(r"Database: (.+)$", line) 1264 if m: 1265 self._dr.database_name.append(m.group(1)) 1266 elif self._dr.database_name: 1267 # This must be a continuation of the previous name. 1268 self._dr.database_name[-1] = "%s%s" % (self._dr.database_name[-1], 1269 line.strip())
1270
1271 - def posted_date(self, line):
1272 self._dr.posted_date.append(_re_search( 1273 r"Posted date:\s*(.+)$", line, 1274 "I could not find the posted date in line\n%s" % line))
1275
1276 - def num_letters_in_database(self, line):
1277 letters, = _get_cols( 1278 line, (-1,), ncols=6, expected={2:"letters", 4:"database:"}) 1279 self._dr.num_letters_in_database.append(_safe_int(letters))
1280
1281 - def num_sequences_in_database(self, line):
1282 sequences, = _get_cols( 1283 line, (-1,), ncols=6, expected={2:"sequences", 4:"database:"}) 1284 self._dr.num_sequences_in_database.append(_safe_int(sequences))
1285
1286 - def ka_params(self, line):
1287 x = line.split() 1288 self._dr.ka_params = map(_safe_float, x)
1289
1290 - def gapped(self, line):
1291 self._dr.gapped = 1
1292
1293 - def ka_params_gap(self, line):
1294 x = line.split() 1295 self._dr.ka_params_gap = map(_safe_float, x)
1296
1297 - def end_database_report(self):
1298 pass
1299
1300 -class _ParametersConsumer:
1301 - def start_parameters(self):
1302 self._params = Record.Parameters()
1303
1304 - def matrix(self, line):
1305 self._params.matrix = line[8:].rstrip()
1306
1307 - def gap_penalties(self, line):
1308 x = _get_cols( 1309 line, (3, 5), ncols=6, expected={2:"Existence:", 4:"Extension:"}) 1310 self._params.gap_penalties = map(_safe_float, x)
1311
1312 - def num_hits(self, line):
1313 if line.find('1st pass') != -1: 1314 x, = _get_cols(line, (-4,), ncols=11, expected={2:"Hits"}) 1315 self._params.num_hits = _safe_int(x) 1316 else: 1317 x, = _get_cols(line, (-1,), ncols=6, expected={2:"Hits"}) 1318 self._params.num_hits = _safe_int(x)
1319
1320 - def num_sequences(self, line):
1321 if line.find('1st pass') != -1: 1322 x, = _get_cols(line, (-4,), ncols=9, expected={2:"Sequences:"}) 1323 self._params.num_sequences = _safe_int(x) 1324 else: 1325 x, = _get_cols(line, (-1,), ncols=4, expected={2:"Sequences:"}) 1326 self._params.num_sequences = _safe_int(x)
1327
1328 - def num_extends(self, line):
1329 if line.find('1st pass') != -1: 1330 x, = _get_cols(line, (-4,), ncols=9, expected={2:"extensions:"}) 1331 self._params.num_extends = _safe_int(x) 1332 else: 1333 x, = _get_cols(line, (-1,), ncols=4, expected={2:"extensions:"}) 1334 self._params.num_extends = _safe_int(x)
1335
1336 - def num_good_extends(self, line):
1337 if line.find('1st pass') != -1: 1338 x, = _get_cols(line, (-4,), ncols=10, expected={3:"extensions:"}) 1339 self._params.num_good_extends = _safe_int(x) 1340 else: 1341 x, = _get_cols(line, (-1,), ncols=5, expected={3:"extensions:"}) 1342 self._params.num_good_extends = _safe_int(x)
1343
1344 - def num_seqs_better_e(self, line):
1345 self._params.num_seqs_better_e, = _get_cols( 1346 line, (-1,), ncols=7, expected={2:"sequences"}) 1347 self._params.num_seqs_better_e = _safe_int( 1348 self._params.num_seqs_better_e)
1349
1350 - def hsps_no_gap(self, line):
1351 self._params.hsps_no_gap, = _get_cols( 1352 line, (-1,), ncols=9, expected={3:"better", 7:"gapping:"}) 1353 self._params.hsps_no_gap = _safe_int(self._params.hsps_no_gap)
1354
1355 - def hsps_prelim_gapped(self, line):
1356 self._params.hsps_prelim_gapped, = _get_cols( 1357 line, (-1,), ncols=9, expected={4:"gapped", 6:"prelim"}) 1358 self._params.hsps_prelim_gapped = _safe_int( 1359 self._params.hsps_prelim_gapped)
1360
1361 - def hsps_prelim_gapped_attempted(self, line):
1362 self._params.hsps_prelim_gapped_attempted, = _get_cols( 1363 line, (-1,), ncols=10, expected={4:"attempted", 7:"prelim"}) 1364 self._params.hsps_prelim_gapped_attempted = _safe_int( 1365 self._params.hsps_prelim_gapped_attempted)
1366
1367 - def hsps_gapped(self, line):
1368 self._params.hsps_gapped, = _get_cols( 1369 line, (-1,), ncols=6, expected={3:"gapped"}) 1370 self._params.hsps_gapped = _safe_int(self._params.hsps_gapped)
1371
1372 - def query_length(self, line):
1373 self._params.query_length, = _get_cols( 1374 line.lower(), (-1,), ncols=4, expected={0:"length", 2:"query:"}) 1375 self._params.query_length = _safe_int(self._params.query_length)
1376
1377 - def database_length(self, line):
1378 self._params.database_length, = _get_cols( 1379 line.lower(), (-1,), ncols=4, expected={0:"length", 2:"database:"}) 1380 self._params.database_length = _safe_int(self._params.database_length)
1381
1382 - def effective_hsp_length(self, line):
1383 self._params.effective_hsp_length, = _get_cols( 1384 line, (-1,), ncols=4, expected={1:"HSP", 2:"length:"}) 1385 self._params.effective_hsp_length = _safe_int( 1386 self._params.effective_hsp_length)
1387
1388 - def effective_query_length(self, line):
1389 self._params.effective_query_length, = _get_cols( 1390 line, (-1,), ncols=5, expected={1:"length", 3:"query:"}) 1391 self._params.effective_query_length = _safe_int( 1392 self._params.effective_query_length)
1393
1394 - def effective_database_length(self, line):
1395 self._params.effective_database_length, = _get_cols( 1396 line.lower(), (-1,), ncols=5, expected={1:"length", 3:"database:"}) 1397 self._params.effective_database_length = _safe_int( 1398 self._params.effective_database_length)
1399
1400 - def effective_search_space(self, line):
1401 self._params.effective_search_space, = _get_cols( 1402 line, (-1,), ncols=4, expected={1:"search"}) 1403 self._params.effective_search_space = _safe_int( 1404 self._params.effective_search_space)
1405
1406 - def effective_search_space_used(self, line):
1407 self._params.effective_search_space_used, = _get_cols( 1408 line, (-1,), ncols=5, expected={1:"search", 3:"used:"}) 1409 self._params.effective_search_space_used = _safe_int( 1410 self._params.effective_search_space_used)
1411
1412 - def frameshift(self, line):
1413 self._params.frameshift = _get_cols( 1414 line, (4, 5), ncols=6, expected={0:"frameshift", 2:"decay"})
1415
1416 - def threshold(self, line):
1417 if line[:2] == "T:": 1418 #Assume its an old stlye line like "T: 123" 1419 self._params.threshold, = _get_cols( 1420 line, (1,), ncols=2, expected={0:"T:"}) 1421 elif line[:28] == "Neighboring words threshold:": 1422 self._params.threshold, = _get_cols( 1423 line, (3,), ncols=4, expected={0:"Neighboring", 1:"words", 2:"threshold:"}) 1424 else: 1425 raise ValueError("Unrecognised threshold line:\n%s" % line) 1426 self._params.threshold = _safe_int(self._params.threshold)
1427
1428 - def window_size(self, line):
1429 if line[:2] == "A:": 1430 self._params.window_size, = _get_cols( 1431 line, (1,), ncols=2, expected={0:"A:"}) 1432 elif line[:25] == "Window for multiple hits:": 1433 self._params.window_size, = _get_cols( 1434 line, (4,), ncols=5, expected={0:"Window", 2:"multiple", 3:"hits:"}) 1435 else: 1436 raise ValueError("Unrecognised window size line:\n%s" % line) 1437 self._params.window_size = _safe_int(self._params.window_size)
1438
1439 - def dropoff_1st_pass(self, line):
1440 score, bits = _re_search( 1441 r"X1: (\d+) \(\s*([0-9,.]+) bits\)", line, 1442 "I could not find the dropoff in line\n%s" % line) 1443 self._params.dropoff_1st_pass = _safe_int(score), _safe_float(bits)
1444
1445 - def gap_x_dropoff(self, line):
1446 score, bits = _re_search( 1447 r"X2: (\d+) \(\s*([0-9,.]+) bits\)", line, 1448 "I could not find the gap dropoff in line\n%s" % line) 1449 self._params.gap_x_dropoff = _safe_int(score), _safe_float(bits)
1450
1451 - def gap_x_dropoff_final(self, line):
1452 score, bits = _re_search( 1453 r"X3: (\d+) \(\s*([0-9,.]+) bits\)", line, 1454 "I could not find the gap dropoff final in line\n%s" % line) 1455 self._params.gap_x_dropoff_final = _safe_int(score), _safe_float(bits)
1456
1457 - def gap_trigger(self, line):
1458 score, bits = _re_search( 1459 r"S1: (\d+) \(\s*([0-9,.]+) bits\)", line, 1460 "I could not find the gap trigger in line\n%s" % line) 1461 self._params.gap_trigger = _safe_int(score), _safe_float(bits)
1462
1463 - def blast_cutoff(self, line):
1464 score, bits = _re_search( 1465 r"S2: (\d+) \(\s*([0-9,.]+) bits\)", line, 1466 "I could not find the blast cutoff in line\n%s" % line) 1467 self._params.blast_cutoff = _safe_int(score), _safe_float(bits)
1468
1469 - def end_parameters(self):
1470 pass
1471 1472
1473 -class _BlastConsumer(AbstractConsumer, 1474 _HeaderConsumer, 1475 _DescriptionConsumer, 1476 _AlignmentConsumer, 1477 _HSPConsumer, 1478 _DatabaseReportConsumer, 1479 _ParametersConsumer 1480 ):
1481 # This Consumer is inherits from many other consumer classes that handle 1482 # the actual dirty work. An alternate way to do it is to create objects 1483 # of those classes and then delegate the parsing tasks to them in a 1484 # decorator-type pattern. The disadvantage of that is that the method 1485 # names will need to be resolved in this classes. However, using 1486 # a decorator will retain more control in this class (which may or 1487 # may not be a bad thing). In addition, having each sub-consumer as 1488 # its own object prevents this object's dictionary from being cluttered 1489 # with members and reduces the chance of member collisions.
1490 - def __init__(self):
1491 self.data = None
1492
1493 - def round(self, line):
1494 # Make sure nobody's trying to pass me PSI-BLAST data! 1495 raise ValueError("This consumer doesn't handle PSI-BLAST data")
1496
1497 - def start_header(self):
1498 self.data = Record.Blast() 1499 _HeaderConsumer.start_header(self)
1500
1501 - def end_header(self):
1502 _HeaderConsumer.end_header(self) 1503 self.data.__dict__.update(self._header.__dict__)
1504
1505 - def end_descriptions(self):
1506 self.data.descriptions = self._descriptions
1507
1508 - def end_alignment(self):
1509 _AlignmentConsumer.end_alignment(self) 1510 if self._alignment.hsps: 1511 self.data.alignments.append(self._alignment) 1512 if self._multiple_alignment.alignment: 1513 self.data.multiple_alignment = self._multiple_alignment
1514
1515 - def end_hsp(self):
1516 _HSPConsumer.end_hsp(self) 1517 try: 1518 self._alignment.hsps.append(self._hsp) 1519 except AttributeError: 1520 raise ValueError("Found an HSP before an alignment")
1521
1522 - def end_database_report(self):
1523 _DatabaseReportConsumer.end_database_report(self) 1524 self.data.__dict__.update(self._dr.__dict__)
1525
1526 - def end_parameters(self):
1527 _ParametersConsumer.end_parameters(self) 1528 self.data.__dict__.update(self._params.__dict__)
1529
1530 -class _PSIBlastConsumer(AbstractConsumer, 1531 _HeaderConsumer, 1532 _DescriptionConsumer, 1533 _AlignmentConsumer, 1534 _HSPConsumer, 1535 _DatabaseReportConsumer, 1536 _ParametersConsumer 1537 ):
1538 - def __init__(self):
1539 self.data = None
1540
1541 - def start_header(self):
1542 self.data = Record.PSIBlast() 1543 _HeaderConsumer.start_header(self)
1544
1545 - def end_header(self):
1546 _HeaderConsumer.end_header(self) 1547 self.data.__dict__.update(self._header.__dict__)
1548
1549 - def start_descriptions(self):
1550 self._round = Record.Round() 1551 self.data.rounds.append(self._round) 1552 _DescriptionConsumer.start_descriptions(self)
1553
1554 - def end_descriptions(self):
1555 _DescriptionConsumer.end_descriptions(self) 1556 self._round.number = self._roundnum 1557 if self._descriptions: 1558 self._round.new_seqs.extend(self._descriptions) 1559 self._round.reused_seqs.extend(self._model_sequences) 1560 self._round.new_seqs.extend(self._nonmodel_sequences) 1561 if self._converged: 1562 self.data.converged = 1
1563
1564 - def end_alignment(self):
1565 _AlignmentConsumer.end_alignment(self) 1566 if self._alignment.hsps: 1567 self._round.alignments.append(self._alignment) 1568 if self._multiple_alignment: 1569 self._round.multiple_alignment = self._multiple_alignment
1570
1571 - def end_hsp(self):
1572 _HSPConsumer.end_hsp(self) 1573 try: 1574 self._alignment.hsps.append(self._hsp) 1575 except AttributeError: 1576 raise ValueError("Found an HSP before an alignment")
1577
1578 - def end_database_report(self):
1579 _DatabaseReportConsumer.end_database_report(self) 1580 self.data.__dict__.update(self._dr.__dict__)
1581
1582 - def end_parameters(self):
1583 _ParametersConsumer.end_parameters(self) 1584 self.data.__dict__.update(self._params.__dict__)
1585
1586 -class Iterator:
1587 """Iterates over a file of multiple BLAST results. 1588 1589 Methods: 1590 next Return the next record from the stream, or None. 1591 1592 """
1593 - def __init__(self, handle, parser=None):
1594 """__init__(self, handle, parser=None) 1595 1596 Create a new iterator. handle is a file-like object. parser 1597 is an optional Parser object to change the results into another form. 1598 If set to None, then the raw contents of the file will be returned. 1599 1600 """ 1601 try: 1602 handle.readline 1603 except AttributeError: 1604 raise ValueError( 1605 "I expected a file handle or file-like object, got %s" 1606 % type(handle)) 1607 self._uhandle = File.UndoHandle(handle) 1608 self._parser = parser 1609 self._header = []
1610
1611 - def next(self):
1612 """next(self) -> object 1613 1614 Return the next Blast record from the file. If no more records, 1615 return None. 1616 1617 """ 1618 lines = [] 1619 query = False 1620 while 1: 1621 line = self._uhandle.readline() 1622 if not line: 1623 break 1624 # If I've reached the next one, then put the line back and stop. 1625 if lines and (line.startswith('BLAST') 1626 or line.startswith('BLAST', 1) 1627 or line.startswith('<?xml ')): 1628 self._uhandle.saveline(line) 1629 break 1630 # New style files ommit the BLAST line to mark a new query: 1631 if line.startswith("Query="): 1632 if not query: 1633 if not self._header: 1634 self._header = lines[:] 1635 query = True 1636 else: 1637 #Start of another record 1638 self._uhandle.saveline(line) 1639 break 1640 lines.append(line) 1641 1642 if query and "BLAST" not in lines[0]: 1643 #Cheat and re-insert the header 1644 #print "-"*50 1645 #print "".join(self._header) 1646 #print "-"*50 1647 #print "".join(lines) 1648 #print "-"*50 1649 lines = self._header + lines 1650 1651 if not lines: 1652 return None 1653 1654 data = ''.join(lines) 1655 if self._parser is not None: 1656 return self._parser.parse(File.StringHandle(data)) 1657 return data
1658
1659 - def __iter__(self):
1660 return iter(self.next, None)
1661
1662 -def blastall(blastcmd, program, database, infile, align_view='7', **keywds):
1663 """Execute and retrieve data from standalone BLASTPALL as handles (OBSOLETE). 1664 1665 NOTE - This function is obsolete, you are encouraged to the command 1666 line wrapper Bio.Blast.Applications.BlastallCommandline instead. 1667 1668 Execute and retrieve data from blastall. blastcmd is the command 1669 used to launch the 'blastall' executable. program is the blast program 1670 to use, e.g. 'blastp', 'blastn', etc. database is the path to the database 1671 to search against. infile is the path to the file containing 1672 the sequence to search with. 1673 1674 The return values are two handles, for standard output and standard error. 1675 1676 You may pass more parameters to **keywds to change the behavior of 1677 the search. Otherwise, optional values will be chosen by blastall. 1678 The Blast output is by default in XML format. Use the align_view keyword 1679 for output in a different format. 1680 1681 Scoring 1682 matrix Matrix to use. 1683 gap_open Gap open penalty. 1684 gap_extend Gap extension penalty. 1685 nuc_match Nucleotide match reward. (BLASTN) 1686 nuc_mismatch Nucleotide mismatch penalty. (BLASTN) 1687 query_genetic_code Genetic code for Query. 1688 db_genetic_code Genetic code for database. (TBLAST[NX]) 1689 1690 Algorithm 1691 gapped Whether to do a gapped alignment. T/F (not for TBLASTX) 1692 expectation Expectation value cutoff. 1693 wordsize Word size. 1694 strands Query strands to search against database.([T]BLAST[NX]) 1695 keep_hits Number of best hits from a region to keep. 1696 xdrop Dropoff value (bits) for gapped alignments. 1697 hit_extend Threshold for extending hits. 1698 region_length Length of region used to judge hits. 1699 db_length Effective database length. 1700 search_length Effective length of search space. 1701 1702 Processing 1703 filter Filter query sequence for low complexity (with SEG)? T/F 1704 believe_query Believe the query defline. T/F 1705 restrict_gi Restrict search to these GI's. 1706 nprocessors Number of processors to use. 1707 oldengine Force use of old engine T/F 1708 1709 Formatting 1710 html Produce HTML output? T/F 1711 descriptions Number of one-line descriptions. 1712 alignments Number of alignments. 1713 align_view Alignment view. Integer 0-11, 1714 passed as a string or integer. 1715 show_gi Show GI's in deflines? T/F 1716 seqalign_file seqalign file to output. 1717 outfile Output file for report. Filename to write to, if 1718 ommitted standard output is used (which you can access 1719 from the returned handles). 1720 """ 1721 1722 _security_check_parameters(keywds) 1723 1724 att2param = { 1725 'matrix' : '-M', 1726 'gap_open' : '-G', 1727 'gap_extend' : '-E', 1728 'nuc_match' : '-r', 1729 'nuc_mismatch' : '-q', 1730 'query_genetic_code' : '-Q', 1731 'db_genetic_code' : '-D', 1732 1733 'gapped' : '-g', 1734 'expectation' : '-e', 1735 'wordsize' : '-W', 1736 'strands' : '-S', 1737 'keep_hits' : '-K', 1738 'xdrop' : '-X', 1739 'hit_extend' : '-f', 1740 'region_length' : '-L', 1741 'db_length' : '-z', 1742 'search_length' : '-Y', 1743 1744 'program' : '-p', 1745 'database' : '-d', 1746 'infile' : '-i', 1747 'filter' : '-F', 1748 'believe_query' : '-J', 1749 'restrict_gi' : '-l', 1750 'nprocessors' : '-a', 1751 'oldengine' : '-V', 1752 1753 'html' : '-T', 1754 'descriptions' : '-v', 1755 'alignments' : '-b', 1756 'align_view' : '-m', 1757 'show_gi' : '-I', 1758 'seqalign_file' : '-O', 1759 'outfile' : '-o', 1760 } 1761 from Applications import BlastallCommandline 1762 cline = BlastallCommandline(blastcmd) 1763 cline.set_parameter(att2param['program'], program) 1764 cline.set_parameter(att2param['database'], database) 1765 cline.set_parameter(att2param['infile'], infile) 1766 cline.set_parameter(att2param['align_view'], str(align_view)) 1767 for key, value in keywds.iteritems(): 1768 cline.set_parameter(att2param[key], str(value)) 1769 return _invoke_blast(cline)
1770 1771
1772 -def blastpgp(blastcmd, database, infile, align_view='7', **keywds):
1773 """Execute and retrieve data from standalone BLASTPGP as handles (OBSOLETE). 1774 1775 NOTE - This function is obsolete, you are encouraged to the command 1776 line wrapper Bio.Blast.Applications.BlastpgpCommandline instead. 1777 1778 Execute and retrieve data from blastpgp. blastcmd is the command 1779 used to launch the 'blastpgp' executable. database is the path to the 1780 database to search against. infile is the path to the file containing 1781 the sequence to search with. 1782 1783 The return values are two handles, for standard output and standard error. 1784 1785 You may pass more parameters to **keywds to change the behavior of 1786 the search. Otherwise, optional values will be chosen by blastpgp. 1787 The Blast output is by default in XML format. Use the align_view keyword 1788 for output in a different format. 1789 1790 Scoring 1791 matrix Matrix to use. 1792 gap_open Gap open penalty. 1793 gap_extend Gap extension penalty. 1794 window_size Multiple hits window size. 1795 npasses Number of passes. 1796 passes Hits/passes. Integer 0-2. 1797 1798 Algorithm 1799 gapped Whether to do a gapped alignment. T/F 1800 expectation Expectation value cutoff. 1801 wordsize Word size. 1802 keep_hits Number of beset hits from a region to keep. 1803 xdrop Dropoff value (bits) for gapped alignments. 1804 hit_extend Threshold for extending hits. 1805 region_length Length of region used to judge hits. 1806 db_length Effective database length. 1807 search_length Effective length of search space. 1808 nbits_gapping Number of bits to trigger gapping. 1809 pseudocounts Pseudocounts constants for multiple passes. 1810 xdrop_final X dropoff for final gapped alignment. 1811 xdrop_extension Dropoff for blast extensions. 1812 model_threshold E-value threshold to include in multipass model. 1813 required_start Start of required region in query. 1814 required_end End of required region in query. 1815 1816 Processing 1817 XXX should document default values 1818 program The blast program to use. (PHI-BLAST) 1819 filter Filter query sequence for low complexity (with SEG)? T/F 1820 believe_query Believe the query defline? T/F 1821 nprocessors Number of processors to use. 1822 1823 Formatting 1824 html Produce HTML output? T/F 1825 descriptions Number of one-line descriptions. 1826 alignments Number of alignments. 1827 align_view Alignment view. Integer 0-11, 1828 passed as a string or integer. 1829 show_gi Show GI's in deflines? T/F 1830 seqalign_file seqalign file to output. 1831 align_outfile Output file for alignment. 1832 checkpoint_outfile Output file for PSI-BLAST checkpointing. 1833 restart_infile Input file for PSI-BLAST restart. 1834 hit_infile Hit file for PHI-BLAST. 1835 matrix_outfile Output file for PSI-BLAST matrix in ASCII. 1836 align_outfile Output file for alignment. Filename to write to, if 1837 ommitted standard output is used (which you can access 1838 from the returned handles). 1839 1840 align_infile Input alignment file for PSI-BLAST restart. 1841 1842 """ 1843 1844 _security_check_parameters(keywds) 1845 1846 att2param = { 1847 'matrix' : '-M', 1848 'gap_open' : '-G', 1849 'gap_extend' : '-E', 1850 'window_size' : '-A', 1851 'npasses' : '-j', 1852 'passes' : '-P', 1853 1854 'gapped' : '-g', 1855 'expectation' : '-e', 1856 'wordsize' : '-W', 1857 'keep_hits' : '-K', 1858 'xdrop' : '-X', 1859 'hit_extend' : '-f', 1860 'region_length' : '-L', 1861 'db_length' : '-Z', 1862 'search_length' : '-Y', 1863 'nbits_gapping' : '-N', 1864 'pseudocounts' : '-c', 1865 'xdrop_final' : '-Z', 1866 'xdrop_extension' : '-y', 1867 'model_threshold' : '-h', 1868 'required_start' : '-S', 1869 'required_end' : '-H', 1870 1871 'program' : '-p', 1872 'database' : '-d', 1873 'infile' : '-i', 1874 'filter' : '-F', 1875 'believe_query' : '-J', 1876 'nprocessors' : '-a', 1877 1878 'html' : '-T', 1879 'descriptions' : '-v', 1880 'alignments' : '-b', 1881 'align_view' : '-m', 1882 'show_gi' : '-I', 1883 'seqalign_file' : '-O', 1884 'align_outfile' : '-o', 1885 'checkpoint_outfile' : '-C', 1886 'restart_infile' : '-R', 1887 'hit_infile' : '-k', 1888 'matrix_outfile' : '-Q', 1889 'align_infile' : '-B', 1890 } 1891 from Applications import BlastpgpCommandline 1892 cline = BlastpgpCommandline(blastcmd) 1893 cline.set_parameter(att2param['database'], database) 1894 cline.set_parameter(att2param['infile'], infile) 1895 cline.set_parameter(att2param['align_view'], str(align_view)) 1896 for key, value in keywds.iteritems(): 1897 cline.set_parameter(att2param[key], str(value)) 1898 return _invoke_blast(cline)
1899 1900
1901 -def rpsblast(blastcmd, database, infile, align_view="7", **keywds):
1902 """Execute and retrieve data from standalone RPS-BLAST as handles (OBSOLETE). 1903 1904 NOTE - This function is obsolete, you are encouraged to the command 1905 line wrapper Bio.Blast.Applications.RpsBlastCommandline instead. 1906 1907 Execute and retrieve data from standalone RPS-BLAST. blastcmd is the 1908 command used to launch the 'rpsblast' executable. database is the path 1909 to the database to search against. infile is the path to the file 1910 containing the sequence to search with. 1911 1912 The return values are two handles, for standard output and standard error. 1913 1914 You may pass more parameters to **keywds to change the behavior of 1915 the search. Otherwise, optional values will be chosen by rpsblast. 1916 1917 Please note that this function will give XML output by default, by 1918 setting align_view to seven (i.e. command line option -m 7). 1919 You should use the NCBIXML.parse() function to read the resulting output. 1920 This is because NCBIStandalone.BlastParser() does not understand the 1921 plain text output format from rpsblast. 1922 1923 WARNING - The following text and associated parameter handling has not 1924 received extensive testing. Please report any errors we might have made... 1925 1926 Algorithm/Scoring 1927 gapped Whether to do a gapped alignment. T/F 1928 multihit 0 for multiple hit (default), 1 for single hit 1929 expectation Expectation value cutoff. 1930 range_restriction Range restriction on query sequence (Format: start,stop) blastp only 1931 0 in 'start' refers to the beginning of the sequence 1932 0 in 'stop' refers to the end of the sequence 1933 Default = 0,0 1934 xdrop Dropoff value (bits) for gapped alignments. 1935 xdrop_final X dropoff for final gapped alignment (in bits). 1936 xdrop_extension Dropoff for blast extensions (in bits). 1937 search_length Effective length of search space. 1938 nbits_gapping Number of bits to trigger gapping. 1939 protein Query sequence is protein. T/F 1940 db_length Effective database length. 1941 1942 Processing 1943 filter Filter query sequence for low complexity? T/F 1944 case_filter Use lower case filtering of FASTA sequence T/F, default F 1945 believe_query Believe the query defline. T/F 1946 nprocessors Number of processors to use. 1947 logfile Name of log file to use, default rpsblast.log 1948 1949 Formatting 1950 html Produce HTML output? T/F 1951 descriptions Number of one-line descriptions. 1952 alignments Number of alignments. 1953 align_view Alignment view. Integer 0-11, 1954 passed as a string or integer. 1955 show_gi Show GI's in deflines? T/F 1956 seqalign_file seqalign file to output. 1957 align_outfile Output file for alignment. Filename to write to, if 1958 ommitted standard output is used (which you can access 1959 from the returned handles). 1960 """ 1961 1962 _security_check_parameters(keywds) 1963 1964 att2param = { 1965 'multihit' : '-P', 1966 'gapped' : '-g', 1967 'expectation' : '-e', 1968 'range_restriction' : '-L', 1969 'xdrop' : '-X', 1970 'xdrop_final' : '-Z', 1971 'xdrop_extension' : '-y', 1972 'search_length' : '-Y', 1973 'nbits_gapping' : '-N', 1974 'protein' : '-p', 1975 'db_length' : '-z', 1976 1977 'database' : '-d', 1978 'infile' : '-i', 1979 'filter' : '-F', 1980 'case_filter' : '-U', 1981 'believe_query' : '-J', 1982 'nprocessors' : '-a', 1983 'logfile' : '-l', 1984 1985 'html' : '-T', 1986 'descriptions' : '-v', 1987 'alignments' : '-b', 1988 'align_view' : '-m', 1989 'show_gi' : '-I', 1990 'seqalign_file' : '-O', 1991 'align_outfile' : '-o', 1992 } 1993 1994 from Applications import RpsBlastCommandline 1995 cline = RpsBlastCommandline(blastcmd) 1996 cline.set_parameter(att2param['database'], database) 1997 cline.set_parameter(att2param['infile'], infile) 1998 cline.set_parameter(att2param['align_view'], str(align_view)) 1999 for key, value in keywds.iteritems(): 2000 cline.set_parameter(att2param[key], str(value)) 2001 return _invoke_blast(cline)
2002 2003
2004 -def _re_search(regex, line, error_msg):
2005 m = re.search(regex, line) 2006 if not m: 2007 raise ValueError(error_msg) 2008 return m.groups()
2009
2010 -def _get_cols(line, cols_to_get, ncols=None, expected={}):
2011 cols = line.split() 2012 2013 # Check to make sure number of columns is correct 2014 if ncols is not None and len(cols) != ncols: 2015 raise ValueError("I expected %d columns (got %d) in line\n%s" \ 2016 % (ncols, len(cols), line)) 2017 2018 # Check to make sure columns contain the correct data 2019 for k in expected.keys(): 2020 if cols[k] != expected[k]: 2021 raise ValueError("I expected '%s' in column %d in line\n%s" \ 2022 % (expected[k], k, line)) 2023 2024 # Construct the answer tuple 2025 results = [] 2026 for c in cols_to_get: 2027 results.append(cols[c]) 2028 return tuple(results)
2029
2030 -def _safe_int(str):
2031 try: 2032 return int(str) 2033 except ValueError: 2034 # Something went wrong. Try to clean up the string. 2035 # Remove all commas from the string 2036 str = str.replace(',', '') 2037 try: 2038 # try again. 2039 return int(str) 2040 except ValueError: 2041 pass 2042 # If it fails again, maybe it's too long? 2043 # XXX why converting to float? 2044 return long(float(str))
2045
2046 -def _safe_float(str):
2047 # Thomas Rosleff Soerensen (rosleff@mpiz-koeln.mpg.de) noted that 2048 # float('e-172') does not produce an error on his platform. Thus, 2049 # we need to check the string for this condition. 2050 2051 # Sometimes BLAST leaves of the '1' in front of an exponent. 2052 if str and str[0] in ['E', 'e']: 2053 str = '1' + str 2054 try: 2055 return float(str) 2056 except ValueError: 2057 # Remove all commas from the string 2058 str = str.replace(',', '') 2059 # try again. 2060 return float(str)
2061 2062
2063 -def _invoke_blast(cline):
2064 """Start BLAST and returns handles for stdout and stderr (PRIVATE). 2065 2066 Expects a command line wrapper object from Bio.Blast.Applications 2067 """ 2068 import subprocess, sys 2069 blast_cmd = cline.program_name 2070 if not os.path.exists(blast_cmd): 2071 raise ValueError("BLAST executable does not exist at %s" % blast_cmd) 2072 #We don't need to supply any piped input, but we setup the 2073 #standard input pipe anyway as a work around for a python 2074 #bug if this is called from a Windows GUI program. For 2075 #details, see http://bugs.python.org/issue1124861 2076 blast_process = subprocess.Popen(str(cline), 2077 stdin=subprocess.PIPE, 2078 stdout=subprocess.PIPE, 2079 stderr=subprocess.PIPE, 2080 shell=(sys.platform!="win32")) 2081 blast_process.stdin.close() 2082 return blast_process.stdout, blast_process.stderr
2083 2084
2085 -def _security_check_parameters(param_dict):
2086 """Look for any attempt to insert a command into a parameter. 2087 2088 e.g. blastall(..., matrix='IDENTITY -F 0; rm -rf /etc/passwd') 2089 2090 Looks for ";" or "&&" in the strings (Unix and Windows syntax 2091 for appending a command line), or ">", "<" or "|" (redirection) 2092 and if any are found raises an exception. 2093 """ 2094 for key, value in param_dict.iteritems(): 2095 str_value = str(value) # Could easily be an int or a float 2096 for bad_str in [";", "&&", ">", "<", "|"]: 2097 if bad_str in str_value: 2098 raise ValueError("Rejecting suspicious argument for %s" % key)
2099
2100 -class _BlastErrorConsumer(_BlastConsumer):
2101 - def __init__(self):
2103 - def noevent(self, line):
2104 if line.find("Query must be at least wordsize") != -1: 2105 raise ShortQueryBlastError("Query must be at least wordsize") 2106 # Now pass the line back up to the superclass. 2107 method = getattr(_BlastConsumer, 'noevent', 2108 _BlastConsumer.__getattr__(self, 'noevent')) 2109 method(line)
2110
2111 -class BlastErrorParser(AbstractParser):
2112 """Attempt to catch and diagnose BLAST errors while parsing. 2113 2114 This utilizes the BlastParser module but adds an additional layer 2115 of complexity on top of it by attempting to diagnose ValueErrors 2116 that may actually indicate problems during BLAST parsing. 2117 2118 Current BLAST problems this detects are: 2119 o LowQualityBlastError - When BLASTing really low quality sequences 2120 (ie. some GenBank entries which are just short streches of a single 2121 nucleotide), BLAST will report an error with the sequence and be 2122 unable to search with this. This will lead to a badly formatted 2123 BLAST report that the parsers choke on. The parser will convert the 2124 ValueError to a LowQualityBlastError and attempt to provide useful 2125 information. 2126 2127 """
2128 - def __init__(self, bad_report_handle = None):
2129 """Initialize a parser that tries to catch BlastErrors. 2130 2131 Arguments: 2132 o bad_report_handle - An optional argument specifying a handle 2133 where bad reports should be sent. This would allow you to save 2134 all of the bad reports to a file, for instance. If no handle 2135 is specified, the bad reports will not be saved. 2136 """ 2137 self._bad_report_handle = bad_report_handle 2138 2139 #self._b_parser = BlastParser() 2140 self._scanner = _Scanner() 2141 self._consumer = _BlastErrorConsumer()
2142
2143 - def parse(self, handle):
2144 """Parse a handle, attempting to diagnose errors. 2145 """ 2146 results = handle.read() 2147 2148 try: 2149 self._scanner.feed(File.StringHandle(results), self._consumer) 2150 except ValueError, msg: 2151 # if we have a bad_report_file, save the info to it first 2152 if self._bad_report_handle: 2153 # send the info to the error handle 2154 self._bad_report_handle.write(results) 2155 2156 # now we want to try and diagnose the error 2157 self._diagnose_error( 2158 File.StringHandle(results), self._consumer.data) 2159 2160 # if we got here we can't figure out the problem 2161 # so we should pass along the syntax error we got 2162 raise 2163 return self._consumer.data
2164
2165 - def _diagnose_error(self, handle, data_record):
2166 """Attempt to diagnose an error in the passed handle. 2167 2168 Arguments: 2169 o handle - The handle potentially containing the error 2170 o data_record - The data record partially created by the consumer. 2171 """ 2172 line = handle.readline() 2173 2174 while line: 2175 # 'Searchingdone' instead of 'Searching......done' seems 2176 # to indicate a failure to perform the BLAST due to 2177 # low quality sequence 2178 if line.startswith('Searchingdone'): 2179 raise LowQualityBlastError("Blast failure occured on query: ", 2180 data_record.query) 2181 line = handle.readline()
2182