Package Bio :: Package Blast :: Module NCBIWWW
[hide private]
[frames] | no frames]

Source Code for Module Bio.Blast.NCBIWWW

  1  # Copyright 1999 by Jeffrey Chang.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  # Patched by Brad Chapman. 
  7  # Chris Wroe added modifications for work in myGrid 
  8   
  9  """ 
 10  This module provides code to work with the WWW version of BLAST 
 11  provided by the NCBI. 
 12  http://blast.ncbi.nlm.nih.gov/ 
 13   
 14  Functions: 
 15  qblast        Do a BLAST search using the QBLAST API. 
 16  """ 
 17   
 18  try: 
 19      import cStringIO as StringIO 
 20  except ImportError: 
 21      import StringIO 
 22   
 23   
 24   
25 -def qblast(program, database, sequence, 26 auto_format=None,composition_based_statistics=None, 27 db_genetic_code=None,endpoints=None,entrez_query='(none)', 28 expect=10.0,filter=None,gapcosts=None,genetic_code=None, 29 hitlist_size=50,i_thresh=None,layout=None,lcase_mask=None, 30 matrix_name=None,nucl_penalty=None,nucl_reward=None, 31 other_advanced=None,perc_ident=None,phi_pattern=None, 32 query_file=None,query_believe_defline=None,query_from=None, 33 query_to=None,searchsp_eff=None,service=None,threshold=None, 34 ungapped_alignment=None,word_size=None, 35 alignments=500,alignment_view=None,descriptions=500, 36 entrez_links_new_window=None,expect_low=None,expect_high=None, 37 format_entrez_query=None,format_object=None,format_type='XML', 38 ncbi_gi=None,results_file=None,show_overview=None 39 ):
40 """Do a BLAST search using the QBLAST server at NCBI. 41 42 Supports all parameters of the qblast API for Put and Get. 43 Some useful parameters: 44 program blastn, blastp, blastx, tblastn, or tblastx (lower case) 45 database Which database to search against (e.g. "nr"). 46 sequence The sequence to search. 47 ncbi_gi TRUE/FALSE whether to give 'gi' identifier. 48 descriptions Number of descriptions to show. Def 500. 49 alignments Number of alignments to show. Def 500. 50 expect An expect value cutoff. Def 10.0. 51 matrix_name Specify an alt. matrix (PAM30, PAM70, BLOSUM80, BLOSUM45). 52 filter "none" turns off filtering. Default no filtering 53 format_type "HTML", "Text", "ASN.1", or "XML". Def. "XML". 54 entrez_query Entrez query to limit Blast search 55 hitlist_size Number of hits to return. Default 50 56 57 This function does no checking of the validity of the parameters 58 and passes the values to the server as is. More help is available at: 59 http://www.ncbi.nlm.nih.gov/BLAST/blast_overview.html 60 61 """ 62 import urllib, urllib2 63 import time 64 65 assert program in ['blastn', 'blastp', 'blastx', 'tblastn', 'tblastx'] 66 67 # Format the "Put" command, which sends search requests to qblast. 68 # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node5.html on 9 July 2007 69 parameters = [ 70 ('AUTO_FORMAT',auto_format), 71 ('COMPOSITION_BASED_STATISTICS',composition_based_statistics), 72 ('DATABASE',database), 73 ('DB_GENETIC_CODE',db_genetic_code), 74 ('ENDPOINTS',endpoints), 75 ('ENTREZ_QUERY',entrez_query), 76 ('EXPECT',expect), 77 ('FILTER',filter), 78 ('GAPCOSTS',gapcosts), 79 ('GENETIC_CODE',genetic_code), 80 ('HITLIST_SIZE',hitlist_size), 81 ('I_THRESH',i_thresh), 82 ('LAYOUT',layout), 83 ('LCASE_MASK',lcase_mask), 84 ('MATRIX_NAME',matrix_name), 85 ('NUCL_PENALTY',nucl_penalty), 86 ('NUCL_REWARD',nucl_reward), 87 ('OTHER_ADVANCED',other_advanced), 88 ('PERC_IDENT',perc_ident), 89 ('PHI_PATTERN',phi_pattern), 90 ('PROGRAM',program), 91 ('QUERY',sequence), 92 ('QUERY_FILE',query_file), 93 ('QUERY_BELIEVE_DEFLINE',query_believe_defline), 94 ('QUERY_FROM',query_from), 95 ('QUERY_TO',query_to), 96 ('SEARCHSP_EFF',searchsp_eff), 97 ('SERVICE',service), 98 ('THRESHOLD',threshold), 99 ('UNGAPPED_ALIGNMENT',ungapped_alignment), 100 ('WORD_SIZE',word_size), 101 ('CMD', 'Put'), 102 ] 103 query = [x for x in parameters if x[1] is not None] 104 message = urllib.urlencode(query) 105 106 # Send off the initial query to qblast. 107 # Note the NCBI do not currently impose a rate limit here, other 108 # than the request not to make say 50 queries at once using multiple 109 # threads. 110 request = urllib2.Request("http://blast.ncbi.nlm.nih.gov/Blast.cgi", 111 message, 112 {"User-Agent":"BiopythonClient"}) 113 handle = urllib2.urlopen(request) 114 115 # Format the "Get" command, which gets the formatted results from qblast 116 # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node6.html on 9 July 2007 117 rid, rtoe = _parse_qblast_ref_page(handle) 118 parameters = [ 119 ('ALIGNMENTS',alignments), 120 ('ALIGNMENT_VIEW',alignment_view), 121 ('DESCRIPTIONS',descriptions), 122 ('ENTREZ_LINKS_NEW_WINDOW',entrez_links_new_window), 123 ('EXPECT_LOW',expect_low), 124 ('EXPECT_HIGH',expect_high), 125 ('FORMAT_ENTREZ_QUERY',format_entrez_query), 126 ('FORMAT_OBJECT',format_object), 127 ('FORMAT_TYPE',format_type), 128 ('NCBI_GI',ncbi_gi), 129 ('RID',rid), 130 ('RESULTS_FILE',results_file), 131 ('SERVICE',service), 132 ('SHOW_OVERVIEW',show_overview), 133 ('CMD', 'Get'), 134 ] 135 query = [x for x in parameters if x[1] is not None] 136 message = urllib.urlencode(query) 137 138 # Poll NCBI until the results are ready. Use a 3 second wait 139 delay = 3.0 140 previous = time.time() 141 while True: 142 current = time.time() 143 wait = previous + delay - current 144 if wait > 0: 145 time.sleep(wait) 146 previous = current + wait 147 else: 148 previous = current 149 150 request = urllib2.Request("http://blast.ncbi.nlm.nih.gov/Blast.cgi", 151 message, 152 {"User-Agent":"BiopythonClient"}) 153 handle = urllib2.urlopen(request) 154 results = handle.read() 155 # Can see an "\n\n" page while results are in progress, 156 # if so just wait a bit longer... 157 if results=="\n\n": 158 continue 159 # XML results don't have the Status tag when finished 160 if results.find("Status=") < 0: 161 break 162 i = results.index("Status=") 163 j = results.index("\n", i) 164 status = results[i+len("Status="):j].strip() 165 if status.upper() == "READY": 166 break 167 168 return StringIO.StringIO(results)
169
170 -def _parse_qblast_ref_page(handle):
171 """Extract a tuple of RID, RTOE from the 'please wait' page (PRIVATE). 172 173 The NCBI FAQ pages use TOE for 'Time of Execution', so RTOE is proably 174 'Request Time of Execution' and RID would be 'Request Identifier'. 175 """ 176 s = handle.read() 177 i = s.find("RID =") 178 if i == -1: 179 rid = None 180 else: 181 j = s.find("\n", i) 182 rid = s[i+len("RID ="):j].strip() 183 184 i = s.find("RTOE =") 185 if i == -1: 186 rtoe = None 187 else: 188 j = s.find("\n", i) 189 rtoe = s[i+len("RTOE ="):j].strip() 190 191 if not rid and not rtoe: 192 #Can we reliably extract the error message from the HTML page? 193 #e.g. "Message ID#24 Error: Failed to read the Blast query: 194 # Nucleotide FASTA provided for protein sequence" 195 #or "Message ID#32 Error: Query contains no data: Query 196 # contains no sequence data" 197 # 198 #This used to occur inside a <div class="error msInf"> entry: 199 i = s.find('<div class="error msInf">') 200 if i != -1: 201 msg = s[i+len('<div class="error msInf">'):].strip() 202 msg = msg.split("</div>",1)[0].split("\n",1)[0].strip() 203 if msg: 204 raise ValueError("Error message from NCBI: %s" % msg) 205 #In spring 2010 the markup was like this: 206 i = s.find('<p class="error">') 207 if i != -1: 208 msg = s[i+len('<p class="error">'):].strip() 209 msg = msg.split("</p>",1)[0].split("\n",1)[0].strip() 210 if msg: 211 raise ValueError("Error message from NCBI: %s" % msg) 212 #Generic search based on the way the error messages start: 213 i = s.find('Message ID#') 214 if i != -1: 215 #Break the message at the first HTML tag 216 msg = s[i:].split("<",1)[0].split("\n",1)[0].strip() 217 raise ValueError("Error message from NCBI: %s" % msg) 218 #We didn't recognise the error layout :( 219 #print s 220 raise ValueError("No RID and no RTOE found in the 'please wait' page, " 221 "there was probably an error in your request but we " 222 "could not extract a helpful error message.") 223 elif not rid: 224 #Can this happen? 225 raise ValueError("No RID found in the 'please wait' page." 226 " (although RTOE = %s)" % repr(rtoe)) 227 elif not rtoe: 228 #Can this happen? 229 raise ValueError("No RTOE found in the 'please wait' page." 230 " (although RID = %s)" % repr(rid)) 231 232 try: 233 return rid, int(rtoe) 234 except ValueError: 235 raise ValueError("A non-integer RTOE found in " \ 236 +"the 'please wait' page, %s" % repr(rtoe))
237