Package BioSQL :: Module BioSeqDatabase
[hide private]
[frames] | no frames]

Source Code for Module BioSQL.BioSeqDatabase

  1  # Copyright 2002 by Andrew Dalke.  All rights reserved. 
  2  # Revisions 2007-2009 copyright by Peter Cock.  All rights reserved. 
  3  # Revisions 2009 copyright by Cymon J. Cox.  All rights reserved. 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7  # 
  8  # Note that BioSQL (including the database schema and scripts) is 
  9  # available and licensed separately.  Please consult www.biosql.org 
 10  """Connect with a BioSQL database and load Biopython like objects from it. 
 11   
 12  This provides interfaces for loading biological objects from a relational 
 13  database, and is compatible with the BioSQL standards. 
 14  """ 
 15  import BioSeq 
 16  import Loader 
 17  import DBUtils 
 18   
 19  _POSTGRES_RULES_PRESENT = False # Hack for BioSQL Bug 2839 
 20   
21 -def open_database(driver = "MySQLdb", **kwargs):
22 """Main interface for loading a existing BioSQL-style database. 23 24 This function is the easiest way to retrieve a connection to a 25 database, doing something like: 26 27 >>> from BioSeq import BioSeqDatabase 28 >>> server = BioSeqDatabase.open_database(user = "root", db="minidb") 29 30 the various options are: 31 driver -> The name of the database driver to use for connecting. The 32 driver should implement the python DB API. By default, the MySQLdb 33 driver is used. 34 user -> the username to connect to the database with. 35 password, passwd -> the password to connect with 36 host -> the hostname of the database 37 database or db -> the name of the database 38 """ 39 module = __import__(driver) 40 connect = getattr(module, "connect") 41 42 # Different drivers use different keywords... 43 kw = kwargs.copy() 44 if driver == "MySQLdb": 45 if "database" in kw: 46 kw["db"] = kw["database"] 47 del kw["database"] 48 if "password" in kw: 49 kw["passwd"] = kw["password"] 50 del kw["password"] 51 else: 52 # DB-API recommendations 53 if "db" in kw: 54 kw["database"] = kw["db"] 55 del kw["db"] 56 if "passwd" in kw: 57 kw["password"] = kw["passwd"] 58 del kw["passwd"] 59 if driver in ["psycopg", "psycopg2", "pgdb"] and not kw.get("database"): 60 kw["database"] = "template1" 61 # SQLite connect takes the database name as input 62 if driver in ["sqlite3"]: 63 conn = connect(kw["database"]) 64 else: 65 try: 66 conn = connect(**kw) 67 except module.InterfaceError: 68 # Ok, so let's try building a DSN 69 # (older releases of psycopg need this) 70 if "database" in kw: 71 kw["dbname"] = kw["database"] 72 del kw["database"] 73 elif "db" in kw: 74 kw["dbname"] = kw["db"] 75 del kw["db"] 76 dsn = ' '.join(['='.join(i) for i in kw.items()]) 77 conn = connect(dsn) 78 79 server = DBServer(conn, module) 80 81 if driver == "psycopg": 82 import warnings 83 warnings.warn("Using BioSQL with psycopg (version one) is deprecated. " 84 "It still works for now, but we recommend you update " 85 "to using psycopg2 as a future release of Biopython " 86 "will drop support for psycop (version one).", 87 DeprecationWarning) 88 89 # TODO - Remove the following once BioSQL Bug 2839 is fixed. 90 # Test for RULES in PostgreSQL schema, see also Bug 2833. 91 if driver in ["psycopg", "psycopg2", "pgdb"]: 92 sql = "SELECT ev_class FROM pg_rewrite WHERE " + \ 93 "rulename='rule_bioentry_i1' OR " + \ 94 "rulename='rule_bioentry_i2';" 95 if server.adaptor.execute_and_fetchall(sql): 96 import warnings 97 warnings.warn("Your BioSQL PostgreSQL schema includes some " 98 "rules currently required for bioperl-db but " 99 "which may cause problems loading data using " 100 "Biopython (see BioSQL Bug 2839). If you do not " 101 "use BioPerl, please remove these rules. " 102 "Biopython should cope with the rules present, " 103 "but with a performance penalty when loading " 104 "new records.") 105 global _POSTGRES_RULES_PRESENT 106 _POSTGRES_RULES_PRESENT = True 107 108 return server
109
110 -class DBServer:
111 - def __init__(self, conn, module, module_name=None):
112 self.module = module 113 if module_name is None: 114 module_name = module.__name__ 115 self.adaptor = Adaptor(conn, DBUtils.get_dbutils(module_name)) 116 self.module_name = module_name
117
118 - def __repr__(self):
119 return self.__class__.__name__ + "(%r)" % self.adaptor.conn
120 - def __getitem__(self, name):
121 return BioSeqDatabase(self.adaptor, name)
122 - def keys(self):
123 return self.adaptor.list_biodatabase_names()
124 - def values(self):
125 return [self[key] for key in self.keys()]
126 - def items(self):
127 return [(key, self[key]) for key in self.keys()]
128
129 - def remove_database(self, db_name):
130 """Try to remove all references to items in a database. 131 """ 132 db_id = self.adaptor.fetch_dbid_by_dbname(db_name) 133 remover = Loader.DatabaseRemover(self.adaptor, db_id) 134 remover.remove()
135
136 - def new_database(self, db_name, authority=None, description=None):
137 """Add a new database to the server and return it. 138 """ 139 # make the database 140 sql = r"INSERT INTO biodatabase (name, authority, description)" \ 141 r" VALUES (%s, %s, %s)" 142 self.adaptor.execute(sql, (db_name,authority, description)) 143 return BioSeqDatabase(self.adaptor, db_name)
144
145 - def load_database_sql(self, sql_file):
146 """Load a database schema into the given database. 147 148 This is used to create tables, etc when a database is first created. 149 sql_file should specify the complete path to a file containing 150 SQL entries for building the tables. 151 """ 152 # Not sophisticated enough for PG schema. Is it needed by MySQL? 153 # Looks like we need this more complicated way for both. Leaving it 154 # the default and removing the simple-minded approach. 155 156 # read the file with all comment lines removed 157 sql_handle = open(sql_file, "rb") 158 sql = r"" 159 for line in sql_handle.xreadlines(): 160 if line.find("--") == 0: # don't include comment lines 161 pass 162 elif line.find("#") == 0: # ditto for MySQL comments 163 pass 164 elif line.strip(): # only include non-blank lines 165 sql += line.strip() 166 sql += ' ' 167 168 # two ways to load the SQL 169 # 1. PostgreSQL can load it all at once and actually needs to 170 # due to FUNCTION defines at the end of the SQL which mess up 171 # the splitting by semicolons 172 if self.module_name in ["psycopg", "psycopg2", "pgdb"]: 173 self.adaptor.cursor.execute(sql) 174 # 2. MySQL needs the database loading split up into single lines of 175 # SQL executed one at a time 176 elif self.module_name in ["MySQLdb", "sqlite3"]: 177 sql_parts = sql.split(";") # one line per sql command 178 for sql_line in sql_parts[:-1]: # don't use the last item, it's blank 179 self.adaptor.cursor.execute(sql_line) 180 else: 181 raise ValueError("Module %s not supported by the loader." % 182 (self.module_name))
183
184 - def commit(self):
185 """Commits the current transaction to the database.""" 186 return self.adaptor.commit()
187
188 - def rollback(self):
189 """Rolls backs the current transaction.""" 190 return self.adaptor.rollback()
191
192 - def close(self):
193 """Close the connection. No further activity possible.""" 194 return self.adaptor.close()
195
196 -class Adaptor:
197 - def __init__(self, conn, dbutils):
198 self.conn = conn 199 self.cursor = conn.cursor() 200 self.dbutils = dbutils
201
202 - def last_id(self, table):
203 return self.dbutils.last_id(self.cursor, table)
204
205 - def autocommit(self, y=True):
206 """Set the autocommit mode. True values enable; False value disable.""" 207 return self.dbutils.autocommit(self.conn, y)
208
209 - def commit(self):
210 """Commits the current transaction.""" 211 return self.conn.commit()
212
213 - def rollback(self):
214 """Rolls backs the current transaction.""" 215 return self.conn.rollback()
216
217 - def close(self):
218 """Close the connection. No further activity possible.""" 219 return self.conn.close()
220
221 - def fetch_dbid_by_dbname(self, dbname):
222 self.execute( 223 r"select biodatabase_id from biodatabase where name = %s", 224 (dbname,)) 225 rv = self.cursor.fetchall() 226 if not rv: 227 raise KeyError("Cannot find biodatabase with name %r" % dbname) 228 # Cannot happen (UK) 229 ## assert len(rv) == 1, "More than one biodatabase with name %r" % dbname 230 return rv[0][0]
231
232 - def fetch_seqid_by_display_id(self, dbid, name):
233 sql = r"select bioentry_id from bioentry where name = %s" 234 fields = [name] 235 if dbid: 236 sql += " and biodatabase_id = %s" 237 fields.append(dbid) 238 self.execute(sql, fields) 239 rv = self.cursor.fetchall() 240 if not rv: 241 raise IndexError("Cannot find display id %r" % name) 242 if len(rv) > 1: 243 raise IndexError("More than one entry with display id %r" % name) 244 return rv[0][0]
245
246 - def fetch_seqid_by_accession(self, dbid, name):
247 sql = r"select bioentry_id from bioentry where accession = %s" 248 fields = [name] 249 if dbid: 250 sql += " and biodatabase_id = %s" 251 fields.append(dbid) 252 self.execute(sql, fields) 253 rv = self.cursor.fetchall() 254 if not rv: 255 raise IndexError("Cannot find accession %r" % name) 256 if len(rv) > 1: 257 raise IndexError("More than one entry with accession %r" % name) 258 return rv[0][0]
259
260 - def fetch_seqids_by_accession(self, dbid, name):
261 sql = r"select bioentry_id from bioentry where accession = %s" 262 fields = [name] 263 if dbid: 264 sql += " and biodatabase_id = %s" 265 fields.append(dbid) 266 return self.execute_and_fetch_col0(sql, fields)
267
268 - def fetch_seqid_by_version(self, dbid, name):
269 acc_version = name.split(".") 270 if len(acc_version) > 2: 271 raise IndexError("Bad version %r" % name) 272 acc = acc_version[0] 273 if len(acc_version) == 2: 274 version = acc_version[1] 275 else: 276 version = "0" 277 sql = r"SELECT bioentry_id FROM bioentry WHERE accession = %s" \ 278 r" AND version = %s" 279 fields = [acc, version] 280 if dbid: 281 sql += " and biodatabase_id = %s" 282 fields.append(dbid) 283 self.execute(sql, fields) 284 rv = self.cursor.fetchall() 285 if not rv: 286 raise IndexError("Cannot find version %r" % name) 287 if len(rv) > 1: 288 raise IndexError("More than one entry with version %r" % name) 289 return rv[0][0]
290
291 - def fetch_seqid_by_identifier(self, dbid, identifier):
292 # YB: was fetch_seqid_by_seqid 293 sql = "SELECT bioentry_id FROM bioentry WHERE identifier = %s" 294 fields = [identifier] 295 if dbid: 296 sql += " and biodatabase_id = %s" 297 fields.append(dbid) 298 self.execute(sql, fields) 299 rv = self.cursor.fetchall() 300 if not rv: 301 raise IndexError("Cannot find display id %r" % identifier) 302 return rv[0][0]
303
304 - def list_biodatabase_names(self):
305 return self.execute_and_fetch_col0( 306 "SELECT name FROM biodatabase")
307
308 - def list_bioentry_ids(self, dbid):
309 return self.execute_and_fetch_col0( 310 "SELECT bioentry_id FROM bioentry WHERE biodatabase_id = %s", 311 (dbid,))
312
313 - def list_bioentry_display_ids(self, dbid):
314 return self.execute_and_fetch_col0( 315 "SELECT name FROM bioentry WHERE biodatabase_id = %s", 316 (dbid,))
317
318 - def list_any_ids(self, sql, args):
319 """Return ids given a SQL statement to select for them. 320 321 This assumes that the given SQL does a SELECT statement that 322 returns a list of items. This parses them out of the 2D list 323 they come as and just returns them in a list. 324 """ 325 return self.execute_and_fetch_col0(sql, args)
326
327 - def execute_one(self, sql, args=None):
328 self.execute(sql, args or ()) 329 rv = self.cursor.fetchall() 330 assert len(rv) == 1, "Expected 1 response, got %d" % len(rv) 331 return rv[0]
332
333 - def execute(self, sql, args=None):
334 """Just execute an sql command. 335 """ 336 self.dbutils.execute(self.cursor, sql, args)
337
338 - def get_subseq_as_string(self, seqid, start, end):
339 length = end - start 340 # XXX Check this on MySQL and PostgreSQL. substr should be general, 341 # does it need dbutils? 342 #return self.execute_one( 343 # """select SUBSTRING(seq FROM %s FOR %s) 344 # from biosequence where bioentry_id = %s""", 345 # (start+1, length, seqid))[0] 346 # 347 # Convert to a string on returning for databases that give back 348 # unicode. Shouldn't need unicode for sequences so this seems safe. 349 return str(self.execute_one( 350 """select SUBSTR(seq, %s, %s) 351 from biosequence where bioentry_id = %s""", 352 (start+1, length, seqid))[0])
353
354 - def execute_and_fetch_col0(self, sql, args=None):
355 self.execute(sql, args or ()) 356 return [field[0] for field in self.cursor.fetchall()]
357
358 - def execute_and_fetchall(self, sql, args=None):
359 self.execute(sql, args or ()) 360 return self.cursor.fetchall()
361 362 _allowed_lookups = { 363 # Lookup name / function name to get id, function to list all ids 364 'primary_id': "fetch_seqid_by_identifier", 365 'gi': "fetch_seqid_by_identifier", 366 'display_id': "fetch_seqid_by_display_id", 367 'name': "fetch_seqid_by_display_id", 368 'accession': "fetch_seqid_by_accession", 369 'version': "fetch_seqid_by_version", 370 } 371
372 -class BioSeqDatabase:
373 - def __init__(self, adaptor, name):
374 self.adaptor = adaptor 375 self.name = name 376 self.dbid = self.adaptor.fetch_dbid_by_dbname(name)
377
378 - def __repr__(self):
379 return "BioSeqDatabase(%r, %r)" % (self.adaptor, self.name)
380
381 - def get_Seq_by_id(self, name):
382 """Gets a Bio::Seq object by its name 383 384 Example: seq = db.get_Seq_by_id('ROA1_HUMAN') 385 386 """ 387 seqid = self.adaptor.fetch_seqid_by_display_id(self.dbid, name) 388 return BioSeq.DBSeqRecord(self.adaptor, seqid)
389
390 - def get_Seq_by_acc(self, name):
391 """Gets a Bio::Seq object by accession number 392 393 Example: seq = db.get_Seq_by_acc('X77802') 394 395 """ 396 seqid = self.adaptor.fetch_seqid_by_accession(self.dbid, name) 397 return BioSeq.DBSeqRecord(self.adaptor, seqid)
398
399 - def get_Seq_by_ver(self, name):
400 """Gets a Bio::Seq object by version number 401 402 Example: seq = db.get_Seq_by_ver('X77802.1') 403 404 """ 405 seqid = self.adaptor.fetch_seqid_by_version(self.dbid, name) 406 return BioSeq.DBSeqRecord(self.adaptor, seqid)
407
408 - def get_Seqs_by_acc(self, name):
409 """Gets a *list* of Bio::Seq objects by accession number 410 411 Example: seqs = db.get_Seq_by_acc('X77802') 412 413 """ 414 seqids = self.adaptor.fetch_seqids_by_accession(self.dbid, name) 415 return [BioSeq.DBSeqRecord(self.adaptor, seqid) for seqid in seqids]
416
417 - def get_PrimarySeq_stream(self):
418 # my @array = $self->get_all_primary_ids; 419 # my $stream = Bio::DB::BioDatabasePSeqStream->new( 420 # -adaptor => $self->_adaptor->db->get_PrimarySeqAdaptor, 421 # -idlist => \@array); 422 raise NotImplementedError("waiting for Python 2.2's iter")
423
424 - def get_all_primary_ids(self):
425 """Array of all the primary_ids of the sequences in the database. 426 427 These maybe ids (display style) or accession numbers or 428 something else completely different - they *are not* 429 meaningful outside of this database implementation. 430 """ 431 return self.adaptor.list_bioentry_ids(self.dbid)
432
433 - def __getitem__(self, key):
434 return BioSeq.DBSeqRecord(self.adaptor, key)
435 - def keys(self):
436 return self.get_all_primary_ids()
437 - def values(self):
438 return [self[key] for key in self.keys()]
439 - def items(self):
440 return [(key, self[key]) for key in self.keys()]
441
442 - def lookup(self, **kwargs):
443 if len(kwargs) != 1: 444 raise TypeError("single key/value parameter expected") 445 k, v = kwargs.items()[0] 446 if k not in _allowed_lookups: 447 raise TypeError("lookup() expects one of %s, not %r" % \ 448 (repr(_allowed_lookups.keys())[1:-1], repr(k))) 449 lookup_name = _allowed_lookups[k] 450 lookup_func = getattr(self.adaptor, lookup_name) 451 seqid = lookup_func(self.dbid, v) 452 return BioSeq.DBSeqRecord(self.adaptor, seqid)
453
454 - def get_Seq_by_primary_id(self, seqid):
455 """Gets a Bio::Seq object by the primary (internal) id. 456 457 The primary id in these cases has to come from 458 $db->get_all_primary_ids. There is no other way to get (or 459 guess) the primary_ids in a database. 460 """ 461 return self[seqid]
462
463 - def load(self, record_iterator, fetch_NCBI_taxonomy=False):
464 """Load a set of SeqRecords into the BioSQL database. 465 466 record_iterator is either a list of SeqRecord objects, or an 467 Iterator object that returns SeqRecord objects (such as the 468 output from the Bio.SeqIO.parse() function), which will be 469 used to populate the database. 470 471 fetch_NCBI_taxonomy is boolean flag allowing or preventing 472 connection to the taxonomic database on the NCBI server 473 (via Bio.Entrez) to fetch a detailed taxonomy for each 474 SeqRecord. 475 476 Example: 477 from Bio import SeqIO 478 count = db.load(SeqIO.parse(open(filename), format)) 479 480 Returns the number of records loaded. 481 """ 482 db_loader = Loader.DatabaseLoader(self.adaptor, self.dbid, \ 483 fetch_NCBI_taxonomy) 484 num_records = 0 485 global _POSTGRES_RULES_PRESENT 486 for cur_record in record_iterator: 487 num_records += 1 488 #Hack to work arround BioSQL Bug 2839 - If using PostgreSQL and 489 #the RULES are present check for a duplicate record before loading 490 if _POSTGRES_RULES_PRESENT: 491 #Recreate what the Loader's _load_bioentry_table will do: 492 if cur_record.id.count(".") == 1: 493 accession, version = cur_record.id.split('.') 494 try: 495 version = int(version) 496 except ValueError: 497 accession = cur_record.id 498 version = 0 499 else: 500 accession = cur_record.id 501 version = 0 502 gi = cur_record.annotations.get("gi", None) 503 sql = "SELECT bioentry_id FROM bioentry WHERE (identifier " + \ 504 "= '%s' AND biodatabase_id = '%s') OR (accession = " + \ 505 "'%s' AND version = '%s' AND biodatabase_id = '%s')" 506 self.adaptor.execute(sql % (gi, self.dbid, accession, version, self.dbid)) 507 if self.adaptor.cursor.fetchone(): 508 try: 509 raise self.adaptor.conn.IntegrityError("Duplicate record " 510 "detected: record has not been inserted") 511 except AttributeError: #psycopg version 1 512 import psycopg 513 raise psycopg.IntegrityError("Psycopg1: Duplicate record " 514 "detected: record has not been inserted") 515 #End of hack 516 db_loader.load_seqrecord(cur_record) 517 return num_records
518