1
2
3
4
5
6
7
8
9
10 """Connect with a BioSQL database and load Biopython like objects from it.
11
12 This provides interfaces for loading biological objects from a relational
13 database, and is compatible with the BioSQL standards.
14 """
15 import BioSeq
16 import Loader
17 import DBUtils
18
19 _POSTGRES_RULES_PRESENT = False
20
22 """Main interface for loading a existing BioSQL-style database.
23
24 This function is the easiest way to retrieve a connection to a
25 database, doing something like:
26
27 >>> from BioSeq import BioSeqDatabase
28 >>> server = BioSeqDatabase.open_database(user = "root", db="minidb")
29
30 the various options are:
31 driver -> The name of the database driver to use for connecting. The
32 driver should implement the python DB API. By default, the MySQLdb
33 driver is used.
34 user -> the username to connect to the database with.
35 password, passwd -> the password to connect with
36 host -> the hostname of the database
37 database or db -> the name of the database
38 """
39 module = __import__(driver)
40 connect = getattr(module, "connect")
41
42
43 kw = kwargs.copy()
44 if driver == "MySQLdb":
45 if "database" in kw:
46 kw["db"] = kw["database"]
47 del kw["database"]
48 if "password" in kw:
49 kw["passwd"] = kw["password"]
50 del kw["password"]
51 else:
52
53 if "db" in kw:
54 kw["database"] = kw["db"]
55 del kw["db"]
56 if "passwd" in kw:
57 kw["password"] = kw["passwd"]
58 del kw["passwd"]
59 if driver in ["psycopg", "psycopg2", "pgdb"] and not kw.get("database"):
60 kw["database"] = "template1"
61
62 if driver in ["sqlite3"]:
63 conn = connect(kw["database"])
64 else:
65 try:
66 conn = connect(**kw)
67 except module.InterfaceError:
68
69
70 if "database" in kw:
71 kw["dbname"] = kw["database"]
72 del kw["database"]
73 elif "db" in kw:
74 kw["dbname"] = kw["db"]
75 del kw["db"]
76 dsn = ' '.join(['='.join(i) for i in kw.items()])
77 conn = connect(dsn)
78
79 server = DBServer(conn, module)
80
81 if driver == "psycopg":
82 import warnings
83 warnings.warn("Using BioSQL with psycopg (version one) is deprecated. "
84 "It still works for now, but we recommend you update "
85 "to using psycopg2 as a future release of Biopython "
86 "will drop support for psycop (version one).",
87 DeprecationWarning)
88
89
90
91 if driver in ["psycopg", "psycopg2", "pgdb"]:
92 sql = "SELECT ev_class FROM pg_rewrite WHERE " + \
93 "rulename='rule_bioentry_i1' OR " + \
94 "rulename='rule_bioentry_i2';"
95 if server.adaptor.execute_and_fetchall(sql):
96 import warnings
97 warnings.warn("Your BioSQL PostgreSQL schema includes some "
98 "rules currently required for bioperl-db but "
99 "which may cause problems loading data using "
100 "Biopython (see BioSQL Bug 2839). If you do not "
101 "use BioPerl, please remove these rules. "
102 "Biopython should cope with the rules present, "
103 "but with a performance penalty when loading "
104 "new records.")
105 global _POSTGRES_RULES_PRESENT
106 _POSTGRES_RULES_PRESENT = True
107
108 return server
109
111 - def __init__(self, conn, module, module_name=None):
112 self.module = module
113 if module_name is None:
114 module_name = module.__name__
115 self.adaptor = Adaptor(conn, DBUtils.get_dbutils(module_name))
116 self.module_name = module_name
117
119 return self.__class__.__name__ + "(%r)" % self.adaptor.conn
128
135
136 - def new_database(self, db_name, authority=None, description=None):
137 """Add a new database to the server and return it.
138 """
139
140 sql = r"INSERT INTO biodatabase (name, authority, description)" \
141 r" VALUES (%s, %s, %s)"
142 self.adaptor.execute(sql, (db_name,authority, description))
143 return BioSeqDatabase(self.adaptor, db_name)
144
146 """Load a database schema into the given database.
147
148 This is used to create tables, etc when a database is first created.
149 sql_file should specify the complete path to a file containing
150 SQL entries for building the tables.
151 """
152
153
154
155
156
157 sql_handle = open(sql_file, "rb")
158 sql = r""
159 for line in sql_handle.xreadlines():
160 if line.find("--") == 0:
161 pass
162 elif line.find("#") == 0:
163 pass
164 elif line.strip():
165 sql += line.strip()
166 sql += ' '
167
168
169
170
171
172 if self.module_name in ["psycopg", "psycopg2", "pgdb"]:
173 self.adaptor.cursor.execute(sql)
174
175
176 elif self.module_name in ["MySQLdb", "sqlite3"]:
177 sql_parts = sql.split(";")
178 for sql_line in sql_parts[:-1]:
179 self.adaptor.cursor.execute(sql_line)
180 else:
181 raise ValueError("Module %s not supported by the loader." %
182 (self.module_name))
183
185 """Commits the current transaction to the database."""
186 return self.adaptor.commit()
187
189 """Rolls backs the current transaction."""
190 return self.adaptor.rollback()
191
193 """Close the connection. No further activity possible."""
194 return self.adaptor.close()
195
198 self.conn = conn
199 self.cursor = conn.cursor()
200 self.dbutils = dbutils
201
204
206 """Set the autocommit mode. True values enable; False value disable."""
207 return self.dbutils.autocommit(self.conn, y)
208
210 """Commits the current transaction."""
211 return self.conn.commit()
212
214 """Rolls backs the current transaction."""
215 return self.conn.rollback()
216
218 """Close the connection. No further activity possible."""
219 return self.conn.close()
220
222 self.execute(
223 r"select biodatabase_id from biodatabase where name = %s",
224 (dbname,))
225 rv = self.cursor.fetchall()
226 if not rv:
227 raise KeyError("Cannot find biodatabase with name %r" % dbname)
228
229
230 return rv[0][0]
231
233 sql = r"select bioentry_id from bioentry where name = %s"
234 fields = [name]
235 if dbid:
236 sql += " and biodatabase_id = %s"
237 fields.append(dbid)
238 self.execute(sql, fields)
239 rv = self.cursor.fetchall()
240 if not rv:
241 raise IndexError("Cannot find display id %r" % name)
242 if len(rv) > 1:
243 raise IndexError("More than one entry with display id %r" % name)
244 return rv[0][0]
245
247 sql = r"select bioentry_id from bioentry where accession = %s"
248 fields = [name]
249 if dbid:
250 sql += " and biodatabase_id = %s"
251 fields.append(dbid)
252 self.execute(sql, fields)
253 rv = self.cursor.fetchall()
254 if not rv:
255 raise IndexError("Cannot find accession %r" % name)
256 if len(rv) > 1:
257 raise IndexError("More than one entry with accession %r" % name)
258 return rv[0][0]
259
261 sql = r"select bioentry_id from bioentry where accession = %s"
262 fields = [name]
263 if dbid:
264 sql += " and biodatabase_id = %s"
265 fields.append(dbid)
266 return self.execute_and_fetch_col0(sql, fields)
267
269 acc_version = name.split(".")
270 if len(acc_version) > 2:
271 raise IndexError("Bad version %r" % name)
272 acc = acc_version[0]
273 if len(acc_version) == 2:
274 version = acc_version[1]
275 else:
276 version = "0"
277 sql = r"SELECT bioentry_id FROM bioentry WHERE accession = %s" \
278 r" AND version = %s"
279 fields = [acc, version]
280 if dbid:
281 sql += " and biodatabase_id = %s"
282 fields.append(dbid)
283 self.execute(sql, fields)
284 rv = self.cursor.fetchall()
285 if not rv:
286 raise IndexError("Cannot find version %r" % name)
287 if len(rv) > 1:
288 raise IndexError("More than one entry with version %r" % name)
289 return rv[0][0]
290
292
293 sql = "SELECT bioentry_id FROM bioentry WHERE identifier = %s"
294 fields = [identifier]
295 if dbid:
296 sql += " and biodatabase_id = %s"
297 fields.append(dbid)
298 self.execute(sql, fields)
299 rv = self.cursor.fetchall()
300 if not rv:
301 raise IndexError("Cannot find display id %r" % identifier)
302 return rv[0][0]
303
307
308 - def list_bioentry_ids(self, dbid):
309 return self.execute_and_fetch_col0(
310 "SELECT bioentry_id FROM bioentry WHERE biodatabase_id = %s",
311 (dbid,))
312
314 return self.execute_and_fetch_col0(
315 "SELECT name FROM bioentry WHERE biodatabase_id = %s",
316 (dbid,))
317
319 """Return ids given a SQL statement to select for them.
320
321 This assumes that the given SQL does a SELECT statement that
322 returns a list of items. This parses them out of the 2D list
323 they come as and just returns them in a list.
324 """
325 return self.execute_and_fetch_col0(sql, args)
326
328 self.execute(sql, args or ())
329 rv = self.cursor.fetchall()
330 assert len(rv) == 1, "Expected 1 response, got %d" % len(rv)
331 return rv[0]
332
333 - def execute(self, sql, args=None):
334 """Just execute an sql command.
335 """
336 self.dbutils.execute(self.cursor, sql, args)
337
339 length = end - start
340
341
342
343
344
345
346
347
348
349 return str(self.execute_one(
350 """select SUBSTR(seq, %s, %s)
351 from biosequence where bioentry_id = %s""",
352 (start+1, length, seqid))[0])
353
355 self.execute(sql, args or ())
356 return [field[0] for field in self.cursor.fetchall()]
357
359 self.execute(sql, args or ())
360 return self.cursor.fetchall()
361
362 _allowed_lookups = {
363
364 'primary_id': "fetch_seqid_by_identifier",
365 'gi': "fetch_seqid_by_identifier",
366 'display_id': "fetch_seqid_by_display_id",
367 'name': "fetch_seqid_by_display_id",
368 'accession': "fetch_seqid_by_accession",
369 'version': "fetch_seqid_by_version",
370 }
371
377
379 return "BioSeqDatabase(%r, %r)" % (self.adaptor, self.name)
380
389
391 """Gets a Bio::Seq object by accession number
392
393 Example: seq = db.get_Seq_by_acc('X77802')
394
395 """
396 seqid = self.adaptor.fetch_seqid_by_accession(self.dbid, name)
397 return BioSeq.DBSeqRecord(self.adaptor, seqid)
398
400 """Gets a Bio::Seq object by version number
401
402 Example: seq = db.get_Seq_by_ver('X77802.1')
403
404 """
405 seqid = self.adaptor.fetch_seqid_by_version(self.dbid, name)
406 return BioSeq.DBSeqRecord(self.adaptor, seqid)
407
409 """Gets a *list* of Bio::Seq objects by accession number
410
411 Example: seqs = db.get_Seq_by_acc('X77802')
412
413 """
414 seqids = self.adaptor.fetch_seqids_by_accession(self.dbid, name)
415 return [BioSeq.DBSeqRecord(self.adaptor, seqid) for seqid in seqids]
416
418
419
420
421
422 raise NotImplementedError("waiting for Python 2.2's iter")
423
425 """Array of all the primary_ids of the sequences in the database.
426
427 These maybe ids (display style) or accession numbers or
428 something else completely different - they *are not*
429 meaningful outside of this database implementation.
430 """
431 return self.adaptor.list_bioentry_ids(self.dbid)
432
441
443 if len(kwargs) != 1:
444 raise TypeError("single key/value parameter expected")
445 k, v = kwargs.items()[0]
446 if k not in _allowed_lookups:
447 raise TypeError("lookup() expects one of %s, not %r" % \
448 (repr(_allowed_lookups.keys())[1:-1], repr(k)))
449 lookup_name = _allowed_lookups[k]
450 lookup_func = getattr(self.adaptor, lookup_name)
451 seqid = lookup_func(self.dbid, v)
452 return BioSeq.DBSeqRecord(self.adaptor, seqid)
453
455 """Gets a Bio::Seq object by the primary (internal) id.
456
457 The primary id in these cases has to come from
458 $db->get_all_primary_ids. There is no other way to get (or
459 guess) the primary_ids in a database.
460 """
461 return self[seqid]
462
463 - def load(self, record_iterator, fetch_NCBI_taxonomy=False):
464 """Load a set of SeqRecords into the BioSQL database.
465
466 record_iterator is either a list of SeqRecord objects, or an
467 Iterator object that returns SeqRecord objects (such as the
468 output from the Bio.SeqIO.parse() function), which will be
469 used to populate the database.
470
471 fetch_NCBI_taxonomy is boolean flag allowing or preventing
472 connection to the taxonomic database on the NCBI server
473 (via Bio.Entrez) to fetch a detailed taxonomy for each
474 SeqRecord.
475
476 Example:
477 from Bio import SeqIO
478 count = db.load(SeqIO.parse(open(filename), format))
479
480 Returns the number of records loaded.
481 """
482 db_loader = Loader.DatabaseLoader(self.adaptor, self.dbid, \
483 fetch_NCBI_taxonomy)
484 num_records = 0
485 global _POSTGRES_RULES_PRESENT
486 for cur_record in record_iterator:
487 num_records += 1
488
489
490 if _POSTGRES_RULES_PRESENT:
491
492 if cur_record.id.count(".") == 1:
493 accession, version = cur_record.id.split('.')
494 try:
495 version = int(version)
496 except ValueError:
497 accession = cur_record.id
498 version = 0
499 else:
500 accession = cur_record.id
501 version = 0
502 gi = cur_record.annotations.get("gi", None)
503 sql = "SELECT bioentry_id FROM bioentry WHERE (identifier " + \
504 "= '%s' AND biodatabase_id = '%s') OR (accession = " + \
505 "'%s' AND version = '%s' AND biodatabase_id = '%s')"
506 self.adaptor.execute(sql % (gi, self.dbid, accession, version, self.dbid))
507 if self.adaptor.cursor.fetchone():
508 try:
509 raise self.adaptor.conn.IntegrityError("Duplicate record "
510 "detected: record has not been inserted")
511 except AttributeError:
512 import psycopg
513 raise psycopg.IntegrityError("Psycopg1: Duplicate record "
514 "detected: record has not been inserted")
515
516 db_loader.load_seqrecord(cur_record)
517 return num_records
518