Package Bio :: Package GFF :: Module easy
[hide private]
[frames] | no frames]

Source Code for Module Bio.GFF.easy

  1  #!/usr/bin/env python 
  2  # 
  3  # Copyright 2002 by Michael Hoffman.  All rights reserved. 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7   
  8  """Bio.GFF.easy: some functions to ease the use of Biopython (DEPRECATED) 
  9   
 10  This is part of the "old" Bio.GFF module by Michael Hoffman, which offered 
 11  access to a MySQL database holding GFF data loaded by BioPerl. This code has 
 12  now been deprecated, and will probably be removed in order to free the Bio.GFF 
 13  namespace for a new GFF parser in Biopython (including GFF3 support). 
 14   
 15  Some of the more useful ideas of Bio.GFF.easy may be reworked for Bio.GenBank, 
 16  using the standard SeqFeature objects used elsewhere in Biopython. 
 17  """ 
 18   
 19  import copy 
 20  import re 
 21  import sys 
 22   
 23  import Bio 
 24  from Bio import GenBank 
 25  from Bio.Data import IUPACData 
 26  from Bio.Seq import Seq 
 27   
 28  from Bio import SeqIO 
 29  from Bio import SeqUtils 
 30   
 31  import GenericTools 
 32   
33 -class FeatureDict(dict):
34 """ JH: accessing feature.qualifiers as a list is stupid. Here's a dict that does it"""
35 - def __init__(self, feature_list, default=None):
36 dict.__init__(self) 37 self.default = default 38 key_re = re.compile(r'/(\S+)=') 39 40 for i in feature_list: 41 key = key_re.match(i.key).group(1) 42 val = i.value.replace('"','') 43 self[key] = val
44 - def __getitem__(self, key):
45 try: 46 return dict.__getitem__(self, key) 47 except KeyError: 48 return self.default
49
50 -class Location(GenericTools.VerboseList):
51 """ 52 this is really best interfaced through LocationFromString 53 fuzzy: < or > 54 join: {0 = no join, 1 = join, 2 = order} 55 56 >>> location = Location([Location([339]), Location([564])]) # zero-based 57 >>> location 58 Location(Location(339), Location(564)) 59 >>> print location # one-based 60 340..565 61 >>> print location.five_prime() 62 340 63 >>> location_rev = Location([Location([339]), Location([564])], 1) 64 >>> print location_rev 65 complement(340..565) 66 >>> print location_rev.five_prime() 67 565 68 """
69 - def __init__(self, the_list, complement=0, seqname=None):
70 self.complement = complement 71 self.join = 0 72 self.fuzzy = None 73 self.seqname = seqname 74 list.__init__(self, the_list)
75
76 - def _joinstr(self):
77 if self.join == 1: 78 label = 'join' 79 elif self.join == 2: 80 label = 'order' 81 return "%s(%s)" % (label, ",".join(map(str, self)))
82
83 - def __str__(self):
84 if self.seqname: 85 format = "%s:%%s" % self.seqname 86 else: 87 format = "%s" 88 89 if self.complement: 90 format = format % "complement(%s)" 91 92 if self.join: 93 return format % self._joinstr() 94 95 elif isinstance(self[0], list): 96 return format % "%s..%s" % (str(self[0]), str(self[1])) 97 else: 98 if self.fuzzy: 99 format = format % self.fuzzy + "%s" 100 return format % str(self[0] + 1)
101
102 - def __repr__(self):
103 return "Location(%s)" % ", ".join(map(repr, self))
104 105 direction2index = {1: 0, -1: -1}
106 - def direction_and_index(self, direction):
107 """ 108 1: 5' 109 -1: 3' 110 111 >>> loc1 = LocationFromString("join(1..3,complement(5..6))") 112 >>> loc1.direction_and_index(1) 113 (1, 0) 114 >>> loc1.direction_and_index(-1) 115 (-1, -1) 116 >>> loc1.reverse() 117 >>> print loc1 118 complement(join(1..3,complement(5..6))) 119 >>> loc1.direction_and_index(1) 120 (-1, -1) 121 """ 122 if self.complement: 123 direction = direction * -1 124 index = self.direction2index[direction] 125 return direction, index
126
127 - def findside(self, direction):
128 """ 129 >>> loc = LocationFromString("complement(join(1..5,complement(6..10)))") 130 >>> loc.findside(1) 131 Location(5) 132 >>> loc.findside(-1) 133 Location(0) 134 """ 135 direction, index = self.direction_and_index(direction) 136 if self.join or isinstance(self[0], list): 137 return self[index].findside(direction) 138 else: 139 return self
140
141 - def findseqname_3prime(self):
142 """ 143 >>> loc = LocationFromString("complement(join(MOOCOW:1..5,SEQ:complement(6..10)))") 144 >>> loc.findseqname_3prime() 145 'MOOCOW' 146 """ 147 return self.findseqname(-1)
148
149 - def findseqname(self, direction=1): # find 5' seqname
150 """ 151 >>> loc = LocationFromString("complement(join(MOOCOW:1..5,SEQ:complement(6..10)))") 152 >>> loc.findseqname() 153 'SEQ' 154 >>> loc.findseqname(-1) 155 'MOOCOW' 156 """ 157 direction, index = self.direction_and_index(direction) 158 if self.seqname: 159 return self.seqname 160 elif self.join: 161 return self[index].findseqname(direction) 162 else: 163 raise AttributeError('no sequence name')
164
165 - def five_prime(self):
166 return self.findside(1)
167 - def three_prime(self):
168 return self.findside(-1)
169
170 - def length(self):
171 """ 172 WARNING: doesn't deal with joins!!!! 173 """ 174 return self.end()-self.start()
175
176 - def intersection(self, other):
177 """ 178 WARNING: doesn't deal with joins!!!! 179 180 >>> location1 = LocationFromString("1..50") 181 >>> location2 = LocationFromString("25..200") 182 >>> print location1.intersection(location2) 183 25..50 184 >>> print location1.intersection(location2) 185 25..50 186 """ 187 if self.start() >= other.start(): 188 start = self.start() 189 else: 190 start = other.start() 191 if self.end() <= other.end(): 192 end = self.end() 193 else: 194 end = other.end() 195 return Location([Location([start]), Location([end])])
196
197 - def start(self):
198 # zero-based 199 if self.complement: 200 return self.three_prime()[0] 201 else: 202 return self.five_prime()[0]
203
204 - def end(self):
205 # zero-based 206 if self.complement: 207 return self.five_prime()[0] 208 else: 209 return self.three_prime()[0]
210
211 - def three_prime_range(self, window):
212 three_prime_loc = self.three_prime() 213 if self.complement: 214 return Location([three_prime_loc-window, three_prime_loc], complement=1) 215 else: 216 return Location([three_prime_loc, three_prime_loc+window])
217
218 - def sublocation(self, sub_location):
219 """ 220 >>> fwd_location = LocationFromString('X:5830132..5831528') 221 >>> print fwd_location.sublocation(LocationFromString('1..101')) 222 X:5830132..5830232 223 >>> print fwd_location.sublocation(LocationFromString('1267..1286')) 224 X:5831398..5831417 225 >>> rev_location = LocationFromString('I:complement(8415686..8416216)') 226 >>> print rev_location.sublocation(LocationFromString('1..101')) 227 I:complement(8416116..8416216) 228 >>> print rev_location.sublocation(LocationFromString('100..200')) 229 I:complement(8416017..8416117) 230 """ 231 232 absolute_location = copy.deepcopy(self) 233 for i in xrange(2): 234 absolute_location[i] = self.five_prime().add(sub_location[i], self.complement) 235 if absolute_location.complement: 236 list.reverse(absolute_location) 237 return absolute_location
238
239 - def __add__(self, addend):
240 return self.add(addend)
241
242 - def add(self, addend, complement=0):
243 self_copy = copy.deepcopy(self) 244 if isinstance(addend, Location): 245 addend = addend[0] 246 if complement: 247 addend *= -1 248 self_copy[0] += addend 249 return self_copy
250
251 - def __sub__(self, subtrahend):
252 return self + -subtrahend
253
254 - def reverse(self):
255 self.complement = [1, 0][self.complement]
256
257 - def reorient(self):
258 """ 259 >>> loc1 = LocationFromString("join(I:complement(1..9000),I:complement(9001..10000))") 260 >>> loc1.reorient() 261 >>> print loc1 262 complement(join(I:1..9000,I:9001..10000)) 263 >>> loc2 = LocationFromString("join(I:complement(1..9000),I:9001..10000)") 264 >>> loc2.reorient() 265 >>> print loc2 266 join(I:complement(1..9000),I:9001..10000) 267 """ 268 if self.join: 269 if len([x for x in self if x.complement]) == len(self): 270 self.reverse() 271 for segment in self: 272 segment.reverse()
273
274 - def bounding(self):
275 """ 276 works for single level non-complex joins 277 278 >>> LOC = LocationFromString 279 >>> l1 = LOC("join(alpha:1..30,alpha:50..70)") 280 >>> print l1.bounding() 281 join(alpha:1..70) 282 >>> l2 = LOC("join(alpha:1..30,alpha:complement(50..70))") 283 >>> print l2.bounding() 284 join(alpha:1..30,alpha:complement(50..70)) 285 >>> l3 = LOC("join(alpha:1..30,alpha:complement(50..70),beta:6..20,alpha:25..45)") 286 >>> print l3.bounding() 287 join(alpha:1..45,alpha:complement(50..70),beta:6..20) 288 289 """ 290 if not self.join: 291 return 292 293 seqdict = {} 294 seqkeys = [] 295 for subloc in self: 296 assert subloc.seqname 297 assert not subloc.join 298 try: 299 seqdict[_hashname(subloc)].append(subloc) 300 except KeyError: 301 key = _hashname(subloc) 302 seqdict[key] = [subloc] 303 seqkeys.append(key) 304 305 res = LocationJoin() 306 for key in seqkeys: 307 locations = seqdict[key] 308 coords = [] 309 for subloc in locations: 310 coords.append(subloc.start()) 311 coords.append(subloc.end()) 312 res.append(LocationFromCoords(min(coords), max(coords), locations[0].complement, locations[0].seqname)) 313 return res
314
315 -def _hashname(location):
316 return str(location.complement) + location.seqname
317
318 -class LocationJoin(Location):
319 """ 320 >>> join = LocationJoin([LocationFromCoords(339, 564, 1), LocationFromString("complement(100..339)")]) 321 >>> appendloc = LocationFromString("order(complement(66..99),complement(5..55))") 322 >>> join.append(appendloc) 323 >>> print join 324 join(complement(340..565),complement(100..339),order(complement(66..99),complement(5..55))) 325 >>> join2 = LocationJoin() 326 >>> join2.append(LocationFromString("complement(66..99)")) 327 >>> join2.append(LocationFromString("complement(5..55)")) 328 >>> print join2 329 join(complement(66..99),complement(5..55)) 330 """
331 - def __init__(self, the_list = [], complement=0, seqname=None):
332 self.complement = complement 333 self.join = 1 334 self.fuzzy = None 335 self.seqname = seqname 336 list.__init__(self, the_list)
337
338 -class LocationFromCoords(Location):
339 """ 340 >>> print LocationFromCoords(339, 564) 341 340..565 342 >>> print LocationFromCoords(339, 564, seqname="I") 343 I:340..565 344 >>> print LocationFromCoords(999, 3234, "-", seqname="NC_343434") 345 NC_343434:complement(1000..3235) 346 """
347 - def __init__(self, start, end, strand=0, seqname=None):
348 if strand == "+": 349 strand = 0 350 elif strand == "-": 351 strand = 1 352 Location.__init__(self, [Location([start]), Location([end])], strand, seqname)
353 354 # see http://www.ncbi.nlm.nih.gov/collab/FT/index.html#backus-naur 355 # for how this should actually be implemented 356 re_complement = re.compile(r"^complement\((.*)\)$") 357 re_seqname = re.compile(r"^(?!join|order|complement)([^\:]+?):(.*)$") # not every character is allowed by spec 358 re_join = re.compile(r"^(join|order)\((.*)\)$") 359 re_dotdot = re.compile(r"^([><]*\d+)\.\.([><]*\d+)$") 360 re_fuzzy = re.compile(r"^([><])(\d+)")
361 -class LocationFromString(Location):
362 """ 363 >>> # here are some tests from http://www.ncbi.nlm.nih.gov/collab/FT/index.html#location 364 >>> print LocationFromString("467") 365 467 366 >>> print LocationFromString("340..565") 367 340..565 368 >>> print LocationFromString("<345..500") 369 <345..500 370 >>> print LocationFromString("<1..888") 371 <1..888 372 >>> # (102.110) and 123^124 syntax unimplemented 373 >>> print LocationFromString("join(12..78,134..202)") 374 join(12..78,134..202) 375 >>> print LocationFromString("complement(join(2691..4571,4918..5163))") 376 complement(join(2691..4571,4918..5163)) 377 >>> print LocationFromString("join(complement(4918..5163),complement(2691..4571))") 378 join(complement(4918..5163),complement(2691..4571)) 379 >>> print LocationFromString("order(complement(4918..5163),complement(2691..4571))") 380 order(complement(4918..5163),complement(2691..4571)) 381 >>> print LocationFromString("NC_001802x.fna:73..78") 382 NC_001802x.fna:73..78 383 >>> print LocationFromString("J00194:100..202") 384 J00194:100..202 385 386 >>> print LocationFromString("join(117505..118584,1..609)") 387 join(117505..118584,1..609) 388 >>> print LocationFromString("join(test3:complement(4..6),test3:complement(1..3))") 389 join(test3:complement(4..6),test3:complement(1..3)) 390 >>> print LocationFromString("test3:join(test1:complement(1..3),4..6)") 391 test3:join(test1:complement(1..3),4..6) 392 """
393 - def __init__(self, location_str):
394 match_seqname = re_seqname.match(location_str) 395 if match_seqname: 396 self.seqname = match_seqname.group(1) 397 location_str = match_seqname.group(2) 398 else: 399 self.seqname = None 400 match_complement = re_complement.match(location_str) 401 if match_complement: 402 self.complement = 1 403 location_str = match_complement.group(1) 404 else: 405 self.complement = 0 406 match_join = re_join.match(location_str) 407 if match_join: 408 self.join = {'join':1, 'order':2}[match_join.group(1)] 409 list.__init__(self, map(lambda x: LocationFromString(x), match_join.group(2).split(","))) 410 else: 411 self.join = 0 412 match_dotdot = re_dotdot.match(location_str) 413 if match_dotdot: 414 list.__init__(self, map(lambda x: LocationFromString(match_dotdot.group(x)), (1, 2))) 415 else: 416 match_fuzzy = re_fuzzy.match(location_str) 417 if match_fuzzy: 418 self.fuzzy = match_fuzzy.group(1) 419 location_str = match_fuzzy.group(2) 420 else: 421 self.fuzzy = None 422 423 list.__init__(self, [int(location_str)-1]) # zero based, nip it in the bud
424
425 -def open_file(filename):
426 if filename: 427 return open(filename) 428 else: 429 return sys.stdin
430
431 -def fasta_single(filename=None, string=None):
432 """ 433 >>> record = fasta_single(string=\""" 434 ... >gi|9629360|ref|NP_057850.1| Gag [Human immunodeficiency virus type 1] 435 ... MGARASVLSGGELDRWEKIRLRPGGKKKYKLKHIVWASRELERFAVNPGLLETSEGCRQILGQLQPSLQT 436 ... GSEELRSLYNTVATLYCVHQRIEIKDTKEALDKIEEEQNKSKKKAQQAAADTGHSNQVSQNYPIVQNIQG 437 ... QMVHQAISPRTLNAWVKVVEEKAFSPEVIPMFSALSEGATPQDLNTMLNTVGGHQAAMQMLKETINEEAA 438 ... EWDRVHPVHAGPIAPGQMREPRGSDIAGTTSTLQEQIGWMTNNPPIPVGEIYKRWIILGLNKIVRMYSPT 439 ... SILDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQNANPDCKTILKALGPAATLEEMMTAC 440 ... QGVGGPGHKARVLAEAMSQVTNSATIMMQRGNFRNQRKIVKCFNCGKEGHTARNCRAPRKKGCWKCGKEG 441 ... HQMKDCTERQANFLGKIWPSYKGRPGNFLQSRPEPTAPPEESFRSGVETTTPPQKQEPIDKELYPLTSLR 442 ... SLFGNDPSSQ 443 ... \""") 444 >>> record.id 445 'gi|9629360|ref|NP_057850.1|' 446 >>> record.description 447 'gi|9629360|ref|NP_057850.1| Gag [Human immunodeficiency virus type 1]' 448 >>> record.seq[0:5] 449 Seq('MGARA', SingleLetterAlphabet()) 450 """ 451 #Returns the first record in a fasta file as a SeqRecord, 452 #or None if there are no records in the file. 453 if string: 454 import cStringIO 455 handle = cStringIO.StringIO(string) 456 else: 457 handle = open_file(filename) 458 try: 459 record = SeqIO.parse(handle, format="fasta").next() 460 except StopIteration: 461 record = None 462 return record
463
464 -def fasta_multi(filename=None):
465 #Simple way is just: 466 #return SeqIO.parse(open_file(filename), format="fasta") 467 #However, for backwards compatibility make sure we raise 468 #the StopIteration exception rather than returning None. 469 reader = SeqIO.parse(open_file(filename), format="fasta") 470 while True: 471 record = reader.next() 472 if record is None: 473 raise StopIteration 474 else: 475 yield record
476
477 -def fasta_readrecords(filename=None):
478 """ 479 >>> records = fasta_readrecords('GFF/multi.fna') 480 >>> records[0].id 481 'test1' 482 >>> records[2].seq 483 Seq('AAACACAC', SingleLetterAlphabet()) 484 """ 485 return list(SeqIO.parse(open_file(filename), format="fasta"))
486
487 -def fasta_write(filename, records):
488 handle = open(filename, "w") 489 SeqIO.write(records, handle, format="fasta") 490 handle.close()
491
492 -def genbank_single(filename):
493 """ 494 >>> record = genbank_single("GFF/NC_001422.gbk") 495 >>> record.taxonomy 496 ['Viruses', 'ssDNA viruses', 'Microviridae', 'Microvirus'] 497 >>> cds = record.features[-4] 498 >>> cds.key 499 'CDS' 500 >>> location = LocationFromString(cds.location) 501 >>> print location 502 2931..3917 503 >>> subseq = record_subseq(record, location) 504 >>> subseq[0:20] 505 Seq('ATGTTTGGTGCTATTGCTGG', Alphabet()) 506 """ 507 return GenBank.RecordParser().parse(open(filename))
508
509 -def record_subseq(record, location, *args, **keywds):
510 """ 511 >>> from Bio.SeqRecord import SeqRecord 512 >>> record = SeqRecord(Seq("gagttttatcgcttccatga"), 513 ... "ref|NC_001422", 514 ... "Coliphage phiX174, complete genome", 515 ... "bases 1-11") 516 >>> record_subseq(record, LocationFromString("1..4")) # one-based 517 Seq('GAGT', Alphabet()) 518 >>> record_subseq(record, LocationFromString("complement(1..4)")) # one-based 519 Seq('ACTC', Alphabet()) 520 >>> record_subseq(record, LocationFromString("join(complement(1..4),1..4)")) # what an idea! 521 Seq('ACTCGAGT', Alphabet()) 522 >>> loc = LocationFromString("complement(join(complement(5..7),1..4))") 523 >>> print loc 524 complement(join(complement(5..7),1..4)) 525 >>> record_subseq(record, loc) 526 Seq('ACTCTTT', Alphabet()) 527 >>> print loc 528 complement(join(complement(5..7),1..4)) 529 >>> loc.reverse() 530 >>> record_subseq(record, loc) 531 Seq('AAAGAGT', Alphabet()) 532 >>> record_subseq(record, loc, upper=1) 533 Seq('AAAGAGT', Alphabet()) 534 """ 535 if location.join: 536 subsequence_list = [] 537 if location.complement: 538 location_copy = copy.copy(location) 539 list.reverse(location_copy) 540 else: 541 location_copy = location 542 for sublocation in location_copy: 543 if location.complement: 544 sublocation_copy = copy.copy(sublocation) 545 sublocation_copy.reverse() 546 else: 547 sublocation_copy = sublocation 548 subsequence_list.append(record_subseq(record, sublocation_copy, *args, **keywds).tostring()) 549 return Seq(''.join(subsequence_list), record_sequence(record).alphabet) 550 else: 551 return record_coords(record, location.start(), location.end()+1, location.complement, *args, **keywds)
552
553 -def record_sequence(record):
554 """ 555 returns the sequence of a record 556 557 can be Bio.SeqRecord.SeqRecord or Bio.GenBank.Record.Record 558 """ 559 if isinstance(record, Bio.SeqRecord.SeqRecord): 560 return record.seq 561 elif isinstance(record, Bio.GenBank.Record.Record): 562 return Seq(record.sequence) 563 else: 564 raise TypeError('not Bio.SeqRecord.SeqRecord or Bio.GenBank.Record.Record')
565
566 -def record_coords(record, start, end, strand=0, upper=0):
567 """ 568 >>> from Bio.SeqRecord import SeqRecord 569 >>> record = SeqRecord(Seq("gagttttatcgcttccatga"), 570 ... "ref|NC_001422", 571 ... "Coliphage phiX174, complete genome", 572 ... "bases 1-11") 573 >>> record_coords(record, 0, 4) # zero-based 574 Seq('GAGT', Alphabet()) 575 >>> record_coords(record, 0, 4, "-") # zero-based 576 Seq('ACTC', Alphabet()) 577 >>> record_coords(record, 0, 4, "-", upper=1) # zero-based 578 Seq('ACTC', Alphabet()) 579 """ 580 581 subseq = record_sequence(record)[start:end] 582 subseq_str = subseq.tostring() 583 subseq_str = subseq_str.upper() 584 subseq = Seq(subseq_str, subseq.alphabet) 585 if strand == '-' or strand == 1: 586 return subseq.reverse_complement() 587 else: 588 return subseq
589
590 -def _test():
591 """Run the Bio.GFF.easy module's doctests (PRIVATE). 592 593 This will try and locate the unit tests directory, and run the doctests 594 from there in order that the relative paths used in the examples work. 595 """ 596 import doctest 597 import os 598 if os.path.isdir(os.path.join("..","..","Tests")): 599 print "Runing doctests..." 600 cur_dir = os.path.abspath(os.curdir) 601 os.chdir(os.path.join("..","..","Tests")) 602 doctest.testmod() 603 os.chdir(cur_dir) 604 del cur_dir 605 print "Done" 606 elif os.path.isdir(os.path.join("Tests")) : 607 print "Runing doctests..." 608 cur_dir = os.path.abspath(os.curdir) 609 os.chdir(os.path.join("Tests")) 610 doctest.testmod() 611 os.chdir(cur_dir) 612 del cur_dir 613 print "Done"
614 615 if __name__ == "__main__": 616 if __debug__: 617 _test() 618