Package Bio :: Module SeqFeature
[hide private]
[frames | no frames]

Source Code for Module Bio.SeqFeature

  1  """Represent a Sequence Feature holding info about a part of a sequence. 
  2   
  3  This is heavily modeled after the Biocorba SeqFeature objects, and 
  4  may be pretty biased towards GenBank stuff since I'm writing it 
  5  for the GenBank parser output... 
  6   
  7  What's here: 
  8   
  9  Base class to hold a Feature. 
 10  ---------------------------- 
 11  classes: 
 12  o SeqFeature 
 13   
 14  Hold information about a Reference. 
 15  ---------------------------------- 
 16   
 17  This is an attempt to create a General class to hold Reference type 
 18  information. 
 19   
 20  classes: 
 21  o Reference 
 22   
 23  Specify locations of a feature on a Sequence. 
 24  --------------------------------------------- 
 25   
 26  This aims to handle, in Ewan's words, 'the dreaded fuzziness issue' in 
 27  much the same way as Biocorba. This has the advantages of allowing us 
 28  to handle fuzzy stuff in case anyone needs it, and also be compatible 
 29  with Biocorba. 
 30   
 31  classes: 
 32  o FeatureLocation - Specify the start and end location of a feature. 
 33   
 34  o ExactPosition - Specify the position as being exact. 
 35  o WithinPosition - Specify a position occurring within some range. 
 36  o BetweenPosition - Specify a position occurring between a range (OBSOLETE?). 
 37  o BeforePosition - Specify the position as being found before some base. 
 38  o AfterPosition - Specify the position as being found after some base. 
 39  o OneOfPosition - Specify a position where the location can be multiple positions. 
 40  """ 
 41   
 42  from Bio.Seq import MutableSeq, reverse_complement 
 43   
class SeqFeature(object):
    """Represent a Sequence Feature on an object.

    Attributes:
    o location - the location of the feature on the sequence (FeatureLocation)
    o type - the specified type of the feature (ie. CDS, exon, repeat...)
    o location_operator - a string specifying how this SeqFeature may
    be related to others. For example, in the example GenBank feature
    shown below, the location_operator would be "join"
    o strand - A value specifying on which strand (of a DNA sequence, for
    instance) the feature deals with. 1 indicates the plus strand, -1
    indicates the minus strand, 0 indicates both strands, and None indicates
    that strand doesn't apply (ie. for proteins) or is not known.
    o id - A string identifier for the feature.
    o ref - A reference to another sequence. This could be an accession
    number for some different sequence.
    o ref_db - A different database for the reference accession number.
    o qualifiers - A dictionary of qualifiers on the feature. These are
    analogous to the qualifiers from a GenBank feature table. The keys of
    the dictionary are qualifier names, the values are the qualifier
    values.
    o sub_features - Additional SeqFeatures which fall under this 'parent'
    feature. For instance, if we have something like:

    CDS    join(1..10,30..40,50..60)

    Then the top level feature would be a CDS from 1 to 60, and the sub
    features would be of 'CDS_join' type and would be from 1 to 10, 30 to
    40 and 50 to 60, respectively.

    To get the nucleotide sequence for this CDS, you would need to take the
    parent sequence and do seq[0:10]+seq[29:40]+seq[49:60] (Python counting).
    Things are more complicated with strands and fuzzy positions. To save you
    dealing with all these special cases, the SeqFeature provides an extract
    method to do this for you.
    """

    def __init__(self, location=None, type='', location_operator='',
                 strand=None, id="<unknown id>",
                 qualifiers=None, sub_features=None,
                 ref=None, ref_db=None):
        """Initialize a SeqFeature on a Sequence.

        location must be a FeatureLocation (or None); the strand is given
        separately via the strand argument when required.

        e.g. With no strand, on the forward strand, and on the reverse strand:

        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
        >>> f1 = SeqFeature(FeatureLocation(5,10), type="domain")
        >>> f2 = SeqFeature(FeatureLocation(7,110), strand=1, type="CDS")
        >>> f3 = SeqFeature(FeatureLocation(9,108), strand=-1, type="CDS")

        An invalid strand will trigger an exception:

        >>> f4 = SeqFeature(FeatureLocation(50,60), strand=2)
        Traceback (most recent call last):
           ...
        ValueError: Strand should be +1, -1, 0 or None, not 2

        For exact start/end positions, an integer can be used (as shown above)
        as shorthand for the ExactPosition object. For non-exact locations, the
        FeatureLocation must be specified via the appropriate position objects.

        Raises ValueError for an invalid strand and TypeError when location
        is neither None nor a FeatureLocation.
        """
        if strand not in [-1, 0, 1, None]:
            raise ValueError("Strand should be +1, -1, 0 or None, not %s"
                             % repr(strand))
        # Compare against None explicitly: a truthiness test would let falsy
        # junk (0, "", []) slip through the type check unnoticed.
        if location is not None and not isinstance(location, FeatureLocation):
            raise TypeError("FeatureLocation (or None) required for the location")
        self.location = location

        self.type = type
        self.location_operator = location_operator
        self.strand = strand
        self.id = id
        # Fresh containers per instance; never share a mutable default.
        if qualifiers is None:
            qualifiers = {}
        self.qualifiers = qualifiers
        if sub_features is None:
            sub_features = []
        self.sub_features = sub_features
        self.ref = ref
        self.ref_db = ref_db

    def __repr__(self):
        """A string representation of the record for debugging."""
        answer = "%s(%s" % (self.__class__.__name__, repr(self.location))
        if self.type:
            answer += ", type=%s" % repr(self.type)
        if self.location_operator:
            answer += ", location_operator=%s" % repr(self.location_operator)
        if self.strand:
            answer += ", strand=%s" % repr(self.strand)
        if self.id and self.id != "<unknown id>":
            answer += ", id=%s" % repr(self.id)
        if self.ref:
            answer += ", ref=%s" % repr(self.ref)
        if self.ref_db:
            answer += ", ref_db=%s" % repr(self.ref_db)
        answer += ")"
        return answer

    def __str__(self):
        """A readable summary of the feature intended to be printed to screen.

        Qualifiers are listed in sorted key order, followed by the summary
        of each sub-feature (if any).
        """
        lines = ["type: %s\n" % self.type,
                 "location: %s\n" % self.location,
                 "ref: %s:%s\n" % (self.ref, self.ref_db),
                 "strand: %s\n" % self.strand,
                 "qualifiers: \n"]
        # sorted() works on plain lists and on dictionary views alike.
        for qual_key in sorted(self.qualifiers):
            lines.append(" Key: %s, Value: %s\n"
                         % (qual_key, self.qualifiers[qual_key]))
        if len(self.sub_features) != 0:
            lines.append("Sub-Features\n")
            for sub_feature in self.sub_features:
                lines.append("%s\n" % sub_feature)
        return "".join(lines)

    def _shift(self, offset):
        """Returns a copy of the feature with its location shifted (PRIVATE).

        The annotation qualifiers are copied (shallow copy of the dictionary)
        and the sub-features are shifted recursively.
        """
        return SeqFeature(
            location=self.location._shift(offset),
            type=self.type,
            location_operator=self.location_operator,
            strand=self.strand,
            id=self.id,
            qualifiers=dict(self.qualifiers),
            sub_features=[f._shift(offset) for f in self.sub_features],
            ref=self.ref,
            ref_db=self.ref_db)

    def extract(self, parent_sequence):
        """Extract feature sequence from the supplied parent sequence.

        The parent_sequence can be a Seq like object or a string, and will
        generally return an object of the same type. The exception to this is
        a MutableSeq as the parent sequence will return a Seq object.

        This should cope with complex locations including complements, joins
        and fuzzy positions. Even mixed strand features should work! This
        also covers features on protein sequences (e.g. domains), although
        here reverse strand features are not permitted.

        >>> from Bio.Seq import Seq
        >>> from Bio.Alphabet import generic_protein
        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
        >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL", generic_protein)
        >>> f = SeqFeature(FeatureLocation(8,15), type="domain")
        >>> f.extract(seq)
        Seq('VALIVIC', ProteinAlphabet())

        Note - currently only sub-features of type "join" are supported;
        any other location_operator on a feature with sub-features raises
        ValueError.
        """
        if isinstance(parent_sequence, MutableSeq):
            # This avoids complications with reverse complements
            # (the MutableSeq reverse complement acts in situ)
            parent_sequence = parent_sequence.toseq()
        if self.sub_features:
            if self.location_operator != "join":
                raise ValueError(self.location_operator)
            if self.strand == -1:
                # This is a special case given how the GenBank parser works.
                # Must avoid doing the reverse complement twice: take the raw
                # slices here and reverse complement the joined result below.
                parts = []
                for f_sub in self.sub_features:
                    assert f_sub.strand == -1
                    parts.append(
                        parent_sequence[f_sub.location.nofuzzy_start:
                                        f_sub.location.nofuzzy_end])
            else:
                # This copes with mixed strand features:
                parts = [f_sub.extract(parent_sequence)
                         for f_sub in self.sub_features]
            # We use addition rather than a join to avoid alphabet issues:
            f_seq = parts[0]
            for part in parts[1:]:
                f_seq += part
        else:
            f_seq = parent_sequence[self.location.nofuzzy_start:
                                    self.location.nofuzzy_end]
        if self.strand == -1:
            # TODO - MutableSeq?
            try:
                f_seq = f_seq.reverse_complement()
            except AttributeError:
                # Plain strings have no method; use the module level function.
                assert isinstance(f_seq, str)
                f_seq = reverse_complement(f_seq)
        return f_seq
238 239 240 # --- References 241 242 # TODO -- Will this hold PubMed and Medline information decently?
class Reference(object):
    """Represent a Generic Reference object.

    Attributes:
    o location - A list of Location objects specifying regions of
    the sequence that the references correspond to. If no locations are
    specified, the entire sequence is assumed.
    o authors - A big old string, or a list split by author, of authors
    for the reference.
    o consrtm - A string; only shown by str() when it is non-empty.
    o title - The title of the reference.
    o journal - Journal the reference was published in.
    o medline_id - A medline reference for the article.
    o pubmed_id - A pubmed reference for the article.
    o comment - A place to stick any comments about the reference.
    """

    def __init__(self):
        """Create an empty reference; callers fill in the attributes."""
        self.location = []
        self.authors = self.consrtm = ''
        self.title = self.journal = ''
        self.medline_id = self.pubmed_id = ''
        self.comment = ''

    def __str__(self):
        """Output an informative string for debugging.

        One "name: value" line per attribute, preceded by one line for
        every entry in the location list.
        """
        rows = ["location: %s\n" % entry for entry in self.location]
        rows.append("authors: %s\n" % self.authors)
        if self.consrtm:
            rows.append("consrtm: %s\n" % self.consrtm)
        rows.append("title: %s\n" % self.title)
        rows.append("journal: %s\n" % self.journal)
        rows.append("medline id: %s\n" % self.medline_id)
        rows.append("pubmed id: %s\n" % self.pubmed_id)
        rows.append("comment: %s\n" % self.comment)
        return "".join(rows)

    def __repr__(self):
        """Short debugging representation showing only the title."""
        # TODO - Update this if __init__ later accepts values
        class_name = self.__class__.__name__
        return "%s(title=%s, ...)" % (class_name, repr(self.title))
288 289 # --- Handling feature locations 290
class FeatureLocation(object):
    """Specify the location of a feature along a sequence.

    This attempts to deal with fuzziness of position ends, but also
    make it easy to get the start and end in the 'normal' case (no
    fuzziness).

    You should access the start and end attributes with
    your_location.start and your_location.end. These return the position
    objects held by the location (plain integers given to the constructor
    are wrapped in ExactPosition objects).

    Note that the start and end location numbering follow Python's scheme,
    thus a GenBank entry of 123..150 (one based counting) becomes a location
    of [122:150] (zero based counting).
    """

    def __init__(self, start, end):
        """Specify the start and end of a sequence feature.

        start and end arguments specify the values where the feature begins
        and ends. These can either be any of the *Position objects that
        inherit from AbstractPosition, or can just be integers specifying the
        position. In the case of integers, the values are assumed to be
        exact and are converted into ExactPosition arguments. This is meant
        to make it easy to deal with non-fuzzy ends.

        i.e. Short form:

        >>> from Bio.SeqFeature import FeatureLocation
        >>> loc = FeatureLocation(5,10)

        Explicit form:

        >>> from Bio.SeqFeature import FeatureLocation, ExactPosition
        >>> loc = FeatureLocation(ExactPosition(5),ExactPosition(10))

        Other fuzzy positions are used similarly,

        >>> from Bio.SeqFeature import FeatureLocation
        >>> from Bio.SeqFeature import BeforePosition, AfterPosition
        >>> loc2 = FeatureLocation(BeforePosition(5),AfterPosition(10))

        """
        # Anything that is not already a position object is treated as exact.
        if not isinstance(start, AbstractPosition):
            start = ExactPosition(start)
        self._start = start

        if not isinstance(end, AbstractPosition):
            end = ExactPosition(end)
        self._end = end

    def __str__(self):
        """Returns a representation of the location (with python counting).

        For the simple case this uses the python slicing syntax, [122:150]
        (zero based counting) which GenBank would call 123..150 (one based
        counting).
        """
        return "[%s:%s]" % (self._start, self._end)

    def __repr__(self):
        """A string representation of the location for debugging."""
        class_name = self.__class__.__name__
        return "%s(%s,%s)" % (class_name, repr(self.start), repr(self.end))

    def _shift(self, offset):
        """Returns a copy of the location shifted by the offset (PRIVATE)."""
        shifted_start = self._start._shift(offset)
        shifted_end = self._end._shift(offset)
        return FeatureLocation(start=shifted_start, end=shifted_end)

    @property
    def start(self):
        """Start location (possibly a fuzzy position, read only)."""
        return self._start

    @property
    def end(self):
        """End location (possibly a fuzzy position, read only)."""
        return self._end

    def _get_nofuzzy_start(self):
        # TODO - Do we still use the BetweenPosition class?
        begin = self._start
        if (begin == self._end) and isinstance(begin, BetweenPosition):
            return begin.position
        # Smallest coordinate the start could take (extension may be negative).
        return min(begin.position, begin.position + begin.extension)

    nofuzzy_start = property(
        fget=_get_nofuzzy_start,
        doc="""Start position (integer, approximated if fuzzy, read only).

        To get non-fuzzy attributes (ie. the position only) ask for
        'location.nofuzzy_start', 'location.nofuzzy_end'. These should return
        the largest range of the fuzzy position. So something like:
        (10.20)..(30.40) should return 10 for start, and 40 for end.
        """)

    def _get_nofuzzy_end(self):
        # TODO - Do we still use the BetweenPosition class?
        finish = self._end
        if (self._start == finish) and isinstance(self._start,
                                                  BetweenPosition):
            return finish.position
        # Largest coordinate the end could take.
        return max(finish.position, finish.position + finish.extension)

    nofuzzy_end = property(
        fget=_get_nofuzzy_end,
        doc="""End position (integer, approximated if fuzzy, read only).

        To get non-fuzzy attributes (ie. the position only) ask for
        'location.nofuzzy_start', 'location.nofuzzy_end'. These should return
        the largest range of the fuzzy position. So something like:
        (10.20)..(30.40) should return 10 for start, and 40 for end.
        """)
402
class AbstractPosition(object):
    """Abstract base class representing a position.

    Attributes:
    o position - The coordinate of the position.
    o extension - The range by which the position may extend (fuzziness).

    Positions compare by their position attribute only; see __cmp__.
    """

    def __init__(self, position, extension):
        self.position = position
        self.extension = extension

    def __repr__(self):
        """String representation of the location for debugging."""
        return "%s(%s,%s)" % (self.__class__.__name__,
                              repr(self.position), repr(self.extension))

    def __cmp__(self, other):
        """A simple comparison function for positions.

        This is very simple-minded and just compares the position attribute
        of the features; extensions are not considered at all. This could
        potentially be expanded to try to take advantage of extensions.

        Returns -1, 0 or 1. Comparing against anything that is not a
        position object fails the assertion below.
        """
        assert isinstance(other, AbstractPosition), \
            "We can only do comparisons between Biopython Position objects."
        # Spelled out rather than calling the cmp() builtin, which is not
        # available on every Python version.
        return ((self.position > other.position)
                - (self.position < other.position))

    # Rich comparisons delegate to __cmp__ so that ==, <, etc. give the same
    # answers on interpreters which never call __cmp__ themselves.
    def __eq__(self, other):
        return self.__cmp__(other) == 0

    def __ne__(self, other):
        return self.__cmp__(other) != 0

    def __lt__(self, other):
        return self.__cmp__(other) < 0

    def __le__(self, other):
        return self.__cmp__(other) <= 0

    def __gt__(self, other):
        return self.__cmp__(other) > 0

    def __ge__(self, other):
        return self.__cmp__(other) >= 0

    # Defining __eq__ would otherwise make instances unhashable on some
    # Python versions. Keep the identity based hash the class always had;
    # the objects are mutable, so hashing by value would be unsafe.
    __hash__ = object.__hash__

    def _shift(self, offset):
        """Returns a copy of the position moved by the offset (PRIVATE)."""
        # We want this to maintain the subclass when called from a subclass
        return self.__class__(self.position + offset, self.extension)
430
class ExactPosition(AbstractPosition):
    """Specify the specific position of a boundary.

    o position - The position of the boundary.
    o extension - An optional argument which must be zero since we don't
    have an extension. The argument is provided so that the same number of
    arguments can be passed to all position types.

    In this case, there is no fuzziness associated with the position.
    """

    def __init__(self, position, extension=0):
        """Store the position; a non-zero extension raises AttributeError."""
        if extension != 0:
            message = "Non-zero extension %s for exact position." % extension
            raise AttributeError(message)
        super(ExactPosition, self).__init__(position, 0)

    def __repr__(self):
        """String representation of the ExactPosition location for debugging."""
        assert self.extension == 0
        class_name = self.__class__.__name__
        return "%s(%s)" % (class_name, repr(self.position))

    def __str__(self):
        """The bare position, e.g. '5'."""
        return str(self.position)
454
class WithinPosition(AbstractPosition):
    """Specify the position of a boundary within some coordinates.

    Arguments:
    o position - The start position of the boundary
    o extension - The range to which the boundary can extend.

    This allows dealing with a position like ((1.4)..100). This
    indicates that the start of the sequence is somewhere between 1
    and 4. To represent that with this class we would set position as
    1 and extension as 3.
    """

    def __init__(self, position, extension=0):
        """Store the position and the range it may extend over."""
        super(WithinPosition, self).__init__(position, extension)

    def __str__(self):
        """GenBank style text, e.g. '(1.4)' for position 1, extension 3."""
        upper = self.position + self.extension
        return "(%s.%s)" % (self.position, upper)
472
class BetweenPosition(AbstractPosition):
    """Specify the position of a boundary between two coordinates (OBSOLETE?).

    Arguments:
    o position - The start position of the boundary.
    o extension - The range to the other position of a boundary.

    This specifies a coordinate which is found between the two positions.
    So this allows us to deal with a position like ((1^2)..100). To
    represent that with this class we set position as 1 and the
    extension as 1.
    """

    def __init__(self, position, extension=0):
        """Store the position and the distance to the other coordinate."""
        super(BetweenPosition, self).__init__(position, extension)

    def __str__(self):
        """GenBank style text, e.g. '(1^2)' for position 1, extension 1."""
        other = self.position + self.extension
        return "(%s^%s)" % (self.position, other)
490
class BeforePosition(AbstractPosition):
    """Specify a position where the actual location occurs before it.

    Arguments:
    o position - The upper boundary of where the location can occur.
    o extension - An optional argument which must be zero since we don't
    have an extension. The argument is provided so that the same number of
    arguments can be passed to all position types.

    This is used to specify positions like (<10..100) where the location
    occurs somewhere before position 10.
    """

    def __init__(self, position, extension=0):
        """Store the position; a non-zero extension raises AttributeError."""
        if extension != 0:
            # The message names this position type (it used to say "exact").
            raise AttributeError("Non-zero extension %s for before position."
                                 % extension)
        AbstractPosition.__init__(self, position, 0)

    def __repr__(self):
        """A string representation of the location for debugging."""
        assert self.extension == 0
        return "%s(%s)" % (self.__class__.__name__, repr(self.position))

    def __str__(self):
        """GenBank style text, e.g. '<10'."""
        return "<%s" % self.position
516
class AfterPosition(AbstractPosition):
    """Specify a position where the actual location is found after it.

    Arguments:
    o position - The lower boundary of where the location can occur.
    o extension - An optional argument which must be zero since we don't
    have an extension. The argument is provided so that the same number of
    arguments can be passed to all position types.

    This is used to specify positions like (>10..100) where the location
    occurs somewhere after position 10.
    """

    def __init__(self, position, extension=0):
        """Store the position; a non-zero extension raises AttributeError."""
        if extension != 0:
            # The message names this position type (it used to say "exact").
            raise AttributeError("Non-zero extension %s for after position."
                                 % extension)
        AbstractPosition.__init__(self, position, 0)

    def __repr__(self):
        """A string representation of the location for debugging."""
        assert self.extension == 0
        return "%s(%s)" % (self.__class__.__name__, repr(self.position))

    def __str__(self):
        """GenBank style text, e.g. '>10'."""
        return ">%s" % self.position
542
class OneOfPosition(AbstractPosition):
    """Specify a position where the location can be multiple positions.

    This models the GenBank 'one-of(1888,1901)' function, and tries
    to make this fit within the Biopython Position models. In our case
    the position of the "one-of" is set as the lowest choice, and the
    extension is the range to the highest choice.
    """

    def __init__(self, position_list):
        """Initialize with a set of possible positions.

        position_list is a list of AbstractPosition derived objects,
        specifying possible locations. It must contain at least one
        position; an empty list raises ValueError.
        """
        # unique attribute for this type of positions
        self.position_choices = position_list
        # Single pass: validate each choice and collect its coordinate.
        coordinates = []
        for position_choice in self.position_choices:
            assert isinstance(position_choice, AbstractPosition), \
                "Expected position objects, got %r" % position_choice
            coordinates.append(position_choice.position)
        if not coordinates:
            # Without any choice there is no smallest/largest position.
            raise ValueError("OneOfPosition requires at least one position")
        smallest = min(coordinates)
        largest = max(coordinates)
        # initialize with our definition of position and extension
        AbstractPosition.__init__(self, smallest, largest - smallest)

    def __repr__(self):
        """String representation of the OneOfPosition location for debugging."""
        return "%s(%s)" % (self.__class__.__name__,
                           repr(self.position_choices))

    def __str__(self):
        """GenBank style text, e.g. 'one-of(1888,1901)'."""
        choices = ",".join([str(position)
                            for position in self.position_choices])
        return "one-of(%s)" % choices

    def _shift(self, offset):
        """Returns a copy with every choice moved by the offset (PRIVATE)."""
        return self.__class__([position_choice._shift(offset)
                               for position_choice in self.position_choices])
591
class PositionGap(object):
    """Simple class to hold information about a gap between positions.

    Attributes:
    o gap_size - The value given to the constructor, stored unchanged.
    """

    def __init__(self, gap_size):
        """Initialize with the gap information to hold."""
        self.gap_size = gap_size

    def __repr__(self):
        """A string representation of the position gap for debugging."""
        class_name = self.__class__.__name__
        return "%s(%s)" % (class_name, repr(self.gap_size))

    def __str__(self):
        """Readable text form, e.g. 'gap(100)'."""
        return "gap(%s)" % self.gap_size
607
def _test():
    """Run the Bio.SeqFeature module's doctests.

    Progress messages are written to standard output around the run.
    """
    import doctest
    # Single-argument print(...) behaves the same whether print is a
    # statement or a function.
    print("Running doctests...")
    doctest.testmod()
    print("Done")


# Run the doctests when the module is executed as a script.
if __name__ == "__main__":
    _test()