Package Bio :: Package Phylo :: Module PhyloXML
[hide private]
[frames] | no frames]

Source Code for Module Bio.Phylo.PhyloXML

   1  # Copyright (C) 2009 by Eric Talevich (eric.talevich@gmail.com) 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license. Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5   
   6  """Classes corresponding to phyloXML elements. 
   7   
   8  See U{ http://phyloxml.org/ } for the official specification. 
   9   
  10  See also Han and Zmasek (2009) doi:10.1186/1471-2105-10-356 
  11  """ 
  12  __docformat__ = "epytext en" 
  13   
  14  import re 
  15  import warnings 
  16   
  17  from Bio import Alphabet 
  18  from Bio.Align import MultipleSeqAlignment 
  19  from Bio.Seq import Seq 
  20  from Bio.SeqFeature import SeqFeature, FeatureLocation 
  21  from Bio.SeqRecord import SeqRecord 
  22   
  23  import BaseTree 
  24  import _sugar 
class PhyloXMLWarning(Warning):
    """Warning category raised when data violates the phyloXML specification."""
30
31 32 -def _check_str(text, testfunc):
33 """Check a string using testfunc, and warn if there's no match.""" 34 if text is not None and not testfunc(text): 35 warnings.warn("String %s doesn't match the given regexp" % text, 36 PhyloXMLWarning, stacklevel=2)
37
# Core elements

class PhyloElement(BaseTree.TreeElement):
    """Common ancestor of every class in this module that models phyloXML."""
43
class Phyloxml(PhyloElement):
    """Top-level container mirroring the root of a phyloXML document.

    Holds any number of Phylogeny objects, optionally followed by elements
    that belong to other XML namespaces.

    @param attributes: (XML namespace definitions)
    @param phylogenies: list of phylogenetic trees
    @param other: list of arbitrary non-phyloXML elements, if any
    """
    def __init__(self, attributes, phylogenies=None, other=None):
        self.attributes = attributes
        # Falsy arguments (None or an empty list) become a fresh empty list
        self.other = other or []
        self.phylogenies = phylogenies or []

    def __getitem__(self, index):
        """Look up a phylogeny by position, by slice, or by its name.

        A name lookup returns the first tree whose 'name' equals the key.
        KeyError is raised for an unknown name and for unsupported key types.
        """
        if isinstance(index, (int, slice)):
            return self.phylogenies[index]
        if not isinstance(index, basestring):
            raise KeyError("can't use %s as an index" % type(index))
        for tree in self.phylogenies:
            if tree.name == index:
                return tree
        raise KeyError("no phylogeny found with name " + repr(index))

    def __iter__(self):
        """Return an iterator over the contained phylogenetic trees."""
        return iter(self.phylogenies)

    def __len__(self):
        """Return how many phylogenetic trees this object contains."""
        return len(self.phylogenies)

    def __str__(self):
        """Show the class name wrapping one line per contained tree."""
        listing = ',\n'.join([str(tree) for tree in self.phylogenies])
        return '%s([%s])' % (self.__class__.__name__, listing)
83
class Other(PhyloElement):
    """Generic holder for an XML element that is not part of phyloXML.

    Typically either 'value' or a non-empty 'children' list is populated, not
    both; this class does not enforce that.

    @param tag: local tag for the XML node
    @param namespace: XML namespace for the node -- should not be the default
        phyloXML namespace.
    @param attributes: string attributes on the XML node
    @param value: text contained directly within this XML node
    @param children: list of child nodes, if any (also Other instances)
    """
    def __init__(self, tag, namespace=None, attributes=None, value=None,
            children=None):
        self.tag, self.namespace = tag, namespace
        self.attributes, self.value = attributes, value
        # A missing or empty 'children' argument becomes a new empty list
        self.children = children or []

    def __iter__(self):
        """Return an iterator over the child nodes (possibly none)."""
        return iter(self.children)
109
class Phylogeny(PhyloElement, BaseTree.Tree):
    """A phylogenetic tree.

    @param root: the root node/clade of this tree
    @param rooted: True if this tree is rooted
    @param rerootable: True if this tree is rerootable
    @param branch_length_unit: unit for branch_length values on clades
    @type type: str

    @param name: string identifier for this tree, not required to be unique
    @param id: unique identifier for this tree (type Id)
    @param description: plain-text description
    @param date: date for the root node of this tree (type Date)
    @param confidences: list of Confidence objects for this tree
    @param clade_relations: list of CladeRelation objects
    @param sequence_relations: list of SequenceRelation objects
    @param properties: list of Property objects
    @param other: list of non-phyloXML elements (type Other)
    """
    def __init__(self, root=None, rooted=True,
            rerootable=None, branch_length_unit=None, type=None,
            # Child nodes
            name=None, id=None, description=None, date=None,
            # Collections
            confidences=None, clade_relations=None, sequence_relations=None,
            properties=None, other=None,
            ):
        # NOTE: an assert, so this check disappears under "python -O"
        assert isinstance(rooted, bool)
        self.root = root
        self.rooted = rooted
        self.rerootable = rerootable
        self.branch_length_unit = branch_length_unit
        self.type = type
        self.name = name
        self.id = id
        self.description = description
        self.date = date
        # Collections: None (or an empty list) becomes a fresh empty list
        self.confidences = confidences or []
        self.clade_relations = clade_relations or []
        self.sequence_relations = sequence_relations or []
        self.properties = properties or []
        self.other = other or []

    @classmethod
    def from_tree(cls, tree, **kwargs):
        """Create a new Phylogeny from a tree with root/rooted/name/id.

        The tree's root is converted with Clade.from_subtree and a non-None
        id is wrapped in an Id built from its string form. Extra keyword
        arguments are written straight into the new instance's __dict__.
        """
        # Explicit branch instead of the "x and y or None" idiom, which
        # silently falls through to None whenever the middle operand is falsy
        tree_id = None
        if tree.id is not None:
            tree_id = Id(str(tree.id))
        phy = cls(
                root=Clade.from_subtree(tree.root),
                rooted=tree.rooted,
                name=tree.name,
                id=tree_id)
        phy.__dict__.update(kwargs)
        return phy

    @classmethod
    def from_subtree(cls, subtree, **kwargs):
        """Create a new Phylogeny rooted at a copy of the given subtree."""
        return Clade.from_subtree(subtree).to_phylogeny(**kwargs)

    def to_phyloxml(self, **kwargs):
        """Create a new PhyloXML object containing just this phylogeny."""
        return Phyloxml(kwargs, phylogenies=[self])

    def to_alignment(self):
        """Construct an alignment from the aligned sequences in this tree.

        Sequence elements are collected in preorder; those lacking a mol_seq
        or not flagged as aligned are ignored. An empty MultipleSeqAlignment
        is returned when nothing qualifies.
        """
        def is_aligned_seq(elem):
            # mol_seq defaults to None on Sequence, so guard before
            # dereferencing it
            return bool(isinstance(elem, Sequence)
                        and elem.mol_seq is not None
                        and elem.mol_seq.is_aligned)

        seqs = self._filter_search(is_aligned_seq, 'preorder', True)
        try:
            first_seq = seqs.next()
        except StopIteration:
            # No aligned sequences were found --> empty MSA
            return MultipleSeqAlignment([])
        # The first hit supplies the alphabet for the whole alignment
        msa = MultipleSeqAlignment([first_seq.to_seqrecord()],
                                   first_seq.get_alphabet())
        msa.extend(seq.to_seqrecord() for seq in seqs)
        return msa

    # Singular property for plural attribute
    def _get_confidence(self):
        """Equivalent to self.confidences[0] if there is only 1 value.

        Returns None when there are no confidences and raises AttributeError
        when there is more than one.

        See also: Clade.confidence, Clade.taxonomy
        """
        if len(self.confidences) == 0:
            return None
        if len(self.confidences) > 1:
            raise AttributeError("more than 1 confidence value available; "
                                 "use Phylogeny.confidences")
        return self.confidences[0]

    def _set_confidence(self, value):
        """Set the single confidence value, or clear it with None.

        Numbers are wrapped in a Confidence. None is accepted because the
        getter reports "no confidence" as None, so assigning one object's
        'confidence' to another round-trips. ValueError is raised for other
        types, and whenever several confidences already exist.
        """
        if value is not None:
            if isinstance(value, (float, int)):
                value = Confidence(value)
            elif not isinstance(value, Confidence):
                raise ValueError(
                        "value must be a number or Confidence instance")
        if len(self.confidences) > 1:
            raise ValueError("multiple confidence values already exist; "
                             "use Phylogeny.confidences instead")
        if value is None:
            # Clear in place so existing references to the list stay valid
            del self.confidences[:]
        elif self.confidences:
            self.confidences[0] = value
        else:
            self.confidences.append(value)

    confidence = property(_get_confidence, _set_confidence)
class Clade(PhyloElement, BaseTree.Clade):
    """Describes a branch of the current phylogenetic tree.

    Used recursively, describes the topology of a phylogenetic tree.

    Both 'color' and 'width' elements should be interpreted by client code as
    applying to the whole clade, including all descendants, unless overwritten
    in sub-clades. This module doesn't automatically assign these attributes
    to sub-clades to achieve this cascade -- and neither should you.

    @param branch_length: parent branch length of this clade
    @param id_source: link other elements to a clade (on the xml-level)

    @param name: short string label for this clade
    @param confidences: list of Confidence objects, used to indicate the
        support for a clade/parent branch.
    @param width: branch width for this clade (including branch from parent)
    @param color: color used for graphical display of this clade
    @param node_id: unique identifier for the root node of this clade
    @param taxonomies: list of Taxonomy objects
    @param sequences: list of Sequence objects
    @param events: describe such events as gene-duplications at the root
        node/parent branch of this clade
    @param binary_characters: a BinaryCharacters object
    @param distributions: list of Distribution objects
    @param date: a date for the root node of this clade (type Date)
    @param references: list of Reference objects
    @param properties: list of Property objects
    @param clades: list of sub-clades (type Clade)
    @param other: list of non-phyloXML objects
    """
    def __init__(self,
            # Attributes
            branch_length=None, id_source=None,
            # Child nodes
            name=None, width=None, color=None, node_id=None, events=None,
            binary_characters=None, date=None,
            # Collections
            confidences=None, taxonomies=None, sequences=None,
            distributions=None, references=None, properties=None, clades=None,
            other=None,
            ):
        self.branch_length = branch_length
        self.id_source = id_source
        self.name = name
        self.width = width
        # Goes through the 'color' property, so names, hex strings and RGB
        # triplets are converted to BranchColor here
        self.color = color
        self.node_id = node_id
        self.events = events
        self.binary_characters = binary_characters
        self.date = date
        # Collections: None (or an empty list) becomes a fresh empty list
        self.confidences = confidences or []
        self.taxonomies = taxonomies or []
        self.sequences = sequences or []
        self.distributions = distributions or []
        self.references = references or []
        self.properties = properties or []
        self.clades = clades or []
        self.other = other or []

    @classmethod
    def from_subtree(cls, subtree, **kwargs):
        """Create a new Clade from a BaseTree.Clade object.

        Only branch_length, name and (recursively) the child clades are
        copied. Extra keyword arguments are written straight into the new
        instance's __dict__.
        """
        clade = cls(branch_length=subtree.branch_length,
                    name=subtree.name)
        clade.clades = [cls.from_subtree(st) for st in subtree.clades]
        clade.__dict__.update(kwargs)
        return clade

    def to_phylogeny(self, **kwargs):
        """Create a new phylogeny containing just this clade.

        The clade's date becomes the phylogeny's date; keyword arguments are
        written straight into the new Phylogeny's __dict__.
        """
        phy = Phylogeny(root=self, date=self.date)
        phy.__dict__.update(kwargs)
        return phy

    # Shortcuts for list attributes that are usually only 1 item
    def _get_confidence(self):
        """Return the only confidence, or None if there is none.

        Raises AttributeError when more than one confidence exists.
        """
        if len(self.confidences) == 0:
            return None
        if len(self.confidences) > 1:
            raise AttributeError("more than 1 confidence value available; "
                                 "use Clade.confidences")
        return self.confidences[0]

    def _set_confidence(self, value):
        """Set the single confidence value, or clear it with None.

        Numbers are wrapped in a Confidence. None is accepted because the
        getter reports "no confidence" as None. ValueError is raised for
        other types, and whenever several confidences already exist.
        """
        if value is not None:
            if isinstance(value, (float, int)):
                value = Confidence(value)
            elif not isinstance(value, Confidence):
                raise ValueError(
                        "value must be a number or Confidence instance")
        if len(self.confidences) > 1:
            raise ValueError("multiple confidence values already exist; "
                             "use Clade.confidences instead")
        if value is None:
            # Clear in place so existing references to the list stay valid
            del self.confidences[:]
        elif self.confidences:
            self.confidences[0] = value
        else:
            self.confidences.append(value)

    confidence = property(_get_confidence, _set_confidence)

    def _get_taxonomy(self):
        """Return the only taxonomy, or None if there is none.

        Raises AttributeError when more than one taxonomy exists.
        """
        if len(self.taxonomies) == 0:
            return None
        if len(self.taxonomies) > 1:
            raise AttributeError("more than 1 taxonomy value available; "
                                 "use Clade.taxonomies")
        return self.taxonomies[0]

    def _set_taxonomy(self, value):
        """Set the single taxonomy, or clear it with None.

        None is accepted because the getter reports "no taxonomy" as None.
        ValueError is raised for any other non-Taxonomy value, and whenever
        several taxonomies already exist.
        """
        if value is not None and not isinstance(value, Taxonomy):
            raise ValueError("assigned value must be a Taxonomy instance")
        if len(self.taxonomies) > 1:
            raise ValueError("multiple taxonomy values already exist; "
                             "use Clade.taxonomies instead")
        if value is None:
            del self.taxonomies[:]
        elif self.taxonomies:
            self.taxonomies[0] = value
        else:
            self.taxonomies.append(value)

    taxonomy = property(_get_taxonomy, _set_taxonomy)

    # Syntax sugar for setting the branch color
    def _get_color(self):
        """Return the stored BranchColor (or None)."""
        return self._color

    def _set_color(self, arg):
        """Store a branch color given in any of the supported forms.

        Accepted: None, a BranchColor, a name listed in
        BranchColor.color_names, a 7-character '#rrggbb' string, or an
        iterable of exactly three values passed to BranchColor(). Anything
        else raises ValueError.
        """
        if arg is None or isinstance(arg, BranchColor):
            self._color = arg
        elif isinstance(arg, basestring):
            if arg in BranchColor.color_names:
                # Known color name
                self._color = BranchColor.from_name(arg)
            elif arg.startswith('#') and len(arg) == 7:
                # HTML-style hex string
                self._color = BranchColor.from_hex(arg)
            else:
                raise ValueError("invalid color string %s" % arg)
        elif hasattr(arg, '__iter__') and len(arg) == 3:
            # RGB triplet
            self._color = BranchColor(*arg)
        else:
            # Wrap in a 1-tuple: '%' would otherwise unpack a tuple argument
            # of the wrong length and raise TypeError instead of ValueError
            raise ValueError("invalid color value %s" % (arg,))

    color = property(_get_color, _set_color, doc="Branch color.")
361
# PhyloXML-specific complex types

class Accession(PhyloElement):
    """The local part of a sequence identifier, plus where it comes from.

    Example: for 'UniProtKB:P17304' the 'value' attribute is 'P17304' and the
    'source' attribute is 'UniProtKB'.
    """
    def __init__(self, value, source):
        self.source = source
        self.value = value

    def __str__(self):
        """Render the accession as 'source:value'."""
        source, value = self.source, self.value
        return '%s:%s' % (source, value)
378
class Annotation(PhyloElement):
    """An annotation attached to a molecular sequence.

    Using the optional 'ref' attribute is the recommended way to annotate
    (examples of acceptable values: 'GO:0008270',
    'KEGG:Tetrachloroethene degradation', 'EC:1.1.1.1').

    @type ref: str
    @param source: plain-text source for this annotation
    @param evidence: describe evidence as free text (e.g. 'experimental')
    @type type: str

    @param desc: free text description
    @param confidence: state the type and value of support (type Confidence)
    @param properties: list of typed and referenced annotations from external
        resources
    @type uri: Uri
    """
    # Expected shape of 'ref': "<prefix>:<identifier>"
    re_ref = re.compile(r'[a-zA-Z0-9_]+:[a-zA-Z0-9_\.\-\s]+')

    def __init__(self,
            # Attributes
            ref=None, source=None, evidence=None, type=None,
            # Child nodes
            desc=None, confidence=None, uri=None,
            # Collection
            properties=None):
        # Only warns (PhyloXMLWarning) on a malformed ref; never rejects it
        _check_str(ref, self.re_ref.match)
        self.ref, self.source = ref, source
        self.evidence, self.type = evidence, type
        self.desc, self.confidence, self.uri = desc, confidence, uri
        self.properties = properties or []
416
class BinaryCharacters(PhyloElement):
    """Names and/or counts of binary characters at the root of a clade.

    Covers the characters that are present, absent, gained and lost. The four
    name collections default to empty lists; the counts default to None.
    """
    def __init__(self,
            # Attributes
            type=None, gained_count=None, lost_count=None, present_count=None,
            absent_count=None,
            # Child nodes (flattened into collections)
            gained=None, lost=None, present=None, absent=None):
        self.type = type
        # Counts
        self.gained_count = gained_count
        self.lost_count = lost_count
        self.present_count = present_count
        self.absent_count = absent_count
        # Name lists: None (or empty) becomes a fresh empty list
        self.gained = gained or []
        self.lost = lost or []
        self.present = present or []
        self.absent = absent or []
437
class BranchColor(PhyloElement):
    """Indicates the color of a clade when rendered graphically.

    The color should be interpreted by client code (e.g. visualization
    programs) as applying to the whole clade, unless overwritten by the
    color(s) of sub-clades.

    Color values must be integers from 0 to 255.
    """

    # Maps each recognized color name to its (red, green, blue) triplet
    color_names = {
            'red':      (255,   0,   0),
            'r':        (255,   0,   0),
            'yellow':   (255, 255,   0),
            'y':        (255, 255,   0),
            'green':    (  0, 128,   0),
            'g':        (  0, 128,   0),
            'cyan':     (  0, 255, 255),
            'c':        (  0, 255, 255),
            'blue':     (  0,   0, 255),
            'b':        (  0,   0, 255),
            'magenta':  (255,   0, 255),
            'm':        (255,   0, 255),
            'black':    (  0,   0,   0),
            'k':        (  0,   0,   0),
            'white':    (255, 255, 255),
            'w':        (255, 255, 255),
            # Names standardized in HTML/CSS spec
            # http://w3schools.com/html/html_colornames.asp
            'maroon':   (128,   0,   0),
            'olive':    (128, 128,   0),
            'lime':     (  0, 255,   0),
            'aqua':     (  0, 255, 255),
            'teal':     (  0, 128, 128),
            'navy':     (  0,   0, 128),
            'fuchsia':  (255,   0, 255),
            'purple':   (128,   0, 128),
            'silver':   (192, 192, 192),
            'gray':     (128, 128, 128),
            # More definitions from matplotlib/gcolor2
            'grey':     (128, 128, 128),
            'pink':     (255, 192, 203),
            'salmon':   (250, 128, 114),
            'orange':   (255, 165,   0),
            'gold':     (255, 215,   0),
            'tan':      (210, 180, 140),
            'brown':    (165,  42,  42),
            }

    def __init__(self, red, green, blue):
        # NOTE: validation is done with assert, so it is skipped entirely
        # under "python -O"; failures surface as AssertionError
        for color in (red, green, blue):
            assert (isinstance(color, int) and
                    0 <= color <= 255
                    ), "Color values must be integers between 0 and 255."
        self.red = red
        self.green = green
        self.blue = blue

    @classmethod
    def from_hex(cls, hexstr):
        """Construct a BranchColor object from a hexadecimal string.

        The string format is the same style used in HTML and CSS, such as
        '#FF8000' for an RGB value of (255, 128, 0).
        """
        assert (isinstance(hexstr, basestring) and
                hexstr.startswith('#') and
                len(hexstr) == 7
                ), "need a 24-bit hexadecimal string, e.g. #000000"

        def unpack(cc):
            # Two hex digits -> integer channel value
            return int('0x' + cc, base=16)

        RGB = hexstr[1:3], hexstr[3:5], hexstr[5:]
        return cls(*map(unpack, RGB))

    @classmethod
    def from_name(cls, colorname):
        """Construct a BranchColor object by the color's name.

        Raises KeyError if the name is not a key of 'color_names'.
        """
        return cls(*cls.color_names[colorname])

    def to_hex(self):
        """Return a 24-bit hexadecimal RGB representation of this color.

        The returned string is suitable for use in HTML/CSS, as a color
        parameter in matplotlib, and perhaps other situations.

        Example:

            >>> bc = BranchColor(12, 200, 100)
            >>> bc.to_hex()
            '#0cc864'
        """
        # Two zero-padded lowercase hex digits per channel
        return '#%02x%02x%02x' % (self.red, self.green, self.blue)

    def to_rgb(self):
        """Return a tuple of RGB values (0 to 255) representing this color.

        Example:

            >>> bc = BranchColor(255, 165, 0)
            >>> bc.to_rgb()
            (255, 165, 0)
        """
        return (self.red, self.green, self.blue)

    def __repr__(self):
        """Preserve the standard RGB order when representing this object."""
        # Plain str result: the previous trailing .encode('utf-8') did nothing
        # for this ASCII text on Python 2 and yields bytes (which repr()
        # rejects) on Python 3.
        return ('%s(red=%d, green=%d, blue=%d)'
                % (self.__class__.__name__, self.red, self.green, self.blue))

    def __str__(self):
        """Show the color's RGB values."""
        return "(%d, %d, %d)" % (self.red, self.green, self.blue)
555
class CladeRelation(PhyloElement):
    """A typed relationship between two clades.

    One possible use is describing multiple parents of a clade.

    @type id_ref_0: str
    @type id_ref_1: str
    @type distance: str
    @type type: str

    @type confidence: Confidence
    """
    def __init__(self, type, id_ref_0, id_ref_1,
            distance=None, confidence=None):
        # Required
        self.type = type
        self.id_ref_0, self.id_ref_1 = id_ref_0, id_ref_1
        # Optional
        self.distance, self.confidence = distance, confidence
576
class Confidence(PhyloElement):
    """General-purpose confidence value with a label for its kind.

    A typical use is the bootstrap support of a clade, in which case the
    'type' attribute is 'bootstrap'.

    @type value: float
    @type type: str
    """
    def __init__(self, value, type='unknown'):
        self.type = type
        self.value = value

    def __float__(self):
        """Return the stored value converted with float()."""
        number = self.value
        return float(number)

    def __int__(self):
        """Return the stored value converted with int()."""
        number = self.value
        return int(number)
596
class Date(PhyloElement):
    """A date associated with a clade/node.

    The date may be numeric (the 'value' element), free text (the 'desc'
    element, e.g. 'Silurian'), or both. When a numeric value is given, setting
    the 'unit' attribute as well is recommended.

    @param unit: type of numerical value (e.g. 'mya' for 'million years ago')

    @type value: float
    @param desc: plain-text description of the date
    @param minimum: lower bound on the date value
    @param maximum: upper bound on the date value
    """
    def __init__(self, value=None, unit=None, desc=None,
            minimum=None, maximum=None):
        self.value, self.unit, self.desc = value, unit, desc
        self.minimum, self.maximum = minimum, maximum

    def __str__(self):
        """Return a human-readable date, falling back to the class name.

        Preference order: "<value> <unit>" when a unit is set and a value is
        present, then the free-text description, then the class name.
        """
        has_number = self.unit and self.value is not None
        if has_number:
            return '%s %s' % (self.value, self.unit)
        if self.desc is None:
            return self.__class__.__name__
        return self.desc
627
class Distribution(PhyloElement):
    """Geographic distribution of the items of a clade (species, sequences).

    Meant for phylogeographic applications.

    A location may be given as free text in the 'desc' element and/or as the
    coordinates of one or more 'Points' (similar to the 'Point' element in
    Google's KML format), or as 'Polygons'.
    """
    def __init__(self, desc=None, points=None, polygons=None):
        self.desc = desc
        # None (or empty) collections are replaced by fresh empty lists
        self.polygons = polygons or []
        self.points = points or []
642
class DomainArchitecture(PhyloElement):
    """Domain architecture of a protein.

    @param length: total length of the protein sequence (type int)
    @param domains: list of ProteinDomain objects
    """
    def __init__(self, length=None, domains=None):
        self.length = length
        # Normalize a missing list to [] like every other collection in this
        # module, so code that iterates 'domains' (e.g.
        # Sequence.to_seqrecord) does not fail on None
        self.domains = domains or []
653
class Events(PhyloElement):
    """Events at the root node of a clade (e.g. one gene duplication).

    Every attribute starts out as None. The object also offers a dict-like
    interface in which an attribute holding None counts as a missing key, and
    deleting a key resets the attribute to None.
    """
    # Values of 'type' that do not trigger a PhyloXMLWarning
    ok_type = set(('transfer', 'fusion', 'speciation_or_duplication', 'other',
                   'mixed', 'unassigned'))

    def __init__(self, type=None, duplications=None, speciations=None,
            losses=None, confidence=None):
        # Only warns on an unrecognized type; the value is stored regardless
        _check_str(type, self.ok_type.__contains__)
        self.type = type
        self.duplications = duplications
        self.speciations = speciations
        self.losses = losses
        self.confidence = confidence

    def iteritems(self):
        """Yield (name, value) pairs for the attributes that are not None."""
        return ((name, val) for name, val in self.__dict__.iteritems()
                if val is not None)

    def iterkeys(self):
        """Yield the names of the attributes that are not None."""
        return (name for name, val in self.__dict__.iteritems()
                if val is not None)

    def itervalues(self):
        """Yield the attribute values that are not None."""
        return (val for val in self.__dict__.itervalues() if val is not None)

    def items(self):
        """Return iteritems() as a list."""
        return list(self.iteritems())

    def keys(self):
        """Return iterkeys() as a list."""
        return list(self.iterkeys())

    def values(self):
        """Return itervalues() as a list."""
        return list(self.itervalues())

    def __len__(self):
        """Return the number of attributes that are not None."""
        present = self.values()
        return len(present)

    def __getitem__(self, key):
        """Return the attribute called 'key'; KeyError if absent or None."""
        if not hasattr(self, key):
            raise KeyError(key)
        found = getattr(self, key)
        if found is not None:
            return found
        raise KeyError("%s has not been set in this object" % repr(key))

    def __setitem__(self, key, val):
        """Assign 'val' to the attribute called 'key'."""
        setattr(self, key, val)

    def __delitem__(self, key):
        """Reset the attribute called 'key' to None."""
        setattr(self, key, None)

    def __iter__(self):
        """Iterate over the names of the attributes that are not None."""
        return iter(self.iterkeys())

    def __contains__(self, key):
        """Return True if the attribute 'key' exists and is not None."""
        if not hasattr(self, key):
            return False
        return getattr(self, key) is not None
714
class Id(PhyloElement):
    """General-purpose identifier element.

    Lets the provider (or authority) of an identifier, e.g. NCBI, be recorded
    together with the identifier value itself.
    """
    def __init__(self, value, provider=None):
        self.provider = provider
        self.value = value

    def __str__(self):
        """Return 'provider:value', or the bare value without a provider."""
        if self.provider is None:
            return self.value
        return '%s:%s' % (self.provider, self.value)
730
class MolSeq(PhyloElement):
    """Holds the residues of a molecular sequence.

    @param value: the sequence, as a string
    @param is_aligned: True if mol_seq is aligned (usu. meaning gaps are
        introduced and all aligned seqs are the same length)
    """
    # Characters expected in a sequence string.
    # NOTE(review): used with .match(), which anchors only at the start, so a
    # string passes as soon as its first character is valid -- confirm whether
    # a whole-string check was intended.
    re_value = re.compile(r'[a-zA-Z\.\-\?\*_]+')

    def __init__(self, value, is_aligned=None):
        # Only warns (PhyloXMLWarning) on a mismatch; the value is kept as-is
        _check_str(value, self.re_value.match)
        self.is_aligned = is_aligned
        self.value = value

    def __str__(self):
        """Return the raw sequence string."""
        residues = self.value
        return residues
748
class Point(PhyloElement):
    """Geographic coordinates of a single point, optionally with altitude.

    Used by element 'Distribution'.

    @param geodetic_datum: indicate the geodetic datum (also called 'map
        datum'). For example, Google's KML uses 'WGS84'. (required)
    @param lat: latitude
    @param long: longitude
    @param alt: altitude
    @param alt_unit: unit for the altitude (e.g. 'meter')
    """
    def __init__(self, geodetic_datum, lat, long, alt=None, alt_unit=None):
        self.geodetic_datum = geodetic_datum
        # Horizontal position
        self.lat, self.long = lat, long
        # Optional vertical position and its unit
        self.alt, self.alt_unit = alt, alt_unit
768
class Polygon(PhyloElement):
    """A polygon given as a list of 'Points' (used by element 'Distribution').

    @param points: list of 3 or more points representing vertices.
    """
    def __init__(self, points=None):
        # None (or an empty list) becomes a fresh empty list
        self.points = points or []

    def __str__(self):
        """Show the class name wrapping one line per vertex."""
        vertices = ',\n'.join([str(point) for point in self.points])
        return '%s([%s])' % (self.__class__.__name__, vertices)
781
class Property(PhyloElement):
    """A typed and referenced property from an external resource.

    Can be attached to 'Phylogeny', 'Clade', and 'Annotation' objects.

    @param ref: reference to an external resource, e.g. "NOAA:depth"

    @param unit: the unit of the property, e.g. "METRIC:m" (optional)

    @param datatype: indicates the type of a property and is limited to
        xsd-datatypes (e.g. 'xsd:string', 'xsd:boolean', 'xsd:integer',
        'xsd:decimal', 'xsd:float', 'xsd:double', 'xsd:date', 'xsd:anyURI').

    @param applies_to: indicates the item to which a property applies (e.g.
        'node' for the parent node of a clade, 'parent_branch' for the parent
        branch of a clade, or just 'clade').

    @param id_ref: allows attaching a property specifically to one element
        (on the xml-level). (optional)

    @type value: str
    """
    # Expected shape of 'ref' and 'unit': "<prefix>:<identifier>"
    re_ref = re.compile(r'[a-zA-Z0-9_]+:[a-zA-Z0-9_\.\-\s]+')
    # Values of 'applies_to' that do not trigger a warning
    ok_applies_to = set(('phylogeny', 'clade', 'node', 'annotation',
                         'parent_branch', 'other'))
    # Values of 'datatype' that do not trigger a warning
    ok_datatype = set((
        'xsd:string', 'xsd:boolean', 'xsd:decimal', 'xsd:float',
        'xsd:double', 'xsd:duration', 'xsd:dateTime', 'xsd:time',
        'xsd:date', 'xsd:gYearMonth', 'xsd:gYear', 'xsd:gMonthDay',
        'xsd:gDay', 'xsd:gMonth', 'xsd:hexBinary', 'xsd:base64Binary',
        'xsd:anyURI', 'xsd:normalizedString', 'xsd:token', 'xsd:integer',
        'xsd:nonPositiveInteger', 'xsd:negativeInteger', 'xsd:long',
        'xsd:int', 'xsd:short', 'xsd:byte', 'xsd:nonNegativeInteger',
        'xsd:unsignedLong', 'xsd:unsignedInt', 'xsd:unsignedShort',
        'xsd:unsignedByte', 'xsd:positiveInteger',
        ))

    def __init__(self, value, ref, applies_to, datatype,
            unit=None, id_ref=None):
        # Each check only warns (PhyloXMLWarning); nothing is rejected
        _check_str(ref, self.re_ref.match)
        _check_str(applies_to, self.ok_applies_to.__contains__)
        _check_str(datatype, self.ok_datatype.__contains__)
        _check_str(unit, self.re_ref.match)
        # Required
        self.value = value
        self.ref = ref
        self.applies_to = applies_to
        self.datatype = datatype
        # Optional
        self.unit = unit
        self.id_ref = id_ref
830
class ProteinDomain(PhyloElement):
    """Represents an individual domain in a domain architecture.

    The locations use 0-based indexing, as most Python objects including
    SeqFeature do, rather than the usual biological convention starting at 1.
    This means the start and end attributes can be used directly as slice
    indexes on Seq objects.

    @param start: start of the domain on the sequence, using 0-based indexing
    @type start: non-negative integer
    @param end: end of the domain on the sequence
    @type end: non-negative integer
    @param confidence: can be used to store e.g. E-values. (type float)
    @param id: unique identifier/name
    """
    # TODO: confirm that 'start' counts from 1, not 0
    # NOTE(review): the TODO above disagrees with the docstring, which states
    # 0-based indexing -- one of the two needs correcting.
    def __init__(self, value, start, end, confidence=None, id=None):
        self.value = value
        self.start = start
        self.end = end
        self.confidence = confidence
        self.id = id

    @classmethod
    def from_seqfeature(cls, feat):
        """Create a domain from a SeqFeature.

        The feature's id becomes 'value', its location's nofuzzy_start and
        nofuzzy_end become 'start' and 'end', and the 'confidence' qualifier
        (if any) becomes 'confidence'.
        """
        # Instantiate through cls so that subclasses get their own type back
        return cls(feat.id,
                   feat.location.nofuzzy_start,
                   feat.location.nofuzzy_end,
                   confidence=feat.qualifiers.get('confidence'))

    def to_seqfeature(self):
        """Create a SeqFeature spanning start..end with 'value' as its id.

        The confidence (possibly None) is stored under the 'confidence'
        qualifier whenever the attribute exists on this object.
        """
        feat = SeqFeature(location=FeatureLocation(self.start, self.end),
                          id=self.value)
        if hasattr(self, 'confidence'):
            feat.qualifiers['confidence'] = self.confidence
        return feat
868
class Reference(PhyloElement):
    """Literature reference for a clade.

    Prefer the 'doi' attribute over the free-text 'desc' element whenever
    possible.
    """
    # Expected shape of a DOI: "<prefix>/<suffix>"
    re_doi = re.compile(r'[a-zA-Z0-9_\.]+/[a-zA-Z0-9_\.]+')

    def __init__(self, doi=None, desc=None):
        # Only warns (PhyloXMLWarning) on a malformed DOI; it is still stored
        _check_str(doi, self.re_doi.match)
        self.doi, self.desc = doi, desc
882
883 884 -class Sequence(PhyloElement):
885 """A molecular sequence (Protein, DNA, RNA) associated with a node. 886 887 One intended use for 'id_ref' is to link a sequence to a taxonomy (via the 888 taxonomy's 'id_source') in case of multiple sequences and taxonomies per 889 node. 890 891 @param type: type of sequence ('dna', 'rna', or 'protein'). 892 @type id_ref: str 893 @type id_source: str 894 895 @param symbol: short symbol of the sequence, e.g. 'ACTM' (max. 10 chars) 896 @type accession: Accession 897 @param name: full name of the sequence, e.g. 'muscle Actin' 898 @param location: location of a sequence on a genome/chromosome. 899 @type mol_seq: MolSeq 900 @type uri: Uri 901 @param annotations: list of Annotation objects 902 @param domain_architecture: protein domains on this sequence (type 903 DomainArchitecture) 904 @param other: list of non-phyloXML elements (type Other) 905 """ 906 alphabets = {'dna': Alphabet.generic_dna, 907 'rna': Alphabet.generic_rna, 908 'protein': Alphabet.generic_protein} 909 re_symbol = re.compile(r'\S{1,10}') 910
def __init__(self,
        # Attributes
        type=None, id_ref=None, id_source=None,
        # Child nodes
        symbol=None, accession=None, name=None, location=None,
        mol_seq=None, uri=None, domain_architecture=None,
        # Collections
        annotations=None, other=None,
        ):
    """Store the given attributes, warning about a suspicious type/symbol.

    'type' is checked against the keys of 'alphabets' and 'symbol' against
    're_symbol'; a failed check only issues a PhyloXMLWarning.
    """
    _check_str(type, self.alphabets.__contains__)
    _check_str(symbol, self.re_symbol.match)
    # Attributes
    self.type, self.id_ref, self.id_source = type, id_ref, id_source
    # Child nodes
    self.symbol, self.accession, self.name = symbol, accession, name
    self.location, self.mol_seq, self.uri = location, mol_seq, uri
    self.domain_architecture = domain_architecture
    # Collections: None (or empty) becomes a fresh empty list
    self.annotations = annotations or []
    self.other = other or []
@classmethod
def from_seqrecord(cls, record, is_aligned=None):
    """Create a new PhyloXML Sequence from a SeqRecord object.

    @param record: the SeqRecord to convert. Its id, name, description and
        seq become the accession, symbol, name and mol_seq; entries of
        record.annotations and record.features are unpacked as described in
        the comments below.
    @param is_aligned: whether the sequence is aligned. If None, it is
        inferred from whether the record's alphabet is an Alphabet.Gapped
        instance.
    """
    alphabet = record.seq.alphabet
    # Identity test: None is a singleton, and '==' could be hijacked by a
    # custom __eq__ on the argument
    if is_aligned is None:
        is_aligned = isinstance(alphabet, Alphabet.Gapped)
    params = {
            'accession': Accession(record.id, ''),
            'symbol': record.name,
            'name': record.description,
            'mol_seq': MolSeq(str(record.seq), is_aligned),
            }
    # 'type' is only set when the alphabet is recognizably DNA/RNA/protein
    if isinstance(alphabet, Alphabet.DNAAlphabet):
        params['type'] = 'dna'
    elif isinstance(alphabet, Alphabet.RNAAlphabet):
        params['type'] = 'rna'
    elif isinstance(alphabet, Alphabet.ProteinAlphabet):
        params['type'] = 'protein'

    # Unpack record.annotations
    for key in ('id_ref', 'id_source', 'location'):
        if key in record.annotations:
            params[key] = record.annotations[key]
    if isinstance(record.annotations.get('uri'), dict):
        params['uri'] = Uri(**record.annotations['uri'])
    # Build a Sequence.annotation object
    if record.annotations.get('annotations'):
        params['annotations'] = []
        for annot in record.annotations['annotations']:
            # Plain string fields are copied over only when present
            ann_args = dict(
                    (key, annot[key])
                    for key in ('ref', 'source', 'evidence', 'type', 'desc')
                    if key in annot)
            # A confidence is packed as a [value, type] list
            if isinstance(annot.get('confidence'), list):
                ann_args['confidence'] = Confidence(
                        *annot['confidence'])
            # Properties are packed as a list of keyword dicts; entries
            # that are not dicts are skipped
            if isinstance(annot.get('properties'), list):
                ann_args['properties'] = [Property(**prop)
                        for prop in annot['properties']
                        if isinstance(prop, dict)]
            params['annotations'].append(Annotation(**ann_args))

    # Unpack record.features
    if record.features:
        params['domain_architecture'] = DomainArchitecture(
                length=len(record.seq),
                domains=[ProteinDomain.from_seqfeature(feat)
                         for feat in record.features])

    # Instantiate through cls so that subclasses get their own type back
    return cls(**params)
984
def to_seqrecord(self):
    """Create a SeqRecord object from this Sequence instance.

    The accession, symbol and name become the record's id, name and
    description; any of them that is None is simply not passed to
    SeqRecord. Domains of the domain architecture, if present, become the
    record's features.

    The seqrecord.annotations dictionary is packed like so::

        { # Sequence attributes with no SeqRecord equivalent:
          'id_ref':     self.id_ref,
          'id_source':  self.id_source,
          'location':   self.location,
          'uri':        { 'value': self.uri.value,
                          'desc': self.uri.desc,
                          'type': self.uri.type },
          # Sequence.annotations attribute (list of Annotations)
          'annotations': [{ 'ref':      ann.ref,
                            'source':   ann.source,
                            'evidence': ann.evidence,
                            'type':     ann.type,
                            'confidence': [ ann.confidence.value,
                                            ann.confidence.type ],
                            'properties': [{ 'value': prop.value,
                                             'ref': prop.ref,
                                             'applies_to': prop.applies_to,
                                             'datatype':   prop.datatype,
                                             'unit':       prop.unit,
                                             'id_ref':     prop.id_ref }
                                           for prop in ann.properties],
                          } for ann in self.annotations],
        }

    Items whose value is None are left out of every dictionary above.

    @return: the new SeqRecord
    """
    def clean_dict(dct):
        """Remove None-valued items from a dictionary."""
        # items() gives the same result as iteritems() on Python 2 and
        # also exists on Python 3.
        return dict((key, val) for key, val in dct.items()
                    if val is not None)

    # str(None) would produce the literal id 'None', which clean_dict
    # cannot filter out; keep a missing accession as None instead.
    accession_id = None
    if self.accession is not None:
        accession_id = str(self.accession)

    seqrec = SeqRecord(Seq(self.mol_seq.value, self.get_alphabet()),
                       **clean_dict({
                           'id': accession_id,
                           'name': self.symbol,
                           'description': self.name,
                           # 'dbxrefs': None,
                       }))
    if self.domain_architecture:
        seqrec.features = [dom.to_seqfeature()
                           for dom in self.domain_architecture.domains]
    # Sequence attributes with no SeqRecord equivalent
    seqrec.annotations = clean_dict({
        'id_ref': self.id_ref,
        'id_source': self.id_source,
        'location': self.location,
        # "x and f(x)" leaves a missing value as None so it is dropped
        'uri': self.uri and clean_dict({
            'value': self.uri.value,
            'desc': self.uri.desc,
            'type': self.uri.type,
        }),
        'annotations': self.annotations and [
            clean_dict({
                'ref': ann.ref,
                'source': ann.source,
                'evidence': ann.evidence,
                'type': ann.type,
                'confidence': ann.confidence and [
                    ann.confidence.value,
                    ann.confidence.type],
                'properties': [clean_dict({
                    'value': prop.value,
                    'ref': prop.ref,
                    'applies_to': prop.applies_to,
                    'datatype': prop.datatype,
                    'unit': prop.unit,
                    'id_ref': prop.id_ref})
                    for prop in ann.properties],
            }) for ann in self.annotations],
    })
    return seqrec
1059
def get_alphabet(self):
    """Return the alphabet for this sequence's type.

    The alphabet is looked up in self.alphabets by self.type, falling back
    to Alphabet.generic_alphabet for unknown types. If a mol_seq is present
    and marked as aligned, the result is wrapped in Alphabet.Gapped.
    """
    base = self.alphabets.get(self.type, Alphabet.generic_alphabet)
    aligned = self.mol_seq and self.mol_seq.is_aligned
    if not aligned:
        return base
    return Alphabet.Gapped(base)
1065
class SequenceRelation(PhyloElement):
    """A typed relationship between two sequences.

    For instance, an orthology between two sequences is expressed with the
    attribute 'type' set to 'orthology'.

    @param type: describe the type of relationship
    @param id_ref_0: first sequence reference identifier
    @param id_ref_1: second sequence reference identifier
    @param distance: distance between the two sequences (type float)

    @type confidence: Confidence
    """
    # Relationship types accepted without a warning
    ok_type = set([
        'orthology',
        'one_to_one_orthology',
        'super_orthology',
        'paralogy',
        'ultra_paralogy',
        'xenology',
        'unknown',
        'other',
    ])

    def __init__(self, type, id_ref_0, id_ref_1,
                 distance=None, confidence=None):
        # An unrecognized type only triggers a PhyloXMLWarning
        _check_str(type, self.ok_type.__contains__)
        self.type = type
        self.id_ref_0 = id_ref_0
        self.id_ref_1 = id_ref_1
        self.distance = distance
        self.confidence = confidence
1091
class Taxonomy(PhyloElement):
    """Describe taxonomic information for a clade.

    @param id_source: link other elements to a taxonomy (on the XML level)

    @param id: unique identifier of a taxon, e.g. Id('6500',
        provider='ncbi_taxonomy') for the California sea hare
    @param code: store UniProt/Swiss-Prot style organism codes, e.g. 'APLCA'
        for the California sea hare 'Aplysia californica' (restricted string)
    @param scientific_name: the standard scientific name for this organism,
        e.g. 'Aplysia californica' for the California sea hare
    @param authority: keep the authority, such as 'J. G. Cooper, 1863',
        associated with the 'scientific_name'
    @param common_names: list of common names for this organism
    @param synonyms: list of synonyms (meaning not documented in the
        original source)
    @param rank: taxonomic rank (restricted string)
    @type uri: Uri
    @param other: list of non-phyloXML elements (type Other)
    """
    # re.match anchors only the start of the string; the trailing \Z makes
    # the length and character restrictions apply to the whole code, so
    # over-long codes or codes with trailing junk are reported too.
    re_code = re.compile(r'[a-zA-Z0-9_]{2,10}\Z')
    # Rank names accepted without a warning
    ok_rank = set(('domain', 'kingdom', 'subkingdom', 'branch', 'infrakingdom',
        'superphylum', 'phylum', 'subphylum', 'infraphylum', 'microphylum',
        'superdivision', 'division', 'subdivision', 'infradivision',
        'superclass', 'class', 'subclass', 'infraclass', 'superlegion',
        'legion', 'sublegion', 'infralegion', 'supercohort', 'cohort',
        'subcohort', 'infracohort', 'superorder', 'order', 'suborder',
        'superfamily', 'family', 'subfamily', 'supertribe', 'tribe', 'subtribe',
        'infratribe', 'genus', 'subgenus', 'superspecies', 'species',
        'subspecies', 'variety', 'subvariety', 'form', 'subform', 'cultivar',
        'unknown', 'other'))

    def __init__(self,
            # Attributes
            id_source=None,
            # Child nodes
            id=None, code=None, scientific_name=None, authority=None,
            rank=None, uri=None,
            # Collections
            common_names=None, synonyms=None, other=None,
            ):
        # Non-conforming values only trigger a PhyloXMLWarning; they are
        # still stored below.
        _check_str(code, self.re_code.match)
        _check_str(rank, self.ok_rank.__contains__)
        self.id_source = id_source
        self.id = id
        self.code = code
        self.scientific_name = scientific_name
        self.authority = authority
        self.rank = rank
        self.uri = uri
        # Collections default to fresh empty lists
        self.common_names = common_names or []
        self.synonyms = synonyms or []
        self.other = other or []

    def __str__(self):
        """Return the first available identifying attribute.

        The code, scientific name, rank and id are tried in that order; the
        class name is returned only if all of them are None.
        """
        if self.code is not None:
            return self.code
        if self.scientific_name is not None:
            return self.scientific_name
        if self.rank is not None:
            return self.rank
        if self.id is not None:
            return str(self.id)
        return self.__class__.__name__
1157
class Uri(PhyloElement):
    """A uniform resource identifier.

    This is generally expected to be a URL, for example a link to an image
    on a website; the 'type' attribute might then be 'image' and 'desc'
    might be 'image of a California sea hare'.

    @param value: the identifier itself
    @param desc: optional description of the resource
    @param type: optional type of the resource
    """
    def __init__(self, value, desc=None, type=None):
        self.value, self.desc, self.type = value, desc, type

    def __str__(self):
        """Return the value if it is non-empty, otherwise the repr."""
        return self.value or repr(self)
1175