Package Bio :: Package Align :: Package Applications :: Module _Muscle
[hide private]
[frames | no frames]

Source Code for Module Bio.Align.Applications._Muscle

  1  # Copyright 2009 by Cymon J. Cox.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """Command line wrapper for the multiple alignment program MUSCLE. 
  6   
  7  http://www.drive5.com/muscle/ 
  8   
  9  Citations: 
 10   
 11  Edgar, Robert C. (2004), MUSCLE: multiple sequence alignment with high accuracy 
 12  and high throughput, Nucleic Acids Research 32(5), 1792-97. 
 13   
 14  Edgar, R.C. (2004) MUSCLE: a multiple sequence alignment method with reduced 
 15  time and space complexity. BMC Bioinformatics 5(1): 113. 
 16   
 17  Last checked against version: 3.7 
 18  """ 
 19  import types 
 20  from Bio.Application import _Option, _Switch, AbstractCommandline 
 21   
class MuscleCommandline(AbstractCommandline):
    """Command line wrapper for the multiple alignment program MUSCLE.

    The constructor builds ``self.parameters``, the list of ``_Option`` and
    ``_Switch`` objects describing the MUSCLE command line arguments known
    to this wrapper (last checked against MUSCLE 3.7), and then delegates
    to ``AbstractCommandline.__init__``.
    """

    def __init__(self, cmd="muscle", **kwargs):
        """Declare the supported MUSCLE arguments and initialise the base.

        Arguments:
         - cmd    - forwarded unchanged to ``AbstractCommandline.__init__``
                    (defaults to "muscle"); presumably the name or path of
                    the executable - confirm against Bio.Application.
         - kwargs - forwarded unchanged to ``AbstractCommandline.__init__``;
                    presumably initial values for the parameters declared
                    below - confirm against Bio.Application.
        """
        # Permitted values for the enumerated options.
        CLUSTERING_ALGORITHMS = ["upgma", "upgmb", "neighborjoining"]
        DISTANCE_MEASURES_ITER1 = ["kmer6_6", "kmer20_3", "kmer20_4",
                                   "kbit20_3", "kmer4_6"]
        DISTANCE_MEASURES_ITER2 = DISTANCE_MEASURES_ITER1 + \
                                  ["pctid_kimura", "pctid_log"]
        OBJECTIVE_SCORES = ["sp", "ps", "dp", "xp", "spf", "spm"]
        TREE_ROOT_METHODS = ["pseudo", "midlongestspan", "minavgleafdist"]
        SEQUENCE_TYPES = ["protein", "nucleo", "auto"]
        WEIGHTING_SCHEMES = ["none", "clustalw", "henikoff", "henikoffpb",
                             "gsc", "threeway"]

        # Value checkers shared by the options below.  The builtins int and
        # float are the very same objects as types.IntType / types.FloatType
        # on Python 2, and (unlike those aliases) also exist on Python 3.
        def is_int(value):
            return isinstance(value, int)

        def is_float(value):
            return isinstance(value, float)

        def one_of(choices):
            """Return a checker accepting only members of choices."""
            return lambda value: value in choices

        # _Option positional arguments, in order: names, types, checker
        # function, a flag that is always 0 here (presumably "is required" -
        # confirm against Bio.Application), description, and equate (0 means
        # the value is not joined to the switch with "=").
        self.parameters = [
            # Can't use "in" as the final alias as this is a reserved word
            # in python:
            _Option(["-in", "in", "input"], ["input", "file"],
                    None, 0, "Input filename",
                    0),
            _Option(["-out", "out"], ["output", "file"],
                    None, 0, "Output filename",
                    0),
            _Switch(["-diags", "diags"], ["input"],
                    "Find diagonals (faster for similar sequences)"),
            _Switch(["-profile", "profile"], ["input"],
                    "Perform a profile alignment"),
            _Option(["-in1", "in1"], ["input", "file"],
                    None, 0,
                    "First input filename for profile alignment",
                    0),
            _Option(["-in2", "in2"], ["input", "file"],
                    None, 0,
                    "Second input filename for a profile alignment",
                    0),
            # anchorspacing: integer (MUSCLE default 32).
            _Option(["-anchorspacing", "anchorspacing"], ["input"],
                    is_int, 0,
                    "Minimum spacing between anchor columns",
                    0),
            # center: floating point, should be negative.
            _Option(["-center", "center"], ["input"],
                    is_float, 0,
                    "Center parameter - should be negative",
                    0),
            # cluster1 is used in iterations 1 and 2, cluster2 in later
            # iterations.
            _Option(["-cluster1", "cluster1"], ["input"],
                    one_of(CLUSTERING_ALGORITHMS), 0,
                    "Clustering method used in iteration 1",
                    0),
            _Option(["-cluster2", "cluster2"], ["input"],
                    one_of(CLUSTERING_ALGORITHMS), 0,
                    "Clustering method used in iteration 2",
                    0),
            # diaglength: integer (MUSCLE default 24).
            _Option(["-diaglength", "diaglength"], ["input"],
                    is_int, 0,
                    "Minimum length of diagonal",
                    0),
            # diagmargin: integer (MUSCLE default 5).
            _Option(["-diagmargin", "diagmargin"], ["input"],
                    is_int, 0,
                    "Discard this many positions at ends of diagonal",
                    0),
            # distance1: MUSCLE default is kmer6_6 (amino) or kmer4_6
            # (nucleo).
            _Option(["-distance1", "distance1"], ["input"],
                    one_of(DISTANCE_MEASURES_ITER1), 0,
                    "Distance measure for iteration 1",
                    0),
            # distance2: used for iterations 2, 3 ... (MUSCLE default
            # pctid_kimura).
            _Option(["-distance2", "distance2"], ["input"],
                    one_of(DISTANCE_MEASURES_ITER2), 0,
                    "Distance measure for iteration 2",
                    0),
            # gapopen: floating point, must be negative.
            _Option(["-gapopen", "gapopen"], ["input"],
                    is_float, 0,
                    "Gap open score - negative number",
                    0),
            # hydro: integer (MUSCLE default 5).
            _Option(["-hydro", "hydro"], ["input"],
                    is_int, 0,
                    "Window size for hydrophobic region",
                    0),
            # hydrofactor: floating point (MUSCLE default 1.2).
            _Option(["-hydrofactor", "hydrofactor"], ["input"],
                    is_float, 0,
                    "Multiplier for gap penalties in hydrophobic regions",
                    0),
            # log: any existing file is deleted.
            _Option(["-log", "log"], ["output", "file"],
                    None, 0,
                    "Log file name",
                    0),
            # loga: appends to any existing file.
            _Option(["-loga", "loga"], ["output", "file"],
                    None, 0,
                    "Log file name (append to existing file)",
                    0),
            # maxdiagbreak: integer (MUSCLE default 1).
            _Option(["-maxdiagbreak", "maxdiagbreak"], ["input"],
                    is_int, 0,
                    "Maximum distance between two diagonals that allows "
                    "them to merge into one diagonal",
                    0),
            # maxhours: floating point, so 1.5 means one hour and 30
            # minutes.  The actual time may exceed the requested limit by
            # a few minutes.
            _Option(["-maxhours", "maxhours"], ["input"],
                    is_float, 0,
                    "Maximum time to run in hours",
                    0),
            # maxiters: integer 1, 2 ... (MUSCLE default 16).
            _Option(["-maxiters", "maxiters"], ["input"],
                    is_int, 0,
                    "Maximum number of iterations",
                    0),
            # maxtrees: integer (MUSCLE default 1).
            _Option(["-maxtrees", "maxtrees"], ["input"],
                    is_int, 0,
                    "Maximum number of trees to build in iteration 2",
                    0),
            # minbestcolscore: floating point.
            _Option(["-minbestcolscore", "minbestcolscore"], ["input"],
                    is_float, 0,
                    "Minimum score a column must have to be an anchor",
                    0),
            # minsmoothscore: floating point.
            _Option(["-minsmoothscore", "minsmoothscore"], ["input"],
                    is_float, 0,
                    "Minimum smoothed score a column must have to "
                    "be an anchor",
                    0),
            # objscore (MUSCLE default spm):
            #   sp  = sum-of-pairs score
            #   spf = sum-of-pairs score (dimer approximation)
            #   spm = sp for < 100 seqs, otherwise spf
            #   dp  = dynamic programming score
            #   ps  = average profile-sequence score
            #   xp  = cross profile score
            _Option(["-objscore", "objscore"], ["input"],
                    one_of(OBJECTIVE_SCORES), 0,
                    "Objective score used by tree dependent refinement",
                    0),
            # root1 is used in iterations 1 and 2, root2 in later
            # iterations.
            _Option(["-root1", "root1"], ["input"],
                    one_of(TREE_ROOT_METHODS), 0,
                    "Method used to root tree in iteration 1",
                    0),
            _Option(["-root2", "root2"], ["input"],
                    one_of(TREE_ROOT_METHODS), 0,
                    "Method used to root tree in iteration 2",
                    0),
            # seqtype: MUSCLE default is auto.
            _Option(["-seqtype", "seqtype"], ["input"],
                    one_of(SEQUENCE_TYPES), 0,
                    "Sequence type",
                    0),
            # smoothscoreceil: floating point.
            _Option(["-smoothscoreceil", "smoothscoreceil"], ["input"],
                    is_float, 0,
                    "Maximum value of column score for smoothing",
                    0),
            # smoothwindow: integer (MUSCLE default 7).
            _Option(["-smoothwindow", "smoothwindow"], ["input"],
                    is_int, 0,
                    "Window used for anchor column smoothing",
                    0),
            # SUEFF: floating point value between 0 and 1 (MUSCLE default
            # 0.1).  Determines the relative fraction of average linkage
            # (SUEFF) vs. nearest-neighbor linkage (1 - SUEFF).
            _Option(["-sueff", "sueff"], ["input"],
                    is_float, 0,
                    "Constant used in UPGMB clustering",
                    0),
            # tree1 / tree2: save the tree produced in the first or second
            # iteration to the given file in Newick (Phylip-compatible)
            # format.
            _Option(["-tree1", "tree1"], ["input"],
                    None, 0,
                    "Save Newick tree from iteration 1",
                    0),
            _Option(["-tree2", "tree2"], ["input"],
                    None, 0,
                    "Save Newick tree from iteration 2",
                    0),
            # weight1 is used in iterations 1 and 2, weight2 for
            # tree-dependent refinement:
            #   none       = all sequences have equal weight
            #   henikoff   = Henikoff & Henikoff weighting scheme
            #   henikoffpb = modified Henikoff scheme as used in PSI-BLAST
            #   clustalw   = CLUSTALW method
            #   threeway   = Gotoh three-way method
            _Option(["-weight1", "weight1"], ["input"],
                    one_of(WEIGHTING_SCHEMES), 0,
                    "Weighting scheme used in iteration 1",
                    0),
            _Option(["-weight2", "weight2"], ["input"],
                    one_of(WEIGHTING_SCHEMES), 0,
                    "Weighting scheme used in iteration 2",
                    0),
            #################### FORMATS ################################
            # Multiple formats can be specified on the command line.
            # If -msf appears it will be used regardless of other formats
            # specified.  If -clw appears (and not -msf), clustalw format
            # will be used regardless of other formats specified.  If both
            # -clw and -clwstrict are specified -clwstrict will be used
            # regardless of other formats specified.  If -fasta is
            # specified and not -msf, -clw, or -clwstrict, fasta will be
            # used.  If -fasta and -html are specified -fasta will be used.
            # Only if -html is specified alone will html be used.
            _Switch(["-clw", "clw"], ["input"],
                    "Write output in CLUSTALW format (with a MUSCLE header)"),
            # clwstrict uses the "CLUSTAL W (1.81)" header rather than the
            # MUSCLE version, useful when a post-processing step is picky
            # about the file header.
            _Switch(["-clwstrict", "clwstrict"], ["input"],
                    "Write output in CLUSTALW format with version 1.81 header"),
            # fasta is the MUSCLE default output format.
            _Switch(["-fasta", "fasta"], ["input"],
                    "Write output in FASTA format"),
            _Switch(["-html", "html"], ["input"],
                    "Write output in HTML format"),
            _Switch(["-msf", "msf"], ["input"],
                    "Write output in MSF format"),
            # Phylip interleaved - undocumented as of 3.7
            _Switch(["-phyi", "phyi"], ["input"],
                    "Write output in PHYLIP interleaved format"),
            # Phylip sequential - undocumented as of 3.7
            _Switch(["-phys", "phys"], ["input"],
                    "Write output in PHYLIP sequential format"),
            ################## Additional specified output files #########
            _Option(["-phyiout", "phyiout"], ["output", "file"],
                    None, 0,
                    "Write PHYLIP interleaved output to specified filename",
                    0),
            _Option(["-physout", "physout"], ["output", "file"],
                    None, 0,
                    "Write PHYLIP sequential format to specified filename",
                    0),
            _Option(["-htmlout", "htmlout"], ["output", "file"],
                    None, 0,
                    "Write HTML output to specified filename",
                    0),
            _Option(["-clwout", "clwout"], ["output", "file"],
                    None, 0,
                    "Write CLUSTALW output (with MUSCLE header) to specified "
                    "filename",
                    0),
            _Option(["-clwstrictout", "clwstrictout"], ["output", "file"],
                    None, 0,
                    "Write CLUSTALW output (with version 1.81 header) to "
                    "specified filename",
                    0),
            _Option(["-msfout", "msfout"], ["output", "file"],
                    None, 0,
                    "Write MSF format output to specified filename",
                    0),
            _Option(["-fastaout", "fastaout"], ["output", "file"],
                    None, 0,
                    "Write FASTA format output to specified filename",
                    0),
            ############## END FORMATS ###################################
            # anchors is the MUSCLE default; noanchors disables it.
            _Switch(["-anchors", "anchors"], ["input"],
                    "Use anchor optimisation in tree dependent "
                    "refinement iterations"),
            _Switch(["-noanchors", "noanchors"], ["input"],
                    "Do not use anchor optimisation in tree dependent "
                    "refinement iterations"),
            # group is the MUSCLE default; stable preserves the input order
            # of sequences in the output file instead.
            _Switch(["-group", "group"], ["input"],
                    "Group similar sequences in output"),
            _Switch(["-stable", "stable"], ["input"],
                    "Do not group similar sequences in output"),
            ############## log-expectation profile score ##################
            # One of either -le, -sp, or -sv
            #
            # According to the doc, spn is default and the only option for
            # nucleotides: this doesn't appear to be true.  -le, -sp, and
            # -sv can be used and produce numerically different logs.
            # spn fails on proteins.
            #
            # le is the documented default for amino acid sequences.
            _Switch(["-le", "le"], ["input"],
                    "Use log-expectation profile score (VTML240)"),
            _Switch(["-sv", "sv"], ["input"],
                    "Use sum-of-pairs profile score (VTML240)"),
            _Switch(["-sp", "sp"], ["input"],
                    "Use sum-of-pairs protein profile score (PAM200)"),
            # spn: sum-of-pairs nucleotide profile score (BLASTZ
            # parameters).
            _Switch(["-spn", "spn"], ["input"],
                    "Use sum-of-pairs nucleotide profile score"),
            ############## END log-expectation profile score ##############
            _Switch(["-quiet", "quiet"], ["input"],
                    "Do not display progress messages"),
            # refine: input file is already aligned, skip the first two
            # iterations and begin tree dependent refinement.
            _Switch(["-refine", "refine"], ["input"],
                    "Only do tree dependent refinement"),
            # core: on by default in muscle, off in muscled.
            _Switch(["-core", "core"], ["input"],
                    "Do not catch exceptions"),
            # nocore: off by default in muscle, on in muscled.
            _Switch(["-nocore", "nocore"], ["input"],
                    "Catch exceptions and give an error message if possible"),
            # termgapsfull, termgapshalf and termgapshalflonger are not
            # wrapped: the MUSCLE documentation marks them as not fully
            # supported in this version.
            #
            # verbose: output goes to the log file.
            _Switch(["-verbose", "verbose"], ["input"],
                    "Write parameter settings and progress"),
            _Switch(["-version", "version"], ["input"],
                    "Write version string to stdout and exit"),
        ]
        AbstractCommandline.__init__(self, cmd, **kwargs)
475