Package Bio :: Package Align :: Package Applications :: Module _Mafft
[hide private]
[frames | no frames]

Source Code for Module Bio.Align.Applications._Mafft

  1  # Copyright 2009 by Cymon J. Cox.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """Command line wrapper for the multiple alignment programme MAFFT. 
  6   
  7  http://align.bmr.kyushu-u.ac.jp/mafft/software/ 
  8   
  9  Citations: 
 10   
 11  Katoh, Toh (BMC Bioinformatics 9:212, 2008) Improved accuracy of 
 12  multiple ncRNA alignment by incorporating structural information into a 
 13  MAFFT-based framework (describes RNA structural alignment methods) 
 14   
 15  Katoh, Toh (Briefings in Bioinformatics 9:286-298, 2008) Recent developments in 
 16  the MAFFT multiple sequence alignment program (outlines version 6) 
 17  Katoh, Toh (Bioinformatics 23:372-374, 2007)  Errata PartTree: an algorithm to 
 18  build an approximate tree from a large number of unaligned sequences (describes 
 19  the PartTree algorithm) 
 20   
 21  Katoh, Kuma, Toh, Miyata (Nucleic Acids Res. 33:511-518, 2005) MAFFT version 5:
 22  improvement in accuracy of multiple sequence alignment (describes [ancestral
 23  versions of] the G-INS-i, L-INS-i and E-INS-i strategies)
 24  Katoh, Misawa, Kuma, Miyata (Nucleic Acids Res. 30:3059-3066, 2002)
 25   
 26  Last checked against version: 6.626b (2009/03/16) 
 27  """ 
 28  import os 
 29  import types 
 30  from Bio.Application import _Option, _Switch, _Argument, AbstractCommandline 
 31   
class MafftCommandline(AbstractCommandline):
    """Command line wrapper for the multiple alignment program MAFFT.

    The constructor declares every MAFFT switch, option and positional
    argument this wrapper knows about (as _Switch, _Option and _Argument
    objects stored in self.parameters) and then defers to
    AbstractCommandline.__init__.
    """

    def __init__(self, cmd="mafft", **kwargs):
        """Declare the supported MAFFT parameters and initialise the base.

        Arguments:
         - cmd    -- command name handed to AbstractCommandline.__init__
                     (default "mafft").
         - kwargs -- forwarded unchanged to AbstractCommandline.__init__.
        """
        BLOSUM_MATRICES = ["30", "45", "62", "80"]

        # The builtin int/float are used instead of types.IntType and
        # types.FloatType: they are the same objects on Python 2 and, unlike
        # the types aliases, still exist on Python 3.
        def _is_float(value):
            """Return True if value is a float."""
            return isinstance(value, float)

        def _is_int(value):
            """Return True if value is an int."""
            return isinstance(value, int)

        def _is_blosum(value):
            """Return True if value names a supported BLOSUM matrix.

            Accepts the historical string form ("62") as well as the
            equivalent integer (62).
            """
            return str(value) in BLOSUM_MATRICES

        self.parameters = [
            # **** Algorithm ****
            # Automatically selects an appropriate strategy from L-INS-i,
            # FFT-NS-i and FFT-NS-2, according to data size.
            # Default: off (always FFT-NS-2)
            _Switch(["--auto", "auto"], ["input"],
                    "Automatically select strategy. Default off."),
            # Distance is calculated based on the number of shared 6mers.
            # Default: on
            _Switch(["--6merpair", "6merpair", "sixmerpair"], ["input"],
                    "Distance is calculated based on the number of shared "
                    "6mers. Default: on"),
            # All pairwise alignments are computed with the Needleman-Wunsch
            # algorithm. More accurate but slower than --6merpair. Suitable
            # for a set of globally alignable sequences. Applicable to up to
            # ~200 sequences. A combination with --maxiterate 1000 is
            # recommended (G-INS-i). Default: off (6mer distance is used)
            _Switch(["--globalpair", "globalpair"], ["input"],
                    "All pairwise alignments are computed with the "
                    "Needleman-Wunsch algorithm. Default: off"),
            # All pairwise alignments are computed with the Smith-Waterman
            # algorithm. More accurate but slower than --6merpair. Suitable
            # for a set of locally alignable sequences. Applicable to up to
            # ~200 sequences. A combination with --maxiterate 1000 is
            # recommended (L-INS-i). Default: off (6mer distance is used)
            _Switch(["--localpair", "localpair"], ["input"],
                    "All pairwise alignments are computed with the "
                    "Smith-Waterman algorithm. Default: off"),
            # All pairwise alignments are computed with a local algorithm
            # with the generalized affine gap cost (Altschul 1998). More
            # accurate but slower than --6merpair. Suitable when large
            # internal gaps are expected. Applicable to up to ~200 sequences.
            # A combination with --maxiterate 1000 is recommended (E-INS-i).
            # Default: off (6mer distance is used)
            _Switch(["--genafpair", "genafpair"], ["input"],
                    "All pairwise alignments are computed with a local "
                    "algorithm with the generalized affine gap cost "
                    "(Altschul 1998). Default: off"),
            # All pairwise alignments are computed with FASTA (Pearson and
            # Lipman 1988). FASTA is required.
            # Default: off (6mer distance is used)
            _Switch(["--fastapair", "fastapair"], ["input"],
                    "All pairwise alignments are computed with FASTA "
                    "(Pearson and Lipman 1988). Default: off"),
            # Weighting factor for the consistency term calculated from
            # pairwise alignments. Valid when either of --globalpair,
            # --localpair, --genafpair, --fastapair or --blastpair is
            # selected. Default: 2.7
            _Option(["--weighti", "weighti"], ["input"],
                    _is_float, 0,
                    "Weighting factor for the consistency term calculated "
                    "from pairwise alignments. Default: 2.7",
                    0),
            # Guide tree is built number times in the progressive stage.
            # Valid with 6mer distance. Default: 2
            _Option(["--retree", "retree"], ["input"],
                    _is_int, 0,
                    "Guide tree is built number times in the progressive "
                    "stage. Valid with 6mer distance. Default: 2",
                    0),
            # Number cycles of iterative refinement are performed.
            # Default: 0
            _Option(["--maxiterate", "maxiterate"], ["input"],
                    _is_int, 0,
                    "Number cycles of iterative refinement are performed. "
                    "Default: 0",
                    0),
            # Use FFT approximation in group-to-group alignment.
            # Default: on
            _Switch(["--fft", "fft"], ["input"],
                    "Use FFT approximation in group-to-group alignment. "
                    "Default: on"),
            # Do not use FFT approximation in group-to-group alignment.
            # Default: off
            _Switch(["--nofft", "nofft"], ["input"],
                    "Do not use FFT approximation in group-to-group "
                    "alignment. Default: off"),
            # Alignment score is not checked in the iterative refinement
            # stage. Default: off (score is checked)
            _Switch(["--noscore", "noscore"], ["input"],
                    "Alignment score is not checked in the iterative "
                    "refinement stage. Default: off (score is checked)"),
            # Use the Myers-Miller (1988) algorithm. Default: automatically
            # turned on when the alignment length exceeds 10,000 (aa/nt).
            _Switch(["--memsave", "memsave"], ["input"],
                    "Use the Myers-Miller (1988) algorithm. Default: "
                    "automatically turned on when the alignment length "
                    "exceeds 10,000 (aa/nt)."),
            # Use a fast tree-building method (PartTree, Katoh and Toh 2007)
            # with the 6mer distance. Recommended when a large number
            # (> ~10,000) of sequences are input. Default: off
            _Switch(["--parttree", "parttree"], ["input"],
                    "Use a fast tree-building method with the 6mer "
                    "distance. Default: off"),
            # The PartTree algorithm is used with distances based on DP.
            # Slightly more accurate and slower than --parttree. Recommended
            # when a large number (> ~10,000) of sequences are input.
            # Default: off
            _Switch(["--dpparttree", "dpparttree"], ["input"],
                    "The PartTree algorithm is used with distances "
                    "based on DP. Default: off"),
            # The PartTree algorithm is used with distances based on FASTA.
            # Slightly more accurate and slower than --parttree. Recommended
            # when a large number (> ~10,000) of sequences are input. FASTA
            # is required. Default: off
            _Switch(["--fastaparttree", "fastaparttree"], ["input"],
                    "The PartTree algorithm is used with distances based "
                    "on FASTA. Default: off"),
            # The number of partitions in the PartTree algorithm.
            # Default: 50
            _Option(["--partsize", "partsize"], ["input"],
                    _is_int, 0,
                    "The number of partitions in the PartTree algorithm. "
                    "Default: 50",
                    0),
            # Do not make alignment larger than number sequences. Valid only
            # with the --*parttree options.
            # Default: the number of input sequences
            # This takes a numeric value, so it is declared as an _Option
            # (it was previously a value-less _Switch).
            _Option(["--groupsize", "groupsize"], ["input"],
                    _is_int, 0,
                    "Do not make alignment larger than number sequences. "
                    "Default: the number of input sequences",
                    0),
            # **** Parameter ****
            # Gap opening penalty at group-to-group alignment.
            # Default: 1.53
            _Option(["--op", "op"], ["input"],
                    _is_float, 0,
                    "Gap opening penalty at group-to-group alignment. "
                    "Default: 1.53",
                    0),
            # Offset value, which works like gap extension penalty, for
            # group-to-group alignment. Default: 0.123
            _Option(["--ep", "ep"], ["input"],
                    _is_float, 0,
                    "Offset value, which works like gap extension penalty, "
                    "for group-to-group alignment. Default: 0.123",
                    0),
            # Gap opening penalty at local pairwise alignment. Valid when
            # the --localpair or --genafpair option is selected.
            # Default: -2.00
            _Option(["--lop", "lop"], ["input"],
                    _is_float, 0,
                    "Gap opening penalty at local pairwise alignment. "
                    "Default: -2.00",
                    0),
            # Offset value at local pairwise alignment. Valid when the
            # --localpair or --genafpair option is selected. Default: 0.1
            _Option(["--lep", "lep"], ["input"],
                    _is_float, 0,
                    "Offset value at local pairwise alignment. "
                    "Default: 0.1",
                    0),
            # Gap extension penalty at local pairwise alignment. Valid when
            # the --localpair or --genafpair option is selected.
            # Default: -0.1
            _Option(["--lexp", "lexp"], ["input"],
                    _is_float, 0,
                    "Gap extension penalty at local pairwise alignment. "
                    "Default: -0.1",
                    0),
            # Gap opening penalty to skip the alignment. Valid when the
            # --genafpair option is selected. Default: -6.00
            _Option(["--LOP", "LOP"], ["input"],
                    _is_float, 0,
                    "Gap opening penalty to skip the alignment. "
                    "Default: -6.00",
                    0),
            # Gap extension penalty to skip the alignment. Valid when the
            # --genafpair option is selected. Default: 0.00
            _Option(["--LEXP", "LEXP"], ["input"],
                    _is_float, 0,
                    "Gap extension penalty to skip the alignment. "
                    "Default: 0.00",
                    0),
            # BLOSUM number matrix (Henikoff and Henikoff 1992) is used.
            # number=30, 45, 62 or 80. Default: 62
            _Option(["--bl", "bl"], ["input"],
                    _is_blosum, 0,
                    "BLOSUM number matrix is used. Default: 62",
                    0),
            # JTT PAM number (Jones et al. 1992) matrix is used. number>0.
            # Default: BLOSUM62
            _Option(["--jtt", "jtt"], ["input"], None, 0,
                    "JTT PAM number (Jones et al. 1992) matrix is used. "
                    "number>0. Default: BLOSUM62",
                    0),
            # Transmembrane PAM number (Jones et al. 1994) matrix is used.
            # number>0. Default: BLOSUM62
            # The value is a number, not a file name, so it is not checked
            # with os.path.exists; like --jtt it has no checker function.
            _Option(["--tm", "tm"], ["input"], None, 0,
                    "Transmembrane PAM number (Jones et al. 1994) "
                    "matrix is used. number>0. Default: BLOSUM62",
                    0),
            # Use a user-defined AA scoring matrix. The format of matrixfile
            # is the same to that of BLAST. Ignored when nucleotide
            # sequences are input. Default: BLOSUM62
            _Option(["--aamatrix", "aamatrix"], ["input"],
                    os.path.exists, 0,
                    "Use a user-defined AA scoring matrix. "
                    "Default: BLOSUM62",
                    0),
            # Incorporate the AA/nuc composition information into the
            # scoring matrix. Default: off
            _Switch(["--fmodel", "fmodel"], ["input"],
                    "Incorporate the AA/nuc composition information "
                    "into the scoring matrix. Default: off"),
            # **** Output ****
            # Output format: clustal format. Default: off (fasta format)
            _Switch(["--clustalout", "clustalout"], ["input"],
                    "Output format: clustal format. Default: off (fasta "
                    "format)"),
            # Output order: same as input. Default: on
            _Switch(["--inputorder", "inputorder"], ["input"],
                    "Output order: same as input. Default: on"),
            # Output order: aligned. Default: off (inputorder)
            _Switch(["--reorder", "reorder"], ["input"],
                    "Output order: aligned. Default: off (inputorder)"),
            # Guide tree is output to the input.tree file. Default: off
            _Switch(["--treeout", "treeout"], ["input"],
                    "Guide tree is output to the input.tree file. "
                    "Default: off"),
            # Do not report progress. Default: off
            _Switch(["--quiet", "quiet"], ["input"],
                    "Do not report progress. Default: off"),
            # **** Input ****
            # Assume the sequences are nucleotide. Default: auto
            _Switch(["--nuc", "nuc"], ["input"],
                    "Assume the sequences are nucleotide. Default: auto"),
            # Assume the sequences are amino acid. Default: auto
            _Switch(["--amino", "amino"], ["input"],
                    "Assume the sequences are amino acid. Default: auto"),
            # ##################### SEEDS ################################
            # MAFFT has multiple --seed commands where the unaligned input
            # is aligned to the seed alignment. There can be multiple seeds
            # in the form: "mafft --seed align1 --seed align2 [etc] input"
            # Effectively for n number of seed alignments. Only a single
            # --seed parameter is declared here.
            _Option(["--seed", "seed"], ["input", "file"],
                    os.path.exists, 0,
                    "Seed alignments given in alignment_n (fasta format) "
                    "are aligned with sequences in input.",
                    0),
            # The old solution of also defining extra parameters with
            # ["--seed", "seed1"] etc worked, but clashes with the recent
            # code in the base class to look for duplicate parameters and
            # raise an error. Perhaps that check should be ignored here, or
            # maybe we can handle this more elegantly...
            # TODO - Create an _OptionList parameter which allows a list to
            # be assigned to the value?
            # ##################### END SEEDS ############################
            # The input (must be FASTA format)
            _Argument(["input"], ["input"], os.path.exists, 1,
                      "Input file name"),
            # mafft-profile takes a second alignment input as an argument:
            # mafft-profile align1 align2
            _Argument(["input1"], ["input"], os.path.exists, 0,
                      "Second input file name for the mafft-profile command"),
        ]
        AbstractCommandline.__init__(self, cmd, **kwargs)
284