Package Bio :: Package AlignIO :: Module ClustalIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignIO.ClustalIO

  1  # Copyright 2006-2010 by Peter Cock.  All rights reserved. 
  2  # 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """ 
  7  Bio.AlignIO support for the "clustal" output from CLUSTAL W and other tools. 
  8   
  9  You are expected to use this module via the Bio.AlignIO functions (or the 
 10  Bio.SeqIO functions if you want to work directly with the gapped sequences). 
 11  """ 
 12   
 13  from Bio.Align import MultipleSeqAlignment 
 14  from Interfaces import AlignmentIterator, SequentialAlignmentWriter 
 15   
16 -class ClustalWriter(SequentialAlignmentWriter):
17 """Clustalw alignment writer."""
18 - def write_alignment(self, alignment):
19 """Use this to write (another) single alignment to an open file.""" 20 21 if len(alignment) == 0: 22 raise ValueError("Must have at least one sequence") 23 if alignment.get_alignment_length() == 0: 24 #This doubles as a check for an alignment object 25 raise ValueError("Non-empty sequences are required") 26 27 #Old versions of the parser in Bio.Clustalw used a ._version property, 28 try: 29 version = str(alignment._version) 30 except AttributeError: 31 version = "" 32 if not version: 33 version = '1.81' 34 if version.startswith("2."): 35 #e.g. 2.0.x 36 output = "CLUSTAL %s multiple sequence alignment\n\n\n" % version 37 else: 38 #e.g. 1.81 or 1.83 39 output = "CLUSTAL X (%s) multiple sequence alignment\n\n\n" % version 40 41 cur_char = 0 42 max_length = len(alignment[0]) 43 44 if max_length <= 0: 45 raise ValueError("Non-empty sequences are required") 46 47 # keep displaying sequences until we reach the end 48 while cur_char != max_length: 49 # calculate the number of sequences to show, which will 50 # be less if we are at the end of the sequence 51 if (cur_char + 50) > max_length: 52 show_num = max_length - cur_char 53 else: 54 show_num = 50 55 56 # go through all of the records and print out the sequences 57 # when we output, we do a nice 80 column output, although this 58 # may result in truncation of the ids. 59 for record in alignment: 60 #Make sure we don't get any spaces in the record 61 #identifier when output in the file by replacing 62 #them with underscores: 63 line = record.id[0:30].replace(" ","_").ljust(36) 64 line += record.seq.data[cur_char:(cur_char + show_num)] 65 output += line + "\n" 66 67 # now we need to print out the star info, if we've got it 68 # This was stored by Bio.Clustalw using a ._star_info property. 69 if hasattr(alignment, "_star_info") and alignment._star_info != '': 70 output += (" " * 36) + \ 71 alignment._star_info[cur_char:(cur_char + show_num)] + "\n" 72 73 output += "\n" 74 cur_char += show_num 75 76 # Want a trailing blank new line in case the output is concatenated 77 self.handle.write(output + "\n")
78
79 -class ClustalIterator(AlignmentIterator):
80 """Clustalw alignment iterator.""" 81
82 - def next(self):
83 handle = self.handle 84 try: 85 #Header we saved from when we were parsing 86 #the previous alignment. 87 line = self._header 88 del self._header 89 except AttributeError: 90 line = handle.readline() 91 if not line: 92 return None 93 94 #Whitelisted headers we know about 95 known_headers = ['CLUSTAL', 'PROBCONS', 'MUSCLE'] 96 if line.strip().split()[0] not in known_headers: 97 raise ValueError("%s is not a known CLUSTAL header: %s" % \ 98 (line.strip().split()[0], 99 ", ".join(known_headers))) 100 101 # find the clustal version in the header line 102 version = None 103 for word in line.split(): 104 if word[0]=='(' and word[-1]==')': 105 word = word[1:-1] 106 if word[0] in '0123456789': 107 version = word 108 break 109 110 #There should be two blank lines after the header line 111 line = handle.readline() 112 while line.strip() == "": 113 line = handle.readline() 114 115 #If the alignment contains entries with the same sequence 116 #identifier (not a good idea - but seems possible), then this 117 #dictionary based parser will merge their sequences. Fix this? 118 ids = [] 119 seqs = [] 120 consensus = "" 121 seq_cols = None #: Used to extract the consensus 122 123 #Use the first block to get the sequence identifiers 124 while True: 125 if line[0] != " " and line.strip() != "": 126 #Sequences identifier... 127 fields = line.rstrip().split() 128 129 #We expect there to be two fields, there can be an optional 130 #"sequence number" field containing the letter count. 131 if len(fields) < 2 or len(fields) > 3: 132 raise ValueError("Could not parse line:\n%s" % line) 133 134 ids.append(fields[0]) 135 seqs.append(fields[1]) 136 137 #Record the sequence position to get the consensus 138 if seq_cols is None: 139 start = len(fields[0]) + line[len(fields[0]):].find(fields[1]) 140 end = start + len(fields[1]) 141 seq_cols = slice(start, end) 142 del start, end 143 assert fields[1] == line[seq_cols] 144 145 if len(fields) == 3: 146 #This MAY be an old style file with a letter count... 147 try: 148 letters = int(fields[2]) 149 except ValueError: 150 raise ValueError("Could not parse line, bad sequence number:\n%s" % line) 151 if len(fields[1].replace("-","")) != letters: 152 raise ValueError("Could not parse line, invalid sequence number:\n%s" % line) 153 elif line[0] == " ": 154 #Sequence consensus line... 155 assert len(ids) == len(seqs) 156 assert len(ids) > 0 157 assert seq_cols is not None 158 consensus = line[seq_cols] 159 assert not line[:seq_cols.start].strip() 160 assert not line[seq_cols.stop:].strip() 161 #Check for blank line (or end of file) 162 line = handle.readline() 163 assert line.strip() == "" 164 break 165 else: 166 #No consensus 167 break 168 line = handle.readline() 169 if not line : break #end of file 170 171 assert line.strip() == "" 172 assert seq_cols is not None 173 174 #Confirm all same length 175 for s in seqs: 176 assert len(s) == len(seqs[0]) 177 if consensus: 178 assert len(consensus) == len(seqs[0]) 179 180 #Loop over any remaining blocks... 181 done = False 182 while not done: 183 #There should be a blank line between each block. 184 #Also want to ignore any consensus line from the 185 #previous block. 186 while (not line) or line.strip() == "": 187 line = handle.readline() 188 if not line : break # end of file 189 if not line : break # end of file 190 191 if line.split(None,1)[0] in known_headers: 192 #Found concatenated alignment. 193 done = True 194 self._header = line 195 break 196 197 for i in range(len(ids)): 198 assert line[0] != " ", "Unexpected line:\n%s" % repr(line) 199 fields = line.rstrip().split() 200 201 #We expect there to be two fields, there can be an optional 202 #"sequence number" field containing the letter count. 203 if len(fields) < 2 or len(fields) > 3: 204 raise ValueError("Could not parse line:\n%s" % repr(line)) 205 206 if fields[0] != ids[i]: 207 raise ValueError("Identifiers out of order? Got '%s' but expected '%s'" \ 208 % (fields[0], ids[i])) 209 210 if fields[1] != line[seq_cols]: 211 start = len(fields[0]) + line[len(fields[0]):].find(fields[1]) 212 assert start == seq_cols.start, 'Old location %s -> %i:XX' % (seq_cols, start) 213 end = start + len(fields[1]) 214 seq_cols = slice(start, end) 215 del start, end 216 217 #Append the sequence 218 seqs[i] += fields[1] 219 assert len(seqs[i]) == len(seqs[0]) 220 221 if len(fields) == 3: 222 #This MAY be an old style file with a letter count... 223 try: 224 letters = int(fields[2]) 225 except ValueError: 226 raise ValueError("Could not parse line, bad sequence number:\n%s" % line) 227 if len(seqs[i].replace("-","")) != letters: 228 raise ValueError("Could not parse line, invalid sequence number:\n%s" % line) 229 230 #Read in the next line 231 line = handle.readline() 232 #There should now be a consensus line 233 if consensus: 234 assert line[0] == " " 235 assert seq_cols is not None 236 consensus += line[seq_cols] 237 assert len(consensus) == len(seqs[0]) 238 assert not line[:seq_cols.start].strip() 239 assert not line[seq_cols.stop:].strip() 240 #Read in the next line 241 line = handle.readline() 242 243 244 assert len(ids) == len(seqs) 245 if len(seqs) == 0 or len(seqs[0]) == 0: 246 return None 247 248 if self.records_per_alignment is not None \ 249 and self.records_per_alignment != len(ids): 250 raise ValueError("Found %i records in this alignment, told to expect %i" \ 251 % (len(ids), self.records_per_alignment)) 252 253 alignment = MultipleSeqAlignment(self.alphabet) 254 alignment_length = len(seqs[0]) 255 for i in range(len(ids)): 256 if len(seqs[i]) != alignment_length: 257 raise ValueError("Error parsing alignment - sequences of different length?") 258 alignment.add_sequence(ids[i], seqs[i]) 259 #TODO - Handle alignment annotation better, for now 260 #mimic the old parser in Bio.Clustalw 261 if version: 262 alignment._version = version 263 if consensus: 264 assert len(consensus) == alignment_length, \ 265 "Alignment length is %i, consensus length is %i, '%s'" \ 266 % (alignment_length, len(consensus), consensus) 267 alignment._star_info = consensus 268 return alignment
269 270 if __name__ == "__main__": 271 print "Running a quick self-test" 272 273 #This is a truncated version of the example in Tests/cw02.aln 274 #Notice the inclusion of sequence numbers (right hand side) 275 aln_example1 = \ 276 """CLUSTAL W (1.81) multiple sequence alignment 277 278 279 gi|4959044|gb|AAD34209.1|AF069 MENSDSNDKGSDQSAAQRRSQMDRLDREEAFYQFVNNLSEEDYRLMRDNN 50 280 gi|671626|emb|CAA85685.1| ---------MSPQTETKASVGFKAGVKEYKLTYYTPEYETKDTDILAAFR 41 281 * *: :: :. :* : :. : . :* :: . 282 283 gi|4959044|gb|AAD34209.1|AF069 LLGTPGESTEEELLRRLQQIKEGPPPQSPDENRAGESSDDVTNSDSIIDW 100 284 gi|671626|emb|CAA85685.1| VTPQPG-----------------VPPEEAGAAVAAESSTGT--------- 65 285 : ** **:... *.*** .. 286 287 gi|4959044|gb|AAD34209.1|AF069 LNSVRQTGNTTRSRQRGNQSWRAVSRTNPNSGDFRFSLEINVNRNNGSQT 150 288 gi|671626|emb|CAA85685.1| WTTVWTDGLTSLDRYKG-----RCYHIEPVPG------------------ 92 289 .:* * *: .* :* : :* .* 290 291 gi|4959044|gb|AAD34209.1|AF069 SENESEPSTRRLSVENMESSSQRQMENSASESASARPSRAERNSTEAVTE 200 292 gi|671626|emb|CAA85685.1| -EKDQCICYVAYPLDLFEEGSVTNMFTSIVGNVFGFKALRALRLEDLRIP 141 293 *::. . .:: :*..* :* .* .. . : . : 294 295 gi|4959044|gb|AAD34209.1|AF069 VPTTRAQRRA 210 296 gi|671626|emb|CAA85685.1| VAYVKTFQGP 151 297 *. .:: : . 298 299 """ 300 301 #This example is a truncated version of the dataset used here: 302 #http://virgil.ruc.dk/kurser/Sekvens/Treedraw.htm 303 #with the last record repeated twice (deliberate toture test) 304 aln_example2 = \ 305 """CLUSTAL X (1.83) multiple sequence alignment 306 307 308 V_Harveyi_PATH --MKNWIKVAVAAIA--LSAA------------------TVQAATEVKVG 309 B_subtilis_YXEM MKMKKWTVLVVAALLAVLSACG------------NGNSSSKEDDNVLHVG 310 B_subtilis_GlnH_homo_YCKK MKKALLALFMVVSIAALAACGAGNDNQSKDNAKDGDLWASIKKKGVLTVG 311 YA80_HAEIN MKKLLFTTALLTGAIAFSTF-----------SHAGEIADRVEKTKTLLVG 312 FLIY_ECOLI MKLAHLGRQALMGVMAVALVAG---MSVKSFADEG-LLNKVKERGTLLVG 313 E_coli_GlnH --MKSVLKVSLAALTLAFAVS------------------SHAADKKLVVA 314 Deinococcus_radiodurans -MKKSLLSLKLSGLLVPSVLALS--------LSACSSPSSTLNQGTLKIA 315 HISJ_E_COLI MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG 316 HISJ_E_COLI MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG 317 : . : :. 318 319 V_Harveyi_PATH MSGRYFPFTFVKQ--DKLQGFEVDMWDEIGKRNDYKIEYVTANFSGLFGL 320 B_subtilis_YXEM ATGQSYPFAYKEN--GKLTGFDVEVMEAVAKKIDMKLDWKLLEFSGLMGE 321 B_subtilis_GlnH_homo_YCKK TEGTYEPFTYHDKDTDKLTGYDVEVITEVAKRLGLKVDFKETQWGSMFAG 322 YA80_HAEIN TEGTYAPFTFHDK-SGKLTGFDVEVIRKVAEKLGLKVEFKETQWDAMYAG 323 FLIY_ECOLI LEGTYPPFSFQGD-DGKLTGFEVEFAQQLAKHLGVEASLKPTKWDGMLAS 324 E_coli_GlnH TDTAFVPFEFKQG--DKYVGFDVDLWAAIAKELKLDYELKPMDFSGIIPA 325 Deinococcus_radiodurans MEGTYPPFTSKNE-QGELVGFDVDIAKAVAQKLNLKPEFVLTEWSGILAG 326 HISJ_E_COLI TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS 327 HISJ_E_COLI TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS 328 ** .: *::::. : :. . ..: 329 330 V_Harveyi_PATH LETGRIDTISNQITMTDARKAKYLFADPYVVDG-AQI 331 B_subtilis_YXEM LQTGKLDTISNQVAVTDERKETYNFTKPYAYAG-TQI 332 B_subtilis_GlnH_homo_YCKK LNSKRFDVVANQVG-KTDREDKYDFSDKYTTSR-AVV 333 YA80_HAEIN LNAKRFDVIANQTNPSPERLKKYSFTTPYNYSG-GVI 334 FLIY_ECOLI LDSKRIDVVINQVTISDERKKKYDFSTPYTISGIQAL 335 E_coli_GlnH LQTKNVDLALAGITITDERKKAIDFSDGYYKSG-LLV 336 Deinococcus_radiodurans LQANKYDVIVNQVGITPERQNSIGFSQPYAYSRPEII 337 HISJ_E_COLI LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV 338 HISJ_E_COLI LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV 339 *.: . * . * *: : 340 341 """ 342 343 from StringIO import StringIO 344 345 alignments = list(ClustalIterator(StringIO(aln_example1))) 346 assert 1 == len(alignments) 347 assert alignments[0]._version == "1.81" 348 alignment = alignments[0] 349 assert 2 == len(alignment) 350 assert alignment[0].id == "gi|4959044|gb|AAD34209.1|AF069" 351 assert alignment[1].id == "gi|671626|emb|CAA85685.1|" 352 assert alignment[0].seq.tostring() == \ 353 "MENSDSNDKGSDQSAAQRRSQMDRLDREEAFYQFVNNLSEEDYRLMRDNN" + \ 354 "LLGTPGESTEEELLRRLQQIKEGPPPQSPDENRAGESSDDVTNSDSIIDW" + \ 355 "LNSVRQTGNTTRSRQRGNQSWRAVSRTNPNSGDFRFSLEINVNRNNGSQT" + \ 356 "SENESEPSTRRLSVENMESSSQRQMENSASESASARPSRAERNSTEAVTE" + \ 357 "VPTTRAQRRA" 358 359 alignments = list(ClustalIterator(StringIO(aln_example2))) 360 assert 1 == len(alignments) 361 assert alignments[0]._version == "1.83" 362 alignment = alignments[0] 363 assert 9 == len(alignment) 364 assert alignment[-1].id == "HISJ_E_COLI" 365 assert alignment[-1].seq.tostring() == \ 366 "MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG" + \ 367 "TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS" + \ 368 "LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV" 369 370 for alignment in ClustalIterator(StringIO(aln_example2 + aln_example1)): 371 print "Alignment with %i records of length %i" \ 372 % (len(alignment), 373 alignment.get_alignment_length()) 374 375 print "Checking empty file..." 376 assert 0 == len(list(ClustalIterator(StringIO("")))) 377 378 print "Checking write/read..." 379 alignments = list(ClustalIterator(StringIO(aln_example1))) \ 380 + list(ClustalIterator(StringIO(aln_example2)))*2 381 handle = StringIO() 382 ClustalWriter(handle).write_file(alignments) 383 handle.seek(0) 384 for i,a in enumerate(ClustalIterator(handle)): 385 assert a.get_alignment_length() == alignments[i].get_alignment_length() 386 handle.seek(0) 387 388 print "Testing write/read when there is only one sequence..." 389 alignment = alignment[0:1] 390 handle = StringIO() 391 ClustalWriter(handle).write_file([alignment]) 392 handle.seek(0) 393 for i,a in enumerate(ClustalIterator(handle)): 394 assert a.get_alignment_length() == alignment.get_alignment_length() 395 assert len(a) == 1 396 397 aln_example3 = \ 398 """CLUSTAL 2.0.9 multiple sequence alignment 399 400 401 Test1seq ------------------------------------------------------------ 402 AT3G20900.1-SEQ ATGAACAAAGTAGCGAGGAAGAACAAAACATCAGGTGAACAAAAAAAAAACTCAATCCAC 403 AT3G20900.1-CDS ------------------------------------------------------------ 404 405 406 Test1seq -----AGTTACAATAACTGACGAAGCTAAGTAGGCTACTAATTAACGTCATCAACCTAAT 407 AT3G20900.1-SEQ ATCAAAGTTACAATAACTGACGAAGCTAAGTAGGCTAGAAATTAAAGTCATCAACCTAAT 408 AT3G20900.1-CDS ------------------------------------------------------------ 409 410 411 Test1seq ACATAGCACTTAGAAAAAAGTGAAGTAAGAAAATATAAAATAATAAAAGGGTGGGTTATC 412 AT3G20900.1-SEQ ACATAGCACTTAGAAAAAAGTGAAGCAAGAAAATATAAAATAATAAAAGGGTGGGTTATC 413 AT3G20900.1-CDS ------------------------------------------------------------ 414 415 416 Test1seq AATTGATAGTGTAAATCATCGTATTCCGGTGATATACCCTACCACAAAAACTCAAACCGA 417 AT3G20900.1-SEQ AATTGATAGTGTAAATCATAGTTGATTTTTGATATACCCTACCACAAAAACTCAAACCGA 418 AT3G20900.1-CDS ------------------------------------------------------------ 419 420 421 Test1seq CTTGATTCAAATCATCTCAATAAATTAGCGCCAAAATAATGAAAAAAATAATAACAAACA 422 AT3G20900.1-SEQ CTTGATTCAAATCATCTCAAAAAACAAGCGCCAAAATAATGAAAAAAATAATAACAAAAA 423 AT3G20900.1-CDS ------------------------------------------------------------ 424 425 426 Test1seq AAAACAAACCAAAATAAGAAAAAACATTACGCAAAACATAATAATTTACTCTTCGTTATT 427 AT3G20900.1-SEQ CAAACAAACCAAAATAAGAAAAAACATTACGCAAAACATAATAATTTACTCTTCGTTATT 428 AT3G20900.1-CDS ------------------------------------------------------------ 429 430 431 Test1seq GTATTAACAAATCAAAGAGCTGAATTTTGATCACCTGCTAATACTACTTTCTGTATTGAT 432 AT3G20900.1-SEQ GTATTAACAAATCAAAGAGATGAATTTTGATCACCTGCTAATACTACTTTCTGTATTGAT 433 AT3G20900.1-CDS ------------------------------------------------------------ 434 435 436 Test1seq CCTATATCAACGTAAACAAAGATACTAATAATTAACTAAAAGTACGTTCATCGATCGTGT 437 AT3G20900.1-SEQ CCTATATCAAAAAAAAAAAAGATACTAATAATTAACTAAAAGTACGTTCATCGATCGTGT 438 AT3G20900.1-CDS ------------------------------------------------------ATGAAC 439 * 440 441 Test1seq TCGTTGACGAAGAAGAGCTCTATCTCCGGCGGAGCAAAGAAAACGATCTGTCTCCGTCGT 442 AT3G20900.1-SEQ GCGTTGACGAAGAAGAGCTCTATCTCCGGCGGAGCAAAGAAAACGATCTGTCTCCGTCGT 443 AT3G20900.1-CDS AAAGTAGCGAGGAAGAACAAAACATC------AGCAAAGAAAACGATCTGTCTCCGTCGT 444 * *** ***** * * ** **************************** 445 446 Test1seq AACACACGGTCGCTAGAGAAACTTTGCTTCTTCGGCGCCGGTGGACACGTCAGCATCTCC 447 AT3G20900.1-SEQ AACACACAGTTTTTCGAGACCCTTTGCTTCTTCGGCGCCGGTGGACACGTCAGCATCTCC 448 AT3G20900.1-CDS AACACACAGTTTTTCGAGACCCTTTGCTTCTTCGGCGCCGGTGGACACGTCAGCATCTCC 449 ******* ** * **** *************************************** 450 451 Test1seq GGTATCCTAGACTTCTTGGCTTTCGGGGTACAACAACCGCGTGGTGACGTCAGCACCGCT 452 AT3G20900.1-SEQ GGTATCCTAGACTTCTTGGCTTTCGGGGTACAACAACCGCCTGGTGACGTCAGCACCGCT 453 AT3G20900.1-CDS GGTATCCTAGACTTCTTGGCTTTCGGGGTACAACAACCGCCTGGTGACGTCAGCACCGCT 454 **************************************** ******************* 455 456 Test1seq GCTGGGGATGGAGAGGGAACAGAGTT- 457 AT3G20900.1-SEQ GCTGGGGATGGAGAGGGAACAGAGTAG 458 AT3G20900.1-CDS GCTGGGGATGGAGAGGGAACAGAGTAG 459 ************************* 460 """ 461 alignments = list(ClustalIterator(StringIO(aln_example3))) 462 assert 1 == len(alignments) 463 assert alignments[0]._version == "2.0.9" 464 465 print "The End" 466