1
2
3
4
5
6
7
8
9 """Sequence input/output as SeqRecord objects.
10
11 Bio.SeqIO is also documented at U{http://biopython.org/wiki/SeqIO} and by
12 a whole chapter in our tutorial:
13 - U{http://biopython.org/DIST/docs/tutorial/Tutorial.html}
14 - U{http://biopython.org/DIST/docs/tutorial/Tutorial.pdf}
15
16 Input
17 =====
18 The main function is Bio.SeqIO.parse(...) which takes an input file handle
19 (or in recent versions of Biopython alternatively a filename as a string),
20 and format string. This returns an iterator giving SeqRecord objects:
21
22 >>> from Bio import SeqIO
23 >>> for record in SeqIO.parse("Fasta/f002", "fasta"):
24 ... print record.id, len(record)
25 gi|1348912|gb|G26680|G26680 633
26 gi|1348917|gb|G26685|G26685 413
27 gi|1592936|gb|G29385|G29385 471
28
29 Note that the parse() function will invoke the relevant parser for the
30 format with its default settings. You may want more control, in which case
31 you need to create a format specific sequence iterator directly.
32
33 Input - Single Records
34 ======================
35 If you expect your file to contain one-and-only-one record, then we provide
36 the following 'helper' function which will return a single SeqRecord, or
37 raise an exception if there are no records or more than one record:
38
39 >>> from Bio import SeqIO
40 >>> record = SeqIO.read("Fasta/f001", "fasta")
41 >>> print record.id, len(record)
42 gi|3318709|pdb|1A91| 79
43
44 This style is useful when you expect a single record only (and would
45 consider multiple records an error). For example, when dealing with GenBank
46 files for bacterial genomes or chromosomes, there is normally only a single
47 record. Alternatively, use this with a handle when downloading a single
48 record from the internet.
49
50 However, if you just want the first record from a file containing multiple
51 record, use the iterator's next() method:
52
53 >>> from Bio import SeqIO
54 >>> record = SeqIO.parse("Fasta/f002", "fasta").next()
55 >>> print record.id, len(record)
56 gi|1348912|gb|G26680|G26680 633
57
58 The above code will work as long as the file contains at least one record.
59 Note that if there is more than one record, the remaining records will be
60 silently ignored.
61
62
63 Input - Multiple Records
64 ========================
65 For non-interlaced files (e.g. Fasta, GenBank, EMBL) with multiple records
66 using a sequence iterator can save you a lot of memory (RAM). There is
67 less benefit for interlaced file formats (e.g. most multiple alignment file
68 formats). However, an iterator only lets you access the records one by one.
69
70 If you want random access to the records by number, turn this into a list:
71
72 >>> from Bio import SeqIO
73 >>> records = list(SeqIO.parse("Fasta/f002", "fasta"))
74 >>> len(records)
75 3
76 >>> print records[1].id
77 gi|1348917|gb|G26685|G26685
78
79 If you want random access to the records by a key such as the record id,
80 turn the iterator into a dictionary:
81
82 >>> from Bio import SeqIO
83 >>> record_dict = SeqIO.to_dict(SeqIO.parse("Fasta/f002", "fasta"))
84 >>> len(record_dict)
85 3
86 >>> print len(record_dict["gi|1348917|gb|G26685|G26685"])
87 413
88
89 However, using list() or the to_dict() function will load all the records
90 into memory at once, and therefore is not possible on very large files.
91 Instead, for *some* file formats Bio.SeqIO provides an indexing approach
92 providing dictionary like access to any record. For example,
93
94 >>> from Bio import SeqIO
95 >>> record_dict = SeqIO.index("Fasta/f002", "fasta")
96 >>> len(record_dict)
97 3
98 >>> print len(record_dict["gi|1348917|gb|G26685|G26685"])
99 413
100
101 Many but not all of the supported input file formats can be indexed like
102 this. For example "fasta", "fastq", "qual" and even the binary format "sff"
103 work, but alignment formats like "phylip", "clustalw" and "nexus" will not.
104
105 In most cases you can also use SeqIO.index to get the record from the file
106 as a raw string (not a SeqRecord). This can be useful for example to extract
107 a sub-set of records from a file where SeqIO cannot output the file format
108 (e.g. the plain text SwissProt format, "swiss") or where it is important to
109 keep the output 100% identical to the input). For example,
110
111 >>> from Bio import SeqIO
112 >>> record_dict = SeqIO.index("Fasta/f002", "fasta")
113 >>> len(record_dict)
114 3
115 >>> print record_dict.get_raw("gi|1348917|gb|G26685|G26685")
116 >gi|1348917|gb|G26685|G26685 human STS STS_D11734.
117 CGGAGCCAGCGAGCATATGCTGCATGAGGACCTTTCTATCTTACATTATGGCTGGGAATCTTACTCTTTC
118 ATCTGATACCTTGTTCAGATTTCAAAATAGTTGTAGCCTTATCCTGGTTTTACAGATGTGAAACTTTCAA
119 GAGATTTACTGACTTTCCTAGAATAGTTTCTCTACTGGAAACCTGATGCTTTTATAAGCCATTGTGATTA
120 GGATGACTGTTACAGGCTTAGCTTTGTGTGAAANCCAGTCACCTTTCTCCTAGGTAATGAGTAGTGCTGT
121 TCATATTACTNTAAGTTCTATAGCATACTTGCNATCCTTTANCCATGCTTATCATANGTACCATTTGAGG
122 AATTGNTTTGCCCTTTTGGGTTTNTTNTTGGTAAANNNTTCCCGGGTGGGGGNGGTNNNGAAA
123 <BLANKLINE>
124 >>> print record_dict["gi|1348917|gb|G26685|G26685"].format("fasta")
125 >gi|1348917|gb|G26685|G26685 human STS STS_D11734.
126 CGGAGCCAGCGAGCATATGCTGCATGAGGACCTTTCTATCTTACATTATGGCTGGGAATC
127 TTACTCTTTCATCTGATACCTTGTTCAGATTTCAAAATAGTTGTAGCCTTATCCTGGTTT
128 TACAGATGTGAAACTTTCAAGAGATTTACTGACTTTCCTAGAATAGTTTCTCTACTGGAA
129 ACCTGATGCTTTTATAAGCCATTGTGATTAGGATGACTGTTACAGGCTTAGCTTTGTGTG
130 AAANCCAGTCACCTTTCTCCTAGGTAATGAGTAGTGCTGTTCATATTACTNTAAGTTCTA
131 TAGCATACTTGCNATCCTTTANCCATGCTTATCATANGTACCATTTGAGGAATTGNTTTG
132 CCCTTTTGGGTTTNTTNTTGGTAAANNNTTCCCGGGTGGGGGNGGTNNNGAAA
133 <BLANKLINE>
134
135 Here the original file and what Biopython would output differ in the line
136 wrapping.
137
138 Input - Alignments
139 ==================
140 You can read in alignment files as alignment objects using Bio.AlignIO.
141 Alternatively, reading in an alignment file format via Bio.SeqIO will give
142 you a SeqRecord for each row of each alignment:
143
144 >>> from Bio import SeqIO
145 >>> for record in SeqIO.parse("Clustalw/hedgehog.aln", "clustal"):
146 ... print record.id, len(record)
147 gi|167877390|gb|EDS40773.1| 447
148 gi|167234445|ref|NP_001107837. 447
149 gi|74100009|gb|AAZ99217.1| 447
150 gi|13990994|dbj|BAA33523.2| 447
151 gi|56122354|gb|AAV74328.1| 447
152
153 Output
154 ======
155 Use the function Bio.SeqIO.write(...), which takes a complete set of
156 SeqRecord objects (either as a list, or an iterator), an output file handle
157 (or in recent versions of Biopython an output filename as a string) and of
158 course the file format::
159
160 from Bio import SeqIO
161 records = ...
162 SeqIO.write(records, "example.faa", "fasta")
163
164 Or, using a handle::
165
166 from Bio import SeqIO
167 records = ...
168 handle = open("example.faa", "w")
169 SeqIO.write(records, handle, "fasta")
170 handle.close()
171
172 You are expected to call this function once (with all your records) and if
173 using a handle, make sure you close it to flush the data to the hard disk.
174
175 Output - Advanced
176 =================
177 The effect of calling write() multiple times on a single file will vary
178 depending on the file format, and is best avoided unless you have a strong
179 reason to do so.
180
181 If you give a filename, then each time you call write() the existing file
182 will be overwriten. For sequential files formats (e.g. fasta, genbank) each
183 "record block" holds a single sequence. For these files it would probably
184 be safe to call write() multiple times by re-using the same handle.
185
186
187 However, trying this for certain alignment formats (e.g. phylip, clustal,
188 stockholm) would have the effect of concatenating several multiple sequence
189 alignments together. Such files are created by the PHYLIP suite of programs
190 for bootstrap analysis, but it is clearer to do this via Bio.AlignIO instead.
191
192
193 Conversion
194 ==========
195 The Bio.SeqIO.convert(...) function allows an easy interface for simple
196 file format conversions. Additionally, it may use file format specific
197 optimisations so this should be the fastest way too.
198
199 In general however, you can combine the Bio.SeqIO.parse(...) function with
200 the Bio.SeqIO.write(...) function for sequence file conversion. Using
201 generator expressions or generator functions provides a memory efficient way
202 to perform filtering or other extra operations as part of the process.
203
204 File Formats
205 ============
206 When specifying the file format, use lowercase strings. The same format
207 names are also used in Bio.AlignIO and include the following:
208
209 - ace - Reads the contig sequences from an ACE assembly file.
210 - embl - The EMBL flat file format. Uses Bio.GenBank internally.
211 - fasta - The generic sequence file format where each record starts with
212 an identifer line starting with a ">" character, followed by
213 lines of sequence.
214 - fastq - A "FASTA like" format used by Sanger which also stores PHRED
215 sequence quality values (with an ASCII offset of 33).
216 - fastq-sanger - An alias for "fastq" for consistency with BioPerl and EMBOSS
217 - fastq-solexa - Original Solexa/Illumnia variant of the FASTQ format which
218 encodes Solexa quality scores (not PHRED quality scores) with an
219 ASCII offset of 64.
220 - fastq-illumina - Solexa/Illumnia 1.3+ variant of the FASTQ format which
221 encodes PHRED quality scores with an ASCII offset of 64 (not 33).
222 - genbank - The GenBank or GenPept flat file format.
223 - gb - An alias for "genbank", for consistency with NCBI Entrez Utilities
224 - ig - The IntelliGenetics file format, apparently the same as the
225 MASE alignment format.
226 - phd - Output from PHRED, used by PHRAP and CONSED for input.
227 - pir - A "FASTA like" format introduced by the National Biomedical
228 Research Foundation (NBRF) for the Protein Information Resource
229 (PIR) database, now part of UniProt.
230 - sff - Standard Flowgram Format (SFF), typical output from Roche 454.
231 - sff-trim - Standard Flowgram Format (SFF) with given trimming applied.
232 - swiss - Plain text Swiss-Prot aka UniProt format.
233 - tab - Simple two column tab separated sequence files, where each
234 line holds a record's identifier and sequence. For example,
235 this is used as by Aligent's eArray software when saving
236 microarray probes in a minimal tab delimited text file.
237 - qual - A "FASTA like" format holding PHRED quality values from
238 sequencing DNA, but no actual sequences (usually provided
239 in separate FASTA files).
240
241 Note that while Bio.SeqIO can read all the above file formats, it cannot
242 write to all of them.
243
244 You can also use any file format supported by Bio.AlignIO, such as "nexus",
245 "phlip" and "stockholm", which gives you access to the individual sequences
246 making up each alignment as SeqRecords.
247 """
248 __docformat__ = "epytext en"
249
250
251
252
253
254
255
256
257
258
259
260
261 """
262 FAO BioPython Developers
263 ========================
264 The way I envision this SeqIO system working as that for any sequence file
265 format we have an iterator that returns SeqRecord objects.
266
267 This also applies to interlaced fileformats (like clustal - although that
268 is now handled via Bio.AlignIO instead) where the file cannot be read record
269 by record. You should still return an iterator, even if the implementation
270 could just as easily return a list.
271
272 These file format specific sequence iterators may be implemented as:
273 * Classes which take a handle for __init__ and provide the __iter__ method
274 * Functions that take a handle, and return an iterator object
275 * Generator functions that take a handle, and yield SeqRecord objects
276
277 It is then trivial to turn this iterator into a list of SeqRecord objects,
278 an in memory dictionary, or a multiple sequence alignment object.
279
280 For building the dictionary by default the id propery of each SeqRecord is
281 used as the key. You should always populate the id property, and it should
282 be unique in most cases. For some file formats the accession number is a good
283 choice. If the file itself contains ambiguous identifiers, don't try and
284 dis-ambiguate them - return them as is.
285
286 When adding a new file format, please use the same lower case format name
287 as BioPerl, or if they have not defined one, try the names used by EMBOSS.
288
289 See also http://biopython.org/wiki/SeqIO_dev
290
291 --Peter
292 """
293
294 from Bio.Seq import Seq
295 from Bio.SeqRecord import SeqRecord
296 from Bio.Align import MultipleSeqAlignment
297 from Bio.Align.Generic import Alignment
298 from Bio.Alphabet import Alphabet, AlphabetEncoder, _get_base_alphabet
299
300 import AceIO
301 import FastaIO
302 import IgIO
303 import InsdcIO
304 import PhdIO
305 import PirIO
306 import SffIO
307 import SwissIO
308 import TabIO
309 import QualityIO
310
311
312
313
314
315
316
317
318
319
320
321 _FormatToIterator = {"fasta" : FastaIO.FastaIterator,
322 "gb" : InsdcIO.GenBankIterator,
323 "genbank" : InsdcIO.GenBankIterator,
324 "genbank-cds" : InsdcIO.GenBankCdsFeatureIterator,
325 "embl" : InsdcIO.EmblIterator,
326 "embl-cds" : InsdcIO.EmblCdsFeatureIterator,
327 "ig" : IgIO.IgIterator,
328 "swiss" : SwissIO.SwissIterator,
329 "phd" : PhdIO.PhdIterator,
330 "ace" : AceIO.AceIterator,
331 "tab" : TabIO.TabIterator,
332 "pir" : PirIO.PirIterator,
333 "fastq" : QualityIO.FastqPhredIterator,
334 "fastq-sanger" : QualityIO.FastqPhredIterator,
335 "fastq-solexa" : QualityIO.FastqSolexaIterator,
336 "fastq-illumina" : QualityIO.FastqIlluminaIterator,
337 "qual" : QualityIO.QualPhredIterator,
338 "sff": SffIO.SffIterator,
339
340 "sff-trim": SffIO._SffTrimIterator,
341 }
342
343 _FormatToWriter = {"fasta" : FastaIO.FastaWriter,
344 "gb" : InsdcIO.GenBankWriter,
345 "genbank" : InsdcIO.GenBankWriter,
346 "embl" : InsdcIO.EmblWriter,
347 "tab" : TabIO.TabWriter,
348 "fastq" : QualityIO.FastqPhredWriter,
349 "fastq-sanger" : QualityIO.FastqPhredWriter,
350 "fastq-solexa" : QualityIO.FastqSolexaWriter,
351 "fastq-illumina" : QualityIO.FastqIlluminaWriter,
352 "phd" : PhdIO.PhdWriter,
353 "qual" : QualityIO.QualPhredWriter,
354 "sff" : SffIO.SffWriter,
355 }
356
357 _BinaryFormats = ["sff", "sff-trim"]
358
359 -def write(sequences, handle, format):
360 """Write complete set of sequences to a file.
361
362 - sequences - A list (or iterator) of SeqRecord objects, or (if using
363 Biopython 1.54 or later) a single SeqRecord.
364 - handle - File handle object to write to, or filename as string
365 (note older versions of Biopython only took a handle).
366 - format - lower case string describing the file format to write.
367
368 You should close the handle after calling this function.
369
370 Returns the number of records written (as an integer).
371 """
372 from Bio import AlignIO
373
374
375 if not isinstance(format, basestring):
376 raise TypeError("Need a string for the file format (lower case)")
377 if not format:
378 raise ValueError("Format required (lower case string)")
379 if format != format.lower():
380 raise ValueError("Format string '%s' should be lower case" % format)
381
382 if isinstance(sequences, SeqRecord):
383
384 sequences = [sequences]
385
386 if isinstance(handle, basestring):
387 if format in _BinaryFormats :
388 handle = open(handle, "wb")
389 else :
390 handle = open(handle, "w")
391 handle_close = True
392 else:
393 handle_close = False
394
395
396 if format in _FormatToWriter:
397 writer_class = _FormatToWriter[format]
398 count = writer_class(handle).write_file(sequences)
399 elif format in AlignIO._FormatToWriter:
400
401
402 alignment = MultipleSeqAlignment(sequences)
403 alignment_count = AlignIO.write([alignment], handle, format)
404 assert alignment_count == 1, "Internal error - the underlying writer " \
405 + " should have returned 1, not %s" % repr(alignment_count)
406 count = len(alignment)
407 del alignment_count, alignment
408 elif format in _FormatToIterator or format in AlignIO._FormatToIterator:
409 raise ValueError("Reading format '%s' is supported, but not writing" \
410 % format)
411 else:
412 raise ValueError("Unknown format '%s'" % format)
413
414 assert isinstance(count, int), "Internal error - the underlying %s " \
415 "writer should have returned the record count, not %s" \
416 % (format, repr(count))
417
418 if handle_close:
419 handle.close()
420
421 return count
422
423 -def parse(handle, format, alphabet=None):
424 r"""Turns a sequence file into an iterator returning SeqRecords.
425
426 - handle - handle to the file, or the filename as a string
427 (note older verions of Biopython only took a handle).
428 - format - lower case string describing the file format.
429 - alphabet - optional Alphabet object, useful when the sequence type
430 cannot be automatically inferred from the file itself
431 (e.g. format="fasta" or "tab")
432
433 Typical usage, opening a file to read in, and looping over the record(s):
434
435 >>> from Bio import SeqIO
436 >>> filename = "Fasta/sweetpea.nu"
437 >>> for record in SeqIO.parse(filename, "fasta"):
438 ... print "ID", record.id
439 ... print "Sequence length", len(record)
440 ... print "Sequence alphabet", record.seq.alphabet
441 ID gi|3176602|gb|U78617.1|LOU78617
442 Sequence length 309
443 Sequence alphabet SingleLetterAlphabet()
444
445 For file formats like FASTA where the alphabet cannot be determined, it
446 may be useful to specify the alphabet explicitly:
447
448 >>> from Bio import SeqIO
449 >>> from Bio.Alphabet import generic_dna
450 >>> filename = "Fasta/sweetpea.nu"
451 >>> for record in SeqIO.parse(filename, "fasta", generic_dna):
452 ... print "ID", record.id
453 ... print "Sequence length", len(record)
454 ... print "Sequence alphabet", record.seq.alphabet
455 ID gi|3176602|gb|U78617.1|LOU78617
456 Sequence length 309
457 Sequence alphabet DNAAlphabet()
458
459 If you have a string 'data' containing the file contents, you must
460 first turn this into a handle in order to parse it:
461
462 >>> data = ">Alpha\nACCGGATGTA\n>Beta\nAGGCTCGGTTA\n"
463 >>> from Bio import SeqIO
464 >>> from StringIO import StringIO
465 >>> for record in SeqIO.parse(StringIO(data), "fasta"):
466 ... print record.id, record.seq
467 Alpha ACCGGATGTA
468 Beta AGGCTCGGTTA
469
470 Use the Bio.SeqIO.read(...) function when you expect a single record
471 only.
472 """
473
474
475
476 from Bio import AlignIO
477
478 if isinstance(handle, basestring):
479
480 if format in _BinaryFormats :
481 handle = open(handle, "rb")
482 else :
483 handle = open(handle, "rU")
484
485
486
487 if not isinstance(format, basestring):
488 raise TypeError("Need a string for the file format (lower case)")
489 if not format:
490 raise ValueError("Format required (lower case string)")
491 if format != format.lower():
492 raise ValueError("Format string '%s' should be lower case" % format)
493 if alphabet is not None and not (isinstance(alphabet, Alphabet) or \
494 isinstance(alphabet, AlphabetEncoder)):
495 raise ValueError("Invalid alphabet, %s" % repr(alphabet))
496
497
498 if format in _FormatToIterator:
499 iterator_generator = _FormatToIterator[format]
500 if alphabet is None:
501 return iterator_generator(handle)
502 try:
503 return iterator_generator(handle, alphabet=alphabet)
504 except:
505 return _force_alphabet(iterator_generator(handle), alphabet)
506 elif format in AlignIO._FormatToIterator:
507
508
509
510 return _iterate_via_AlignIO(handle, format, alphabet)
511 else:
512 raise ValueError("Unknown format '%s'" % format)
513
514
521
535
536 -def read(handle, format, alphabet=None):
537 """Turns a sequence file into a single SeqRecord.
538
539 - handle - handle to the file, or the filename as a string
540 (note older verions of Biopython only took a handle).
541 - format - string describing the file format.
542 - alphabet - optional Alphabet object, useful when the sequence type
543 cannot be automatically inferred from the file itself
544 (e.g. format="fasta" or "tab")
545
546 This function is for use parsing sequence files containing
547 exactly one record. For example, reading a GenBank file:
548
549 >>> from Bio import SeqIO
550 >>> record = SeqIO.read("GenBank/arab1.gb", "genbank")
551 >>> print "ID", record.id
552 ID AC007323.5
553 >>> print "Sequence length", len(record)
554 Sequence length 86436
555 >>> print "Sequence alphabet", record.seq.alphabet
556 Sequence alphabet IUPACAmbiguousDNA()
557
558 If the handle contains no records, or more than one record,
559 an exception is raised. For example:
560
561 >>> from Bio import SeqIO
562 >>> record = SeqIO.read("GenBank/cor6_6.gb", "genbank")
563 Traceback (most recent call last):
564 ...
565 ValueError: More than one record found in handle
566
567 If however you want the first record from a file containing
568 multiple records this function would raise an exception (as
569 shown in the example above). Instead use:
570
571 >>> from Bio import SeqIO
572 >>> record = SeqIO.parse("GenBank/cor6_6.gb", "genbank").next()
573 >>> print "First record's ID", record.id
574 First record's ID X55053.1
575
576 Use the Bio.SeqIO.parse(handle, format) function if you want
577 to read multiple records from the handle.
578 """
579 iterator = parse(handle, format, alphabet)
580 try:
581 first = iterator.next()
582 except StopIteration:
583 first = None
584 if first is None:
585 raise ValueError("No records found in handle")
586 try:
587 second = iterator.next()
588 except StopIteration:
589 second = None
590 if second is not None:
591 raise ValueError("More than one record found in handle")
592 return first
593
594 -def to_dict(sequences, key_function=None):
595 """Turns a sequence iterator or list into a dictionary.
596
597 - sequences - An iterator that returns SeqRecord objects,
598 or simply a list of SeqRecord objects.
599 - key_function - Optional callback function which when given a
600 SeqRecord should return a unique key for the dictionary.
601
602 e.g. key_function = lambda rec : rec.name
603 or, key_function = lambda rec : rec.description.split()[0]
604
605 If key_function is ommitted then record.id is used, on the assumption
606 that the records objects returned are SeqRecords with a unique id.
607
608 If there are duplicate keys, an error is raised.
609
610 Example usage, defaulting to using the record.id as key:
611
612 >>> from Bio import SeqIO
613 >>> filename = "GenBank/cor6_6.gb"
614 >>> format = "genbank"
615 >>> id_dict = SeqIO.to_dict(SeqIO.parse(filename, format))
616 >>> print sorted(id_dict.keys())
617 ['AF297471.1', 'AJ237582.1', 'L31939.1', 'M81224.1', 'X55053.1', 'X62281.1']
618 >>> print id_dict["L31939.1"].description
619 Brassica rapa (clone bif72) kin mRNA, complete cds.
620
621 A more complex example, using the key_function argument in order to
622 use a sequence checksum as the dictionary key:
623
624 >>> from Bio import SeqIO
625 >>> from Bio.SeqUtils.CheckSum import seguid
626 >>> filename = "GenBank/cor6_6.gb"
627 >>> format = "genbank"
628 >>> seguid_dict = SeqIO.to_dict(SeqIO.parse(filename, format),
629 ... key_function = lambda rec : seguid(rec.seq))
630 >>> for key, record in sorted(seguid_dict.iteritems()):
631 ... print key, record.id
632 /wQvmrl87QWcm9llO4/efg23Vgg AJ237582.1
633 BUg6YxXSKWEcFFH0L08JzaLGhQs L31939.1
634 SabZaA4V2eLE9/2Fm5FnyYy07J4 X55053.1
635 TtWsXo45S3ZclIBy4X/WJc39+CY M81224.1
636 l7gjJFE6W/S1jJn5+1ASrUKW/FA X62281.1
637 uVEYeAQSV5EDQOnFoeMmVea+Oow AF297471.1
638
639 This approach is not suitable for very large sets of sequences, as all
640 the SeqRecord objects are held in memory. Instead, consider using the
641 Bio.SeqIO.index() function (if it supports your particular file format).
642 """
643 if key_function is None:
644 key_function = lambda rec : rec.id
645
646 d = dict()
647 for record in sequences:
648 key = key_function(record)
649 if key in d:
650 raise ValueError("Duplicate key '%s'" % key)
651 d[key] = record
652 return d
653
654 -def index(filename, format, alphabet=None, key_function=None):
655 """Indexes a sequence file and returns a dictionary like object.
656
657 - filename - string giving name of file to be indexed
658 - format - lower case string describing the file format
659 - alphabet - optional Alphabet object, useful when the sequence type
660 cannot be automatically inferred from the file itself
661 (e.g. format="fasta" or "tab")
662 - key_function - Optional callback function which when given a
663 SeqRecord identifier string should return a unique
664 key for the dictionary.
665
666 This indexing function will return a dictionary like object, giving the
667 SeqRecord objects as values:
668
669 >>> from Bio import SeqIO
670 >>> records = SeqIO.index("Quality/example.fastq", "fastq")
671 >>> len(records)
672 3
673 >>> sorted(records.keys())
674 ['EAS54_6_R1_2_1_413_324', 'EAS54_6_R1_2_1_443_348', 'EAS54_6_R1_2_1_540_792']
675 >>> print records["EAS54_6_R1_2_1_540_792"].format("fasta")
676 >EAS54_6_R1_2_1_540_792
677 TTGGCAGGCCAAGGCCGATGGATCA
678 <BLANKLINE>
679 >>> "EAS54_6_R1_2_1_540_792" in records
680 True
681 >>> print records.get("Missing", None)
682 None
683
684 Note that this psuedo dictionary will not support all the methods of a
685 true Python dictionary, for example values() is not defined since this
686 would require loading all of the records into memory at once.
687
688 When you call the index function, it will scan through the file, noting
689 the location of each record. When you access a particular record via the
690 dictionary methods, the code will jump to the appropriate part of the
691 file and then parse that section into a SeqRecord.
692
693 Note that not all the input formats supported by Bio.SeqIO can be used
694 with this index function. It is designed to work only with sequential
695 file formats (e.g. "fasta", "gb", "fastq") and is not suitable for any
696 interlaced file format (e.g. alignment formats such as "clustal").
697
698 For small files, it may be more efficient to use an in memory Python
699 dictionary, e.g.
700
701 >>> from Bio import SeqIO
702 >>> records = SeqIO.to_dict(SeqIO.parse(open("Quality/example.fastq"), "fastq"))
703 >>> len(records)
704 3
705 >>> sorted(records.keys())
706 ['EAS54_6_R1_2_1_413_324', 'EAS54_6_R1_2_1_443_348', 'EAS54_6_R1_2_1_540_792']
707 >>> print records["EAS54_6_R1_2_1_540_792"].format("fasta")
708 >EAS54_6_R1_2_1_540_792
709 TTGGCAGGCCAAGGCCGATGGATCA
710 <BLANKLINE>
711
712 As with the to_dict() function, by default the id string of each record
713 is used as the key. You can specify a callback function to transform
714 this (the record identifier string) into your prefered key. For example:
715
716 >>> from Bio import SeqIO
717 >>> def make_tuple(identifier):
718 ... parts = identifier.split("_")
719 ... return int(parts[-2]), int(parts[-1])
720 >>> records = SeqIO.index("Quality/example.fastq", "fastq",
721 ... key_function=make_tuple)
722 >>> len(records)
723 3
724 >>> sorted(records.keys())
725 [(413, 324), (443, 348), (540, 792)]
726 >>> print records[(540, 792)].format("fasta")
727 >EAS54_6_R1_2_1_540_792
728 TTGGCAGGCCAAGGCCGATGGATCA
729 <BLANKLINE>
730 >>> (540, 792) in records
731 True
732 >>> "EAS54_6_R1_2_1_540_792" in records
733 False
734 >>> print records.get("Missing", None)
735 None
736
737 Another common use case would be indexing an NCBI style FASTA file,
738 where you might want to extract the GI number from the FASTA identifer
739 to use as the dictionary key.
740
741 Notice that unlike the to_dict() function, here the key_function does
742 not get given the full SeqRecord to use to generate the key. Doing so
743 would impose a severe performance penalty as it would require the file
744 to be completely parsed while building the index. Right now this is
745 usually avoided.
746 """
747
748 if not isinstance(filename, basestring):
749 raise TypeError("Need a filename (not a handle)")
750 if not isinstance(format, basestring):
751 raise TypeError("Need a string for the file format (lower case)")
752 if not format:
753 raise ValueError("Format required (lower case string)")
754 if format != format.lower():
755 raise ValueError("Format string '%s' should be lower case" % format)
756 if alphabet is not None and not (isinstance(alphabet, Alphabet) or \
757 isinstance(alphabet, AlphabetEncoder)):
758 raise ValueError("Invalid alphabet, %s" % repr(alphabet))
759
760
761 import _index
762 try:
763 indexer = _index._FormatToIndexedDict[format]
764 except KeyError:
765 raise ValueError("Unsupported format '%s'" % format)
766 return indexer(filename, alphabet, key_function)
767
769 """Returns a multiple sequence alignment (DEPRECATED).
770
771 - sequences -An iterator that returns SeqRecord objects,
772 or simply a list of SeqRecord objects. All
773 the record sequences must be the same length.
774 - alphabet - Optional alphabet. Stongly recommended.
775 - strict - Dummy argument, used to enable strict error
776 checking of sequence lengths and alphabets.
777 This is now always done.
778
779 Using this function is now discouraged. You are now encouraged to use
780 Bio.AlignIO instead, e.g.
781
782 >>> from Bio import AlignIO
783 >>> filename = "Clustalw/protein.aln"
784 >>> alignment = AlignIO.read(filename, "clustal")
785 """
786 import warnings
787 warnings.warn("The Bio.SeqIO.to_alignment(...) function is deprecated. "
788 "Please use the Bio.Align.MultipleSeqAlignment(...) object "
789 "directly instead.", DeprecationWarning)
790 return MultipleSeqAlignment(sequences, alphabet)
791
792 -def convert(in_file, in_format, out_file, out_format, alphabet=None):
793 """Convert between two sequence file formats, return number of records.
794
795 - in_file - an input handle or filename
796 - in_format - input file format, lower case string
797 - out_file - an output handle or filename
798 - out_format - output file format, lower case string
799 - alphabet - optional alphabet to assume
800
801 NOTE - If you provide an output filename, it will be opened which will
802 overwrite any existing file without warning. This may happen if even
803 the conversion is aborted (e.g. an invalid out_format name is given).
804
805 For example, going from a filename to a handle:
806
807 >>> from Bio import SeqIO
808 >>> from StringIO import StringIO
809 >>> handle = StringIO("")
810 >>> SeqIO.convert("Quality/example.fastq", "fastq", handle, "fasta")
811 3
812 >>> print handle.getvalue()
813 >EAS54_6_R1_2_1_413_324
814 CCCTTCTTGTCTTCAGCGTTTCTCC
815 >EAS54_6_R1_2_1_540_792
816 TTGGCAGGCCAAGGCCGATGGATCA
817 >EAS54_6_R1_2_1_443_348
818 GTTGCTTCTGGCGTGGGTGGGGGGG
819 <BLANKLINE>
820 """
821 if isinstance(in_file, basestring):
822
823 if in_format in _BinaryFormats :
824 in_handle = open(in_file, "rb")
825 else :
826 in_handle = open(in_file, "rU")
827 in_close = True
828 else:
829 in_handle = in_file
830 in_close = False
831
832 if isinstance(out_file, basestring):
833 if out_format in ["sff", "sff_trim"] :
834 out_handle = open(out_file, "wb")
835 else :
836 out_handle = open(out_file, "w")
837 out_close = True
838 else:
839 out_handle = out_file
840 out_close = False
841
842
843 from _convert import _handle_convert
844 count = _handle_convert(in_handle, in_format,
845 out_handle, out_format,
846 alphabet)
847
848 if in_close:
849 in_handle.close()
850 if out_close:
851 out_handle.close()
852 return count
853
855 """Run the Bio.SeqIO module's doctests.
856
857 This will try and locate the unit tests directory, and run the doctests
858 from there in order that the relative paths used in the examples work.
859 """
860 import doctest
861 import os
862 if os.path.isdir(os.path.join("..", "..", "Tests")):
863 print "Runing doctests..."
864 cur_dir = os.path.abspath(os.curdir)
865 os.chdir(os.path.join("..", "..", "Tests"))
866 doctest.testmod()
867 os.chdir(cur_dir)
868 del cur_dir
869 print "Done"
870 elif os.path.isdir(os.path.join("Tests", "Fasta")):
871 print "Runing doctests..."
872 cur_dir = os.path.abspath(os.curdir)
873 os.chdir(os.path.join("Tests"))
874 doctest.testmod()
875 os.chdir(cur_dir)
876 del cur_dir
877 print "Done"
878
879 if __name__ == "__main__":
880
881 _test()
882