1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36 __doc__="Access the PDB over the internet (for example to download structures)."
37
import os
import re
import shlex
import shutil
import subprocess
import urllib

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2
40
"""
This class provides quick access to the structure lists on the
PDB server or its mirrors. The structure lists contain
four-letter PDB codes, indicating that structures are
new, have been modified or are obsolete. The lists are released
on a weekly basis.

It also provides a function to retrieve PDB files from the server.
To use it properly, prepare a directory such as /pdb
where the PDB files will be stored.

If you want to use this module from behind a proxy, add
the proxy variable to your environment, e.g. on Unix:
export HTTP_PROXY='http://realproxy.charite.de:888'
(This can also be added to ~/.bashrc.)
"""
58
# Literature reference for the Protein Data Bank, kept as a plain string.
PDB_REF="""
The Protein Data Bank: a computer-based archival file for macromolecular structures.
F.C.Bernstein, T.F.Koetzle, G.J.B.Williams, E.F.Meyer Jr, M.D.Brice, J.R.Rodgers, O.Kennard, T.Shimanouchi, M.Tasumi
J. Mol. Biol. 112 pp. 535-542 (1977)
http://www.pdb.org/.
"""

# Base URL of an alternative download location. No code visible in this
# part of the file reads it.
alternative_download_url = "http://www.rcsb.org/pdb/files/"
67
68
69
def __init__(self, server='ftp://ftp.wwpdb.org', pdb=None, obsolete_pdb=None):
    """Initialize the class with the default server or a custom one.

    @param server: base URL of the PDB server (or mirror) to download from
    @type server: string

    @param pdb: root of the local PDB file tree (default: the working
        directory at the time of the call)
    @type pdb: string

    @param obsolete_pdb: root of the local tree for obsolete entries
        (default: an 'obsolete' directory below pdb, created if missing)
    @type obsolete_pdb: string
    """
    # remote pdb server
    self.pdb_server = server

    # local pdb file tree; the default is resolved here rather than in the
    # signature, where os.getcwd() would be frozen at import time
    if pdb is None:
        pdb = os.getcwd()
    self.local_pdb = pdb

    # local file tree for obsolete pdb files; only the default location
    # is created automatically
    if obsolete_pdb:
        self.obsolete_pdb = obsolete_pdb
    else:
        self.obsolete_pdb = os.path.join(self.local_pdb, 'obsolete')
        if not os.path.exists(self.obsolete_pdb):
            os.makedirs(self.obsolete_pdb)

    # switches set by the command-line options (-o and -d)
    self.overwrite = 0
    self.flat_tree = 0
89
90
def get_status_list(self, url):
    """Retrieve the PDB codes listed in a weekly PDB status file.

    Used by get_recent_changes. The list files parsed by this method
    contain one PDB name per line; blank lines are ignored.

    @param url: location of the status file
    @type url: string

    @return: the codes, in file order
    @rtype: list of strings

    @raise AssertionError: if a non-blank line is not exactly four
        characters long
    """
    handle = urlopen(url)
    try:
        answer = []
        for line in handle:
            pdb = line.strip()
            # Python 3 delivers bytes; hand back text on both versions
            if not isinstance(pdb, str):
                pdb = pdb.decode('ascii')
            if not pdb:
                continue
            # explicit raise (not assert) so the check survives "python -O";
            # the exception type is kept for existing callers
            if len(pdb) != 4:
                raise AssertionError("unexpected status line %r in %s"
                                     % (pdb, url))
            answer.append(pdb)
    finally:
        # close the connection even when a line is rejected
        handle.close()
    return answer
106
107
def get_recent_changes(self):
    """Return three lists of the newest weekly files (added, mod, obsolete).

    Reads the listing of the data/status directory on the PDB server,
    picks the entry with the largest all-digit name and fetches the
    added.pdb, modified.pdb and obsolete.pdb lists found below it.

    Contents of the data/status dir (20031013 would be used);
    drwxrwxr-x   2 1002     sysadmin     512 Oct  6 18:28 20031006
    drwxrwxr-x   2 1002     sysadmin     512 Oct 14 02:14 20031013
    -rw-r--r--   1 1002     sysadmin    1327 Mar 12  2001 README

    @return: [added, modified, obsolete], each a list of PDB codes
    @rtype: list of three lists

    @raise IndexError: if the listing contains no all-digit entry
    """
    status_url = self.pdb_server + '/pub/pdb/data/status/'
    handle = urlopen(status_url)
    try:
        listing = handle.readlines()
    finally:
        handle.close()

    # the directory name is the last column of each listing line
    dated = []
    for line in listing:
        if not isinstance(line, str):
            line = line.decode('ascii', 'replace')
        columns = line.split()
        # blank lines have no columns at all
        if columns and columns[-1].isdigit():
            dated.append(columns[-1])
    if not dated:
        raise IndexError("no dated status directory listed at %s"
                         % status_url)

    # take the largest name explicitly instead of trusting listing order
    recent = max(dated, key=int)

    path = self.pdb_server + '/pub/pdb/data/status/%s/' % (recent)
    added = self.get_status_list(path + 'added.pdb')
    modified = self.get_status_list(path + 'modified.pdb')
    obsolete = self.get_status_list(path + 'obsolete.pdb')
    return [added, modified, obsolete]
136
137
138
def get_all_entries(self):
    """Retrieve the big index file of all PDB entries and their annotation.

    Downloads derived_data/index/entries.idx from the server, skips its
    first two lines and takes the first four characters of every
    remaining line that is longer than four characters.

    @return: the PDB codes found in the index file
    @rtype: list of strings
    """
    print("retrieving index file. Takes about 5 MB.")
    handle = urlopen(self.pdb_server +
                     '/pub/pdb/derived_data/index/entries.idx')
    try:
        lines = handle.readlines()
    finally:
        handle.close()

    entries = []
    for line in lines[2:]:
        if len(line) <= 4:
            continue
        code = line[:4]
        # Python 3 delivers bytes; hand back text on both versions
        if not isinstance(code, str):
            code = code.decode('ascii', 'replace')
        entries.append(code)
    return entries
152
153
154
def get_all_obsolete(self):
    """Return a list of all obsolete entries ever in the PDB.

    Gets and parses the file data/status/obsolete.dat from the PDB
    server. Only lines starting with "OBSLTE " are used, and of those
    the first pdb_code column (third field). The file looks like this:

     LIST OF OBSOLETE COORDINATE ENTRIES AND SUCCESSORS
    OBSLTE    31-JUL-94 116L     216L
    ...
    OBSLTE    21-NOV-03 1HG6
    OBSLTE    18-JUL-84 1HHB     2HHB 3HHB
    OBSLTE    14-JAN-04 1HKE     1UUZ
    ...

    @return: the obsolete PDB codes, in file order
    @rtype: list of strings

    @raise AssertionError: if the code column of an OBSLTE line is not
        exactly four characters long
    """
    handle = urlopen(self.pdb_server + '/pub/pdb/data/status/obsolete.dat')
    try:
        obsolete = []
        for line in handle:
            # Python 3 delivers bytes; work on text on both versions
            if not isinstance(line, str):
                line = line.decode('ascii', 'replace')
            if not line.startswith("OBSLTE "):
                continue
            pdb = line.split()[2]
            # explicit raise (not assert) so the check survives "python -O";
            # the exception type is kept for existing callers
            if len(pdb) != 4:
                raise AssertionError("unexpected obsolete entry %r" % pdb)
            obsolete.append(pdb)
    finally:
        # close the connection even when a line is rejected
        handle.close()
    return obsolete
189
190
191
def retrieve_pdb_file(self, pdb_code, obsolete=0, compression='.gz',
                      uncompress="gunzip", pdir=None):
    """Retrieve a PDB structure file from the PDB server and
    store it in a local file tree.

    The name of the local (uncompressed) file is returned. If that file
    already exists and self.overwrite is not set, nothing is downloaded.
    If obsolete is 1, the file will be by default saved in a special
    file tree. The compression should be '.Z' or '.gz'. 'uncompress' is
    the command called to uncompress the files; it may carry its own
    arguments and is run without a shell.

    @param pdb_code: four-character PDB code (lower-cased before use)
    @type pdb_code: string

    @param pdir: put the file in this directory (default: create a
        PDB-style directory tree)
    @type pdir: string

    @return: filename
    @rtype: string
    """
    code = pdb_code.lower()
    archive_name = "pdb%s.ent%s" % (code, compression)

    # remote location: obsolete and current entries live in separate trees
    if obsolete:
        remote_dir = '/pub/pdb/data/structures/obsolete/pdb/%s/'
    else:
        remote_dir = '/pub/pdb/data/structures/divided/pdb/%s/'
    url = self.pdb_server + remote_dir % code[1:3] + archive_name

    # local location: explicit directory, flat directory, or a tree keyed
    # on the two middle characters of the code
    if pdir is not None:
        path = pdir
    else:
        if obsolete:
            root = self.obsolete_pdb
        else:
            root = self.local_pdb
        if self.flat_tree:
            path = root
        else:
            path = os.path.join(root, code[1:3])

    if not os.access(path, os.F_OK):
        os.makedirs(path)

    filename = os.path.join(path, archive_name)
    # the unpacked file
    final_file = os.path.join(path, "pdb%s.ent" % code)

    # skip download if the file already exists
    if not self.overwrite and os.path.exists(final_file):
        print("file exists, not retrieved %s" % final_file)
        return final_file

    # retrieve the file, releasing both handles even on failure
    print('retrieving %s' % url)
    remote = urlopen(url)
    try:
        data = remote.read()
    finally:
        remote.close()
    local = open(filename, 'wb')
    try:
        local.write(data)
    finally:
        local.close()

    # uncompress the file; an argument list instead of a shell string keeps
    # paths with spaces or shell metacharacters intact
    command = shlex.split(uncompress, posix=(os.name == 'posix'))
    command.append(filename)
    try:
        subprocess.call(command)
    except OSError:
        # stay best-effort, as the shell-based call was, but say so
        print("could not run %s on %s" % (uncompress, filename))

    return final_file
257
258
def update_pdb(self):
    """Bring the local PDB copy up to date with the weekly status lists.

    I guess this is the 'most wanted' function from this module.
    It gets the weekly lists of new and modified pdb entries and
    automatically downloads the according PDB files. Entries listed as
    obsolete are moved from the local tree to the obsolete tree.
    You can call this module as a weekly cronjob.

    @raise AssertionError: if self.local_pdb or self.obsolete_pdb is
        not an existing directory
    """
    # explicit raise (not assert) so the checks survive "python -O";
    # the exception type is kept for existing callers
    for tree in (self.local_pdb, self.obsolete_pdb):
        if not os.path.isdir(tree):
            raise AssertionError("%s is not a directory" % tree)

    new, modified, obsolete = self.get_recent_changes()

    # best effort: one failed download must not stop the others
    for pdb_code in new + modified:
        try:
            self.retrieve_pdb_file(pdb_code)
        except Exception:
            print('error %s\n' % pdb_code)

    # move the obsolete files to a special folder
    for pdb_code in obsolete:
        # files are stored under the lower-cased code by retrieve_pdb_file
        code = pdb_code.lower()
        entry = 'pdb%s.ent' % code
        if self.flat_tree:
            old_file = os.path.join(self.local_pdb, entry)
            new_dir = self.obsolete_pdb
        else:
            old_file = os.path.join(self.local_pdb, code[1:3], entry)
            new_dir = os.path.join(self.obsolete_pdb, code[1:3])
        new_file = os.path.join(new_dir, entry)

        if os.path.isfile(old_file):
            try:
                # creating the target folder is part of the move attempt
                if not os.path.isdir(new_dir):
                    os.makedirs(new_dir)
                shutil.move(old_file, new_file)
            except Exception:
                print("Could not move %s to obsolete folder" % old_file)
        elif os.path.isfile(new_file):
            print("Obsolete file %s already moved" % old_file)
        else:
            print("Obsolete file %s is missing" % old_file)
302
303
def download_entire_pdb(self, listfile=None):
    """Retrieve all PDB entries not present in the local PDB copy.

    Every code returned by get_all_entries is passed to
    retrieve_pdb_file. Afterwards the codes are written to a list file,
    one per line (optional, if listfile is given).

    @param listfile: name of the list file to write (default: none)
    @type listfile: string
    """
    entries = self.get_all_entries()
    for pdb_code in entries:
        self.retrieve_pdb_file(pdb_code)

    # write the list
    if listfile:
        outfile = open(listfile, 'w')
        try:
            outfile.writelines([code + '\n' for code in entries])
        finally:
            # close explicitly so the list is flushed to disk
            outfile.close()
314
315
317
def download_obsolete_entries(self, listfile=None):
    """Retrieve all obsolete PDB entries not present in the local
    obsolete PDB copy.

    Every code returned by get_all_obsolete is passed to
    retrieve_pdb_file with obsolete=1. Afterwards the codes are written
    to a list file, one per line (optional, if listfile is given).

    @param listfile: name of the list file to write (default: none)
    @type listfile: string
    """
    entries = self.get_all_obsolete()
    for pdb_code in entries:
        self.retrieve_pdb_file(pdb_code, obsolete=1)

    # write the list
    if listfile:
        outfile = open(listfile, 'w')
        try:
            outfile.writelines([code + '\n' for code in entries])
        finally:
            # close explicitly so the list is flushed to disk
            outfile.close()
327
328
329
330
331
332
333
335 """Retrieves a (big) file containing all the sequences
336 of PDB entries and writes it to a file."""
337 print "retrieving sequence file. Takes about 15 MB."
338 url = urllib.urlopen(self.pdb_server+'/pub/pdb/derived_data/pdb_seqres.txt')
339 file = url.readlines()
340 open(savefile,'w').writelines(file)
341
342
343
if __name__ == '__main__':
    # Command-line front end: prints the usage text, then dispatches on
    # the first argument (update / all / obsol / a single PDB code).

    import sys

    doc = """PDBList.py
(c) Kristian Rother 2003, Contributed to BioPython

Usage:
PDBList.py update <pdb_path> [options] - write weekly PDB updates to
local pdb tree.
PDBList.py all <pdb_path> [options] - write all PDB entries to
local pdb tree.
PDBList.py obsol <pdb_path> [options] - write all obsolete PDB
entries to local pdb tree.
PDBList.py <PDB-ID> <pdb_path> [options] - retrieve single structure

Options:
-d A single directory will be used as <pdb_path>, not a tree.
-o Overwrite existing structure files.
"""
    print(doc)

    if len(sys.argv) > 2:
        pdb_path = sys.argv[2]
        pl = PDBList(pdb=pdb_path)
        # everything after the path is an option switch
        for option in sys.argv[3:]:
            if option == '-d':
                pl.flat_tree = 1
            elif option == '-o':
                pl.overwrite = 1
    else:
        # no path given: work in the current directory, without a tree
        pdb_path = os.getcwd()
        pl = PDBList()
        pl.flat_tree = 1

    if len(sys.argv) > 1:
        command = sys.argv[1]
        if command == 'update':
            # update PDB
            print("updating local PDB at " + pdb_path)
            pl.update_pdb()

        elif command == 'all':
            # get the entire PDB
            pl.download_entire_pdb()

        elif command == 'obsol':
            # get all obsolete entries; the argument of this method is a
            # list-file name, so the PDB directory must not be passed
            pl.download_obsolete_entries()

        elif re.search(r'^\d...$', command):
            # get single PDB entry (a digit followed by three characters)
            pl.retrieve_pdb_file(command, pdir=pdb_path)
396