source file: /disk/u/t/dev/python-svn/Lib/gzip.py
file stats: 325 lines, 246 executed: 75.7% covered
1. """Functions that read and write gzipped files.
2.
3. The user of the file doesn't have to worry about the compression,
4. but random access is not allowed."""
5.
6. # based on Andrew Kuchling's minigzip.py distributed with the zlib module
7.
8. import struct, sys, time
9. import zlib
10. import __builtin__
11.
12. __all__ = ["GzipFile","open"]
13.
# Bit masks tested against / written into the flag byte of a gzip member
# header (see _read_gzip_header and _write_gzip_header).
FTEXT = 1
FHCRC = 2
FEXTRA = 4
FNAME = 8
FCOMMENT = 16

# Values stored in GzipFile.mode to record whether the object was opened
# for reading or for writing.
READ = 1
WRITE = 2
17.
def U32(i):
    """Reinterpret a (possibly negative) 32-bit value as unsigned.

    Non-negative arguments are returned unchanged; a negative argument has
    2**32 added to it, which yields the value of the same bit pattern read
    as an unsigned 32-bit integer.  The argument is assumed to fit in 32
    bits.
    """
    if i >= 0:
        return i
    return i + (1 << 32)
26.
def LOWU32(i):
    """Return the low-order 32 bits of i as a non-negative integer."""
    low_32_bits = 0xFFFFFFFF
    return i & low_32_bits
30.
def write32(output, value):
    """Write value to output as a little-endian signed 32-bit integer.

    output is any object with a write() method; value must be acceptable
    to struct's "<l" format.
    """
    packed = struct.pack("<l", value)
    output.write(packed)
33.
def write32u(output, value):
    """Write value to output as a little-endian unsigned 32-bit integer.

    output is any object with a write() method.  value may be given either
    as an unsigned 32-bit number or as the signed number with the same bit
    pattern (for example a CRC that "looks negative"); both produce the
    same four bytes.  Values that do not fit in 32 bits are still rejected
    by struct.pack().
    """
    # struct's "L" format only accepts non-negative numbers, so fold a
    # negative (signed-looking) value into the unsigned range first.  This
    # is what makes the bit pattern come out right for either signedness.
    if value < 0:
        value += 1 << 32
    output.write(struct.pack("<L", value))
38.
def read32(input):
    """Read four bytes from input and decode them as a signed integer.

    The bytes are interpreted as a little-endian signed 32-bit value
    (struct format "<l").
    """
    raw = input.read(4)
    (value,) = struct.unpack("<l", raw)
    return value
41.
def open(filename, mode="rb", compresslevel=9):
    """Convenience wrapper: build a GzipFile for the named file.

    filename is required.  mode defaults to 'rb' and compresslevel
    defaults to 9; all three are handed straight to the GzipFile
    constructor, whose result is returned.

    """
    return GzipFile(filename=filename, mode=mode,
                    compresslevel=compresslevel)
50.
class GzipFile:
    """File-like object that reads or writes gzip-compressed data.

    The GzipFile class simulates most of the methods of a file object with
    the exception of the readinto() and truncate() methods.

    """

    # File object opened by the constructor itself; close() closes it.
    # Stays None when the caller supplied fileobj.
    myfileobj = None
    # Underlying file object.  None until the constructor has finished and
    # again after close().  Defined at class level so that close() and
    # __del__() are safe on an instance whose constructor raised.
    fileobj = None
    # Upper bound for the chunk size read() passes to _read().
    max_read_chunk = 10 * 1024 * 1024   # 10Mb

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, a StringIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
        depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        Be aware that only the 'rb', 'ab', and 'wb' values should be used
        for cross-platform portability.

        The compresslevel argument is an integer from 1 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression.  The default is 9.

        Raises IOError if the mode starts with anything other than 'r', 'w'
        or 'a'.

        """

        # guarantee the file is opened in binary mode on platforms
        # that care about that sort of thing
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
        if filename is None:
            if hasattr(fileobj, 'name'):
                filename = fileobj.name
            else:
                filename = ''
        if mode is None:
            if hasattr(fileobj, 'mode'):
                mode = fileobj.mode
            else:
                mode = 'rb'

        kind = mode[0:1]
        if kind == 'r':
            self.mode = READ
            # Set flag indicating start of a new member
            self._new_member = True
            # Decompressed data not yet handed to the caller, and its length
            self.extrabuf = ""
            self.extrasize = 0
            self.name = filename
            # Chunk size used by readline(); starts small, scales
            # exponentially
            self.min_readsize = 100

        elif kind == 'w' or kind == 'a':
            self.mode = WRITE
            self._init_write(filename)
            # Negative wbits: no zlib header/trailer around the deflate
            # data; this class writes the gzip header and trailer itself.
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
        else:
            raise IOError("Mode " + mode + " not supported")

        self.fileobj = fileobj
        # Position in the uncompressed stream, as reported by tell()
        self.offset = 0

        if self.mode == WRITE:
            self._write_gzip_header()

    @property
    def filename(self):
        """Deprecated; use the name attribute instead.

        Returns name, with ".gz" appended when the object is in write mode
        and name does not already end in ".gz".  Every access issues a
        DeprecationWarning.
        """
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning)
        if self.mode == WRITE and self.name[-3:] != ".gz":
            return self.name + ".gz"
        return self.name

    def __repr__(self):
        # Reuse the repr of the underlying file object, minus its outer
        # pair of delimiter characters.
        s = repr(self.fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _init_write(self, filename):
        """Reset the per-file state used while writing."""
        self.name = filename
        self.crc = zlib.crc32("")   # running CRC of the uncompressed data
        self.size = 0               # running count of uncompressed bytes
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        """Write the gzip member header to the underlying file object."""
        self.fileobj.write('\037\213')             # magic header
        self.fileobj.write('\010')                 # compression method
        # The name stored in the header has a trailing ".gz" stripped.
        fname = self.name
        if fname.endswith(".gz"):
            fname = fname[:-3]
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags))
        write32u(self.fileobj, long(time.time()))  # modification time
        self.fileobj.write('\002')                 # extra-flags byte
        self.fileobj.write('\377')                 # OS byte
        if fname:
            self.fileobj.write(fname + '\000')     # null-terminated name

    def _init_read(self):
        """Reset the CRC and size accumulated for the member being read."""
        self.crc = zlib.crc32("")
        self.size = 0

    def _read_gzip_header(self):
        """Consume one gzip member header from the underlying file object.

        Raises IOError if the magic number or the compression method is
        not the expected one.  Optional header fields are read and
        discarded.
        """
        magic = self.fileobj.read(2)
        if magic != '\037\213':
            raise IOError('Not a gzipped file')
        method = ord( self.fileobj.read(1) )
        if method != 8:
            raise IOError('Unknown compression method')
        flag = ord( self.fileobj.read(1) )
        # Skip the next six bytes in one go:
        # modtime = self.fileobj.read(4)
        # extraflag = self.fileobj.read(1)
        # os = self.fileobj.read(1)
        self.fileobj.read(6)

        if flag & FEXTRA:
            # Read & discard the extra field, if present; its length is a
            # little-endian 16-bit number.
            xlen = ord(self.fileobj.read(1))
            xlen = xlen + 256*ord(self.fileobj.read(1))
            self.fileobj.read(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self.fileobj.read(1)
                if not s or s=='\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s=='\000':
                    break
        if flag & FHCRC:
            self.fileobj.read(2)     # Read & discard the 16-bit header CRC

    def write(self,data):
        """Compress data and write it to the underlying file object.

        Raises IOError (EBADF) if the object was opened for reading and
        ValueError if it has been closed.  Empty data is ignored.
        """
        if self.mode != WRITE:
            import errno
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")
        if len(data) > 0:
            self.size = self.size + len(data)
            self.crc = zlib.crc32(data, self.crc)
            self.fileobj.write( self.compress.compress(data) )
            self.offset += len(data)

    def read(self, size=-1):
        """Return up to size bytes of uncompressed data.

        A negative size (the default) reads everything up to end of file.
        Raises IOError (EBADF) if the object was opened for writing.
        """
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "read() on write-only GzipFile object")

        # Closed (or exhausted) and nothing buffered: plain EOF.
        if self.extrasize <= 0 and self.fileobj is None:
            return ''

        # _read() appends decompressed data to extrabuf and signals the end
        # of the input by raising EOFError.  The request size doubles on
        # every round, capped at max_read_chunk.
        readsize = 1024
        if size < 0:        # get the whole thing
            try:
                while True:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                size = self.extrasize
        else:               # just get some more of it
            try:
                while size > self.extrasize:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                if size > self.extrasize:
                    size = self.extrasize

        chunk = self.extrabuf[:size]
        self.extrabuf = self.extrabuf[size:]
        self.extrasize = self.extrasize - size

        self.offset += size
        return chunk

    def _unread(self, buf):
        """Push buf back onto the front of the read buffer."""
        self.extrabuf = buf + self.extrabuf
        self.extrasize = len(buf) + self.extrasize
        self.offset -= len(buf)

    def _read(self, size=1024):
        """Read size compressed bytes and buffer the decompressed result.

        Raises EOFError when there is no more data to read; IOError may be
        raised by the header and trailer checks.
        """
        if self.fileobj is None:
            raise EOFError("Reached EOF")

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            #
            # First, check if we're at the end of the file;
            # if so, it's time to stop; no more members to read.
            pos = self.fileobj.tell()   # Save current position
            self.fileobj.seek(0, 2)     # Seek to end of file
            if pos == self.fileobj.tell():
                raise EOFError("Reached EOF")
            else:
                self.fileobj.seek( pos ) # Return to original position

            self._init_read()
            self._read_gzip_header()
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == "":
            uncompress = self.decompress.flush()
            self._read_eof()
            self._add_read_data( uncompress )
            raise EOFError('Reached EOF')

        uncompress = self.decompress.decompress(buf)
        self._add_read_data( uncompress )

        if self.decompress.unused_data != "":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # (The number of bytes to seek back is the length of the unused
            # data, minus 8 because _read_eof() will rewind a further 8 bytes)
            self.fileobj.seek( -len(self.decompress.unused_data)+8, 1)

            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True

    def _add_read_data(self, data):
        """Append decompressed data to the buffer; update CRC and sizes."""
        self.crc = zlib.crc32(data, self.crc)
        self.extrabuf = self.extrabuf + data
        self.extrasize = self.extrasize + len(data)
        self.size = self.size + len(data)

    def _read_eof(self):
        """Verify the CRC and size stored in the trailer of a member.

        Raises IOError if either stored value disagrees with what was
        computed while decompressing.
        """
        # We've read to the end of the file, so we have to rewind in order
        # to reread the 8 bytes containing the CRC and the file size.
        # We check that the computed CRC and size of the
        # uncompressed data match the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        self.fileobj.seek(-8, 1)
        crc32 = read32(self.fileobj)
        isize = U32(read32(self.fileobj))   # may exceed 2GB
        if U32(crc32) != U32(self.crc):
            raise IOError("CRC check failed")
        elif isize != LOWU32(self.size):
            raise IOError("Incorrect length of data produced")

    def close(self):
        """Finish the gzip stream (in write mode) and release the file.

        A file object supplied by the caller is left open; one opened by
        the constructor is closed.  Calling close() again is a no-op.
        """
        if self.fileobj is not None:
            if self.mode == WRITE:
                self.fileobj.write(self.compress.flush())
                # The native zlib crc is an unsigned 32-bit integer, but
                # the Python wrapper implicitly casts that to a signed C
                # long.  So, on a 32-bit box self.crc may "look negative",
                # while the same crc on a 64-bit box may "look positive".
                # To avoid irksome warnings from the `struct` module, force
                # it to look positive on all boxes.
                write32u(self.fileobj, LOWU32(self.crc))
                # self.size may exceed 2GB, or even 4GB
                write32u(self.fileobj, LOWU32(self.size))
            # Only dropped once the trailer went out without an exception;
            # from here on the object counts as closed.
            self.fileobj = None
        if self.myfileobj:
            self.myfileobj.close()
            self.myfileobj = None

    def __del__(self):
        # Nothing to do when the object is already closed, or when the
        # attributes are not available at all.
        try:
            if (self.myfileobj is None and
                self.fileobj is None):
                return
        except AttributeError:
            return
        self.close()

    def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
        """Flush pending data through to the underlying file object.

        In write mode the compressor is flushed first, using zlib_mode.
        """
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
        self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def isatty(self):
        """Always return False."""
        return False

    def tell(self):
        """Return the current position in the uncompressed stream."""
        return self.offset

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise IOError("Can't rewind in write mode")
        self.fileobj.seek(0)
        self._new_member = True
        self.extrabuf = ""
        self.extrasize = 0
        self.offset = 0

    def seek(self, offset, whence=0):
        """Move to the given position in the uncompressed stream.

        whence may be 0 (absolute, the default) or 1 (relative to the
        current position); any other non-zero value raises ValueError.
        In write mode only forward seeks are possible (IOError otherwise)
        and the gap is filled with zero bytes.  In read mode the data is
        read and discarded, after a rewind() for a backward seek.
        """
        if whence:
            if whence == 1:
                offset = self.offset + offset
            else:
                raise ValueError('Seek from end not supported')
        if self.mode == WRITE:
            if offset < self.offset:
                raise IOError('Negative seek in write mode')
            count = offset - self.offset
            for i in range(count // 1024):
                self.write(1024 * '\0')
            self.write((count % 1024) * '\0')
        elif self.mode == READ:
            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            count = offset - self.offset
            for i in range(count // 1024):
                self.read(1024)
            self.read(count % 1024)

    def readline(self, size=-1):
        """Read and return one line, including its trailing newline.

        With a non-negative size, at most size bytes are returned.  An
        empty string is returned at end of file.
        """
        if size < 0:
            size = sys.maxint
            readsize = self.min_readsize
        else:
            readsize = size
        bufs = []
        while size != 0:
            c = self.read(readsize)
            i = c.find('\n')

            # We set i=size to break out of the loop under two
            # conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if (size <= i) or (i == -1 and len(c) > size):
                i = size - 1

            if i >= 0 or c == '':
                bufs.append(c[:i + 1])    # Add portion of last chunk
                self._unread(c[i + 1:])   # Push back rest of chunk
                break

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)
        # Remember a larger starting chunk size for the next call.
        if readsize > self.min_readsize:
            self.min_readsize = min(readsize, self.min_readsize * 2, 512)
        return ''.join(bufs) # Return resulting line

    def readlines(self, sizehint=0):
        """Return a list of lines read from the file.

        Reading stops at end of file, or once the lines collected so far
        total at least sizehint bytes when sizehint is positive.
        """
        # Negative numbers result in reading all the lines
        if sizehint <= 0:
            sizehint = sys.maxint
        L = []
        while sizehint > 0:
            line = self.readline()
            if line == "":
                break
            L.append(line)
            sizehint = sizehint - len(line)

        return L

    def writelines(self, L):
        """Write every string of the iterable L, in order."""
        for line in L:
            self.write(line)

    def __iter__(self):
        return self

    def next(self):
        """Return the next line; raise StopIteration at end of file."""
        line = self.readline()
        if line:
            return line
        else:
            raise StopIteration
462.
463.
def _test():
    """Command-line driver: act like gzip; with -d, act like gunzip.

    The input file is not deleted, however, nor are any other gzip
    options or features supported.  The argument "-" (also the default
    when no file is named) selects stdin as input and stdout as output.
    """
    names = sys.argv[1:]
    decompress = names and names[0] == "-d"
    if decompress:
        names = names[1:]
    if not names:
        names = ["-"]
    for arg in names:
        use_std_streams = (arg == "-")
        if decompress:
            if use_std_streams:
                src = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
                dst = sys.stdout
            elif arg[-3:] != ".gz":
                print("filename doesn't end in .gz:" + " " + repr(arg))
                continue
            else:
                src = open(arg, "rb")
                dst = __builtin__.open(arg[:-3], "wb")
        elif use_std_streams:
            src = sys.stdin
            dst = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
        else:
            src = __builtin__.open(arg, "rb")
            dst = open(arg + ".gz", "wb")

        # Copy in 1024-byte chunks until the source is exhausted.
        chunk = src.read(1024)
        while chunk:
            dst.write(chunk)
            chunk = src.read(1024)

        # The process-wide standard streams are left open.
        if dst is not sys.stdout:
            dst.close()
        if src is not sys.stdin:
            src.close()

if __name__ == '__main__':
    _test()