Use gzip to compress files uploaded to swift

We've discovered that Rackspace swift seems to always want to gzip
encode files when clients request their contents. When our files are
deflate encoded this results in files that are first deflate encoded
and then gzip encoded. Not all browsers or layer 7 firewalls can handle
this (despite it being perfectly valid according to the HTTP RFCs).
We'll use gzip instead to see if that stops Rackspace from double
encoding the files.
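
(Illustrative only, not part of this change: the double encoding can be
reproduced with Python's gzip and zlib modules, which shows why a
client that only strips the outer gzip layer is left with deflate
bytes.)

    import gzip
    import zlib

    body = b'example log contents'
    deflated = zlib.compress(body)    # the deflate encoding we upload
    double = gzip.compress(deflated)  # what the CDN appears to add

    # Only reversing both layers recovers the original body; a client
    # that stops after gunzipping still holds deflate-compressed bytes.
    assert zlib.decompress(gzip.decompress(double)) == body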

To do this memory efficiently we vendor a tool from PyPI called
gzip-stream, which lets us read the compressed data a chunk at a time
without loading the entire file into memory or writing multiple gzip
headers into a single file.
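
(A rough sketch of the intended read pattern, using the vendored
GZIPCompressedStream class added below; upload_chunk is a hypothetical
stand-in for whatever consumes the stream.)

    with open('job-output.txt', 'rb') as infile:
        compressed = GZIPCompressedStream(infile, compression_level=9)
        while True:
            chunk = compressed.read(16384)  # one chunk in memory at a time
            if not chunk:
                break
            upload_chunk(chunk)  # hypothetical consumer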

Change-Id: I9483cfdbd8e7d0683eeb24d28dd6d8b0c0e772fa
Clark Boylan 2019-10-11 08:46:20 -07:00
parent 61c05a180f
commit 7755ef1b8d


@@ -25,6 +25,8 @@ Utility to upload files to swift
"""
import argparse
import gzip
import io
import logging
import mimetypes
import os
@@ -131,6 +133,95 @@ ICON_IMAGES = {
'AupSdoFsAAAAAElFTkSuQmCC'}
# Begin vendored code
# This code is licensed under the Public Domain/CC0 and comes from
# https://github.com/leenr/gzip-stream/blob/master/gzip_stream.py
# Code was modified:
# removed type annotations to support python2.
# removed use of *, somearg for positional anonymous args.
# Default compression level to 9.
class GZIPCompressedStream(io.RawIOBase):
    def __init__(self, stream, compression_level=9):
        assert 1 <= compression_level <= 9

        self._compression_level = compression_level
        self._stream = stream

        self._compressed_stream = io.BytesIO()
        self._compressor = gzip.GzipFile(
            mode='wb',
            fileobj=self._compressed_stream,
            compresslevel=compression_level
        )

        # because of the GZIP header written by `GzipFile.__init__`:
        self._compressed_stream.seek(0)

    @property
    def compression_level(self):
        return self._compression_level

    @property
    def stream(self):
        return self._stream

    def readable(self):
        return True

    def _read_compressed_into(self, b):
        buf = self._compressed_stream.read(len(b))
        b[:len(buf)] = buf
        return len(buf)

    def readinto(self, b):
        b = memoryview(b)

        offset = 0
        size = len(b)
        while offset < size:
            offset += self._read_compressed_into(b[offset:])
            if offset < size:
                # self._compressed_stream is now empty
                if self._compressor.closed:
                    # nothing to compress anymore
                    break
                # compress next bytes
                self._read_n_compress(size)

        return offset

    def _read_n_compress(self, size):
        assert size > 0

        data = self._stream.read(size)

        # rewind buffer to the start to free up memory
        # (because anything currently in the buffer should have already
        # been streamed off the object)
        self._compressed_stream.seek(0)
        self._compressed_stream.truncate(0)

        if data:
            self._compressor.write(data)
        else:
            # this will write final data (will flush zlib with Z_FINISH)
            self._compressor.close()

        # rewind to the buffer start
        self._compressed_stream.seek(0)

    def __repr__(self):
        return (
            '{self.__class__.__name__}('
            '{self.stream!r}, '
            'compression_level={self.compression_level!r}'
            ')'
        ).format(self=self)
# End vendored code


def get_mime_icon(mime, filename=''):
    icon = (APACHE_FILE_ICON_MAP.get(filename) or
            APACHE_MIME_ICON_MAP.get(mime) or
@@ -463,6 +554,26 @@ class Indexer():
        self.file_list.file_list = new_list


# Wraps a file object in an iterator that yields gzip-compressed
# chunks, suitable for streaming as an upload body.
class GzipFilter():
    chunk_size = 16384

    def __init__(self, infile):
        self.gzipfile = GZIPCompressedStream(infile)
        self.done = False

    def __iter__(self):
        return self

    def __next__(self):
        if self.done:
            self.gzipfile.close()
            raise StopIteration()
        data = self.gzipfile.read(self.chunk_size)
        if not data:
            self.done = True
        return data
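
# Note (illustrative, not part of this change): because GzipFilter
# implements the iterator protocol, an HTTP client that accepts an
# iterable request body can stream the compressed chunks without
# buffering the whole file, e.g. with a requests-style API (assumed
# here, not shown in this diff):
#
#     with open('job-output.txt', 'rb') as f:
#         requests.put(url, data=GzipFilter(f),
#                      headers={'content-encoding': 'gzip'})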

class DeflateFilter():
    chunk_size = 16384
@@ -622,8 +733,8 @@ class Uploader():
        if not file_detail.folder:
            if (file_detail.encoding is None and
                    self._is_text_type(file_detail.mimetype)):
-                headers['content-encoding'] = 'deflate'
-                data = DeflateFilter(open(file_detail.full_path, 'rb'))
+                headers['content-encoding'] = 'gzip'
+                data = GzipFilter(open(file_detail.full_path, 'rb'))
            else:
                if file_detail.encoding:
                    headers['content-encoding'] = file_detail.encoding