Added support for streaming multipart decoding

tzickel · tzickel · commit 0743890d51ed · 2018-06-16T19:54:00.000+03:00
diff --git a/requests_toolbelt/multipart/__init__.py b/requests_toolbelt/multipart/__init__.py
@@ -9,7 +9,7 @@
 """
 
 from .encoder import MultipartEncoder, MultipartEncoderMonitor
-from .decoder import MultipartDecoder
+from .decoder import MultipartDecoder, MultipartStreamDecoder
 from .decoder import ImproperBodyPartContentException
 from .decoder import NonMultipartContentTypeException
 
diff --git a/requests_toolbelt/multipart/decoder.py b/requests_toolbelt/multipart/decoder.py
@@ -107,11 +107,12 @@ def __init__(self, content, content_type, encoding='utf-8'):
         self.encoding = encoding
         #: Parsed parts of the multipart response body
         self.parts = tuple()
-        self._find_boundary()
+        self.boundary = MultipartDecoder._find_boundary(content_type, encoding)
         self._parse_body(content)
 
-    def _find_boundary(self):
-        ct_info = tuple(x.strip() for x in self.content_type.split(';'))
+    @staticmethod
+    def _find_boundary(content_type, encoding):
+        ct_info = tuple(x.strip() for x in content_type.split(';'))
         mimetype = ct_info[0]
         if mimetype.split('/')[0].lower() != 'multipart':
             raise NonMultipartContentTypeException(
@@ -123,7 +124,8 @@ def _find_boundary(self):
                 '='
             )
             if attr.lower() == 'boundary':
-                self.boundary = encode_with(value.strip('"'), self.encoding)
+                boundary = encode_with(value.strip('"'), encoding)
+        return boundary
 
     @staticmethod
     def _fix_first_part(part, boundary_marker):
@@ -154,3 +156,145 @@ def from_response(cls, response, encoding='utf-8'):
         content = response.content
         content_type = response.headers.get('content-type', None)
         return cls(content, content_type, encoding)
+
+
+class AlreadyIteratedException(Exception):
+    pass
+
+
+# Currently .text is not implemented
+class StreamPart(object):
+    def __init__(self, headers, iterator):
+        self.headers = headers
+        self._iterator = iterator
+        self._started = False
+        self._consumed = False
+
+    def __iter__(self):
+        if self._started:
+            raise AlreadyIteratedException()
+        self._started = True
+        for typ, data in self._iterator():
+            # TODO break if data is True as well ?
+            if typ == 'done' and data == False:
+                break
+            elif typ == 'stream':
+                yield data
+            else:
+                raise ImproperBodyPartContentException()
+
+    @property
+    def content(self):
+        if self._consumed:
+            return self._content
+        if self._started:
+            raise AlreadyIteratedException()
+        self._content = b''.join(self)
+        self._consumed = True
+        return self._content
+
+
+# On error this will not consume all data; AFAIK requests will handle this and deplete the stream
+# part_test is different than before (it's a stream, so its content can't be known in advance)
+class MultipartStreamDecoder(object):
+    @classmethod
+    def from_response(cls, response, encoding='utf-8', chunk_size=10 * 1024):
+        content = lambda: response.raw.read(chunk_size)
+        content_type = response.headers.get('content-type', None)
+        return cls(content, content_type, encoding)
+
+    def __init__(self, stream_read_func, content_type, encoding='utf-8'):
+        self.content_type = content_type
+        self.encoding = encoding
+        self._stream_read_func = stream_read_func
+        self._boundary = MultipartDecoder._find_boundary(content_type, encoding)
+        self._splitter = StreamSplitter()
+        self._boundary = b''.join((b'--', self._boundary))
+        self._boundary_split = b''.join((b'\r\n', self._boundary))
+        self._state = 0
+        self._found = False
+        self._started = False
+
+    def __iter__(self):
+        if self._started:
+            raise AlreadyIteratedException()
+        self._started = True
+        for typ, data in self._stream():
+            if typ == 'headers':
+                yield StreamPart(data, self._stream)
+            else:
+                raise ImproperBodyPartContentException()
+
+    def _stream(self):
+        while True:
+            data = self._stream_read_func()
+            # This presumes that once the read returns empty data it won't return anything again (EOS)
+            # TODO should we guard against data that returns None instead of '' ?
+            if not self._found and not data:
+                break
+            # Remove the first empty multipart part
+            if self._state == 0:
+                # TODO can this be non empty?
+                _, self._found = self._splitter.stream(data, self._boundary_split)
+                if self._found:
+                    self._state = 1
+                    continue
+            # Parse the headers
+            elif self._state == 1:
+                headers, self._found = self._splitter.stream(data, b'\r\n\r\n', True)
+                if headers:
+                    # TODO can this only happen in headers or in body as well if didn't have headers ?
+                    #headers = MultipartDecoder._fix_first_part(headers, boundary)
+                    # TODO should headers be utf8 ? in python3 they are binary
+                    headers = _header_parser(headers.lstrip(), self.encoding)
+                    headers = CaseInsensitiveDict(headers)
+                    self._state = 2
+                    yield 'headers', headers
+                    continue
+                # No headers found
+                if self._found:
+                    headers = CaseInsensitiveDict({})
+                    self._state = 2
+                    yield 'headers', headers
+                    continue
+            # Stream the part
+            elif self._state == 2:
+                stream, self._found = self._splitter.stream(data, self._boundary_split)
+                if stream:
+                    yield 'stream', stream
+                # boundary_split found, end of part
+                if self._found:
+                    self._state = 1
+                    yield 'done', False
+                    continue
+
+
+# TODO this can be implemented with less copying
+class StreamSplitter(object):
+    def __init__(self):
+        self.leftover = b''
+    
+    def stream(self, data, split_data, return_only_full=False):
+        self.leftover += data
+        index = self.leftover.find(split_data)
+        if return_only_full:
+            if index > -1:
+                ret = self.leftover[:index]
+                self.leftover = self.leftover[index + len(split_data):]
+                found = True
+            else:
+                ret = b''
+                found = False
+        else:
+            if index > -1:
+                ret = self.leftover[:index]
+                self.leftover = self.leftover[index + len(split_data):]
+                found = True
+            elif len(self.leftover) >= len(split_data):
+                ret = self.leftover[:-len(split_data)]
+                self.leftover = self.leftover[-len(split_data):]
+                found = False
+            else:
+                ret = b''
+                found = False
+        return ret, found