【问题标题】:PyCURL is processing body before headers(PyCURL 在处理响应头之前就处理了正文)
【发布时间】:2016-07-14 10:27:13
【问题描述】:

受 this question 的公认答案的启发,我正在尝试用类似 requests 的接口来包装 PyCurl。一切本来都很顺利,但在按照 PyCURL docs 中描述的方法从响应头读取正文编码之后,我遇到了以下问题:每个响应头都会触发头回调,但这些回调只有在迭代器开始产生响应行之后才被调用,这使得编码/字符集检测毫无意义。

代码如下:

import re
import io
import urllib
import urllib.error
import http

import pycurl


class CurlHTTPStream(object):
    """Line-streaming HTTP response wrapper around pycurl.

    ``__init__`` configures one ``pycurl.Curl`` easy handle and attaches it
    to a ``pycurl.CurlMulti``.  Nothing in this class calls ``perform()``
    until the generator returned by ``iter_lines()`` is iterated.
    """

    # Timeout passed to CurlMulti.select() in _iter_chunks (presumably
    # seconds -- confirm against the pycurl documentation).
    SELECT_TIMEOUT = 10
    # Used to decode raw header lines and as the fallback body charset.
    HTTP_STANDARD_ENCODING = 'iso-8859-1'

    def __init__(self, method, url, data=None, params=None, headers=None):
        """Configure the curl handles for one request.

        :param method: request method, passed to ``CUSTOMREQUEST``.
        :param url: target URL; also kept unchanged in ``self.url``.
        :param data: accepted but not used anywhere in this method.
        :param params: optional mapping appended to the URL as
            ``key=value`` pairs joined with ``&`` (no escaping is applied).
        :param headers: optional mapping sent as ``Name: value`` headers.
        """
        self.url = url
        # Body bytes delivered by curl accumulate here until drained by
        # _get_received_data().
        self.received_buffer = io.BytesIO()

        self.curl = pycurl.Curl()
        self.curl.setopt(pycurl.CUSTOMREQUEST, method)
        if headers:
            self.curl.setopt(
                pycurl.HTTPHEADER,
                [
                    '{}: {}'.format(key, value)
                    for key, value in headers.items()
                ]
            )
        if params:
            query_string = '&'.join((
                '{}={}'.format(key, value)
                for key, value in params.items()
            ))
            # Only the local ``url`` gains the query string; self.url keeps
            # the value the caller passed in.
            url = '{}?{}'.format(url, query_string)
        self.curl.setopt(pycurl.URL, url)
        self.curl.setopt(pycurl.ENCODING, 'gzip')
        self.curl.setopt(pycurl.CONNECTTIMEOUT, 5)
        # Header lines go to header_function(); body bytes are written
        # straight into received_buffer.
        self.curl.setopt(pycurl.HEADERFUNCTION, self.header_function)
        self.curl.setopt(pycurl.WRITEFUNCTION, self.received_buffer.write)

        self.curl_multi = pycurl.CurlMulti()
        self.curl_multi.add_handle(self.curl)

        # 0 means "not known yet"; see _check_status_code().
        self.status_code = 0
        # Filled by header_function(): lower-cased name -> stripped value.
        self.headers = {}

    def _any_data_received(self):
        """Return True when received_buffer's position is not zero."""
        return self.received_buffer.tell() != 0

    def _get_received_data(self):
        """Return the buffered body bytes and reset the buffer to empty."""
        result = self.received_buffer.getvalue()
        self.received_buffer.truncate(0)
        # truncate() does not move the stream position, so rewind explicitly.
        self.received_buffer.seek(0)
        return result

    def _check_status_code(self):
        """Raise ``urllib.error.HTTPError`` for a known status other than 200.

        While ``self.status_code`` is still 0 it is refreshed from
        ``curl.getinfo(pycurl.HTTP_CODE)``; a value of 0 after that is not
        treated as an error.
        """
        if self.status_code == 0:
            self.status_code = self.curl.getinfo(pycurl.HTTP_CODE)
        if self.status_code != 0 and self.status_code != http.HTTPStatus.OK:
            raise urllib.error.HTTPError(
                self.url, self.status_code, None, None, None
            )

    def _perform_on_curl(self):
        """Call ``curl_multi.perform()`` until it stops asking to be re-run.

        :return: the handle count reported by the last ``perform()`` call.
        """
        while True:
            ret, num_handles = self.curl_multi.perform()
            if ret != pycurl.E_CALL_MULTI_PERFORM:
                break
        return num_handles

    def _iter_chunks(self):
        """Generate the body as ``bytes`` chunks while driving the transfer.

        Each round performs, yields whatever was buffered (after checking
        the status code), and stops once ``perform()`` reports 0 handles.
        The status code and curl errors are checked once more at the end.
        """
        while True:
            remaining = self._perform_on_curl()
            if self._any_data_received():
                self._check_status_code()
                yield self._get_received_data()
            if remaining == 0:
                break
            self.curl_multi.select(self.SELECT_TIMEOUT)

        self._check_status_code()
        self._check_curl_errors()

    def _check_curl_errors(self):
        """Raise ``pycurl.error`` for the first entry of ``info_read()[2]``."""
        for f in self.curl_multi.info_read()[2]:
            # f[0] is skipped; the remaining items become the exception
            # arguments (presumably errno and message -- confirm).
            raise pycurl.error(*f[1:])

    def iter_lines(self):
        """Return a generator of decoded response lines."""
        chunks = self._iter_chunks()
        return self._split_lines_from_chunks(chunks)

    def _split_lines_from_chunks(self, chunks):
        """Split ``bytes`` chunks into lines and yield them decoded.

        :param chunks: iterable of ``bytes`` objects.

        NOTE(review): this is a generator, so everything up to the ``for``
        loop runs on the first ``next()`` call, before ``chunks`` has been
        advanced and therefore before ``_iter_chunks`` has called
        ``perform()``.  ``self.headers`` is still empty at that point, so
        the charset below always falls back to HTTP_STANDARD_ENCODING.
        """
        print('foo')
        print(self.headers)
        charset = None
        if 'content-type' in self.headers:
            content_type = self.headers['content-type'].lower()
            match = re.search('charset=(\S+)', content_type)
            if match:
                charset = match.group(1)
                print('Decoding using %s' % charset)
        if charset is None:
            charset = self.HTTP_STANDARD_ENCODING
            print('Assuming encoding is %s' % charset)
        # Incomplete trailing line carried over to the next chunk.
        pending = None
        for chunk in chunks:
            if pending is not None:
                chunk = pending + chunk
            lines = chunk.splitlines()
            # splitlines() drops terminators, so if the last byte of the
            # last line equals the last byte of the chunk, the chunk did not
            # end on a line break and that line is held back as incomplete.
            if lines and lines[-1] and chunk and lines[-1][-1] == chunk[-1]:
                pending = lines.pop()
            else:
                pending = None
            for line in lines:
                yield line.decode(charset)
        if pending is not None:
            yield pending.decode(charset)

    def header_function(self, header_line):
        """Header callback: store one ``Name: value`` line in self.headers.

        :param header_line: raw header line as ``bytes``.

        Lines without a colon are ignored.  Names are stripped and
        lower-cased; a repeated name overwrites the earlier value.
        """
        print('hello')
        header_line = header_line.decode(self.HTTP_STANDARD_ENCODING)
        if ':' not in header_line:
            return
        name, value = header_line.split(':', 1)
        name = name.strip()
        value = value.strip()
        name = name.lower()
        self.headers[name] = value


def request(method, url, data=None, params=None, headers=None,
            stream=False):
    """Start an HTTP request.

    Only streaming mode is implemented: with ``stream`` true a
    ``CurlHTTPStream`` built from the arguments is returned, otherwise the
    result is ``None``.
    """
    if not stream:
        return None
    options = dict(data=data, params=params, headers=headers)
    return CurlHTTPStream(method, url, **options)

这就是我尝试测试时在终端中发生的情况:

Python 3.5.1 (default, Dec 09 2015, 07:29:36) [GCC] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> from pycurl_requests.requests import request
>>> r = request('GET', 'http://my-couchdb-instance:5984/user-30323561366530622d336135622d343637372d386464392d613038653536663865636566/_changes', params={'feed': 'continuous'}, stream=True)
>>> for l in r.iter_lines():
...     print(l)
... 
foo
{}
Assuming encoding is iso-8859-1
hello
hello
hello
hello
hello
hello
hello
{"seq":1,"id":"account","changes":[{"rev":"1-806053b347406e04d1872e13199fd3cf"}]}
{"seq":4,"id":"identity-bd2c5007-9df3-4ece-9751-843bf5523edd","changes":[{"rev":"1-e3a98ec37776f2cb479b2dcae0266700"}]}
{"seq":5,"id":"section_phone-0342667c-ecbd-401f-acfe-7bb2a1aa3159","changes":[{"rev":"1-457342bc895c7cb6924ceabd07e1ffcf"}]}

有更多行来自 CouchDB 更改提要,但我截断了输出,因为它们不相关。

基本上,输出中的 foo 表示代码进入了那个期望响应头已经就位的代码块,但下一行显示 self.headers 是空的。多个 hello 分别对应每一次对 header_function() 的调用。为什么把正文写入 BytesIO 的 write 回调会在 header 回调触发之前就被调用?

【问题讨论】:

    标签: python python-3.x pycurl response-headers


    【解决方案1】:

    我找到了解决方案。问题是 _split_lines_from_chunks(self, chunks) 在响应到达之前就被触发了,所以那时响应头也还不存在。

    下面是可以正常工作的代码。字符集改为在第一行正文可用时才检测,因此可以确定此时所有响应头都已处理完毕。

    import re
    import io
    import urllib
    import urllib.error
    import http
    
    import pycurl
    
    
    class CurlHTTPStream(object):
        """Stream an HTTP response line by line using pycurl's multi interface.

        The transfer is configured in ``__init__`` but only advanced while the
        generator returned by ``iter_lines()`` is being iterated.  Response
        headers are collected in ``self.headers`` (lower-cased names) and the
        body charset is resolved lazily, when the first line is decoded.

        The object can be used as a context manager; ``close()`` releases the
        libcurl handles.
        """

        # Timeout handed to CurlMulti.select() between perform() rounds.
        SELECT_TIMEOUT = 10
        # Encoding of raw header lines and fallback charset for the body.
        HTTP_STANDARD_ENCODING = 'iso-8859-1'

        # Matches ``charset=utf-8``, ``charset="utf-8"`` and tolerates
        # further parameters after it (``; boundary=...``).
        _CHARSET_RE = re.compile(r'charset\s*=\s*["\']?([^\s;"\']+)')

        def __init__(self, method, url, data=None, params=None, headers=None):
            """Configure the curl handles for one request.

            :param method: HTTP method, sent via ``CUSTOMREQUEST``.
            :param url: target URL; kept unchanged in ``self.url``.
            :param data: optional request body: ``bytes``, ``str`` (sent as
                UTF-8) or a ``dict`` (sent form-urlencoded).
            :param params: optional mapping URL-encoded into the query string.
            :param headers: optional mapping of request headers.
            """
            # Imported here so the class does not depend on ``urllib.parse``
            # having been loaded as a side effect of another import.
            from urllib.parse import urlencode

            self.url = url
            self.received_buffer = io.BytesIO()
            self.status_code = 0
            self.headers = {}
            self._charset = None
            self._closed = False

            self.curl = pycurl.Curl()
            self.curl.setopt(pycurl.CUSTOMREQUEST, method)
            if headers:
                self.curl.setopt(
                    pycurl.HTTPHEADER,
                    [
                        '{}: {}'.format(key, value)
                        for key, value in headers.items()
                    ]
                )
            if params:
                # Escape keys/values and respect a query string that is
                # already part of the URL.
                separator = '&' if '?' in url else '?'
                url = '{}{}{}'.format(url, separator, urlencode(params))
            self.curl.setopt(pycurl.URL, url)
            if data is not None:
                if isinstance(data, dict):
                    data = urlencode(data)
                if isinstance(data, str):
                    data = data.encode('utf-8')
                # CUSTOMREQUEST above still decides the method name.
                self.curl.setopt(pycurl.POSTFIELDS, data)
            self.curl.setopt(pycurl.ENCODING, 'gzip')
            self.curl.setopt(pycurl.CONNECTTIMEOUT, 5)
            self.curl.setopt(pycurl.HEADERFUNCTION, self.header_function)
            self.curl.setopt(pycurl.WRITEFUNCTION, self.received_buffer.write)

            self.curl_multi = pycurl.CurlMulti()
            self.curl_multi.add_handle(self.curl)

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc_value, traceback):
            self.close()
            return False

        def close(self):
            """Release the libcurl handles; calling it again is a no-op."""
            if self._closed:
                return
            self._closed = True
            self.curl_multi.remove_handle(self.curl)
            self.curl.close()
            self.curl_multi.close()

        def _any_data_received(self):
            """Return True when body bytes are waiting in the buffer."""
            return self.received_buffer.tell() != 0

        def _get_received_data(self):
            """Return the buffered body bytes and reset the buffer to empty."""
            result = self.received_buffer.getvalue()
            # truncate() leaves the position untouched, so rewind first.
            self.received_buffer.seek(0)
            self.received_buffer.truncate(0)
            return result

        def _check_status_code(self):
            """Raise ``urllib.error.HTTPError`` for a non-2xx response.

            A status of 0 (not known yet) is refreshed from curl and, if it is
            still 0, ignored.  Every 2xx code counts as success.
            """
            if self.status_code == 0:
                self.status_code = self.curl.getinfo(pycurl.HTTP_CODE)
            code = self.status_code
            if code == 0 or 200 <= code < 300:
                return
            try:
                reason = http.HTTPStatus(code).phrase
            except ValueError:
                # Non-standard status code: no canonical phrase available.
                reason = None
            raise urllib.error.HTTPError(
                self.url, code, reason, self.headers, None
            )

        def _perform_on_curl(self):
            """Run ``perform()`` until curl stops asking for an immediate re-run.

            :return: the handle count reported by the last ``perform()`` call.
            """
            while True:
                ret, num_handles = self.curl_multi.perform()
                if ret != pycurl.E_CALL_MULTI_PERFORM:
                    break
            return num_handles

        def _iter_chunks(self):
            """Generate the body as ``bytes`` chunks while driving the transfer.

            :raises urllib.error.HTTPError: for a non-2xx status.
            :raises pycurl.error: when curl reports a failed transfer.
            """
            while True:
                remaining = self._perform_on_curl()
                if self._any_data_received():
                    self._check_status_code()
                    yield self._get_received_data()
                if remaining == 0:
                    break
                self.curl_multi.select(self.SELECT_TIMEOUT)

            self._check_status_code()
            self._check_curl_errors()

        def _check_curl_errors(self):
            """Raise ``pycurl.error`` for the first failed transfer, if any."""
            for f in self.curl_multi.info_read()[2]:
                # f[0] is skipped; the rest becomes the exception arguments.
                raise pycurl.error(*f[1:])

        def iter_lines(self):
            """Return a generator yielding the response body as decoded lines."""
            chunks = self._iter_chunks()
            return self._split_lines_from_chunks(chunks)

        def _split_lines_from_chunks(self, chunks):
            """Split ``bytes`` chunks into lines and yield them as ``str``.

            :param chunks: iterable of ``bytes`` objects.

            ``self.charset`` is read only when a line is decoded, i.e. after
            the chunk (and hence the headers before it) has been received.
            A trailing fragment without ``\\n`` is held back until the next
            chunk; this also keeps a ``\\r\\n`` pair that straddles two
            chunks from producing a spurious empty line.
            """
            pending = b''
            for chunk in chunks:
                pieces = (pending + chunk).splitlines(True)
                pending = b''
                if pieces and not pieces[-1].endswith(b'\n'):
                    # Unterminated, or ends in '\r' that may be half a CRLF.
                    pending = pieces.pop()
                for piece in pieces:
                    yield piece.rstrip(b'\r\n').decode(self.charset)
            if pending:
                yield pending.rstrip(b'\r\n').decode(self.charset)

        @property
        def charset(self):
            """Charset named by the Content-Type header, else the HTTP default.

            Falls back to ``HTTP_STANDARD_ENCODING`` when the header is
            missing, names no charset, or names one Python does not know.
            The result is cached only once at least one header has been
            received, so an early access cannot pin the fallback.
            """
            if self._charset is not None:
                return self._charset
            import codecs

            resolved = self.HTTP_STANDARD_ENCODING
            content_type = self.headers.get('content-type', '').lower()
            match = self._CHARSET_RE.search(content_type)
            if match:
                try:
                    codecs.lookup(match.group(1))
                except LookupError:
                    pass
                else:
                    resolved = match.group(1)
            if self.headers:
                self._charset = resolved
            return resolved

        def header_function(self, header_line):
            """Header callback: record one raw header line.

            :param header_line: raw header line as ``bytes``.

            A status line starts a new header block and discards headers from
            an earlier block (e.g. ``100 Continue``).  Lines without a colon
            are ignored; names are stored stripped and lower-cased.
            """
            header_line = header_line.decode(self.HTTP_STANDARD_ENCODING)
            if header_line.startswith('HTTP/'):
                self.headers.clear()
                self._charset = None
                return
            if ':' not in header_line:
                return
            name, value = header_line.split(':', 1)
            self.headers[name.strip().lower()] = value.strip()
    
    
    def request(method, url, data=None, params=None, headers=None,
                stream=False):
        """Start an HTTP request.

        Only streaming mode is implemented: with ``stream`` true a
        ``CurlHTTPStream`` built from the arguments is returned, otherwise the
        result is ``None``.
        """
        if not stream:
            return None
        options = dict(data=data, params=params, headers=headers)
        return CurlHTTPStream(method, url, **options)
    

    【讨论】:

      猜你喜欢
      • 2010-12-23
      • 2022-01-13
      • 1970-01-01
      • 2018-03-03
      • 2020-12-06
      • 1970-01-01
      • 2021-08-02
      • 1970-01-01
      • 1970-01-01
      相关资源
      最近更新 更多