Python+Requests编码识别Bug
>>> r = requests.get('http://cn.python-requests.org/en/latest/')
>>> r.headers['content-type']
'text/html'
>>> r.encoding
'ISO-8859-1'
>>> r.apparent_encoding
'utf-8'
>>> requests.utils.get_encodings_from_content(r.content)
['utf-8']
>>> r = requests.get('http://reader.360duzhe.com/2013_24/index.html')
>>> r.headers['content-type']
'text/html'
>>> r.encoding
'ISO-8859-1'
>>> r.apparent_encoding
'gb2312'
>>> requests.utils.get_encodings_from_content(r.content)
['gb2312']
终极解决办法(响应头 Content-Type 未声明 charset 时,Requests 会按 RFC 2616 把 text/* 默认当作 ISO-8859-1,因此需要根据页面内容重新识别编码):
# Requests reports ISO-8859-1 when the Content-Type header carries no charset,
# so that value is treated here as "not declared" rather than "really Latin-1".
if r.encoding == 'ISO-8859-1':
    # Look for a charset declared inside the document itself. r.text (decoded
    # as ISO-8859-1 at this point) is passed instead of r.content because the
    # helper uses str regex patterns, which reject bytes on Python 3; the
    # ASCII markup survives the Latin-1 decode unchanged.
    encodings = requests.utils.get_encodings_from_content(r.text)
    # Fall back to the charset guessed from the body bytes when the page
    # declares none.
    r.encoding = encodings[0] if encodings else r.apparent_encoding

# r.encoding may still be None when detection fails (e.g. empty body); in
# that case the body is left untouched instead of raising in decode().
if r.encoding:
    # Normalise the cached body to UTF-8.
    # NOTE: _content is a private attribute of Response and may change between
    # Requests releases.
    r._content = r.content.decode(r.encoding, 'replace').encode('utf8', 'replace')
    # Keep the declared encoding in step with the re-encoded bytes; otherwise
    # r.text would decode the new UTF-8 body with the old charset.
    r.encoding = 'utf-8'
参考:http://www.tuicool.com/articles/vEJzMv
>>> r = requests.get('http://cn.python-requests.org/en/latest/')
>>> r.headers['content-type']
'text/html'
>>> r.encoding
'ISO-8859-1'
>>> r.apparent_encoding
'utf-8'
>>> requests.utils.get_encodings_from_content(r.content)
['utf-8']
>>> r = requests.get('http://reader.360duzhe.com/2013_24/index.html')
>>> r.headers['content-type']
'text/html'
>>> r.encoding
'ISO-8859-1'
>>> r.apparent_encoding
'gb2312'
>>> requests.utils.get_encodings_from_content(r.content)
['gb2312']
终极解决办法(响应头 Content-Type 未声明 charset 时,Requests 会按 RFC 2616 把 text/* 默认当作 ISO-8859-1,因此需要根据页面内容重新识别编码):
# Requests reports ISO-8859-1 when the Content-Type header carries no charset,
# so that value is treated here as "not declared" rather than "really Latin-1".
if r.encoding == 'ISO-8859-1':
    # Look for a charset declared inside the document itself. r.text (decoded
    # as ISO-8859-1 at this point) is passed instead of r.content because the
    # helper uses str regex patterns, which reject bytes on Python 3; the
    # ASCII markup survives the Latin-1 decode unchanged.
    encodings = requests.utils.get_encodings_from_content(r.text)
    # Fall back to the charset guessed from the body bytes when the page
    # declares none.
    r.encoding = encodings[0] if encodings else r.apparent_encoding

# r.encoding may still be None when detection fails (e.g. empty body); in
# that case the body is left untouched instead of raising in decode().
if r.encoding:
    # Normalise the cached body to UTF-8.
    # NOTE: _content is a private attribute of Response and may change between
    # Requests releases.
    r._content = r.content.decode(r.encoding, 'replace').encode('utf8', 'replace')
    # Keep the declared encoding in step with the re-encoded bytes; otherwise
    # r.text would decode the new UTF-8 body with the old charset.
    r.encoding = 'utf-8'
参考:http://www.tuicool.com/articles/vEJzMv