【python】获取http响应

2023-08-13 19:53:28

一个相对完整的http请求：输入ip和端口，输出响应码、响应头、响应体、页面标题、是否超时，以及出错时的错误信息

处理包括：

1.协议处理，如果是443用https，其他用http

2.HTTPError处理，HTTPError一般是401、403、404之类的错误，虽然报错，但仍然带有响应头和响应体。注意获取错误信息时要用str(e)；repr(e)得到的只是异常对象的表示（例如HTTPError()），不包含错误原因；e.read()返回的是响应体，也不是错误原因

3.URLError处理，一般是Connection refused之类的错误。注意获取错误信息时要用str(e.reason)

4.响应体gzip解压

5.响应体编码转换

# coding=utf8

import urllib2

import chardet

import traceback

import StringIO

import re

import gzip

def plugin_homepage(data, timeout):
    """Request the root page of a host/port pair and summarise the response.

    Args:
        data: mapping with the keys ``"ip"`` and ``"port"``. The port may be
            an integer or a numeric string.
        timeout: timeout in seconds, passed through to ``get_html``.

    Returns:
        A dict with the keys ``ip``, ``port``, ``rsp_header``, ``rsp_body``,
        ``code``, ``title``, ``is_timeout`` and ``error_reason``. ``ip`` and
        ``port`` are echoed back exactly as they were supplied.
    """
    ip = data["ip"]
    port = data["port"]

    # Compare as text so that a port supplied as the string "443" still
    # selects HTTPS (an int-only comparison silently fell back to HTTP).
    port_text = str(port).strip()
    scheme = "https" if port_text == "443" else "http"
    url = "%s://%s:%s/" % (scheme, ip, port_text)

    is_timeout, error_reason, code, header, body, title = get_html(url, timeout)

    return {
        "ip": ip,
        "port": port,
        "rsp_header": header,
        "rsp_body": body,
        "code": code,
        "title": title,
        "is_timeout": is_timeout,
        "error_reason": error_reason,
    }

def get_html(url, timeout):
    """Fetch ``url`` and return a summary of the HTTP response.

    Args:
        url: absolute URL to request.
        timeout: timeout in seconds handed to ``urllib2.urlopen``.

    Returns:
        A 6-tuple ``(is_timeout, error_reason, code, header, body, title)``:

        * ``is_timeout`` -- True when the failure message reports a timeout.
        * ``error_reason`` -- error text, or None when the request succeeded.
        * ``code`` -- HTTP status code (also set for HTTPError responses such
          as 401/403/404), or None when no response was received.
        * ``header`` -- response headers rendered as a string, or None.
        * ``body`` -- response body, gunzipped and re-encoded to UTF-8 on a
          best-effort basis, or None.
        * ``title`` -- contents of the first ``<title>`` element, or None.
    """
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}

    is_timeout = False
    error_reason = None
    code = None
    raw_header = None   # header *object*; only stringified when returned
    body = None
    title = None

    response = None
    try:
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request, timeout=timeout)
        code = response.getcode()
        body = response.read()
        raw_header = response.headers
    except urllib2.HTTPError as e:
        # 4xx/5xx answers are still real responses: keep status, headers, body.
        response = e
        error_reason = str(e)
        code = e.code
        raw_header = getattr(e, 'headers', None)
        try:
            body = e.read()
        except Exception:
            # The error body is optional; keep what we already collected.
            traceback.print_exc()
    except urllib2.URLError as e:
        # Connection-level failure (refused, DNS, timeout, ...): no response.
        traceback.print_exc()
        error_reason = str(e.reason)
        # Substring match also covers SSL messages such as
        # "The read operation timed out".
        is_timeout = "timed out" in error_reason
        return is_timeout, error_reason, code, None, body, title
    except Exception as e:
        # e.g. socket.timeout raised while reading the body.
        traceback.print_exc()
        error_reason = str(e)
        is_timeout = "timed out" in error_reason
        return is_timeout, error_reason, code, None, body, title
    finally:
        if response is not None:
            try:
                response.close()
            except Exception:
                # Closing is best-effort; the data has already been read.
                pass

    if not raw_header:
        header_text = None if raw_header is None else str(raw_header)
        return is_timeout, error_reason, code, header_text, body, title

    # Gunzip the body. The lookup must happen on the header object: indexing
    # the stringified headers raises TypeError.
    content_encoding = raw_header.get('Content-Encoding') or ''
    if body and 'gzip' in content_encoding.lower():
        try:
            body = gzip.GzipFile(fileobj=StringIO.StringIO(body)).read()
        except Exception:
            # Mislabelled or corrupt gzip data: keep the raw body.
            traceback.print_exc()

    # Convert the body to UTF-8. An unknown or wrong encoding name makes
    # decode() raise, so no length heuristic is needed (the old "< 12" check
    # rejected valid names such as "windows-1252").
    try:
        html_encode = (get_encode(raw_header, body) or '').strip()
        if html_encode and body:
            body = body.decode(html_encode).encode('utf-8')
    except Exception:
        # Best-effort conversion: leave the body untouched on any failure.
        pass

    # Extract the title. re.S lets it span several lines; the tag may carry
    # attributes.
    if body:
        try:
            match = re.search(r'<title(?:\s[^>]*)?>(.*?)</title>', body,
                              flags=re.I | re.S)
        except TypeError:
            match = None
        if match:
            title = match.group(1).strip()

    return is_timeout, error_reason, code, str(raw_header), body, title

# Detect the character encoding of an HTML response.

def get_encode(header, body):
    """Return the name of the character encoding used by ``body``.

    Sources are tried in this order:

    1. a ``<meta ... charset=...>`` declaration inside ``body``
       (double-quoted, single-quoted or unquoted);
    2. the ``charset`` parameter of the ``Content-Type`` response header;
    3. ``chardet`` detection on ``body``.

    Args:
        header: response headers, either a mapping-like object offering
            ``get`` or the raw header text as a string. May be None.
        body: response body. May be None or empty.

    Returns:
        The encoding name as a string, or None when nothing could be
        determined.
    """
    # 1. <meta> declaration in the document itself.
    if body:
        try:
            m = re.search(r'<meta[^>]*?charset\s*=\s*["\']?\s*([\w.:-]+)',
                          body, flags=re.I)
        except TypeError:
            m = None
        if m:
            return m.group(1)

    # 2. Content-Type header, from a header object or from raw header text.
    content_type = None
    if hasattr(header, 'get'):
        content_type = header.get('Content-Type')
    elif header:
        line = re.search(r'^Content-Type\s*:(.*)$', str(header),
                         flags=re.I | re.M)
        if line:
            content_type = line.group(1)
    if content_type:
        try:
            # Optional quotes around the value are dropped.
            m = re.search(r'charset\s*=\s*["\']?([^"\';\s]+)', content_type,
                          flags=re.I)
        except TypeError:
            m = None
        if m:
            return m.group(1)

    # 3. Statistical detection; nothing to detect on an empty body.
    if not body:
        return None
    return chardet.detect(body)['encoding']

if __name__ == "__main__":
    # Manual smoke test: probe the local web server on port 80 with a
    # 3-second timeout and dump the resulting summary dict.
    target = {"ip": "127.0.0.1", "port": 80}
    print(plugin_homepage(target, 3))

码农公寓

相关文章