Python扫描器-常用库-Request

2022-11-16 10:06:21

1、常用库-Request

1.1、介绍

#安装：pip3 install requests

#各种请求方式：常用的就是requests.get()和requests.post()

>>> import requests

>>> r = requests.get('https://api.github.com/events')

>>> r = requests.post('http://httpbin.org/post', data = {'key':'value'})

>>> r = requests.put('http://httpbin.org/put', data = {'key':'value'})

>>> r = requests.delete('http://httpbin.org/delete')

>>> r = requests.head('http://httpbin.org/get')

>>> r = requests.options('http://httpbin.org/get')

1.2、基于GET请求

1、基本请求

import requests

response=requests.get('http://dig.chouti.com/')

print(response.text)

2、带参数的GET请求->params

import requests

response=requests.get('https://www.baidu.com')

response = requests.get(url='http://dict.baidu.com/s', params={'wd': 'python'})      # 带参数的get请求

3、带参数的GET请求->headers

#通常我们在发送请求时都需要带上请求头，请求头是将自身伪装成浏览器的关键，常见的有用的请求头如下

Host

Referer #大型网站通常都会根据该参数判断请求的来源

User-Agent #客户端

Cookie #Cookie信息虽然包含在请求头里，但requests模块有单独的参数来处理他，headers={}内就不要放它了

#添加headers(浏览器会识别请求头,不加可能会被拒绝访问,比如访问https://www.zhihu.com/explore)

import requests

response=requests.get('https://www.zhihu.com/explore')

response.status_code #500

#自己定制headers

headers={

    'User-Agent':'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36',

}

respone=requests.get('https://www.zhihu.com/explore',

                     headers=headers)

print(respone.status_code) #200

4、带参数的GET请求->cookies

import uuid

import requests

url = 'http://httpbin.org/cookies'

cookies = dict(sbid=str(uuid.uuid4()))

res = requests.get(url, cookies=cookies)

print(res.json())

1.3、基于POST请求

1、介绍

#GET请求

HTTP默认的请求方法就是GET

     * 没有请求体

     * 数据必须在1K之内！

     * GET请求数据会暴露在浏览器的地址栏中

GET请求常用的操作：

       1. 在浏览器的地址栏中直接给出URL，那么就一定是GET请求

       2. 点击页面上的超链接也一定是GET请求

       3. 提交表单时，表单默认使用GET请求，但可以设置为POST

#POST请求

(1). 数据不会出现在地址栏中

(2). 数据的大小没有上限

(3). 有请求体

(4). 请求体中如果存在中文，会使用URL编码！

#！！！requests.post()用法与requests.get()完全一致，特殊的是requests.post()有一个data参数，用来存放请求体数据

2、发送post请求，模拟浏览器的登录行为
- 一目标站点分析

    浏览器输入https://github.com/login

    然后输入错误的账号密码，抓包

    发现登录行为是post提交到：https://github.com/session

    而且请求头包含cookie

    而且请求体包含：

        commit:Sign in

        utf8:✓

        authenticity_token:lbI8IJCwGslZS8qJPnof5e7ZkCoSoMn6jmDTsL1r/m06NLyIbw7vCrpwrFAPzHMep3Tmf/TSJVoXWrvDZaVwxQ==

        login:maple

        password:123

- 二流程分析

    先GET：https://github.com/login拿到初始cookie与authenticity_token

    返回POST：https://github.com/session， 带上初始cookie，带上请求体（authenticity_token，用户名，密码等）

    最后拿到登录cookie

    ps：如果密码是密文形式，则可以先输错账号，输对密码，然后到浏览器中拿到加密后的密码，github的密码是明文

'''

import requests

import re

#第一次请求

r1=requests.get('https://github.com/login')

r1_cookie=r1.cookies.get_dict() #拿到初始cookie(未被授权)

authenticity_token=re.findall(r'name="authenticity_token".*?value="(.*?)"',r1.text)[0] #从页面中拿到CSRF TOKEN

#第二次请求：带着初始cookie和TOKEN发送POST请求给登录页面，带上账号密码

data={

    'commit':'Sign in',

    'utf8':'✓',

    'authenticity_token':authenticity_token,

    'login':'maple@qq.com',

    'password':'123'

}

r2=requests.post('https://github.com/session',

             data=data,

             cookies=r1_cookie

             )

login_cookie=r2.cookies.get_dict()

#第三次请求：以后的登录，拿着login_cookie就可以,比如访问一些个人配置

r3=requests.get('https://github.com/settings/emails',

                cookies=login_cookie)

print('maple@qq.com' in r3.text) #True

自动登录Github（自己处理cookie信息）

import requests

import re

session=requests.session()

#第一次请求

r1=session.get('https://github.com/login')

authenticity_token=re.findall(r'name="authenticity_token".*?value="(.*?)"',r1.text)[0] #从页面中拿到CSRF TOKEN

#第二次请求

data={

    'commit':'Sign in',

    'utf8':'✓',

    'authenticity_token':authenticity_token,

    'login':'maple@qq.com',

    'password':'123'

}

r2=session.post('https://github.com/session',

             data=data,

             )

#第三次请求

r3=session.get('https://github.com/settings/emails')

print('maple@qq.com' in r3.text) #True

requests.session()自动帮我们保存cookie信息

1.4、补充

requests.post(url='xxxxxxxx',

              data={'xxx':'yyy'}) #没有指定请求头,#默认的请求头:application/x-www-form-urlencoded

#如果我们自定义请求头是application/json,并且用data传值, 则服务端取不到值

requests.post(url='',

              data={'':1,},

              headers={

                  'content-type':'application/json'

              })

requests.post(url='',

              json={'':1,},

              ) #默认的请求头:application/json

1.5、响应Response

1、response属性

import requests

response = requests.get('http://www.jianshu.com')

# Attributes of the Response object

# Body decoded to text
print(response.text)

# Raw body as bytes
print(response.content)

# HTTP status code, e.g. 200, 301
print(response.status_code)

# Response headers
print(response.headers)

# Cookies set by the server (cookie jar object)
print(response.cookies)

# Cookies as a plain dict / as a list of (name, value) pairs
print(response.cookies.get_dict())
print(response.cookies.items())

# URL of this response and the list of earlier responses in the redirect chain
print(response.url)
print(response.history)

# Encoding requests will use to decode .text
print(response.encoding)

# Encoding guessed from the body itself; assigning it to response.encoding
# is the usual cure for garbled text
print(response.apparent_encoding)

2、编码问题

#编码问题

import requests

response=requests.get('http://www.autohome.com/news')

#方式一：

# response.encoding='gbk' #汽车之家网站返回的页面内容为gb2312编码的，而requests的默认编码为ISO-8859-1，如果不设置成gbk则中文乱码

print(response.text)

#方式二：

#在不知道编码格式的前提下使用以下方式

response.encoding=response.apparent_encoding

print(response.text)

3、获取二进制数据

import requests

response=requests.get('https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1509868306530&di=712e4ef3ab258b36e9f4b48e85a81c9d&imgtype=0&src=http%3A%2F%2Fc.hiphotos.baidu.com%2Fimage%2Fpic%2Fitem%2F11385343fbf2b211e1fb58a1c08065380dd78e0c.jpg')

with open('a.jpg','wb') as f:

    f.write(response.content)

4、解析json

#解析json

import requests

response=requests.get('http://httpbin.org/get')

import json

res1=json.loads(response.text) #太麻烦

res2=response.json() #直接获取json数据

print(res1 == res2) #True

1.6、高级用法

1、SSL Cert Verification

#证书验证(大部分网站都是https)

import requests

# 如果是ssl请求,首先检查证书是否合法,不合法则报错,程序中断

response = requests.get('https://www.xiaohuar.com')

print(response.status_code)

# 改进1:去掉报错,但是会报警告

import requests

response = requests.get('https://www.xiaohuar.com', verify=False)

# 不验证证书,报警告,返回200

print(response.status_code)

# 改进2:去掉报错,并且去掉警报信息

import requests

import urllib3

urllib3.disable_warnings()  # 关闭警告

response = requests.get('https://www.xiaohuar.com', verify=False)

print(response.status_code)

# 改进3:加上证书

# 很多网站都是https,但是不用证书也可以访问,大多数情况都是可以携带也可以不携带证书

# 知乎\百度等都是可带可不带

# 有硬性要求的,则必须带，比如对于定向的用户,拿到证书后才有权限访问某个特定网站

import requests

import urllib3

# urllib3.disable_warnings()  # 关闭警告

response = requests.get(

    'https://www.xiaohuar.com',

    # verify=False,

    cert=('/path/server.crt', '/path/key'))

print(response.status_code)

2、使用代理

# 官网链接: http://docs.python-requests.org/en/master/user/advanced/#proxies

# 代理设置:先发送请求给代理,然后由代理帮忙发送(封ip是常见的事情)

import requests

proxies = {
    # One entry per URL scheme. A dict literal that repeats a key silently
    # keeps only the last value, so the alternatives cannot both be listed
    # under 'http' at the same time.
    # For a proxy that requires credentials, put "user:password@" before the host:
    # 'http': 'http://tank:123@localhost:9527',
    'http': 'http://localhost:9527',
    'https': 'https://localhost:9527',
}

response = requests.get('https://www.12306.cn', proxies=proxies)

print(response.status_code)

# 支持socks代理,安装:pip install requests[socks]

import requests

proxies = {

    'http': 'socks5://user:pass@host:port',

    'https': 'socks5://user:pass@host:port'

}

respone=requests.get('https://www.12306.cn',

                     proxies=proxies)

print(respone.status_code)

使用代理爬取微信新闻: 参考

from urllib.parse import urlencode

import pymongo

import requests

from lxml.etree import XMLSyntaxError

from requests.exceptions import ConnectionError

from pyquery import PyQuery as pq

# from config import *

#

# client = pymongo.MongoClient(MONGO_URI)

# db = client[MONGO_DB]

base_url = 'http://weixin.sogou.com/weixin?'

headers = {

    'Cookie': 'SUID=F6177C7B3220910A000000058E4D679; SUV=1491392122762346; ABTEST=1|1491392129|v1; SNUID=0DED8681FBFEB69230E6BF3DFB2F8D6B; ld=OZllllllll2Yi2balllllV06C77lllllWTZgdkllll9lllllxv7ll5@@@@@@@@@@; LSTMV=189%2C31; LCLKINT=1805; weixinIndexVisited=1; SUIR=0DED8681FBFEB69230E6BF3DFB2F8D6B; JSESSIONID=aaa-BcHIDk9xYdr4odFSv; PHPSESSID=afohijek3ju93ab6l0eqeph902; sct=21; IPLOC=CN; ppinf=5|1491580643|1492790243|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZToyNzolRTUlQjQlOTQlRTUlQkElODYlRTYlODklOER8Y3J0OjEwOjE0OTE1ODA2NDN8cmVmbmljazoyNzolRTUlQjQlOTQlRTUlQkElODYlRTYlODklOER8dXNlcmlkOjQ0Om85dDJsdUJfZWVYOGRqSjRKN0xhNlBta0RJODRAd2VpeGluLnNvaHUuY29tfA; pprdig=j7ojfJRegMrYrl96LmzUhNq-RujAWyuXT_H3xZba8nNtaj7NKA5d0ORq-yoqedkBg4USxLzmbUMnIVsCUjFciRnHDPJ6TyNrurEdWT_LvHsQIKkygfLJH-U2MJvhwtHuW09enCEzcDAA_GdjwX6_-_fqTJuv9w9Gsw4rF9xfGf4; sgid=; ppmdig=1491580643000000d6ae8b0ebe76bbd1844c993d1ff47cea',

    'Host': 'weixin.sogou.com',

    'Upgrade-Insecure-Requests': '1',

    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'

}

proxy = None

def get_proxy():
    """Fetch a proxy address from the local service at 127.0.0.1:5555.

    Returns:
        The body of ``http://127.0.0.1:5555/random`` when the service
        answers with HTTP 200; ``None`` for any other status or when the
        connection fails.
    """
    try:
        reply = requests.get('http://127.0.0.1:5555/random')
        return reply.text if reply.status_code == 200 else None
    except ConnectionError:
        return None

def get_html(url, count=1):
    """Fetch *url* with the shared ``headers``, switching proxy when blocked.

    The module-level ``proxy`` is reused across calls and replaced (via
    ``get_proxy()``) whenever the server answers 302 or the connection fails.

    Args:
        url: Address to request. Redirects are not followed.
        count: 1-based attempt counter; callers normally leave the default.

    Returns:
        The response text on HTTP 200. ``None`` when the attempt budget is
        used up, when no replacement proxy could be obtained after a 302,
        or when the status is neither 200 nor 302.
    """
    global proxy

    print('Crawling', url)
    print('Trying Count', count)

    if count >= 5:
        print('Tried Too Many Counts')
        return None

    try:
        if proxy:
            proxies = {
                'http': 'http://' + proxy
            }
            response = requests.get(url, allow_redirects=False, headers=headers, proxies=proxies)
        else:
            response = requests.get(url, allow_redirects=False, headers=headers)

        if response.status_code == 200:
            return response.text

        if response.status_code == 302:
            # A redirect here is treated as "blocked": retry through a new proxy.
            print('302')
            proxy = get_proxy()
            if proxy:
                print('Using Proxy', proxy)
                # Carry the attempt counter forward; restarting at 1 would
                # let a run of blocked proxies recurse without limit.
                return get_html(url, count + 1)
            print('Get Proxy Failed')
            return None
    except ConnectionError as e:
        print('Error Occurred', e.args)
        proxy = get_proxy()
        return get_html(url, count + 1)

    # Any other status code: nothing usable to return.
    return None

def get_index(keyword, page):
    """Download one search-result page for *keyword*.

    Builds the query string (``query``, ``type`` fixed at 2, ``page``),
    appends it to ``base_url`` and returns whatever ``get_html`` returns
    for that URL.
    """
    query_string = urlencode({'query': keyword, 'type': 2, 'page': page})
    return get_html(base_url + query_string)

def parse_index(html):
    """Yield the ``href`` of each article link matched in a result page."""
    document = pq(html)
    links = document('.news-box .news-list li .txt-box h3 a')
    yield from (link.attr('href') for link in links.items())

def get_detail(url):
    """Download an article page.

    Returns:
        The response text when the status is 200; ``None`` for any other
        status or when the connection fails.
    """
    try:
        reply = requests.get(url)
        succeeded = reply.status_code == 200
        return reply.text if succeeded else None
    except ConnectionError:
        return None

def parse_detail(html):
    """Extract the article fields from a detail page.

    Returns:
        A dict with the keys ``title``, ``content``, ``date``, ``nickname``
        and ``wechat``, each holding the text of the matching CSS selector;
        ``None`` if parsing raises ``XMLSyntaxError``.
    """
    # (output key, CSS selector) pairs, in the order the fields are read.
    selectors = (
        ('title', '.rich_media_title'),
        ('content', '.rich_media_content'),
        ('date', '#post-date'),
        ('nickname', '#js_profile_qrcode > div > strong'),
        ('wechat', '#js_profile_qrcode > div > p:nth-child(3) > span'),
    )
    try:
        page = pq(html)
        return {field: page(css).text() for field, css in selectors}
    except XMLSyntaxError:
        return None

# def save_to_mongo(data):

#     if db['articles'].update({'title': data['title']}, {'$set': data}, True):

#         print('Saved to Mongo', data['title'])

#     else:

#         print('Saved to Mongo Failed', data['title'])

def main():
    """Crawl result pages 1-100 for the keyword 'Python' and print each parsed article."""
    for page in range(1, 101):
        html = get_index('Python', page)
        if not html:
            continue
        for article_url in parse_index(html):
            article_html = get_detail(article_url)
            if not article_html:
                continue
            article_data = parse_detail(article_html)
            print(article_data)
            # Persistence hook, disabled in this listing:
            # if article_data:
            #     save_to_mongo(article_data)

if __name__ == '__main__':

    main()

3、超时设置

#超时设置

#两种超时:float or tuple

#timeout=0.1 #单个值同时作为连接超时和接收数据的超时时间

#timeout=(0.1,0.2)#0.1代表链接超时  0.2代表接收数据的超时时间

import requests

respone=requests.get('https://www.baidu.com',

                     timeout=0.0001)

4、认证设置

# 官网链接：http://docs.python-requests.org/en/master/user/authentication/

# 认证设置:登录网站时,弹出一个框,要求你输入用户名密码（与alert很类似），此时是无法获取html的

# ps: https://www.cnblogs.com/post/readauth?url=/kermitjam/articles/10147263.html

# 但本质原理是拼接成请求头发送

#         r.headers['Authorization'] = _basic_auth_str(self.username, self.password)

# 一般的网站都不用默认的加密方式，都是自己写

# 那么我们就需要按照网站的加密方式，自己写一个类似于_basic_auth_str的方法

# 得到加密字符串后添加到请求头

#         r.headers['Authorization'] =func('.....')

# 看一看默认的加密方式吧，通常网站都不会用默认的加密设置

import requests

from requests.auth import HTTPBasicAuth

r=requests.get('xxx',auth=HTTPBasicAuth('user','password'))

print(r.status_code)

# HTTPBasicAuth可以简写为如下格式

import requests

r=requests.get('xxx',auth=('user','password'))

print(r.status_code)

5、异常处理

#异常处理

import requests

from requests.exceptions import * #可以查看requests.exceptions获取异常类型

try:

    r=requests.get('http://www.baidu.com',timeout=0.00001)

except ReadTimeout:

    print('===:')

# except ConnectionError: #网络不通

#     print('-----')

# except Timeout:

#     print('aaaaa')

except RequestException:

    print('Error')

6、上传文件

import requests

# Open the file in a ``with`` block so the handle is closed as soon as the
# upload request has finished, instead of staying open until the
# interpreter exits.
with open('a.jpg', 'rb') as image:
    files = {'file': image}
    respone = requests.post('http://httpbin.org/post', files=files)

print(respone.status_code)

码农公寓

1、常用库-Request

1.1、 介绍

1.2、基于GET请求

1.3、基于POST请求

1.4、补充

1.5、响应Response

1.6、高级用法

相关文章

1.1、介绍