Crawler code (stashed here for now)

import smtplib, requests, time, schedule, random
from bs4 import BeautifulSoup
from email.mime.text import MIMEText
from email.header import Header

def choose_movie():
    """Scrape part of the Douban Top 250 list and return 6 random picks."""
    movie_250 = []
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
    for i in range(3, 10):  # pages with start=75 .. 225
        url = 'https://movie.douban.com/top250?start={}&filter='.format(i * 25)
        res_movie = requests.get(url, headers=headers)
        bs_movie = BeautifulSoup(res_movie.text, 'html.parser')
        list_movie = bs_movie.find('ol', class_="grid_view").find_all('li')
        for movie in list_movie:
            link = movie.find('a')
            comment = movie.find(class_='inq')      # one-line recommendation; may be missing
            name = movie.find(class_='title')
            mark = movie.find(class_='rating_num')
            try:
                content = '电影名:{}\n链接:{}\n推荐语:{}\n评分:{}\n.......\n'.format(name.text, link['href'], comment.text, mark.text)
            except AttributeError:                  # entries without a recommendation
                content = '电影名:{}\n链接:{}\n推荐语:{}\n评分:{}\n.......\n'.format(name.text, link['href'], '无', mark.text)
            movie_250.append(content)

    movie_3 = random.sample(movie_250, 6)
    return movie_3


def sendmail():
    """Mail the 6 picks to myself through QQ's SMTP server."""
    qqmail = smtplib.SMTP()
    qqmail.connect('smtp.qq.com', 25)
    qqmail.login('2191313025@qq.com', 'tdkphuzskgwwebed')
    content = ''
    for i in choose_movie():
        content += i
    message_ = MIMEText(content, 'plain', 'utf-8')
    message_['Subject'] = Header('go to movie', 'utf-8')
    qqmail.sendmail('2191313025@qq.com', '2191313025@qq.com', message_.as_string())
    qqmail.quit()

sendmail()
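The script imports schedule and time but never uses them. If the intent was to send the mail on a timer, a minimal sketch could look like this (the daily 09:00 send time is an assumption, not something stated above):

# Run sendmail() once a day; schedule and time are already imported at the top.
# The 09:00 send time is only an example.
schedule.every().day.at("09:00").do(sendmail)

while True:
    schedule.run_pending()   # execute any job whose time has come
    time.sleep(60)           # poll once a minute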

 

import requests
from bs4 import BeautifulSoup
from urllib.parse import quote

header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
          'Upgrade-Insecure-Requests': '1',
          'Host': 'www.ygdy8.com'}

movie = input('输入电影:')
movie_gbk = movie.encode('gbk')          # the site's search expects GBK-encoded keywords
res_movie = requests.get('http://s.ygdy8.com/plus/s0.php?typeid=1&keyword=' + quote(movie_gbk), headers=header)
print(res_movie.status_code)
bs_movie = BeautifulSoup(res_movie.text, 'html.parser')
web_2_part = bs_movie.find(class_="co_content8").find('a')   # first search result
print(web_2_part['href'])

res_movie2 = requests.get('https://www.ygdy8.com{}'.format(web_2_part['href']), headers=header)
res_movie2.encoding = 'gbk'              # detail pages are GBK-encoded
bs_movie2 = BeautifulSoup(res_movie2.text, 'html.parser')
link = bs_movie2.find(style="WORD-WRAP: break-word")         # the cell holding the download link
print(link.text)


 

import requests
import json


userid = str(123)
# userId can be any string shorter than 32 characters
apikey = str('f502483535bb4f9eac11b822d9307b06')
# remember to replace this with your own apiKey

# send one utterance to the Tuling chatbot API and print its reply
def robot(content):
    api = 'http://openapi.tuling123.com/openapi/api/v2'
    data = {
        "perception": {
            "inputText": {"text": content}},
        "userInfo": {
            "apiKey": apikey,
            "userId": userid}}
    # serialize the request body as JSON
    jsondata = json.dumps(data)
    # send the POST request
    response = requests.post(api, data=jsondata)
    # decode the JSON response
    robot_res = json.loads(response.content)
    # pull the reply text out of the response
    print(robot_res["results"][0]['values']['text'])


for x in range(10):
    # read one line of input and hand it to the bot
    content = input("talk:")
    robot(content)
# range(10) already ends the conversation after ten rounds; change the number to whatever you like



# Of course, you can also add some stop words: as soon as one of them is said, the chat ends.
# (A multi-word version is sketched right after the loop below.)

while True:
    # read one line of input
    content = input("talk:")
    robot(content)
    if content == 'bye':
        # the stop word
        break
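A minimal sketch of the multi-word version mentioned above; the particular stop words are just placeholders:

# Same loop, but any word in the set ends the chat.
stopwords = {'bye', '再见', 'quit'}   # placeholder stop words
while True:
    content = input("talk:")
    if content in stopwords:
        break
    robot(content)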



# But honestly, anyone who enjoys chatting with a bot is a chatterbox, so you can simply end with an infinite loop, like this:

# an endless chat loop
while True:
    # read one line of input
    content = input("talk:")
    robot(content)

a='''Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
Accept-Encoding: gzip, deflate
Accept-Language: zh-CN,zh;q=0.9
Connection: keep-alive
Cookie: _tt_=2462E5FAAE8403A742AD0486C71F6936; _userCode_=2020417102834190; _userIdentity_=2020417102839948; DefaultCity-CookieKey=364; DefaultDistrict-CookieKey=0; __utmz=196937584.1587091165.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); maxShowNewbie=1; _movies_=147142.15374.14134.15167; waf_cookie=50c0f1c1-763f-4d6f642a1162e3ca581dddf9cfbe7d4bbb13; Hm_lvt_6dd1e3b818c756974fb222f0eae5512e=1587090484,1587171365,1587390636,1587427442; __utma=196937584.1627096506.1587091165.1587390636.1587427442.4; __utmc=196937584; Hm_lpvt_6dd1e3b818c756974fb222f0eae5512e=1587427860; _ydclearance=75feeb049194e628d808e7c9-74f7-4fe1-af67-83f6cf70bd27-1587483021
Host: www.mtime.com
Referer: http://www.mtime.com/top/tv/top100/
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'''

headers = dict([line.split(": ", 1) for line in a.split("\n")])
# split the raw header block into a dict, one "Name: value" pair per line
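The same dict can also be built with a regular expression; a sketch:

# Equivalent parse with a regular expression: match "Name: value" on each line.
import re
headers = dict(re.findall(r'^([\w-]+): (.*)$', a, flags=re.M))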

from gevent import monkey
monkey.patch_all()
import gevent, requests, bs4, csv
from gevent.queue import Queue

# queue of list pages to crawl
work = Queue()
url_1 = 'http://www.mtime.com/top/tv/top100/'
work.put_nowait(url_1)

url_2 = 'http://www.mtime.com/top/tv/top100/index-{page}.html'
for x in range(1, 11):
    real_url = url_2.format(page=x)
    work.put_nowait(real_url)

def crawler():
    while not work.empty():
        url = work.get_nowait()
        res = requests.get(url, headers=headers)
        # res.encoding = 'utf-8'
        bs_res = bs4.BeautifulSoup(res.text, 'html.parser')
        datas = bs_res.find_all('div', class_="mov_con")
        for data in datas:
            TV_title = data.find('a').text
            data = data.find_all('p')
            TV_data = ''
            for i in data:
                TV_data = TV_data + i.text   # concatenate the <p> lines into one field
            writer.writerow([TV_title, TV_data])
            print([TV_title, TV_data])

csv_file = open('D:\\python_common_exercise\\exercise\\time_21444.csv', 'w', newline='', encoding='utf-8-sig')
writer = csv.writer(csv_file)

task_list = []
for x in range(3):   # three concurrent crawler tasks sharing one queue
    task = gevent.spawn(crawler)
    task_list.append(task)
gevent.joinall(task_list)
csv_file.close()

 

 

from gevent import monkey
monkey.patch_all()
from gevent.queue import Queue
import gevent , requests
from bs4 import BeautifulSoup

a='''Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
Accept-Encoding: gzip, deflate
Accept-Language: zh-CN,zh;q=0.9
Cache-Control: max-age=0
Connection: keep-alive
Cookie: _tt_=2462E5FAAE8403A742AD0486C71F6936; _userCode_=2020417102834190; _userIdentity_=2020417102839948; DefaultCity-CookieKey=364; DefaultDistrict-CookieKey=0; __utmz=196937584.1587091165.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); maxShowNewbie=1; _movies_=147142.15374.14134.15167; waf_cookie=f350783e-8c48-4ea4c223edc8c1fb9f98125419847edb15ed; _ydclearance=2ca23e7c6de11736a47ac649-914d-437e-b771-9d7ed679dfe3-1587660586; __utma=196937584.1627096506.1587091165.1587475823.1587653388.6; __utmc=196937584; __utmt=1; __utmt_~1=1; __utmb=196937584.2.10.1587653388; Hm_lvt_6dd1e3b818c756974fb222f0eae5512e=1587171365,1587390636,1587427442,1587653388; Hm_lpvt_6dd1e3b818c756974fb222f0eae5512e=1587653388
Host: www.mtime.com
Referer: http://www.mtime.com/top/tv/top100/index-9.html
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'''

headers = dict([line.split(": ",1) for line in a.split("\n")])


tv_list = ['http://www.mtime.com/top/tv/top100/']
for i in range(2,10): 
    tv_url = 'http://www.mtime.com/top/tv/top100/index-{}.html'.format(i)
    tv_list.append(tv_url)

work = Queue()
for ii in tv_list:
    work.put_nowait(ii)

def crawler():
    while not work.empty():
        url = work.get_nowait()
        res_tv = requests.get(url, headers=headers)
        print(res_tv.status_code)
        bs_tv = BeautifulSoup(res_tv.text, 'html.parser')
        drama_list = bs_tv.find(class_='top_list').find_all('li')
        for drama in drama_list:
            name = drama.find('a')                # first <a> carries the title attribute
            director = drama.find_all('a')[1]     # second <a> is the director
            introduc = drama.find_all('p')[2]     # third <p> holds the cast links
            print(name['title'], director.text, introduc.text)
            actor_list = introduc.find_all('a')   # the <a> tags inside it are the actors
            for actor in actor_list:
                print(actor)
            print('........')

task_list = []
for x in range(2):
    task = gevent.spawn(crawler)
    task_list.append(task)
gevent.joinall(task_list)

 

from gevent import monkey
monkey.patch_all()
from gevent.queue import Queue
import gevent , requests , csv
from bs4 import BeautifulSoup

headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
cookie = {'Cookie': '''userId=0; defaultCity=%25E5%25B9%25BF%25E4%25B8%259C%257C364; _tt_=2462E5FAAE8403A742AD0486C71F6936; 
waf_cookie=522ce5bd-6f19-400678091dd5c012bf1eb6416cc7058df67c; 
_ydclearance=ee83b7a0ea13cdf14409e819-c5bf-45eb-9c6e-62e1bd85ca35-1587097681; _userCode_=2020417102834190; 
_userIdentity_=2020417102839948; DefaultCity-CookieKey=364; DefaultDistrict-CookieKey=0; Hm_lvt_6dd1e3b818c756974fb222f0eae5512e=1587090484; __utma=196937584.1627096506.1587091165.1587091165.1587091165.1;
 __utmc=196937584; __utmz=196937584.1587091165.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); homePageType=B; userId=0; defaultCity=%25E5%25B9%25BF%25E4%25B8%259C%257C364; strIdCity=China_Beijing; maxShowNewbie=1; _movies_=15167.147142.15374.14134; Hm_lpvt_6dd1e3b818c756974fb222f0eae5512e=1587092252'''}

url_list = ['http://www.mtime.com/top/tv/top100/']
for i in range(2,11):
    url = 'http://www.mtime.com/top/tv/top100/index-{}.html'.format(i)
    url_list.append(url)
work = Queue()
for i in url_list:
    work.put_nowait(i)

def crawler(url_list):
    content = []
    while not work.empty():
        url = work.get_nowait()
        res_tv = requests.get(url, headers=headers)
        print(res_tv.status_code)
        bs_tv = BeautifulSoup(res_tv.text, 'html.parser')
        list_tv = bs_tv.find(class_='top_list').find_all('li')
        for tv in list_tv:
            title = tv.find('a')
            introduc = tv.find(class_='mt3')
            try:
                content.append('电影名:{},简介:{}'.format(title['title'], introduc.text))
            except AttributeError:   # entries without a synopsis
                content.append('电影名:{},简介:无'.format(title['title']))
    # note: each spawned task rewrites the same file in 'w' mode, so the last task to finish wins
    with open('D:\\python_common_exercise\\TV_100.csv', 'w', encoding='gbk', newline='') as file1:
        writer = csv.writer(file1)
        for i in content:
            writer.writerow([i])

list_tasks = []
for x in range(2):
    task = gevent.spawn(crawler , url_list)
    list_tasks.append(task)

gevent.joinall(list_tasks)
 

 

import requests
import csv

file1 = open('D:\\python_common_exercise\\zhihu.csv', 'w', encoding='gbk', newline='')
writer = csv.writer(file1)
writer.writerow(['标题', '链接', '摘要'])

headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
url = 'https://www.zhihu.com/api/v4/members/zhang-jia-wei/articles?'
articlelist = []
# an empty list to collect the rows
offset = 0
# offset starts at 0
while True:
    params = {
        'include': 'data[*].comment_count,suggest_edit,is_normal,thumbnail_extra_info,thumbnail,can_comment,comment_permission,admin_closed_comment,content,voteup_count,created,updated,upvoted_followees,voting,review_info,is_labeled,label_info;data[*].author.badge[?(type=best_answerer)].topics',
        'offset': str(offset),
        'limit': '20',
        'sort_by': 'voteups',
        }
    # bundle the query parameters
    res = requests.get(url, headers=headers, params=params)
    # send the request and keep the response in res
    articles = res.json()
    # print(articles)
    data_ = articles['data']
    # locate the article data
    for i in data_:
        list1 = [i['title'], i['url'], i['excerpt']]
        # pack one article into a list
        writer.writerow(list1)
        articlelist.append(list1)
    offset = offset + 20
    # inside the while loop, offset grows by 20 each pass
    if offset > 40:
        break
    # stop once offset exceeds 40, i.e. after three pages of 20
    # if articles['paging']['is_end'] == True:
    #     break
    # alternative: stop once the key is_end is True
print(articlelist)
# print it to have a look
file1.close()
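The commented-out lines above hint at the cleaner stopping rule: let the API's own paging info decide. A sketch of that version of the loop, which would replace the offset > 40 check (it assumes the response carries paging.is_end, as those comments say):

# Page until the API reports it has no more articles, instead of a fixed offset cap.
offset = 0
while True:
    params['offset'] = str(offset)
    articles = requests.get(url, headers=headers, params=params).json()
    for i in articles['data']:
        articlelist.append([i['title'], i['url'], i['excerpt']])
    if articles['paging']['is_end']:     # True once the last page has been fetched
        break
    offset += 20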

 
