import smtplib, requests, time, schedule, random
from bs4 import BeautifulSoup
from email.mime.text import MIMEText
from email.header import Header

def choose_movie():
    # Scrape part of the Douban Top 250 and pick a few movies at random.
    movie_250 = []
    movie_3 = []
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
    for i in range(3, 10):
        url = 'https://movie.douban.com/top250?start={}&filter='.format(i * 25)
        res_movie = requests.get(url, headers=headers)
        bs_movie = BeautifulSoup(res_movie.text, 'html.parser')
        list_movie = bs_movie.find('ol', class_="grid_view").find_all('li')
        for movie in list_movie:
            link = movie.find('a')
            comment = movie.find(class_='inq')
            name = movie.find(class_='title')
            mark = movie.find(class_='rating_num')
            try:
                content = '电影名:{}\n链接:{}\n推荐语:{}\n评分:{}\n.......\n'.format(name.text, link['href'], comment.text, mark.text)
            except AttributeError:
                # Some entries have no recommendation blurb, so comment is None.
                content = '电影名:{}\n链接:{}\n推荐语:{}\n评分:{}\n.......\n'.format(name.text, link['href'], '无', mark.text)
            movie_250.append(content)

    movie_3 = random.sample(movie_250, 6)
    return movie_3


def sendmail():
    # Send the picked movies to the mailbox below via QQ's SMTP server.
    qqmail = smtplib.SMTP()
    qqmail.connect('smtp.qq.com', 25)
    qqmail.login('2191313025@qq.com', 'tdkphuzskgwwebed')
    content = ''
    ii = choose_movie()
    for i in ii:
        content += i
    message_ = MIMEText(content, 'plain', 'utf-8')
    message_['Subject'] = Header('go to movie', 'utf-8')
    qqmail.sendmail('2191313025@qq.com', '2191313025@qq.com', message_.as_string())
    qqmail.quit()

sendmail()
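# A minimal scheduling sketch (an assumption about why schedule and time are imported
# above but never used): register sendmail() as a daily job and keep the process alive.
# The "08:00" time is a placeholder, not something the original script specifies.
schedule.every().day.at("08:00").do(sendmail)  # queue sendmail once a day
while True:
    schedule.run_pending()  # run any job that is due
    time.sleep(60)          # check again in a minute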
import requests
from bs4 import BeautifulSoup
from urllib.parse import quote

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    'Upgrade-Insecure-Requests': '1',
    'Host': 'www.ygdy8.com'
}

movie = input('输入电影:')
# The site's search endpoint expects a GBK-encoded, URL-quoted keyword.
movie_gbk = movie.encode('gbk')
res_movie = requests.get('http://s.ygdy8.com/plus/s0.php?typeid=1&keyword=' + quote(movie_gbk), headers=header)
print(res_movie.status_code)
bs_movie = BeautifulSoup(res_movie.text, 'html.parser')
# First search hit: the link to the movie's detail page.
web_2_part = bs_movie.find(class_="co_content8").find('a')
print(web_2_part['href'])

res_movie2 = requests.get('https://www.ygdy8.com{}'.format(web_2_part['href']), headers=header)
res_movie2.encoding = 'gbk'
bs_movie2 = BeautifulSoup(res_movie2.text, 'html.parser')
# The download link sits in the element styled with "WORD-WRAP: break-word".
link = bs_movie2.find(style="WORD-WRAP: break-word")
print(link.text)

# movie = bs_movie.find('title')
# print(movie.text)
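# A defensive variant (an addition, not part of the original script): the chained find()
# calls above raise AttributeError/TypeError when the search has no hits or the detail
# page has no download link. first_download_link is a hypothetical helper that returns
# None in those cases instead of crashing.
def first_download_link(keyword):
    res = requests.get('http://s.ygdy8.com/plus/s0.php?typeid=1&keyword=' + quote(keyword.encode('gbk')),
                       headers=header)
    result_box = BeautifulSoup(res.text, 'html.parser').find(class_="co_content8")
    if result_box is None or result_box.find('a') is None:
        return None  # no search result at all
    detail = requests.get('https://www.ygdy8.com{}'.format(result_box.find('a')['href']), headers=header)
    detail.encoding = 'gbk'
    link = BeautifulSoup(detail.text, 'html.parser').find(style="WORD-WRAP: break-word")
    return link.text if link else None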
import requests
import json

userid = str(123)  # can be replaced with any string shorter than 32 characters
apikey = str('f502483535bb4f9eac11b822d9307b06')  # remember to replace this with your own apikey

# Build and send the POST request to the Tuling robot API.
def robot(content):
    api = 'http://openapi.tuling123.com/openapi/api/v2'
    data = {
        "perception": {
            "inputText": {"text": content}},
        "userInfo": {
            "apiKey": apikey,
            "userId": userid}}
    # Serialize the payload to JSON.
    jsondata = json.dumps(data)
    # Send the POST request.
    response = requests.post(api, data=jsondata)
    # Decode the returned JSON.
    robot_res = json.loads(response.content)
    # Pull out the reply text.
    print(robot_res["results"][0]['values']['text'])

# Version 1: chat for exactly ten rounds; range(10) ends the loop by itself,
# and you can change the number to however many rounds you like.
for x in range(10):
    content = input("talk:")  # type what you want to say
    robot(content)

# Version 2: you can also use stop words -- say one of them and the chat ends.
while True:
    content = input("talk:")
    robot(content)
    if content == 'bye':  # the stop word
        break

# Version 3: but I suspect anyone who enjoys talking to a chatbot is a chatterbox,
# so you can simply finish with an infinite chat loop:
while True:
    content = input("talk:")
    robot(content)
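# A guarded variant of the extraction step (an addition, not from the original): if the
# apikey is invalid or the daily quota is exhausted, the response may not contain the
# "results" list, and indexing it raises KeyError/IndexError. robot_reply is a
# hypothetical helper that surfaces the raw payload instead of crashing.
def robot_reply(content):
    data = {"perception": {"inputText": {"text": content}},
            "userInfo": {"apiKey": apikey, "userId": userid}}
    response = requests.post('http://openapi.tuling123.com/openapi/api/v2', data=json.dumps(data))
    robot_res = json.loads(response.content)
    try:
        return robot_res["results"][0]['values']['text']
    except (KeyError, IndexError):
        return 'No reply text in response: {}'.format(robot_res)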
a = '''Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
Accept-Encoding: gzip, deflate
Accept-Language: zh-CN,zh;q=0.9
Connection: keep-alive
Cookie: _tt_=2462E5FAAE8403A742AD0486C71F6936; _userCode_=2020417102834190; _userIdentity_=2020417102839948; DefaultCity-CookieKey=364; DefaultDistrict-CookieKey=0; __utmz=196937584.1587091165.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); maxShowNewbie=1; _movies_=147142.15374.14134.15167; waf_cookie=50c0f1c1-763f-4d6f642a1162e3ca581dddf9cfbe7d4bbb13; Hm_lvt_6dd1e3b818c756974fb222f0eae5512e=1587090484,1587171365,1587390636,1587427442; __utma=196937584.1627096506.1587091165.1587390636.1587427442.4; __utmc=196937584; Hm_lpvt_6dd1e3b818c756974fb222f0eae5512e=1587427860; _ydclearance=75feeb049194e628d808e7c9-74f7-4fe1-af67-83f6cf70bd27-1587483021
Host: www.mtime.com
Referer: http://www.mtime.com/top/tv/top100/
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'''
# Turn the request headers copied from the browser into a dict, one "Name: value" pair per line.
headers = dict([line.split(": ", 1) for line in a.split("\n")])

from gevent import monkey
monkey.patch_all()
import gevent, requests, bs4, csv
from gevent.queue import Queue

work = Queue()
url_1 = 'http://www.mtime.com/top/tv/top100/'
work.put_nowait(url_1)
url_2 = 'http://www.mtime.com/top/tv/top100/index-{page}.html'
for x in range(1, 11):
    real_url = url_2.format(page=x)
    work.put_nowait(real_url)

def crawler():
    while not work.empty():
        url = work.get_nowait()
        res = requests.get(url, headers=headers)
        # res.encoding = 'utf-8'
        bs_res = bs4.BeautifulSoup(res.text, 'html.parser')
        datas = bs_res.find_all('div', class_="mov_con")
        for data in datas:
            TV_title = data.find('a').text
            data = data.find_all('p')
            TV_data = ''
            for i in data:
                TV_data = TV_data + i.text
            writer.writerow([TV_title, TV_data])
            print([TV_title, TV_data])

csv_file = open('D:\\python_common_exercise\\exercise\\time_21444.csv', 'w', newline='', encoding='utf-8-sig')
writer = csv.writer(csv_file)

task_list = []
for x in range(3):
    task = gevent.spawn(crawler)
    task_list.append(task)
gevent.joinall(task_list)
csv_file.close()  # flush and close the CSV once all crawlers have finished
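# An optional variant (not in the original script): gevent.pool.Pool caps the number of
# concurrent greenlets without keeping the task list by hand; pool.spawn() takes the same
# arguments as gevent.spawn().
from gevent.pool import Pool

pool = Pool(3)  # at most three crawlers at once
for x in range(3):
    pool.spawn(crawler)
pool.join()     # wait until the queue is drained and every greenlet has finished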
from gevent import monkey
monkey.patch_all()
from gevent.queue import Queue
import gevent, requests
from bs4 import BeautifulSoup

a = '''Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
Accept-Encoding: gzip, deflate
Accept-Language: zh-CN,zh;q=0.9
Cache-Control: max-age=0
Connection: keep-alive
Cookie: _tt_=2462E5FAAE8403A742AD0486C71F6936; _userCode_=2020417102834190; _userIdentity_=2020417102839948; DefaultCity-CookieKey=364; DefaultDistrict-CookieKey=0; __utmz=196937584.1587091165.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); maxShowNewbie=1; _movies_=147142.15374.14134.15167; waf_cookie=f350783e-8c48-4ea4c223edc8c1fb9f98125419847edb15ed; _ydclearance=2ca23e7c6de11736a47ac649-914d-437e-b771-9d7ed679dfe3-1587660586; __utma=196937584.1627096506.1587091165.1587475823.1587653388.6; __utmc=196937584; __utmt=1; __utmt_~1=1; __utmb=196937584.2.10.1587653388; Hm_lvt_6dd1e3b818c756974fb222f0eae5512e=1587171365,1587390636,1587427442,1587653388; Hm_lpvt_6dd1e3b818c756974fb222f0eae5512e=1587653388
Host: www.mtime.com
Referer: http://www.mtime.com/top/tv/top100/index-9.html
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'''
# Turn the request headers copied from the browser into a dict, one "Name: value" pair per line.
headers = dict([line.split(": ", 1) for line in a.split("\n")])

tv_list = ['http://www.mtime.com/top/tv/top100/']
for i in range(2, 10):
    tv_url = 'http://www.mtime.com/top/tv/top100/index-{}.html'.format(i)
    tv_list.append(tv_url)

work = Queue()
for ii in tv_list:
    work.put_nowait(ii)

# for i in range(10):
#     url = work.get_nowait()
#     print(url)
#     res = requests.get(url, headers=headers)
#     print(res.status_code)

def crawler():
    while not work.empty():
        url = work.get_nowait()
        res_tv = requests.get(url, headers=headers)
        print(res_tv.status_code)
        bs_tv = BeautifulSoup(res_tv.text, 'html.parser')
        drama_list = bs_tv.find(class_='top_list').find_all('li')
        for drama in drama_list:
            name = drama.find('a')
            director = drama.find_all('a')[1]
            introduc = drama.find_all('p')[2]
            print(name['title'], director.text, introduc.text)
            # find('p') returns a single tag, so indexing it fails; find_all('p')[2] is what is meant here.
            actor_list = drama.find_all('p')[2].find_all('a')
            for actor in actor_list:
                print(actor)
            print('........')

task_list = []
for x in range(2):
    task = gevent.spawn(crawler)
    task_list.append(task)
gevent.joinall(task_list)
from gevent import monkey
monkey.patch_all()
from gevent.queue import Queue
import gevent, requests, csv
from bs4 import BeautifulSoup

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
# Copied from the browser, but note it is never passed to requests.get() below.
cookie = {'Cookie': '''userId=0; defaultCity=%25E5%25B9%25BF%25E4%25B8%259C%257C364; _tt_=2462E5FAAE8403A742AD0486C71F6936; waf_cookie=522ce5bd-6f19-400678091dd5c012bf1eb6416cc7058df67c; _ydclearance=ee83b7a0ea13cdf14409e819-c5bf-45eb-9c6e-62e1bd85ca35-1587097681; _userCode_=2020417102834190; _userIdentity_=2020417102839948; DefaultCity-CookieKey=364; DefaultDistrict-CookieKey=0; Hm_lvt_6dd1e3b818c756974fb222f0eae5512e=1587090484; __utma=196937584.1627096506.1587091165.1587091165.1587091165.1; __utmc=196937584; __utmz=196937584.1587091165.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); homePageType=B; userId=0; defaultCity=%25E5%25B9%25BF%25E4%25B8%259C%257C364; strIdCity=China_Beijing; maxShowNewbie=1; _movies_=15167.147142.15374.14134; Hm_lpvt_6dd1e3b818c756974fb222f0eae5512e=1587092252'''}

url_list = ['http://www.mtime.com/top/tv/top100/']
for i in range(2, 11):
    url = 'http://www.mtime.com/top/tv/top100/index-{}.html'.format(i)
    url_list.append(url)

work = Queue()
for i in url_list:
    work.put_nowait(i)

def crawler(url_list):
    while not work.empty():
        content = []
        url = work.get_nowait()
        res_tv = requests.get(url, headers=headers)
        print(res_tv.status_code)
        bs_tv = BeautifulSoup(res_tv.text, 'html.parser')
        # find('top_list') would look for a <top_list> tag; the class is what is wanted here.
        list_tv = bs_tv.find(class_='top_list').find_all('li')
        for tv in list_tv:
            title = tv.find('a')
            introduc = tv.find(class_='mt3')
            try:
                # title['title'] is already a string, so no .text here.
                content.append('电影名:{},简介:{}'.format(title['title'], introduc.text))
            except AttributeError:
                content.append('电影名:{},简介:无'.format(title['title']))
        # Append rather than overwrite, so pages written by different greenlets are all kept.
        with open('D:\\python_common_exercise\\TV_100.csv', 'a', encoding='gbk', newline='') as file1:
            writer = csv.writer(file1)
            for i in content:
                writer.writerow([i])

list_tasks = []
for x in range(2):
    task = gevent.spawn(crawler, url_list)
    list_tasks.append(task)
gevent.joinall(list_tasks)
import requests
import csv

file1 = open('D:\\python_common_exercise\\zhihu.csv', 'w', encoding='gbk', newline='')
writer = csv.writer(file1)
writer.writerow(['标题', '链接', '摘要'])

headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
url = 'https://www.zhihu.com/api/v4/members/zhang-jia-wei/articles?'
articlelist = []  # empty list to collect the rows
offset = 0        # start offset

while True:
    # Parameters for one page of articles.
    params = {
        'include': 'data[*].comment_count,suggest_edit,is_normal,thumbnail_extra_info,thumbnail,can_comment,comment_permission,admin_closed_comment,content,voteup_count,created,updated,upvoted_followees,voting,review_info,is_labeled,label_info;data[*].author.badge[?(type=best_answerer)].topics',
        'offset': str(offset),
        'limit': '20',
        'sort_by': 'voteups',
    }
    # Send the request and keep the response.
    res = requests.get(url, headers=headers, params=params)
    articles = res.json()
    # print(articles)
    data_ = articles['data']  # locate the article data
    for i in data_:
        list1 = [i['title'], i['url'], i['excerpt']]  # pack one article into a row
        writer.writerow(list1)
        articlelist.append(list1)
    offset = offset + 20  # each pass through the loop advances offset by 20
    if offset > 40:       # two pages crawled -- stop
        break
    # if articles['paging']['is_end'] == True:  # alternatively, stop once is_end is True
    #     break

print(articlelist)  # print to check
file1.close()
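# A sketch of the alternative stop condition hinted at in the commented-out lines above
# (an addition, not part of the original run): page through every article and stop when
# the API reports the end, instead of hard-coding two pages. It reuses url, headers and
# params as defined above.
offset = 0
all_articles = []
while True:
    params['offset'] = str(offset)
    articles = requests.get(url, headers=headers, params=params).json()
    for i in articles['data']:
        all_articles.append([i['title'], i['url'], i['excerpt']])
    if articles['paging']['is_end']:  # True once there are no more articles to fetch
        break
    offset += 20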