Recently I decided to take a crack at Python web scraping. The target was https://book.douban.com/top250, which is pretty simple, but the frequent requests got me punished by the site: my IP was banned. So what to do? Just switch IPs! Since I program by searching Baidu, after a round of searching, the answer to a banned IP turned out to be building an IP proxy pool, using free IPs found online to access the pages you want to scrape. But since my skills weren't quite up to it (and I'm a bit lazy), I found an open-source GitHub project: Python3WebSpider/ProxyPool, an efficient proxy pool with a getter, tester and server. Here is the address: https://github.com/Python3WebSpider/ProxyPool
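Once the pool is running, it hands out a random usable proxy over a simple HTTP endpoint. Here is a minimal sketch of fetching one, assuming the project's default setup where the API serves on localhost:5555 (check its README if you changed the port):

import requests

PROXY_POOL_URL = 'http://localhost:5555/random'  # default ProxyPool endpoint (assumption)

def get_proxy():
    # Ask the pool for one random proxy; returns a string like 'ip:port',
    # or None if the pool is unreachable.
    try:
        resp = requests.get(PROXY_POOL_URL, timeout=5)
        if resp.status_code == 200:
            return resp.text
    except requests.ConnectionError:
        return None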
Below is the crawler source code. The database used is sqlite3.
import time
# import urllib.request, urllib.error
import requests
# tip: I originally used urllib.request here, but later found requests nicer to work with
from bs4 import BeautifulSoup
import sqlite3
import re
def main():
    Url = 'https://book.douban.com/top250?start='
    data = getText(Url)
    savepath = 'book.sql'
    save(data, savepath)

# regexes for pulling the title, cover image and publishing details out of each list item
text = re.compile(r'<a .*title="(.*?)">.*?</a>', re.S)
img = re.compile(r'<img src="(.*?)" .*>.*', re.S)
details = re.compile(r'<p class="pl">(.*?)</p>', re.S)
def getText(Url):
    data = []
    for i in range(0, 10):  # 10 pages, 25 books per page
        url = Url + str(i * 25)
        html = askURL(url)
        soup = BeautifulSoup(html, 'html.parser')
        for item in soup.find_all('tr', class_="item"):
            d = []
            item = str(item)
            txt = re.findall(text, item)
            imgs = re.findall(img, item)
            detail = re.findall(details, item)
            d.append(''.join(txt))
            d.append(''.join(imgs))
            d.append(''.join(detail))
            data.append(d)
    return data
def askURL(Url):
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
    }
    proxies = {'http': 'ip:port'}  # fill in your proxy IP here, e.g. one fetched from the proxy pool
    req = requests.get(Url, headers=head, proxies=proxies, timeout=10)
    html = req.text
    return html
def save(data, savepath):
    init_db(savepath)
    conn = sqlite3.connect(savepath)
    cur = conn.cursor()
    # a parameterized insert avoids hand-quoting each field (which breaks on titles containing quotes)
    sql = '''
        insert into bookTop(
        book_name, pic_link, detail
        ) values(?, ?, ?)
    '''
    for row in data:
        cur.execute(sql, row)
    conn.commit()
    cur.close()
    conn.close()
def init_db(savepath):
    # "if not exists" lets the script run more than once without crashing
    createSql = '''
        create table if not exists bookTop
        (
            id integer primary key autoincrement,
            book_name varchar,
            pic_link text,
            detail text
        )'''
    con = sqlite3.connect(savepath)
    cur = con.cursor()
    cur.execute(createSql)
    con.commit()
    con.close()
if __name__ == '__main__':
    main()
    print('ok')
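If you wire the proxy pool from earlier into the crawler, askURL can grab a fresh proxy for every request and retry when one dies. A rough sketch, meant to drop into the same script, assuming the hypothetical get_proxy() helper from the ProxyPool sketch above (the retry count is just illustrative):

def askURL_with_pool(Url, retries=3):
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
    }
    for _ in range(retries):
        proxy = get_proxy()  # hypothetical helper from the ProxyPool sketch above
        proxies = {'http': 'http://' + proxy} if proxy else None
        try:
            req = requests.get(Url, headers=head, proxies=proxies, timeout=10)
            return req.text
        except requests.RequestException:
            continue  # this proxy is likely dead; grab a new one and retry
    return ''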
Then I wrapped it into an API, using the Flask framework.
import sqlite3
from flask import Flask, jsonify
from gevent import pywsgi

app = Flask(__name__)

@app.route('/getData', methods=['POST'])  # request path and method
def getData():
    con = sqlite3.connect('book.sql')
    cur = con.cursor()
    sql = 'select book_name,pic_link,detail from bookTop'
    data = []
    try:
        cur.execute(sql)
        book_all = cur.fetchall()
        for i in book_all:
            data.append(i)
    except Exception as e:
        print(e)
        print('query failed')
    finally:
        cur.close()
        con.close()
    return jsonify(data)

if __name__ == '__main__':
    print('server starting')
    server = pywsgi.WSGIServer(('0.0.0.0', 50), app)  # port 50; ports below 1024 may need admin rights
    server.serve_forever()
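To check that the service works, you can hit the endpoint with a quick POST. A minimal test, assuming the host and port configured above:

import requests

resp = requests.post('http://127.0.0.1:50/getData')
print(resp.json()[:3])  # each row is [book_name, pic_link, detail]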
For the frontend I used Electron with vue-cli; if you're interested, see 马梓东/Vue electron on Gitee.com.