import pymysql
import requests
from bs4 import BeautifulSoup
import time
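
# Crawler for www.qishus.com: walks the category navigation, pages through
# each category's list pages, and stores every book's download links in MySQL.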
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
}
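
# Walk the top navigation bar and crawl every category; the [1:] slice
# drops the first nav entry.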
def bqg(url):
    cent = requests.get(url, headers=headers).content
    data = BeautifulSoup(cent, 'lxml')
    title = data.select('#globalNavUL > li > a')[1:]
    for j in title:
        da(j.text, url + j['href'])
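
# Page through one category; its list pages follow the pattern <prefix>_<page>.html.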
def da(text, urls):
    u = urls.split('_', 1)
    for j in range(1, 100):
        con = u[0] + '_{}.html'.format(j)
        cent = requests.get(con, headers=headers).content
        data = BeautifulSoup(cent, 'lxml')
        name = data.select('span.mainSoftName > a')
        for i in name:
            ent(text, 'http://www.qishus.com' + i['href'], i['title'])
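
# Fetch one book's detail page, join its download links into a single
# comma-separated string, and insert a row into the `de` table.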
def ent(t, ur, tit):
    conn = pymysql.connect(
        host='***.***.***.***',
        port=3306,
        user='****',
        passwd='****',
        db='****',
        charset='utf8'
    )
    cur = conn.cursor()
    cent = requests.get(ur, headers=headers).content
    data = BeautifulSoup(cent, 'lxml')
    link = data.select('#downAddress > a')
    time_num = int(time.time())
    links = ''
    text = ''
    for k in link:
        links = links + '{},'.format(k['href'])
        text = k.text
    res = (tit, t, links, text, time_num)
    # Alternatively, append the record to a txt file instead of MySQL:
    # with open('1.txt', 'a') as f:
    #     f.write('\n' + tit + '---' + t + '---' + links + '---' + text + '---' + str(time_num))
    # Parameterized query: let the driver escape the values.
    sql = "INSERT INTO de(name, type, link, content, time) VALUES (%s, %s, %s, %s, %s)"
    sta = cur.execute(sql, res)
    conn.commit()
    if sta == 1:
        print('insert ok')
    else:
        print('insert failed')
    cur.close()
    conn.close()

if __name__ == '__main__':
    bqg('http://www.qishus.com')