# requests + bs4: scrape data and store it in a database

import pymysql
import requests
from bs4 import BeautifulSoup
from lxml import etree
import time

# HTTP request headers shared by every request in this script; the
# User-Agent string identifies the client as desktop Chrome 80 on Windows 10.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/80.0.3987.163 Safari/537.36'
    ),
}


def bqg(url):
    """Crawl the site's navigation bar and scrape every category it links to.

    Fetches ``url``, selects the anchors matching ``#globalNavUL > li > a``,
    skips the first one, and calls ``da`` with each remaining anchor's text
    and its ``href`` appended to ``url``.

    Args:
        url: Root URL of the site; each navigation ``href`` is appended to it.
    """
    # ``headers`` must be passed by keyword: the second positional parameter
    # of ``requests.get`` is ``params``, which would place the User-Agent in
    # the query string instead of sending it as a request header.
    cent = requests.get(url, headers=headers).content
    data = BeautifulSoup(cent, 'lxml')
    title = data.select('#globalNavUL > li > a')[1:]
    for j in title:
        da(j.text, url + j['href'])


def da(text, urls, max_pages=99):
    """Walk the paginated listing of one category and scrape each entry on it.

    The part of ``urls`` before its first ``_`` is used as a prefix; pages
    ``<prefix>_1.html`` through ``<prefix>_<max_pages>.html`` are fetched in
    order, and ``ent`` is called for every ``span.mainSoftName > a`` anchor
    found on each page.

    Args:
        text: Category name, forwarded unchanged to ``ent``.
        urls: URL of the category's listing page.
        max_pages: Number of listing pages to request (defaults to 99, the
            page count that was previously hard-coded).
    """
    prefix = urls.split('_', 1)[0]
    for page in range(1, max_pages + 1):
        page_url = prefix + '_{}.html'.format(page)
        # ``headers`` must be passed by keyword: the second positional
        # parameter of ``requests.get`` is ``params``, not ``headers``.
        cent = requests.get(page_url, headers=headers).content
        data = BeautifulSoup(cent, 'lxml')
        for anchor in data.select('span.mainSoftName > a'):
            ent(text, 'http://www.qishus.com' + anchor['href'], anchor['title'])


def ent(t, ur, tit):
    """Scrape one detail page's download links and insert them into table ``de``.

    Fetches ``ur``, collects every ``#downAddress > a`` anchor, and inserts a
    single row made of the title, the category, the comma-terminated list of
    link hrefs, the text of the last link, and the current Unix timestamp.
    Prints '成功' when exactly one row was inserted and '失败' otherwise.

    Args:
        t: Category name, stored in the ``type`` column.
        ur: URL of the detail page to scrape.
        tit: Title, stored in the ``name`` column.
    """
    # ``headers`` must be passed by keyword: the second positional parameter
    # of ``requests.get`` is ``params``, not ``headers``.
    cent = requests.get(ur, headers=headers).content
    data = BeautifulSoup(cent, 'lxml')
    link = data.select('#downAddress > a')
    time_num = int(time.time())

    # Every href is followed by a comma (including the last one), which keeps
    # the stored format identical to what the concatenation loop produced.
    links = ''.join('{},'.format(k['href']) for k in link)
    # The content column holds the text of the last anchor, or '' if none.
    text = link[-1].text if link else ''
    res = (tit, t, links, text, time_num)

    # Placeholders are left unquoted so the driver escapes the scraped values
    # itself; building the statement with ``%`` formatting broke on titles
    # containing quotes and allowed SQL injection from page content.
    sql = ("insert into de(name, type, link, content, time) "
           "VALUES (%s, %s, %s, %s, %s)")

    # Connect only after the page has been fetched so the connection is not
    # held open during the HTTP request.
    conn = pymysql.connect(
        host='***.***.***.***',
        port=3306,
        user='****',
        passwd='****',
        db='****',
        charset='utf8'
    )
    try:
        with conn.cursor() as cur:
            sta = cur.execute(sql, res)
        # PyMySQL does not autocommit by default; without this the inserted
        # row is discarded when the connection closes.
        conn.commit()
    finally:
        # Always release the connection, even if the insert raised.
        conn.close()

    if sta == 1:
        print('成功')
    else:
        print('失败')


if __name__ == '__main__':
    # Script entry point: start the crawl from the site's root URL.
    start_url = 'http://www.qishus.com'
    bqg(start_url)


上一篇:STL 小白学习(7) list


下一篇:数据解析-bs4库使用之红楼梦全文文本爬取