「数据分析师的数据爬虫」网络爬虫数据的存储:MySQL和MongoDB

文章目录

内容介绍

本文以最简单的操作流程介绍如何保存Python网络爬虫抓取的数据,并通过2个完整的代码示例演示爬虫抓取的数据分别是如何保存到MySQL和MongoDB数据库中的。

存储MySQL应用实例

#coding=utf-8

import urllib
import urllib.request
import pymysql
import time
import requests
import datetime
import pandas as pd
from bs4 import BeautifulSoup

# Connection settings shared by every pymysql.connect() call below.
# NOTE(review): credentials are hard-coded for the tutorial — move them to
# config/environment variables in real use.
mysql_engine = {
    "host":"localhost", 
    "database":"sampledb", 
    "user":"root", 
    "password":"admin",
    "charset":"utf8"}
# Create the target database (idempotent) using a root connection.
def creat_database_sampledb():
    """Create database ``sampledb`` with utf8 charset if it does not exist."""
    root_config = {
        "host": "localhost",
        "user": "root",
        "password": "admin"}
    connection = pymysql.connect(**root_config)
    try:
        # cursor() as a context manager closes the cursor automatically
        with connection.cursor() as cur:
            cur.execute("Create Database If Not Exists sampledb CHARSET=utf8")
        connection.commit()
    finally:
        connection.close()
# Create the News headline table (id, title, url, date).
def createNewsTable():
    """Create table ``News`` if it does not already exist.

    Fix: the connection used to be opened *inside* ``try`` while ``finally``
    unconditionally called ``.close()`` — a failed connect therefore raised
    ``NameError`` instead of the real connection error.  The connection
    object was also misleadingly named ``corsor``.
    """
    createTbSql = (
        "Create Table If Not Exists News( "
        "id int primary key auto_increment, "
        "title varchar(100), "
        "url varchar(100), "
        "date date)")
    conn = pymysql.connect(**mysql_engine)  # connect OUTSIDE try (see docstring)
    try:
        with conn.cursor() as cursor:
            cursor.execute(createTbSql)
        conn.commit()
    finally:
        conn.close()
    return None
# Create the NewsBody detail table (id is a FK to News.id).
def createNewsBody():
    """Create table ``NewsBody`` if it does not already exist.

    Fix: the connection used to be opened *inside* ``try`` while ``finally``
    unconditionally called ``.close()`` — a failed connect therefore raised
    ``NameError`` instead of the real connection error.  The connection
    object was also misleadingly named ``corsor``.
    """
    createTbNews = (
        "Create Table If Not Exists NewsBody( "
        "id INT PRIMARY KEY, "
        "text text, "
        "FOREIGN KEY(id) REFERENCES News(id))")
    conn = pymysql.connect(**mysql_engine)  # connect OUTSIDE try (see docstring)
    try:
        with conn.cursor() as cursor:
            cursor.execute(createTbNews)
        conn.commit()
    finally:
        conn.close()
    return None
# Crawl the first two listing pages and store new headlines via to_mysql().
def start_crawler():
    """Fetch listing pages 1-2 of cctd.com.cn list-107 and save headlines.

    Stops early if a page yields no new rows.
    """
    # request headers are loop-invariant, build them once
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    for page_num in (1, 2):
        url = "http://www.cctd.com.cn/list-107-{}.html".format(page_num)
        print (url)
        req = urllib.request.Request(url, headers=headers)
        # site serves gbk-encoded pages
        content = urllib.request.urlopen(req).read().decode('gbk')
        one_page = get_page_news(content)
        time.sleep(1)
        if not one_page:
            break
        to_mysql(one_page)
        time.sleep(1)
    print ('新闻抓取完毕')
# Persist one page of (title, url, date) tuples into the News table.
def to_mysql(one_page):
    """Bulk-insert headline tuples into ``News``; id is auto-generated."""
    print (one_page)
    insert_sql = "insert into News(id,title,url,date) values(Null,%s,%s,%s)"
    connection = pymysql.connect(**mysql_engine)
    try:
        with connection.cursor() as cur:
            cur.executemany(insert_sql, one_page)
        connection.commit()
    finally:
        connection.close()
def get_page_news(content):
    """Parse one listing page and return new (title, url, date) tuples.

    Rows whose URL is already stored for the same date (per get_exist)
    are skipped.
    """
    soup = BeautifulSoup(content,'lxml')
    listing = soup.find('table',id='new_table')
    fresh_rows = []
    for row in listing.find_all('tr'):
        link = row.a
        pub_date = row.find('td',width='20%').string
        if link['href'] in get_exist(pub_date):
            continue
        fresh_rows.append((link.string, link['href'], pub_date))
    return fresh_rows
def get_exist(date):
    """Return the set of URLs already stored in ``News`` for *date*.

    Fix: the date used to be spliced into the SQL with ``str.format`` —
    now passed as a bound parameter, avoiding SQL injection and quoting
    bugs.  The needless ``commit()`` after a SELECT is also removed.
    """
    sql = "select url from News where date=%s"
    conn = pymysql.connect(**mysql_engine)
    try:
        with conn.cursor() as cursor:
            cursor.execute(sql, (date,))
            result = cursor.fetchall()
    finally:
        conn.close()
    # fetchall() yields ((url,), (url,), ...); flatten into a set of urls
    return set(*zip(*result))
# Fetch the article body for every headline that has no body row yet.
def get_new_body():
    """Download and store the full text for each unprocessed News row."""
    for news_id, news_url in get_news_linksfrom_database():
        body = get_news_text(news_url)
        # persist the body keyed by the News row id
        writer_news_body_to_database(news_id, body)
    print("新闻主体完毕!")
def get_news_linksfrom_database():
    """Return (id, url) rows from News that have no NewsBody entry yet."""
    query = """
    select News.id,News.url from News left join Newsbody
    on News.id = Newsbody.id where Newsbody.id is null;
    """
    connection = pymysql.connect(**mysql_engine)
    try:
        with connection.cursor() as cur:
            cur.execute(query)
            connection.commit()
            rows = cur.fetchall()
    finally:
        connection.close()
    return rows if rows else []
def get_news_text(url):
    """Fetch *url* and return the text of its ``div#Zoom`` article body.

    Returns None when the page has no ``div#Zoom``.
    Fix: the bare ``except:`` (which swallowed every exception, even
    KeyboardInterrupt) is replaced by an explicit None check on
    ``soup.find``.
    """
    html = requests.get(url)
    # trust the detected encoding over the declared one
    html.encoding = html.apparent_encoding
    soup = BeautifulSoup(html.text,'html.parser')
    zoom = soup.find('div',{'id':'Zoom'})
    return zoom.text if zoom is not None else None
def writer_news_body_to_database(id, news_body):
    """Insert one article body into NewsBody, keyed by its News row id."""
    print("INFO: Writing News ID:{} To Database...".format(id))
    insert_sql = """
    insert into Newsbody(id,text) values(%s,%s)
    """
    connection = pymysql.connect(**mysql_engine)
    try:
        with connection.cursor() as cur:
            cur.execute(insert_sql, (id, news_body))
        connection.commit()
    finally:
        connection.close()
# Script entry point: create the schema, then run the two-stage crawl.
if __name__ == '__main__':  
    creat_database_sampledb()  # create database sampledb
    createNewsTable()  # create the News headline table
    createNewsBody()  # create the NewsBody detail table
    '''爬取新闻简要'''
    start_crawler()  # stage 1: crawl headline listing pages into News
    '''爬取具体新闻'''
    get_new_body()  # stage 2: crawl each article body into NewsBody

存储MongoDB实例

#coding=utf-8

import urllib
import urllib.request
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
import pymongo 


# Obtain a MongoClient connected to the local MongoDB instance.
client = pymongo.MongoClient("localhost", 27017)
# Database handle used by all functions below; documents go in db.news.
db = client.news

# Crawl the first listing page and store headlines in MongoDB.
def start_crawler():
    """Fetch page 1 of cctd.com.cn list-13 and save its headlines.

    Fix: an ``urllib.request.Request`` object was built but never used —
    the page is actually fetched with ``requests.get`` — so the dead
    statement is removed.
    """
    page_num = 1
    while page_num<=1:
        url = "http://www.cctd.com.cn/list-13-{}.html".format(page_num)
        print (url)
        page_num += 1
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = { 'User-Agent' : user_agent }
        response = requests.get(url, headers=headers)
        content = response.text
        one_page = get_page_news(content)
        time.sleep(1)
        if one_page:
            to_mysql(one_page)
            time.sleep(1)
        else:
            break
    print ('新闻标题抓取完毕')
# NOTE(review): despite the name, this MongoDB variant only echoes the page
# to stdout — the actual insert happens inside get_page_news() via
# db.news.insert_one(); kept for API parity with the MySQL example.
def to_mysql(one_page):
    # Debug output only.
    print (one_page)
def get_page_news(content):
    """Parse one listing page, insert each headline document into MongoDB,
    and return the parsed (title, url, date) tuples."""
    soup = BeautifulSoup(content,'lxml')
    listing = soup.find('table',id='new_table')
    parsed = []
    for row in listing.find_all('tr'):
        headline = row.a.string
        link = row.a['href']
        pub_date = row.find('td',width='20%').string
        document = {'title':headline,'url':link,'date':pub_date,'type':'analysis','label':'www.cctd.com.cn'}
        # one document per headline; body is attached later by get_new_body()
        db.news.insert_one(document)
        parsed.append((headline, link, pub_date))
    return parsed


# Fetch and store the article body for every saved headline.
def get_new_body():
    """Download each stored headline's article body and attach it to the
    MongoDB document as field ``newsbody``.

    Fix: ``Collection.update`` is deprecated and removed in PyMongo 4.x;
    replaced with the equivalent ``update_one`` call.
    """
    link_list = get_news_linksfrom_database()
    for url in link_list['url']:
        news_body = get_news_text(url)
        # attach the body to the existing document matched by url
        db.news.update_one({'url':url},{"$set":{'newsbody':news_body}})
    print("新闻主体爬取完毕!")
def get_news_linksfrom_database():
    """Return a DataFrame (columns: url, _id) of stored cctd.com.cn headlines."""
    cursor = db.news.find(
        {'label':'www.cctd.com.cn','type':'analysis'},
        {'url':1,'_id':1})
    documents = list(cursor)
    return pd.DataFrame(documents, columns=['url','_id'])
def get_news_text(url):
    """Fetch *url* and return the ``div#Zoom`` article html as a string.

    Returns None when the page has no ``div#Zoom``.
    Fix: ``str(soup.find(...))`` never raises, so the old ``except``
    branch was unreachable and a missing div silently produced the
    literal string ``'None'``.  The missing-div case is now detected
    explicitly and genuinely returns None.
    """
    html = requests.get(url)
    # trust the detected encoding over the declared one
    html.encoding = html.apparent_encoding
    soup = BeautifulSoup(html.text,'html.parser')
    zoom = soup.find('div',{'id':'Zoom'})
    if zoom is None:
        print ("error")
        return None
    newsBody = str(zoom)
    print ("记录爬取完毕")
    return newsBody
# Script entry point: crawl headlines first, then attach article bodies.
if __name__ == '__main__':  
    '''爬取新闻简要'''
    start_crawler()  # stage 1: headlines into MongoDB
    '''爬取具体新闻'''
    get_new_body()  # stage 2: article bodies attached to each document
上一篇:聚合数据+新闻头条+数据入库+数据展示


下一篇:【D-News】国务院将区块链写入“十三五”国家信息化规划 《大数据白皮书2016》发布