python爬虫 双色球数据更新
-
解析网站并获取数据
# 彩票数据所在的url url = 'http://datachart.500.com/ssq/' # 提取数据 response = requests.get(url, headers={"User-Agent": UserAgent().chrome}) # 通过xpath去解析 e = etree.HTML(response.text) date_times = e.xpath('//tbody[@id="tdata"]/tr/td[1]/text()') trs = e.xpath('//tbody[@id="tdata"]/tr[not(@class)]')
-
链接数据库
# 链接数据库 client = pymysql.connect(host='localhost', port=3306, user='root', password='123456', charset='utf8', db='bangumi') cursor = client.cursor()
-
普通获取数据
# 插入数据的sql sql = 'insert into doubleballs values(0,%s,%s,%s)' for data_time, tr in zip(date_times, trs): red_ball = '-'.join(tr.xpath('./td[@class="chartBall01"]/text()')) blue_ball = tr.xpath('./td[@class="chartBall02"]/text()')[0] print("第" + data_time + "红球是:" + red_ball + " 蓝球:" + blue_ball) cursor.execute(sql, [data_time, red_ball, blue_ball]) client.commit()
-
更新数据这部分从数据库中获取数据,然后反转顺序,index作为计数器,循环遍历假如有新数据要更新,那result返回值为0,index+1。
# 查看数据是否存在 select_new_sql = "select * from doubleballs where date_time = %s" date_times.reverse() # 记录有多少条新数据 index = 0 for data_time in date_times: result = cursor.execute(select_new_sql, [data_time]) # 判断数据是否存在 if result == 1: break index+=1
-
数据顺序反转,按照网站上xpath写法获取第1个数据,即最新的数据放到数据库中。
# 数据从新到旧排序 trs.reverse() for i in range(index): # 提取红球 red_ball = '-'.join(trs[i].xpath('./td[@class="chartBall01"]/text()')) # 提取蓝球 blue_ball = trs[i].xpath('./td[@class="chartBall02"]/text()')[0] print("第" + date_times[i] + "红球是:" + red_ball + " 蓝球:" + blue_ball) cursor.execute(sql, [date_times[i], red_ball, blue_ball]) client.commit()
-
完整代码
import requests from fake_useragent import UserAgent from lxml import html import pymysql etree = html.etree # 彩票数据所在的url url = 'http://datachart.500.com/ssq/' # 提取数据 response = requests.get(url, headers={"User-Agent": UserAgent().chrome}) # 通过xpath去解析 e = etree.HTML(response.text) date_times = e.xpath('//tbody[@id="tdata"]/tr/td[1]/text()') trs = e.xpath('//tbody[@id="tdata"]/tr[not(@class)]') # 链接数据库 client = pymysql.connect(host='localhost', port=3306, user='root', password='123456', charset='utf8', db='bangumi') cursor = client.cursor() # 插入数据的sql sql = 'insert into doubleballs values(0,%s,%s,%s)' # 查看数据是否存在 select_new_sql = "select * from doubleballs where date_time = %s" date_times.reverse() # 记录有多少条新数据 index = 0 for data_time in date_times: reslut = cursor.execute(select_new_sql, [data_time]) # 判断数据是否存在 if reslut == 1: break index += 1 # 数据从新到旧排序 trs.reverse() for i in range(index): # 提取红球 red_ball = '-'.join(trs[i].xpath('./td[@class="chartBall01"]/text()')) # 提取蓝球 blue_ball = trs[i].xpath('./td[@class="chartBall02"]/text()')[0] print("第" + date_times[i] + "红球是:" + red_ball + " 蓝球:" + blue_ball) cursor.execute(sql, [date_times[i], red_ball, blue_ball]) client.commit() # for data_time, tr in zip(date_times, trs): # red_ball = '-'.join(tr.xpath('./td[@class="chartBall01"]/text()')) # blue_ball = tr.xpath('./td[@class="chartBall02"]/text()')[0] # print("第" + data_time + "红球是:" + red_ball + " 蓝球:" + blue_ball) # cursor.execute(sql, [data_time, red_ball, blue_ball]) # client.commit() cursor.close() client.close()
-
这样排序会让更新之后最新的在数据最后,但是一开始排序的时候不会出现问题,是从新到旧的排序。于是因为有点强迫症,最后还是改成从旧到新排序。只需要改动几行代码即可,反转数据在提取红球蓝球数据前,保证数据最新,判断时每次存在就让
index+1
,之后加入index = count - index
其中count = data_time.__len__()
。接着将循环中的i
变成index-i-1
。最终代码如下,import requests from fake_useragent import UserAgent from lxml import html import pymysql etree = html.etree # 彩票数据所在的url url = 'http://datachart.500.com/ssq/' # 提取数据 response = requests.get(url, headers={"User-Agent": UserAgent().chrome}) # 通过xpath去解析 e = etree.HTML(response.text) date_times = e.xpath('//tbody[@id="tdata"]/tr/td[1]/text()') trs = e.xpath('//tbody[@id="tdata"]/tr[not(@class)]') count =date_times.__len__() # 链接数据库 client = pymysql.connect(host='localhost', port=3306, user='root', password='123456', charset='utf8', db='bangumi') cursor = client.cursor() # 插入数据的sql sql = 'insert into doubleballs values(0,%s,%s,%s)' # 查看数据是否存在 select_new_sql = "select * from doubleballs where date_time = %s" # date_times.reverse() # 记录有多少条新数据 index = 0 for data_time in date_times: reslut = cursor.execute(select_new_sql, [data_time]) # 判断数据是否存在 if reslut == 1: index += 1 index = count - index # 数据从新到旧排序 # trs.reverse() date_times.reverse() trs.reverse() for i in range(index): # 提取红球 red_ball = '-'.join(trs[index-i-1].xpath('./td[@class="chartBall01"]/text()')) # 提取蓝球 blue_ball = trs[index-i-1].xpath('./td[@class="chartBall02"]/text()')[0] print("第" + date_times[index-i-1] + "红球是:" + red_ball + " 蓝球:" + blue_ball) cursor.execute(sql, [date_times[index-i-1], red_ball, blue_ball]) client.commit() # for data_time, tr in zip(date_times, trs): # red_ball = '-'.join(tr.xpath('./td[@class="chartBall01"]/text()')) # blue_ball = tr.xpath('./td[@class="chartBall02"]/text()')[0] # print("第" + data_time + "红球是:" + red_ball + " 蓝球:" + blue_ball) # cursor.execute(sql, [data_time, red_ball, blue_ball]) # client.commit() cursor.close() client.close()