【shikaobang】 python爬虫脚本

2021-10-10 14:49:09
"""
事考帮更新url加密数字后，无法解码。只能用【<div class="title">相关推荐</div>】里面的链接来处理
解决办法：相关推荐是按题目顺序排列，以最后一个为起始网址，不断循环复制加密编码，起到原来的效果
"""
import pandas as pd
import urllib
import urllib2
from bs4 import BeautifulSoup
import codecs
import re


a1 = 101500 #需要自己修改起始值

urlname_list = []
url_name_start = u'/questionbank/5YmJvWgYm6' #填入查询到开始的urlname
url_name_end = u'/questionbank/G5mbgoM1aX' #填入查询到最后的urlname
urlname_list.append(url_name_start)
a = 1
b = 1
while True:
    url_name = "http://www.shikaobang.cn" + url_name_start
    user_agent = "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/2011122 Ubuntu/10.10 (maverick) Firefox/2.5.1"
    request = urllib2.Request(url_name, headers={'User-Agent':user_agent})
    html = urllib2.urlopen(request)
    html_data = BeautifulSoup(html,"html.parser")
    if html_data.find(name='a') is None:
        urlname_list.pop()
        url_name_start = urlname_list[-1]
        print "网页抓取失败，此时网址为：" + url_name_start
        continue

    for m in html_data.find_all(href=re.compile("/questionbank/")) :
        if m['href'] == url_name_end:
            urlname_list.append(m['href'])
            break
        else:
            urlname_list.append(m['href'])
            a = a + 1
    url_name_start = urlname_list[-1]
    if url_name_end == url_name_start:
        break
    print u"网页抓取成功，此时网址为：" + url_name_start
    print u"查询结果共" + str(a) + u"条"
print u"最终查询结果共" + str(a) + u"条"


print u'开始爬取网页'
#爬取网页
import pandas as pd
import urllib
import urllib2
from bs4 import BeautifulSoup
import codecs

import time
time_start=time.time()
"""
修改题目对应网页数值

"""
a2 = a1

for i in urlname_list:
    try:
        url_name = "http://www.shikaobang.cn" + i
        user_agent = "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/2011122 Ubuntu/10.10 (maverick) Firefox/2.5.1"
        request = urllib2.Request(url_name, headers={'User-Agent':user_agent})
        html = urllib2.urlopen(request)
        f = codecs.open('html/sz_'+str(a1),'w')
        f.write(html.read())
        f.close()
        a1 = a1 + 1
    except:
        print i
        pass
    continue
print "下次使用该编码作为起始值:" + str((int(a1/100)+1)*100)
print "爬取网页结束，开始处理文本" 


# -*- coding: utf-8 -*-
def html_chuli(html):
    """Extract the question fields from the markup of one question page.

    Args:
        html: page markup, in any form BeautifulSoup accepts.

    Returns:
        The string '0' when the page has no <div class="main-content">.
        Otherwise a 9-tuple:
            t_miaosu     -- 'content' attribute of the element whose name is
                            "description" (u'' when that element is absent)
            t_leixing    -- .string of the first <span> in the main content
                            (question type, per the original author's note)
            t_content    -- .string of the first div.question-title
            x_ABCD       -- per option, a list holding at most the first
                            whitespace-separated token of its text
            x_content    -- per option, the list of the remaining tokens
            z_xueze      -- .string of every 'label.actives' element followed
                            by every 'div.question-item.correct i' element
            t_news_title, t_news_typs, t_news_time
                         -- elements with class news-content-title,
                            news-typs and news-time, as returned by find_all

    Raises:
        IndexError: the main content has no <span> or no div.question-title.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    html_data = BeautifulSoup(html, "html.parser")

    # A page without the description meta tag used to raise TypeError here,
    # before the main-content check below had a chance to run.
    meta = html_data.find(attrs={'name': 'description'})
    t_miaosu = meta.get('content', u'') if meta is not None else u''

    t_news_title = html_data.find_all(attrs={'class': 'news-content-title'})
    t_news_typs = html_data.find_all(attrs={'class': 'news-typs'})
    t_news_time = html_data.find_all(attrs={'class': 'news-time'})

    tdata1 = html_data.find("div", attrs={'class': 'main-content'})
    if tdata1 is None:
        return '0'

    t_leixing = tdata1.select('span')[0].string
    # CSS selector: '.' matches a class name.
    t_content = tdata1.select('div.question-title')[0].string

    x_ABCD = []     # first token of each option (the option letter)
    x_content = []  # remaining tokens of each option
    for item in tdata1.select('div.question-item'):
        tokens = item.get_text().split()
        x_ABCD.append(tokens[:1])
        x_content.append(tokens[1:])

    # Marked answers: 'label.actives' first (choice questions, per the
    # original note), then the <i> inside a correct item (true/false questions).
    z_xueze = [tag.string for tag in tdata1.select('label.actives')]
    z_xueze.extend(tag.string for tag in tdata1.select('div.question-item.correct i'))

    return t_miaosu,t_leixing,t_content,x_ABCD,x_content,z_xueze,t_news_title,t_news_typs,t_news_time

#文本处理
import pandas as pd
import urllib
import urllib2
import re
import json
import random
from bs4 import BeautifulSoup
import codecs

"""
修改提取后对应文本编码
"""
for i in range(a2,a1):
    try:
        with open('html/sz_'+str(i), 'r') as f:
            s_1 = ""
            s_2 = ""
            t_n = ""
    
            contents = f.read().decode("utf-8", "ignore") #处理�
            t_miaosu,t_leixing,t_content,x_ABCD,x_content,z_xueze,t_news_title,t_news_typs,t_news_time = html_chuli(contents)

            for m in range(len(x_ABCD)):
                if x_ABCD[m][0]:
                    s1 = x_ABCD[m][0]
                else:
                    s1=""
                if x_content[m][0]:
                    s2 = x_content[m][0]
                else:
                    s2=""  
                
                s_1 = s_1 + s1 + ":" + s2 + "  "

            for n in range(len(z_xueze)):
                s_2 = s_2 + z_xueze[n].strip()
    
            for z in range(len(t_news_title)):
                if t_news_title[z]:
                    new1 = t_news_title[z].text
                else:
                    new1=""
                if t_news_typs[z]:
                    new2 = t_news_typs[z].text
                else:
                    new2=""
                if t_news_time[z]:
                    new3 = t_news_time[z].text
                else:
                    new3=""
                
                t_n = t_n + new1 + "|" + new2 + "|" + new3 + "&"
        
            if t_leixing is None:
                continue
                
            k1 = str(i) + "#" + t_miaosu.replace("\n", "") + "#" + t_leixing + "#" + t_content.replace(" ", "").replace("\n", "") + "#" + s_1.replace("\n", "") + "#" + s_2.replace("\n", "") + "#" + t_n.replace("\n", "")
            f1 = codecs.open(u'out/时政202011-20210325.txt','a',encoding="utf-8") #修改导出txt文件编号
            f1.write(k1 + "\n")
    except:
        f2 = codecs.open('out/fail_num.txt','a',encoding="utf-8")
        k2 = str(i)
        f2.write(k2 + "\n")
        print str(i) + u"号html文件导入失败！"
        f2.close()
        pass
    continue
               
f1.close()

print u"处理完毕！再次执行请修改“输出文件名”，并保存py文件，然后重新开始！！！"
　　此代码仅纪念作用，目前已不可用
码农公寓

相关文章