First version
# -*- coding: utf-8 -*-
import scrapy
import requests
from lxml import etree
from selenium import webdriver
from scrapy.http.response.html import HtmlResponse
import re


class WxappSpider(scrapy.Spider):
    name = 'wxapp'
    allowed_domains = ['wxapp-union.com']
    start_urls = ['http://www.wxapp-union.com/portal.php?mod=list&catid=2&page=1']

    def parse(self, response):
        result = response.text
        print("*" * 60)
        print(result)
        print("*" * 60)
        # every recommended-article block on the list page
        divs = re.findall(r'<div\sclass="mbox_list\srecommend_article_list\scl">(.*?)</div>', result, re.DOTALL)
        for div in divs:
            print("=" * 40)
            print(div)
            print()
        # next_url = re.findall(r'<div\sclass="pg">(.*?)</div>', result, re.DOTALL)[0]
        # print("=" * 40)
        # the <a> right after </strong> (the current page number) is the next-page link
        next_url = re.findall(r'</strong><a\shref="(.*?)">.*?</a>', result, re.DOTALL)[0]
        print(next_url)
        # the href comes out HTML-escaped ("&amp;"); drop the first "amp;" so the URL resolves
        next_url = re.sub(r"amp;", "", next_url, 1)
        # follow the next page with the same callback; dont_filter keeps the dupe filter from dropping it
        yield scrapy.Request(next_url, callback=self.parse, dont_filter=True)
        # next_url = "".join(next_url)
        # next_url = re.findall(r'href="(.*?)"', next_url, re.DOTALL)[0]
        # print(type(next_url))
        # print(next_url)
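For reference, once this file sits under a Scrapy project's spiders/ directory, the spider runs with scrapy crawl wxapp; each request prints the raw page, the extracted article blocks, and the next-page URL to the console.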
This version is only an attempt at multi-page crawling by yielding follow-up requests.
The scraped href carried extra "amp;" characters: the & signs in the URL are HTML-escaped as &amp; in the page source, so the raw regex match keeps the entities.
Fortunately, a careful comparison with the real address showed that the scraped URL was not the actual one and only ever reached the first page.
After stripping the extra substring, the crawl succeeded.
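A more thorough fix is Python's standard-library html.unescape, which decodes every "&amp;" back to "&" rather than only the first one (the single-count re.sub above leaves later entities in place when the URL carries several parameters). A minimal standalone sketch; the sample HTML below is made up for illustration, not copied from the site:

import re
from html import unescape

# hypothetical snippet in the shape the spider's regex expects
sample = '</strong><a href="portal.php?mod=list&amp;catid=2&amp;page=2">next</a>'
raw_href = re.findall(r'</strong><a\shref="(.*?)">.*?</a>', sample, re.DOTALL)[0]

print(raw_href)            # portal.php?mod=list&amp;catid=2&amp;page=2
print(unescape(raw_href))  # portal.php?mod=list&catid=2&page=2  -- all entities decoded

Inside the spider, the unescaped href could additionally be passed through response.urljoin before building the next Request, in case the link ever turns out to be relative.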
The extracted divs are not parsed further or saved yet.
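As a possible next step, each captured div could be parsed with lxml (already imported in the spider) to pull out article titles and links, and the results yielded as item dicts so Scrapy's feed export can save them. A rough sketch only, assuming each article in the block is a plain <a href> tag; the real markup would need to be checked against the page:

from lxml import etree

def parse_div(div_html):
    """Yield {'title', 'url'} dicts for every link found in one captured div."""
    tree = etree.HTML(div_html)
    for a in tree.xpath('//a[@href]'):
        yield {
            'title': a.xpath('string(.)').strip(),
            'url': a.get('href'),
        }

# Inside WxappSpider.parse the results could then be yielded as items:
# for div in divs:
#     for item in parse_div(div):
#         yield item
# and written out with, for example:  scrapy crawl wxapp -o articles.json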