spiders/first.py
# -*- coding: utf-8 -*-
import scrapy
from selenium import webdriver
from xinwen.items import XinwenItem

# Baidu AI (AipNlp) client used for keyword extraction
from aip import AipNlp

APP_ID = ''
API_KEY = ''
SECRET_KEY = ''
client = AipNlp(APP_ID, API_KEY, SECRET_KEY)


class FirstSpider(scrapy.Spider):
    name = 'first'
    start_urls = ['https://news.163.com/']
    # Shared Selenium browser; the downloader middleware reuses it to render
    # the JavaScript-driven section pages.
    bro = webdriver.Chrome(executable_path=r'C:\Users\Administrator.DESKTOP-9DN4SRE\Downloads\Compressed\chromedriver_win32\chromedriver.exe')
    model_urls = []

    def parse(self, response):
        # Navigation-bar entries on the 163.com home page
        li_list = response.xpath('//*[@id="index2016_wrap"]/div[1]/div[2]/div[2]/div[2]/div[2]/div/ul/li')
        # index = [3, 4, 6, 7, 8]
        index = [3]
        for i in index:
            model_url = li_list[i].xpath('./a/@href').extract_first()
            self.model_urls.append(model_url)
        for url in self.model_urls:
            yield scrapy.Request(url=url, callback=self.parse_mode)

    def parse_mode(self, response):
        # News cards on the section page (this response is replaced with the
        # Selenium-rendered page source by XinwenDownloaderMiddleware)
        div_list = response.xpath('/html/body/div/div[3]/div[4]/div[1]/div/div/ul/li/div/div')
        for div in div_list:
            title = div.xpath('./a/img/@alt').extract_first()
            div_url = div.xpath('./a/@href').extract_first()
            if div_url:
                item = XinwenItem()
                item['title'] = title
                yield scrapy.Request(url=div_url, callback=self.content_new_mode, meta={'item': item})

    def content_new_mode(self, response):
        item = response.meta['item']
        # //*[@id="endText"]/p
        content = response.xpath('//*[@id="endText"]/p/text()').extract()
        content = ''.join(content)
        item['content'] = content.replace(u'\xa0', u'')
        all_tags = "标签:"  # "标签" means "tags"
        tagss = client.keyword(item['title'], item['content'])  # returns a dict
        add_tags = tagss['items']
        for i in add_tags:
            all_tags = all_tags + i['tag'] + " "
        item['tags'] = all_tags
        yield item

    def closed(self, spider):
        # Shut down the shared browser when the spider finishes
        self.bro.quit()
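For reference, `client.keyword(title, content)` returns a dict whose `items` list holds one entry per extracted keyword; the spider only reads the `tag` field of each entry. Below is a minimal offline sketch of the tag-string assembly so it can be tested without Baidu credentials; the `fake_response` payload (including the `score` field) is a hand-written assumption shaped like the fields the spider actually uses, not a recorded API response.

# Sketch: build the tag string the same way content_new_mode does,
# using a hypothetical stand-in for the AipNlp keyword() response.
def build_tag_string(keyword_response):
    all_tags = "标签:"  # "标签" means "tags"
    for entry in keyword_response['items']:
        all_tags = all_tags + entry['tag'] + " "
    return all_tags

fake_response = {'items': [{'tag': '新闻', 'score': 0.9}, {'tag': '网易', 'score': 0.8}]}
print(build_tag_string(fake_response))  # 标签:新闻 网易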
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class XinwenItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    content = scrapy.Field()
    tags = scrapy.Field()
middlewares.py
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from time import sleep

from scrapy import signals
from scrapy.http import HtmlResponse


class XinwenDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    def process_request(self, request, spider):
        return None

    def process_response(self, request, response, spider):
        # For the dynamically rendered section pages, replace the original
        # response with the page source rendered by the spider's Selenium browser.
        model_urls = spider.model_urls
        bro = spider.bro
        if request.url in model_urls:
            bro.get(request.url)
            sleep(1)
            page_text = bro.page_source
            return HtmlResponse(url=request.url, body=page_text, encoding='utf-8', request=request)
        else:
            return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass
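The fixed `sleep(1)` in `process_response` may return before the section page has finished rendering, or wait longer than necessary. A possible alternative is an explicit Selenium wait on the document's ready state; the sketch below is illustrative (the timeout and the readiness check are assumptions, not taken from the original project).

# Sketch: fetch a URL in the shared browser and wait until the page
# reports itself fully loaded before returning the rendered HTML.
from selenium.webdriver.support.ui import WebDriverWait

def get_rendered_page(bro, url, timeout=10):
    bro.get(url)
    WebDriverWait(bro, timeout).until(
        lambda driver: driver.execute_script('return document.readyState') == 'complete'
    )
    return bro.page_source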
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

import pymysql


class XinwenPipeline(object):
    # Collects items in memory so the total count can be printed at the end
    all_data = []

    def process_item(self, item, spider):
        self.all_data.append(item)
        print(item['tags'])
        return item

    def close_spider(self, spider):
        print(len(self.all_data))


# Store the scraped data in MySQL
class MysqlPileLine(object):
    conn = None
    cursor = None

    def open_spider(self, spider):
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root',
                                    password='mima', db='wangyi', charset='utf8')

    def process_item(self, item, spider):
        title = item['title']
        content = item['content']
        tags = item['tags']
        self.cursor = self.conn.cursor()
        sql = 'insert into wangyi values ("%s","%s","%s")' % (title, content, tags)
        try:
            self.cursor.execute(sql)
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
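The string-interpolated INSERT in `MysqlPileLine.process_item` breaks whenever a title or article body contains a double quote, and it is open to SQL injection. A sketch of the same insert using pymysql's parameter binding is shown below; the explicit column list (`title`, `content`, `tags`) is an assumption about the layout of the `wangyi` table.

# Sketch: parameterized insert, letting pymysql escape the values.
import pymysql

def insert_item(conn, item):
    # Assumes the wangyi table has (title, content, tags) columns.
    sql = 'INSERT INTO wangyi (title, content, tags) VALUES (%s, %s, %s)'
    with conn.cursor() as cursor:
        cursor.execute(sql, (item['title'], item['content'], item['tags']))
    conn.commit()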
settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for xinwen project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'xinwen'

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'

SPIDER_MODULES = ['xinwen.spiders']
NEWSPIDER_MODULE = 'xinwen.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'xinwen (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

LOG_LEVEL = 'ERROR'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'xinwen.middlewares.XinwenSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'xinwen.middlewares.XinwenDownloaderMiddleware': 543,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'xinwen.pipelines.XinwenPipeline': 300,
    'xinwen.pipelines.MysqlPileLine': 301,
    #'xinwen.pipelines.BaiduAiPipeline': 301,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
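ITEM_PIPELINES enables MysqlPileLine, which expects the `wangyi` database and table to exist before the crawl starts. The one-time setup sketch below creates them with pymysql; the column names and types are assumptions consistent with the insert in pipelines.py, not a schema taken from the original project.

# Sketch: create the MySQL objects the MysqlPileLine pipeline connects to.
import pymysql

conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root',
                       password='mima', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute('CREATE DATABASE IF NOT EXISTS wangyi CHARACTER SET utf8')
    cursor.execute(
        'CREATE TABLE IF NOT EXISTS wangyi.wangyi ('
        'title VARCHAR(255), content TEXT, tags VARCHAR(255))'
    )
conn.commit()
conn.close()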