目录
scrapy 爬取股票
stock.py
# -*- coding: utf-8 -*-
import scrapy
from items import StockstarItem, StockstarItemLoader
class StockSpider(scrapy.Spider):
    """Crawl the A-share ranking list on quote.stockstar.com, one page at a time."""

    name = 'stock'
    allowed_domains = ['quote.stockstar.com']
    start_urls = ['http://quote.stockstar.com/stock/ranklist_a_3_1_1.html']

    # Item field -> CSS selector, evaluated relative to one <tr> of the table.
    _FIELD_CSS = (
        ("code", "td:nth-child(1) a::text"),
        ("abbr", "td:nth-child(2) a::text"),
        ("last_trade", "td:nth-child(3) span::text"),
        ("chg_ratio", "td:nth-child(4) span::text"),
        ("chg_amt", "td:nth-child(5) span::text"),
        ("chg_ratio_5min", "td:nth-child(6) span::text"),
        ("volumn", "td:nth-child(7)::text"),
        ("turn_over", "td:nth-child(8)::text"),
    )

    def parse(self, response):
        """Yield one item per table row, then request the next ranking page."""
        # The page index is the trailing number in ..._<page>.html.
        current_page = int(response.url.split("_")[-1].split(".")[0])
        rows = response.css('#datalist tr')
        for row in rows:
            loader = StockstarItemLoader(item=StockstarItem(), selector=row)
            for field, css in self._FIELD_CSS:
                loader.add_css(field, css)
            yield loader.load_item()
        # An empty row set means we ran past the last page — stop paginating.
        if rows:
            next_page = current_page + 1
            next_url = response.url.replace(
                "{0}.html".format(current_page), "{0}.html".format(next_page))
            yield scrapy.Request(url=next_url, callback=self.parse)
items.py:
import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst
class StockstarItemLoader(ItemLoader):
    """Custom item loader: keep only the first value extracted for each field."""

    # Extraction yields lists; collapse each to its first element.
    default_output_processor = TakeFirst()
class StockstarItem(scrapy.Item):
    """One row of the A-share ranking table."""

    code = scrapy.Field()            # stock code
    abbr = scrapy.Field()            # stock short name
    last_trade = scrapy.Field()      # latest price
    chg_ratio = scrapy.Field()       # change percentage
    chg_amt = scrapy.Field()         # change amount
    chg_ratio_5min = scrapy.Field()  # 5-minute change percentage
    volumn = scrapy.Field()          # trading volume (field name kept as-is for callers)
    turn_over = scrapy.Field()       # turnover
middlewares.py:
from scrapy import signals
class StockstarSpiderMiddleware(object):
    """Pass-through spider middleware (default Scrapy template behaviour)."""

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and hook it up to the spider_opened signal."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        # Returning None lets Scrapy continue processing this response.
        return None

    def process_spider_output(self, response, result, spider):
        # Forward every item/request produced by the spider unchanged.
        for entry in result:
            yield entry

    def process_spider_exception(self, response, exception, spider):
        # No recovery behaviour here; other middlewares may handle it.
        pass

    def process_start_requests(self, start_requests, spider):
        # Start requests pass through untouched.
        for request in start_requests:
            yield request

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
pipelines.py:
class StockstarPipeline(object):
    """No-op pipeline: every item is passed along unmodified."""

    def process_item(self, item, spider):
        # Returning the item keeps it flowing to later pipelines/exporters.
        return item
settings.py:
from scrapy.exporters import JsonLinesItemExporter
# By default the exported JSON escapes Chinese text into hard-to-read \uXXXX sequences.
# Subclass the exporter and force the parent's ensure_ascii to False so the original
# characters are written verbatim.
class CustomJsonLinesItemExporter(JsonLinesItemExporter):
    """JSON-lines exporter that writes non-ASCII characters as-is."""

    def __init__(self, file, **kwargs):
        # Direct base-class call; single inheritance, so equivalent to super().
        JsonLinesItemExporter.__init__(self, file, ensure_ascii=False, **kwargs)
# Register the custom exporter above for the "json" feed format.
FEED_EXPORTERS = {'json': 'stockstar.settings.CustomJsonLinesItemExporter'}

BOT_NAME = 'stockstar'

SPIDER_MODULES = ['stockstar.spiders']
NEWSPIDER_MODULE = 'stockstar.spiders'

# Respect robots.txt (Scrapy's default for new projects).
ROBOTSTXT_OBEY = True
main.py
from scrapy.cmdline import execute

# Guard the entry point: scrapy's execute() takes over the process (it calls
# sys.exit), so it must not run as a side effect of importing this module.
if __name__ == "__main__":
    # Equivalent to the shell command: scrapy crawl stock -o items.json
    execute(["scrapy", "crawl", "stock", "-o", "items.json"])
scrapy.cfg
[settings]
default = stockstar.settings
[deploy]
#url = http://localhost:6800/
project = stockstar
问题:
编码问题:
import requests

# Fetch HeWeather's city list; declare UTF-8 explicitly so the Chinese text
# decodes correctly (this is the encoding problem the snippet demonstrates).
url = 'https://cdn.heweather.com/china-city-list.txt'
response = requests.get(url)
response.encoding = 'utf8'
data = response.text

# splitlines() handles \r\n, \r and \n uniformly. The original split('\r')
# left a leading '\n' on every element after the first, which corrupted the
# fixed-width slice below.
data_1 = data.splitlines()
print(data_1)

# Drop the 3 header lines in one slice instead of repeated remove() calls
# (remove(data_1[0]) shifts the whole list on every iteration).
data_1 = data_1[3:]

# Print the first 11 characters of each row (the city-ID column).
for item in data_1:
    print(item[0:11])