settings.py:
from fake_useragent import UserAgent

BOT_NAME = 'carhome'

SPIDER_MODULES = ['carhome.spiders']
NEWSPIDER_MODULE = 'carhome.spiders'

ROBOTSTXT_OBEY = False

# Note: settings are evaluated once at startup, so this picks a single
# random User-Agent for the whole crawl.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': UserAgent().random,
}

ITEM_PIPELINES = {
    # 'carhome.pipelines.CarhomePipeline': 300,
    'scrapy.pipelines.images.ImagesPipeline': 1,
}

# Use a raw string so the backslashes are not treated as escape sequences.
IMAGES_STORE = r"D:\python\scrapy_demo\carhome\carhome\images"
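One caveat with putting UserAgent().random in DEFAULT_REQUEST_HEADERS: because settings are read only once, every request in the crawl shares the same header. If you want a fresh User-Agent per request, a small downloader middleware does it. A minimal sketch (the RandomUserAgentMiddleware class and the 543 priority are illustrative additions, not part of the original project):

# carhome/middlewares.py -- hypothetical addition, not in the original project
from fake_useragent import UserAgent


class RandomUserAgentMiddleware:
    def __init__(self):
        self.ua = UserAgent()

    def process_request(self, request, spider):
        # Overwrite the header on every outgoing request.
        request.headers['User-Agent'] = self.ua.random

Enable it in settings.py:

DOWNLOADER_MIDDLEWARES = {
    'carhome.middlewares.RandomUserAgentMiddleware': 543,
}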
items.py:
import scrapy


class CarhomeItem(scrapy.Item):
    category = scrapy.Field()    # category title scraped from the page
    image_urls = scrapy.Field()  # read by ImagesPipeline (IMAGES_URLS_FIELD default)
    # ImagesPipeline writes its download results to item['images'] by default;
    # naming this field 'imgs' would raise a KeyError when the pipeline runs.
    images = scrapy.Field()
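Out of the box, ImagesPipeline dumps every file into a single full/ directory under IMAGES_STORE. If you want the files grouped by the category field instead, you can subclass the pipeline. A sketch, assuming Scrapy >= 2.4 (the CarhomeImagesPipeline name is mine; register it in ITEM_PIPELINES in place of the stock pipeline):

# carhome/pipelines.py -- a sketch, not part of the original project
from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline


class CarhomeImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # Carry the category along so file_path can group files by it.
        for url in item['image_urls']:
            yield Request(url, meta={'category': item['category']})

    def file_path(self, request, response=None, info=None, *, item=None):
        # The default path is "full/<sha1>.jpg"; swap the prefix for the category.
        path = super().file_path(request, response=response, info=info, item=item)
        category = request.meta.get('category', 'uncategorized')
        return path.replace('full/', f'{category}/')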
spiders/carhome_spider.py:
import scrapy

from carhome.items import CarhomeItem


class CarhomeSpiderSpider(scrapy.Spider):
    name = 'carhome_spider'
    # Must match the start URL's domain, or follow-up requests get filtered
    # as offsite ('car.autohome.com' alone would not cover car.autohome.com.cn).
    allowed_domains = ['car.autohome.com.cn']
    start_urls = ['https://car.autohome.com.cn/pic/series/66.html#pvareaid=3454438']

    def parse(self, response):
        # The first uibox is the page header; the rest are the image categories.
        divs = response.xpath("//div[@class='uibox']")[1:]
        for div in divs:
            category = div.xpath('.//div[@class="uibox-title"]/a/text()').get()
            urls = div.xpath(".//ul/li/a/img/@src").getall()
            # The src attributes are protocol-relative; make them absolute.
            urls = [response.urljoin(url) for url in urls]
            yield CarhomeItem(category=category, image_urls=urls)
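With the three files in place, start the crawl from the project root:

scrapy crawl carhome_spider

The downloaded thumbnails land under IMAGES_STORE (in full/ with the stock pipeline, or in per-category folders with the subclass sketched above).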