首先读取 .xls 文件,然后根据表格中的 ISBN 在京东上逐条搜索,再从商品页面提取所需信息(ISBN、出版时间、版次)并保存到另一个 .xls 文件中。
每次运行 .py 文件后会打开浏览器并弹出登录页面,请在 30 秒内手动完成登录;30 秒后脚本自动开始爬取。
#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""Scrape book metadata (ISBN / publication date / edition) from JD.com.

Reads ISBNs from column 4 of ``table.xls`` (sheet ``Sheet1``), searches each
one on jd.com with a Selenium-driven Chrome browser, opens the first search
result, scrapes its parameter list, and writes the fields to ``answer.xls``.
On startup the user must log in manually within 30 seconds; the session
cookies are saved to ``cookies.txt`` and replayed before every search.
"""

import json
from time import sleep

import xlrd
import xlwt
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By

# Fallback record: these values remain for any field that cannot be scraped.
data_dict = {'ISBN': '0000000000000', '出版时间': '0000-00-00', '版次': '1'}

# Single shared browser session used for login and for all scraping.
driver = webdriver.Chrome()


def test01_login():
    """Open the JD login page, wait 30 s for a manual login, persist cookies.

    Cookies are dumped as JSON to ``cookies.txt`` so that ``singleData`` can
    replay the authenticated session on later page loads.
    """
    driver.get(
        "https://passport.jd.com/new/login.aspx?ReturnUrl=https%3A%2F%2Fwww.jd.com%2F")
    sleep(30)  # Give the user time to log in by hand.
    with open("cookies.txt", "w") as f:
        json.dump(driver.get_cookies(), f)


def _log_exception(e):
    """Append the text of an exception to exception.txt (best-effort log)."""
    with open("exception.txt", "a", encoding="utf-8") as f:
        f.write(str(e) + "\n")


def singleData(para):
    """Search JD for *para* (an ISBN string) and scrape one product's metadata.

    Returns the module-level ``data_dict`` with keys ``'ISBN'``,
    ``'出版时间'`` (publication date) and ``'版次'`` (edition); fields that
    cannot be found keep their previous/fallback values.  Any error is logged
    to exception.txt and the dict is still returned.
    """
    try:
        driver.get('https://www.jd.com/')
        # Replay the saved login session before searching.
        with open("cookies.txt", "r") as f:
            for cookie in json.load(f):
                driver.add_cookie(cookie)
        driver.find_element(By.ID, "key").send_keys(para)
        driver.find_element(
            By.XPATH, '//*[@id="search"]/div/div[2]/button/i').click()
        sleep(3)  # Wait for the result list to render.
        h = etree.HTML(driver.page_source)
        # Follow only the first search result's product link.
        driver.get("https:" + h.xpath(
            '//*[@id="J_goodsList"]/ul/li[1]/div/div[1]/a/@href')[0])
        h = etree.HTML(driver.page_source)
        # Product parameters are rendered as plain <li> text lines; match the
        # wanted ones by prefix after stripping leading whitespace.
        for item in h.xpath(
                '//div/ul[@class="parameter2 p-parameter-list"]/li/text()'):
            text = item.lstrip()
            if text.startswith('ISBN'):
                data_dict["ISBN"] = item
            if text.startswith('出版时间'):
                data_dict["出版时间"] = item
            if text.startswith('版次'):
                data_dict["版次"] = item
        return data_dict
    except Exception as e:
        _log_exception(e)
        return data_dict  # Fall back to the previous/default values.


def main():
    """Read ISBNs from table.xls, scrape each one, write rows to answer.xls."""
    readbook = xlrd.open_workbook(r'table.xls')
    sheet_in = readbook.sheet_by_name('Sheet1')
    writebook = xlwt.Workbook(encoding="utf8")
    sheet_out = writebook.add_sheet('test')
    test01_login()
    try:
        for gi in range(sheet_in.nrows):
            try:
                isbn = sheet_in.cell(gi, 4).value  # ISBN lives in column 4.
                tDict = singleData(isbn)
                sheet_out.write(gi, 0, tDict["ISBN"])
                sheet_out.write(gi, 1, tDict["出版时间"])
                sheet_out.write(gi, 2, tDict["版次"])
                # Save after every row so a crash does not lose earlier work.
                writebook.save('answer.xls')
                print('tDict["ISBN"] = %s, tDict["出版时间"] = %s, tDict["版次"] = %s, gi = %d. '
                      % (tDict["ISBN"], tDict["出版时间"], tDict["版次"], gi))
            except Exception as e:
                _log_exception(e)
    finally:
        driver.quit()  # Always release the browser, even on failure.


if __name__ == "__main__":
    main()