# 专利网站selenium自动输入申请人、日期爬虫
# (Selenium crawler for a patent site: auto-fills the applicant and the publication-date range)

import re

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import Select
import csv
import time
# Search page of the patent database that the script drives.
SEARCH_URL = 'https://kns.cnki.net/kns/brief/result.aspx?dbprefix=SCOD'

# Raw strings: the backslashes in these Windows paths are literal characters,
# not escape sequences.
INPUT_CSV = r'D:\ptang\data2.csv'    # one applicant per line
OUTPUT_CSV = r'D:\ptang\data3.csv'   # one summary row is appended per applicant

# Non-greedy match from "CN" up to the first "!" in a row's <input> value.
PUBLICATION_RE = re.compile(r"CN.*?!")

# Letters counted in the collected matches, in output-column order.
# Per the original notes: S = design, A = invention application,
# B = invention application patent; U is presumably utility model (TODO confirm).
KIND_LETTERS = ('U', 'S', 'A', 'B')

# Scroll far down (10000 is meant to exceed the page height) and back to the top.
SCROLL_TO_BOTTOM_JS = 'var action=document.documentElement.scrollTop=10000'
SCROLL_TO_TOP_JS = 'var action=document.documentElement.scrollTop=0'


def parse_rows(content):
    """Split raw CSV text into a list of rows, each a list of cell strings.

    Lines are split on '\\n' and cells on ','. Blank lines at the end of the
    text (for example the one produced by a trailing newline) are dropped so
    they do not trigger a search with an empty applicant; blank lines in the
    middle are kept so row positions stay unchanged.
    """
    lines = content.split('\n')
    while lines and not lines[-1].strip():
        lines.pop()
    return [line.split(',') for line in lines]


def count_patent_kinds(matches):
    """Return [U, S, A, B] letter counts over the text form of *matches*.

    The count runs over ``str(matches)``, so every occurrence of a letter is
    counted, wherever it appears inside the matched strings.
    """
    text = str(matches)
    return [text.count(letter) for letter in KIND_LETTERS]


def build_browser():
    """Start Chrome through the local chromedriver.exe with content prefs set."""
    options = webdriver.ChromeOptions()
    prefs = {
        'profile.default_content_setting_values': {
            # Value 2 presumably means "block" (skip image loading) -- TODO confirm.
            'images': 2,
            'permissions.default.stylesheet': 2,
        }
    }
    options.add_experimental_option('prefs', prefs)
    return webdriver.Chrome(executable_path='chromedriver.exe', chrome_options=options)


def open_search_form(browser):
    """Load the search page and fill in the fields shared by every query."""
    browser.get(SEARCH_URL)
    time.sleep(2)
    # Results are discarded: these lookups only raise if the elements are missing.
    browser.find_element_by_id('Form1')
    browser.find_element_by_class_name('main_sh')
    # Search by applicant.
    Select(browser.find_element_by_id("txt_1_sel")).select_by_value("SQR")
    # Publication date range: start and end.
    browser.find_element_by_id('date_gkr_from').send_keys('2019-01-01')
    browser.find_element_by_id('date_gkr_to').send_keys('2020-01-01')


def collect_page(browser, matches):
    """Append the regex matches of every row on the current result page.

    Results go straight into *matches* so that rows gathered before a failure
    part-way through a page are still kept by the caller.
    """
    table = browser.find_element_by_class_name('GridTableContent')
    body = table.find_element_by_tag_name('tbody')
    for row in body.find_elements_by_tag_name("tr"):
        raw_value = row.find_element_by_tag_name('input').get_attribute('value')
        print(raw_value)
        found = PUBLICATION_RE.findall(raw_value)
        print(found)
        matches.append(found)


def process_applicant(browser, row):
    """Search for one applicant, walk all result pages and append a summary row.

    The row written to OUTPUT_CSV is [U, S, A, B, digits], where *digits* is
    the list of digit runs found in the pager title text.
    """
    print(row)
    # Type the applicant name; the cells of the row are typed back to back.
    browser.find_element_by_xpath('//*[@id="txt_1_value1"]').send_keys(''.join(row))
    # Click the search button through JavaScript.
    search_button = browser.find_element_by_xpath('//*[@id="btnSearch"]')
    browser.execute_script("arguments[0].click();", search_button)
    time.sleep(3)

    try:
        browser.find_element_by_id('iframeResult')  # probe for the result iframe
    except NoSuchElementException:
        # Diagnostic only: the frame switch below still raises if it is absent.
        print('no')
    browser.switch_to.parent_frame()
    browser.switch_to.frame('iframeResult')

    pager_text = browser.find_element_by_class_name('pagerTitleCell').text
    pager_digits = re.findall(r"\d+", pager_text)

    matches = []
    collect_page(browser, matches)
    while True:
        try:
            pager = browser.find_element_by_class_name('topTurnSpan')
            pager.find_element_by_id('Page_next').click()
            browser.execute_script(SCROLL_TO_BOTTOM_JS)
            browser.execute_script(SCROLL_TO_TOP_JS)
            time.sleep(3)
            collect_page(browser, matches)
        except NoSuchElementException:
            # No pager or no "next" link: the last page has been reached.
            break
        except Exception as exc:
            # Keep the best-effort behaviour (stop paging, keep what was
            # collected) but report the cause instead of hiding it.
            print('paging stopped: {!r}'.format(exc))
            break

    print(matches)
    counts = count_patent_kinds(matches)
    print(counts[0])
    summary = counts + [pager_digits]
    print(summary)

    with open(OUTPUT_CSV, 'a', newline='') as csvfile:
        csv.writer(csvfile).writerow(summary)

    # Leave the iframe and empty the applicant box for the next query.
    browser.switch_to.default_content()
    browser.find_element_by_id('txt_1_value1').clear()
    print('有到这')
    time.sleep(1)


def main():
    """Run one search per applicant listed in INPUT_CSV."""
    browser = build_browser()
    try:
        open_search_form(browser)
        # Opened with the platform default encoding.
        with open(INPUT_CSV, 'r') as source:
            applicants = parse_rows(source.read())
        for row in applicants:
            process_applicant(browser, row)
        print(applicants)
    finally:
        # Always shut down the browser and the driver process.
        browser.quit()


if __name__ == '__main__':
    main()

# 上一篇:获取指定元素的索引


# 下一篇:Python学习笔记_列表