本文的文字及图片来源于网络,仅供学习、交流使用,不具有任何商业用途,版权归原作者所有,如有问题请及时联系我们以作处理
以下文章来源于腾讯云 作者:昱良
通过网上爬虫获取了全国所有企业,然后就需要补充企业信息,首先想到的就是企查查,启信宝等专业网站,最终选择了企查查,尝试了多种方法:
1、selenium爬虫,绕过企查查的登录验证,但账号和IP限制太大,最终放弃
2、通过requests直接请求+cookies,遇到了cookie有效期和限制问题
不断地尝试和修改参数,最终发现一种有效方式:selenium + wap(即通过 selenium 访问企查查的移动端页面)
这种方式只需要IP代理,不需要账号,也没有访问次数限制;不过因为没有登录,拿到的信息有限,只能获取页面上公开展示的内容。
image
一、初始化selenium
# Pick the chromedriver binary that matches the host operating system.
sysstr = platform.system()
if sysstr == "Windows":
    chromedriver_path = os.path.join(os.getcwd(), "utools", "chromedriver.exe")
else:
    # Any non-Windows host uses the binary named for macOS in the working dir.
    chromedriver_path = os.path.join(os.getcwd(), "mac_chromedriver")
logger.info("chromedriver_path: %s", chromedriver_path)

# Desktop Chrome user agent passed to the browser as a command-line switch.
default_agent = '--user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"'


class wap_QiChaCha(object):
    """Headless Chrome session configured for the mobile Qichacha site.

    The constructor only prepares the Chrome options; call ``init()`` to
    actually start the browser.
    """

    def __init__(self, user_agent_header=default_agent,
                 chromedriver_path=chromedriver_path,
                 proxy_server='http://47.98.154.206:3008'):
        """Build the Chrome options used by every browser this object starts.

        Args:
            user_agent_header: Full ``--user-agent=...`` switch for Chrome.
            chromedriver_path: Path of the chromedriver executable to launch.
            proxy_server: Proxy URL for ``--proxy-server``; a falsy value
                disables the proxy switch.
        """
        # Remember the path so init() honours the caller's choice instead of
        # silently falling back to the module-level default.
        self.chromedriver_path = chromedriver_path
        # Declared here so the attributes exist even before init() is called.
        self.driver = None
        self.error_encounter = 0

        self.options = webdriver.ChromeOptions()
        self.options.add_argument('--no-sandbox')
        self.options.add_argument('--disable-dev-shm-usage')
        if proxy_server:
            self.options.add_argument('--proxy-server=%s' % (proxy_server,))
        self.options.add_argument("--headless")  # run without a visible window
        self.options.add_argument("--disable-gpu")  # optional; works around some odd failures
        self.options.add_argument(user_agent_header)
        # Emulate a phone so the site serves its mobile pages.
        mobileEmulation = {'deviceName': 'iPhone X'}
        self.options.add_experimental_option('mobileEmulation', mobileEmulation)

    def init(self):
        """Start Chrome, open the mobile home page and reset the error counter."""
        self.driver = webdriver.Chrome(executable_path=self.chromedriver_path,
                                       chrome_options=self.options)
        # Open the site's mobile landing page.
        self.driver.get('https://m.qichacha.com/')
        self.error_encounter = 0
二、判断公司存在
def search_company(self, company_name):
    """Search the mobile site for a company and scrape it if the name matches.

    Args:
        company_name: Company name to search for.

    Returns:
        Whatever ``self.company_detail`` returns when the first search hit
        (by its listed name or by the name shown on its bottom line) matches
        ``company_name`` after normalisation; ``None`` when nothing is found,
        the first hit does not match, or any error occurs.

    Side effects:
        Any exception increments ``self.error_encounter``; on the third
        consecutive failure the browser is quit and restarted via
        ``self.init()``.
    """
    # Local import so the block does not depend on a file-level import.
    from urllib.parse import quote

    try:
        wanted_name = utils.normalizeCompanyName(company_name)
        result = {}
        result[COMPANY.NAME] = wanted_name
        logger.info("search for: %s" % (company_name,))

        # Percent-encode the name: characters such as '&', '#' or '+' would
        # otherwise truncate or alter the ``key`` query parameter.
        self.driver.get('https://m.qcc.com/search?key=%s' % (quote(company_name, safe=''),))
        utils.alertWait(
            WebDriverWait(self.driver, 3).until,
            expected_conditions.presence_of_element_located(
                (By.XPATH, '//*[contains(@class,"text-danger") or contains(@class,"nodata")]')),
            5, 0, "not found text-danger or nodata")

        # Read the hit counter to see whether the company exists.
        # NOTE(review): when only the "nodata" element is present this lookup
        # raises and is counted as an error below — confirm that is intended.
        inc_full = self.driver.find_element_by_xpath('//*[@class="text-danger"]').text
        self.error_encounter = 0
        if inc_full == "0":
            logger.error("company %s not found" % (company_name,))
            return None

        # Name and detail link of the first search hit.
        cname = self.driver.find_element_by_xpath('//div[@class="list-item-name"]').text
        href = self.driver.find_element_by_xpath('//a[@class="a-decoration"]').get_attribute("href")

        # Former name (best effort): the bottom line of the hit may read
        # "<label>:<value>", where the label varies (former name, historical
        # shareholder, ...).
        cym = None
        try:
            stock_or_others = self.driver.find_element_by_xpath('//div[@class="list-item-bottom"]').text
            if wanted_name in stock_or_others:
                # Normalise a full-width colon so the split below also works
                # when the page uses CJK punctuation.
                pieces = stock_or_others.replace("\uff1a", ":").split(":")
                if len(pieces) > 1:
                    cym = pieces[1]
        except Exception:
            # The bottom line is optional; carry on without a former name.
            pass

        name_matches = utils.normalizeCompanyName(cname) == wanted_name
        # Only normalise the former name when one was actually extracted.
        former_name_matches = (cym is not None
                               and utils.normalizeCompanyName(cym) == wanted_name)
        if name_matches or former_name_matches:
            result[COMPANY.URL] = href
            return self.company_detail(href, result)
        return None
    except Exception as err:
        logger.error(err)
        self.error_encounter = self.error_encounter + 1
        # Three consecutive failures: assume the session is broken and restart.
        if self.error_encounter >= 3:
            self.driver.quit()
            self.init()
        return None
image
三、获取公司信息
def company_detail(self, href, result):
    """Open a company's detail page and copy what it shows into ``result``.

    Args:
        href: URL of the company detail page.
        result: Dict to fill in; it is mutated and also returned.

    Returns:
        ``result``, with phone, e-mail and address added when present, and,
        when the basic-information table exists, the tax-level string and the
        business-registration text (rows and cells joined by " && ").
    """
    self.driver.get(href)
    utils.alertWait(
        WebDriverWait(self.driver, 3).until,
        expected_conditions.presence_of_element_located((By.XPATH, '//*[@class="company-name"]')),
        5, 0, "not found company-name")

    def text_or_none(xpath):
        """Return the stripped text at ``xpath``, or None if missing or empty."""
        try:
            text = self.driver.find_element_by_xpath(xpath).text
        except Exception:
            return None
        return text.strip() if text else None

    # Contact details are optional, so each one is looked up independently.
    contact_fields = (
        (COMPANY.TEL, '/html/body/div[1]/div[2]/div[1]/div[3]/a[1]'),
        (COMPANY.EMAIL, '/html/body/div[1]/div[2]/div[1]/div[3]/a[2]'),
        (COMPANY.ADDRESS, '/html/body/div[1]/div[2]/div[1]/div[4]'),
    )
    for key, xpath in contact_fields:
        value = text_or_none(xpath)
        if value is not None:
            result[key] = value

    # Without the basic-information table there is nothing more to parse.
    try:
        infos = self.driver.find_element_by_xpath('//div[@class="basic-wrap"]/table')
    except Exception:
        return result

    result[COMPANY.TAX_LEVEL] = "税务等级&&"
    try:
        taxcreditlist = self.driver.find_element_by_xpath('//div[@id="taxcreditlist"]').text
        result[COMPANY.TAX_LEVEL] = (result[COMPANY.TAX_LEVEL]
                                     + str(taxcreditlist).replace("\n", "&").strip())
    except Exception:
        # A missing tax-credit section must not stop the business table below
        # from being parsed, so keep the bare prefix and continue.
        pass

    # Parse the table's markup with etree and walk it row by row.
    data = etree.HTML(infos.get_property("innerHTML"))
    result[COMPANY.BUSINESS] = "工商信息"
    for row in data.xpath('.//tr'):
        cells = [str(cell).replace("\n", "").strip() for cell in row.xpath(".//td//text()")]
        cells = [cell for cell in cells if cell != '']
        self.retrieveInfo(cells, result)
        # Append this row's cells, separated by " && ".
        result[COMPANY.BUSINESS] = (result[COMPANY.BUSINESS] + " && "
                                    + " && ".join(map(str, cells)))
    return result