"""Download the Sina front page and list the links found in its main section."""
import traceback

import cchardet
import requests
from lxml import etree

# Used only when the caller does not supply its own headers.
_DEFAULT_HEADERS = {
    'User-Agent': ('Mozilla/5.0 (compatible; MSIE 9.0; '
                   'Windows NT 6.1; Win64; x64; Trident/5.0)')
}


def downloader(url, timeout=10, headers=None, debug=False, binary=False):
    """Fetch *url* and return ``(status, html, redirected_url)``.

    Args:
        url: Address to request.
        timeout: Timeout in seconds handed to ``requests.get``.
        headers: Request headers; the module default User-Agent is used
            when this is empty or ``None``.
        debug: When true, print the traceback of a failed download.
        binary: When true, return the raw response bytes instead of
            decoded text.

    Returns:
        A tuple ``(status, html, redirected_url)``. ``status`` is the HTTP
        status code, or ``0`` if the download failed. ``html`` is ``bytes``
        when *binary* is true and ``str`` otherwise (empty on failure).
        ``redirected_url`` is the final URL reported by the response, or
        the original *url* on failure.
    """
    redirected_url = url
    if not headers:
        headers = _DEFAULT_HEADERS
    try:
        # headers must be passed by keyword: the second positional
        # parameter of requests.get is ``params`` (the query string).
        res = requests.get(url, headers=headers, timeout=timeout)
        if binary:
            html = res.content
        else:
            # Detection may yield None (e.g. for an empty body); fall back
            # to UTF-8 rather than failing the whole download.
            encoding = cchardet.detect(res.content)["encoding"] or "utf-8"
            html = res.content.decode(encoding, errors="replace")
        status = res.status_code
        redirected_url = res.url
    except Exception:
        # Best-effort download: report the failure and hand back an empty
        # body with status 0 instead of raising.
        if debug:
            traceback.print_exc()
        msg = "failed download:{}".format(url)
        print(msg)
        html = b"" if binary else ""
        status = 0
    return status, html, redirected_url


def parser(html):
    """Print and return the links found in the page's main section.

    Looks at every ``div`` whose class contains ``clearfix`` directly under
    ``div[@class='main']``, and inside each at the anchors below a ``ul``
    whose class contains ``list-a``. Each anchor that has both an ``href``
    and a text node is printed as ``index (href, title)``.

    Args:
        html: Page markup; an empty value yields no links.

    Returns:
        A list of ``(href, title)`` tuples in document order.
    """
    links = []
    if not html:
        # A failed download yields an empty body, which lxml cannot parse.
        return links
    tree = etree.HTML(html)
    if tree is None:
        return links

    sections = tree.xpath(
        ".//div[@class = 'main']/div[contains(@class,'clearfix')]")
    for section in sections:
        anchors = section.xpath(".//ul[contains(@class,'list-a')]//a")
        for anchor in anchors:
            try:
                # NOTE(review): "\\n" / "\\t" match a literal backslash
                # followed by n / t, not real newline / tab characters --
                # confirm this is what was intended.
                href = anchor.xpath("./@href")[0].strip().replace(
                    "\\n", '').replace('\\t', '')
                title = anchor.xpath("./text()")[0].strip().replace(
                    "\\n", '').replace('\\t', '')
            except IndexError:
                # Anchor without an href or without text: skip it.
                continue
            links.append((href, title))
            print(len(links), (href, title))
    return links


if __name__ == '__main__':
    url = r"https://www.sina.com.cn/"
    status, html, redirected_url = downloader(url)
    links = parser(html)