实现:
# 1、获取所有tr标签
# 2、获取第2个tr标签
# 3、获取所有class等于even的tr标签
# 4、获取所有a标签及其属性值
# 5、获取所有的职位信息(纯文本)
"""Using lxml together with XPath."""

from lxml import etree

# Parse test.html with lxml's HTML parser, treating the input as UTF-8.
parser = etree.HTMLParser(encoding='utf-8')
html = etree.parse('test.html', parser=parser)

# 1. Select every <tr> element in the document.
trs = html.xpath("//tr")  # xpath() returns a list of matches
for tr in trs:
    # This prints the element object itself, not its markup. To see the
    # markup, use: print(etree.tostring(tr, encoding='unicode'))
    print(tr)
# 2. Get the second <tr>.
# NOTE: //tr[2] matches every <tr> that is the second <tr> child of its
# parent; [0] then takes the first such match in document order. Indexing
# raises IndexError if the document has no such row.
tr = html.xpath("//tr[2]")[0]
# encoding='unicode' makes tostring() return a str directly, so no byte
# decoding (and no risk of decoding with the wrong codec) is involved.
print(etree.tostring(tr, encoding='unicode'))
# 3. Select every <tr> whose class attribute is exactly "even".
trs = html.xpath("//tr[@class='even']")
for tr in trs:
    # Serialise straight to str; encoding to UTF-8 bytes and then decoding
    # them as GBK would garble (or fail on) any non-ASCII text.
    print(etree.tostring(tr, encoding='unicode'))
# 4_1. Collect the value of the href attribute of every <a> element.
ah = html.xpath("//a/@href")
for a in ah:
    print(a)

# 4_2. Select the <a> elements that have an href attribute.
al = html.xpath("//a[@href]")
for a in al:
    # Serialise straight to str; encoding to UTF-8 bytes and then decoding
    # them as GBK would garble (or fail on) any non-ASCII text.
    print(etree.tostring(a, encoding='unicode'))