from lxml import etree
#####################
# Basic usage:
#####################
# Sample login page markup; the XPath queries below run against it.
html = '''
<h1 class="header">登录</h1>
<form action="/login" method="post">
<label for="username">用户: </label><input type="text" name="username" />
<label for="password">密码:</label><input type="password" name="password" />
<input type="submit" value="Submit" />
</form>'''
# Build the DOM from the HTML string.
dom = etree.HTML(html)
# Read an element's text content: end the path with /text()
contents = dom.xpath('//h1[@class="header"]/text()')
print(contents)
# Read an attribute value: end the path with /@<attribute name>
attribs = dom.xpath('//form/label[@for="username"]/@for')
print(attribs)
#####################
# Advanced usage:
#####################
# Sample markup with two cases: several <p> tags whose ids share a common
# prefix, and a <p> tag that contains a nested <font> tag.
html2 = '''
<div class="content">
==> 有相同字符开头的属性的标签:
<p id="test-1">需要的内容1</p>
<p id="test-2">需要的内容2</p>
<p id="test-default">需要的内容3</p>
</div>
<div class="question">
==> 签嵌套标签:
<p id="class3">美女,
<font color="red">你的微信号是多少?</font>
</p>
</div> '''
dom = etree.HTML(html2)
# Select the text of tags whose attribute value starts with the same
# characters: starts-with(@attrib, "prefix")
contents2 = dom.xpath('//p[starts-with(@id, "test")]/text()')
print(contents2)
# Collect all text of a tag, including the text of tags nested inside it:
# call xpath('string(.)') on the matched element. The [0] takes the first
# matching <p>; it would raise IndexError if nothing matched.
contents3 = dom.xpath('//div[@class="question"]/p')[0].xpath('string(.)')
# Remove the newlines and spaces so the text prints as one continuous string.
contents3 = contents3.replace('\n', '').replace(' ', '')
print(contents3)