第一种:
使用正则表达式将文本中的 HTML 标签去掉:
# Fetch a JSON payload and strip HTML tags from every "text" value in it.
# NOTE: session, the_url, header and non_bmp_map are not defined in this
# snippet; they must already be in scope.
r = session.get(the_url, headers=header)
res = r.json()
# "$..text" recursively collects the value of every key named "text"
# (likewise, "$..name" would collect the value of every key named "name").
# jsonpath.jsonpath() returns False instead of an empty list when nothing
# matches, so fall back to [] to keep the loop below from raising TypeError.
city_list = jsonpath.jsonpath(res, "$..text") or []
# Compiled once, outside the loop. The non-greedy ".*?" matches one tag at a
# time, and re.S lets "." span newlines inside a tag.
tags = re.compile('<.*?>', re.S)
for name in city_list:
    # Apply the non_bmp_map translation table first
    # (presumably it replaces characters outside the BMP -- TODO confirm).
    name = name.translate(non_bmp_map)
    # Remove every "<...>" tag, leaving only the plain text in `name`.
    name = tags.sub('', name)
第二种:
借助 pyquery 将正文中的 HTML 标签去掉:
from pyquery import PyQuery as pq

# Sample markup used to demonstrate tag removal with pyquery.
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''

# Parse the markup into a queryable document.
doc = pq(html)

# CSS selector: every <a> nested inside an element that carries both the
# "item-0" and "active" classes (only the third <li> in the sample above).
a = doc('.item-0.active a')

# Printing the selection shows the matched markup, tags included ...
print(a)
# ... whereas .text() yields only the text content, with the tags removed.
print(a.text())