from lxml import etree

# Sample HTML fragment used by method 1. The pieces below are adjacent string
# literals, so Python joins them into one string in which consecutive tags are
# separated by a single space.
text = (
    ' <div class="top-nav-info"> '
    '<a href="https://accounts.douban.com/passport/login?source=movie" class="nav-login" rel="nofollow">登录/注册</a> '
    '</div> '
    '<div class="top-nav-doubanapp"> '
    '<a href="https://www.douban.com/doubanapp/app?channel=top-nav" class="lnk-doubanapp">下载豆瓣客户端</a> '
    '<div id="doubanapp-tip"> '
    '<a href="https://www.douban.com/doubanapp/app?channel=qipao" class="tip-link">豆瓣 <span class="version">6.0</span> 全新发布</a> '
    '<a href="javascript: void 0;" class="tip-close">×</a> '
    '</div> '
    '<div id="top-nav-appintro" class="more-items"> '
    '<p class="appintro-title">豆瓣</p> '
    '<p class="qrcode">扫码直接下载</p> '
    '<div class="download"> '
    '<a href="https://www.douban.com/doubanapp/redirect?channel=top-nav&direct_dl=1&download=iOS">iPhone</a> '
    '<span>·</span> '
    '<a href="https://www.douban.com/doubanapp/redirect?channel=top-nav&direct_dl=1&download=Android" class="download-android">Android</a> '
    '</div> '
    '</div> '
    '</div> '
)

# ---------------------------------------------------------------------------
# Method 1: parse a string that contains HTML.
# (For example, the page HTML taken from the content of a requests response.)
# etree.HTML() parses the HTML string into an element tree.
# ---------------------------------------------------------------------------
html = etree.HTML(text)
print(html)
result = etree.tostring(html, encoding="utf8").decode("utf8")
# print(result)

# ---------------------------------------------------------------------------
# Method 2: parse an HTML file.
# etree.parse() loads the file from disk and parses it.
# ---------------------------------------------------------------------------
html_2 = etree.parse("./test.html")
result_2 = etree.tostring(html_2, encoding="utf8").decode()
# print(result_2)

# ---------------------------------------------------------------------------
# Method 3: parse the HTML file with a custom parser. This addresses the
# case where method 2 fails to parse the file.
#
# Why: some HTML files downloaded straight from a website have missing tags,
# and parsing them with etree.parse() raises an error. etree.parse() uses the
# XML parser by default, which fails on such HTML pages, so a custom parser
# is needed.
#
# Fix: create an HTML parser and hand it to etree.parse() through the
# ``parser`` argument.
# ---------------------------------------------------------------------------
paser = etree.HTMLParser(encoding="utf8")
html_3 = etree.parse("./test.html", parser=paser)
result_3 = etree.tostring(html_3, encoding="utf8").decode()
print(result_3)