# File: xpath-xml路径语言.py  (XPath, the XML path language)
# XPath
'''
XPath is a language dedicated to locating information in XML documents.
To use it from Python, add: from lxml import etree
'''
from lxml import etree
# Approach 1: parse an HTML string held in memory and query it with XPath.
#
# Fix: the first <ul> previously read `<ul class="menu>` (closing quote
# missing), so the markup that followed was absorbed into the attribute value
# and the commented-out menu queries below could not see the first entry.
# NOTE(review): the <a> tags are left unclosed exactly as in the original
# sample; the queries rely on the HTML parser recovering from that - confirm
# this is intentional.
text = '''
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8"/>
<title>我的学习</title>
</head>
<body>
<ul class="menu">
<li><a href = "/a/b/c/java/">java工程师</li>
<li><a href = "/a/b/c/c/">c工程师</li>
<li><a href = "/a/b/c/python/">python工程师</li>
<li><a href = "/a/b/c/ai/">ai工程师</li>
</ul>
<div class = "teacher">
<ul>
<li><a href = "/a/b/c/java/">j工程师</li>
<li><a href = "/a/b/c/c/">c工程师</li>
<li><a href = "/a/b/c/python/">p工程师</li>
<li><a href = "/a/b/c/ai/">a工程师</li>
</ul>
</div>
</body>
</html>
'''
# Parse the HTML string with etree.
html = etree.HTML(text)
print(html)
# Example: extract the link text of the first <li> of the <ul> directly under <body>.
# r = html.xpath('/html/body/ul/li[1]/a/text()')
# print(r)
# Example: get the link text of every <li> in the document.
# rr = html.xpath('//li/a/text()')
# print(rr)
# Get the <li> link texts that live inside <div class="teacher"> only.
r = html.xpath('//div[@class="teacher"]//li/a/text()')
print(r)
# Same selection, but take each link's href attribute instead of its text.
h = html.xpath('//div[@class="teacher"]//li/a/@href')
print(h)
# Pair each link text with its href and print the pairs as separate tuples.
print(*zip(r, h))
# Approach 2: read an HTML file from disk and parse it.
#
# html = etree.parse('./testt.html', etree.HTMLParser())
#
# print(html)
# result = etree.tostring(html)
# print(result)
# print(result.decode('utf-8'))
# r = html.xpath('/html/body/ul/li/a/text()')
# print(r)
# File: requests的基本使用.py  (basic usage of requests)
# -*- coding=utf-8 -*-
import requests
# URL to request (earlier experiments are kept, commented out, for reference).
# url = 'https://www.baidu.com/'
# url = 'http://www.xicidaili.com/nn/'
url = 'https://b.faloo.com/'
# Request headers carrying a browser-like User-Agent.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
}
# Send the request. The headers above were previously built but never passed
# along; a timeout is set so a stalled server cannot block the script forever.
res = requests.get(url=url, headers=header, timeout=10)
# Inspect the response object / raw body if needed.
# print(res)
# print(res.content)  # b'.......' raw byte stream
# Show the encoding requests picked, then override it with GBK before the
# body is decoded through res.text.
print(res.encoding)
res.encoding = 'gbk'
print(res.encoding)
print(res.text)
# Get the response status code.
code = res.status_code
print(code)
# On success, write the response body to a file.
if code == 200:
    # Encode with the same codec used for decoding instead of the platform
    # default, so the saved file does not depend on the machine's locale;
    # characters that codec cannot represent are replaced rather than
    # aborting the write.
    with open(
        "D:\\zhangt\\pystudy\\pywenjian\\pythonProject\\crawweb\\test.html",
        "w",
        encoding=res.encoding,
        errors="replace",
    ) as fp:
        fp.write(res.text)