实例需求:运用python语言在http://www.ip138.com/post/网站爬取全国各个省市县级城市的邮政编码,并且保存在excel文件中
实例环境:python3.7
requests库(第三方库,需要自己手动安装:pip install requests)
xlwt库(需要自己手动安装)
实例网站:
第一步,在http://www.ip138.com/post/网站通过查询源代码可以找到各个省份的链接
第二步,点击链接,即可看到所点击省份的城市的邮政编码
实例代码:
import requests
import xlwt # 返回一个字典,键是各个省份的名字,值是对应省份的网址url
def getProvinceCode(url, timeout=10):
    """Fetch the province index page and map province names to their URLs.

    The page is searched for a ``<map name="map_86" id="map_86">`` element.
    Every line inside it that carries both ``href="/<code>/"`` and
    ``title="<name>"`` contributes one entry; any other line is ignored.
    Each collected pair is printed before the mapping is returned.

    Args:
        url: Address of the index page to download.
        timeout: Seconds passed to ``requests.get`` as its timeout.

    Returns:
        dict: province name -> ``'http://www.ip138.com/' + code``. Empty
        when the map element is not present in the page.
    """
    response = requests.get(url, timeout=timeout)
    response.encoding = response.apparent_encoding
    content = response.text

    map_open = '<map name="map_86" id="map_86">'
    map_start = content.find(map_open)
    if map_start == -1:
        # Without the map there is nothing to parse; slicing from a bogus
        # offset would only yield garbage entries.
        return {}
    map_start += len(map_open)
    map_end = content.find('</map>', map_start)
    if map_end == -1:
        map_end = len(content)

    base_url = 'http://www.ip138.com/'
    href_open = 'href="/'
    title_open = 'title="'
    dict_prov_url = {}
    for line in content[map_start:map_end].splitlines():
        href_at = line.find(href_open)
        title_at = line.find(title_open)
        if href_at == -1 or title_at == -1:
            # Blank or non-link line inside the map.
            continue
        code_start = href_at + len(href_open)
        # Search after the href so an earlier '/"' cannot be mistaken for
        # the end of the link target.
        code_end = line.find('/"', code_start)
        name_start = title_at + len(title_open)
        name_end = line.find('"', name_start)
        if code_end == -1 or name_end == -1:
            continue
        dict_prov_url[line[name_start:name_end]] = base_url + line[code_start:code_end]

    # Show every province name together with its URL.
    for item in dict_prov_url.items():
        print(item)
    return dict_prov_url


# Given a province URL, collect each city's name, postal code and area code
# and return them as a two-dimensional list.
def _cell_text(cell, opener):
    """Return the part of *cell* between the first *opener* and the first ``</``."""
    begin = cell.find(opener) + len(opener)
    return cell[begin:cell.find('</')]


def _post_record(cells, city_opener='>'):
    """Build ``[city, post_code, area_code]`` from three ``<td`` fragments."""
    city_cell, post_cell, area_cell = cells
    return [
        _cell_text(city_cell, city_opener),
        _cell_text(post_cell, '">'),
        _cell_text(area_cell, '">'),
    ]


def getPostCode(url, timeout=10):
    """Download a province page and extract its postal-code table.

    Parsing starts right after the table header that ends with
    ``长途区号</b></td></tr>`` and stops at the next ``</table>``. The text in
    between is split on ``<tr bgcolor="#ffffff">`` and each row on ``<td``.
    A row yields one record when its fourth cell contains ``nbsp`` and
    otherwise two records (cells 1-3 and cells 4-6). The collected records
    are printed through ``showPost`` before being returned.

    Args:
        url: Address of the province page to download.
        timeout: Seconds passed to ``requests.get`` as its timeout.

    Returns:
        list: ``[city, post_code, area_code]`` lists of strings. Empty when
        the table header is not found in the page.
    """
    response = requests.get(url, timeout=timeout)
    response.encoding = response.apparent_encoding
    content = response.text

    header_end = '长途区号</b></td></tr>'
    header_at = content.find(header_end)
    if header_at == -1:
        return []
    start = header_at + len(header_end)
    end = content.find('</table>', start)
    if end == -1:
        end = len(content)
    # Each element of posts is the markup of one table row.
    posts = content[start:end].strip().split('<tr bgcolor="#ffffff">')

    code_list = []
    for post in posts:
        lines = post.strip().split('<td')
        if len(lines) < 5:
            # Fewer than four cells: lines[4] below would not exist.
            continue
        # NOTE(review): the branch nesting below was reconstructed from a
        # listing whose indentation was lost -- confirm against a live page.
        if 'nbsp' in lines[4]:
            if len(lines) >= 6:
                # When the fifth cell is not a blank placeholder, the city
                # name is read from after the '<b>' tag instead.
                opener = '>' if 'nbsp' in lines[5] else '<b>'
                code_list.append(_post_record(lines[1:4], opener))
        else:
            code_list.append(_post_record(lines[1:4]))
            if len(lines) >= 7:
                code_list.append(_post_record(lines[4:7]))
    showPost(code_list)
    return code_list


# Print the two-dimensional list produced by getPostCode(url) to the terminal.
def showPost(code_list):
    """Print every record of *code_list* on its own line.

    Args:
        code_list: Iterable of records; each one is passed to ``print`` as is.
    """
    for record in code_list:
        print(record)


# Write the collected data to an Excel file.
def write_excel(path, index_url='http://www.ip138.com/post/'):
    """Scrape every province and save the results as one workbook.

    One sheet is added per entry returned by ``getProvinceCode(index_url)``,
    named after the province. Row 0 of each sheet holds the column headers
    and the records from ``getPostCode`` follow from row 1 on.

    Args:
        path: File name handed to ``workbook.save``.
        index_url: Page passed to ``getProvinceCode`` to list the provinces.
    """
    # Create the workbook.
    workbook = xlwt.Workbook(encoding='utf-8')
    # Header row shared by every sheet.
    header = [u'城市名称', u'邮政编码', u'长途区号']

    for title, url in getProvinceCode(index_url).items():
        data_sheet = workbook.add_sheet(title)
        for col, text in enumerate(header):
            data_sheet.write(0, col, text)
        # Data rows start at 1 because row 0 is the header.
        for row, record in enumerate(getPostCode(url), start=1):
            for col, value in enumerate(record):
                data_sheet.write(row, col, value)

    workbook.save(path)


if __name__ == '__main__':
    path = './postcode.xls'
    write_excel(path)
    print(u'写入postcode.xls文件成功')
实例结果:
终端显示:
excel文件: