淘宝商品html--网页结构
本篇爬虫紧接上一篇关于 泸州老窖 的爬虫随笔:
import re
import json
def get_space_end(level):
    """Return the outline prefix for a leaf key.

    The prefix is ``level`` space characters followed by ``-``.
    """
    indent = " " * level
    return "{}-".format(indent)
def get_space_expand(level):
    """Return the outline prefix for a key whose value is a nested dict.

    The prefix is ``level`` space characters followed by ``+``.
    """
    indent = " " * level
    return "{}+".format(indent)
def _iter_key_lines(targets, level):
    """Yield one outline line per key of ``targets``, depth-first."""
    for key, value in targets.items():
        if isinstance(value, dict):
            # Expandable node: its own '+' line first, then its children
            # one level deeper, so the output reads top-down like a tree.
            yield get_space_expand(level) + key
            for line in _iter_key_lines(value, level + 1):
                yield line
        else:
            yield get_space_end(level) + key


def find_keys(targets, level):
    """Recursively dump the key outline of a nested dict.

    Every key of ``targets`` produces one line: keys whose value is a dict
    are prefixed via ``get_space_expand`` and followed by the lines of that
    nested dict at ``level + 1``; all other keys are prefixed via
    ``get_space_end``. Values that are not dicts (including lists) are not
    descended into.

    The lines are appended to ``keys.txt`` (UTF-8) and printed to stdout in
    the same order. Nothing is written or printed when ``targets`` is empty.

    :param targets: dict to walk; keys are expected to be strings.
    :param level: indentation depth used for the keys of ``targets``.
    :return: None
    """
    lines = list(_iter_key_lines(targets, level))
    if not lines:
        return
    # Open the output file once per call instead of once per key.
    with open("keys.txt", "a", encoding="utf-8") as file:
        file.writelines(line + "\n" for line in lines)
    print("\n".join(lines))
def main():
    """Extract the ``g_page_config`` JSON from ``items.txt`` and dump its keys.

    Reads ``items.txt`` (UTF-8), locates the ``g_page_config = ...;`` line,
    parses the captured text as JSON and passes the result to ``find_keys``
    starting at level 1.

    :raises ValueError: if no ``g_page_config = ...;`` line is found, or if
        the captured text is not valid JSON (``json.JSONDecodeError``).
    """
    with open("items.txt", "r", encoding="utf-8") as file1:
        page_source = file1.read()

    g_page_config = re.search(r"g_page_config = (.*?);\n", page_source)
    if g_page_config is None:
        # re.search returns None on no match; fail with a clear message
        # instead of an AttributeError on ``None.group``.
        raise ValueError("g_page_config assignment not found in items.txt")

    page_config_json = json.loads(g_page_config.group(1))
    find_keys(page_config_json, 1)
# Run the key dump only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()