"""
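Using the dict.setdefault method:
d.setdefault(k, [default]):
If k is in d, return d[k]. Otherwise set d[k] = default and then return default.
Example: record the position of every occurrence of each word in a text.
Format: {'word': [(line_no, column_no), (line_no, column_no)]}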
"""
import re
import sys
# Build and print an index of where every word occurs in ./zen.txt.
#
# re.compile returns a Pattern object; it is compiled once here and reused
# for every line. For str patterns, \w matches Unicode word characters
# (letters, digits and underscore), not only ASCII [A-Za-z0-9_].
WORD_RE = re.compile(r'\w+')

# Maps each word to the list of (line_no, column_no) positions where it occurs.
index = {}
with open('./zen.txt', encoding='utf-8') as fp:
    for line_no, line in enumerate(fp, 1):  # line numbers are 1-based
        for match in WORD_RE.finditer(line):  # every word on this line
            word = match.group()
            # start() is the 0-based offset of the word within the line;
            # add 1 to report a 1-based column.
            column_no = match.start() + 1
            location = (line_no, column_no)
            # setdefault returns the list stored under `word`, inserting an
            # empty list first when the key is missing, so one call replaces
            # the get / append / assign-back sequence.
            index.setdefault(word, []).append(location)

print(index)
# Sort case-insensitively so that e.g. "better" is listed before "Explicit".
for word in sorted(index, key=str.upper):
    print(word, index[word])
zen.txt:
The Zen of Python, by Tim Peters

Beautiful is better than ugly.
Explicit is better than implicit.
输出:
{'The': [(1, 1)], 'Zen': [(1, 5)], 'of': [(1, 9)], 'Python': [(1, 12)], 'by': [(1, 20)], 'Tim': [(1, 23)], 'Peters': [(1, 27)], 'Beautiful': [(3, 1)], 'is': [(3, 11), (4, 10)], 'better': [(3, 14), (4, 13)], 'than': [(3, 21), (4, 20)], 'ugly': [(3, 26)], 'Explicit': [(4, 1)], 'implicit': [(4, 25)]}
Beautiful [(3, 1)]
better [(3, 14), (4, 13)]
by [(1, 20)]
Explicit [(4, 1)]
implicit [(4, 25)]
is [(3, 11), (4, 10)]
of [(1, 9)]
Peters [(1, 27)]
Python [(1, 12)]
than [(3, 21), (4, 20)]
The [(1, 1)]
Tim [(1, 23)]
ugly [(3, 26)]
Zen [(1, 5)]
Process finished with exit code 0