构造方法:按一定概率随机删除冠词和缩写后缀(如 a、the、's、'll),并按一定概率把易混淆词互相替换(如 than/then、their/there),从而由正确句子生成带噪声的输入。
def read_samples_by_string(self, path):
    """Yield ``(source, target)`` token-list pairs with noise injected into source.

    For every token sequence produced by ``self.read_tokens(path)``:

    * ``target`` is a list holding the tokens unchanged, in order.
    * ``source`` is a perturbed copy: a token found in
      ``MovieDialogReader.REPLACEMENTS`` is swapped for its mapped value with
      probability ``self.replacement_prob``; otherwise a token found in
      ``MovieDialogReader.DROPOUT_TOKENS`` is omitted with probability
      ``self.dropout_prob``; every other token is copied as is.

    Args:
        path: Passed straight through to ``self.read_tokens``
            (presumably a file path -- confirm against ``read_tokens``).

    Yields:
        A ``(source, target)`` tuple of lists, one per token sequence.
    """
    tables = MovieDialogReader
    for sentence in self.read_tokens(path):
        # The reference output is simply the untouched token sequence.
        clean = list(sentence)
        noisy = []
        for word in clean:
            # Evaluate the dropout draw before the replacement draw so the
            # sequence of random.random() calls is stable for a given seed.
            should_drop = (word in tables.DROPOUT_TOKENS
                           and random.random() < self.dropout_prob)
            should_swap = (word in tables.REPLACEMENTS
                           and random.random() < self.replacement_prob)
            if should_swap:
                # Replacement takes priority over dropout.
                noisy.append(tables.REPLACEMENTS[word])
                continue
            if should_drop:
                continue
            noisy.append(word)
        yield noisy, clean
# Tokens that may be randomly omitted from the noisy input: contraction
# suffixes and articles.
MovieDialogReader.DROPOUT_TOKENS = {
    "'ll",
    "'s",
    'a',
    'an',
    "'ve",
    "'m",
    'the',
}

# Commonly confused words; each key may be randomly swapped for its value
# in the noisy input.
MovieDialogReader.REPLACEMENTS = {
    'than': 'then',
    'their': 'there',
    'there': 'their',
    'then': 'than',
}
渔舟唱晚兮
发布了41 篇原创文章 · 获赞 2 · 访问量 1564
私信
关注