【python】入门学习（十）

2024-03-05 23:14:55

#入门学习系列的内容均是在学习《Python编程入门（第3版）》时的学习笔记

统计一个文本文档的信息，并输出出现频率最高的10个单词

#text.py

#保留的字符

# Characters that survive normalization: the 26 lowercase ASCII letters,
# space, hyphen and apostrophe.  Building the set from a single string
# avoids the missing-comma trap of the element-by-element literal, where
# two adjacent string literals ('p' 'q') are silently concatenated into
# one element ('pq') and both letters are lost from the set.
keep = set("abcdefghijklmnopqrstuvwxyz -'")

#将文本规范化

def normalize(s):
    """Return s lowercased, with every character not in ``keep`` removed."""
    # Lowercase first so that uppercase letters are tested against the
    # lowercase entries of ``keep``; characters outside the set are dropped.
    return ''.join(ch for ch in s.lower() if ch in keep)

#获取文本基本信息

def file_stats(fname):
    """Print the character, line and word counts of the file fname.

    Characters are counted on the raw text, lines as the number of
    newline characters, and words by splitting the normalized text on
    whitespace.  The function prints its report and returns None.
    """
    # The context manager closes the file even if reading fails; a bare
    # open(...).read() leaves the handle open until garbage collection.
    with open(fname, 'r') as f:
        s = f.read()
    num_chars = len(s)
    num_lines = s.count('\n')
    num_words = len(normalize(s).split())
    print("The file %s has:" % fname)
    print("  %s characters" % num_chars)
    print("  %s lines" % num_lines)
    print("  %s words" % num_words)

#将字符串转化为字典

def make_freq_dict(s):
    """Map each word of the normalized form of s to its occurrence count."""
    counts = {}
    for word in normalize(s).split():
        # A word seen for the first time starts from an implicit count of 0.
        counts[word] = counts.get(word, 0) + 1
    return counts

#获取文本基本信息，并输出出现频率最高的10个单词

def file_stats2(fname):
    """Print statistics for the file fname and its 10 most frequent words.

    Reports the number of characters, lines (newline characters) and
    words (as counted by make_freq_dict), then lists up to 10 words in
    descending order of frequency.  Returns None.
    """
    # The context manager closes the file even if reading fails; a bare
    # open(...).read() leaves the handle open until garbage collection.
    with open(fname, 'r') as f:
        s = f.read()
    num_chars = len(s)
    num_lines = s.count('\n')
    d = make_freq_dict(s)
    num_words = sum(d.values())
    # Descending (count, word) pairs; words with equal counts come out in
    # reverse alphabetical order, exactly as sort() + reverse() produced.
    lst = sorted(((count, word) for word, count in d.items()), reverse=True)
    print("The file %s has:" % fname)
    print("  %s characters" % num_chars)
    print("  %s lines" % num_lines)
    print("  %s words" % num_words)
    print("\nThe top 10 most frequent words are:")
    # Slice to 10 so the listing matches its heading; the slice used to be
    # lst[:99], which printed up to 99 entries.
    for i, (count, word) in enumerate(lst[:10], start=1):
        print('%2s. %4s %s' % (i, count, word))

>>> file_stats2('a.txt')

The file a.txt has:

  12927 characters

  297 lines

  1645 words

The top 10 most frequent words are:

 1.   62 to

 2.   62 the

 3.   47 is

 4.   42 a

 5.   41 of

 6.   40 it

 7.   36 that

 8.   35 and

 9.   32 as

10.   24 so

进一步完善的代码：

#text.py

#保留的字符

# Characters that survive normalization: the 26 lowercase ASCII letters,
# space, hyphen and apostrophe.  Building the set from a single string
# avoids the missing-comma trap of the element-by-element literal, where
# two adjacent string literals ('p' 'q') are silently concatenated into
# one element ('pq') and both letters are lost from the set.
keep = set("abcdefghijklmnopqrstuvwxyz -'")

#将文本规范化

def normalize(s):
    """Return s lowercased, with every character not in ``keep`` removed."""
    # Lowercase first so that uppercase letters are tested against the
    # lowercase entries of ``keep``; characters outside the set are dropped.
    kept_chars = [ch for ch in s.lower() if ch in keep]
    return ''.join(kept_chars)

#获取文本基本信息

def file_stats(fname):
    """Print the character, line and word counts of the file fname.

    Characters are counted on the raw text, lines as the number of
    newline characters, and words by splitting the normalized text on
    whitespace.  The function prints its report and returns None.
    """
    # The context manager closes the file even if reading fails; a bare
    # open(...).read() leaves the handle open until garbage collection.
    with open(fname, 'r') as f:
        s = f.read()
    num_chars = len(s)
    num_lines = s.count('\n')
    num_words = len(normalize(s).split())
    print("The file %s has:" % fname)
    print("  %s characters" % num_chars)
    print("  %s lines" % num_lines)
    print("  %s words" % num_words)

#将字符串转化为字典

def make_freq_dict(s):
    """Map each word of the normalized form of s to its occurrence count."""
    counts = {}
    for word in normalize(s).split():
        # A word seen for the first time starts from an implicit count of 0.
        counts[word] = counts.get(word, 0) + 1
    return counts

#获取文本的详细统计信息，并输出出现频率最高的10个单词

def file_stats2(fname):
    """Print extended statistics for fname and its 10 most frequent words.

    Reports the number of characters, lines (newline characters), total
    words, words that occur exactly once, distinct words, and the average
    length of the distinct words, then lists up to 10 words in descending
    order of frequency.  Returns None.
    """
    # The context manager closes the file even if reading fails; a bare
    # open(...).read() leaves the handle open until garbage collection.
    with open(fname, 'r') as f:
        s = f.read()
    num_chars = len(s)
    num_lines = s.count('\n')
    d = make_freq_dict(s)
    # Each key is one distinct word, so len(d) is the distinct-word count.
    num_different_words = len(d)
    num_words = sum(d.values())
    # Guard the division: a file with no words would otherwise raise
    # ZeroDivisionError.  The average is reported as 0.0 in that case.
    if num_different_words:
        words_average_length = sum(len(w) for w in d) / num_different_words
    else:
        words_average_length = 0.0
    num_once = sum(1 for count in d.values() if count == 1)
    # Descending (count, word) pairs; words with equal counts come out in
    # reverse alphabetical order, exactly as sort() + reverse() produced.
    lst = sorted(((count, word) for word, count in d.items()), reverse=True)
    print("The file %s has:" % fname)
    print("  %s characters" % num_chars)
    print("  %s lines" % num_lines)
    print("  %s words" % num_words)
    print("  %s words appreance one time" % num_once)
    print("  %s different words" % num_different_words)
    print("  %s average length" % words_average_length)
    print("\nThe top 10 most frequent words are:")
    for i, (count, word) in enumerate(lst[:10], start=1):
        print('%2s. %4s %s' % (i, count, word))

def main():
    """Print the statistics report for the sample file 'a.txt'."""
    sample_file = 'a.txt'
    file_stats2(sample_file)

# Run the report only when this file is executed as a script, not when
# it is imported as a module.
if '__main__' == __name__:
    main()

>>> ================================ RESTART ================================

>>>

The file a.txt has:

  12927 characters

  297 lines

  1645 words

  515 words appreance one time

  699 different words

  6.539341917024321 average length

The top 10 most frequent words are:

 1.   62 to

 2.   62 the

 3.   47 is

 4.   42 a

 5.   41 of

 6.   40 it

 7.   36 that

 8.   35 and

 9.   32 as

10.   24 so

码农公寓

相关文章