Python文本处理与统计-CSDN博客

#Part I

# Iterate over a snapshot of the keys so that entries can be removed from
# counts without changing the size of the object being iterated over.
keys=list(counts.keys())
for key in keys:
    if key in thewords:
        counts.pop(key)

在遍历字典的过程中，如果没有下面这一行：

keys=list(counts.keys()) #把keys转换成list，不再是dict_keys视图

而是直接遍历字典，并在循环中删除或增加键，就会报错：RuntimeError: dictionary changed size during iteration
#del counts[key]也可以换成counts.pop(key)

#Part II

上述写法可以更为精简，参考： Python if 和 for 的多种写法

#源代码是Mooc的python课程的6.9的作业，

#上面切词的方式是用空格替换掉所有非字母数字字符，此代码用正则表达式来切词

import re
from collections import Counter
# Pre-compiled pattern matching one maximal run of lowercase ASCII letters.
re_word = re.compile(r'[a-z]+')


def get_words():
    """Read ``hamlet.txt`` and split it into lowercase words.

    The file is opened relative to the current working directory. Its
    content is lower-cased and every maximal run of the letters a-z is
    taken as one word, so digits, punctuation and whitespace all act as
    separators.

    The resulting list is also printed to stdout, prefixed with ``word:``.

    Returns:
        The words in order of appearance, duplicates included.
    """
    source_path = "hamlet.txt"
    with open(source_path, "r") as handle:
        lowered = handle.read().lower()
    tokens = re_word.findall(lowered)
    print('word:%s' % tokens)
    return tokens


#对每个单词出现的次数统计
def count_words(object):
    """Print the most frequent words of a word sequence, ignoring stop words.

    Two things are written to stdout: first the complete ranking as a list
    of ``(word, count)`` tuples, then one aligned line for each of the top
    ten entries. If fewer than ten distinct words remain after stop-word
    removal (including none at all), only those are listed.

    Args:
        object: Iterable of word strings. The name shadows the builtin but
            is kept so existing callers are not broken.

    Returns:
        None. The result is only printed.
    """
    # A set gives O(1) membership tests while filtering.
    stop_words = {
        'the', 'a', 'to', 'of', 'i', 'in', 'and',
        'you', 'your', 'it', 'that', 'is',
    }
    counts = Counter(word for word in object if word not in stop_words)

    # most_common() sorts by count, highest first; words with equal counts
    # stay in the order they were first seen.
    ranked = counts.most_common()
    print(ranked)

    # Slicing never raises, unlike indexing a fixed range(10).
    for word, count in ranked[:10]:
        print("{0:<10}{1:>5}".format(word, count))


if __name__ == '__main__':
    # Script entry point: tokenize hamlet.txt, then print its word ranking.
    count_words(get_words())

以上代码可以用来做 GitHub 上的第 0006 题