import re
from collections import Counter
from functools import reduce

import matplotlib.pyplot as pyplot
from nltk.corpus import brown
def load_brown_corpus():
    """Return the words of the Brown corpus as a list of lower-cased tokens.

    Every token yielded by ``brown.words()`` is lower-cased; tokens whose
    first character is not a regex word character (for example punctuation
    tokens) are discarded.

    Returns:
        list[str]: the remaining tokens, in corpus order.
    """
    # Raw string: '\W' in a normal literal is an invalid escape sequence.
    # ``match`` anchors at the start, so only the first character is tested.
    non_word_start = re.compile(r'\W')
    lowered = (token.lower() for token in brown.words())
    return [token for token in lowered if not non_word_start.match(token)]
def texte_to_dict(texte):
    """Count how many times each token occurs in ``texte``.

    Args:
        texte: iterable of hashable tokens (typically strings).

    Returns:
        dict: maps each distinct token to its number of occurrences. Keys
        appear in order of first occurrence in ``texte``.
    """
    # Convert back to a plain dict so that looking up an absent token still
    # raises KeyError instead of silently returning 0 as a Counter would.
    return dict(Counter(texte))
def dict_to_list(texte_dict):
    """Turn a frequency dict into a list sorted by decreasing frequency.

    Args:
        texte_dict: mapping ``word -> count``.

    Returns:
        list[list]: ``[count, word]`` pairs in descending order. Pairs are
        compared as lists, so words with equal counts come out in reverse
        lexicographic order.
    """
    return sorted(
        ([count, mot] for mot, count in texte_dict.items()),
        reverse=True,
    )
def afficher_n(texte_list, n):
    """Print the ``n`` most frequent words next to their Zipf estimate.

    For each of the first ``n`` entries a row is printed with the rank, the
    word, its observed frequency and the frequency predicted by Zipf's law
    (frequency of the top entry divided by the rank). A final line reports
    which share of all occurrences the printed words account for.

    Args:
        texte_list: list of ``[frequency, word]`` entries sorted by
            decreasing frequency (the shape returned by ``dict_to_list``).
        n: number of entries to print. Values larger than
            ``len(texte_list)`` are clamped to the available entries and
            negative values are treated as 0.

    Returns:
        None. Everything is written to standard output.
    """
    # Clamp instead of indexing blindly so a short (or empty) list cannot
    # raise IndexError.
    shown = texte_list[:max(n, 0)]
    top_freq = texte_list[0][0] if texte_list else 0

    print("rang\tmot\tfrequence\tfrequence(Zipf)")
    print("-"*50)
    cumul = 0
    for rang, entry in enumerate(shown, start=1):
        freq, mot = entry[0], entry[1]
        cumul += freq
        print("{}\t{}\t{}\t\t{:.0f}".format(rang, mot, freq, top_freq/rang))

    total = sum(entry[0] for entry in texte_list)
    # An empty corpus has no occurrences at all; report 0% rather than
    # dividing by zero.
    prop = cumul/total*100 if total else 0.0
    print("-"*50)
    print("Ces {} mots représentent le {:0.2f}% du corpus".format(len(shown), prop))
def plot_zipf(texte_list, log=False):
    """Plot observed word frequencies against the Zipf approximation.

    Two curves are drawn over the rank: the observed frequency of every
    entry, and the approximation obtained by dividing the frequency of the
    first entry by the rank (truncated to an integer).

    Args:
        texte_list: list of ``[frequency, word]`` entries sorted by
            decreasing frequency (the shape returned by ``dict_to_list``).
        log: when true, both axes use a logarithmic scale.

    Returns:
        None. The figure is displayed with ``pyplot.show()``.

    Note:
        The figure size is set through ``pyplot.rcParams``, which is a
        process-wide setting and therefore also affects later figures.
    """
    pyplot.rcParams['figure.figsize'] = [15, 10]

    # Use explicit 1-based ranks on the x axis. Relying on the implicit
    # 0-based positions would place rank 1 at x=0, which cannot be drawn on
    # a logarithmic axis.
    ranks = list(range(1, len(texte_list) + 1))
    observed = [entry[0] for entry in texte_list]
    top_freq = observed[0] if observed else 0
    zipf = [int(top_freq/rank) for rank in ranks]

    pyplot.plot(ranks, observed, "-", label="Réelle")
    pyplot.plot(ranks, zipf, "--", label="Approximation (Zipf)")
    if log:
        pyplot.yscale("log")
        pyplot.xscale("log")
    pyplot.legend()
    pyplot.title("Loi de Zipf (Brown Corpus)")
    pyplot.xlabel("Rang")
    pyplot.ylabel("Fréquence")
    pyplot.show()
# --- Script: Zipf's law on the Brown corpus --------------------------------

# Load the cleaned tokens and report corpus size (tokens) and vocabulary
# size (distinct types), followed by a preview of the first 50 tokens.
texte = load_brown_corpus()
print("Quantité des mots (tokens) :", len(texte))
print("Quantité des mots differentes (types) :", len(set(texte)))
print(texte[:50])

# Frequency table, spot-checked on a handful of words.
texte_dict = texte_to_dict(texte)
mots = ["the", "of", "and", "i"]
print("mot\tfrequence")
for mot in mots:
    print(mot, texte_dict[mot], sep="\t")

# Rank the vocabulary, print the 20 most frequent words and plot the first
# 135 ranks on linear axes.
texte_list = dict_to_list(texte_dict)
afficher_n(texte_list, 20)
plot_zipf(texte_list[:135], log=False)