Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 50 additions & 3 deletions frequency.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,45 @@
Project Gutenberg """

import string
from pattern.web import *
from pattern.en import tokenize
from operator import itemgetter, attrgetter, methodcaller

def get_word_list(file_name, start_marker='CHAPTER I', end_marker='THE END'):
    """ Reads the specified Project Gutenberg book and returns its words.

        Only the story text is kept: the lines strictly between the first
        line containing start_marker and the first later line containing
        end_marker. Punctuation characters are deleted, the text is
        converted to lower case and then split on whitespace.

        file_name: path of the plain-text book to read
        start_marker: text found on the line just before the story starts
        end_marker: text found on the line just after the story ends
        returns: a list of lower-case words in reading order
        raises: ValueError if either marker cannot be found
    """
    # Read everything up front; the with-block guarantees the handle is closed.
    with open(file_name, 'r') as book:
        lines = book.readlines()

    # Locate the first line holding the start marker.
    start = next(
        (i for i, line in enumerate(lines) if start_marker in line), None)
    if start is None:
        raise ValueError(
            'start marker %r not found in %s' % (start_marker, file_name))

    # Only look for the end marker after the start line, so an earlier
    # occurrence (e.g. in front matter) cannot produce an empty story.
    end = next(
        (i for i, line in enumerate(lines)
         if i > start and end_marker in line), None)
    if end is None:
        raise ValueError(
            'end marker %r not found after start marker in %s'
            % (end_marker, file_name))

    # Join the raw lines rather than calling str() on the list: the list's
    # repr contains literal "\n" escapes, and once the backslash is stripped
    # as punctuation the "n" ends up glued to the last word of each line.
    # The slice end is exclusive, so the end-marker line itself is left out
    # while the final line of the story is kept.
    text = ''.join(lines[start + 1:end]).lower()

    # Character filter works on both Python 2 and 3, unlike the two-argument
    # str.translate / string.maketrans("", "") form.
    punctuation = set(string.punctuation)
    text = ''.join(ch for ch in text if ch not in punctuation)

    # split() with no argument also discards leading/trailing whitespace.
    return text.split()


def get_top_n_words(word_list, n):
    """ Takes a list of words as input and returns the n most frequently
        occurring words together with how often each one occurs.

        word_list: a list of words (assumed to be lower case with no
                   punctuation)
        n: the number of entries to return
        returns: a list of at most n (word, count) tuples ordered from most
                 frequently to least frequently occurring
    """
    # Local import keeps this function self-contained.
    from collections import Counter

    # Counter tallies each word; most_common(n) ranks the tallies by count,
    # highest first, and keeps the first n of them.
    return Counter(word_list).most_common(n)

# Show the 100 most frequent words (with their counts) in the book file.
# Parenthesized single-argument print behaves the same on Python 2 and 3.
print(get_top_n_words(get_word_list('pg32325.txt'), 100))