-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwordcount.py
executable file
·148 lines (131 loc) · 4.72 KB
/
wordcount.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#!/usr/bin/python -tt
# Copyright 2010 Google Inc.
# Licensed under the Apache License, Version 2.0
# http://www.apache.org/licenses/LICENSE-2.0
# Google's Python Class
# http://code.google.com/edu/languages/google-python-class/
"""Wordcount exercise
Google's Python class
The main() below is already defined and complete. It calls print_words()
and print_top() functions which you write.
"""
import sys
# +++your code here+++
# Define print_words(filename) and print_top(filename) functions.
# You could write a helper utility function that reads a file
# and builds and returns a word/count dict for it.
# Then print_words() and print_top() can just call the utility function.
''' 1. For the --count flag, implement a print_words(filename) function that counts
how often each word appears in the text and prints:
word1 count1
word2 count2
...
Print the above list in order sorted by word (python will sort punctuation to
come before letters -- that's fine). Store all the words as lowercase,
so 'The' and 'the' count as the same word. '''
# (old, new) substitutions applied, in this exact order, to the lowercased
# text. Order matters: '--' only becomes a space after the characters listed
# before it have been removed, and before the ones listed after it.
_SUBSTITUTIONS = (
    ('\n', ' '),
    ('"', ''),
    (',', ''),
    ("'", ''),
    ('.', ''),
    ('?', ''),
    (';', ''),
    ('--', ' '),
    ('!', ''),
    (':', ''),
    (')', ' '),
    ('(', ' '),
    ('`', ''),
)

# How many words print_top() prints at most.
_TOP_COUNT = 20


def _count_words(filename):
    """Return a dict mapping each word in the file to how often it occurs.

    The whole file is read as one string, lowercased (so 'The' and 'the'
    count as the same word), cleaned with _SUBSTITUTIONS and then split on
    whitespace with str.split().

    Args:
        filename: path of the text file to read.

    Returns:
        A dict of word -> occurrence count. Words are inserted in order of
        first appearance in the file.

    Raises:
        IOError/OSError: if the file cannot be opened or read.
    """
    # 'with' guarantees the handle is closed, even if read() raises.
    # The old 'rU' mode is rejected by Python 3.11+; plain text mode is
    # enough here because any stray '\r' is whitespace to split() below.
    with open(filename) as f:
        text = f.read().lower()

    for old, new in _SUBSTITUTIONS:
        text = text.replace(old, new)

    # Single pass over the words instead of re-scanning the rest of the
    # list for every new word.
    counts = {}
    for word in text.split():
        counts[word] = counts.get(word, 0) + 1
    return counts


def print_words(filename):
    """Print every distinct word in the file with its count, sorted by word.

    One 'word count' pair is printed per line. Words are compared as plain
    strings, so any punctuation that survives cleaning sorts before letters.

    Args:
        filename: path of the text file to analyse.
    """
    counts = _count_words(filename)
    for word in sorted(counts):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('%s %d' % (word, counts[word]))


''' 2. For the --topcount flag, implement a print_top(filename) which is similar
to print_words() but which prints just the top 20 most common words sorted
so the most common word is first, then the next most common, and so on.
Use str.split() (no arguments) to split on all whitespace. '''


def print_top(filename):
    """Print the most common words in the file, most frequent first.

    At most _TOP_COUNT (20) words are printed, one per line, without their
    counts. A file with fewer distinct words simply prints all of them.

    Args:
        filename: path of the text file to analyse.
    """
    counts = _count_words(filename)
    ranked = sorted(counts, key=counts.get, reverse=True)
    # Slicing never raises, unlike indexing ranked[i] for i in range(20),
    # which failed on files with fewer than 20 distinct words.
    for word in ranked[:_TOP_COUNT]:
        print(word)
###
# This basic command line argument parsing code is provided and
# calls the print_words() and print_top() functions which you must define.
def main():
    """Parse the command line and run the requested word-count report.

    Expects exactly two arguments after the script name: an option
    ('--count' or '--topcount') followed by a file name. '--count' calls
    print_words() and '--topcount' calls print_top() on that file.

    Raises:
        SystemExit: with status 1 when the argument count is wrong or the
            option is not recognised; a message is printed first.
    """
    if len(sys.argv) != 3:
        # Single-argument print(...) behaves the same on Python 2 and 3,
        # whereas the bare print statement is a SyntaxError on Python 3.
        print('usage: ./wordcount.py {--count | --topcount} file')
        sys.exit(1)

    option = sys.argv[1]
    filename = sys.argv[2]
    if option == '--count':
        print_words(filename)
    elif option == '--topcount':
        print_top(filename)
    else:
        print('unknown option: ' + option)
        sys.exit(1)


# Run main() only when executed as a script, not when imported.
if __name__ == '__main__':
    main()