-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_helper.py
82 lines (60 loc) · 2.14 KB
/
data_helper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import re
def read_file(fname):
    """
    Return the entire contents of a file as Latin-1 decoded text.

    The file is opened in binary mode and decoded explicitly, so the
    result does not depend on the platform's default text encoding.

    :param fname: path of the file to read.
    :return: str: the decoded contents of the file.
    """
    with open(fname, "rb") as handle:
        return handle.read().decode("latin1")
def get_score(review):
    """
    Pull the overall rating out of a single review.

    Searches for the first ``Overall = <digit>`` field, where the digit
    is between 1 and 5, and converts that digit to an integer.

    :param review: All text associated with the review.
    :return: int: score --- the score of the review
    :raises AttributeError: if the review contains no matching Overall
        field (the failed search yields None, which has no ``group``).
    """
    overall_match = re.search(r'Overall = ([1-5])', review)
    return int(overall_match.group(1))
def get_text(review):
    """
    Pull the free-text description out of a single restaurant review.

    Searches for the first ``Text = "..."`` field. The capture is greedy
    and stays on one line, so it runs up to the last double quote on the
    line where the field starts.

    :param review: All text associated with the review.
    :return: str: text -- the textual description part of the restaurant review.
    :raises AttributeError: if the review contains no matching Text field
        (the failed search yields None, which has no ``group``).
    """
    text_match = re.search(r'Text = "(.*)"', review)
    return text_match.group(1)
def get_reviews(raw_data):
    """
    Split the restaurant review data into positive and negative texts.

    The raw data is cut into individual reviews wherever a period is
    directly followed by a newline. For each review the overall score and
    the descriptive text are extracted with get_score() and get_text().

    A positive review has an overall score of at least 3; negative
    reviews have scores less than 3. Every parsed review therefore lands
    in exactly one of the two lists.

    :param raw_data: str: the complete contents of the review data file.
    :return: tuple: (positive_texts, negative_texts) -- two lists holding
        the descriptive text of each positive / negative review, in the
        order the reviews appear in raw_data.
    :raises AttributeError: propagated from get_score() / get_text() when
        a non-blank review lacks an Overall or Text field.
    """
    positive_texts = []
    negative_texts = []
    for review in re.split(r'\.\n', raw_data):
        # Splitting leaves a blank chunk after the final separator (and
        # for empty input); there is nothing to parse in those.
        if not review.strip():
            continue
        overall_score = get_score(review)
        review_text = get_text(review)
        # "At least 3" is positive: a score of exactly 3 must not be dropped.
        if overall_score >= 3:
            positive_texts.append(review_text)
        else:
            negative_texts.append(review_text)
    return positive_texts, negative_texts
def test_main():
    """
    Smoke-test the pipeline: load the training data, split it into
    positive and negative review texts, and print both lists.
    """
    # The original author also listed "restaurant-help.data" as an
    # alternative input file.
    positives, negatives = get_reviews(read_file("restaurant-training.data"))
    print( positives )
    print( negatives )
# Run the smoke test only when executed as a script, not when imported.
if __name__ == "__main__":
    test_main()