Skip to content

Commit 4ba1cf4

Browse files
committed
Initial commit
0 parents  commit 4ba1cf4

File tree

352 files changed

+52938
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

352 files changed

+52938
-0
lines changed

Readme.txt

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
Instructions to run our project on cade:
2+
3+
1) We use Python 3.5 and the NLTK package.
4+
2) Please run the following command to install the additional NLTK data packages:
5+
/usr/local/stow/python/amd64_linux26/python-3.5.0/bin/python3 -m nltk.downloader -d /home/$USERNAME/nltk_data all
6+
7+
Replace $USERNAME with your cade username.
8+
9+
Open our project folder and go to the question-answers folder, which contains the rest of the files.
10+
11+
3) /usr/local/stow/python/amd64_linux26/python-3.5.0/bin/python3 question_answers.py ./developset/input.txt > Response.answer
12+
13+
After running the above command, the file Response.answer will be generated. This is our output file.
14+
15+
You can calculate the accuracy on this file. We obtained an accuracy of 25.89% on the development set (developset).

contributors.txt

+239
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,239 @@
1+
import nltk
2+
3+
from nltk.tokenize import word_tokenize
4+
from nltk.tokenize import sent_tokenize
5+
from nltk.corpus import stopwords
6+
import string
7+
from nltk.stem.lancaster import LancasterStemmer
8+
9+
10+
# Semantic-class word lists. They start empty and are filled in place by
# semantic_classes(), which appends one list of lines per file it reads.
name_list = []
location_list = []
month_list = []
time_list = []
occupation_list = []

# Stored as a real set: every use in this file is an `in` test, and
# stopwords.words() returns a list, which would make each test a linear scan.
stopwordSet = set(stopwords.words('english'))
# Extra tokens filtered out alongside string.punctuation: quote marks,
# ellipsis, double dash and contraction pieces.
morePunctuations = {'``', '"', '...', "''", "n't", "'re", "'s", "--"}
punctuationSet = set(string.punctuation) | morePunctuations
lancaster_stemmer = LancasterStemmer()
20+
21+
22+
def parse_story(story_filename):
    """Read one story file and split its body into sentences.

    The file content is split on the literal marker ``TEXT:``.  The first
    three lines of the part before the marker are taken verbatim as the
    headline, date and story id.  The part after the marker has its leading
    newlines stripped and remaining newlines turned into spaces before it is
    handed to ``sent_tokenize``.

    Returns a one-entry dict mapping ``(headline, date, storyid)`` to the
    list of sentences.  Raises IndexError when the header has fewer than
    three lines or the ``TEXT:`` marker is missing.
    """
    with open(story_filename) as story_file:
        sections = story_file.read().split("TEXT:")

    header_lines = sections[0].splitlines()
    story_key = (header_lines[0], header_lines[1], header_lines[2])
    body = sections[1].lstrip("\n").replace("\n", " ")
    return {story_key: sent_tokenize(body)}
33+
34+
35+
def removeStopWordsAndTagPOS(story_dict):
    """Filter and POS-tag every sentence found in *story_dict*.

    *story_dict* maps a story key to a list of sentences.  Each sentence is
    tokenized with ``word_tokenize``; tokens whose lower-cased form is in
    ``stopwordSet`` or ``punctuationSet`` are dropped (the surviving tokens
    keep their original case).

    Returns a pair of dicts, both keyed by the sentence string:
    the list of surviving tokens, and ``nltk.pos_tag`` applied to that list.
    A sentence that occurs more than once overwrites its earlier entry.
    """
    filtered_by_sentence = {}
    tagged_by_sentence = {}
    for sentences in story_dict.values():
        for sentence in sentences:
            kept_tokens = [
                token
                for token in word_tokenize(sentence)
                if token.lower() not in stopwordSet
                and token.lower() not in punctuationSet
            ]
            filtered_by_sentence[sentence] = kept_tokens
            tagged_by_sentence[sentence] = nltk.pos_tag(kept_tokens)
    return filtered_by_sentence, tagged_by_sentence
51+
52+
def camel(s):
    """Return True when *s* is neither all lower-case nor all upper-case.

    Strings without cased characters (empty, digits, punctuation) equal both
    their lower- and upper-cased forms and therefore give False.
    """
    if s == s.lower():
        return False
    return s != s.upper()
54+
55+
def contains_proper_noun(question):
    """Report whether *question* contains a mixed-case word found in name_list.

    The text is tokenized with ``word_tokenize``; tokens whose lower-cased
    form is a stop word or punctuation are ignored.  Every remaining token
    for which ``camel()`` is true is a candidate.  A candidate matches when
    ``candidate in entry`` holds for some entry of ``name_list``; with the
    list-of-lists that ``semantic_classes`` builds this is an exact
    membership test against the loaded names.

    Always returns a bool.  All candidates are checked, so a known name that
    follows an unknown capitalised word is still found, and text without any
    candidate gives False rather than None.
    """
    candidates = [
        word
        for word in word_tokenize(question)
        if word.lower() not in stopwordSet
        and word.lower() not in punctuationSet
        and camel(word)
    ]
    return any(
        candidate in names
        for candidate in candidates
        for names in name_list
    )
74+
75+
76+
def semantic_classes(name_filename):
    """Load the five semantic-class files into the module-level lists.

    *name_filename* is used as a plain string prefix: each file name is
    concatenated to it directly, so a directory must be passed with its
    trailing separator.

    For every file the lines are appended as ONE nested list to the matching
    global (``name_list``, ``location_list``, ``month_list``, ``time_list``,
    ``occupation_list``).  The month, time and occupation files are
    lower-cased first; names and locations keep their case.
    """
    # (file name, destination list, lower-case before splitting?)
    sources = (
        ("names.txt", name_list, False),
        ("location.txt", location_list, False),
        ("month.txt", month_list, True),
        ("time.txt", time_list, True),
        ("occupation.txt", occupation_list, True),
    )
    for suffix, destination, lowercase in sources:
        with open(name_filename + suffix) as handle:
            contents = handle.read()
        if lowercase:
            contents = contents.lower()
        destination.append(contents.splitlines())
93+
94+
95+
def contains_name_word(sent):
    """Return True when a content word of *sent* contains the text "name".

    Tokens come from ``word_tokenize``; those whose lower-cased form is a
    stop word or punctuation are skipped.  The check is a case-sensitive
    substring test on the original token, so "named" matches and "Name"
    does not.
    """
    for token in word_tokenize(sent):
        lowered = token.lower()
        if lowered in stopwordSet or lowered in punctuationSet:
            continue
        if "name" in token:
            return True
    return False
108+
109+
def contains_name_occupation(sent):
    """Return True when *sent* mentions a known name or a known occupation.

    Tokens come from ``word_tokenize``; those whose lower-cased form is a
    stop word or punctuation are ignored.

    * Name check: every remaining token for which ``camel()`` is true is
      tested with ``in`` against each entry of ``name_list``.
    * Occupation check: every remaining token is lower-cased and tested with
      ``in`` against each entry of ``occupation_list``.  The occupation file
      is lower-cased when loaded, so the token must be lower-cased too or a
      capitalised occupation (e.g. at the start of a sentence) can never
      match.

    Returns a bool.
    """
    content_words = [
        token
        for token in word_tokenize(sent)
        if token.lower() not in stopwordSet
        and token.lower() not in punctuationSet
    ]

    mentions_name = any(
        word in names
        for word in content_words
        if camel(word)
        for names in name_list
    )
    if mentions_name:
        return True

    return any(
        word.lower() in occupations
        for word in content_words
        for occupations in occupation_list
    )
133+
134+
135+
def who_rule(question, sent, storyPOS_dict):
    """Score how well *sent* answers a WHO *question*.

    The score starts from ``wordMatch(question, sent, storyPOS_dict)`` and
    is then increased by:

    * 6 when the question has no proper noun but the sentence has one;
    * 4 when the question has no proper noun and the sentence contains a
      word with "name" in it;
    * 4 when the sentence contains a known name or occupation.

    Returns the total score as a number, so callers can rank sentences.
    """
    # A wordMatch that reports nothing (None) counts as 0 instead of
    # breaking the addition below.
    score = wordMatch(question, sent, storyPOS_dict) or 0

    if not contains_proper_noun(question):
        if contains_proper_noun(sent):
            score += 6
        if contains_name_word(sent):
            score += 4

    if contains_name_occupation(sent):
        score += 4

    return score
147+
148+
def when_rule(question, sent):
    """Placeholder for the WHEN rule.

    Ignores both arguments and only writes an empty line to standard output.
    """
    print("")
150+
151+
def data_forward(questions_data, story_dict):
    """Run the matching rules for every question against every story.

    *questions_data* is a sequence of records whose element ``[1]`` is the
    question text.  *story_dict* maps a story key to its list of sentences.

    The story is stop-word filtered and POS-tagged once.  Then, per question
    and story, ``wordMatch`` is called with the whole sentence list and
    ``who_rule`` is called with each sentence.  The results of both calls
    are discarded here and nothing is returned.
    """
    _, storyPOS_dict = removeStopWordsAndTagPOS(story_dict)

    for question in questions_data:
        for text_list in story_dict.values():
            wordMatch(question[1], text_list, storyPOS_dict)
            for sent in text_list:
                who_rule(question[1], sent, storyPOS_dict)
160+
161+
def wordMatch(question, text, storyPOS_dict):
    """Score sentence(s) by stem overlap with *question*.

    The question is tokenized and each token reduced to its Lancaster stem.
    For a sentence, every ``(word, tag)`` pair stored in *storyPOS_dict*
    whose stem occurs among the question stems adds to the score: 6 points
    when the tag contains ``'V'``, 3 points otherwise.

    Parameters:
        question: the question text.
        text: one sentence (str), or an iterable of sentences.
        storyPOS_dict: maps a sentence to its list of ``(word, tag)`` pairs.
            Sentences missing from it score 0.

    Returns:
        An int.  For a single sentence, that sentence's score; for an
        iterable, the highest score among its sentences (0 when empty).
    """
    question_roots = {
        lancaster_stemmer.stem(word) for word in word_tokenize(question)
    }

    def score_sentence(sentence):
        # Sum the per-word contributions for one tagged sentence.
        score = 0
        for word, tag in storyPOS_dict.get(sentence, ()):
            if lancaster_stemmer.stem(word) in question_roots:
                score += 6 if 'V' in tag else 3
        return score

    if isinstance(text, str):
        return score_sentence(text)
    return max((score_sentence(sentence) for sentence in text), default=0)
184+
185+
186+
def main(input_path="/Users/roshaninagmote/Downloads/sample/",
         semantic_path="/Users/roshaninagmote/PycharmProjects/question-answers/"):
    """Load the input list and process every story with its questions.

    Parameters:
        input_path: string prefix for ``input.txt`` and for the ``.story``
            and ``.questions`` files; the names are concatenated to it.
        semantic_path: string prefix handed to ``semantic_classes``.

    ``input.txt`` is read line by line.  Its first line is not used here;
    every following line is a story name.  For each story the questions file
    is parsed into ``[id, question, third line]`` records (three consecutive
    non-blank lines per record), the story is parsed with ``parse_story``
    and both are passed to ``data_forward``.  A ``"\\n"`` is printed after
    each story.
    """
    with open(input_path + "/input.txt") as input_file:
        input_data = input_file.read().splitlines()

    semantic_classes(semantic_path)

    for story_name in input_data[1:]:
        story_filename = input_path + story_name + ".story"
        questions_filename = input_path + story_name + ".questions"

        with open(questions_filename) as questions_file:
            # Build a real list: under Python 3 filter() yields an iterator,
            # which supports neither len() nor indexing.
            question_lines = [
                line for line in questions_file.read().splitlines() if line
            ]

        questions_data = []
        for j in range(0, len(question_lines), 3):
            # Split on the first colon only, so a colon inside the question
            # text does not cut the question short.
            quesid = question_lines[j].split(":", 1)[1].lstrip(" ")
            ques = question_lines[j + 1].split(":", 1)[1].lstrip(" ")
            questions_data.append([quesid, ques, question_lines[j + 2]])

        story_dict = parse_story(story_filename)
        data_forward(questions_data, story_dict)
        print("\n")
223+
224+
def answer_without_question_word(question, text_list, storyPOS_dict, quest_words):
    """Pick and print an answer for a question that has no question word.

    Parameters:
        question: record whose ``[0]`` is the question id and ``[1]`` the
            question text.
        text_list: the story's sentences.
        storyPOS_dict: maps a sentence to its ``(word, tag)`` pairs; passed
            through to ``wordMatch``.
        quest_words: words that mark a question as handled elsewhere.

    Does nothing when the lower-cased question text contains any of
    *quest_words*.  Otherwise the sentence with the highest ``wordMatch``
    score is printed as the answer (the first one wins on ties; the answer
    is empty when no sentence scores above 0).
    """
    if any(word in question[1].lower() for word in quest_words):
        return

    # NOTE(review): looks like a leftover debug marker; it ends up in the
    # answer output. Kept from the original fragment; confirm before removing.
    print("roshani")

    max_score_else = 0
    answer = ""
    for sent in text_list:
        # Treat a missing score (None) as 0 so the comparison stays valid.
        current_score = wordMatch(question[1], sent, storyPOS_dict) or 0
        if current_score > max_score_else:
            max_score_else = current_score
            answer = sent

    print("QuestionID:", question[0])
    print("Answer:", answer)
    print("\n")


if __name__ == "__main__":
    main()

developset/1999-W02-5.answers

+30
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
QuestionID: 1999-W02-5-1
2+
Question: Where is South Queens Junior High School located?
3+
Answer: Liverpool, Nova Scotia | Canada
4+
Difficulty: Moderate
5+
6+
QuestionID: 1999-W02-5-2
7+
Question: Who is the principal of South Queens Junior High School?
8+
Answer: Betty Jean Aucoin
9+
Difficulty: Easy
10+
11+
QuestionID: 1999-W02-5-3
12+
Question: What has South Queens Junior High School done with its old metal shop?
13+
Answer: turned it into a fitness club
14+
Difficulty: Moderate
15+
16+
QuestionID: 1999-W02-5-5
17+
Question: Who runs the club?
18+
Answer: a non-profit society | school and community volunteers
19+
Difficulty: Moderate
20+
21+
QuestionID: 1999-W02-5-6
22+
Question: How big is the club?
23+
Answer: 12,000 square feet
24+
Difficulty: Easy
25+
26+
QuestionID: 1999-W02-5-7
27+
Question: If you were a student, how much would a club membership cost you?
28+
Answer: $135 a year
29+
Difficulty: Moderate
30+

developset/1999-W02-5.questions

+24
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
QuestionID: 1999-W02-5-1
2+
Question: Where is South Queens Junior High School located?
3+
Difficulty: Moderate
4+
5+
QuestionID: 1999-W02-5-2
6+
Question: Who is the principal of South Queens Junior High School?
7+
Difficulty: Easy
8+
9+
QuestionID: 1999-W02-5-3
10+
Question: What has South Queens Junior High School done with its old metal shop?
11+
Difficulty: Moderate
12+
13+
QuestionID: 1999-W02-5-5
14+
Question: Who runs the club?
15+
Difficulty: Moderate
16+
17+
QuestionID: 1999-W02-5-6
18+
Question: How big is the club?
19+
Difficulty: Easy
20+
21+
QuestionID: 1999-W02-5-7
22+
Question: If you were a student, how much would a club membership cost you?
23+
Difficulty: Moderate
24+

developset/1999-W02-5.story

+51
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
HEADLINE: Nova Scotia School Pumps Up
2+
DATE: January 8, 1999
3+
STORYID: 1999-W02-5
4+
5+
TEXT:
6+
7+
A middle school in Liverpool, Nova Scotia is pumping up bodies as well
8+
as minds.
9+
10+
It's an example of a school teaming up with the community to raise
11+
money. South Queens Junior High School is taking aim at the fitness
12+
market.
13+
14+
The school has turned its one-time metal shop - lost to budget cuts
15+
almost two years ago - into a money-making professional fitness club.
16+
The club will be open seven days a week.
17+
18+
The club, operated by a non-profit society made up of school and
19+
community volunteers, has sold more than 30 memberships and hired a
20+
full-time co-ordinator.
21+
22+
Principal Betty Jean Aucoin says the club is a first for a Nova Scotia
23+
public school. She says the school took it on itself to provide a
24+
service needed in Liverpool.
25+
26+
"We don't have any athletic facilities here on the South Shore of Nova
27+
Scotia, so if we don't use our schools, communities such as Queens are
28+
going to be struggling to get anything going," Aucoin said.
29+
30+
More than a $100,000 was raised through fund-raising and donations from
31+
government, Sport Nova Scotia, and two local companies.
32+
33+
Some people are wondering if the ties between the businesses and the
34+
school are too close. Schools are not set up to make profits or promote
35+
businesses.
36+
37+
Southwest Regional School Board superintendent Ann Jones says there's no
38+
fear the lines between education and business are blurring.
39+
40+
"First call on any school facility belongs to... the youngsters in the
41+
school," says Ann Jones.
42+
43+
The 12,000-square-foot club has seven aerobic machines, including
44+
treadmills, steppers, and stationary bicycles, as well as weight
45+
machines and freeweights.
46+
47+
Memberships cost $180 a year for adults and $135 for students and
48+
seniors.
49+
50+
Proceeds pay the salary of the centre co-ordinator and upkeep of the
51+
facility.

0 commit comments

Comments
 (0)