diff --git a/README.md b/README.md index 3ed6723..4bfdbb2 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ The first step is to examine how different answers are referred to in the dataset. ``` -python3 intermediate_results/lat_frequency.json +make intermediate_results/lat_frequency.json ``` Next, we transform question from the QB format to look like the NQ diff --git a/compute_lat_frequency.py b/compute_lat_frequency.py index b58a20c..16ba2ab 100644 --- a/compute_lat_frequency.py +++ b/compute_lat_frequency.py @@ -46,7 +46,7 @@ def compute_lat_frequency(self, orig_qb_path: str, limit: int=-1) -> None: text = qb_data[i]['text'] lats = self.count_answer_types(Question(qid, page, text)) # Printing here could cause unicode conversion error ifpage is not pure ASCII - if i % 10000 == 0: + if i % 100 == 0: print("===> %i/%i: %s %s" % (i, len(qb_data), page, str(lats))) if limit > 0 and i > limit: break @@ -73,6 +73,9 @@ def write_most_freq_answer_type_for_qid(self, qanta_train_with_answer_type_path: page_to_most_freq_answer_type_dict[qb_data[i]['qanta_id']] = self.most_common(qb_data[i]['page']) #save the most freq answer type for each qid into dictionary + dir_name = os.path.dirname(output_file) + if not os.path.exists(dir_name): # create path if it doesn't exist + os.makedirs(dir_name) with open(output_file, 'w') as fp: json.dump(page_to_most_freq_answer_type_dict, fp, indent=2) diff --git a/quality_classifier.py b/quality_classifier.py index 911ae5c..2679292 100755 --- a/quality_classifier.py +++ b/quality_classifier.py @@ -350,7 +350,7 @@ def generate_feature_weight(self, model): feature_weight["BIAS"] = model.intercept_[0] return feature_weight - def save_dictionary(self, questions, file_path): + def save_dictionary(self, questions, file_path, model): #add argument "model" x = self.prepare_features(questions) results = model.predict_proba(x) @@ -394,7 +394,7 @@ def save_dictionary(self, questions, file_path): parser.add_argument('--nq_data', type=str, 
default='TriviaQuestion2NQ_Transform_Dataset/NaturalQuestions_train_reformatted.json') parser.add_argument('--nqlike_data', type=str, default='intermediate_results/nqlike_train.json') parser.add_argument('--max_term_features', type=int, default=50) - parser.add_argument('--seq', type=str, default='') + parser.add_argument('--seq', type=str, default='nq_like') args = parser.parse_args() # set flag and if_qb_last_sent here # 0 --wellformedness accuracy output @@ -422,5 +422,5 @@ def save_dictionary(self, questions, file_path): if args.predictions: # test print('Evaluate NQ-like') - c.save_dictionary(nq_like, prediction_path, model) + c.save_dictionary(nq_like, prediction_path, model) diff --git a/requirements.txt b/requirements.txt index 1c47b4f..00ab198 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ -spacy +spacy==2.1.0 neuralcoref matplotlib +scikit-learn