9 changes: 5 additions & 4 deletions probing/data_former.py
@@ -24,7 +24,7 @@ def __init__(
         self.shuffle = shuffle
         self.data_path = get_probe_task_path(probe_task, data_path)

-        self.samples, self.unique_labels = self.form_data(sep=sep)
+        self.samples, self.unique_labels, self.num_words = self.form_data(sep=sep)

     def __len__(self):
         return len(self.samples)
@@ -48,8 +48,9 @@ def form_data(
         samples_dict = defaultdict(list)
         unique_labels = set()
         dataset = pd.read_csv(self.data_path, sep=sep, header=None, dtype=str)
-        for _, (stage, label, text) in dataset.iterrows():
-            samples_dict[stage].append((text, label))
+        for _, (stage, label, word_indices, text) in dataset.iterrows():
+            num_words = len(word_indices.split(","))
+            samples_dict[stage].append((text, label, word_indices))
             unique_labels.add(label)

         if self.shuffle:
@@ -58,7 +59,7 @@
             }
         else:
             samples_dict = {k: np.array(v) for k, v in samples_dict.items()}
-        return samples_dict, unique_labels
+        return samples_dict, unique_labels, num_words


class EncodedVectorFormer(Dataset):
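Review note: a minimal sketch of the four-column task file that the updated `TextFormer.form_data` appears to expect (stage, label, comma-separated word indices, text). The file name and sample values below are invented for illustration only.

```python
import pandas as pd

# Hypothetical probing-task file (tab-separated, no header) in the new layout:
# stage, label, word_indices, text.
rows = [
    ("tr", "Sing", "0,2", "The cat sleeps"),
    ("va", "Plur", "1,3", "All the cats sleep"),
]
pd.DataFrame(rows).to_csv("toy_task.tsv", sep="\t", header=False, index=False)

# form_data-style parsing: each sample keeps its word indices, and num_words is
# derived from the length of the comma-separated index list.
dataset = pd.read_csv("toy_task.tsv", sep="\t", header=None, dtype=str)
for _, (stage, label, word_indices, text) in dataset.iterrows():
    num_words = len(word_indices.split(","))
    print(stage, label, word_indices.split(","), num_words, text)
```

One thing worth confirming: `num_words` keeps only the value from the last row read, so this assumes every sample in a task file lists the same number of word indices.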
19 changes: 11 additions & 8 deletions probing/ud_filter/filtering_probing.py
@@ -41,7 +41,7 @@ def __init__(self, shuffle: bool = True):
         self.classes: Dict[
             str, Tuple[Dict[str, Dict[str, Any]], Dict[Tuple[str, str], Dict[str, Any]]]
         ] = {}
-        self.probing_dict: Dict[str, List[str]] = {}
+        self.probing_dict: Dict[str, List[Tuple[str, List[int]]]] = {}
         self.parts_data: Dict[str, List[List[str]]] = {}

     def upload_files(
@@ -74,11 +74,13 @@ def upload_files(
         self.language = extract_lang_from_udfile_path(self.paths[0], language=language)
         self.sentences = parse(conllu_data)

-    def _filter_conllu(self, class_label: str) -> Tuple[List[str], List[str]]:
+    def _filter_conllu(
+        self, class_label: str
+    ) -> Tuple[List[Tuple[str, List[int]]], List[Tuple[str, List[int]]]]:
         """Filters sentences by class's query and saves the result to the relevant fields"""

-        matching = []
-        not_matching = []
+        matching: List[Tuple[str, List[int]]] = []
+        not_matching: List[Tuple[str, List[int]]] = []

         node_pattern = self.classes[class_label][0]
         constraints = self.classes[class_label][1]
@@ -91,10 +93,11 @@ def _filter_conllu(self, class_label: str) -> Tuple[List[str], List[str]]:
         for sentence in self.sentences:
             sf = SentenceFilter(sentence)
             tokenized_sentence = " ".join(wordpunct_tokenize(sentence.metadata["text"]))
-            if sf.filter_sentence(node_pattern, constraints):
-                matching.append(tokenized_sentence)
+            filter_result = sf.filter_sentence(node_pattern, constraints)
+            if filter_result is not None:
+                matching.append((tokenized_sentence, filter_result))
             else:
-                not_matching.append(tokenized_sentence)
+                not_matching.append((tokenized_sentence, []))
         return matching, not_matching

     def filter_and_convert(
@@ -128,7 +131,7 @@ def filter_and_convert(
             matching, not_matching = self._filter_conllu(label)
             self.probing_dict[label] = matching
             if len(self.classes) == 1:
-                self.probing_dict["not_" + list(self.classes.keys())[0]] = not_matching
+                self.probing_dict["not_" + label] = not_matching
         self.probing_dict = delete_duplicates(self.probing_dict)

         self.parts_data = subsamples_split(
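Review note: a minimal sketch (with a stand-in for `SentenceFilter.filter_sentence`, not the real implementation) of the contract `_filter_conllu` now relies on: `None` means "no match", while a list of matched token indices is stored alongside the positive sentence, and negatives carry an empty index list. Note that with the `is not None` check, an empty list would now count as a match.

```python
from typing import List, Optional, Tuple

def filter_sentence_stub(tokens: List[str]) -> Optional[List[int]]:
    # Stand-in for SentenceFilter.filter_sentence: return the indices of tokens
    # that satisfy a (hypothetical) query, or None when nothing matches.
    hits = [i for i, tok in enumerate(tokens) if tok.istitle()]
    return hits or None

matching: List[Tuple[str, List[int]]] = []
not_matching: List[Tuple[str, List[int]]] = []

for sentence in ["Mary met John", "they met again"]:
    result = filter_sentence_stub(sentence.split())
    if result is not None:
        matching.append((sentence, result))      # sentence plus matched word indices
    else:
        not_matching.append((sentence, []))      # negative class keeps an empty index list

print(matching)      # [('Mary met John', [0, 2])]
print(not_matching)  # [('they met again', [])]
```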