Merge pull request #93 from Sandrro/dev

Dev
Sandrro · Sep 19, 2024 · d1c478b
2 parents 03ac0d5 + 3e7a434
commit d1c478b
Show file tree

Hide file tree

Showing 16 changed files with 145 additions and 201 deletions.
diff --git a/.github/workflows/2dev_ci_on_pr.yaml b/.github/workflows/2dev_ci_on_pr.yaml
@@ -3,10 +3,10 @@ name: Tests
 on:
   push:
       branches:
-        - dev
+        - "*"
   pull_request:
       branches:
-        - dev
+        - master
 
 jobs:
   test:

diff --git a/README.md b/README.md
@@ -1,9 +1,9 @@
 # SLOYKA
 [![Project Status: Active – The project has reached a stable, usable state and is being actively developed.](https://www.repostatus.org/badges/latest/active.svg)](https://www.repostatus.org/#active)
-[![Documentation Status](https://readthedocs.org/projects/soika/badge/?version=latest)](https://soika.readthedocs.io/en/latest/?badge=latest)
+[![Documentation Status](https://readthedocs.org/projects/sloyka/badge/?version=latest)](https://sloyka.readthedocs.io/ru/latest/?badge=latest)
 [![PythonVersion](https://img.shields.io/badge/python-3.11-blue)](https://pypi.org/project/scikit-learn/)
 [![Black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
-[![Tests](https://github.com/GeorgeKontsevik/sloyka/.github/workflows/2dev_ci_on_pr.yaml/badge.svg?branch=dev)](https://github.com/GeorgeKontsevik/sloyka/.github/workflows/2dev_ci_on_pr.yaml)
+[![Tests](https://github.com/GeorgeKontsevik/sloyka/actions/workflows/2dev_ci_on_pr.yaml/badge.svg?branch=dev)](https://github.com/GeorgeKontsevik/sloyka/actions/workflows/2dev_ci_on_pr.yaml)
 
 [![sloyka_community_chat](https://img.shields.io/badge/-community-blue?logo=telegram)](https://t.me/sloyka_community)
 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1wCUJAqlq9GMKw1wpTsWrzYwr10pWDeHv?usp=sharing)
@@ -38,5 +38,15 @@ [email protected] (Александр Антонов, Project Lead)
 [email protected] (just in case).
 
 ## Цитирование
-Antonov, A., Gornova, G., Kontsevik, G., Turkov, L., Vorona, V., & Mityagin, S. (2024, July). Transformation of Local Communities from Neighborhoods to Urban Commons in the Production of Social Representations of Space. In International Conference on Computational Science and Its Applications (pp. 436-447). Cham: Springer Nature Switzerland.
+
+``` bibtex
+@inproceedings{antonov2024transformation,
+  title={Transformation of Local Communities from Neighborhoods to Urban Commons in the Production of Social Representations of Space},
+  author={Antonov, Aleksandr and Gornova, Galina and Kontsevik, Georgii and Turkov, Leonid and Vorona, Vladimir and Mityagin, Sergey},
+  booktitle={International Conference on Computational Science and Its Applications},
+  pages={436--447},
+  year={2024},
+  organization={Springer}
+}
+```
 ---
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -12,7 +12,8 @@
 project = 'sloyka'
 copyright = '2024, itmo_idu'
 author = 'itmo_idu'
-release = '0.1.0'
+version = '0.2'
+release = '0.2.1'
 
 # -- General configuration ---------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "sloyka"
-version = "0.1.8"
+version = "0.2.1"
 description = "Library for city identity analysis from social media posts and comments"
 authors = ["sandrro, georgekontsevik"]
 readme = "README.md"

diff --git a/sloyka/src/risks/emotion_classifier.py b/sloyka/src/risks/emotion_classifier.py
@@ -34,7 +34,7 @@ class EmotionRecognizer:
     - text_column: The name of the column containing the text to be analyzed.
     """
 
-    def __init__(self, model_name=HuggingFaceModel.Text.Bert_Large, device='cpu'):
+    def __init__(self, model_name=HuggingFaceModel.Text.Bert_Tiny, device='cpu'):
         self.device = device
         self.model_name = model_name
 
@@ -46,11 +46,11 @@ def __init__(self, model_name=HuggingFaceModel.Text.Bert_Large, device='cpu'):
             HuggingFaceModel.Text.Bert_Tiny2,
         ]
 
-        self.recognizer = None
-
-    def init_base_recognizer(self):
         self.recognizer = TextRecognizer(model=self.model_name, device=self.device)
 
+    #def init_base_recognizer(self):
+    #    self.recognizer = TextRecognizer(model=self.model_name, device=self.device)
+
 
     def recognize_emotion(self, text):
         """

diff --git a/sloyka/src/risks/regional_activity.py b/sloyka/src/risks/regional_activity.py
@@ -99,7 +99,7 @@ def run_sloyka_modules(self) -> gpd.GeoDataFrame:
             .to_list()
         )
 
-        processed_geodata.dropna(subset=["text"], inplace=True)
+        processed_geodata.dropna(subset=[self.text], inplace=True)
         processed_geodata = City_services().run(
             df=processed_geodata, text_column=self.text
         )

diff --git a/sloyka/src/semantic_graph/keyword_extracter.py b/sloyka/src/semantic_graph/keyword_extracter.py
@@ -97,7 +97,7 @@ def extract_keywords(
             texts_to_add = []
 
             for j, text in zip(ids_text_to_extract, texts_to_extract):
-                extraction = self.model.extract_keywords(text, top_n=top_n, stop_words=RUS_STOPWORDS)
+                extraction = KeyBERT().extract_keywords(docs=text, top_n=top_n, stop_words=RUS_STOPWORDS)
                 if extraction:
                     score = extraction[0][1]
                     if score > semantic_key_filter:

diff --git a/sloyka/src/utils/data_getter/vk_data_getter.py b/sloyka/src/utils/data_getter/vk_data_getter.py
@@ -244,11 +244,15 @@ def run_comments(domain, post_ids, access_token):
         for post_id in tqdm(post_ids):
             comments = VKParser().get_comments(owner_id, post_id, access_token)
             all_comments.extend(comments)
-        df = VKParser.comments_to_dataframe(all_comments)
-        df["type"] = "comment"
-        df = df.reset_index(drop=True)
-        print("comments downloaded")
-        return df
+        if len(all_comments) > 0:
+            df = VKParser.comments_to_dataframe(all_comments)
+            df["type"] = "comment"
+            df = df.reset_index(drop=True)
+            print("comments downloaded")
+            return df
+        else:
+            print("no comments")
+            return None
 
     @staticmethod
     def run_parser(domain, access_token, cutoff_date, number_of_messages=float("inf"), step=100):
@@ -267,17 +271,20 @@ def run_parser(domain, access_token, cutoff_date, number_of_messages=float("inf"
         post_ids = df_posts["id"].tolist()
 
         df_comments = VKParser.run_comments(domain=owner_id, post_ids=post_ids, access_token=access_token)
-        df_comments.loc[df_comments["parents_stack"].apply(lambda x: len(x) > 0), "type"] = "reply"
-        for i in range(len(df_comments)):
-            tmp = df_comments["parents_stack"].iloc[i]
-            if tmp is not None:
-                if len(tmp) > 0:
-                    df_comments["parents_stack"].iloc[i] = tmp[0]
-                else:
-                    df_comments["parents_stack"].iloc[i] = None
-
-        df_combined = df_comments.join(df_posts, on="post_id", rsuffix="_post")
-        df_combined = pd.concat([df_posts, df_comments], ignore_index=True)
+        if df_comments is not None:
+            df_comments.loc[df_comments["parents_stack"].apply(lambda x: len(x) > 0), "type"] = "reply"
+            for i in range(len(df_comments)):
+                tmp = df_comments["parents_stack"].iloc[i]
+                if tmp is not None:
+                    if len(tmp) > 0:
+                        df_comments["parents_stack"].iloc[i] = tmp[0]
+                    else:
+                        df_comments["parents_stack"].iloc[i] = None
+
+            df_combined = df_comments.join(df_posts, on="post_id", rsuffix="_post")
+            df_combined = pd.concat([df_posts, df_comments], ignore_index=True)
+        else:
+            df_combined = df_posts
         df_group_name = VKParser.get_group_name(domain, access_token)
         df_combined["group_name"] = df_group_name["group_name"][0]
 

diff --git a/tests/test_area_matcher.py b/tests/test_area_matcher.py
diff --git a/tests/test_classifiers.py b/tests/test_classifiers.py
@@ -1,40 +1,24 @@
-# import pytest
-# import torch
-# import pandas as pd
-# from sloyka import TextClassifiers
+import pytest
+import torch
+import pandas as pd
+from sloyka import TextClassifiers
 
-# path_to_file = "sloyka/sample_data/raw/Адмиралтейский.csv"
+@pytest.fixture
+def sample_dataframe():
+    s_data = {'Текст комментария': {203: 'На Чайковского 63 тоже идет кап.ремонт. В квартире у пенсионеров побили стекла. Куда им обратиться?',
+    204: 'Вся улица Жуковского и Восстания заклеена рекламой! Почему не действует полиция и администрация с ЖСК-1 ?'},
+    'message_id': {203: 195, 204: 196}}
+    return pd.DataFrame(s_data)
 
-# @pytest.fixture
-# def test_data():
-#     df_predict = pd.read_csv(path_to_file, sep=";")
-#     df_predict.rename(columns={"Текст комментария": "Текст"}, inplace=True)
-#     df_predict = df_predict.dropna(subset=["Текст"])
-#     df_predict = df_predict.head(3)
-#     return df_predict
+@pytest.fixture
+def model():
+    return TextClassifiers(
+        repository_id="Sandrro/text_to_function_v2",
+        number_of_categories=1,
+        device_type=torch.device("cpu"),
+    )
 
-# @pytest.fixture
-# def model():
-#     return TextClassifiers(
-#         repository_id="Sandrro/text_to_subfunction_v10",
-#         number_of_categories=1,
-#         device_type=torch.device("cpu"),
-#     )
-
-# def test_cats_probs(model, test_data):
-#     expected_df = pd.DataFrame(
-#         {
-#             "cats": [
-#                 "Вопросы граждан о проектах/планах/сроках/ходе проведения работ по благоустройству",
-#                 "Не ЦУР",
-#                 "Вопросы по оплате проезда в общественном транспорте",
-#             ],
-#             "probs": ["1.0", "0.999", "0.98"],
-#         }
-#     )
-
-#     test_data[["cats", "probs"]] = pd.DataFrame(
-#         test_data["Текст"].progress_map(lambda x: model.run_text_classifier_topics(x)).to_list()
-#     )
-#     assert test_data["cats"].equals(expected_df["cats"])
-#     assert test_data["probs"].equals(expected_df["probs"])
+def test_cats_probs(model, sample_dataframe):
+    sample_dataframe[["cats", "probs"]] = sample_dataframe["Текст комментария"].progress_map(lambda x: model.run_text_classifier(x)).to_list()
+    print(sample_dataframe)
+    assert sample_dataframe.iloc[0]["cats"] == "ЖКХ"
diff --git a/tests/test_emotion_classifier.py b/tests/test_emotion_classifier.py
@@ -0,0 +1,20 @@
+import pytest
+import torch
+import pandas as pd
+from sloyka import EmotionRecognizer
+
+@pytest.fixture
+def sample_dataframe():
+    s_data = {'Текст комментария': {203: 'На Чайковского 63 тоже идет кап.ремонт. В квартире у пенсионеров побили стекла. Куда им обратиться?',
+    204: 'Вся улица Жуковского и Восстания заклеена рекламой! Почему не действует полиция и администрация с ЖСК-1 ?'},
+    'message_id': {203: 195, 204: 196}}
+    return pd.DataFrame(s_data)
+
+@pytest.fixture
+def model():
+    return EmotionRecognizer()
+
+def test_emotion_recognizer(model, sample_dataframe):
+    sample_dataframe["emotion"] = sample_dataframe["Текст комментария"].progress_map(lambda x: model.recognize_emotion(x))
+    print(sample_dataframe)
+    assert sample_dataframe.iloc[0]["emotion"] == "neutral"
diff --git a/tests/test_events_modelling.py b/tests/test_events_modelling.py
diff --git a/tests/test_geocoder.py b/tests/test_geocoder.py
@@ -4,20 +4,19 @@
 
 @pytest.fixture
 def sample_dataframe():
-    s_data = {
-        "text": [
-            "На биржевой 15 снова шумят!!"
-        ]
-    }
+    s_data = {'Текст комментария': {203: 'На Чайковского 63 тоже идет кап.ремонт. В квартире у пенсионеров побили стекла. Куда им обратиться?',
+    204: 'Вся улица Жуковского и Восстания заклеена рекламой! Почему не действует полиция и администрация с ЖСК-1 ?'},
+    'message_id': {203: 195, 204: 196}}
     return pd.DataFrame(s_data)
 
 
 def test_run_function(sample_dataframe):
-    instance = Geocoder(osm_id=337422, city_tags = { "place": ["state"] })
+    osm_id = 337422 # Saint Petersburg
+    geocoder = Geocoder(df=sample_dataframe, osm_id=osm_id, city_tags={'place':['state']}, text_column_name='Текст комментария')
 
-    result_df = instance.run(df=sample_dataframe, group_column=None)
+    result = geocoder.run(group_column=None)
 
-    print(result_df[['Street', 'Numbers']])
+    print(result[['Street', 'Numbers']])
 
-    assert result_df.loc[0, "Street"] == "биржевой"
-    assert result_df.loc[0, "Numbers"] == "15"
+    assert result.loc[0, "Street"] == "чайковского"
+    assert result.loc[0, "Numbers"] == "63"
diff --git a/tests/test_geocoder_matcher.py b/tests/test_geocoder_matcher.py
-Original file line number
+Diff line change
@@ Expand Up / @@ -3,10 +3,10 @@ name: Tests @@
     on:
       push:
           branches:
-            - dev
+            - "*"
       pull_request:
           branches:
-            - dev
+            - master
     jobs:
       test:
@@ Expand Down @@