diff --git a/.github/workflows/2dev_ci_on_pr.yaml b/.github/workflows/2dev_ci_on_pr.yaml
index 86dcbcb..c534df2 100644
--- a/.github/workflows/2dev_ci_on_pr.yaml
+++ b/.github/workflows/2dev_ci_on_pr.yaml
@@ -3,10 +3,10 @@ name: Tests
 on:
   push:
     branches:
-      - dev
+      - "*"
   pull_request:
     branches:
-      - dev
+      - master
 
 jobs:
   test:
diff --git a/README.md b/README.md
index 406dba7..969fbbc 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,9 @@
 # SLOYKA
 
 [![Project Status: Active – The project has reached a stable, usable state and is being actively developed.](https://www.repostatus.org/badges/latest/active.svg)](https://www.repostatus.org/#active)
-[![Documentation Status](https://readthedocs.org/projects/soika/badge/?version=latest)](https://soika.readthedocs.io/en/latest/?badge=latest)
+[![Documentation Status](https://readthedocs.org/projects/sloyka/badge/?version=latest)](https://sloyka.readthedocs.io/ru/latest/?badge=latest)
 [![PythonVersion](https://img.shields.io/badge/python-3.11-blue)](https://pypi.org/project/scikit-learn/)
 [![Black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
-[![Tests](https://github.com/GeorgeKontsevik/sloyka/.github/workflows/2dev_ci_on_pr.yaml/badge.svg?branch=dev)](https://github.com/GeorgeKontsevik/sloyka/.github/workflows/2dev_ci_on_pr.yaml)
+[![Tests](https://github.com/GeorgeKontsevik/sloyka/actions/workflows/2dev_ci_on_pr.yaml/badge.svg?branch=dev)](https://github.com/GeorgeKontsevik/sloyka/actions/workflows/2dev_ci_on_pr.yaml)
 [![sloyka_community_chat](https://img.shields.io/badge/-community-blue?logo=telegram)](https://t.me/sloyka_community)
 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1wCUJAqlq9GMKw1wpTsWrzYwr10pWDeHv?usp=sharing)
@@ -38,5 +38,15 @@ asantonov@itmo.ru (Александр Антонов, Project Lead)
 kontsevik@itmo.ru (just in case).
 
 ## Цитирование
-Antonov, A., Gornova, G., Kontsevik, G., Turkov, L., Vorona, V., & Mityagin, S. (2024, July). Transformation of Local Communities from Neighborhoods to Urban Commons in the Production of Social Representations of Space. In International Conference on Computational Science and Its Applications (pp. 436-447). Cham: Springer Nature Switzerland.
+
+```bibtex
+@inproceedings{antonov2024transformation,
+  title={Transformation of Local Communities from Neighborhoods to Urban Commons in the Production of Social Representations of Space},
+  author={Antonov, Aleksandr and Gornova, Galina and Kontsevik, Georgii and Turkov, Leonid and Vorona, Vladimir and Mityagin, Sergey},
+  booktitle={International Conference on Computational Science and Its Applications},
+  pages={436--447},
+  year={2024},
+  organization={Springer}
+}
+```
 ---
diff --git a/docs/source/conf.py b/docs/source/conf.py
index c160718..ffb95b8 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -12,7 +12,8 @@
 project = 'sloyka'
 copyright = '2024, itmo_idu'
 author = 'itmo_idu'
-release = '0.1.0'
+version = '0.2'
+release = '0.2.1'
 
 # -- General configuration ---------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
diff --git a/pyproject.toml b/pyproject.toml
index f4b342f..3500f54 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "sloyka"
-version = "0.1.8"
+version = "0.2.1"
 description = "Library for city identity analysis from social media posts and comments"
 authors = ["sandrro, georgekontsevik"]
 readme = "README.md"
diff --git a/sloyka/src/risks/emotion_classifier.py b/sloyka/src/risks/emotion_classifier.py
index 301e5e8..8215b06 100644
--- a/sloyka/src/risks/emotion_classifier.py
+++ b/sloyka/src/risks/emotion_classifier.py
@@ -34,7 +34,7 @@ class EmotionRecognizer:
     - text_column: The name of the column containing the text to be analyzed.
     """
 
-    def __init__(self, model_name=HuggingFaceModel.Text.Bert_Large, device='cpu'):
+    def __init__(self, model_name=HuggingFaceModel.Text.Bert_Tiny, device='cpu'):
         self.device = device
         self.model_name = model_name
@@ -46,11 +46,11 @@ def __init__(self, model_name=HuggingFaceModel.Text.Bert_Large, device='cpu'):
             HuggingFaceModel.Text.Bert_Tiny2,
         ]
 
-        self.recognizer = None
-
-    def init_base_recognizer(self):
         self.recognizer = TextRecognizer(model=self.model_name, device=self.device)
 
+    #def init_base_recognizer(self):
+    #    self.recognizer = TextRecognizer(model=self.model_name, device=self.device)
+
     def recognize_emotion(self, text):
         """
diff --git a/sloyka/src/risks/regional_activity.py b/sloyka/src/risks/regional_activity.py
index bcd0077..66968a7 100644
--- a/sloyka/src/risks/regional_activity.py
+++ b/sloyka/src/risks/regional_activity.py
@@ -99,7 +99,7 @@ def run_sloyka_modules(self) -> gpd.GeoDataFrame:
             .to_list()
         )
 
-        processed_geodata.dropna(subset=["text"], inplace=True)
+        processed_geodata.dropna(subset=[self.text], inplace=True)
         processed_geodata = City_services().run(
             df=processed_geodata, text_column=self.text
         )
diff --git a/sloyka/src/semantic_graph/keyword_extracter.py b/sloyka/src/semantic_graph/keyword_extracter.py
index e2a8a56..61d467b 100644
--- a/sloyka/src/semantic_graph/keyword_extracter.py
+++ b/sloyka/src/semantic_graph/keyword_extracter.py
@@ -97,7 +97,7 @@ def extract_keywords(
         texts_to_add = []
 
         for j, text in zip(ids_text_to_extract, texts_to_extract):
-            extraction = self.model.extract_keywords(text, top_n=top_n, stop_words=RUS_STOPWORDS)
+            extraction = KeyBERT().extract_keywords(docs=text, top_n=top_n, stop_words=RUS_STOPWORDS)
             if extraction:
                 score = extraction[0][1]
                 if score > semantic_key_filter:
diff --git a/sloyka/src/utils/data_getter/vk_data_getter.py b/sloyka/src/utils/data_getter/vk_data_getter.py
index cc11dbf..c265091 100644
--- a/sloyka/src/utils/data_getter/vk_data_getter.py
+++ b/sloyka/src/utils/data_getter/vk_data_getter.py
@@ -244,11 +244,15 @@ def run_comments(domain, post_ids, access_token):
         for post_id in tqdm(post_ids):
             comments = VKParser().get_comments(owner_id, post_id, access_token)
             all_comments.extend(comments)
-        df = VKParser.comments_to_dataframe(all_comments)
-        df["type"] = "comment"
-        df = df.reset_index(drop=True)
-        print("comments downloaded")
-        return df
+        if len(all_comments) > 0:
+            df = VKParser.comments_to_dataframe(all_comments)
+            df["type"] = "comment"
+            df = df.reset_index(drop=True)
+            print("comments downloaded")
+            return df
+        else:
+            print("no comments")
+            return None
 
     @staticmethod
     def run_parser(domain, access_token, cutoff_date, number_of_messages=float("inf"), step=100):
@@ -267,17 +271,20 @@ def run_parser(domain, access_token, cutoff_date, number_of_messages=float("inf"
         post_ids = df_posts["id"].tolist()
 
         df_comments = VKParser.run_comments(domain=owner_id, post_ids=post_ids, access_token=access_token)
-        df_comments.loc[df_comments["parents_stack"].apply(lambda x: len(x) > 0), "type"] = "reply"
-        for i in range(len(df_comments)):
-            tmp = df_comments["parents_stack"].iloc[i]
-            if tmp is not None:
-                if len(tmp) > 0:
-                    df_comments["parents_stack"].iloc[i] = tmp[0]
-                else:
-                    df_comments["parents_stack"].iloc[i] = None
-
-        df_combined = df_comments.join(df_posts, on="post_id", rsuffix="_post")
-        df_combined = pd.concat([df_posts, df_comments], ignore_index=True)
+        if df_comments is not None:
+            df_comments.loc[df_comments["parents_stack"].apply(lambda x: len(x) > 0), "type"] = "reply"
+            for i in range(len(df_comments)):
+                tmp = df_comments["parents_stack"].iloc[i]
+                if tmp is not None:
+                    if len(tmp) > 0:
+                        df_comments["parents_stack"].iloc[i] = tmp[0]
+                    else:
+                        df_comments["parents_stack"].iloc[i] = None
+
+            df_combined = df_comments.join(df_posts, on="post_id", rsuffix="_post")
+            df_combined = pd.concat([df_posts, df_comments], ignore_index=True)
+        else:
+            df_combined = df_posts
 
         df_group_name = VKParser.get_group_name(domain, access_token)
         df_combined["group_name"] = df_group_name["group_name"][0]
diff --git a/tests/test_area_matcher.py b/tests/test_area_matcher.py
deleted file mode 100644
index 8b602c4..0000000
--- a/tests/test_area_matcher.py
+++ /dev/null
@@ -1,22 +0,0 @@
-# # test_area_matcher.py
-# import pytest
-# import pandas as pd
-# import re
-# from sloyka.src.area_matcher import AreaMatcher
-# from sloyka.src.data_getter import VKParser
-
-# @pytest.fixture
-# def test_df_groups():
-#     domain = 'pkio_klp'
-#     access_token = '...'
-#     df_groups = VKParser().run_parser(domain, access_token, step=100, cutoff_date='2024-03-20', number_of_messages=10)
-#     return df_groups
-
-# def test_df_areas(test_df_groups: pd.DataFrame):
-#     osm_id = 337422
-#     area_matcher = AreaMatcher()
-#     test_df_groups['territory'] = test_df_groups['group_name'].apply(lambda x: area_matcher.match_area(x, osm_id))
-
-#     assert any(test_df_groups['territory'].apply(lambda x: bool(re.search(r'Колпин', x))))
-
-
diff --git a/tests/test_classifiers.py b/tests/test_classifiers.py
index 3a7da9a..439eb94 100644
--- a/tests/test_classifiers.py
+++ b/tests/test_classifiers.py
@@ -1,40 +1,24 @@
-# import pytest
-# import torch
-# import pandas as pd
-# from sloyka import TextClassifiers
+import pytest
+import torch
+import pandas as pd
+from sloyka import TextClassifiers
 
-# path_to_file = "sloyka/sample_data/raw/Адмиралтейский.csv"
+@pytest.fixture
+def sample_dataframe():
+    s_data = {'Текст комментария': {203: 'На Чайковского 63 тоже идет кап.ремонт. В квартире у пенсионеров побили стекла. Куда им обратиться?',
+                                    204: 'Вся улица Жуковского и Восстания заклеена рекламой! Почему не действует полиция и администрация с ЖСК-1 ?'},
+              'message_id': {203: 195, 204: 196}}
+    return pd.DataFrame(s_data)
 
-# @pytest.fixture
-# def test_data():
-#     df_predict = pd.read_csv(path_to_file, sep=";")
-#     df_predict.rename(columns={"Текст комментария": "Текст"}, inplace=True)
-#     df_predict = df_predict.dropna(subset=["Текст"])
-#     df_predict = df_predict.head(3)
-#     return df_predict
+@pytest.fixture
+def model():
+    return TextClassifiers(
+        repository_id="Sandrro/text_to_function_v2",
+        number_of_categories=1,
+        device_type=torch.device("cpu"),
+    )
 
-# @pytest.fixture
-# def model():
-#     return TextClassifiers(
-#         repository_id="Sandrro/text_to_subfunction_v10",
-#         number_of_categories=1,
-#         device_type=torch.device("cpu"),
-#     )
-
-# def test_cats_probs(model, test_data):
-#     expected_df = pd.DataFrame(
-#         {
-#             "cats": [
-#                 "Вопросы граждан о проектах/планах/сроках/ходе проведения работ по благоустройству",
-#                 "Не ЦУР",
-#                 "Вопросы по оплате проезда в общественном транспорте",
-#             ],
-#             "probs": ["1.0", "0.999", "0.98"],
-#         }
-#     )
-
-#     test_data[["cats", "probs"]] = pd.DataFrame(
-#         test_data["Текст"].progress_map(lambda x: model.run_text_classifier_topics(x)).to_list()
-#     )
-#     assert test_data["cats"].equals(expected_df["cats"])
-#     assert test_data["probs"].equals(expected_df["probs"])
\ No newline at end of file
+def test_cats_probs(model, sample_dataframe):
+    sample_dataframe[["cats", "probs"]] = sample_dataframe["Текст комментария"].progress_map(lambda x: model.run_text_classifier(x)).to_list()
+    print(sample_dataframe)
+    assert sample_dataframe.iloc[0]["cats"] == "ЖКХ"
\ No newline at end of file
diff --git a/tests/test_emotion_classifier.py b/tests/test_emotion_classifier.py
new file mode 100644
index 0000000..9722732
--- /dev/null
+++ b/tests/test_emotion_classifier.py
@@ -0,0 +1,20 @@
+import pytest
+import torch
+import pandas as pd
+from sloyka import EmotionRecognizer
+
+@pytest.fixture
+def sample_dataframe():
+    s_data = {'Текст комментария': {203: 'На Чайковского 63 тоже идет кап.ремонт. В квартире у пенсионеров побили стекла. Куда им обратиться?',
+                                    204: 'Вся улица Жуковского и Восстания заклеена рекламой! Почему не действует полиция и администрация с ЖСК-1 ?'},
+              'message_id': {203: 195, 204: 196}}
+    return pd.DataFrame(s_data)
+
+@pytest.fixture
+def model():
+    return EmotionRecognizer()
+
+def test_emotion_recognizer(model, sample_dataframe):
+    sample_dataframe["emotion"] = sample_dataframe["Текст комментария"].progress_map(lambda x: model.recognize_emotion(x))
+    print(sample_dataframe)
+    assert sample_dataframe.iloc[0]["emotion"] == "neutral"
\ No newline at end of file
diff --git a/tests/test_events_modelling.py b/tests/test_events_modelling.py
deleted file mode 100644
index aaf5f59..0000000
--- a/tests/test_events_modelling.py
+++ /dev/null
@@ -1,31 +0,0 @@
-# import pytest
-# import geopandas as gpd
-# from sloyka import EventDetection
-
-# path_to_population = "sloyka/sample_data/raw/population.geojson"
-# path_to_data = "sloyka/sample_data/processed/messages.geojson"
-
-
-# @pytest.fixture
-# def gdf():
-#     gdf = gpd.read_file(path_to_data)
-#     gdf = gdf.head(6)
-#     return gdf
-
-
-# def test_event_detection(gdf):
-#     expected_name = "0_фурштатская_штукатурного слоя_слоя_отслоение"
-#     expected_risk = 0.405
-#     expected_messages = [4, 5, 3, 2]
-#     event_model = EventDetection()
-#     _, events, _ = event_model.run(
-#         gdf, path_to_population, "Санкт-Петербург", 32636, min_event_size=3
-#     )
-#     event_name = events.iloc[0]["name"]
-#     event_risk = events.iloc[0]["risk"].round(3)
-#     event_messages = [
-#         int(mid) for mid in events.iloc[0]["message_ids"].split(", ")
-#     ]
-#     assert event_name == expected_name
-#     assert event_risk == expected_risk
-#     assert all(mid in event_messages for mid in expected_messages)
diff --git a/tests/test_geocoder.py b/tests/test_geocoder.py
index 5f16b6b..17aeed5 100644
--- a/tests/test_geocoder.py
+++ b/tests/test_geocoder.py
@@ -4,20 +4,19 @@
 
 @pytest.fixture
 def sample_dataframe():
-    s_data = {
-        "text": [
-            "На биржевой 15 снова шумят!!"
-        ]
-    }
+    s_data = {'Текст комментария': {203: 'На Чайковского 63 тоже идет кап.ремонт. В квартире у пенсионеров побили стекла. Куда им обратиться?',
+                                    204: 'Вся улица Жуковского и Восстания заклеена рекламой! Почему не действует полиция и администрация с ЖСК-1 ?'},
+              'message_id': {203: 195, 204: 196}}
     return pd.DataFrame(s_data)
 
 
 def test_run_function(sample_dataframe):
-    instance = Geocoder(osm_id=337422, city_tags = { "place": ["state"] })
+    osm_id = 337422  # Saint Petersburg
+    geocoder = Geocoder(df=sample_dataframe, osm_id=osm_id, city_tags={'place': ['state']}, text_column_name='Текст комментария')
 
-    result_df = instance.run(df=sample_dataframe, group_column=None)
+    result = geocoder.run(group_column=None)
 
-    print(result_df[['Street', 'Numbers']])
+    print(result[['Street', 'Numbers']])
 
-    assert result_df.loc[0, "Street"] == "биржевой"
-    assert result_df.loc[0, "Numbers"] == "15"
+    assert result.loc[0, "Street"] == "чайковского"
+    assert result.loc[0, "Numbers"] == "63"
diff --git a/tests/test_geocoder_matcher.py b/tests/test_geocoder_matcher.py
deleted file mode 100644
index 7569c2c..0000000
--- a/tests/test_geocoder_matcher.py
+++ /dev/null
@@ -1,29 +0,0 @@
-# import pytest
-# import pandas as pd
-# import re
-# from sloyka.src.geocoder import Geocoder  # Импортируйте новый класс
-
-# # Фикстура для создания DataFrame
-# @pytest.fixture
-# def test_df_groups():
-#     data = {
-#         'group_name': ['Пушкин'],
-#         'Текст комментария': ['Рубинштейна 25 дворовую территорию уберите, где работники?']
-#     }
-#     df_groups = pd.DataFrame(data)
-#     return df_groups
-
-# # Тест для функции run
-# def test_run(test_df_groups: pd.DataFrame):
-#     osm_id = 338635
-#     tags = {"admin_level": ["8"]}
-#     date = "2024-04-22T00:00:00Z"
-#     osm_city_level: int = 5
-#     osm_city_name: str = "Санкт-Петербург"
-
-#     instance = Geocoder(osm_city_name=osm_city_name, osm_city_level=osm_city_level)
-#     result_df = instance.run(osm_id, tags, date, test_df_groups)
-
-#     assert any(result_df['territory'].apply(lambda x: bool(re.search(r'Пушкин', x))))
-#     assert result_df.loc[0, "Street"] == "рубинштейна"
-#     assert result_df.loc[0, "Numbers"] == "25"
diff --git a/tests/test_semantic_graph.py b/tests/test_semantic_graph.py
index 5920a38..0d68b9d 100644
--- a/tests/test_semantic_graph.py
+++ b/tests/test_semantic_graph.py
@@ -1,44 +1,30 @@
-# import pandas as pd
-
-# from sloyka import Semgraph
-
-
-# sm = Semgraph()
-# test_df = pd.read_feather("sloyka/sample_data/processed/df_strts.feather")[:20]
-# text_column='Текст комментария'
-# toponim_column='only_full_street_name'
-# toponim_name_column='initial_street'
-# toponim_type_column='Toponims'
-
-# def test_extract_keywords():
-#     result = sm.extract_keywords(test_df,
-#                                  text_column,
-#                                  toponim_column,
-#                                  toponim_name_column,
-#                                  toponim_type_column,
-#                                  semantic_key_filter=0.6,
-#                                  top_n=5)
-
-#     assert len(result) == 6
-
-# def test_get_semantic_closeness():
-#     df = pd.DataFrame([['TOPONIM_1', 'роза'], ['TOPONIM_2', 'куст']], columns=['toponims', 'words'])
-#     result = sm.get_semantic_closeness(df,
-#                                        column='words',
-#                                        similaryty_filter=0.5)
-
-#     check = round(float(result['SIMILARITY_SCORE'].iloc[0]), 3)
-
-#     assert check == round(0.655513, 3)
-
-# def test_build_semantic_graph():
-#     result = sm.build_semantic_graph(test_df,
-#                                      text_column,
-#                                      toponim_column,
-#                                      toponim_name_column,
-#                                      toponim_type_column,
-#                                      key_score_filter=0.4,
-#                                      semantic_score_filter=0.6,
-#                                      top_n=5)
+import sys
+import os
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../sloyka/sample_data")))
+import geopandas as gpd
+import pytest
+from sloyka import Semgraph
+
+@pytest.fixture
+def sample_data():
+    sample_data_path = os.path.join(os.path.dirname(__file__), "../sloyka/sample_data/sample_data_geocoded_emotioned.parquet")
"../sloyka/sample_data/sample_data_geocoded_emotioned.parquet") + gdf = gpd.read_parquet(sample_data_path) + gdf['type'] = 'post' + return gdf + + +def test_build_semantic_graph(sample_data): + sm = Semgraph() + G = sm.build_graph(sample_data, + id_column='message_id', + text_column='Текст комментария', + text_type_column="type", + toponym_column='full_street_name', + toponym_name_column='only_full_street_name', + toponym_type_column='Toponyms', + post_id_column="message_id", + parents_stack_column="message_id", + location_column='Location', + geometry_column='geometry') -# assert len(result.edges) == 216 + assert len(G.edges) == 88 \ No newline at end of file diff --git a/tests/test_services_extractor.py b/tests/test_services_extractor.py new file mode 100644 index 0000000..db243d8 --- /dev/null +++ b/tests/test_services_extractor.py @@ -0,0 +1,19 @@ +import pytest +import pandas as pd +from sloyka import City_services + +@pytest.fixture +def sample_dataframe(): + s_data = {'Текст комментария': {203: 'Когда уже на Юго западе будет метро? Весь день в пути проводим!', + 204: 'Вся улица Жуковского и Восстания заклеена рекламой! Почему не действует полиция и администрация с ЖСК-1 ?'}, + 'message_id': {203: 195, 204: 196}} + return pd.DataFrame(s_data) + +@pytest.fixture +def model(): + return City_services() + +def test_services(model, sample_dataframe): + result = model.run(sample_dataframe, "Текст комментария") + print(result) + assert result.iloc[0]["City_services"][0] == "Метро" \ No newline at end of file