Skip to content

Commit

Permalink
Merge pull request #93 from Sandrro/dev
Browse files (browse the repository at this point in the history)
Dev
  • Loading branch information
Sandrro authored Sep 19, 2024
2 parents 03ac0d5 + 3e7a434 commit d1c478b
Show file tree
Hide file tree
Showing 16 changed files with 145 additions and 201 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/2dev_ci_on_pr.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@ name: Tests
on:
push:
branches:
- dev
- "*"
pull_request:
branches:
- dev
- master

jobs:
test:
Expand Down
16 changes: 13 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
# SLOYKA
[![Project Status: Active – The project has reached a stable, usable state and is being actively developed.](https://www.repostatus.org/badges/latest/active.svg)](https://www.repostatus.org/#active)
[![Documentation Status](https://readthedocs.org/projects/soika/badge/?version=latest)](https://soika.readthedocs.io/en/latest/?badge=latest)
[![Documentation Status](https://readthedocs.org/projects/sloyka/badge/?version=latest)](https://sloyka.readthedocs.io/ru/latest/?badge=latest)
[![PythonVersion](https://img.shields.io/badge/python-3.11-blue)](https://pypi.org/project/sloyka/)
[![Black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
[![Tests](https://github.com/GeorgeKontsevik/sloyka/.github/workflows/2dev_ci_on_pr.yaml/badge.svg?branch=dev)](https://github.com/GeorgeKontsevik/sloyka/.github/workflows/2dev_ci_on_pr.yaml)
[![Tests](https://github.com/GeorgeKontsevik/sloyka/actions/workflows/2dev_ci_on_pr.yaml/badge.svg?branch=dev)](https://github.com/GeorgeKontsevik/sloyka/actions/workflows/2dev_ci_on_pr.yaml)

[![sloyka_community_chat](https://img.shields.io/badge/-community-blue?logo=telegram)](https://t.me/sloyka_community)
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1wCUJAqlq9GMKw1wpTsWrzYwr10pWDeHv?usp=sharing)
Expand Down Expand Up @@ -38,5 +38,15 @@ [email protected] (Александр Антонов, Project Lead)
[email protected] (just in case).

## Цитирование
Antonov, A., Gornova, G., Kontsevik, G., Turkov, L., Vorona, V., & Mityagin, S. (2024, July). Transformation of Local Communities from Neighborhoods to Urban Commons in the Production of Social Representations of Space. In International Conference on Computational Science and Its Applications (pp. 436-447). Cham: Springer Nature Switzerland.

``` bibtex
@inproceedings{antonov2024transformation,
title={Transformation of Local Communities from Neighborhoods to Urban Commons in the Production of Social Representations of Space},
author={Antonov, Aleksandr and Gornova, Galina and Kontsevik, Georgii and Turkov, Leonid and Vorona, Vladimir and Mityagin, Sergey},
booktitle={International Conference on Computational Science and Its Applications},
pages={436--447},
year={2024},
organization={Springer}
}
```
---
3 changes: 2 additions & 1 deletion docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
project = 'sloyka'
copyright = '2024, itmo_idu'
author = 'itmo_idu'
release = '0.1.0'
version = '0.2'
release = '0.2.1'

# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "sloyka"
version = "0.1.8"
version = "0.2.1"
description = "Library for city identity analysis from social media posts and comments"
authors = ["sandrro, georgekontsevik"]
readme = "README.md"
Expand Down
8 changes: 4 additions & 4 deletions sloyka/src/risks/emotion_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ class EmotionRecognizer:
- text_column: The name of the column containing the text to be analyzed.
"""

def __init__(self, model_name=HuggingFaceModel.Text.Bert_Large, device='cpu'):
def __init__(self, model_name=HuggingFaceModel.Text.Bert_Tiny, device='cpu'):
self.device = device
self.model_name = model_name

Expand All @@ -46,11 +46,11 @@ def __init__(self, model_name=HuggingFaceModel.Text.Bert_Large, device='cpu'):
HuggingFaceModel.Text.Bert_Tiny2,
]

self.recognizer = None

def init_base_recognizer(self):
self.recognizer = TextRecognizer(model=self.model_name, device=self.device)

#def init_base_recognizer(self):
# self.recognizer = TextRecognizer(model=self.model_name, device=self.device)


def recognize_emotion(self, text):
"""
Expand Down
2 changes: 1 addition & 1 deletion sloyka/src/risks/regional_activity.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ def run_sloyka_modules(self) -> gpd.GeoDataFrame:
.to_list()
)

processed_geodata.dropna(subset=["text"], inplace=True)
processed_geodata.dropna(subset=[self.text], inplace=True)
processed_geodata = City_services().run(
df=processed_geodata, text_column=self.text
)
Expand Down
2 changes: 1 addition & 1 deletion sloyka/src/semantic_graph/keyword_extracter.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ def extract_keywords(
texts_to_add = []

for j, text in zip(ids_text_to_extract, texts_to_extract):
extraction = self.model.extract_keywords(text, top_n=top_n, stop_words=RUS_STOPWORDS)
extraction = KeyBERT().extract_keywords(docs=text, top_n=top_n, stop_words=RUS_STOPWORDS)
if extraction:
score = extraction[0][1]
if score > semantic_key_filter:
Expand Down
39 changes: 23 additions & 16 deletions sloyka/src/utils/data_getter/vk_data_getter.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,11 +244,15 @@ def run_comments(domain, post_ids, access_token):
for post_id in tqdm(post_ids):
comments = VKParser().get_comments(owner_id, post_id, access_token)
all_comments.extend(comments)
df = VKParser.comments_to_dataframe(all_comments)
df["type"] = "comment"
df = df.reset_index(drop=True)
print("comments downloaded")
return df
if len(all_comments) > 0:
df = VKParser.comments_to_dataframe(all_comments)
df["type"] = "comment"
df = df.reset_index(drop=True)
print("comments downloaded")
return df
else:
print("no comments")
return None

@staticmethod
def run_parser(domain, access_token, cutoff_date, number_of_messages=float("inf"), step=100):
Expand All @@ -267,17 +271,20 @@ def run_parser(domain, access_token, cutoff_date, number_of_messages=float("inf"
post_ids = df_posts["id"].tolist()

df_comments = VKParser.run_comments(domain=owner_id, post_ids=post_ids, access_token=access_token)
df_comments.loc[df_comments["parents_stack"].apply(lambda x: len(x) > 0), "type"] = "reply"
for i in range(len(df_comments)):
tmp = df_comments["parents_stack"].iloc[i]
if tmp is not None:
if len(tmp) > 0:
df_comments["parents_stack"].iloc[i] = tmp[0]
else:
df_comments["parents_stack"].iloc[i] = None

df_combined = df_comments.join(df_posts, on="post_id", rsuffix="_post")
df_combined = pd.concat([df_posts, df_comments], ignore_index=True)
if df_comments is not None:
df_comments.loc[df_comments["parents_stack"].apply(lambda x: len(x) > 0), "type"] = "reply"
for i in range(len(df_comments)):
tmp = df_comments["parents_stack"].iloc[i]
if tmp is not None:
if len(tmp) > 0:
df_comments["parents_stack"].iloc[i] = tmp[0]
else:
df_comments["parents_stack"].iloc[i] = None

df_combined = df_comments.join(df_posts, on="post_id", rsuffix="_post")
df_combined = pd.concat([df_posts, df_comments], ignore_index=True)
else:
df_combined = df_posts
df_group_name = VKParser.get_group_name(domain, access_token)
df_combined["group_name"] = df_group_name["group_name"][0]

Expand Down
22 changes: 0 additions & 22 deletions tests/test_area_matcher.py

This file was deleted.

58 changes: 21 additions & 37 deletions tests/test_classifiers.py
Original file line number Diff line number Diff line change
@@ -1,40 +1,24 @@
# import pytest
# import torch
# import pandas as pd
# from sloyka import TextClassifiers
import pytest
import torch
import pandas as pd
from sloyka import TextClassifiers

# path_to_file = "sloyka/sample_data/raw/Адмиралтейский.csv"
@pytest.fixture
def sample_dataframe():
s_data = {'Текст комментария': {203: 'На Чайковского 63 тоже идет кап.ремонт. В квартире у пенсионеров побили стекла. Куда им обратиться?',
204: 'Вся улица Жуковского и Восстания заклеена рекламой! Почему не действует полиция и администрация с ЖСК-1 ?'},
'message_id': {203: 195, 204: 196}}
return pd.DataFrame(s_data)

# @pytest.fixture
# def test_data():
# df_predict = pd.read_csv(path_to_file, sep=";")
# df_predict.rename(columns={"Текст комментария": "Текст"}, inplace=True)
# df_predict = df_predict.dropna(subset=["Текст"])
# df_predict = df_predict.head(3)
# return df_predict
@pytest.fixture
def model():
return TextClassifiers(
repository_id="Sandrro/text_to_function_v2",
number_of_categories=1,
device_type=torch.device("cpu"),
)

# @pytest.fixture
# def model():
# return TextClassifiers(
# repository_id="Sandrro/text_to_subfunction_v10",
# number_of_categories=1,
# device_type=torch.device("cpu"),
# )

# def test_cats_probs(model, test_data):
# expected_df = pd.DataFrame(
# {
# "cats": [
# "Вопросы граждан о проектах/планах/сроках/ходе проведения работ по благоустройству",
# "Не ЦУР",
# "Вопросы по оплате проезда в общественном транспорте",
# ],
# "probs": ["1.0", "0.999", "0.98"],
# }
# )

# test_data[["cats", "probs"]] = pd.DataFrame(
# test_data["Текст"].progress_map(lambda x: model.run_text_classifier_topics(x)).to_list()
# )
# assert test_data["cats"].equals(expected_df["cats"])
# assert test_data["probs"].equals(expected_df["probs"])
def test_cats_probs(model, sample_dataframe):
sample_dataframe[["cats", "probs"]] = sample_dataframe["Текст комментария"].progress_map(lambda x: model.run_text_classifier(x)).to_list()
print(sample_dataframe)
assert sample_dataframe.iloc[0]["cats"] == "ЖКХ"
20 changes: 20 additions & 0 deletions tests/test_emotion_classifier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import pytest
import torch
import pandas as pd
from sloyka import EmotionRecognizer

@pytest.fixture
def sample_dataframe():
s_data = {'Текст комментария': {203: 'На Чайковского 63 тоже идет кап.ремонт. В квартире у пенсионеров побили стекла. Куда им обратиться?',
204: 'Вся улица Жуковского и Восстания заклеена рекламой! Почему не действует полиция и администрация с ЖСК-1 ?'},
'message_id': {203: 195, 204: 196}}
return pd.DataFrame(s_data)

@pytest.fixture
def model():
return EmotionRecognizer()

def test_emotion_recognizer(model, sample_dataframe):
sample_dataframe["emotion"] = sample_dataframe["Текст комментария"].progress_map(lambda x: model.recognize_emotion(x))
print(sample_dataframe)
assert sample_dataframe.iloc[0]["emotion"] == "neutral"
31 changes: 0 additions & 31 deletions tests/test_events_modelling.py

This file was deleted.

19 changes: 9 additions & 10 deletions tests/test_geocoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,19 @@

@pytest.fixture
def sample_dataframe():
s_data = {
"text": [
"На биржевой 15 снова шумят!!"
]
}
s_data = {'Текст комментария': {203: 'На Чайковского 63 тоже идет кап.ремонт. В квартире у пенсионеров побили стекла. Куда им обратиться?',
204: 'Вся улица Жуковского и Восстания заклеена рекламой! Почему не действует полиция и администрация с ЖСК-1 ?'},
'message_id': {203: 195, 204: 196}}
return pd.DataFrame(s_data)


def test_run_function(sample_dataframe):
instance = Geocoder(osm_id=337422, city_tags = { "place": ["state"] })
osm_id = 337422 # Saint Petersburg
geocoder = Geocoder(df=sample_dataframe, osm_id=osm_id, city_tags={'place':['state']}, text_column_name='Текст комментария')

result_df = instance.run(df=sample_dataframe, group_column=None)
result = geocoder.run(group_column=None)

print(result_df[['Street', 'Numbers']])
print(result[['Street', 'Numbers']])

assert result_df.loc[0, "Street"] == "биржевой"
assert result_df.loc[0, "Numbers"] == "15"
assert result.loc[0, "Street"] == "чайковского"
assert result.loc[0, "Numbers"] == "63"
29 changes: 0 additions & 29 deletions tests/test_geocoder_matcher.py

This file was deleted.

Loading

0 comments on commit d1c478b

Please sign in to comment.