Merge pull request #81 from GeorgeKontsevik/dev
Dev
GeorgeKontsevik authored Jun 27, 2024
2 parents 8cc74fc + 0861fd2 commit 6e41e0d
Showing 11 changed files with 2,271 additions and 2,120 deletions.
734 changes: 19 additions & 715 deletions examples/geocoder_example.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "sloyka"
version = "0.1.7"
version = "0.1.8"
description = "Library for city identity analysis from social media posts and comments"
authors = ["sandrro, georgekontsevik"]
readme = "README.md"
14 changes: 7 additions & 7 deletions sloyka/__init__.py
@@ -25,10 +25,10 @@
"EmotionRecognizer",
]

logger.remove()
logger.add(
sys.stdout,
format="<green>{time:MM-DD HH:mm}</green> | <level>{level: <8}</level> | <cyan>{message}</cyan>",
level="INFO",
colorize=True,
)
# logger.remove()
# logger.add(
# sys.stdout,
# format="<green>{time:MM-DD HH:mm}</green> | <level>{level: <8}</level> | <cyan>{message}</cyan>",
# level="INFO",
# colorize=True,
# )
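
Note: with the package-level sink commented out, importing sloyka no longer configures loguru. A minimal sketch of how an application could restore the previous console output on its own (an assumption about downstream usage, not part of this commit):

import sys

from loguru import logger

import sloyka  # importing the package no longer installs a log sink

# Recreate the sink that sloyka/__init__.py used to set up
logger.remove()
logger.add(
    sys.stdout,
    format="<green>{time:MM-DD HH:mm}</green> | <level>{level: <8}</level> | <cyan>{message}</cyan>",
    level="INFO",
    colorize=True,
)

logger.info("logging configured by the application, not by the library")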
@@ -4,18 +4,13 @@
import osmnx as ox
from shapely.geometry import Point, Polygon, MultiPolygon
from loguru import logger
from natasha import MorphVocab
import pymorphy2
from sloyka.src.utils.constants import NUM_CITY_OBJ
from sloyka.src.geocoder.objects_address_extractor_by_rules import AddressExtractorExtra
from sloyka.src.utils.data_getter.geo_data_getter import GeoDataGetter
from rapidfuzz import fuzz
import numpy as np

import warnings

warnings.simplefilter("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning)


class OtherGeoObjects:
@staticmethod
@@ -43,18 +38,23 @@ def run_osm_dfs(osm_id: int) -> pd.DataFrame:
{"historic": ["monument", "memorial"]},
{"place": ["square"]},
]

osm_dfs = list()
for tags in tags_list:
logger.debug(f'getting {osm_id, tags}')
try:
tmp_df = OtherGeoObjects.get_and_process_osm_data(osm_id, tags)
osm_dfs.append(tmp_df)
except RuntimeError:
logger.warning(f'Runtime error during fetching {osm_id, tags}')
continue
if osm_dfs:
osm_combined_df = pd.concat(osm_dfs, axis=0)
logger.debug(f'got {osm_id, tags}')
logger.debug(f'{osm_combined_df.shape}')
return osm_combined_df
else:
logger.warning(f'No data were gathered about city objects in {osm_id}')
return pd.DataFrame()

@staticmethod
@@ -76,7 +76,7 @@ def extract_geo_obj(text) -> List[str]:
"""
if text is None:
return None
morph = MorphVocab()
morph = pymorphy2.MorphAnalyzer()
extractor = AddressExtractorExtra(morph)

other_geo_obj = []
@@ -96,10 +96,10 @@ def extract_geo_obj(text) -> List[str]:
other_geo_obj.append(part.value)
elif part.type:
other_geo_obj.append(part.type)
if not other_geo_obj:
return other_geo_obj
if not other_geo_obj:
return other_geo_obj
except Exception as e:
# logger.exception(f"Error extracting geo objects: {e}")
# logger.warning(f"Error extracting geo objects: {e}")
return other_geo_obj
return other_geo_obj

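For context, a minimal sketch of how the rule-based extractor is constructed after the switch from natasha's MorphVocab to pymorphy2 (assumes pymorphy2 is installed; the sample text is hypothetical):

import pymorphy2

from sloyka.src.geocoder.objects_address_extractor_by_rules import AddressExtractorExtra

# pymorphy2.MorphAnalyzer now supplies the morphology backend instead of natasha's MorphVocab
morph = pymorphy2.MorphAnalyzer()
extractor = AddressExtractorExtra(morph)

# find() returns a single Match spanning all extracted address parts, or None if nothing matched
match = extractor.find("сквер Галины Старовойтовой рядом с домом")
if match is not None:
    print(match.fact)
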
25 changes: 9 additions & 16 deletions sloyka/src/geocoder/geocoder.py
@@ -52,18 +52,13 @@
from loguru import logger

from pandarallel import pandarallel
from sloyka.src.geocoder.city_objects_getter import OtherGeoObjects
from sloyka.src.geocoder.city_objects_extractor import OtherGeoObjects
from sloyka.src.utils.data_getter.street_getter import Streets
from sloyka.src.utils.data_getter.location_getter import Location
from sloyka.src.utils.data_getter.geo_data_getter import GeoDataGetter
from sloyka.src.geocoder.street_extractor import StreetExtractor
from sloyka.src.geocoder.word_form_matcher import WordFormFinder

import warnings

warnings.simplefilter("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning)

pandarallel.initialize(progress_bar=True, nb_workers=-1)

# segmenter = Segmenter()
@@ -74,11 +69,6 @@
# morph_tagger = NewsMorphTagger(emb)
# syntax_parser = NewsSyntaxParser(emb)
# ner_tagger = NewsNERTagger(emb)
warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)




stemmer = SnowballStemmer("russian")

@@ -333,6 +323,11 @@ def run(
"""

initial_df = df.copy()

if search_for_objects:
df_obj = OtherGeoObjects.run(self.osm_id, df, text_column)


if tags:
df_areas = self.get_df_areas(self.osm_id, tags)
df_areas = self.preprocess_area_names(df_areas)
@@ -358,11 +353,9 @@ def run(
del street_names
gdf = self.create_gdf(df)

if search_for_objects:
df_obj = OtherGeoObjects.run(self.osm_id, df, text_column)
gdf = pd.concat([gdf, df_obj], ignore_index=True)
del df_obj
gdf["geo_obj_tag"] = gdf["geo_obj_tag"].apply(Geocoder.assign_street)
gdf = pd.concat([gdf, df_obj], ignore_index=True)
del df_obj
gdf["geo_obj_tag"] = gdf["geo_obj_tag"].apply(Geocoder.assign_street)

gdf = pd.merge(gdf, initial_df, on=text_column, how='right')

37 changes: 5 additions & 32 deletions sloyka/src/geocoder/objects_address_extractor_by_rules.py
@@ -1,14 +1,13 @@
# addr_extractor.py
import warnings

warnings.simplefilter("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore")

from natasha.extractors import Match
from natasha.extractors import Extractor

from ..utils.data_processing.rule_for_natasha import ADDR_PART

from loguru import logger


class AddrExtractorError(Exception):
"""Custom exception for address extractor errors"""
@@ -17,44 +16,18 @@ class AddrExtractorError(Exception):


class AddressExtractorExtra(Extractor):
"""
Extractor for addresses
"""

# logger = logging.getLogger(__name__)

def __init__(self, morph):
"""
Initialize the address extractor
:param morph: Morphological analyzer
"""
super().__init__(ADDR_PART, morph)
Extractor.__init__(self, ADDR_PART, morph)

def find(self, text):
"""
Extract addresses from the given text
:param text: Input text
:return: Match object containing the extracted address
"""
# self.logger.info(f"Extracting addresses from text: {text}")
matches = self(text)
if not matches:
# self.logger.debug("No matches found")
return

matches = sorted(matches, key=lambda _: _.start)
if not matches:
# self.logger.debug("No matches found after sorting")
return

start = matches[0].start
stop = matches[-1].stop
parts = [_.fact for _ in matches]
# self.logger.debug(f"Extracted address parts: {parts}")
try:
return Match(start, stop, obj.Addr(parts))
except Exception as e:
# self.logger.error(f"Error creating Match object: {e}")
raise AddrExtractorError(f"Error creating Match object: {e}")
return Match(start, stop, obj.Addr(parts))
16 changes: 8 additions & 8 deletions sloyka/src/geocoder/street_extractor.py
@@ -45,7 +45,7 @@ def process_pipeline(df: pd.DataFrame, text_column: str, classifier) -> pd.DataF
texts = StreetExtractor._preprocess_text_column(local_df, text_column)
extracted_streets = StreetExtractor._extract_streets(texts, classifier)
refined_streets = StreetExtractor._refine_street_data(extracted_streets)
building_numbers = StreetExtractor._extract_building_numbers(texts, refined_streets)
building_numbers = StreetExtractor._get_number(texts, refined_streets)
toponyms = StreetExtractor._extract_toponyms(texts, refined_streets)

# Combine results into a DataFrame
@@ -152,7 +152,7 @@ def _refine_street_name(street: str) -> str:
return ""

@staticmethod
def _extract_building_numbers(texts: List[str], streets: List[Optional[str]]) -> List[Optional[str]]:
def _get_number(texts: List[str], streets: List[Optional[str]]) -> List[Optional[str]]:
"""
Extract building numbers from the text data.
@@ -167,7 +167,7 @@ def _extract_building_numbers(texts: List[str], streets: List[Optional[str]]) ->
for text, street in zip(texts, streets):
if street:
try:
building_numbers.append(StreetExtractor._extract_building_number(text, street))
building_numbers.append(StreetExtractor._extract_building_number_from_text(text, street))
except Exception as e:
logger.warning(f"Error extracting building number from text '{text}' with street '{street}': {e}")
building_numbers.append(None)
@@ -176,7 +176,7 @@ def _extract_building_numbers(texts: List[str], streets: List[Optional[str]]) ->
return building_numbers

@staticmethod
def _extract_building_number(text: str, street: str) -> str:
def _extract_building_number_from_text(text: str, street: str) -> str:
"""
Extract building number from the text.
@@ -189,9 +189,9 @@
"""
try:
numbers = " ".join(re.findall(r"\d+", text))
return StreetExtractor.extract_building_num(text, street, numbers)
return StreetExtractor._check_if_extracted_number_legit(text, street, numbers)
except Exception as e:
logger.warning(f"Error in _extract_building_number with text '{text}' and street '{street}': {e}")
logger.warning(f"Error in _extract_building_number_from_text with text '{text}' and street '{street}': {e}")
return ""

@staticmethod
@@ -312,7 +312,7 @@ def _search_toponyms(words: List[str], position: int) -> Optional[str]:
return None

@staticmethod
def extract_building_num(text: str, street_name: str, number: Optional[str]) -> str:
def _check_if_extracted_number_legit(text: str, street_name: str, number: Optional[str]) -> str:
"""
Extract building numbers near the specified street name in the text.
@@ -358,7 +358,7 @@ def _find_street_name_positions(words: List[str], street_name: str) -> List[int]
Returns:
List[int]: List of positions where the street name occurs.
"""
return [index for index, word in enumerate(words) if word == street_name]
return [index for index, word in enumerate(words) if word.lower() == street_name]

@staticmethod
def _search_building_number(words: List[str], position: int) -> str:
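One behavioural detail of the renamed helpers: the updated _find_street_name_positions lower-cases each word from the text before comparing, so the street_name argument is implicitly expected to arrive already lower-cased. A small standalone sketch of the new comparison (sample words are hypothetical):

from typing import List

def find_street_name_positions(words: List[str], street_name: str) -> List[int]:
    # Mirrors the updated comparison: words are lower-cased, street_name is used as-is
    return [index for index, word in enumerate(words) if word.lower() == street_name]

words = ["Дом", "5", "на", "Невском"]
print(find_street_name_positions(words, "невском"))  # [3]
print(find_street_name_positions(words, "Невском"))  # [] -- a capitalised query no longer matches
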
3 changes: 0 additions & 3 deletions sloyka/src/geocoder/text_address_extractor_by_rules.py
@@ -12,9 +12,6 @@
Doc,
)

import pandas as pd
import pymorphy2

from sloyka.src.utils.constants import (
EXCEPTIONS_CITY_COUNTRY)

20 changes: 10 additions & 10 deletions sloyka/src/geocoder/word_form_matcher.py
@@ -45,22 +45,22 @@ def _process_row(self, row: pd.Series, strts_df: pd.DataFrame) -> dict:
"""
try:
search_val = row.get("Street")
search_toponym = row.get("Toponims")
search_toponym = row.get("Toponyms")
val_num = row.get("Numbers", "")

if not search_val or pd.isna(search_val):
logger.warning(f"Error processing row with street '{row.get('Street')}' and toponym '{row.get('Toponims')}")
logger.warning(f"Error processing row with street '{row.get('Street')}' and toponym '{row.get('Toponyms')}")
return {"full_street_name": None, "only_full_street_name": None}

for col in strts_df.columns[2:]:
matching_rows = self._find_matching_rows(strts_df, col, search_val, search_toponym)

if not matching_rows.empty:
full_streets = [self._format_full_address(street, val_num) for street in matching_rows["street"].values]
return {
"full_street_name": ",".join(full_streets),
"only_full_street_name": ",".join(matching_rows["street"].values)
}
if not matching_rows.empty:
full_streets = [self._format_full_address(street, val_num) for street in matching_rows["street"].values]
return {
"full_street_name": ",".join(full_streets),
"only_full_street_name": ",".join(matching_rows["street"].values)
}

# If no exact match found, check without toponym
if search_val in strts_df[col].values:
@@ -71,11 +71,11 @@ def _process_row(self, row: pd.Series, strts_df: pd.DataFrame) -> dict:
"only_full_street_name": ",".join(only_streets_full)
}
else:
logger.warning(f"Error processing row with street '{row.get('Street')}' and toponym '{row.get('Toponims')}'")
logger.warning(f"Error processing row with street '{row.get('Street')}' and toponym '{row.get('Toponyms')}'")
return {"full_street_name": None, "only_full_street_name": None}

except Exception as e:
logger.warning(f"Error processing row with street '{row.get('Street')}' and toponym '{row.get('Toponims')}': {e}")
logger.warning(f"Error processing row with street '{row.get('Street')}' and toponym '{row.get('Toponyms')}': {e}")

return {"full_street_name": None, "only_full_street_name": None}

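The column rename means rows handed to _process_row are now expected to carry a "Toponyms" key rather than the old misspelled "Toponims". A minimal sketch of such a row (values are hypothetical):

import pandas as pd

# Hypothetical intermediate row after this commit: the toponym column is spelled "Toponyms"
row = pd.Series({"Street": "невский", "Toponyms": "проспект", "Numbers": "5"})

search_val = row.get("Street")
search_toponym = row.get("Toponyms")  # would be None if the old "Toponims" spelling were still present
val_num = row.get("Numbers", "")
print(search_val, search_toponym, val_num)
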
9 changes: 5 additions & 4 deletions sloyka/src/utils/data_getter/geo_data_getter.py
@@ -1,4 +1,5 @@

import warnings
warnings.filterwarnings("ignore")

import osmnx as ox
import geopandas as gpd
@@ -112,7 +113,7 @@ def _process_tags(tags, place):
gdf = GeoDataGetter._get_features_from_place(place_name, category, tag)
gdf_list.append(gdf)
except AttributeError:
# GeoDataGetter._handle_error(category, tag)
logger.warning(f'Error processing {tags, place}')
pass
return gdf_list

@@ -142,10 +143,10 @@ def get_drive_graph(city_bounds: gpd.GeoDataFrame) -> nx.MultiDiGraph:
# Streets.logger.info("Retrieving drive graph")
try:
G_drive = ox.graph_from_polygon(city_bounds.dissolve()["geometry"].squeeze(), network_type="drive")
# Streets.logger.debug(f"Drive graph retrieved: {G_drive}")
logger.debug(f"Drive graph retrieved: {G_drive}")
return G_drive
except Exception as e:
# Streets.logger.error(f"Error retrieving drive graph: {e}")
logger.error(f"Error retrieving drive graph: {e}")
raise e

# def _handle_error(self, category, tag):
