Skip to content

Commit

Permalink
Merge pull request #103 from Sandrro/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
Sandrro authored Nov 14, 2024
2 parents 68183ff + e934c37 commit 98498f7
Show file tree
Hide file tree
Showing 3 changed files with 134 additions and 31 deletions.
14 changes: 3 additions & 11 deletions sloyka/src/geocoder/geocoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
from sloyka.src.utils.data_getter.street_getter import Streets
from sloyka.src.utils.data_getter.location_getter import Location
from sloyka.src.utils.data_getter.geo_data_getter import GeoDataGetter
from sloyka.src.utils.data_processing.area_matcher import AreaMatcher
from sloyka.src.utils.data_preprocessing.preprocessor import PreprocessorInput
from sloyka.src.geocoder.street_extractor import StreetExtractor
from sloyka.src.geocoder.word_form_matcher import WordFormFinder
Expand Down Expand Up @@ -315,17 +316,8 @@ def run(
df_obj = OtherGeoObjects.run(self.osm_id, df, text_column)


if tags:
df_areas = self.get_df_areas(self.osm_id, tags)
df_areas = self.preprocess_area_names(df_areas)

if group_column and group_column in df.columns:
for i, group_name in enumerate(df[group_column]):
processed_group_name = self.preprocess_group_name(group_name)
best_match, admin_level = self.match_group_to_area(processed_group_name, df_areas)
df.at[i, "territory"] = best_match
df.at[i, "key"] = admin_level
del df_areas
if group_column:
df = AreaMatcher.run(self.osm_id)

df[text_column] = df[text_column].astype(str).str.replace('\n', ' ')
df[text_column] = df[text_column].apply(str)
Expand Down
2 changes: 1 addition & 1 deletion sloyka/src/utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -1233,4 +1233,4 @@

REGEX_PATTERN = r"[\"!?\u2665\u2022()|,.-:]"

REPLACEMENT_STRING = ""
REPLACEMENT_STRING = " "
149 changes: 130 additions & 19 deletions sloyka/src/utils/data_processing/area_matcher.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
import re
import requests
import pandas as pd
import geopandas as gpd
from rapidfuzz import fuzz, process
from nltk.stem.snowball import SnowballStemmer
from shapely.geometry import Polygon, Point, MultiPolygon
from sloyka.src.utils.data_getter.historical_geo_data_getter import HistGeoDataGetter
from sloyka.src.utils.constants import AREA_STOPWORDS
from sloyka.src.utils.constants import GROUP_STOPWORDS



from sloyka.src.utils.constants import (
AREA_STOPWORDS,
GROUP_STOPWORDS,
REGEX_PATTERN,
REPLACEMENT_STRING,
)

class AreaMatcher:

Expand All @@ -23,29 +27,139 @@ def get_df_areas(self, osm_id, tags, date):
self.area_cache[osm_id] = df_areas
return self.area_cache[osm_id]

def get_osm_areas(self, osm_id, osm_type="relation"):
    """
    Load and process area and settlement geometries from OSM for a given OSM ID.

    Queries the Overpass API for all `place` nodes/ways/relations and
    administrative boundaries inside the area mapped from ``osm_id``,
    builds shapely geometries for them, buffers degenerate geometries
    (points / too-short rings) by 100 m in a local UTM CRS, and keeps one
    row per name (lowest ``admin_level`` wins).

    Args:
        osm_id (int): OSM object identifier.
        osm_type (str): OSM object type (default "relation").

    Returns:
        GeoDataFrame: unique named areas with ``name``, ``place``,
        integer ``admin_level`` (missing levels default to 12) and
        ``geometry`` in EPSG:4326.

    Raises:
        requests.HTTPError: if the Overpass API request fails.
    """
    query = f"""
    [out:json];
    {osm_type}({osm_id});
    map_to_area -> .a;
    (
        node(area.a)[place];
        way(area.a)[place];
        relation(area.a)[place];
        relation(area.a)[boundary=administrative];
    );
    out geom;
    """
    url = "http://overpass-api.de/api/interpreter"
    response = requests.get(url, params={'data': query})
    # Fail with a clear HTTP error instead of an opaque JSON decode error.
    response.raise_for_status()
    data = response.json()

    places = []
    for elem in data['elements']:
        if 'tags' not in elem:
            continue
        name = elem['tags'].get('name')
        place = elem['tags'].get('place')
        admin_level = elem['tags'].get('admin_level')

        # Reset per element: previously `geometry` leaked from the prior
        # iteration when an element matched no branch (or raised NameError
        # on the very first element), attaching the wrong geometry to a name.
        geometry = None
        if elem['type'] == 'node':
            geometry = Point(elem['lon'], elem['lat'])
        elif elem['type'] == 'way' and 'geometry' in elem:
            coords = [(point['lon'], point['lat']) for point in elem['geometry']]
            # A valid ring needs at least 4 points; otherwise fall back to a point.
            geometry = Polygon(coords) if len(coords) >= 4 else Point(coords[0])
        elif elem['type'] == 'relation' and 'members' in elem:
            polygons = []
            for member in elem['members']:
                if member['type'] == 'way' and 'geometry' in member:
                    coords = [(point['lon'], point['lat']) for point in member['geometry']]
                    if len(coords) >= 4:
                        polygons.append(Polygon(coords))
                    elif len(coords) == 1:
                        polygons.append(Point(coords[0]).buffer(100))
            if polygons:
                geometry = MultiPolygon(polygons) if len(polygons) > 1 else polygons[0]

        if name and geometry:
            places.append({
                'name': name,
                'place': place,
                'admin_level': admin_level,
                'geometry': geometry,
            })

    gdf = gpd.GeoDataFrame(places, geometry='geometry', crs="EPSG:4326")

    # Pick a metric UTM CRS around the data's centroid so the 100 m buffers
    # below are expressed in metres, not degrees.
    centroid = gdf.geometry.centroid.to_crs("EPSG:4326").unary_union.centroid
    utm_zone = int((centroid.x + 180) // 6) + 1
    utm_crs = f"EPSG:{32600 + utm_zone if centroid.y >= 0 else 32700 + utm_zone}"
    gdf = gdf.to_crs(utm_crs)

    # Buffer points and degenerate polygons so fuzzy spatial checks have area.
    gdf['geometry'] = gdf.apply(
        lambda row: row.geometry.buffer(100)
        if row.geometry.geom_type == 'Point'
        or (row.geometry.geom_type == 'Polygon' and len(row.geometry.exterior.coords) < 4)
        else row.geometry,
        axis=1,
    )
    gdf = gdf.to_crs("EPSG:4326")

    # Unknown admin levels sort last (12); keep the highest-ranked
    # (lowest admin_level) entry per name.
    gdf['admin_level'] = gdf['admin_level'].fillna(12).astype(int)
    gdf = gdf.sort_values('admin_level').drop_duplicates(subset=['name'], keep='first')

    return gdf

def preprocess_group_name(self, group_name):
    """
    Preprocess a group name for fuzzy matching.

    Lowercases the name, replaces characters matched by REGEX_PATTERN with
    REPLACEMENT_STRING, then strips GROUP_STOPWORDS case-insensitively.

    Args:
        group_name (str): The raw group name.

    Returns:
        str: The preprocessed group name.
    """
    group_name = group_name.lower()
    # Single substitution via the shared constants; the hard-coded pattern
    # that duplicated REGEX_PATTERN (left over from the old version) is gone.
    group_name = re.sub(REGEX_PATTERN, REPLACEMENT_STRING, group_name)
    for word in GROUP_STOPWORDS:
        # NOTE(review): stopwords are used as regex patterns here; if any
        # entry contains regex metacharacters, wrap it in re.escape().
        group_name = re.sub(word, "", group_name, flags=re.IGNORECASE)
    return group_name

def preprocess_area_names(self, df_areas):
    """
    Preprocess area names: strip stopwords, lowercase, remove special
    characters, and stem.

    Parameters:
        df_areas (DataFrame): DataFrame containing a 'name' column.

    Returns:
        DataFrame: the same DataFrame with added columns:
            'area_name'           — name with all AREA_STOPWORDS removed,
            'area_name_processed' — lowercase name with REGEX_PATTERN
                                    characters replaced by REPLACEMENT_STRING,
            'area_stems'          — list of stems of the processed name.
    """
    # Accumulate removals on 'area_name'. The previous version re-read
    # 'name' on every iteration, so only the LAST stopword was removed.
    df_areas["area_name"] = df_areas["name"]
    for word in AREA_STOPWORDS:
        df_areas["area_name"] = df_areas["area_name"].str.replace(word, "", regex=True)

    df_areas["area_name_processed"] = df_areas["area_name"].str.lower()
    df_areas["area_name_processed"] = df_areas["area_name_processed"].str.replace(
        REGEX_PATTERN, REPLACEMENT_STRING, regex=True
    )
    df_areas["area_stems"] = df_areas["area_name_processed"].apply(
        lambda x: [self.stemmer.stem(word) for word in x.split()]
    )
    return df_areas

def match_group_to_area(self, group_name, df_areas):
"""
Matches a given group name to an area in a DataFrame of areas.
Args:
group_name (str): The name of the group to match.
df_areas (DataFrame): The DataFrame containing the areas to match against.
Returns:
tuple: A tuple containing the best match for the group name and the admin level of the match.
If no match is found, returns (None, None).
"""
group_name_stems = [self.stemmer.stem(word) for word in group_name.split()]
max_partial_ratio = 20
max_token_sort_ratio = 20
Expand All @@ -62,18 +176,15 @@ def match_group_to_area(self, group_name, df_areas):
max_partial_ratio = partial_ratio
max_token_sort_ratio = token_sort_ratio
best_match = row["area_name"]
admin_level = row["key"]
admin_level = row["admin_level"]

return best_match, admin_level

def run(self, df, osm_id):
    """
    Match every group in *df* to an OSM area inside the given territory.

    Args:
        df (DataFrame): must contain a 'group_name' column.
        osm_id (int): OSM identifier of the parent territory.

    Returns:
        DataFrame: the input with added 'processed_group_name',
        'best_match' and 'admin_level' columns.
    """
    # The diff left both the old implementation (tags/date signature plus an
    # index-based matching loop) and the new one interleaved; this is the
    # new implementation only.
    df["processed_group_name"] = df["group_name"].map(self.preprocess_group_name)
    df_areas = self.get_osm_areas(osm_id)
    df_areas = self.preprocess_area_names(df_areas)

    # match_group_to_area returns (best_match, admin_level); expand into
    # two columns in one pass.
    df[["best_match", "admin_level"]] = df.apply(
        lambda row: pd.Series(self.match_group_to_area(row["processed_group_name"], df_areas)),
        axis=1,
    )
    return df

0 comments on commit 98498f7

Please sign in to comment.