diff --git a/examples/geocoder_example.ipynb b/examples/geocoder_example.ipynb index 576585b..439147c 100644 --- a/examples/geocoder_example.ipynb +++ b/examples/geocoder_example.ipynb @@ -16,41 +16,10 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "INFO: Pandarallel will run on -1 workers.\n", - "INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[nltk_data] Downloading package stopwords to /Users/test/nltk_data...\n", - "[nltk_data] Package stopwords is already up-to-date!\n", - "[nltk_data] Downloading package stopwords to /Users/test/nltk_data...\n", - "[nltk_data] Package stopwords is already up-to-date!\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-06-27 15:34:03,808 SequenceTagger predicts: Dictionary with 7 tags: O, S-Service, B-Service, E-Service, I-Service, , \n" - ] - } - ], + "outputs": [], "source": [ - "import warnings\n", - "\n", - "warnings.simplefilter(\"ignore\")\n", - "warnings.filterwarnings(\"ignore\", category=DeprecationWarning)\n", - "\n", "from sloyka.src.geocoder.geocoder import Geocoder" ] }, @@ -63,7 +32,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -71,7 +40,8 @@ "\n", "s_data = {\n", " \"text\": [\n", - " \"На биржевой 15 снова шумят!!\"\n", + " # 'Рубинштейна 25 дворовую территорию уберите, гд...',\n", + " 'На Биржевой 14 у Школы отремантируйте! Сад Василеостровец во...'\n", " ]\n", " }\n", "df = pd.DataFrame(s_data)" @@ -79,112 +49,27 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# df = pd.read_csv('/Users/test/Documents/code/sloyka/sloyka/sample_data/vnukovomos.csv', sep=';')" + "df = pd.read_csv('/Users/test/Documents/code/sloyka/sloyka/sample_data/sample_data.csv', sep=',', index_col=0)" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
text
0У Рубинштейна 25 дом дворовую территорию убери...
1На биржевой 15 снова шумят!!
\n", - "
" - ], - "text/plain": [ - " text\n", - "0 У Рубинштейна 25 дом дворовую территорию убери...\n", - "1 На биржевой 15 снова шумят!!" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-06-27 15:34:04,935 SequenceTagger predicts: Dictionary with 5 tags: O, S-ADDRESS, B-ADDRESS, E-ADDRESS, I-ADDRESS\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Processing category place: 0%| | 0/1 [00:00 0 to a scalar is deprecated, and will error in future. Ensure you extract a single element from your array before performing this operation. (Deprecated NumPy 1.25.)\n", - " return self._transformer._transform_point(\n", - "/Users/test/Library/Caches/pypoetry/virtualenvs/sloyka-hdx_c6ud-py3.11/lib/python3.11/site-packages/pyproj/transformer.py:820: DeprecationWarning: Conversion of an array with ndim > 0 to a scalar is deprecated, and will error in future. Ensure you extract a single element from your array before performing this operation. (Deprecated NumPy 1.25.)\n", - " return self._transformer._transform_point(\n", - "Processing category place: 100%|██████████| 1/1 [00:00<00:00, 3.65it/s]\n", - "/Users/test/Library/Caches/pypoetry/virtualenvs/sloyka-hdx_c6ud-py3.11/lib/python3.11/site-packages/pandas/core/dtypes/cast.py:1641: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " return np.find_common_type(types, [])\n", - "/Users/test/Library/Caches/pypoetry/virtualenvs/sloyka-hdx_c6ud-py3.11/lib/python3.11/site-packages/pandas/core/dtypes/cast.py:1641: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " return np.find_common_type(types, [])\n", - "/Users/test/Library/Caches/pypoetry/virtualenvs/sloyka-hdx_c6ud-py3.11/lib/python3.11/site-packages/pandas/core/dtypes/cast.py:1641: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " return np.find_common_type(types, [])\n", - "/Users/test/Library/Caches/pypoetry/virtualenvs/sloyka-hdx_c6ud-py3.11/lib/python3.11/site-packages/pandas/core/dtypes/cast.py:1641: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " return np.find_common_type(types, [])\n", - "/Users/test/Library/Caches/pypoetry/virtualenvs/sloyka-hdx_c6ud-py3.11/lib/python3.11/site-packages/pandas/core/algorithms.py:522: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " common = np.find_common_type([values.dtype, comps_array.dtype], [])\n", - "/Users/test/Library/Caches/pypoetry/virtualenvs/sloyka-hdx_c6ud-py3.11/lib/python3.11/site-packages/pandas/core/algorithms.py:522: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " common = np.find_common_type([values.dtype, comps_array.dtype], [])\n" - ] - } - ], + "outputs": [], "source": [ "osm_id = 337422\n", "geocoder = Geocoder(osm_id=osm_id, city_tags={'place':['state']})\n" @@ -199,609 +84,28 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 2/2 [00:00<00:00, 162.18it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[32m06-27 15:34\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mError in extract_toponym with text 'На биржевой 15 снова шумят!!' and street_name 'биржевой': list index out of range\u001b[0m\n", - "\u001b[32m06-27 15:34\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mget_stem started\u001b[0m\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[32m06-27 15:34\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcreate_gdf started\u001b[0m\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 4/4 [00:02<00:00, 1.98it/s]\n", - "/Users/test/Library/Caches/pypoetry/virtualenvs/sloyka-hdx_c6ud-py3.11/lib/python3.11/site-packages/pandas/core/dtypes/cast.py:1641: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " return np.find_common_type(types, [])\n", - "/Users/test/Library/Caches/pypoetry/virtualenvs/sloyka-hdx_c6ud-py3.11/lib/python3.11/site-packages/pandas/core/dtypes/cast.py:1641: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " return np.find_common_type(types, [])\n", - "/Users/test/Library/Caches/pypoetry/virtualenvs/sloyka-hdx_c6ud-py3.11/lib/python3.11/site-packages/pandas/core/algorithms.py:522: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " common = np.find_common_type([values.dtype, comps_array.dtype], [])\n", - "/Users/test/Library/Caches/pypoetry/virtualenvs/sloyka-hdx_c6ud-py3.11/lib/python3.11/site-packages/pandas/core/dtypes/cast.py:1641: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " return np.find_common_type(types, [])\n", - "/Users/test/Library/Caches/pypoetry/virtualenvs/sloyka-hdx_c6ud-py3.11/lib/python3.11/site-packages/pandas/core/dtypes/cast.py:1641: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " return np.find_common_type(types, [])\n", - "/Users/test/Library/Caches/pypoetry/virtualenvs/sloyka-hdx_c6ud-py3.11/lib/python3.11/site-packages/pandas/core/algorithms.py:522: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " common = np.find_common_type([values.dtype, comps_array.dtype], [])\n", - "/Users/test/Library/Caches/pypoetry/virtualenvs/sloyka-hdx_c6ud-py3.11/lib/python3.11/site-packages/pandas/core/dtypes/cast.py:1641: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " return np.find_common_type(types, [])\n", - "/Users/test/Library/Caches/pypoetry/virtualenvs/sloyka-hdx_c6ud-py3.11/lib/python3.11/site-packages/pandas/core/dtypes/cast.py:1641: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " return np.find_common_type(types, [])\n", - "/Users/test/Library/Caches/pypoetry/virtualenvs/sloyka-hdx_c6ud-py3.11/lib/python3.11/site-packages/pandas/core/algorithms.py:522: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " common = np.find_common_type([values.dtype, comps_array.dtype], [])\n", - "/Users/test/Library/Caches/pypoetry/virtualenvs/sloyka-hdx_c6ud-py3.11/lib/python3.11/site-packages/pandas/core/dtypes/cast.py:1641: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " return np.find_common_type(types, [])\n", - "/Users/test/Library/Caches/pypoetry/virtualenvs/sloyka-hdx_c6ud-py3.11/lib/python3.11/site-packages/pandas/core/dtypes/cast.py:1641: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " return np.find_common_type(types, [])\n", - "/Users/test/Library/Caches/pypoetry/virtualenvs/sloyka-hdx_c6ud-py3.11/lib/python3.11/site-packages/pandas/core/algorithms.py:522: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " common = np.find_common_type([values.dtype, comps_array.dtype], [])\n", - "/Users/test/Library/Caches/pypoetry/virtualenvs/sloyka-hdx_c6ud-py3.11/lib/python3.11/site-packages/pandas/core/dtypes/cast.py:1641: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " return np.find_common_type(types, [])\n", - "/Users/test/Library/Caches/pypoetry/virtualenvs/sloyka-hdx_c6ud-py3.11/lib/python3.11/site-packages/pandas/core/dtypes/cast.py:1641: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " return np.find_common_type(types, [])\n", - "/Users/test/Library/Caches/pypoetry/virtualenvs/sloyka-hdx_c6ud-py3.11/lib/python3.11/site-packages/pandas/core/algorithms.py:522: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " common = np.find_common_type([values.dtype, comps_array.dtype], [])\n", - "/Users/test/Library/Caches/pypoetry/virtualenvs/sloyka-hdx_c6ud-py3.11/lib/python3.11/site-packages/pandas/core/dtypes/cast.py:1641: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " return np.find_common_type(types, [])\n", - "/Users/test/Library/Caches/pypoetry/virtualenvs/sloyka-hdx_c6ud-py3.11/lib/python3.11/site-packages/pandas/core/dtypes/cast.py:1641: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " return np.find_common_type(types, [])\n", - "/Users/test/Library/Caches/pypoetry/virtualenvs/sloyka-hdx_c6ud-py3.11/lib/python3.11/site-packages/pandas/core/algorithms.py:522: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " common = np.find_common_type([values.dtype, comps_array.dtype], [])\n", - "/Users/test/Library/Caches/pypoetry/virtualenvs/sloyka-hdx_c6ud-py3.11/lib/python3.11/site-packages/pandas/core/dtypes/cast.py:1641: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " return np.find_common_type(types, [])\n", - "/Users/test/Library/Caches/pypoetry/virtualenvs/sloyka-hdx_c6ud-py3.11/lib/python3.11/site-packages/pandas/core/dtypes/cast.py:1641: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " return np.find_common_type(types, [])\n", - "/Users/test/Library/Caches/pypoetry/virtualenvs/sloyka-hdx_c6ud-py3.11/lib/python3.11/site-packages/pandas/core/algorithms.py:522: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " common = np.find_common_type([values.dtype, comps_array.dtype], [])\n", - "/Users/test/Library/Caches/pypoetry/virtualenvs/sloyka-hdx_c6ud-py3.11/lib/python3.11/site-packages/pandas/core/dtypes/cast.py:1641: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " return np.find_common_type(types, [])\n", - "/Users/test/Library/Caches/pypoetry/virtualenvs/sloyka-hdx_c6ud-py3.11/lib/python3.11/site-packages/pandas/core/dtypes/cast.py:1641: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " return np.find_common_type(types, [])\n", - "/Users/test/Library/Caches/pypoetry/virtualenvs/sloyka-hdx_c6ud-py3.11/lib/python3.11/site-packages/pandas/core/algorithms.py:522: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " common = np.find_common_type([values.dtype, comps_array.dtype], [])\n", - "/Users/test/Library/Caches/pypoetry/virtualenvs/sloyka-hdx_c6ud-py3.11/lib/python3.11/site-packages/geopandas/array.py:1406: UserWarning: CRS not set for some of the concatenation inputs. Setting output's CRS as WGS 84 (the single non-null crs provided).\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
textStreetNumbersToponymsfull_street_namelocation_optionsaddr_to_geocodeonly_full_street_nameLocationgeometryother_geo_objgeo_obj_tag
0У Рубинштейна 25 дом дворовую территорию убери...рубинштейнаNoneулица Рубинштейна Санкт-Петербург Россия[улица Рубинштейна Санкт-Петербург Россия]улица Рубинштейна Санкт-Петербург Россияулица Рубинштейнаулица Рубинштейна, Владимирский округ, Санкт-П...POINT (30.34356 59.92829)NaNstreet
1На биржевой 15 снова шумят!!биржевой15NoneБиржевой переулок 15 Санкт-Петербург Россия,Би...[Биржевой переулок 15 Санкт-Петербург Россия, ...Биржевой переулок 15 Санкт-Петербург РоссияБиржевой переулокБиржевой переулок, округ № 7, Санкт-Петербург,...POINT (30.29380 59.94515)NaNstreet
2На биржевой 15 снова шумят!!биржевой15NoneБиржевой переулок 15 Санкт-Петербург Россия,Би...[Биржевой переулок 15 Санкт-Петербург Россия, ...Биржевой мост 15 Санкт-Петербург РоссияБиржевой мостБиржевой мост, Введенский округ, Санкт-Петербу...POINT (30.30346 59.94736)NaNstreet
3На биржевой 15 снова шумят!!биржевой15NoneБиржевой переулок 15 Санкт-Петербург Россия,Би...[Биржевой переулок 15 Санкт-Петербург Россия, ...Биржевой проезд 15 Санкт-Петербург РоссияБиржевой проездБиржевой проезд, округ № 7, Санкт-Петербург, С...POINT (30.30556 59.94319)NaNstreet
\n", - "
" - ], - "text/plain": [ - " text Street Numbers \\\n", - "0 У Рубинштейна 25 дом дворовую территорию убери... рубинштейна \n", - "1 На биржевой 15 снова шумят!! биржевой 15 \n", - "2 На биржевой 15 снова шумят!! биржевой 15 \n", - "3 На биржевой 15 снова шумят!! биржевой 15 \n", - "\n", - " Toponyms full_street_name \\\n", - "0 None улица Рубинштейна Санкт-Петербург Россия \n", - "1 None Биржевой переулок 15 Санкт-Петербург Россия,Би... \n", - "2 None Биржевой переулок 15 Санкт-Петербург Россия,Би... \n", - "3 None Биржевой переулок 15 Санкт-Петербург Россия,Би... \n", - "\n", - " location_options \\\n", - "0 [улица Рубинштейна Санкт-Петербург Россия] \n", - "1 [Биржевой переулок 15 Санкт-Петербург Россия, ... \n", - "2 [Биржевой переулок 15 Санкт-Петербург Россия, ... \n", - "3 [Биржевой переулок 15 Санкт-Петербург Россия, ... \n", - "\n", - " addr_to_geocode only_full_street_name \\\n", - "0 улица Рубинштейна Санкт-Петербург Россия улица Рубинштейна \n", - "1 Биржевой переулок 15 Санкт-Петербург Россия Биржевой переулок \n", - "2 Биржевой мост 15 Санкт-Петербург Россия Биржевой мост \n", - "3 Биржевой проезд 15 Санкт-Петербург Россия Биржевой проезд \n", - "\n", - " Location \\\n", - "0 улица Рубинштейна, Владимирский округ, Санкт-П... \n", - "1 Биржевой переулок, округ № 7, Санкт-Петербург,... \n", - "2 Биржевой мост, Введенский округ, Санкт-Петербу... \n", - "3 Биржевой проезд, округ № 7, Санкт-Петербург, С... \n", - "\n", - " geometry other_geo_obj geo_obj_tag \n", - "0 POINT (30.34356 59.92829) NaN street \n", - "1 POINT (30.29380 59.94515) NaN street \n", - "2 POINT (30.30346 59.94736) NaN street \n", - "3 POINT (30.30556 59.94319) NaN street " - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ - "result = geocoder.run(df, search_for_objects=True)\n", + "result = geocoder.run(df, search_for_objects=True, group_column=None, text_column='Текст комментария')\n", "display(result)" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
textStreetNumbersToponymsfull_street_namelocation_optionsaddr_to_geocodeonly_full_street_nameLocationgeometryother_geo_objgeo_obj_tag
0У Рубинштейна 25 дом дворовую территорию убери...рубинштейнаNoneулица Рубинштейна Санкт-Петербург Россия[улица Рубинштейна Санкт-Петербург Россия]улица Рубинштейна Санкт-Петербург Россияулица Рубинштейнаулица Рубинштейна, Владимирский округ, Санкт-П...POINT (30.34356 59.92829)NaNstreet
1На биржевой 15 снова шумят!!биржевой15NoneБиржевой переулок 15 Санкт-Петербург Россия,Би...[Биржевой переулок 15 Санкт-Петербург Россия, ...Биржевой переулок 15 Санкт-Петербург РоссияБиржевой переулокБиржевой переулок, округ № 7, Санкт-Петербург,...POINT (30.29380 59.94515)NaNstreet
2На биржевой 15 снова шумят!!биржевой15NoneБиржевой переулок 15 Санкт-Петербург Россия,Би...[Биржевой переулок 15 Санкт-Петербург Россия, ...Биржевой мост 15 Санкт-Петербург РоссияБиржевой мостБиржевой мост, Введенский округ, Санкт-Петербу...POINT (30.30346 59.94736)NaNstreet
3На биржевой 15 снова шумят!!биржевой15NoneБиржевой переулок 15 Санкт-Петербург Россия,Би...[Биржевой переулок 15 Санкт-Петербург Россия, ...Биржевой проезд 15 Санкт-Петербург РоссияБиржевой проездБиржевой проезд, округ № 7, Санкт-Петербург, С...POINT (30.30556 59.94319)NaNstreet
\n", - "
" - ], - "text/plain": [ - " text Street Numbers \\\n", - "0 У Рубинштейна 25 дом дворовую территорию убери... рубинштейна \n", - "1 На биржевой 15 снова шумят!! биржевой 15 \n", - "2 На биржевой 15 снова шумят!! биржевой 15 \n", - "3 На биржевой 15 снова шумят!! биржевой 15 \n", - "\n", - " Toponyms full_street_name \\\n", - "0 None улица Рубинштейна Санкт-Петербург Россия \n", - "1 None Биржевой переулок 15 Санкт-Петербург Россия,Би... \n", - "2 None Биржевой переулок 15 Санкт-Петербург Россия,Би... \n", - "3 None Биржевой переулок 15 Санкт-Петербург Россия,Би... \n", - "\n", - " location_options \\\n", - "0 [улица Рубинштейна Санкт-Петербург Россия] \n", - "1 [Биржевой переулок 15 Санкт-Петербург Россия, ... \n", - "2 [Биржевой переулок 15 Санкт-Петербург Россия, ... \n", - "3 [Биржевой переулок 15 Санкт-Петербург Россия, ... \n", - "\n", - " addr_to_geocode only_full_street_name \\\n", - "0 улица Рубинштейна Санкт-Петербург Россия улица Рубинштейна \n", - "1 Биржевой переулок 15 Санкт-Петербург Россия Биржевой переулок \n", - "2 Биржевой мост 15 Санкт-Петербург Россия Биржевой мост \n", - "3 Биржевой проезд 15 Санкт-Петербург Россия Биржевой проезд \n", - "\n", - " Location \\\n", - "0 улица Рубинштейна, Владимирский округ, Санкт-П... \n", - "1 Биржевой переулок, округ № 7, Санкт-Петербург,... \n", - "2 Биржевой мост, Введенский округ, Санкт-Петербу... \n", - "3 Биржевой проезд, округ № 7, Санкт-Петербург, С... \n", - "\n", - " geometry other_geo_obj geo_obj_tag \n", - "0 POINT (30.34356 59.92829) NaN street \n", - "1 POINT (30.29380 59.94515) NaN street \n", - "2 POINT (30.30346 59.94736) NaN street \n", - "3 POINT (30.30556 59.94319) NaN street " - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "result" + "print(result.dropna(subset='other_geo_obj').drop(columns=['Location']).shape)" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
Make this Notebook Trusted to load map: File -> Trust Notebook
" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "result.drop(columns=['Location']).iloc[:,:].explore()" ] diff --git a/pyproject.toml b/pyproject.toml index 4593ef2..6cafd30 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "sloyka" -version = "0.1.7" +version = "0.1.8" description = "Library for city identity analysis from social media posts and comments" authors = ["sandrro, georgekontsevik"] readme = "README.md" diff --git a/sloyka/__init__.py b/sloyka/__init__.py index 910d40f..f59946f 100644 --- a/sloyka/__init__.py +++ b/sloyka/__init__.py @@ -25,10 +25,10 @@ "EmotionRecognizer", ] -logger.remove() -logger.add( - sys.stdout, - format="{time:MM-DD HH:mm} | {level: <8} | {message}", - level="INFO", - colorize=True, -) +# logger.remove() +# logger.add( +# sys.stdout, +# format="{time:MM-DD HH:mm} | {level: <8} | {message}", +# level="INFO", +# colorize=True, +# ) diff --git a/sloyka/src/geocoder/city_objects_getter.py b/sloyka/src/geocoder/city_objects_extractor.py similarity index 94% rename from sloyka/src/geocoder/city_objects_getter.py rename to sloyka/src/geocoder/city_objects_extractor.py index 85f4db3..192a532 100644 --- a/sloyka/src/geocoder/city_objects_getter.py +++ b/sloyka/src/geocoder/city_objects_extractor.py @@ -4,18 +4,13 @@ import osmnx as ox from shapely.geometry import Point, Polygon, MultiPolygon from loguru import logger -from natasha import MorphVocab +import pymorphy2 from sloyka.src.utils.constants import NUM_CITY_OBJ from sloyka.src.geocoder.objects_address_extractor_by_rules import AddressExtractorExtra from sloyka.src.utils.data_getter.geo_data_getter import GeoDataGetter from rapidfuzz import fuzz import numpy as np -import warnings - -warnings.simplefilter("ignore") -warnings.filterwarnings("ignore", category=DeprecationWarning) - class OtherGeoObjects: @staticmethod @@ -43,18 +38,23 @@ def run_osm_dfs(osm_id: int) -> pd.DataFrame: {"historic": ["monument", "memorial"]}, {"place": ["square"]}, ] - + osm_dfs = list() for tags in tags_list: + logger.debug(f'getting {osm_id, tags}') try: tmp_df = OtherGeoObjects.get_and_process_osm_data(osm_id, tags) osm_dfs.append(tmp_df) except RuntimeError: + logger.warning(f'Runtime error during fetching {osm_id, tags}') continue if osm_dfs: osm_combined_df = pd.concat(osm_dfs, axis=0) + logger.debug(f'got {osm_id, tags}') + logger.debug(f'{osm_combined_df.shape}') return osm_combined_df else: + logger.warning(f'No data were gathered about city objects in {osm_id}') return pd.DataFrame() @staticmethod @@ -76,7 +76,7 @@ def extract_geo_obj(text) -> List[str]: """ if text is None: return None - morph = MorphVocab() + morph = pymorphy2.MorphAnalyzer() extractor = AddressExtractorExtra(morph) other_geo_obj = [] @@ -96,10 +96,10 @@ def extract_geo_obj(text) -> List[str]: other_geo_obj.append(part.value) elif part.type: other_geo_obj.append(part.type) - if not other_geo_obj: - return other_geo_obj + if not other_geo_obj: + return other_geo_obj except Exception as e: - # logger.exception(f"Error extracting geo objects: {e}") + # logger.warning(f"Error extracting geo objects: {e}") return other_geo_obj return other_geo_obj diff --git a/sloyka/src/geocoder/geocoder.py b/sloyka/src/geocoder/geocoder.py index a68a1e1..73899a9 100644 --- a/sloyka/src/geocoder/geocoder.py +++ b/sloyka/src/geocoder/geocoder.py @@ -52,18 +52,13 @@ from loguru import logger from pandarallel import pandarallel -from sloyka.src.geocoder.city_objects_getter import OtherGeoObjects +from sloyka.src.geocoder.city_objects_extractor import OtherGeoObjects from sloyka.src.utils.data_getter.street_getter import Streets from sloyka.src.utils.data_getter.location_getter import Location from sloyka.src.utils.data_getter.geo_data_getter import GeoDataGetter from sloyka.src.geocoder.street_extractor import StreetExtractor from sloyka.src.geocoder.word_form_matcher import WordFormFinder -import warnings - -warnings.simplefilter("ignore") -warnings.filterwarnings("ignore", category=DeprecationWarning) - pandarallel.initialize(progress_bar=True, nb_workers=-1) # segmenter = Segmenter() @@ -74,11 +69,6 @@ # morph_tagger = NewsMorphTagger(emb) # syntax_parser = NewsSyntaxParser(emb) # ner_tagger = NewsNERTagger(emb) -warnings.simplefilter(action="ignore", category=FutureWarning) -warnings.filterwarnings("ignore", category=DeprecationWarning) - - - stemmer = SnowballStemmer("russian") @@ -333,6 +323,11 @@ def run( """ initial_df = df.copy() + + if search_for_objects: + df_obj = OtherGeoObjects.run(self.osm_id, df, text_column) + + if tags: df_areas = self.get_df_areas(self.osm_id, tags) df_areas = self.preprocess_area_names(df_areas) @@ -358,11 +353,9 @@ def run( del street_names gdf = self.create_gdf(df) - if search_for_objects: - df_obj = OtherGeoObjects.run(self.osm_id, df, text_column) - gdf = pd.concat([gdf, df_obj], ignore_index=True) - del df_obj - gdf["geo_obj_tag"] = gdf["geo_obj_tag"].apply(Geocoder.assign_street) + gdf = pd.concat([gdf, df_obj], ignore_index=True) + del df_obj + gdf["geo_obj_tag"] = gdf["geo_obj_tag"].apply(Geocoder.assign_street) gdf = pd.merge(gdf, initial_df, on=text_column, how='right') diff --git a/sloyka/src/geocoder/objects_address_extractor_by_rules.py b/sloyka/src/geocoder/objects_address_extractor_by_rules.py index 4b191d8..3b5e997 100644 --- a/sloyka/src/geocoder/objects_address_extractor_by_rules.py +++ b/sloyka/src/geocoder/objects_address_extractor_by_rules.py @@ -1,14 +1,13 @@ -# addr_extractor.py import warnings - -warnings.simplefilter("ignore") -warnings.filterwarnings("ignore", category=DeprecationWarning) +warnings.filterwarnings("ignore") from natasha.extractors import Match from natasha.extractors import Extractor from ..utils.data_processing.rule_for_natasha import ADDR_PART +from loguru import logger + class AddrExtractorError(Exception): """Custom exception for address extractor errors""" @@ -17,44 +16,18 @@ class AddrExtractorError(Exception): class AddressExtractorExtra(Extractor): - """ - Extractor for addresses - """ - - # logger = logging.getLogger(__name__) - def __init__(self, morph): - """ - Initialize the address extractor - - :param morph: Morphological analyzer - """ - super().__init__(ADDR_PART, morph) + Extractor.__init__(self, ADDR_PART, morph) def find(self, text): - """ - Extract addresses from the given text - - :param text: Input text - :return: Match object containing the extracted address - """ - # self.logger.info(f"Extracting addresses from text: {text}") matches = self(text) if not matches: - # self.logger.debug("No matches found") return matches = sorted(matches, key=lambda _: _.start) if not matches: - # self.logger.debug("No matches found after sorting") return - start = matches[0].start stop = matches[-1].stop parts = [_.fact for _ in matches] - # self.logger.debug(f"Extracted address parts: {parts}") - try: - return Match(start, stop, obj.Addr(parts)) - except Exception as e: - # self.logger.error(f"Error creating Match object: {e}") - raise AddrExtractorError(f"Error creating Match object: {e}") + return Match(start, stop, obj.Addr(parts)) \ No newline at end of file diff --git a/sloyka/src/geocoder/street_extractor.py b/sloyka/src/geocoder/street_extractor.py index 4b92178..c7d2774 100644 --- a/sloyka/src/geocoder/street_extractor.py +++ b/sloyka/src/geocoder/street_extractor.py @@ -45,7 +45,7 @@ def process_pipeline(df: pd.DataFrame, text_column: str, classifier) -> pd.DataF texts = StreetExtractor._preprocess_text_column(local_df, text_column) extracted_streets = StreetExtractor._extract_streets(texts, classifier) refined_streets = StreetExtractor._refine_street_data(extracted_streets) - building_numbers = StreetExtractor._extract_building_numbers(texts, refined_streets) + building_numbers = StreetExtractor._get_number(texts, refined_streets) toponyms = StreetExtractor._extract_toponyms(texts, refined_streets) # Combine results into a DataFrame @@ -152,7 +152,7 @@ def _refine_street_name(street: str) -> str: return "" @staticmethod - def _extract_building_numbers(texts: List[str], streets: List[Optional[str]]) -> List[Optional[str]]: + def _get_number(texts: List[str], streets: List[Optional[str]]) -> List[Optional[str]]: """ Extract building numbers from the text data. @@ -167,7 +167,7 @@ def _extract_building_numbers(texts: List[str], streets: List[Optional[str]]) -> for text, street in zip(texts, streets): if street: try: - building_numbers.append(StreetExtractor._extract_building_number(text, street)) + building_numbers.append(StreetExtractor._extract_building_number_from_text(text, street)) except Exception as e: logger.warning(f"Error extracting building number from text '{text}' with street '{street}': {e}") building_numbers.append(None) @@ -176,7 +176,7 @@ def _extract_building_numbers(texts: List[str], streets: List[Optional[str]]) -> return building_numbers @staticmethod - def _extract_building_number(text: str, street: str) -> str: + def _extract_building_number_from_text(text: str, street: str) -> str: """ Extract building number from the text. @@ -189,9 +189,9 @@ def _extract_building_number(text: str, street: str) -> str: """ try: numbers = " ".join(re.findall(r"\d+", text)) - return StreetExtractor.extract_building_num(text, street, numbers) + return StreetExtractor._check_if_extracted_number_legit(text, street, numbers) except Exception as e: - logger.warning(f"Error in _extract_building_number with text '{text}' and street '{street}': {e}") + logger.warning(f"Error in _extract_building_number_from_text with text '{text}' and street '{street}': {e}") return "" @staticmethod @@ -312,7 +312,7 @@ def _search_toponyms(words: List[str], position: int) -> Optional[str]: return None @staticmethod - def extract_building_num(text: str, street_name: str, number: Optional[str]) -> str: + def _check_if_extracted_number_legit(text: str, street_name: str, number: Optional[str]) -> str: """ Extract building numbers near the specified street name in the text. @@ -358,7 +358,7 @@ def _find_street_name_positions(words: List[str], street_name: str) -> List[int] Returns: List[int]: List of positions where the street name occurs. """ - return [index for index, word in enumerate(words) if word == street_name] + return [index for index, word in enumerate(words) if word.lower() == street_name] @staticmethod def _search_building_number(words: List[str], position: int) -> str: diff --git a/sloyka/src/geocoder/text_address_extractor_by_rules.py b/sloyka/src/geocoder/text_address_extractor_by_rules.py index 421acf0..bd80686 100644 --- a/sloyka/src/geocoder/text_address_extractor_by_rules.py +++ b/sloyka/src/geocoder/text_address_extractor_by_rules.py @@ -12,9 +12,6 @@ Doc, ) -import pandas as pd -import pymorphy2 - from sloyka.src.utils.constants import ( EXCEPTIONS_CITY_COUNTRY) diff --git a/sloyka/src/geocoder/word_form_matcher.py b/sloyka/src/geocoder/word_form_matcher.py index d6a71b9..a865b6f 100644 --- a/sloyka/src/geocoder/word_form_matcher.py +++ b/sloyka/src/geocoder/word_form_matcher.py @@ -45,22 +45,22 @@ def _process_row(self, row: pd.Series, strts_df: pd.DataFrame) -> dict: """ try: search_val = row.get("Street") - search_toponym = row.get("Toponims") + search_toponym = row.get("Toponyms") val_num = row.get("Numbers", "") if not search_val or pd.isna(search_val): - logger.warning(f"Error processing row with street '{row.get('Street')}' and toponym '{row.get('Toponims')}") + logger.warning(f"Error processing row with street '{row.get('Street')}' and toponym '{row.get('Toponyms')}") return {"full_street_name": None, "only_full_street_name": None} for col in strts_df.columns[2:]: matching_rows = self._find_matching_rows(strts_df, col, search_val, search_toponym) - if not matching_rows.empty: - full_streets = [self._format_full_address(street, val_num) for street in matching_rows["street"].values] - return { - "full_street_name": ",".join(full_streets), - "only_full_street_name": ",".join(matching_rows["street"].values) - } + if not matching_rows.empty: + full_streets = [self._format_full_address(street, val_num) for street in matching_rows["street"].values] + return { + "full_street_name": ",".join(full_streets), + "only_full_street_name": ",".join(matching_rows["street"].values) + } # If no exact match found, check without toponym if search_val in strts_df[col].values: @@ -71,11 +71,11 @@ def _process_row(self, row: pd.Series, strts_df: pd.DataFrame) -> dict: "only_full_street_name": ",".join(only_streets_full) } else: - logger.warning(f"Error processing row with street '{row.get('Street')}' and toponym '{row.get('Toponims')}'") + logger.warning(f"Error processing row with street '{row.get('Street')}' and toponym '{row.get('Toponyms')}'") return {"full_street_name": None, "only_full_street_name": None} except Exception as e: - logger.warning(f"Error processing row with street '{row.get('Street')}' and toponym '{row.get('Toponims')}': {e}") + logger.warning(f"Error processing row with street '{row.get('Street')}' and toponym '{row.get('Toponyms')}': {e}") return {"full_street_name": None, "only_full_street_name": None} diff --git a/sloyka/src/utils/data_getter/geo_data_getter.py b/sloyka/src/utils/data_getter/geo_data_getter.py index 9be70ee..e48f965 100644 --- a/sloyka/src/utils/data_getter/geo_data_getter.py +++ b/sloyka/src/utils/data_getter/geo_data_getter.py @@ -1,4 +1,5 @@ - +import warnings +warnings.filterwarnings("ignore") import osmnx as ox import geopandas as gpd @@ -112,7 +113,7 @@ def _process_tags(tags, place): gdf = GeoDataGetter._get_features_from_place(place_name, category, tag) gdf_list.append(gdf) except AttributeError: - # GeoDataGetter._handle_error(category, tag) + logger.warning(f'Error processing {tags, place}') pass return gdf_list @@ -142,10 +143,10 @@ def get_drive_graph(city_bounds: gpd.GeoDataFrame) -> nx.MultiDiGraph: # Streets.logger.info("Retrieving drive graph") try: G_drive = ox.graph_from_polygon(city_bounds.dissolve()["geometry"].squeeze(), network_type="drive") - # Streets.logger.debug(f"Drive graph retrieved: {G_drive}") + logger.debug(f"Drive graph retrieved: {G_drive}") return G_drive except Exception as e: - # Streets.logger.error(f"Error retrieving drive graph: {e}") + logger.error(f"Error retrieving drive graph: {e}") raise e # def _handle_error(self, category, tag): diff --git a/sloyka/src/utils/data_processing/rule_for_natasha.py b/sloyka/src/utils/data_processing/rule_for_natasha.py index 8393dbb..6aaa86b 100644 --- a/sloyka/src/utils/data_processing/rule_for_natasha.py +++ b/sloyka/src/utils/data_processing/rule_for_natasha.py @@ -1,1313 +1,2196 @@ -from yargy import rule, or_, and_ -from yargy.interpretation import fact -from yargy.predicates import ( - eq, - lte, - gte, - gram, - type, - tag, - length_eq, - in_, - in_caseless, - dictionary, - normalized, - caseless, - is_title, -) -from yargy.pipelines import morph_pipeline -from yargy.tokenizer import QUOTES - -import warnings - -warnings.simplefilter("ignore") -warnings.filterwarnings("ignore", category=DeprecationWarning) - - -Index = fact("Index", ["value"]) -Region = fact("Region", ["name", "type"]) -Raion = fact("Raion", ["name", "type"]) -Settlement = fact("Settlement", ["name", "type"]) -Street = fact("Street", ["name", "type"]) -Building = fact("Building", ["number", "type"]) -Room = fact("Room", ["number", "type"]) -AddrPart = fact("AddrPart", ["value"]) - - -def value(key): - @property - def field(self): - return getattr(self, key) - - return field - - -class Index(Index): - type = "индекс" - - -class Region(Region): - value = value("name") - - -class Raion(Raion): - value = value("name") - - -class Settlement(Settlement): - value = value("name") - - -class Street(Settlement): - value = value("name") - - -class Building(Building): - value = value("number") - - -class Room(Room): - value = value("number") - - -class AddrPart(AddrPart): - @property - def obj(self): - from natasha import obj - - part = self.value - return obj.AddrPart(part.value, part.type) - - -DASH = eq("-") -DOT = eq(".") - -ADJF = gram("ADJF") -NOUN = gram("NOUN") -INT = type("INT") -TITLE = is_title() - -ANUM = rule(INT, DASH.optional(), in_caseless({"я", "й", "е", "ое", "ая", "ий", "ой"})) - - -############# -# -# FED OKRUGA -# -############ - - -FED_OKRUG_NAME = or_( - rule( - dictionary( - { - "дальневосточный", - "приволжский", - "сибирский", - "уральский", - "центральный", - "южный", - } - ) - ), - rule(caseless("северо"), DASH.optional(), dictionary({"западный", "кавказский"})), -).interpretation(Region.name) - -FED_OKRUG_WORDS = or_(rule(normalized("федеральный"), normalized("округ")), rule(caseless("фо"))).interpretation( - Region.type.const("федеральный округ") -) - -FED_OKRUG = rule(FED_OKRUG_WORDS, FED_OKRUG_NAME).interpretation(Region) - - -######### -# -# RESPUBLIKA -# -############ - - -RESPUBLIKA_WORDS = or_(rule(caseless("респ"), DOT.optional()), rule(normalized("республика"))).interpretation( - Region.type.const("республика") -) - -RESPUBLIKA_ADJF = or_( - rule( - dictionary( - { - "удмуртский", - "чеченский", - "чувашский", - } - ) - ), - rule(caseless("карачаево"), DASH.optional(), normalized("черкесский")), - rule(caseless("кабардино"), DASH.optional(), normalized("балкарский")), -).interpretation(Region.name) - -RESPUBLIKA_NAME = or_( - rule( - dictionary( - { - "адыгея", - "алтай", - "башкортостан", - "бурятия", - "дагестан", - "ингушетия", - "калмыкия", - "карелия", - "коми", - "крым", - "мордовия", - "татарстан", - "тыва", - "удмуртия", - "хакасия", - "саха", - "якутия", - } - ) - ), - rule(caseless("марий"), caseless("эл")), - rule(normalized("северный"), normalized("осетия"), rule("-", normalized("алания")).optional()), -).interpretation(Region.name) - -RESPUBLIKA_ABBR = in_caseless( - { - "кбр", - "кчр", - "рт", # Татарстан - } -).interpretation( - Region.name # TODO type -) - -RESPUBLIKA = or_( - rule(RESPUBLIKA_ADJF, RESPUBLIKA_WORDS), rule(RESPUBLIKA_WORDS, RESPUBLIKA_NAME), rule(RESPUBLIKA_ABBR) -).interpretation(Region) - - -########## -# -# KRAI -# -######## - - -KRAI_WORDS = normalized("край").interpretation(Region.type.const("край")) - -KRAI_NAME = dictionary( - { - "алтайский", - "забайкальский", - "камчатский", - "краснодарский", - "красноярский", - "пермский", - "приморский", - "ставропольский", - "хабаровский", - } -).interpretation(Region.name) - -KRAI = rule(KRAI_NAME, KRAI_WORDS).interpretation(Region) - - -############ -# -# OBLAST -# -############ - - -OBLAST_WORDS = or_(rule(normalized("область")), rule(caseless("обл"), DOT.optional())).interpretation( - Region.type.const("область") -) - -OBLAST_NAME = dictionary( - { - "амурский", - "архангельский", - "астраханский", - "белгородский", - "брянский", - "владимирский", - "волгоградский", - "вологодский", - "воронежский", - "горьковский", - "ивановский", - "ивановский", - "иркутский", - "калининградский", - "калужский", - "камчатский", - "кемеровский", - "кировский", - "костромской", - "курганский", - "курский", - "ленинградский", - "липецкий", - "магаданский", - "московский", - "мурманский", - "нижегородский", - "новгородский", - "новосибирский", - "омский", - "оренбургский", - "орловский", - "пензенский", - "пермский", - "псковский", - "ростовский", - "рязанский", - "самарский", - "саратовский", - "сахалинский", - "свердловский", - "смоленский", - "тамбовский", - "тверской", - "томский", - "тульский", - "тюменский", - "ульяновский", - "челябинский", - "читинский", - "ярославский", - } -).interpretation(Region.name) - -OBLAST = rule(OBLAST_NAME, OBLAST_WORDS).interpretation(Region) - - -########## -# -# AUTO OKRUG -# -############# - - -AUTO_OKRUG_NAME = or_( - rule( - dictionary( - { - "чукотский", - "эвенкийский", - "корякский", - "ненецкий", - "таймырский", - "агинский", - "бурятский", - } - ) - ), - rule(caseless("коми"), "-", normalized("пермяцкий")), - rule(caseless("долгано"), "-", normalized("ненецкий")), - rule(caseless("ямало"), "-", normalized("ненецкий")), -).interpretation(Region.name) - -AUTO_OKRUG_WORDS = or_(rule(normalized("автономный"), normalized("округ")), rule(caseless("ао"))).interpretation( - Region.type.const("автономный округ") -) - -HANTI = rule(caseless("ханты"), "-", normalized("мансийский")).interpretation(Region.name) - -BURAT = rule(caseless("усть"), "-", normalized("ордынский"), normalized("бурятский")).interpretation(Region.name) - -AUTO_OKRUG = or_( - rule(AUTO_OKRUG_NAME, AUTO_OKRUG_WORDS), - or_( - rule(HANTI, AUTO_OKRUG_WORDS, "-", normalized("югра")), - rule( - caseless("хмао"), - ).interpretation(Region.name), - rule(caseless("хмао"), "-", caseless("югра")).interpretation(Region.name), - ), - rule(BURAT, AUTO_OKRUG_WORDS), -).interpretation(Region) - - -########## -# -# RAION -# -########### - - -RAION_WORDS = or_(rule(caseless("р"), "-", in_caseless({"он", "н"})), rule(normalized("район"))).interpretation( - Raion.type.const("район") -) - -RAION_SIMPLE_NAME = and_(ADJF, TITLE) - -RAION_MODIFIERS = rule( - in_caseless( - { - "усть", - "северо", - "александрово", - "гаврилово", - } - ), - DASH.optional(), - TITLE, -) - -RAION_COMPLEX_NAME = rule(RAION_MODIFIERS, RAION_SIMPLE_NAME) - -RAION_NAME = or_(rule(RAION_SIMPLE_NAME), RAION_COMPLEX_NAME).interpretation(Raion.name) - -RAION = rule(RAION_NAME, RAION_WORDS).interpretation(Raion) - - -########### -# -# GOROD -# -########### - - -# Top 200 Russia cities, cover 75% of population - -COMPLEX = morph_pipeline( - [ - "санкт-петербург", - "нижний новгород", - "н.новгород", - "ростов-на-дону", - "набережные челны", - "улан-удэ", - "нижний тагил", - "комсомольск-на-амуре", - "йошкар-ола", - "старый оскол", - "великий новгород", - "южно-сахалинск", - "петропавловск-камчатский", - "каменск-уральский", - "орехово-зуево", - "сергиев посад", - "новый уренгой", - "ленинск-кузнецкий", - "великие луки", - "каменск-шахтинский", - "усть-илимск", - "усолье-сибирский", - "кирово-чепецк", - ] -) - -SIMPLE = dictionary( - { - "москва", - "новосибирск", - "екатеринбург", - "казань", - "самара", - "омск", - "челябинск", - "уфа", - "волгоград", - "пермь", - "красноярск", - "воронеж", - "саратов", - "краснодар", - "тольятти", - "барнаул", - "ижевск", - "ульяновск", - "владивосток", - "ярославль", - "иркутск", - "тюмень", - "махачкала", - "хабаровск", - "оренбург", - "новокузнецк", - "кемерово", - "рязань", - "томск", - "астрахань", - "пенза", - "липецк", - "тула", - "киров", - "чебоксары", - "калининград", - "брянск", - "курск", - "иваново", - "магнитогорск", - "тверь", - "ставрополь", - "симферополь", - "белгород", - "архангельск", - "владимир", - "севастополь", - "сочи", - "курган", - "смоленск", - "калуга", - "чита", - "орёл", - "волжский", - "череповец", - "владикавказ", - "мурманск", - "сургут", - "вологда", - "саранск", - "тамбов", - "стерлитамак", - "грозный", - "якутск", - "кострома", - "петрозаводск", - "таганрог", - "нижневартовск", - "братск", - "новороссийск", - "дзержинск", - "шахта", - "нальчик", - "орск", - "сыктывкар", - "нижнекамск", - "ангарск", - "балашиха", - "благовещенск", - "прокопьевск", - "химки", - "псков", - "бийск", - "энгельс", - "рыбинск", - "балаково", - "северодвинск", - "армавир", - "подольск", - "королёв", - "сызрань", - "норильск", - "златоуст", - "мытищи", - "люберцы", - "волгодонск", - "новочеркасск", - "абакан", - "находка", - "уссурийск", - "березники", - "салават", - "электросталь", - "миасс", - "первоуральск", - "рубцовск", - "альметьевск", - "ковровый", - "коломна", - "керчь", - "майкоп", - "пятигорск", - "одинцово", - "копейск", - "хасавюрт", - "новомосковск", - "кисловодск", - "серпухов", - "новочебоксарск", - "нефтеюганск", - "димитровград", - "нефтекамск", - "черкесск", - "дербент", - "камышин", - "невинномысск", - "красногорск", - "мур", - "батайск", - "новошахтинск", - "ноябрьск", - "кызыл", - "октябрьский", - "ачинск", - "северск", - "новокуйбышевск", - "елец", - "евпатория", - "арзамас", - "обнинск", - "каспийск", - "элиста", - "пушкино", - "жуковский", - "междуреченск", - "сарапул", - "ессентуки", - "воткинск", - "ногинск", - "тобольск", - "ухта", - "серов", - "бердск", - "мичуринск", - "киселёвск", - "новотроицк", - "зеленодольск", - "соликамск", - "раменский", - "домодедово", - "магадан", - "глазов", - "железногорск", - "канск", - "назрань", - "гатчина", - "саров", - "новоуральск", - "воскресенск", - "долгопрудный", - "бугульма", - "кузнецк", - "губкин", - "кинешма", - "ейск", - "реутов", - "железногорск", - "чайковский", - "азов", - "бузулук", - "озёрск", - "балашов", - "юрга", - "кропоткин", - "клин", - } -) - -GOROD_ABBR = in_caseless({"спб", "мск", "нск"}) # Новосибирск - -GOROD_NAME = or_(rule(SIMPLE), COMPLEX, rule(GOROD_ABBR)).interpretation(Settlement.name) - -SIMPLE = and_(TITLE, or_(NOUN, ADJF)) # Железнодорожный, Юбилейный - -COMPLEX = or_( - rule(SIMPLE, DASH.optional(), SIMPLE), rule(TITLE, DASH.optional(), caseless("на"), DASH.optional(), TITLE) -) - -NAME = or_(rule(SIMPLE), COMPLEX) - -MAYBE_GOROD_NAME = or_(NAME, rule(NAME, "-", INT)).interpretation(Settlement.name) - -GOROD_WORDS = or_(rule(normalized("город")), rule(caseless("г"), DOT.optional())).interpretation( - Settlement.type.const("город") -) - -GOROD = or_(rule(GOROD_WORDS, MAYBE_GOROD_NAME), rule(GOROD_WORDS.optional(), GOROD_NAME)).interpretation(Settlement) - - -########## -# -# SETTLEMENT NAME -# -########## - - -ADJS = gram("ADJS") -SIMPLE = and_( - or_( - NOUN, # Александровка, Заречье, Горки - ADJS, # Кузнецово - ADJF, # Никольское, Новая, Марьино - ), - TITLE, -) - -COMPLEX = rule(SIMPLE, DASH.optional(), SIMPLE) - -NAME = or_(rule(SIMPLE), COMPLEX) - -SETTLEMENT_NAME = or_(NAME, rule(NAME, "-", INT), rule(NAME, ANUM)) - - -########### -# -# SELO -# -############# - - -SELO_WORDS = or_(rule(caseless("с"), DOT.optional()), rule(normalized("село"))).interpretation( - Settlement.type.const("село") -) - -SELO_NAME = SETTLEMENT_NAME.interpretation(Settlement.name) - -SELO = rule(SELO_WORDS, SELO_NAME).interpretation(Settlement) - - -########### -# -# DEREVNYA -# -############# - - -DEREVNYA_WORDS = or_(rule(caseless("д"), DOT.optional()), rule(normalized("деревня"))).interpretation( - Settlement.type.const("деревня") -) - -DEREVNYA_NAME = SETTLEMENT_NAME.interpretation(Settlement.name) - -DEREVNYA = rule(DEREVNYA_WORDS, DEREVNYA_NAME).interpretation(Settlement) - - -########### -# -# POSELOK -# -############# - - -POSELOK_WORDS = or_( - rule(in_caseless({"п", "пос"}), DOT.optional()), - rule(normalized("посёлок")), - rule(caseless("р"), DOT.optional(), caseless("п"), DOT.optional()), - rule(normalized("рабочий"), normalized("посёлок")), - rule(caseless("пгт"), DOT.optional()), - rule(caseless("п"), DOT, caseless("г"), DOT, caseless("т"), DOT.optional()), - rule( - normalized("посёлок"), - normalized("городского"), - normalized("типа"), - ), -).interpretation(Settlement.type.const("посёлок")) - -POSELOK_NAME = SETTLEMENT_NAME.interpretation(Settlement.name) - -POSELOK = rule(POSELOK_WORDS, POSELOK_NAME).interpretation(Settlement) - - -############## -# -# ADDR PERSON -# -############ - - -ABBR = and_(length_eq(1), is_title()) - -PART = and_(TITLE, or_(gram("Name"), gram("Surn"))) - -MAYBE_FIO = or_( - rule(TITLE, PART), - rule(PART, TITLE), - rule(ABBR, ".", TITLE), - rule(ABBR, ".", ABBR, ".", TITLE), - rule(TITLE, ABBR, ".", ABBR, "."), -) - -POSITION_WORDS_ = or_( - rule( - dictionary( - { - "мичман", - "геолог", - "подводник", - "краевед", - "снайпер", - "штурман", - "бригадир", - "учитель", - "политрук", - "военком", - "ветеран", - "историк", - "пулемётчик", - "авиаконструктор", - "адмирал", - "академик", - "актер", - "актриса", - "архитектор", - "атаман", - "врач", - "воевода", - "генерал", - "губернатор", - "хирург", - "декабрист", - "разведчик", - "граф", - "десантник", - "конструктор", - "скульптор", - "писатель", - "поэт", - "капитан", - "князь", - "комиссар", - "композитор", - "космонавт", - "купец", - "лейтенант", - "лётчик", - "майор", - "маршал", - "матрос", - "подполковник", - "полковник", - "профессор", - "сержант", - "старшина", - "танкист", - "художник", - "герой", - "княгиня", - "строитель", - "дружинник", - "диктор", - "прапорщик", - "артиллерист", - "графиня", - "большевик", - "патриарх", - "сварщик", - "офицер", - "рыбак", - "брат", - } - ) - ), - rule(normalized("генерал"), normalized("армия")), - rule(normalized("герой"), normalized("россия")), - rule(normalized("герой"), normalized("российский"), normalized("федерация")), - rule(normalized("герой"), normalized("советский"), normalized("союз")), -) - -ABBR_POSITION_WORDS = rule( - in_caseless( - { - "адм", - "ак", - "акад", - } - ), - DOT.optional(), -) - -POSITION_WORDS = or_(POSITION_WORDS_, ABBR_POSITION_WORDS) - -MAYBE_PERSON = or_(MAYBE_FIO, rule(POSITION_WORDS, MAYBE_FIO), rule(POSITION_WORDS, TITLE)) - - -########### -# -# IMENI -# -########## - - -IMENI_WORDS = or_(rule(caseless("им"), DOT.optional()), rule(caseless("имени"))) - -IMENI = or_(rule(IMENI_WORDS.optional(), MAYBE_PERSON), rule(IMENI_WORDS, TITLE)) - -########## -# -# LET -# -########## - - -LET_WORDS = or_(rule(caseless("лет")), rule(DASH.optional(), caseless("летия"))) - -LET_NAME = in_caseless( - { - "влксм", - "ссср", - "алтая", - "башкирии", - "бурятии", - "дагестана", - "калмыкии", - "колхоза", - "комсомола", - "космонавтики", - "москвы", - "октября", - "пионерии", - "победы", - "приморья", - "района", - "совхоза", - "совхозу", - "татарстана", - "тувы", - "удмуртии", - "улуса", - "хакасии", - "целины", - "чувашии", - "якутии", - } -) - -LET = rule(INT, LET_WORDS, LET_NAME) - - -########## -# -# ADDR DATE -# -############# - - -MONTH_WORDS = dictionary( - { - "январь", - "февраль", - "март", - "апрель", - "май", - "июнь", - "июль", - "август", - "сентябрь", - "октябрь", - "ноябрь", - "декабрь", - } -) - -DAY = and_(INT, gte(1), lte(31)) - -YEAR = and_(INT, gte(1), lte(2100)) - -YEAR_WORDS = normalized("год") - -DATE = or_(rule(DAY, MONTH_WORDS), rule(YEAR, YEAR_WORDS)) - - -######### -# -# MODIFIER -# -############ - - -MODIFIER_WORDS_ = rule( - dictionary( - { - "большой", - "малый", - "средний", - "верхний", - "центральный", - "нижний", - "северный", - "дальний", - "первый", - "второй", - "старый", - "новый", - "красный", - "лесной", - "тихий", - } - ), - DASH.optional(), -) - -ABBR_MODIFIER_WORDS = rule(in_caseless({"б", "м", "н"}), DOT.optional()) - -SHORT_MODIFIER_WORDS = rule( - in_caseless( - { - "больше", - "мало", - "средне", - "верх", - "верхне", - "центрально", - "нижне", - "северо", - "дальне", - "восточно", - "западно", - "перво", - "второ", - "старо", - "ново", - "красно", - "тихо", - "горно", - } - ), - DASH.optional(), -) - -MODIFIER_WORDS = or_( - MODIFIER_WORDS_, - ABBR_MODIFIER_WORDS, - SHORT_MODIFIER_WORDS, -) - - -########## -# -# ADDR NAME -# -########## - - -ROD = gram("gent") - -SIMPLE = and_( - or_( - ADJF, # Школьная - and_(NOUN, ROD), # Ленина, Победы - ), - TITLE, -) - -COMPLEX = or_( - rule(and_(ADJF, TITLE), NOUN), - rule(TITLE, DASH.optional(), TITLE), -) - -# TODO -EXCEPTION = dictionary({"арбат", "варварка"}) - -MAYBE_NAME = or_(rule(SIMPLE), COMPLEX, rule(EXCEPTION)) - -NAME = or_(MAYBE_NAME, LET, DATE, IMENI) - -NAME = rule(MODIFIER_WORDS.optional(), NAME) - -ADDR_CRF = tag("I").repeatable() - -NAME = or_( - NAME, ANUM, rule(NAME, ANUM), rule(ANUM, NAME), rule(INT, DASH.optional(), NAME), rule(NAME, DASH, INT), ADDR_CRF -) - -ADDR_NAME = NAME - - -######## -# -# STREET -# -######### - - -STREET_WORDS = or_(rule(normalized("улица")), rule(caseless("ул"), DOT.optional())).interpretation( - Street.type.const("улица") -) - -STREET_NAME = ADDR_NAME.interpretation(Street.name) - -STREET = or_(rule(STREET_WORDS, STREET_NAME), rule(STREET_NAME, STREET_WORDS)).interpretation(Street) - - -########## -# -# PROSPEKT -# -########## - - -PROSPEKT_WORDS = or_( - rule(in_caseless({"пр", "просп"}), DOT.optional()), - rule(caseless("пр"), "-", in_caseless({"кт", "т"}), DOT.optional()), - rule(normalized("проспект")), -).interpretation(Street.type.const("проспект")) - -PROSPEKT_NAME = ADDR_NAME.interpretation(Street.name) - -PROSPEKT = or_(rule(PROSPEKT_WORDS, PROSPEKT_NAME), rule(PROSPEKT_NAME, PROSPEKT_WORDS)).interpretation(Street) - - -############ -# -# PROEZD -# -############# - - -PROEZD_WORDS = or_( - rule(caseless("пр"), DOT.optional()), - rule(caseless("пр"), "-", in_caseless({"зд", "д"}), DOT.optional()), - rule(normalized("проезд")), -).interpretation(Street.type.const("проезд")) - -PROEZD_NAME = ADDR_NAME.interpretation(Street.name) - -PROEZD = or_(rule(PROEZD_WORDS, PROEZD_NAME), rule(PROEZD_NAME, PROEZD_WORDS)).interpretation(Street) - - -########### -# -# PEREULOK -# -############## - - -PEREULOK_WORDS = or_( - rule(caseless("п"), DOT), rule(caseless("пер"), DOT.optional()), rule(normalized("переулок")) -).interpretation(Street.type.const("переулок")) - -PEREULOK_NAME = ADDR_NAME.interpretation(Street.name) - -PEREULOK = or_(rule(PEREULOK_WORDS, PEREULOK_NAME), rule(PEREULOK_NAME, PEREULOK_WORDS)).interpretation(Street) - - -######## -# -# PLOSHAD -# -########## - - -PLOSHAD_WORDS = or_(rule(caseless("пл"), DOT.optional()), rule(normalized("площадь"))).interpretation( - Street.type.const("площадь") -) - -PLOSHAD_NAME = ADDR_NAME.interpretation(Street.name) - -PLOSHAD = or_(rule(PLOSHAD_WORDS, PLOSHAD_NAME), rule(PLOSHAD_NAME, PLOSHAD_WORDS)).interpretation(Street) - - -############ -# -# SHOSSE -# -########### - - -# TODO -# Покровское 17 км. -# Сергеляхское 13 км -# Сергеляхское 14 км. - - -SHOSSE_WORDS = or_(rule(caseless("ш"), DOT), rule(normalized("шоссе"))).interpretation(Street.type.const("шоссе")) - -SHOSSE_NAME = ADDR_NAME.interpretation(Street.name) - -SHOSSE = or_(rule(SHOSSE_WORDS, SHOSSE_NAME), rule(SHOSSE_NAME, SHOSSE_WORDS)).interpretation(Street) - - -######## -# -# NABEREG -# -########## - - -NABEREG_WORDS = or_(rule(caseless("наб"), DOT.optional()), rule(normalized("набережная"))).interpretation( - Street.type.const("набережная") -) - -NABEREG_NAME = ADDR_NAME.interpretation(Street.name) - -NABEREG = or_(rule(NABEREG_WORDS, NABEREG_NAME), rule(NABEREG_NAME, NABEREG_WORDS)).interpretation(Street) - -######## -# -# SAD -# -########## - - -SAD_WORDS = or_(rule(caseless("са"), DOT.optional()), rule(normalized("сад"))).interpretation(Street.type.const("сад")) - -SAD_NAME = ADDR_NAME.interpretation(Street.name) - -SAD = or_(rule(SAD_WORDS, SAD_NAME), rule(SAD_NAME, SAD_WORDS)).interpretation(Street) - -######## -# -# PARK -# -########## - - -PARK_WORDS = or_(rule(caseless("пар"), DOT.optional()), rule(normalized("парк"))).interpretation( - Street.type.const("парк") -) - -PARK_NAME = ADDR_NAME.interpretation(Street.name) - -PARK = or_(rule(PARK_WORDS, PARK_NAME), rule(PARK_NAME, PARK_WORDS)).interpretation(Street) - -######## -# -# SQVER -# -########## - - -SQVER_WORDS = or_(rule(caseless("ск"), DOT.optional()), rule(normalized("сквер"))).interpretation( - Street.type.const("сквер") -) - -SQVER_NAME = ADDR_NAME.interpretation(Street.name) - -SQVER = or_(rule(SQVER_WORDS, SQVER_NAME), rule(SQVER_NAME, SQVER_WORDS)).interpretation(Street) - -######## -# -# ROSCHA -# -########## - - -ROSCHA_WORDS = or_(rule(caseless("рощ"), DOT.optional()), rule(normalized("роща"))).interpretation( - Street.type.const("роща") -) - -ROSCHA_NAME = ADDR_NAME.interpretation(Street.name) - -ROSCHA = or_(rule(ROSCHA_WORDS, ROSCHA_NAME), rule(ROSCHA_NAME, ROSCHA_WORDS)).interpretation(Street) - -######## -# -# BULVAR -# -########## - - -BULVAR_WORDS = or_( - rule(caseless("б"), "-", caseless("р")), - rule(caseless("б"), DOT), - rule(caseless("бул"), DOT.optional()), - rule(normalized("бульвар")), -).interpretation(Street.type.const("бульвар")) - -BULVAR_NAME = ADDR_NAME.interpretation(Street.name) - -BULVAR = or_(rule(BULVAR_WORDS, BULVAR_NAME), rule(BULVAR_NAME, BULVAR_WORDS)).interpretation(Street) - - -############## -# -# ADDR VALUE -# -############# - - -LETTER = in_caseless(set("абвгдежзиклмнопрстуфхшщэюя")) - -QUOTE = in_(QUOTES) - -LETTER = or_(rule(LETTER), rule(QUOTE, LETTER, QUOTE)) - -VALUE = rule(INT, LETTER.optional()) - -SEP = in_(r"/\-") - -VALUE = or_(rule(VALUE), rule(VALUE, SEP, VALUE), rule(VALUE, SEP, LETTER)) - -ADDR_VALUE = rule(eq("№").optional(), VALUE) - - -############ -# -# DOM -# -############# - - -DOM_WORDS = or_(rule(normalized("дом")), rule(caseless("д"), DOT)).interpretation(Building.type.const("дом")) - -DOM_VALUE = ADDR_VALUE.interpretation(Building.number) - -DOM = rule(DOM_WORDS, DOM_VALUE).interpretation(Building) - - -########### -# -# KORPUS -# -########## - - -KORPUS_WORDS = or_(rule(in_caseless({"корп", "кор"}), DOT.optional()), rule(normalized("корпус"))).interpretation( - Building.type.const("корпус") -) - -KORPUS_VALUE = ADDR_VALUE.interpretation(Building.number) - -KORPUS = or_(rule(KORPUS_WORDS, KORPUS_VALUE), rule(KORPUS_VALUE, KORPUS_WORDS)).interpretation(Building) - - -########### -# -# STROENIE -# -########## - - -STROENIE_WORDS = or_(rule(caseless("стр"), DOT.optional()), rule(normalized("строение"))).interpretation( - Building.type.const("строение") -) - -STROENIE_VALUE = ADDR_VALUE.interpretation(Building.number) - -STROENIE = rule(STROENIE_WORDS, STROENIE_VALUE).interpretation(Building) - - -########### -# -# OFIS -# -############# - - -OFIS_WORDS = or_(rule(caseless("оф"), DOT.optional()), rule(normalized("офис"))).interpretation(Room.type.const("офис")) - -OFIS_VALUE = ADDR_VALUE.interpretation(Room.number) - -OFIS = rule(OFIS_WORDS, OFIS_VALUE).interpretation(Room) - - -########### -# -# KVARTIRA -# -############# - - -KVARTIRA_WORDS = or_(rule(caseless("кв"), DOT.optional()), rule(normalized("квартира"))).interpretation( - Room.type.const("квартира") -) - -KVARTIRA_VALUE = ADDR_VALUE.interpretation(Room.number) - -KVARTIRA = rule(KVARTIRA_WORDS, KVARTIRA_VALUE).interpretation(Room) - - -########### -# -# INDEX -# -############# - - -INDEX = and_(INT, gte(100000), lte(999999)).interpretation(Index.value).interpretation(Index) - - -############# -# -# ADDR PART -# -############ - - -ADDR_PART = or_(PLOSHAD, SAD, ROSCHA, SQVER, PARK).interpretation(AddrPart.value).interpretation(AddrPart) + +from yargy import ( + rule, + or_, and_ +) +from yargy.interpretation import fact +from yargy.predicates import ( + eq, lte, gte, gram, type, tag, + length_eq, + in_, in_caseless, dictionary, + normalized, caseless, + is_title +) +from yargy.pipelines import morph_pipeline +from yargy.tokenizer import QUOTES + + +Index = fact( + 'Index', + ['value'] +) +Region = fact( + 'Region', + ['name', 'type'] +) +Raion = fact( + 'Raion', + ['name', 'type'] +) +Settlement = fact( + 'Settlement', + ['name', 'type'] +) +Street = fact( + 'Street', + ['name', 'type'] +) +Building = fact( + 'Building', + ['number', 'type'] +) +Room = fact( + 'Room', + ['number', 'type'] +) +AddrPart = fact( + 'AddrPart', + ['value'] +) + + +def value(key): + @property + def field(self): + return getattr(self, key) + return field + + +class Index(Index): + type = 'индекс' + + + + +class Region(Region): + value = value('name') + + +class Raion(Raion): + value = value('name') + + +class Settlement(Settlement): + value = value('name') + + +class Street(Settlement): + value = value('name') + + +class Building(Building): + value = value('number') + + +class Room(Room): + value = value('number') + + +class AddrPart(AddrPart): + @property + def obj(self): + from natasha import obj + + part = self.value + return obj.AddrPart(part.value, part.type) + + +DASH = eq('-') +DOT = eq('.') + +ADJF = gram('ADJF') +NOUN = gram('NOUN') +INT = type('INT') +TITLE = is_title() + +ANUM = rule( + INT, + DASH.optional(), + in_caseless({ + 'я', 'й', 'е', + 'ое', 'ая', 'ий', 'ой' + }) +) + + + + +############# +# +# FED OKRUGA +# +############ + + +FED_OKRUG_NAME = or_( + rule( + dictionary({ + 'дальневосточный', + 'приволжский', + 'сибирский', + 'уральский', + 'центральный', + 'южный', + }) + ), + rule( + caseless('северо'), + DASH.optional(), + dictionary({ + 'западный', + 'кавказский' + }) + ) +).interpretation( + Region.name +) + +FED_OKRUG_WORDS = or_( + rule( + normalized('федеральный'), + normalized('округ') + ), + rule(caseless('фо')) +).interpretation( + Region.type.const('федеральный округ') +) + +FED_OKRUG = rule( + FED_OKRUG_WORDS, + FED_OKRUG_NAME +).interpretation( + Region +) + + +######### +# +# RESPUBLIKA +# +############ + + +RESPUBLIKA_WORDS = or_( + rule(caseless('респ'), DOT.optional()), + rule(normalized('республика')) +).interpretation( + Region.type.const('республика') +) + +RESPUBLIKA_ADJF = or_( + rule( + dictionary({ + 'удмуртский', + 'чеченский', + 'чувашский', + }) + ), + rule( + caseless('карачаево'), + DASH.optional(), + normalized('черкесский') + ), + rule( + caseless('кабардино'), + DASH.optional(), + normalized('балкарский') + ) +).interpretation( + Region.name +) + +RESPUBLIKA_NAME = or_( + rule( + dictionary({ + 'адыгея', + 'алтай', + 'башкортостан', + 'бурятия', + 'дагестан', + 'ингушетия', + 'калмыкия', + 'карелия', + 'коми', + 'крым', + 'мордовия', + 'татарстан', + 'тыва', + 'удмуртия', + 'хакасия', + 'саха', + 'якутия', + }) + ), + rule(caseless('марий'), caseless('эл')), + rule( + normalized('северный'), normalized('осетия'), + rule('-', normalized('алания')).optional() + ) +).interpretation( + Region.name +) + +RESPUBLIKA_ABBR = in_caseless({ + 'кбр', + 'кчр', + 'рт', # Татарстан +}).interpretation( + Region.name # TODO type +) + +RESPUBLIKA = or_( + rule(RESPUBLIKA_ADJF, RESPUBLIKA_WORDS), + rule(RESPUBLIKA_WORDS, RESPUBLIKA_NAME), + rule(RESPUBLIKA_ABBR) +).interpretation( + Region +) + + +########## +# +# KRAI +# +######## + + +KRAI_WORDS = normalized('край').interpretation( + Region.type.const('край') +) + +KRAI_NAME = dictionary({ + 'алтайский', + 'забайкальский', + 'камчатский', + 'краснодарский', + 'красноярский', + 'пермский', + 'приморский', + 'ставропольский', + 'хабаровский', +}).interpretation( + Region.name +) + +KRAI = rule( + KRAI_NAME, KRAI_WORDS +).interpretation( + Region +) + + +############ +# +# OBLAST +# +############ + + +OBLAST_WORDS = or_( + rule(normalized('область')), + rule( + caseless('обл'), + DOT.optional() + ) +).interpretation( + Region.type.const('область') +) + +OBLAST_NAME = dictionary({ + 'амурский', + 'архангельский', + 'астраханский', + 'белгородский', + 'брянский', + 'владимирский', + 'волгоградский', + 'вологодский', + 'воронежский', + 'горьковский', + 'ивановский', + 'ивановский', + 'иркутский', + 'калининградский', + 'калужский', + 'камчатский', + 'кемеровский', + 'кировский', + 'костромской', + 'курганский', + 'курский', + 'ленинградский', + 'липецкий', + 'магаданский', + 'московский', + 'мурманский', + 'нижегородский', + 'новгородский', + 'новосибирский', + 'омский', + 'оренбургский', + 'орловский', + 'пензенский', + 'пермский', + 'псковский', + 'ростовский', + 'рязанский', + 'самарский', + 'саратовский', + 'сахалинский', + 'свердловский', + 'смоленский', + 'тамбовский', + 'тверской', + 'томский', + 'тульский', + 'тюменский', + 'ульяновский', + 'челябинский', + 'читинский', + 'ярославский', +}).interpretation( + Region.name +) + +OBLAST = rule( + OBLAST_NAME, + OBLAST_WORDS +).interpretation( + Region +) + + +########## +# +# AUTO OKRUG +# +############# + + +AUTO_OKRUG_NAME = or_( + rule( + dictionary({ + 'чукотский', + 'эвенкийский', + 'корякский', + 'ненецкий', + 'таймырский', + 'агинский', + 'бурятский', + }) + ), + rule(caseless('коми'), '-', normalized('пермяцкий')), + rule(caseless('долгано'), '-', normalized('ненецкий')), + rule(caseless('ямало'), '-', normalized('ненецкий')), +).interpretation( + Region.name +) + +AUTO_OKRUG_WORDS = or_( + rule( + normalized('автономный'), + normalized('округ') + ), + rule(caseless('ао')) +).interpretation( + Region.type.const('автономный округ') +) + +HANTI = rule( + caseless('ханты'), '-', normalized('мансийский') +).interpretation( + Region.name +) + +BURAT = rule( + caseless('усть'), '-', normalized('ордынский'), + normalized('бурятский') +).interpretation( + Region.name +) + +AUTO_OKRUG = or_( + rule(AUTO_OKRUG_NAME, AUTO_OKRUG_WORDS), + or_( + rule( + HANTI, + AUTO_OKRUG_WORDS, + '-', normalized('югра') + ), + rule( + caseless('хмао'), + ).interpretation(Region.name), + rule( + caseless('хмао'), + '-', caseless('югра') + ).interpretation(Region.name), + ), + rule( + BURAT, + AUTO_OKRUG_WORDS + ) +).interpretation( + Region +) + + +########## +# +# RAION +# +########### + + +RAION_WORDS = or_( + rule(caseless('р'), '-', in_caseless({'он', 'н'})), + rule(normalized('район')) +).interpretation( + Raion.type.const('район') +) + +RAION_SIMPLE_NAME = and_( + ADJF, + TITLE +) + +RAION_MODIFIERS = rule( + in_caseless({ + 'усть', + 'северо', + 'александрово', + 'гаврилово', + }), + DASH.optional(), + TITLE +) + +RAION_COMPLEX_NAME = rule( + RAION_MODIFIERS, + RAION_SIMPLE_NAME +) + +RAION_NAME = or_( + rule(RAION_SIMPLE_NAME), + RAION_COMPLEX_NAME +).interpretation( + Raion.name +) + +RAION = rule( + RAION_NAME, + RAION_WORDS +).interpretation( + Raion +) + + +########### +# +# GOROD +# +########### + + +# Top 200 Russia cities, cover 75% of population + +COMPLEX = morph_pipeline([ + 'санкт-петербург', + 'нижний новгород', + 'н.новгород', + 'ростов-на-дону', + 'набережные челны', + 'улан-удэ', + 'нижний тагил', + 'комсомольск-на-амуре', + 'йошкар-ола', + 'старый оскол', + 'великий новгород', + 'южно-сахалинск', + 'петропавловск-камчатский', + 'каменск-уральский', + 'орехово-зуево', + 'сергиев посад', + 'новый уренгой', + 'ленинск-кузнецкий', + 'великие луки', + 'каменск-шахтинский', + 'усть-илимск', + 'усолье-сибирский', + 'кирово-чепецк', +]) + +SIMPLE = dictionary({ + 'москва', + 'новосибирск', + 'екатеринбург', + 'казань', + 'самара', + 'омск', + 'челябинск', + 'уфа', + 'волгоград', + 'пермь', + 'красноярск', + 'воронеж', + 'саратов', + 'краснодар', + 'тольятти', + 'барнаул', + 'ижевск', + 'ульяновск', + 'владивосток', + 'ярославль', + 'иркутск', + 'тюмень', + 'махачкала', + 'хабаровск', + 'оренбург', + 'новокузнецк', + 'кемерово', + 'рязань', + 'томск', + 'астрахань', + 'пенза', + 'липецк', + 'тула', + 'киров', + 'чебоксары', + 'калининград', + 'брянск', + 'курск', + 'иваново', + 'магнитогорск', + 'тверь', + 'ставрополь', + 'симферополь', + 'белгород', + 'архангельск', + 'владимир', + 'севастополь', + 'сочи', + 'курган', + 'смоленск', + 'калуга', + 'чита', + 'орёл', + 'волжский', + 'череповец', + 'владикавказ', + 'мурманск', + 'сургут', + 'вологда', + 'саранск', + 'тамбов', + 'стерлитамак', + 'грозный', + 'якутск', + 'кострома', + 'петрозаводск', + 'таганрог', + 'нижневартовск', + 'братск', + 'новороссийск', + 'дзержинск', + 'шахта', + 'нальчик', + 'орск', + 'сыктывкар', + 'нижнекамск', + 'ангарск', + 'балашиха', + 'благовещенск', + 'прокопьевск', + 'химки', + 'псков', + 'бийск', + 'энгельс', + 'рыбинск', + 'балаково', + 'северодвинск', + 'армавир', + 'подольск', + 'королёв', + 'сызрань', + 'норильск', + 'златоуст', + 'мытищи', + 'люберцы', + 'волгодонск', + 'новочеркасск', + 'абакан', + 'находка', + 'уссурийск', + 'березники', + 'салават', + 'электросталь', + 'миасс', + 'первоуральск', + 'рубцовск', + 'альметьевск', + 'ковровый', + 'коломна', + 'керчь', + 'майкоп', + 'пятигорск', + 'одинцово', + 'копейск', + 'хасавюрт', + 'новомосковск', + 'кисловодск', + 'серпухов', + 'новочебоксарск', + 'нефтеюганск', + 'димитровград', + 'нефтекамск', + 'черкесск', + 'дербент', + 'камышин', + 'невинномысск', + 'красногорск', + 'мур', + 'батайск', + 'новошахтинск', + 'ноябрьск', + 'кызыл', + 'октябрьский', + 'ачинск', + 'северск', + 'новокуйбышевск', + 'елец', + 'евпатория', + 'арзамас', + 'обнинск', + 'каспийск', + 'элиста', + 'пушкино', + 'жуковский', + 'междуреченск', + 'сарапул', + 'ессентуки', + 'воткинск', + 'ногинск', + 'тобольск', + 'ухта', + 'серов', + 'бердск', + 'мичуринск', + 'киселёвск', + 'новотроицк', + 'зеленодольск', + 'соликамск', + 'раменский', + 'домодедово', + 'магадан', + 'глазов', + 'железногорск', + 'канск', + 'назрань', + 'гатчина', + 'саров', + 'новоуральск', + 'воскресенск', + 'долгопрудный', + 'бугульма', + 'кузнецк', + 'губкин', + 'кинешма', + 'ейск', + 'реутов', + 'железногорск', + 'чайковский', + 'азов', + 'бузулук', + 'озёрск', + 'балашов', + 'юрга', + 'кропоткин', + 'клин' +}) + +GOROD_ABBR = in_caseless({ + 'спб', + 'мск', + 'нск' # Новосибирск +}) + +GOROD_NAME = or_( + rule(SIMPLE), + COMPLEX, + rule(GOROD_ABBR) +).interpretation( + Settlement.name +) + +SIMPLE = and_( + TITLE, + or_( + NOUN, + ADJF # Железнодорожный, Юбилейный + ) +) + +COMPLEX = or_( + rule( + SIMPLE, + DASH.optional(), + SIMPLE + ), + rule( + TITLE, + DASH.optional(), + caseless('на'), + DASH.optional(), + TITLE + ) +) + +NAME = or_( + rule(SIMPLE), + COMPLEX +) + +MAYBE_GOROD_NAME = or_( + NAME, + rule(NAME, '-', INT) +).interpretation( + Settlement.name +) + +GOROD_WORDS = or_( + rule(normalized('город')), + rule( + caseless('г'), + DOT.optional() + ) +).interpretation( + Settlement.type.const('город') +) + +GOROD = or_( + rule(GOROD_WORDS, MAYBE_GOROD_NAME), + rule( + GOROD_WORDS.optional(), + GOROD_NAME + ) +).interpretation( + Settlement +) + + +########## +# +# SETTLEMENT NAME +# +########## + + +ADJS = gram('ADJS') +SIMPLE = and_( + or_( + NOUN, # Александровка, Заречье, Горки + ADJS, # Кузнецово + ADJF, # Никольское, Новая, Марьино + ), + TITLE +) + +COMPLEX = rule( + SIMPLE, + DASH.optional(), + SIMPLE +) + +NAME = or_( + rule(SIMPLE), + COMPLEX +) + +SETTLEMENT_NAME = or_( + NAME, + rule(NAME, '-', INT), + rule(NAME, ANUM) +) + + +########### +# +# SELO +# +############# + + +SELO_WORDS = or_( + rule( + caseless('с'), + DOT.optional() + ), + rule(normalized('село')) +).interpretation( + Settlement.type.const('село') +) + +SELO_NAME = SETTLEMENT_NAME.interpretation( + Settlement.name +) + +SELO = rule( + SELO_WORDS, + SELO_NAME +).interpretation( + Settlement +) + + +########### +# +# DEREVNYA +# +############# + + +DEREVNYA_WORDS = or_( + rule( + caseless('д'), + DOT.optional() + ), + rule(normalized('деревня')) +).interpretation( + Settlement.type.const('деревня') +) + +DEREVNYA_NAME = SETTLEMENT_NAME.interpretation( + Settlement.name +) + +DEREVNYA = rule( + DEREVNYA_WORDS, + DEREVNYA_NAME +).interpretation( + Settlement +) + + +########### +# +# POSELOK +# +############# + + +POSELOK_WORDS = or_( + rule( + in_caseless({'п', 'пос'}), + DOT.optional() + ), + rule(normalized('посёлок')), + rule( + caseless('р'), + DOT.optional(), + caseless('п'), + DOT.optional() + ), + rule( + normalized('рабочий'), + normalized('посёлок') + ), + rule( + caseless('пгт'), + DOT.optional() + ), + rule( + caseless('п'), DOT, caseless('г'), DOT, caseless('т'), + DOT.optional() + ), + rule( + normalized('посёлок'), + normalized('городского'), + normalized('типа'), + ), +).interpretation( + Settlement.type.const('посёлок') +) + +POSELOK_NAME = SETTLEMENT_NAME.interpretation( + Settlement.name +) + +POSELOK = rule( + POSELOK_WORDS, + POSELOK_NAME +).interpretation( + Settlement +) + + +############## +# +# ADDR PERSON +# +############ + + +ABBR = and_( + length_eq(1), + is_title() +) + +PART = and_( + TITLE, + or_( + gram('Name'), + gram('Surn') + ) +) + +MAYBE_FIO = or_( + rule(TITLE, PART), + rule(PART, TITLE), + rule(ABBR, '.', TITLE), + rule(ABBR, '.', ABBR, '.', TITLE), + rule(TITLE, ABBR, '.', ABBR, '.') +) + +POSITION_WORDS_ = or_( + rule( + dictionary({ + 'мичман', + 'геолог', + 'подводник', + 'краевед', + 'снайпер', + 'штурман', + 'бригадир', + 'учитель', + 'политрук', + 'военком', + 'ветеран', + 'историк', + 'пулемётчик', + 'авиаконструктор', + 'адмирал', + 'академик', + 'актер', + 'актриса', + 'архитектор', + 'атаман', + 'врач', + 'воевода', + 'генерал', + 'губернатор', + 'хирург', + 'декабрист', + 'разведчик', + 'граф', + 'десантник', + 'конструктор', + 'скульптор', + 'писатель', + 'поэт', + 'капитан', + 'князь', + 'комиссар', + 'композитор', + 'космонавт', + 'купец', + 'лейтенант', + 'лётчик', + 'майор', + 'маршал', + 'матрос', + 'подполковник', + 'полковник', + 'профессор', + 'сержант', + 'старшина', + 'танкист', + 'художник', + 'герой', + 'княгиня', + 'строитель', + 'дружинник', + 'диктор', + 'прапорщик', + 'артиллерист', + 'графиня', + 'большевик', + 'патриарх', + 'сварщик', + 'офицер', + 'рыбак', + 'брат', + }) + ), + rule(normalized('генерал'), normalized('армия')), + rule(normalized('герой'), normalized('россия')), + rule( + normalized('герой'), + normalized('российский'), normalized('федерация')), + rule( + normalized('герой'), + normalized('советский'), normalized('союз') + ), +) + +ABBR_POSITION_WORDS = rule( + in_caseless({ + 'адм', + 'ак', + 'акад', + }), + DOT.optional() +) + +POSITION_WORDS = or_( + POSITION_WORDS_, + ABBR_POSITION_WORDS +) + +MAYBE_PERSON = or_( + MAYBE_FIO, + rule(POSITION_WORDS, MAYBE_FIO), + rule(POSITION_WORDS, TITLE) +) + + +########### +# +# IMENI +# +########## + + +IMENI_WORDS = or_( + rule( + caseless('им'), + DOT.optional() + ), + rule(caseless('имени')) +) + +IMENI = or_( + rule( + IMENI_WORDS.optional(), + MAYBE_PERSON + ), + rule( + IMENI_WORDS, + TITLE + ) +) + +########## +# +# LET +# +########## + + +LET_WORDS = or_( + rule(caseless('лет')), + rule( + DASH.optional(), + caseless('летия') + ) +) + +LET_NAME = in_caseless({ + 'влксм', + 'ссср', + 'алтая', + 'башкирии', + 'бурятии', + 'дагестана', + 'калмыкии', + 'колхоза', + 'комсомола', + 'космонавтики', + 'москвы', + 'октября', + 'пионерии', + 'победы', + 'приморья', + 'района', + 'совхоза', + 'совхозу', + 'татарстана', + 'тувы', + 'удмуртии', + 'улуса', + 'хакасии', + 'целины', + 'чувашии', + 'якутии', +}) + +LET = rule( + INT, + LET_WORDS, + LET_NAME +) + + +########## +# +# ADDR DATE +# +############# + + +MONTH_WORDS = dictionary({ + 'январь', + 'февраль', + 'март', + 'апрель', + 'май', + 'июнь', + 'июль', + 'август', + 'сентябрь', + 'октябрь', + 'ноябрь', + 'декабрь', +}) + +DAY = and_( + INT, + gte(1), + lte(31) +) + +YEAR = and_( + INT, + gte(1), + lte(2100) +) + +YEAR_WORDS = normalized('год') + +DATE = or_( + rule(DAY, MONTH_WORDS), + rule(YEAR, YEAR_WORDS) +) + + +######### +# +# MODIFIER +# +############ + + +MODIFIER_WORDS_ = rule( + dictionary({ + 'большой', + 'малый', + 'средний', + + 'верхний', + 'центральный', + 'нижний', + 'северный', + 'дальний', + + 'первый', + 'второй', + + 'старый', + 'новый', + + 'красный', + 'лесной', + 'тихий', + }), + DASH.optional() +) + +ABBR_MODIFIER_WORDS = rule( + in_caseless({ + 'б', 'м', 'н' + }), + DOT.optional() +) + +SHORT_MODIFIER_WORDS = rule( + in_caseless({ + 'больше', + 'мало', + 'средне', + + 'верх', + 'верхне', + 'центрально', + 'нижне', + 'северо', + 'дальне', + 'восточно', + 'западно', + + 'перво', + 'второ', + + 'старо', + 'ново', + + 'красно', + 'тихо', + 'горно', + }), + DASH.optional() +) + +MODIFIER_WORDS = or_( + MODIFIER_WORDS_, + ABBR_MODIFIER_WORDS, + SHORT_MODIFIER_WORDS, +) + + +########## +# +# ADDR NAME +# +########## + + +ROD = gram('gent') + +SIMPLE = and_( + or_( + ADJF, # Школьная + and_(NOUN, ROD), # Ленина, Победы + ), + TITLE +) + +COMPLEX = or_( + rule( + and_(ADJF, TITLE), + NOUN + ), + rule( + TITLE, + DASH.optional(), + TITLE + ), +) + +# TODO +EXCEPTION = dictionary({ + 'арбат', + 'варварка' +}) + +MAYBE_NAME = or_( + rule(SIMPLE), + COMPLEX, + rule(EXCEPTION) +) + +NAME = or_( + MAYBE_NAME, + LET, + DATE, + IMENI +) + +NAME = rule( + MODIFIER_WORDS.optional(), + NAME +) + +ADDR_CRF = tag('I').repeatable() + +NAME = or_( + NAME, + ANUM, + rule(NAME, ANUM), + rule(ANUM, NAME), + rule(INT, DASH.optional(), NAME), + rule(NAME, DASH, INT), + ADDR_CRF +) + +ADDR_NAME = NAME + + +######## +# +# STREET +# +######### + + +STREET_WORDS = or_( + rule(normalized('улица')), + rule( + caseless('ул'), + DOT.optional() + ) +).interpretation( + Street.type.const('улица') +) + +STREET_NAME = ADDR_NAME.interpretation( + Street.name +) + +STREET = or_( + rule(STREET_WORDS, STREET_NAME), + rule(STREET_NAME, STREET_WORDS) +).interpretation( + Street +) + + +########## +# +# PROSPEKT +# +########## + + +PROSPEKT_WORDS = or_( + rule( + in_caseless({'пр', 'просп'}), + DOT.optional() + ), + rule( + caseless('пр'), + '-', + in_caseless({'кт', 'т'}), + DOT.optional() + ), + rule(normalized('проспект')) +).interpretation( + Street.type.const('проспект') +) + +PROSPEKT_NAME = ADDR_NAME.interpretation( + Street.name +) + +PROSPEKT = or_( + rule(PROSPEKT_WORDS, PROSPEKT_NAME), + rule(PROSPEKT_NAME, PROSPEKT_WORDS) +).interpretation( + Street +) + + +############ +# +# PROEZD +# +############# + + +PROEZD_WORDS = or_( + rule(caseless('пр'), DOT.optional()), + rule( + caseless('пр'), + '-', + in_caseless({'зд', 'д'}), + DOT.optional() + ), + rule(normalized('проезд')) +).interpretation( + Street.type.const('проезд') +) + +PROEZD_NAME = ADDR_NAME.interpretation( + Street.name +) + +PROEZD = or_( + rule(PROEZD_WORDS, PROEZD_NAME), + rule(PROEZD_NAME, PROEZD_WORDS) +).interpretation( + Street +) + + +########### +# +# PEREULOK +# +############## + + +PEREULOK_WORDS = or_( + rule( + caseless('п'), + DOT + ), + rule( + caseless('пер'), + DOT.optional() + ), + rule(normalized('переулок')) +).interpretation( + Street.type.const('переулок') +) + +PEREULOK_NAME = ADDR_NAME.interpretation( + Street.name +) + +PEREULOK = or_( + rule(PEREULOK_WORDS, PEREULOK_NAME), + rule(PEREULOK_NAME, PEREULOK_WORDS) +).interpretation( + Street +) + + +######## +# +# PLOSHAD +# +########## + + +PLOSHAD_WORDS = or_( + rule( + caseless('пл'), + DOT.optional() + ), + rule(normalized('площадь')) +).interpretation( + Street.type.const('площадь') +) + +PLOSHAD_NAME = ADDR_NAME.interpretation( + Street.name +) + +PLOSHAD = or_( + rule(PLOSHAD_WORDS, PLOSHAD_NAME), + rule(PLOSHAD_NAME, PLOSHAD_WORDS) +).interpretation( + Street +) + + +############ +# +# SHOSSE +# +########### + + +# TODO +# Покровское 17 км. +# Сергеляхское 13 км +# Сергеляхское 14 км. + + +SHOSSE_WORDS = or_( + rule( + caseless('ш'), + DOT + ), + rule(normalized('шоссе')) +).interpretation( + Street.type.const('шоссе') +) + +SHOSSE_NAME = ADDR_NAME.interpretation( + Street.name +) + +SHOSSE = or_( + rule(SHOSSE_WORDS, SHOSSE_NAME), + rule(SHOSSE_NAME, SHOSSE_WORDS) +).interpretation( + Street +) + + +######## +# +# NABEREG +# +########## + + +NABEREG_WORDS = or_( + rule( + caseless('наб'), + DOT.optional() + ), + rule(normalized('набережная')) +).interpretation( + Street.type.const('набережная') +) + +NABEREG_NAME = ADDR_NAME.interpretation( + Street.name +) + +NABEREG = or_( + rule(NABEREG_WORDS, NABEREG_NAME), + rule(NABEREG_NAME, NABEREG_WORDS) +).interpretation( + Street +) + +######## +# +# SAD +# +########## + + +SAD_WORDS = or_( + rule( + caseless('са'), + DOT.optional() + ), + rule(normalized('сад')) +).interpretation( + Street.type.const('сад') +) + +SAD_NAME = ADDR_NAME.interpretation( + Street.name +) + +SAD = or_( + rule(SAD_WORDS, SAD_NAME), + rule(SAD_NAME, SAD_WORDS) +).interpretation( + Street +) + +######## +# +# POLE +# +########## + + +POLE_WORDS = or_( + rule( + caseless('пол'), + DOT.optional() + ), + rule(normalized('поле')) +).interpretation( + Street.type.const('поле') +) + +POLE_NAME = ADDR_NAME.interpretation( + Street.name +) + +POLE = or_( + rule(POLE_WORDS, POLE_NAME), + rule(POLE_NAME, POLE_WORDS) +).interpretation( + Street +) + +######## +# +# KLAD +# +########## + + +KLAD_WORDS = or_( + rule( + caseless('клад'), + DOT.optional() + ), + rule(normalized('кладбище')) +).interpretation( + Street.type.const('кладбище') +) + +KLAD_NAME = ADDR_NAME.interpretation( + Street.name +) + +KLAD = or_( + rule(KLAD_WORDS, KLAD_NAME), + rule(KLAD_NAME, KLAD_WORDS) +).interpretation( + Street +) + +######## +# +# PLAJ +# +########## + + +PLAJ_WORDS = or_( + rule( + caseless('пля'), + DOT.optional() + ), + rule(normalized('пляж')) +).interpretation( + Street.type.const('пляж') +) + +PLAJ_NAME = ADDR_NAME.interpretation( + Street.name +) + +PLAJ = or_( + rule(PLAJ_WORDS, PLAJ_NAME), + rule(PLAJ_NAME, PLAJ_WORDS) +).interpretation( + Street +) + +######## +# +# PRUD +# +########## + + +PRUD_WORDS = or_( + rule( + caseless('пру'), + DOT.optional() + ), + rule(normalized('пруд')) +).interpretation( + Street.type.const('пруд') +) + +PRUD_NAME = ADDR_NAME.interpretation( + Street.name +) + +PRUD = or_( + rule(PRUD_WORDS, PRUD_NAME), + rule(PRUD_NAME, PRUD_WORDS) +).interpretation( + Street +) + +######## +# +# VOKZAL +# +########## + + +VOKZAL_WORDS = or_( + rule( + caseless('вок'), + DOT.optional() + ), + rule(normalized('вокзал')) +).interpretation( + Street.type.const('вокзал') +) + +VOKZAL_NAME = ADDR_NAME.interpretation( + Street.name +) + +VOKZAL = or_( + rule(VOKZAL_WORDS, VOKZAL_NAME), + rule(VOKZAL_NAME, VOKZAL_WORDS) +).interpretation( + Street +) + +######## +# +# METRO +# +########## + + +METRO_WORDS = or_( + rule( + caseless('ст'), + DOT.optional() + ), + rule(normalized('метро')) +).interpretation( + Street.type.const('метро') +) + +METRO_NAME = ADDR_NAME.interpretation( + Street.name +) + +METRO = or_( + rule(METRO_WORDS, METRO_NAME), + rule(METRO_NAME, METRO_WORDS) +).interpretation( + Street +) + +######## +# +# TEATR +# +########## + + +TEATR_WORDS = or_( + rule( + caseless('т'), + DOT.optional() + ), + rule(normalized('театр')) +).interpretation( + Street.type.const('театр') +) + +TEATR_NAME = ADDR_NAME.interpretation( + Street.name +) + +TEATR = or_( + rule(TEATR_WORDS, TEATR_NAME), + rule(TEATR_NAME, TEATR_WORDS) +).interpretation( + Street +) + +######## +# +# MUZEI +# +########## + + +MUZEI_WORDS = or_( + rule( + caseless('муз'), + DOT.optional() + ), + rule(normalized('музей')) +).interpretation( + Street.type.const('музей') +) + +MUZEI_NAME = ADDR_NAME.interpretation( + Street.name +) + +MUZEI = or_( + rule(MUZEI_WORDS, MUZEI_NAME), + rule(MUZEI_NAME, MUZEI_WORDS) +).interpretation( + Street +) + +######## +# +# PAMETNIK +# +########## + + +PAMETNIK_WORDS = or_( + rule( + caseless('пам'), + DOT.optional() + ), + rule(normalized('памятник')) +).interpretation( + Street.type.const('памятник') +) + +PAMETNIK_NAME = ADDR_NAME.interpretation( + Street.name +) + +PAMETNIK = or_( + rule(PAMETNIK_WORDS, PAMETNIK_NAME), + rule(PAMETNIK_NAME, PAMETNIK_WORDS) +).interpretation( + Street +) + +######## +# +# STATUIA +# +########## + + +STATUIA_WORDS = or_( + rule( + caseless('стат'), + DOT.optional() + ), + rule(normalized('статуя')) +).interpretation( + Street.type.const('статуя') +) + +STATUIA_NAME = ADDR_NAME.interpretation( + Street.name +) + +STATUIA = or_( + rule(STATUIA_WORDS, STATUIA_NAME), + rule(STATUIA_NAME, STATUIA_WORDS) +).interpretation( + Street +) + +######## +# +# PARK +# +########## + + +PARK_WORDS = or_( + rule( + caseless('пар'), + DOT.optional() + ), + rule(normalized('парк')) +).interpretation( + Street.type.const('парк') +) + +PARK_NAME = ADDR_NAME.interpretation( + Street.name +) + +PARK = or_( + rule(PARK_WORDS, PARK_NAME), + rule(PARK_NAME, PARK_WORDS) +).interpretation( + Street +) + +######## +# +# SQVER +# +########## + + +SQVER_WORDS = or_( + rule( + caseless('ск'), + DOT.optional() + ), + rule(normalized('сквер')) +).interpretation( + Street.type.const('сквер') +) + +SQVER_NAME = ADDR_NAME.interpretation( + Street.name +) + +SQVER = or_( + rule(SQVER_WORDS, SQVER_NAME), + rule(SQVER_NAME, SQVER_WORDS) +).interpretation( + Street +) + +######## +# +# ROSCHA +# +########## + + +ROSCHA_WORDS = or_( + rule( + caseless('рощ'), + DOT.optional() + ), + rule(normalized('роща')) +).interpretation( + Street.type.const('роща') +) + +ROSCHA_NAME = ADDR_NAME.interpretation( + Street.name +) + +ROSCHA = or_( + rule(ROSCHA_WORDS, ROSCHA_NAME), + rule(ROSCHA_NAME, ROSCHA_WORDS) +).interpretation( + Street +) + +######## +# +# BULVAR +# +########## + + +BULVAR_WORDS = or_( + rule( + caseless('б'), + '-', + caseless('р') + ), + rule( + caseless('б'), + DOT + ), + rule( + caseless('бул'), + DOT.optional() + ), + rule(normalized('бульвар')) +).interpretation( + Street.type.const('бульвар') +) + +BULVAR_NAME = ADDR_NAME.interpretation( + Street.name +) + +BULVAR = or_( + rule(BULVAR_WORDS, BULVAR_NAME), + rule(BULVAR_NAME, BULVAR_WORDS) +).interpretation( + Street +) + + +############## +# +# ADDR VALUE +# +############# + + +LETTER = in_caseless(set('абвгдежзиклмнопрстуфхшщэюя')) + +QUOTE = in_(QUOTES) + +LETTER = or_( + rule(LETTER), + rule(QUOTE, LETTER, QUOTE) +) + +VALUE = rule( + INT, + LETTER.optional() +) + +SEP = in_(r'/\-') + +VALUE = or_( + rule(VALUE), + rule(VALUE, SEP, VALUE), + rule(VALUE, SEP, LETTER) +) + +ADDR_VALUE = rule( + eq('№').optional(), + VALUE +) + + +############ +# +# DOM +# +############# + + +DOM_WORDS = or_( + rule(normalized('дом')), + rule( + caseless('д'), + DOT + ) +).interpretation( + Building.type.const('дом') +) + +DOM_VALUE = ADDR_VALUE.interpretation( + Building.number +) + +DOM = rule( + DOM_WORDS, + DOM_VALUE +).interpretation( + Building +) + + +########### +# +# KORPUS +# +########## + + +KORPUS_WORDS = or_( + rule( + in_caseless({'корп', 'кор'}), + DOT.optional() + ), + rule(normalized('корпус')) +).interpretation( + Building.type.const('корпус') +) + +KORPUS_VALUE = ADDR_VALUE.interpretation( + Building.number +) + +KORPUS = or_( + rule( + KORPUS_WORDS, + KORPUS_VALUE + ), + rule( + KORPUS_VALUE, + KORPUS_WORDS + ) +).interpretation( + Building +) + + +########### +# +# STROENIE +# +########## + + +STROENIE_WORDS = or_( + rule( + caseless('стр'), + DOT.optional() + ), + rule(normalized('строение')) +).interpretation( + Building.type.const('строение') +) + +STROENIE_VALUE = ADDR_VALUE.interpretation( + Building.number +) + +STROENIE = rule( + STROENIE_WORDS, + STROENIE_VALUE +).interpretation( + Building +) + + +########### +# +# OFIS +# +############# + + +OFIS_WORDS = or_( + rule( + caseless('оф'), + DOT.optional() + ), + rule(normalized('офис')) +).interpretation( + Room.type.const('офис') +) + +OFIS_VALUE = ADDR_VALUE.interpretation( + Room.number +) + +OFIS = rule( + OFIS_WORDS, + OFIS_VALUE +).interpretation( + Room +) + + +########### +# +# KVARTIRA +# +############# + + +KVARTIRA_WORDS = or_( + rule( + caseless('кв'), + DOT.optional() + ), + rule(normalized('квартира')) +).interpretation( + Room.type.const('квартира') +) + +KVARTIRA_VALUE = ADDR_VALUE.interpretation( + Room.number +) + +KVARTIRA = rule( + KVARTIRA_WORDS, + KVARTIRA_VALUE +).interpretation( + Room +) + + +########### +# +# INDEX +# +############# + + +INDEX = and_( + INT, + gte(100000), + lte(999999) +).interpretation( + Index.value +).interpretation( + Index +) + + +############# +# +# ADDR PART +# +############ + + +ADDR_PART = or_( + SAD, + ROSCHA, + SQVER, + PARK, + POLE, + KLAD, + PLAJ, + PRUD, + VOKZAL, + METRO, + TEATR, + MUZEI, + PAMETNIK +).interpretation( + AddrPart.value +).interpretation( + AddrPart +)