diff --git a/.DS_Store b/.DS_Store index 9a130e8..fbea74f 100644 Binary files a/.DS_Store and b/.DS_Store differ diff --git a/examples/city_services_example.ipynb b/examples/city_services_example.ipynb index a472674..4674a12 100644 --- a/examples/city_services_example.ipynb +++ b/examples/city_services_example.ipynb @@ -23,7 +23,25 @@ "name": "stdout", "output_type": "stream", "text": [ - "2024-04-25 10:08:13,127 SequenceTagger predicts: Dictionary with 7 tags: O, S-Service, B-Service, E-Service, I-Service, , \n" + "INFO: Pandarallel will run on -1 workers.\n", + "INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package stopwords to /Users/test/nltk_data...\n", + "[nltk_data] Package stopwords is already up-to-date!\n", + "[nltk_data] Downloading package stopwords to /Users/test/nltk_data...\n", + "[nltk_data] Package stopwords is already up-to-date!\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2024-06-09 13:02:33,176 SequenceTagger predicts: Dictionary with 7 tags: O, S-Service, B-Service, E-Service, I-Service, , \n" ] } ], @@ -159,6 +177,184 @@ "result = City_services().run(df, text_column='Текст комментария')" ] }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
message_idДата и времяТекст комментарияCity_services_extracedCity_services
002023.01.26 16:32Здравствуйте! В Санкт-Петербурге нет Генеральн...[][]
112023.01.26 11:55[club143265175|Центральный район Санкт-Петербу...[][]
222023.01.28 12:391) Фурштатская, 19 Отслоение штукатурного слоя...[][]
332023.01.28 12:422) Фурштатская, 17 Здесь прямо-таки умоляю обр...[][]
442023.01.28 12:453) Фурштатская, 13 Отслоение штукатурного слоя...[][]
..................
1951952022.10.24 14:02На Чайковского 63 тоже идет кап.ремонт. В квар...[][]
1961962022.10.21 22:22Вся улица Жуковского и Восстания заклеена рекл...[][]
1971972022.10.26 12:57О каком благоустройстве идёт речь. Стремянная...[][]
1981982022.10.20 22:08🤣🤣🤣🤣 угол 1й советской и Суворовского Клюваты...[][]
1991992022.10.18 19:24Искренне благодарю за участие в решении многих...[Фонтанке][Фонтан]
\n", + "

200 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " message_id Дата и время \\\n", + "0 0 2023.01.26 16:32 \n", + "1 1 2023.01.26 11:55 \n", + "2 2 2023.01.28 12:39 \n", + "3 3 2023.01.28 12:42 \n", + "4 4 2023.01.28 12:45 \n", + ".. ... ... \n", + "195 195 2022.10.24 14:02 \n", + "196 196 2022.10.21 22:22 \n", + "197 197 2022.10.26 12:57 \n", + "198 198 2022.10.20 22:08 \n", + "199 199 2022.10.18 19:24 \n", + "\n", + " Текст комментария City_services_extraced \\\n", + "0 Здравствуйте! В Санкт-Петербурге нет Генеральн... [] \n", + "1 [club143265175|Центральный район Санкт-Петербу... [] \n", + "2 1) Фурштатская, 19 Отслоение штукатурного слоя... [] \n", + "3 2) Фурштатская, 17 Здесь прямо-таки умоляю обр... [] \n", + "4 3) Фурштатская, 13 Отслоение штукатурного слоя... [] \n", + ".. ... ... \n", + "195 На Чайковского 63 тоже идет кап.ремонт. В квар... [] \n", + "196 Вся улица Жуковского и Восстания заклеена рекл... [] \n", + "197 О каком благоустройстве идёт речь. Стремянная... [] \n", + "198 🤣🤣🤣🤣 угол 1й советской и Суворовского Клюваты... [] \n", + "199 Искренне благодарю за участие в решении многих... [Фонтанке] \n", + "\n", + " City_services \n", + "0 [] \n", + "1 [] \n", + "2 [] \n", + "3 [] \n", + "4 [] \n", + ".. ... \n", + "195 [] \n", + "196 [] \n", + "197 [] \n", + "198 [] \n", + "199 [Фонтан] \n", + "\n", + "[200 rows x 5 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(result)" + ] + }, { "cell_type": "code", "execution_count": 4, @@ -676,7 +872,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.8" + "version": "3.11.9" } }, "nbformat": 4, diff --git a/examples/geocoder_example.ipynb b/examples/geocoder_example.ipynb index 661142e..028d059 100644 --- a/examples/geocoder_example.ipynb +++ b/examples/geocoder_example.ipynb @@ -16,37 +16,9 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "INFO: Pandarallel will run on -1 workers.\n", - "INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.\n", - "\n", - "WARNING: You are on Windows. If you detect any issue with pandarallel, be sure you checked out the Troubleshooting page:\n", - "https://nalepae.github.io/pandarallel/troubleshooting/\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[nltk_data] Downloading package stopwords to\n", - "[nltk_data] C:\\Users\\trolo\\AppData\\Roaming\\nltk_data...\n", - "[nltk_data] Package stopwords is already up-to-date!\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-06-07 14:55:12,526 SequenceTagger predicts: Dictionary with 7 tags: O, S-Service, B-Service, E-Service, I-Service, , \n" - ] - } - ], + "outputs": [], "source": [ "import warnings\n", "\n", @@ -60,8 +32,8 @@ "folder = os.getcwd().split(\"\\\\examples\")[0]\n", "sys.path.append(folder)\n", "\n", - "\n", - "from sloyka.src.geocoder import Geocoder" + "from sloyka.src.utils.data_getter.data_getter import VKParser\n", + "from sloyka.src.geocoder.geocoder import Geocoder" ] }, { @@ -73,540 +45,17 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "df = pd.read_csv('F://Group_name//Новая папка (2)//test_activity_data.csv', sep=';')" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Unnamed: 0dateidtextviews.countlikes.countreposts.counttypelinkfrom_idpost_idparents_stackgroup_name
002024-06-02 17:28:1158908В Центральном районе продолжается борьба с гра...583.020.0postNaNNaNNaNNaNЦентральный район Санкт-Петербурга
112024-06-02 14:31:3458907В Центральном районе продолжается борьба с нез...577.041.0postNaNNaNNaNNaNЦентральный район Санкт-Петербурга
222024-06-02 11:38:4258902Лебеди Елисей и Любава вернулись в Карпиев пру...538.061.0postNaNNaNNaNNaNЦентральный район Санкт-Петербурга
332024-06-01 19:22:1058896Накануне Дня защиты детей пообщались о праздни...675.040.0posthttps://vk.com/public206303341|ДетскогоNaNNaNNaNЦентральный район Санкт-Петербурга
442024-06-01 18:15:0058893Накануне Дня защиты детей в Таврическом саду с...1047.0100.0postNaNNaNNaNNaNЦентральный район Санкт-Петербурга
\n", - "
" - ], - "text/plain": [ - " Unnamed: 0 date id \\\n", - "0 0 2024-06-02 17:28:11 58908 \n", - "1 1 2024-06-02 14:31:34 58907 \n", - "2 2 2024-06-02 11:38:42 58902 \n", - "3 3 2024-06-01 19:22:10 58896 \n", - "4 4 2024-06-01 18:15:00 58893 \n", - "\n", - " text views.count \\\n", - "0 В Центральном районе продолжается борьба с гра... 583.0 \n", - "1 В Центральном районе продолжается борьба с нез... 577.0 \n", - "2 Лебеди Елисей и Любава вернулись в Карпиев пру... 538.0 \n", - "3 Накануне Дня защиты детей пообщались о праздни... 675.0 \n", - "4 Накануне Дня защиты детей в Таврическом саду с... 1047.0 \n", - "\n", - " likes.count reposts.count type link \\\n", - "0 2 0.0 post NaN \n", - "1 4 1.0 post NaN \n", - "2 6 1.0 post NaN \n", - "3 4 0.0 post https://vk.com/public206303341|Детского \n", - "4 10 0.0 post NaN \n", - "\n", - " from_id post_id parents_stack group_name \n", - "0 NaN NaN NaN Центральный район Санкт-Петербурга \n", - "1 NaN NaN NaN Центральный район Санкт-Петербурга \n", - "2 NaN NaN NaN Центральный район Санкт-Петербурга \n", - "3 NaN NaN NaN Центральный район Санкт-Петербурга \n", - "4 NaN NaN NaN Центральный район Санкт-Петербурга " - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(1076, 13)" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Unnamed: 0 25\n", - "date 2024-05-29 16:06:02\n", - "id 58820\n", - "text Порой нужен всего один разговор, чтобы принять...\n", - "views.count 428.0\n", - "likes.count 0\n", - "reposts.count 0.0\n", - "type post\n", - "link NaN\n", - "from_id NaN\n", - "post_id NaN\n", - "parents_stack NaN\n", - "group_name Центральный район Санкт-Петербурга\n", - "Name: 25, dtype: object" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.iloc[25,]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Запуск геолокатора со встроенным геокодером" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-06-07 14:55:35,350 SequenceTagger predicts: Dictionary with 5 tags: O, S-ADDRESS, B-ADDRESS, E-ADDRESS, I-ADDRESS\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Processing category admin_level: 100%|██████████| 1/1 [00:00<00:00, 4.87it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Error fetching timestamp for osmid 1040824844 [ElementNotFoundApiError]: Request failed: 404 - Not Found - b''\n", - "\u001b[32m06-07 14:55\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mrun_OSM_dfs started\u001b[0m\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "c:\\Users\\trolo\\CodeNIRMA\\sloyka\\sloyka\\src\\geocoder.py:318: UserWarning: The `geometries` module and `geometries_from_X` functions have been renamed the `features` module and `features_from_X` functions. Use these instead. The `geometries` module and function names are deprecated and will be removed in a future release.\n", - " green_obj = ox.geometries_from_place(osm_city_name, tags)\n", - "c:\\Users\\trolo\\python3.10.9\\envs\\nirmapy\\lib\\site-packages\\pandas\\core\\dtypes\\cast.py:1641: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " return np.find_common_type(types, [])\n", - "c:\\Users\\trolo\\python3.10.9\\envs\\nirmapy\\lib\\site-packages\\pandas\\core\\dtypes\\cast.py:1641: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " return np.find_common_type(types, [])\n", - "c:\\Users\\trolo\\python3.10.9\\envs\\nirmapy\\lib\\site-packages\\pandas\\core\\algorithms.py:522: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " common = np.find_common_type([values.dtype, comps_array.dtype], [])\n", - "c:\\Users\\trolo\\CodeNIRMA\\sloyka\\sloyka\\src\\geocoder.py:330: UserWarning: The `geometries` module and `geometries_from_X` functions have been renamed the `features` module and `features_from_X` functions. Use these instead. The `geometries` module and function names are deprecated and will be removed in a future release.\n", - " osm_num_obj = ox.geometries_from_place(osm_city_name, tags)\n", - "c:\\Users\\trolo\\python3.10.9\\envs\\nirmapy\\lib\\site-packages\\pandas\\core\\dtypes\\cast.py:1641: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " return np.find_common_type(types, [])\n", - "c:\\Users\\trolo\\python3.10.9\\envs\\nirmapy\\lib\\site-packages\\pandas\\core\\dtypes\\cast.py:1641: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " return np.find_common_type(types, [])\n", - "c:\\Users\\trolo\\python3.10.9\\envs\\nirmapy\\lib\\site-packages\\pandas\\core\\algorithms.py:522: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " common = np.find_common_type([values.dtype, comps_array.dtype], [])\n", - "c:\\Users\\trolo\\CodeNIRMA\\sloyka\\sloyka\\src\\geocoder.py:342: UserWarning: The `geometries` module and `geometries_from_X` functions have been renamed the `features` module and `features_from_X` functions. Use these instead. The `geometries` module and function names are deprecated and will be removed in a future release.\n", - " osm_cemetery = ox.geometries_from_place(osm_city_name, tags)\n", - "c:\\Users\\trolo\\python3.10.9\\envs\\nirmapy\\lib\\site-packages\\pandas\\core\\dtypes\\cast.py:1641: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " return np.find_common_type(types, [])\n", - "c:\\Users\\trolo\\python3.10.9\\envs\\nirmapy\\lib\\site-packages\\pandas\\core\\dtypes\\cast.py:1641: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " return np.find_common_type(types, [])\n", - "c:\\Users\\trolo\\python3.10.9\\envs\\nirmapy\\lib\\site-packages\\pandas\\core\\algorithms.py:522: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " common = np.find_common_type([values.dtype, comps_array.dtype], [])\n", - "c:\\Users\\trolo\\CodeNIRMA\\sloyka\\sloyka\\src\\geocoder.py:354: UserWarning: The `geometries` module and `geometries_from_X` functions have been renamed the `features` module and `features_from_X` functions. Use these instead. The `geometries` module and function names are deprecated and will be removed in a future release.\n", - " osm_natural = ox.geometries_from_place(osm_city_name, tags)\n", - "c:\\Users\\trolo\\python3.10.9\\envs\\nirmapy\\lib\\site-packages\\pandas\\core\\dtypes\\cast.py:1641: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " return np.find_common_type(types, [])\n", - "c:\\Users\\trolo\\python3.10.9\\envs\\nirmapy\\lib\\site-packages\\pandas\\core\\dtypes\\cast.py:1641: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " return np.find_common_type(types, [])\n", - "c:\\Users\\trolo\\python3.10.9\\envs\\nirmapy\\lib\\site-packages\\pandas\\core\\algorithms.py:522: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " common = np.find_common_type([values.dtype, comps_array.dtype], [])\n", - "c:\\Users\\trolo\\CodeNIRMA\\sloyka\\sloyka\\src\\geocoder.py:366: UserWarning: The `geometries` module and `geometries_from_X` functions have been renamed the `features` module and `features_from_X` functions. Use these instead. The `geometries` module and function names are deprecated and will be removed in a future release.\n", - " osm_railway = ox.geometries_from_place(osm_city_name, tags)\n", - "c:\\Users\\trolo\\python3.10.9\\envs\\nirmapy\\lib\\site-packages\\pandas\\core\\dtypes\\cast.py:1641: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " return np.find_common_type(types, [])\n", - "c:\\Users\\trolo\\python3.10.9\\envs\\nirmapy\\lib\\site-packages\\pandas\\core\\dtypes\\cast.py:1641: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " return np.find_common_type(types, [])\n", - "c:\\Users\\trolo\\python3.10.9\\envs\\nirmapy\\lib\\site-packages\\pandas\\core\\algorithms.py:522: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " common = np.find_common_type([values.dtype, comps_array.dtype], [])\n", - "c:\\Users\\trolo\\CodeNIRMA\\sloyka\\sloyka\\src\\geocoder.py:378: UserWarning: The `geometries` module and `geometries_from_X` functions have been renamed the `features` module and `features_from_X` functions. Use these instead. The `geometries` module and function names are deprecated and will be removed in a future release.\n", - " osm_tourism = ox.geometries_from_place(osm_city_name, tags)\n", - "c:\\Users\\trolo\\python3.10.9\\envs\\nirmapy\\lib\\site-packages\\pandas\\core\\dtypes\\cast.py:1641: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " return np.find_common_type(types, [])\n", - "c:\\Users\\trolo\\python3.10.9\\envs\\nirmapy\\lib\\site-packages\\pandas\\core\\dtypes\\cast.py:1641: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " return np.find_common_type(types, [])\n", - "c:\\Users\\trolo\\python3.10.9\\envs\\nirmapy\\lib\\site-packages\\pandas\\core\\algorithms.py:522: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " common = np.find_common_type([values.dtype, comps_array.dtype], [])\n", - "c:\\Users\\trolo\\CodeNIRMA\\sloyka\\sloyka\\src\\geocoder.py:390: UserWarning: The `geometries` module and `geometries_from_X` functions have been renamed the `features` module and `features_from_X` functions. Use these instead. The `geometries` module and function names are deprecated and will be removed in a future release.\n", - " osm_historic = ox.geometries_from_place(osm_city_name, tags)\n", - "c:\\Users\\trolo\\python3.10.9\\envs\\nirmapy\\lib\\site-packages\\pandas\\core\\dtypes\\cast.py:1641: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " return np.find_common_type(types, [])\n", - "c:\\Users\\trolo\\python3.10.9\\envs\\nirmapy\\lib\\site-packages\\pandas\\core\\dtypes\\cast.py:1641: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " return np.find_common_type(types, [])\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[32m06-07 14:55\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mfind_other_geo_obj started\u001b[0m\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "c:\\Users\\trolo\\python3.10.9\\envs\\nirmapy\\lib\\site-packages\\pandas\\core\\algorithms.py:522: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " common = np.find_common_type([values.dtype, comps_array.dtype], [])\n", - "c:\\Users\\trolo\\python3.10.9\\envs\\nirmapy\\lib\\site-packages\\pandas\\core\\dtypes\\cast.py:1641: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " return np.find_common_type(types, [])\n", - "c:\\Users\\trolo\\python3.10.9\\envs\\nirmapy\\lib\\site-packages\\pandas\\core\\dtypes\\cast.py:1641: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " return np.find_common_type(types, [])\n", - "c:\\Users\\trolo\\python3.10.9\\envs\\nirmapy\\lib\\site-packages\\pandas\\core\\dtypes\\cast.py:1641: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " return np.find_common_type(types, [])\n", - "c:\\Users\\trolo\\python3.10.9\\envs\\nirmapy\\lib\\site-packages\\pandas\\core\\dtypes\\cast.py:1641: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " return np.find_common_type(types, [])\n", - "c:\\Users\\trolo\\python3.10.9\\envs\\nirmapy\\lib\\site-packages\\pandas\\core\\algorithms.py:522: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " common = np.find_common_type([values.dtype, comps_array.dtype], [])\n", - "c:\\Users\\trolo\\python3.10.9\\envs\\nirmapy\\lib\\site-packages\\pandas\\core\\algorithms.py:522: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", - "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", - " common = np.find_common_type([values.dtype, comps_array.dtype], [])\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[32m06-07 14:59\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mclear_names started\u001b[0m\n", - "\u001b[32m06-07 14:59\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mget_street started\u001b[0m\n", - "\u001b[32m06-07 14:59\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mextract_ner_street started\u001b[0m\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 1076/1076 [00:10<00:00, 100.62it/s]\n", - "100%|██████████| 1076/1076 [00:07<00:00, 144.12it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[32m06-07 15:00\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpattern1.sub started\u001b[0m\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "100%|██████████| 407/407 [00:00<00:00, 203709.04it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[32m06-07 15:00\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpattern2.findall started\u001b[0m\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "100%|██████████| 407/407 [00:00<00:00, 406641.67it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[32m06-07 15:00\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpattern2.sub started\u001b[0m\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "100%|██████████| 407/407 [00:00<00:00, 407418.07it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[32m06-07 15:00\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mextract_building_num started\u001b[0m\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - " 0%| | 0/407 [00:00\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Unnamed: 0dateidtextviews.countlikes.countreposts.counttypelinkfrom_id...initial_streetToponimsfull_street_namelocation_optionsaddr_to_geocodeonly_full_street_nameLocationgeometryother_geo_objgeo_obj_tag
012024-06-02 14:31:3458907В Центральном ра...577.041.0postNaNNaN...БольшуюплощадьБольшая аллея С...['Большая аллея ...Большая аллея С...Большая аллеяБольшая аллея, О...POINT (30.28604 ...NaNstreet
112024-06-02 14:31:3458907В Центральном ра...577.041.0postNaNNaN...БольшуюплощадьБольшая аллея С...['Большая аллея ...Большая аллея С...Большой проспектБольшая аллея, О...POINT (30.28604 ...NaNstreet
212024-06-02 14:31:3458907В Центральном ра...577.041.0postNaNNaN...БольшуюплощадьБольшая аллея С...['Большая аллея ...Большая аллея С...Большая улицаБольшая аллея, О...POINT (30.28604 ...NaNstreet
312024-06-02 14:31:3458907В Центральном ра...577.041.0postNaNNaN...БольшуюплощадьБольшая аллея С...['Большая аллея ...Большой проспект...Большая аллеяБольшой проспект...POINT (29.85599 ...NaNstreet
412024-06-02 14:31:3458907В Центральном ра...577.041.0postNaNNaN...БольшуюплощадьБольшая аллея С...['Большая аллея ...Большой проспект...Большой проспектБольшой проспект...POINT (29.85599 ...NaNstreet
..................................................................
3889792024-03-22 18:24:5457489Очень нужны мест...NaN0NaNcommentNaN16164672.0...NaNNaNNaNNaNNaNNaNNaNPOINT (30.37323 ...Таврический садpark
38910462024-03-18 19:05:5457410[id973044|Екатер...NaN0NaNreplyNaN2585913.0...NaNNaNNaNNaNNaNNaNNaNPOINT (30.37323 ...Таврический садpark
39010742024-03-17 15:43:3957346А где находится ...NaN0NaNcommentNaN7038519.0...NaNNaNNaNNaNNaNNaNNaNPOINT (30.36747 ...Некрасовский садpark
39110752024-03-18 07:47:5657383[id7038519|Светл...NaN0NaNreplyNaN-143265175.0...NaNNaNNaNNaNNaNNaNNaNPOINT (30.36747 ...Некрасовский садpark
39210752024-03-18 07:47:5657383[id7038519|Светл...NaN0NaNreplyNaN-143265175.0...NaNNaNNaNNaNNaNNaNNaNPOINT (30.33962 ...Гренадерский садpark
\n", - "

393 rows × 28 columns

\n", - "" - ], - "text/plain": [ - " Unnamed: 0 date id text views.count likes.count reposts.count type link from_id ... initial_street Toponims full_street_name location_options addr_to_geocode only_full_street_name Location geometry other_geo_obj geo_obj_tag\n", - "0 1 2024-06-02 14:31:34 58907 В Центральном ра... 577.0 4 1.0 post NaN NaN ... Большую площадь Большая аллея С... ['Большая аллея ... Большая аллея С... Большая аллея Большая аллея, О... POINT (30.28604 ... NaN street\n", - "1 1 2024-06-02 14:31:34 58907 В Центральном ра... 577.0 4 1.0 post NaN NaN ... Большую площадь Большая аллея С... ['Большая аллея ... Большая аллея С... Большой проспект Большая аллея, О... POINT (30.28604 ... NaN street\n", - "2 1 2024-06-02 14:31:34 58907 В Центральном ра... 577.0 4 1.0 post NaN NaN ... Большую площадь Большая аллея С... ['Большая аллея ... Большая аллея С... Большая улица Большая аллея, О... POINT (30.28604 ... NaN street\n", - "3 1 2024-06-02 14:31:34 58907 В Центральном ра... 577.0 4 1.0 post NaN NaN ... Большую площадь Большая аллея С... ['Большая аллея ... Большой проспект... Большая аллея Большой проспект... POINT (29.85599 ... NaN street\n", - "4 1 2024-06-02 14:31:34 58907 В Центральном ра... 577.0 4 1.0 post NaN NaN ... Большую площадь Большая аллея С... ['Большая аллея ... Большой проспект... Большой проспект Большой проспект... POINT (29.85599 ... NaN street\n", - ".. ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", - "388 979 2024-03-22 18:24:54 57489 Очень нужны мест... NaN 0 NaN comment NaN 16164672.0 ... NaN NaN NaN NaN NaN NaN NaN POINT (30.37323 ... Таврический сад park\n", - "389 1046 2024-03-18 19:05:54 57410 [id973044|Екатер... NaN 0 NaN reply NaN 2585913.0 ... NaN NaN NaN NaN NaN NaN NaN POINT (30.37323 ... Таврический сад park\n", - "390 1074 2024-03-17 15:43:39 57346 А где находится ... NaN 0 NaN comment NaN 7038519.0 ... NaN NaN NaN NaN NaN NaN NaN POINT (30.36747 ... Некрасовский сад park\n", - "391 1075 2024-03-18 07:47:56 57383 [id7038519|Светл... NaN 0 NaN reply NaN -143265175.0 ... NaN NaN NaN NaN NaN NaN NaN POINT (30.36747 ... Некрасовский сад park\n", - "392 1075 2024-03-18 07:47:56 57383 [id7038519|Светл... NaN 0 NaN reply NaN -143265175.0 ... NaN NaN NaN NaN NaN NaN NaN POINT (30.33962 ... Гренадерский сад park\n", - "\n", - "[393 rows x 28 columns]" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "result" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Unnamed: 0dateidtextviews.countlikes.countreposts.counttypelinkfrom_id...initial_streetToponimsfull_street_namelocation_optionsaddr_to_geocodeonly_full_street_nameLocationgeometryother_geo_objgeo_obj_tag
12362024-05-27 15:39:0658773Сегодня Общеросс...489.051.0postNaNNaN...НевеNoneпутепровод Нева ...['путепровод Нев...путепровод Нева ...путепровод Невапутепровод Нева,...POINT (30.49252 ...NaNstreet
13432024-05-25 19:11:4758742Фестиваль мороже...2482.0153.0postNaNNaN...Островскогоплощадьплощадь Островск...['площадь Остров...площадь Островск...площадь Островскогоплощадь Островск...POINT (30.33677 ...NaNstreet
221102024-05-13 17:30:0058539188 лет назад в ...1525.0173.0postNaNNaN...ЦарскосельскуюдорогаЦарскосельская у...['Царскосельская...Царскосельская у...Царскосельская у...Царскосельская у...POINT (30.36046 ...NaNstreet
431512024-05-07 15:33:4658286На площади Восст...1819.05710.0postNaNNaN...Восстанияплощадьплощадь Восстани...['площадь Восста...площадь Восстани...площадь ВосстанияПлощадь Восстани...POINT (30.35902 ...NaNstreet
451742024-05-03 09:36:1958200Весеннее настрое...455.060.0posthttps://vk.com/w...NaN...ЛитейномNoneЛитейный мост С...['Литейный мост ...Литейный мост С...Литейный мостЛитейный мост, В...POINT (30.34952 ...NaNstreet
..................................................................
2779722024-03-26 14:09:3157547В свете последни...NaN1NaNcommentNaN7332491.0...ТаврическойNoneТаврическая улиц...['Таврическая ул...Таврическая улиц...Таврическая улица2, Таврическая у...POINT (30.37735 ...NaNstreet
2819932024-03-24 15:48:5757514Лучше бы Краснос...NaN113NaNcommentNaN1278670.0...КрасносельскомуNoneКрасносельский п...['Красносельский...Красносельский п...Красносельский п...Красносельский п...POINT (30.10345 ...NaNstreet
2829932024-03-24 15:48:5757514Лучше бы Краснос...NaN113NaNcommentNaN1278670.0...КрасносельскомуNoneКрасносельский п...['Красносельский...Красносельский п...Красносельское ш...Красносельский п...POINT (30.10345 ...NaNstreet
2839932024-03-24 15:48:5757514Лучше бы Краснос...NaN113NaNcommentNaN1278670.0...КрасносельскомуNoneКрасносельский п...['Красносельский...Красносельское ш...Красносельский п...Красносельское ш...POINT (30.06784 ...NaNstreet
2849932024-03-24 15:48:5757514Лучше бы Краснос...NaN113NaNcommentNaN1278670.0...КрасносельскомуNoneКрасносельский п...['Красносельский...Красносельское ш...Красносельское ш...Красносельское ш...POINT (30.06784 ...NaNstreet
\n", - "

124 rows × 28 columns

\n", - "
" - ], - "text/plain": [ - " Unnamed: 0 date id text views.count likes.count reposts.count type link from_id ... initial_street Toponims full_street_name location_options addr_to_geocode only_full_street_name Location geometry other_geo_obj geo_obj_tag\n", - "12 36 2024-05-27 15:39:06 58773 Сегодня Общеросс... 489.0 5 1.0 post NaN NaN ... Неве None путепровод Нева ... ['путепровод Нев... путепровод Нева ... путепровод Нева путепровод Нева,... POINT (30.49252 ... NaN street\n", - "13 43 2024-05-25 19:11:47 58742 Фестиваль мороже... 2482.0 15 3.0 post NaN NaN ... Островского площадь площадь Островск... ['площадь Остров... площадь Островск... площадь Островского площадь Островск... POINT (30.33677 ... NaN street\n", - "22 110 2024-05-13 17:30:00 58539 188 лет назад в ... 1525.0 17 3.0 post NaN NaN ... Царскосельскую дорога Царскосельская у... ['Царскосельская... Царскосельская у... Царскосельская у... Царскосельская у... POINT (30.36046 ... NaN street\n", - "43 151 2024-05-07 15:33:46 58286 На площади Восст... 1819.0 57 10.0 post NaN NaN ... Восстания площадь площадь Восстани... ['площадь Восста... площадь Восстани... площадь Восстания Площадь Восстани... POINT (30.35902 ... NaN street\n", - "45 174 2024-05-03 09:36:19 58200 Весеннее настрое... 455.0 6 0.0 post https://vk.com/w... NaN ... Литейном None Литейный мост С... ['Литейный мост ... Литейный мост С... Литейный мост Литейный мост, В... POINT (30.34952 ... NaN street\n", - ".. ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", - "277 972 2024-03-26 14:09:31 57547 В свете последни... NaN 1 NaN comment NaN 7332491.0 ... Таврической None Таврическая улиц... ['Таврическая ул... Таврическая улиц... Таврическая улица 2, Таврическая у... POINT (30.37735 ... NaN street\n", - "281 993 2024-03-24 15:48:57 57514 Лучше бы Краснос... NaN 113 NaN comment NaN 1278670.0 ... Красносельскому None Красносельский п... ['Красносельский... Красносельский п... Красносельский п... Красносельский п... POINT (30.10345 ... NaN street\n", - "282 993 2024-03-24 15:48:57 57514 Лучше бы Краснос... NaN 113 NaN comment NaN 1278670.0 ... Красносельскому None Красносельский п... ['Красносельский... Красносельский п... Красносельское ш... Красносельский п... POINT (30.10345 ... NaN street\n", - "283 993 2024-03-24 15:48:57 57514 Лучше бы Краснос... NaN 113 NaN comment NaN 1278670.0 ... Красносельскому None Красносельский п... ['Красносельский... Красносельское ш... Красносельский п... Красносельское ш... POINT (30.06784 ... NaN street\n", - "284 993 2024-03-24 15:48:57 57514 Лучше бы Краснос... NaN 113 NaN comment NaN 1278670.0 ... Красносельскому None Красносельский п... ['Красносельский... Красносельское ш... Красносельское ш... Красносельское ш... POINT (30.06784 ... NaN street\n", - "\n", - "[124 rows x 28 columns]" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mask = (result.Score.isna())&(~result.Street.isna())\n", - "result[mask]\n", - "# result.to_csv('C:\\Users\\USER\\SOIKA\\сохры.csv', index=False)" + "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "#### Визаулизация полученных сообщений" + "#### Запуск геолокатора со встроенным геокодером" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Unnamed: 0dateidtextviews.countlikes.countreposts.counttypelinkfrom_id...initial_streetToponimsfull_street_namelocation_optionsaddr_to_geocodeonly_full_street_nameLocationgeometryother_geo_objgeo_obj_tag
012024-06-02 14:31:3458907В Центральном ра...577.041.0postNaNNaN...БольшуюплощадьБольшая аллея С...['Большая аллея ...Большая аллея С...Большая аллеяБольшая аллея, О...POINT (30.28604 ...NaNstreet
112024-06-02 14:31:3458907В Центральном ра...577.041.0postNaNNaN...БольшуюплощадьБольшая аллея С...['Большая аллея ...Большая аллея С...Большой проспектБольшая аллея, О...POINT (30.28604 ...NaNstreet
212024-06-02 14:31:3458907В Центральном ра...577.041.0postNaNNaN...БольшуюплощадьБольшая аллея С...['Большая аллея ...Большая аллея С...Большая улицаБольшая аллея, О...POINT (30.28604 ...NaNstreet
\n", - "

3 rows × 28 columns

\n", - "
" - ], - "text/plain": [ - " Unnamed: 0 date id text views.count likes.count reposts.count type link from_id ... initial_street Toponims full_street_name location_options addr_to_geocode only_full_street_name Location geometry other_geo_obj geo_obj_tag\n", - "0 1 2024-06-02 14:31:34 58907 В Центральном ра... 577.0 4 1.0 post NaN NaN ... Большую площадь Большая аллея С... ['Большая аллея ... Большая аллея С... Большая аллея Большая аллея, О... POINT (30.28604 ... NaN street\n", - "1 1 2024-06-02 14:31:34 58907 В Центральном ра... 577.0 4 1.0 post NaN NaN ... Большую площадь Большая аллея С... ['Большая аллея ... Большая аллея С... Большой проспект Большая аллея, О... POINT (30.28604 ... NaN street\n", - "2 1 2024-06-02 14:31:34 58907 В Центральном ра... 577.0 4 1.0 post NaN NaN ... Большую площадь Большая аллея С... ['Большая аллея ... Большая аллея С... Большая улица Большая аллея, О... POINT (30.28604 ... NaN street\n", - "\n", - "[3 rows x 28 columns]" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "result.head(3)" + "result = Geocoder().run(osm_id = 338635, tags = {\"admin_level\": [\"8\"]}, date = \"2024-04-22T00:00:00Z\", df=df, text_column='text', group_column='group_name')" ] }, { @@ -1479,7 +98,7 @@ "metadata": {}, "outputs": [], "source": [ - "# result.explore()" + "result.columns" ] } ], diff --git a/examples/new_graph_extraction.ipynb b/examples/new_graph_extraction.ipynb index cb6d2f5..f5205f3 100644 --- a/examples/new_graph_extraction.ipynb +++ b/examples/new_graph_extraction.ipynb @@ -15,6 +15,9 @@ { "cell_type": "code", "execution_count": 10, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "import warnings\n", @@ -28,26 +31,26 @@ "sys.path.append(folder)\n", "\n", "from sloyka.src.data_getter import VKParser" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": 11, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "owner_id='-129354225' # группа \"центральный район за комфортную среду обитания\"\n", - "token='af981bccaf981bccaf981bcc00ac8f3fccaaf98af981bccca2667d92812caf000917163'" - ], - "metadata": { - "collapsed": false - } + "token='...'" + ] }, { "cell_type": "code", "execution_count": 12, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stdout", @@ -60,14 +63,14 @@ ], "source": [ "posts_df = VKParser().run_posts(owner_id, token, 100, \"2024-03-01\")" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": 13, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stdout", @@ -80,14 +83,14 @@ "source": [ "post_ids = posts_df[\"id\"].to_list()\n", "comments_df = VKParser().run_comments(owner_id=owner_id, post_ids=post_ids, access_token=token)" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": 15, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stdout", @@ -101,19 +104,252 @@ ], "source": [ "result_df = VKParser().run_parser(owner_id, token, step=100, cutoff_date='2024-03-20') #объединяем посты и комментарии в один датафрейм" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": 16, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { - "text/plain": " date id \\\n0 2024-03-21 19:30:00 333766 \n1 2024-03-21 10:30:00 333530 \n2 2024-03-20 19:30:00 333376 \n3 2024-03-20 10:30:00 333265 \n4 2024-03-19 19:30:00 333226 \n.. ... ... \n715 2024-02-07 10:42:30 328420 \n716 2024-02-07 12:57:57 328422 \n717 2024-02-07 14:03:22 328427 \n718 2024-02-07 15:26:01 328438 \n719 2024-02-07 18:56:25 328462 \n\n text views.count \\\n0 В доме на Синопской набережной 32/35 из-за поз... 795.0 \n1 Сделайте сад Сан-Галли безопасным для граждан,... 3527.0 \n2 «Электросамокатная саранча» вновь повылезала н... 9866.0 \n3 Вот и отгуляла широкая Масленица! Наша команда... 4599.0 \n4 На том же месте сквозь года Вот так каланча ве... 4355.0 \n.. ... ... \n715 Люди десятилетиями ждут замены лифтов, чтоб он... NaN \n716 С вопросом капремонта лифтов, к сожалению, ст... NaN \n717 Думаю что лифты в порядке. Просто их выключили... NaN \n718 [id10835085|Андрей], очень даже может быть NaN \n719 С лифтами ныне у многих проблемы. У меня в дом... NaN \n\n likes.count reposts.count type \\\n0 28 8.0 post \n1 126 23.0 post \n2 188 24.0 post \n3 54 8.0 post \n4 51 11.0 post \n.. ... ... ... \n715 2 NaN comment \n716 4 NaN comment \n717 1 NaN comment \n718 1 NaN reply \n719 0 NaN comment \n\n link post_id parents_stack \n0 NaN NaN NaN \n1 NaN NaN NaN \n2 NaN NaN NaN \n3 https://vk.com/wall-129354225_332885|ярко, NaN NaN \n4 https://vk.com/wall-129354225_325408|уплотните... NaN NaN \n.. ... ... ... \n715 NaN 328388.0 [] \n716 NaN 328388.0 [] \n717 NaN 328388.0 [] \n718 NaN 328388.0 [328427] \n719 NaN 328388.0 [] \n\n[720 rows x 10 columns]", - "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
dateidtextviews.countlikes.countreposts.counttypelinkpost_idparents_stack
02024-03-21 19:30:00333766В доме на Синопской набережной 32/35 из-за поз...795.0288.0postNaNNaNNaN
12024-03-21 10:30:00333530Сделайте сад Сан-Галли безопасным для граждан,...3527.012623.0postNaNNaNNaN
22024-03-20 19:30:00333376«Электросамокатная саранча» вновь повылезала н...9866.018824.0postNaNNaNNaN
32024-03-20 10:30:00333265Вот и отгуляла широкая Масленица! Наша команда...4599.0548.0posthttps://vk.com/wall-129354225_332885|ярко,NaNNaN
42024-03-19 19:30:00333226На том же месте сквозь года Вот так каланча ве...4355.05111.0posthttps://vk.com/wall-129354225_325408|уплотните...NaNNaN
.................................
7152024-02-07 10:42:30328420Люди десятилетиями ждут замены лифтов, чтоб он...NaN2NaNcommentNaN328388.0[]
7162024-02-07 12:57:57328422С вопросом капремонта лифтов, к сожалению, ст...NaN4NaNcommentNaN328388.0[]
7172024-02-07 14:03:22328427Думаю что лифты в порядке. Просто их выключили...NaN1NaNcommentNaN328388.0[]
7182024-02-07 15:26:01328438[id10835085|Андрей], очень даже может бытьNaN1NaNreplyNaN328388.0[328427]
7192024-02-07 18:56:25328462С лифтами ныне у многих проблемы. У меня в дом...NaN0NaNcommentNaN328388.0[]
\n

720 rows × 10 columns

\n
" + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
dateidtextviews.countlikes.countreposts.counttypelinkpost_idparents_stack
02024-03-21 19:30:00333766В доме на Синопской набережной 32/35 из-за поз...795.0288.0postNaNNaNNaN
12024-03-21 10:30:00333530Сделайте сад Сан-Галли безопасным для граждан,...3527.012623.0postNaNNaNNaN
22024-03-20 19:30:00333376«Электросамокатная саранча» вновь повылезала н...9866.018824.0postNaNNaNNaN
32024-03-20 10:30:00333265Вот и отгуляла широкая Масленица! Наша команда...4599.0548.0posthttps://vk.com/wall-129354225_332885|ярко,NaNNaN
42024-03-19 19:30:00333226На том же месте сквозь года Вот так каланча ве...4355.05111.0posthttps://vk.com/wall-129354225_325408|уплотните...NaNNaN
.................................
7152024-02-07 10:42:30328420Люди десятилетиями ждут замены лифтов, чтоб он...NaN2NaNcommentNaN328388.0[]
7162024-02-07 12:57:57328422С вопросом капремонта лифтов, к сожалению, ст...NaN4NaNcommentNaN328388.0[]
7172024-02-07 14:03:22328427Думаю что лифты в порядке. Просто их выключили...NaN1NaNcommentNaN328388.0[]
7182024-02-07 15:26:01328438[id10835085|Андрей], очень даже может бытьNaN1NaNreplyNaN328388.0[328427]
7192024-02-07 18:56:25328462С лифтами ныне у многих проблемы. У меня в дом...NaN0NaNcommentNaN328388.0[]
\n", + "

720 rows × 10 columns

\n", + "
" + ], + "text/plain": [ + " date id \\\n", + "0 2024-03-21 19:30:00 333766 \n", + "1 2024-03-21 10:30:00 333530 \n", + "2 2024-03-20 19:30:00 333376 \n", + "3 2024-03-20 10:30:00 333265 \n", + "4 2024-03-19 19:30:00 333226 \n", + ".. ... ... \n", + "715 2024-02-07 10:42:30 328420 \n", + "716 2024-02-07 12:57:57 328422 \n", + "717 2024-02-07 14:03:22 328427 \n", + "718 2024-02-07 15:26:01 328438 \n", + "719 2024-02-07 18:56:25 328462 \n", + "\n", + " text views.count \\\n", + "0 В доме на Синопской набережной 32/35 из-за поз... 795.0 \n", + "1 Сделайте сад Сан-Галли безопасным для граждан,... 3527.0 \n", + "2 «Электросамокатная саранча» вновь повылезала н... 9866.0 \n", + "3 Вот и отгуляла широкая Масленица! Наша команда... 4599.0 \n", + "4 На том же месте сквозь года Вот так каланча ве... 4355.0 \n", + ".. ... ... \n", + "715 Люди десятилетиями ждут замены лифтов, чтоб он... NaN \n", + "716 С вопросом капремонта лифтов, к сожалению, ст... NaN \n", + "717 Думаю что лифты в порядке. Просто их выключили... NaN \n", + "718 [id10835085|Андрей], очень даже может быть NaN \n", + "719 С лифтами ныне у многих проблемы. У меня в дом... NaN \n", + "\n", + " likes.count reposts.count type \\\n", + "0 28 8.0 post \n", + "1 126 23.0 post \n", + "2 188 24.0 post \n", + "3 54 8.0 post \n", + "4 51 11.0 post \n", + ".. ... ... ... \n", + "715 2 NaN comment \n", + "716 4 NaN comment \n", + "717 1 NaN comment \n", + "718 1 NaN reply \n", + "719 0 NaN comment \n", + "\n", + " link post_id parents_stack \n", + "0 NaN NaN NaN \n", + "1 NaN NaN NaN \n", + "2 NaN NaN NaN \n", + "3 https://vk.com/wall-129354225_332885|ярко, NaN NaN \n", + "4 https://vk.com/wall-129354225_325408|уплотните... NaN NaN \n", + ".. ... ... ... \n", + "715 NaN 328388.0 [] \n", + "716 NaN 328388.0 [] \n", + "717 NaN 328388.0 [] \n", + "718 NaN 328388.0 [328427] \n", + "719 NaN 328388.0 [] \n", + "\n", + "[720 rows x 10 columns]" + ] }, "execution_count": 16, "metadata": {}, @@ -122,45 +358,45 @@ ], "source": [ "result_df" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": 17, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "result_df.to_csv(\"C:\\\\Users\\\\thebe\\\\Downloads\\\\ruina.csv\")" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": 153, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "df_from_csv = pd.read_csv(\"C:\\\\Users\\\\thebe\\\\Downloads\\\\ruina.csv\")" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": 154, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2024-03-22 14:11:45,652 SequenceTagger predicts: Dictionary with 5 tags: O, S-ADDRESS, B-ADDRESS, E-ADDRESS, I-ADDRESS\n", - "\u001B[32m03-22 14:12\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mclear_names started\u001B[0m\n", - "\u001B[32m03-22 14:12\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mget_street started\u001B[0m\n", - "\u001B[32m03-22 14:12\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mextract_ner_street started\u001B[0m\n" + "\u001b[32m03-22 14:12\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mclear_names started\u001b[0m\n", + "\u001b[32m03-22 14:12\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mget_street started\u001b[0m\n", + "\u001b[32m03-22 14:12\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mextract_ner_street started\u001b[0m\n" ] }, { @@ -175,7 +411,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001B[32m03-22 14:12\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mpattern1.sub started\u001B[0m\n" + "\u001b[32m03-22 14:12\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpattern1.sub started\u001b[0m\n" ] }, { @@ -190,7 +426,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001B[32m03-22 14:12\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mpattern2.findall started\u001B[0m\n" + "\u001b[32m03-22 14:12\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpattern2.findall started\u001b[0m\n" ] }, { @@ -205,7 +441,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001B[32m03-22 14:12\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mpattern2.sub started\u001B[0m\n" + "\u001b[32m03-22 14:12\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpattern2.sub started\u001b[0m\n" ] }, { @@ -220,7 +456,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001B[32m03-22 14:12\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mextract_building_num started\u001B[0m\n" + "\u001b[32m03-22 14:12\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mextract_building_num started\u001b[0m\n" ] }, { @@ -235,7 +471,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001B[32m03-22 14:12\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mextract_toponym started\u001B[0m\n" + "\u001b[32m03-22 14:12\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mextract_toponym started\u001b[0m\n" ] }, { @@ -250,7 +486,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001B[32m03-22 14:12\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mget_stem started\u001B[0m\n" + "\u001b[32m03-22 14:12\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mget_stem started\u001b[0m\n" ] }, { @@ -265,19 +501,19 @@ "evalue": "Unable to fill values because RangeIndex cannot contain NA", "output_type": "error", "traceback": [ - "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m", - "\u001B[1;31mValueError\u001B[0m Traceback (most recent call last)", - "Cell \u001B[1;32mIn[154], line 5\u001B[0m\n\u001B[0;32m 1\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mgeocoder\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Geocoder\n\u001B[0;32m 3\u001B[0m g \u001B[38;5;241m=\u001B[39m Geocoder(device\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mcuda\u001B[39m\u001B[38;5;124m'\u001B[39m)\n\u001B[1;32m----> 5\u001B[0m gdf \u001B[38;5;241m=\u001B[39m \u001B[43mg\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mrun\u001B[49m\u001B[43m(\u001B[49m\u001B[43mdf_from_csv\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43mtext\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[43m)\u001B[49m\n", - "File \u001B[1;32mI:\\sloyka\\sloyka\\src\\geocoder.py:740\u001B[0m, in \u001B[0;36mGeocoder.run\u001B[1;34m(self, df, text_column)\u001B[0m\n\u001B[0;32m 738\u001B[0m df \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mget_street(df, text_column)\n\u001B[0;32m 739\u001B[0m street_names \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mget_stem(street_names)\n\u001B[1;32m--> 740\u001B[0m df \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mfind_word_form\u001B[49m\u001B[43m(\u001B[49m\u001B[43mdf\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mstreet_names\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 741\u001B[0m gdf \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcreate_gdf(df)\n\u001B[0;32m 742\u001B[0m \u001B[38;5;66;03m# gdf2 = self.merge_to_initial_df(gdf, initial_df)\u001B[39;00m\n\u001B[0;32m 743\u001B[0m \n\u001B[0;32m 744\u001B[0m \u001B[38;5;66;03m# # Add a new 'level' column using the get_level function\u001B[39;00m\n\u001B[0;32m 745\u001B[0m \u001B[38;5;66;03m# gdf2[\"level\"] = gdf2.progress_apply(self.get_level, axis=1)\u001B[39;00m\n\u001B[0;32m 746\u001B[0m \u001B[38;5;66;03m# gdf2 = self.set_global_repr_point(gdf2)\u001B[39;00m\n", - "File \u001B[1;32mI:\\sloyka\\sloyka\\src\\geocoder.py:561\u001B[0m, in \u001B[0;36mGeocoder.find_word_form\u001B[1;34m(self, df, strts_df)\u001B[0m\n\u001B[0;32m 558\u001B[0m new_df \u001B[38;5;241m=\u001B[39m tmp_df_1\u001B[38;5;241m.\u001B[39mto_frame()\u001B[38;5;241m.\u001B[39mjoin(tmp_df_2\u001B[38;5;241m.\u001B[39mto_frame()) \n\u001B[0;32m 560\u001B[0m df\u001B[38;5;241m.\u001B[39mdrop(columns\u001B[38;5;241m=\u001B[39m[\u001B[38;5;124m'\u001B[39m\u001B[38;5;124monly_full_street_name\u001B[39m\u001B[38;5;124m'\u001B[39m], inplace\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mTrue\u001B[39;00m)\n\u001B[1;32m--> 561\u001B[0m df \u001B[38;5;241m=\u001B[39m \u001B[43mdf\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mmerge\u001B[49m\u001B[43m(\u001B[49m\u001B[43mnew_df\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mleft_on\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mdf\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mindex\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mright_on\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mnew_df\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mindex\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 562\u001B[0m df\u001B[38;5;241m.\u001B[39mdrop(columns\u001B[38;5;241m=\u001B[39m[\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mkey_0\u001B[39m\u001B[38;5;124m'\u001B[39m], inplace\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mTrue\u001B[39;00m)\n\u001B[0;32m 564\u001B[0m \u001B[38;5;66;03m# new_df = df[\"only_full_street_name\"].explode()\u001B[39;00m\n\u001B[0;32m 565\u001B[0m \u001B[38;5;66;03m# new_df.name = \"only_full_street_name\"\u001B[39;00m\n\u001B[0;32m 566\u001B[0m \u001B[38;5;66;03m# df.drop(columns=['key_0', 'only_full_street_name'], inplace=True)\u001B[39;00m\n\u001B[0;32m 567\u001B[0m \u001B[38;5;66;03m# df = df.merge(new_df, left_on=df.index, right_on=new_df.index)\u001B[39;00m\n\u001B[0;32m 568\u001B[0m \n\u001B[0;32m 569\u001B[0m \u001B[38;5;66;03m# print(df.head())\u001B[39;00m\n", - "File \u001B[1;32mI:\\sloyka\\venv\\lib\\site-packages\\pandas\\core\\frame.py:10819\u001B[0m, in \u001B[0;36mDataFrame.merge\u001B[1;34m(self, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy, indicator, validate)\u001B[0m\n\u001B[0;32m 10800\u001B[0m \u001B[38;5;129m@Substitution\u001B[39m(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[0;32m 10801\u001B[0m \u001B[38;5;129m@Appender\u001B[39m(_merge_doc, indents\u001B[38;5;241m=\u001B[39m\u001B[38;5;241m2\u001B[39m)\n\u001B[0;32m 10802\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mmerge\u001B[39m(\n\u001B[1;32m (...)\u001B[0m\n\u001B[0;32m 10815\u001B[0m validate: MergeValidate \u001B[38;5;241m|\u001B[39m \u001B[38;5;28;01mNone\u001B[39;00m \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;01mNone\u001B[39;00m,\n\u001B[0;32m 10816\u001B[0m ) \u001B[38;5;241m-\u001B[39m\u001B[38;5;241m>\u001B[39m DataFrame:\n\u001B[0;32m 10817\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mpandas\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mcore\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mreshape\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mmerge\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m merge\n\u001B[1;32m> 10819\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mmerge\u001B[49m\u001B[43m(\u001B[49m\n\u001B[0;32m 10820\u001B[0m \u001B[43m \u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[43m,\u001B[49m\n\u001B[0;32m 10821\u001B[0m \u001B[43m \u001B[49m\u001B[43mright\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 10822\u001B[0m \u001B[43m \u001B[49m\u001B[43mhow\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mhow\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 10823\u001B[0m \u001B[43m \u001B[49m\u001B[43mon\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mon\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 10824\u001B[0m \u001B[43m \u001B[49m\u001B[43mleft_on\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mleft_on\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 10825\u001B[0m \u001B[43m \u001B[49m\u001B[43mright_on\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mright_on\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 10826\u001B[0m \u001B[43m \u001B[49m\u001B[43mleft_index\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mleft_index\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 10827\u001B[0m \u001B[43m \u001B[49m\u001B[43mright_index\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mright_index\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 10828\u001B[0m \u001B[43m \u001B[49m\u001B[43msort\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43msort\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 10829\u001B[0m \u001B[43m \u001B[49m\u001B[43msuffixes\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43msuffixes\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 10830\u001B[0m \u001B[43m \u001B[49m\u001B[43mcopy\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mcopy\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 10831\u001B[0m \u001B[43m \u001B[49m\u001B[43mindicator\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mindicator\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 10832\u001B[0m \u001B[43m \u001B[49m\u001B[43mvalidate\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mvalidate\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 10833\u001B[0m \u001B[43m \u001B[49m\u001B[43m)\u001B[49m\n", - "File \u001B[1;32mI:\\sloyka\\venv\\lib\\site-packages\\pandas\\core\\reshape\\merge.py:184\u001B[0m, in \u001B[0;36mmerge\u001B[1;34m(left, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy, indicator, validate)\u001B[0m\n\u001B[0;32m 169\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[0;32m 170\u001B[0m op \u001B[38;5;241m=\u001B[39m _MergeOperation(\n\u001B[0;32m 171\u001B[0m left_df,\n\u001B[0;32m 172\u001B[0m right_df,\n\u001B[1;32m (...)\u001B[0m\n\u001B[0;32m 182\u001B[0m validate\u001B[38;5;241m=\u001B[39mvalidate,\n\u001B[0;32m 183\u001B[0m )\n\u001B[1;32m--> 184\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mop\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget_result\u001B[49m\u001B[43m(\u001B[49m\u001B[43mcopy\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mcopy\u001B[49m\u001B[43m)\u001B[49m\n", - "File \u001B[1;32mI:\\sloyka\\venv\\lib\\site-packages\\pandas\\core\\reshape\\merge.py:896\u001B[0m, in \u001B[0;36m_MergeOperation.get_result\u001B[1;34m(self, copy)\u001B[0m\n\u001B[0;32m 893\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mindicator:\n\u001B[0;32m 894\u001B[0m result \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_indicator_post_merge(result)\n\u001B[1;32m--> 896\u001B[0m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_maybe_add_join_keys\u001B[49m\u001B[43m(\u001B[49m\u001B[43mresult\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mleft_indexer\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mright_indexer\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 898\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_maybe_restore_index_levels(result)\n\u001B[0;32m 900\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m result\u001B[38;5;241m.\u001B[39m__finalize__(\u001B[38;5;28mself\u001B[39m, method\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mmerge\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n", - "File \u001B[1;32mI:\\sloyka\\venv\\lib\\site-packages\\pandas\\core\\reshape\\merge.py:1064\u001B[0m, in \u001B[0;36m_MergeOperation._maybe_add_join_keys\u001B[1;34m(self, result, left_indexer, right_indexer)\u001B[0m\n\u001B[0;32m 1062\u001B[0m take_left \u001B[38;5;241m=\u001B[39m extract_array(take_left, extract_numpy\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mTrue\u001B[39;00m)\n\u001B[0;32m 1063\u001B[0m lfill \u001B[38;5;241m=\u001B[39m na_value_for_dtype(take_left\u001B[38;5;241m.\u001B[39mdtype)\n\u001B[1;32m-> 1064\u001B[0m lvals \u001B[38;5;241m=\u001B[39m \u001B[43malgos\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mtake_nd\u001B[49m\u001B[43m(\u001B[49m\u001B[43mtake_left\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mleft_indexer\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mfill_value\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mlfill\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 1066\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m take_right \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m:\n\u001B[0;32m 1067\u001B[0m rvals \u001B[38;5;241m=\u001B[39m result[name]\u001B[38;5;241m.\u001B[39m_values\n", - "File \u001B[1;32mI:\\sloyka\\venv\\lib\\site-packages\\pandas\\core\\array_algos\\take.py:110\u001B[0m, in \u001B[0;36mtake_nd\u001B[1;34m(arr, indexer, axis, fill_value, allow_fill)\u001B[0m\n\u001B[0;32m 107\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m is_1d_only_ea_dtype(arr\u001B[38;5;241m.\u001B[39mdtype):\n\u001B[0;32m 108\u001B[0m \u001B[38;5;66;03m# i.e. DatetimeArray, TimedeltaArray\u001B[39;00m\n\u001B[0;32m 109\u001B[0m arr \u001B[38;5;241m=\u001B[39m cast(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mNDArrayBackedExtensionArray\u001B[39m\u001B[38;5;124m\"\u001B[39m, arr)\n\u001B[1;32m--> 110\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43marr\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mtake\u001B[49m\u001B[43m(\u001B[49m\n\u001B[0;32m 111\u001B[0m \u001B[43m \u001B[49m\u001B[43mindexer\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mfill_value\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mfill_value\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mallow_fill\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mallow_fill\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43maxis\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43maxis\u001B[49m\n\u001B[0;32m 112\u001B[0m \u001B[43m \u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 114\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m arr\u001B[38;5;241m.\u001B[39mtake(indexer, fill_value\u001B[38;5;241m=\u001B[39mfill_value, allow_fill\u001B[38;5;241m=\u001B[39mallow_fill)\n\u001B[0;32m 116\u001B[0m arr \u001B[38;5;241m=\u001B[39m np\u001B[38;5;241m.\u001B[39masarray(arr)\n", - "File \u001B[1;32mI:\\sloyka\\venv\\lib\\site-packages\\pandas\\core\\indexes\\range.py:1163\u001B[0m, in \u001B[0;36mRangeIndex.take\u001B[1;34m(self, indices, axis, allow_fill, fill_value, **kwargs)\u001B[0m\n\u001B[0;32m 1160\u001B[0m indices \u001B[38;5;241m=\u001B[39m ensure_platform_int(indices)\n\u001B[0;32m 1162\u001B[0m \u001B[38;5;66;03m# raise an exception if allow_fill is True and fill_value is not None\u001B[39;00m\n\u001B[1;32m-> 1163\u001B[0m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_maybe_disallow_fill\u001B[49m\u001B[43m(\u001B[49m\u001B[43mallow_fill\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mfill_value\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mindices\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 1165\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mlen\u001B[39m(indices) \u001B[38;5;241m==\u001B[39m \u001B[38;5;241m0\u001B[39m:\n\u001B[0;32m 1166\u001B[0m taken \u001B[38;5;241m=\u001B[39m np\u001B[38;5;241m.\u001B[39marray([], dtype\u001B[38;5;241m=\u001B[39m\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mdtype)\n", - "File \u001B[1;32mI:\\sloyka\\venv\\lib\\site-packages\\pandas\\core\\indexes\\base.py:1192\u001B[0m, in \u001B[0;36mIndex._maybe_disallow_fill\u001B[1;34m(self, allow_fill, fill_value, indices)\u001B[0m\n\u001B[0;32m 1190\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[0;32m 1191\u001B[0m cls_name \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mtype\u001B[39m(\u001B[38;5;28mself\u001B[39m)\u001B[38;5;241m.\u001B[39m\u001B[38;5;18m__name__\u001B[39m\n\u001B[1;32m-> 1192\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mValueError\u001B[39;00m(\n\u001B[0;32m 1193\u001B[0m \u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mUnable to fill values because \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mcls_name\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m cannot contain NA\u001B[39m\u001B[38;5;124m\"\u001B[39m\n\u001B[0;32m 1194\u001B[0m )\n\u001B[0;32m 1195\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[0;32m 1196\u001B[0m allow_fill \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;01mFalse\u001B[39;00m\n", - "\u001B[1;31mValueError\u001B[0m: Unable to fill values because RangeIndex cannot contain NA" + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[154], line 5\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mgeocoder\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Geocoder\n\u001b[0;32m 3\u001b[0m g \u001b[38;5;241m=\u001b[39m Geocoder(device\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcuda\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m----> 5\u001b[0m gdf \u001b[38;5;241m=\u001b[39m \u001b[43mg\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf_from_csv\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtext\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32mI:\\sloyka\\sloyka\\src\\geocoder.py:740\u001b[0m, in \u001b[0;36mGeocoder.run\u001b[1;34m(self, df, text_column)\u001b[0m\n\u001b[0;32m 738\u001b[0m df \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_street(df, text_column)\n\u001b[0;32m 739\u001b[0m street_names \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_stem(street_names)\n\u001b[1;32m--> 740\u001b[0m df \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfind_word_form\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstreet_names\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 741\u001b[0m gdf \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcreate_gdf(df)\n\u001b[0;32m 742\u001b[0m \u001b[38;5;66;03m# gdf2 = self.merge_to_initial_df(gdf, initial_df)\u001b[39;00m\n\u001b[0;32m 743\u001b[0m \n\u001b[0;32m 744\u001b[0m \u001b[38;5;66;03m# # Add a new 'level' column using the get_level function\u001b[39;00m\n\u001b[0;32m 745\u001b[0m \u001b[38;5;66;03m# gdf2[\"level\"] = gdf2.progress_apply(self.get_level, axis=1)\u001b[39;00m\n\u001b[0;32m 746\u001b[0m \u001b[38;5;66;03m# gdf2 = self.set_global_repr_point(gdf2)\u001b[39;00m\n", + "File \u001b[1;32mI:\\sloyka\\sloyka\\src\\geocoder.py:561\u001b[0m, in \u001b[0;36mGeocoder.find_word_form\u001b[1;34m(self, df, strts_df)\u001b[0m\n\u001b[0;32m 558\u001b[0m new_df \u001b[38;5;241m=\u001b[39m tmp_df_1\u001b[38;5;241m.\u001b[39mto_frame()\u001b[38;5;241m.\u001b[39mjoin(tmp_df_2\u001b[38;5;241m.\u001b[39mto_frame()) \n\u001b[0;32m 560\u001b[0m df\u001b[38;5;241m.\u001b[39mdrop(columns\u001b[38;5;241m=\u001b[39m[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124monly_full_street_name\u001b[39m\u001b[38;5;124m'\u001b[39m], inplace\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m--> 561\u001b[0m df \u001b[38;5;241m=\u001b[39m \u001b[43mdf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmerge\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnew_df\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mleft_on\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mindex\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mright_on\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mnew_df\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mindex\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 562\u001b[0m df\u001b[38;5;241m.\u001b[39mdrop(columns\u001b[38;5;241m=\u001b[39m[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mkey_0\u001b[39m\u001b[38;5;124m'\u001b[39m], inplace\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m 564\u001b[0m \u001b[38;5;66;03m# new_df = df[\"only_full_street_name\"].explode()\u001b[39;00m\n\u001b[0;32m 565\u001b[0m \u001b[38;5;66;03m# new_df.name = \"only_full_street_name\"\u001b[39;00m\n\u001b[0;32m 566\u001b[0m \u001b[38;5;66;03m# df.drop(columns=['key_0', 'only_full_street_name'], inplace=True)\u001b[39;00m\n\u001b[0;32m 567\u001b[0m \u001b[38;5;66;03m# df = df.merge(new_df, left_on=df.index, right_on=new_df.index)\u001b[39;00m\n\u001b[0;32m 568\u001b[0m \n\u001b[0;32m 569\u001b[0m \u001b[38;5;66;03m# print(df.head())\u001b[39;00m\n", + "File \u001b[1;32mI:\\sloyka\\venv\\lib\\site-packages\\pandas\\core\\frame.py:10819\u001b[0m, in \u001b[0;36mDataFrame.merge\u001b[1;34m(self, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy, indicator, validate)\u001b[0m\n\u001b[0;32m 10800\u001b[0m \u001b[38;5;129m@Substitution\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 10801\u001b[0m \u001b[38;5;129m@Appender\u001b[39m(_merge_doc, indents\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m2\u001b[39m)\n\u001b[0;32m 10802\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mmerge\u001b[39m(\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 10815\u001b[0m validate: MergeValidate \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[0;32m 10816\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m DataFrame:\n\u001b[0;32m 10817\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcore\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mreshape\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmerge\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m merge\n\u001b[1;32m> 10819\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mmerge\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 10820\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 10821\u001b[0m \u001b[43m \u001b[49m\u001b[43mright\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 10822\u001b[0m \u001b[43m \u001b[49m\u001b[43mhow\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhow\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 10823\u001b[0m \u001b[43m \u001b[49m\u001b[43mon\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mon\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 10824\u001b[0m \u001b[43m \u001b[49m\u001b[43mleft_on\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mleft_on\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 10825\u001b[0m \u001b[43m \u001b[49m\u001b[43mright_on\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mright_on\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 10826\u001b[0m \u001b[43m \u001b[49m\u001b[43mleft_index\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mleft_index\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 10827\u001b[0m \u001b[43m \u001b[49m\u001b[43mright_index\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mright_index\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 10828\u001b[0m \u001b[43m \u001b[49m\u001b[43msort\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msort\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 10829\u001b[0m \u001b[43m \u001b[49m\u001b[43msuffixes\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msuffixes\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 10830\u001b[0m \u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 10831\u001b[0m \u001b[43m \u001b[49m\u001b[43mindicator\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mindicator\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 10832\u001b[0m \u001b[43m \u001b[49m\u001b[43mvalidate\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mvalidate\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 10833\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32mI:\\sloyka\\venv\\lib\\site-packages\\pandas\\core\\reshape\\merge.py:184\u001b[0m, in \u001b[0;36mmerge\u001b[1;34m(left, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy, indicator, validate)\u001b[0m\n\u001b[0;32m 169\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 170\u001b[0m op \u001b[38;5;241m=\u001b[39m _MergeOperation(\n\u001b[0;32m 171\u001b[0m left_df,\n\u001b[0;32m 172\u001b[0m right_df,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 182\u001b[0m validate\u001b[38;5;241m=\u001b[39mvalidate,\n\u001b[0;32m 183\u001b[0m )\n\u001b[1;32m--> 184\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mop\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_result\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32mI:\\sloyka\\venv\\lib\\site-packages\\pandas\\core\\reshape\\merge.py:896\u001b[0m, in \u001b[0;36m_MergeOperation.get_result\u001b[1;34m(self, copy)\u001b[0m\n\u001b[0;32m 893\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mindicator:\n\u001b[0;32m 894\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_indicator_post_merge(result)\n\u001b[1;32m--> 896\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_maybe_add_join_keys\u001b[49m\u001b[43m(\u001b[49m\u001b[43mresult\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mleft_indexer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mright_indexer\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 898\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_maybe_restore_index_levels(result)\n\u001b[0;32m 900\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m result\u001b[38;5;241m.\u001b[39m__finalize__(\u001b[38;5;28mself\u001b[39m, method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmerge\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[1;32mI:\\sloyka\\venv\\lib\\site-packages\\pandas\\core\\reshape\\merge.py:1064\u001b[0m, in \u001b[0;36m_MergeOperation._maybe_add_join_keys\u001b[1;34m(self, result, left_indexer, right_indexer)\u001b[0m\n\u001b[0;32m 1062\u001b[0m take_left \u001b[38;5;241m=\u001b[39m extract_array(take_left, extract_numpy\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m 1063\u001b[0m lfill \u001b[38;5;241m=\u001b[39m na_value_for_dtype(take_left\u001b[38;5;241m.\u001b[39mdtype)\n\u001b[1;32m-> 1064\u001b[0m lvals \u001b[38;5;241m=\u001b[39m \u001b[43malgos\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtake_nd\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtake_left\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mleft_indexer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfill_value\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlfill\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1066\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m take_right \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m 1067\u001b[0m rvals \u001b[38;5;241m=\u001b[39m result[name]\u001b[38;5;241m.\u001b[39m_values\n", + "File \u001b[1;32mI:\\sloyka\\venv\\lib\\site-packages\\pandas\\core\\array_algos\\take.py:110\u001b[0m, in \u001b[0;36mtake_nd\u001b[1;34m(arr, indexer, axis, fill_value, allow_fill)\u001b[0m\n\u001b[0;32m 107\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_1d_only_ea_dtype(arr\u001b[38;5;241m.\u001b[39mdtype):\n\u001b[0;32m 108\u001b[0m \u001b[38;5;66;03m# i.e. DatetimeArray, TimedeltaArray\u001b[39;00m\n\u001b[0;32m 109\u001b[0m arr \u001b[38;5;241m=\u001b[39m cast(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNDArrayBackedExtensionArray\u001b[39m\u001b[38;5;124m\"\u001b[39m, arr)\n\u001b[1;32m--> 110\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43marr\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtake\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 111\u001b[0m \u001b[43m \u001b[49m\u001b[43mindexer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfill_value\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfill_value\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mallow_fill\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mallow_fill\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maxis\u001b[49m\n\u001b[0;32m 112\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 114\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m arr\u001b[38;5;241m.\u001b[39mtake(indexer, fill_value\u001b[38;5;241m=\u001b[39mfill_value, allow_fill\u001b[38;5;241m=\u001b[39mallow_fill)\n\u001b[0;32m 116\u001b[0m arr \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39masarray(arr)\n", + "File \u001b[1;32mI:\\sloyka\\venv\\lib\\site-packages\\pandas\\core\\indexes\\range.py:1163\u001b[0m, in \u001b[0;36mRangeIndex.take\u001b[1;34m(self, indices, axis, allow_fill, fill_value, **kwargs)\u001b[0m\n\u001b[0;32m 1160\u001b[0m indices \u001b[38;5;241m=\u001b[39m ensure_platform_int(indices)\n\u001b[0;32m 1162\u001b[0m \u001b[38;5;66;03m# raise an exception if allow_fill is True and fill_value is not None\u001b[39;00m\n\u001b[1;32m-> 1163\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_maybe_disallow_fill\u001b[49m\u001b[43m(\u001b[49m\u001b[43mallow_fill\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfill_value\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindices\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1165\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(indices) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[0;32m 1166\u001b[0m taken \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39marray([], dtype\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdtype)\n", + "File \u001b[1;32mI:\\sloyka\\venv\\lib\\site-packages\\pandas\\core\\indexes\\base.py:1192\u001b[0m, in \u001b[0;36mIndex._maybe_disallow_fill\u001b[1;34m(self, allow_fill, fill_value, indices)\u001b[0m\n\u001b[0;32m 1190\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 1191\u001b[0m cls_name \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mtype\u001b[39m(\u001b[38;5;28mself\u001b[39m)\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\n\u001b[1;32m-> 1192\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 1193\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUnable to fill values because \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcls_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m cannot contain NA\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 1194\u001b[0m )\n\u001b[0;32m 1195\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 1196\u001b[0m allow_fill \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n", + "\u001b[1;31mValueError\u001b[0m: Unable to fill values because RangeIndex cannot contain NA" ] } ], @@ -287,116 +523,116 @@ "g = Geocoder(device='cuda')\n", "\n", "gdf = g.run(df_from_csv, 'text')" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": 24, + "metadata": { + "collapsed": false + }, "outputs": [ { "ename": "NameError", "evalue": "name 'gdf' is not defined", "output_type": "error", "traceback": [ - "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m", - "\u001B[1;31mNameError\u001B[0m Traceback (most recent call last)", - "Cell \u001B[1;32mIn[24], line 1\u001B[0m\n\u001B[1;32m----> 1\u001B[0m \u001B[43mgdf\u001B[49m\n", - "\u001B[1;31mNameError\u001B[0m: name 'gdf' is not defined" + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[24], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[43mgdf\u001b[49m\n", + "\u001b[1;31mNameError\u001b[0m: name 'gdf' is not defined" ] } ], "source": [ "gdf" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": 25, + "metadata": { + "collapsed": false + }, "outputs": [ { "ename": "NameError", "evalue": "name 'gdf' is not defined", "output_type": "error", "traceback": [ - "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m", - "\u001B[1;31mNameError\u001B[0m Traceback (most recent call last)", - "Cell \u001B[1;32mIn[25], line 1\u001B[0m\n\u001B[1;32m----> 1\u001B[0m \u001B[38;5;28mlen\u001B[39m(\u001B[43mgdf\u001B[49m[\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mparents_stack\u001B[39m\u001B[38;5;124m'\u001B[39m]\u001B[38;5;241m.\u001B[39miloc[\u001B[38;5;241m187\u001B[39m])\n", - "\u001B[1;31mNameError\u001B[0m: name 'gdf' is not defined" + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[25], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[38;5;28mlen\u001b[39m(\u001b[43mgdf\u001b[49m[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mparents_stack\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39miloc[\u001b[38;5;241m187\u001b[39m])\n", + "\u001b[1;31mNameError\u001b[0m: name 'gdf' is not defined" ] } ], "source": [ "len(gdf['parents_stack'].iloc[187])" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": 26, + "metadata": { + "collapsed": false + }, "outputs": [ { "ename": "NameError", "evalue": "name 'gdf' is not defined", "output_type": "error", "traceback": [ - "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m", - "\u001B[1;31mNameError\u001B[0m Traceback (most recent call last)", - "Cell \u001B[1;32mIn[26], line 1\u001B[0m\n\u001B[1;32m----> 1\u001B[0m test_gdf \u001B[38;5;241m=\u001B[39m \u001B[43mgdf\u001B[49m\u001B[38;5;241m.\u001B[39mcopy()\n", - "\u001B[1;31mNameError\u001B[0m: name 'gdf' is not defined" + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[26], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m test_gdf \u001b[38;5;241m=\u001b[39m \u001b[43mgdf\u001b[49m\u001b[38;5;241m.\u001b[39mcopy()\n", + "\u001b[1;31mNameError\u001b[0m: name 'gdf' is not defined" ] } ], "source": [ "test_gdf = gdf.copy()" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": 18, + "metadata": { + "collapsed": false + }, "outputs": [ { "ename": "NameError", "evalue": "name 'test_gdf' is not defined", "output_type": "error", "traceback": [ - "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m", - "\u001B[1;31mNameError\u001B[0m Traceback (most recent call last)", - "Cell \u001B[1;32mIn[18], line 1\u001B[0m\n\u001B[1;32m----> 1\u001B[0m \u001B[43mtest_gdf\u001B[49m\n", - "\u001B[1;31mNameError\u001B[0m: name 'test_gdf' is not defined" + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[18], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[43mtest_gdf\u001b[49m\n", + "\u001b[1;31mNameError\u001b[0m: name 'test_gdf' is not defined" ] } ], "source": [ "test_gdf" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": 19, + "metadata": { + "collapsed": false + }, "outputs": [ { "ename": "NameError", "evalue": "name 'test_gdf' is not defined", "output_type": "error", "traceback": [ - "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m", - "\u001B[1;31mNameError\u001B[0m Traceback (most recent call last)", - "Cell \u001B[1;32mIn[19], line 3\u001B[0m\n\u001B[0;32m 1\u001B[0m \u001B[38;5;28;01mimport\u001B[39;00m \u001B[38;5;21;01mnumpy\u001B[39;00m \u001B[38;5;28;01mas\u001B[39;00m \u001B[38;5;21;01mnp\u001B[39;00m\n\u001B[1;32m----> 3\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m i \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mrange\u001B[39m(\u001B[38;5;28mlen\u001B[39m(\u001B[43mtest_gdf\u001B[49m)):\n\u001B[0;32m 4\u001B[0m tmp \u001B[38;5;241m=\u001B[39m test_gdf[\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mparents_stack\u001B[39m\u001B[38;5;124m'\u001B[39m]\u001B[38;5;241m.\u001B[39miloc[i]\n\u001B[0;32m 5\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mtype\u001B[39m(tmp) \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28mfloat\u001B[39m \u001B[38;5;129;01mand\u001B[39;00m \u001B[38;5;28mlen\u001B[39m(tmp) \u001B[38;5;241m>\u001B[39m \u001B[38;5;241m0\u001B[39m:\n", - "\u001B[1;31mNameError\u001B[0m: name 'test_gdf' is not defined" + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[19], line 3\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mnumpy\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mnp\u001b[39;00m\n\u001b[1;32m----> 3\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(\u001b[38;5;28mlen\u001b[39m(\u001b[43mtest_gdf\u001b[49m)):\n\u001b[0;32m 4\u001b[0m tmp \u001b[38;5;241m=\u001b[39m test_gdf[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mparents_stack\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39miloc[i]\n\u001b[0;32m 5\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mtype\u001b[39m(tmp) \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mfloat\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(tmp) \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m:\n", + "\u001b[1;31mNameError\u001b[0m: name 'test_gdf' is not defined" ] } ], @@ -409,19 +645,449 @@ " test_gdf.at[i, 'parents_stack'] = tmp[0]\n", " else:\n", " test_gdf.at[i, 'parents_stack'] = np.nan\n" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": 14, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { - "text/plain": " date id \\\n0 2024-02-01 19:30:00 327734 \n1 2024-02-02 10:30:00 327851 \n2 2024-02-02 07:48:48 327853 \n3 2024-02-02 08:24:25 327854 \n4 2024-02-02 08:31:37 327855 \n.. ... ... \n477 2024-03-21 16:33:06 333767 \n478 2024-03-21 16:33:49 333768 \n479 2024-03-21 16:36:27 333770 \n480 2024-03-21 16:38:47 333771 \n481 2024-03-21 16:40:20 333772 \n\n text views.count \\\n0 Как вам такая идея по обустройству широких газ... 3328.0 \n1 Возвращение домофонов в парадные после капремо... 3746.0 \n2 Повсеместно такая проблема.\\nУгол Мытнинской и... NaN \n3 Можно закидать меня тряпками, но я сторонник у... NaN \n4 Некрасова 60, 4-ая парадная. Установлена новая... NaN \n.. ... ... \n477 Вредит не застройка, а косорукие строители и п... NaN \n478 Ужас, сквозные щели! Страшно жить в таком доме! NaN \n479 статью заменят на хулиганку? NaN \n480 Что и требовалось ожидать... NaN \n481 Становится очевидным что подобные \"истории\" пр... NaN \n\n likes.count reposts.count type link post_id parents_stack ... \\\n0 34 3.0 post NaN NaN NaN ... \n1 58 4.0 post NaN NaN NaN ... \n2 5 NaN comment NaN 327851.0 [] ... \n3 13 NaN comment NaN 327851.0 [] ... \n4 7 NaN comment NaN 327851.0 [] ... \n.. ... ... ... ... ... ... ... \n477 1 NaN comment NaN 333766.0 [] ... \n478 1 NaN comment NaN 333766.0 [] ... \n479 0 NaN comment NaN 333530.0 [] ... \n480 0 NaN comment NaN 333766.0 [] ... \n481 0 NaN comment NaN 333766.0 [] ... \n\n Score_y Numbers initial_street Toponims \\\n0 0.961 Приморская None \n1 0.987 60 Некрасова None \n2 NaN 8 Мытнинской None \n3 NaN NaN NaN NaN \n4 0.992 60 Некрасова None \n.. ... ... ... ... \n477 NaN NaN NaN NaN \n478 NaN NaN NaN NaN \n479 NaN NaN NaN NaN \n480 NaN NaN NaN NaN \n481 NaN Гороховой None \n\n full_street_name \\\n0 Приморская улица Санкт-Петербург Россия \n1 улица Некрасова 60 Санкт-Петербург Россия \n2 Мытнинская улица 8 Санкт-Петербург Россия,Мытн... \n3 NaN \n4 улица Некрасова 60 Санкт-Петербург Россия \n.. ... \n477 NaN \n478 NaN \n479 NaN \n480 NaN \n481 Гороховая улица Санкт-Петербург Россия \n\n location_options \\\n0 ['Приморская улица Санкт-Петербург Россия'] \n1 ['улица Некрасова 60 Санкт-Петербург Россия'] \n2 ['Мытнинская улица 8 Санкт-Петербург Россия', ... \n3 NaN \n4 ['улица Некрасова 60 Санкт-Петербург Россия'] \n.. ... \n477 NaN \n478 NaN \n479 NaN \n480 NaN \n481 ['Гороховая улица Санкт-Петербург Россия'] \n\n addr_to_geocode only_full_street_name \\\n0 Приморская улица Санкт-Петербург Россия Приморская улица \n1 улица Некрасова 60 Санкт-Петербург Россия улица Некрасова \n2 Мытнинская улица 8 Санкт-Петербург Россия Мытнинская улица \n3 NaN NaN \n4 улица Некрасова 60 Санкт-Петербург Россия улица Некрасова \n.. ... ... \n477 NaN NaN \n478 NaN NaN \n479 NaN NaN \n480 NaN NaN \n481 Гороховая улица Санкт-Петербург Россия Гороховая улица \n\n Location \\\n0 Приморская улица, Просвещение, Петергоф, Санкт... \n1 60, улица Некрасова, Пески, округ Смольнинское... \n2 Мытнинская улица, 8-я Советская улица, Пески, ... \n3 NaN \n4 60, улица Некрасова, Пески, округ Смольнинское... \n.. ... \n477 NaN \n478 NaN \n479 NaN \n480 NaN \n481 Гороховая улица, Адмиралтейский округ, Санкт-П... \n\n geometry \n0 POINT (29.86224 59.89660) \n1 POINT (30.36946 59.93851) \n2 POINT (30.38073 59.93420) \n3 None \n4 POINT (30.36946 59.93851) \n.. ... \n477 None \n478 None \n479 None \n480 None \n481 POINT (30.31142 59.93560) \n\n[482 rows x 23 columns]", - "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
dateidtextviews.countlikes.countreposts.counttypelinkpost_idparents_stack...Score_yNumbersinitial_streetToponimsfull_street_namelocation_optionsaddr_to_geocodeonly_full_street_nameLocationgeometry
02024-02-01 19:30:00327734Как вам такая идея по обустройству широких газ...3328.0343.0postNaNNaNNaN...0.961ПриморскаяNoneПриморская улица Санкт-Петербург Россия['Приморская улица Санкт-Петербург Россия']Приморская улица Санкт-Петербург РоссияПриморская улицаПриморская улица, Просвещение, Петергоф, Санкт...POINT (29.86224 59.89660)
12024-02-02 10:30:00327851Возвращение домофонов в парадные после капремо...3746.0584.0postNaNNaNNaN...0.98760НекрасоваNoneулица Некрасова 60 Санкт-Петербург Россия['улица Некрасова 60 Санкт-Петербург Россия']улица Некрасова 60 Санкт-Петербург Россияулица Некрасова60, улица Некрасова, Пески, округ Смольнинское...POINT (30.36946 59.93851)
22024-02-02 07:48:48327853Повсеместно такая проблема.\\nУгол Мытнинской и...NaN5NaNcommentNaN327851.0[]...NaN8МытнинскойNoneМытнинская улица 8 Санкт-Петербург Россия,Мытн...['Мытнинская улица 8 Санкт-Петербург Россия', ...Мытнинская улица 8 Санкт-Петербург РоссияМытнинская улицаМытнинская улица, 8-я Советская улица, Пески, ...POINT (30.38073 59.93420)
32024-02-02 08:24:25327854Можно закидать меня тряпками, но я сторонник у...NaN13NaNcommentNaN327851.0[]...NaNNaNNaNNaNNaNNaNNaNNaNNaNNone
42024-02-02 08:31:37327855Некрасова 60, 4-ая парадная. Установлена новая...NaN7NaNcommentNaN327851.0[]...0.99260НекрасоваNoneулица Некрасова 60 Санкт-Петербург Россия['улица Некрасова 60 Санкт-Петербург Россия']улица Некрасова 60 Санкт-Петербург Россияулица Некрасова60, улица Некрасова, Пески, округ Смольнинское...POINT (30.36946 59.93851)
..................................................................
4772024-03-21 16:33:06333767Вредит не застройка, а косорукие строители и п...NaN1NaNcommentNaN333766.0[]...NaNNaNNaNNaNNaNNaNNaNNaNNaNNone
4782024-03-21 16:33:49333768Ужас, сквозные щели! Страшно жить в таком доме!NaN1NaNcommentNaN333766.0[]...NaNNaNNaNNaNNaNNaNNaNNaNNaNNone
4792024-03-21 16:36:27333770статью заменят на хулиганку?NaN0NaNcommentNaN333530.0[]...NaNNaNNaNNaNNaNNaNNaNNaNNaNNone
4802024-03-21 16:38:47333771Что и требовалось ожидать...NaN0NaNcommentNaN333766.0[]...NaNNaNNaNNaNNaNNaNNaNNaNNaNNone
4812024-03-21 16:40:20333772Становится очевидным что подобные \"истории\" пр...NaN0NaNcommentNaN333766.0[]...NaNГороховойNoneГороховая улица Санкт-Петербург Россия['Гороховая улица Санкт-Петербург Россия']Гороховая улица Санкт-Петербург РоссияГороховая улицаГороховая улица, Адмиралтейский округ, Санкт-П...POINT (30.31142 59.93560)
\n

482 rows × 23 columns

\n
" + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
dateidtextviews.countlikes.countreposts.counttypelinkpost_idparents_stack...Score_yNumbersinitial_streetToponimsfull_street_namelocation_optionsaddr_to_geocodeonly_full_street_nameLocationgeometry
02024-02-01 19:30:00327734Как вам такая идея по обустройству широких газ...3328.0343.0postNaNNaNNaN...0.961ПриморскаяNoneПриморская улица Санкт-Петербург Россия['Приморская улица Санкт-Петербург Россия']Приморская улица Санкт-Петербург РоссияПриморская улицаПриморская улица, Просвещение, Петергоф, Санкт...POINT (29.86224 59.89660)
12024-02-02 10:30:00327851Возвращение домофонов в парадные после капремо...3746.0584.0postNaNNaNNaN...0.98760НекрасоваNoneулица Некрасова 60 Санкт-Петербург Россия['улица Некрасова 60 Санкт-Петербург Россия']улица Некрасова 60 Санкт-Петербург Россияулица Некрасова60, улица Некрасова, Пески, округ Смольнинское...POINT (30.36946 59.93851)
22024-02-02 07:48:48327853Повсеместно такая проблема.\\nУгол Мытнинской и...NaN5NaNcommentNaN327851.0[]...NaN8МытнинскойNoneМытнинская улица 8 Санкт-Петербург Россия,Мытн...['Мытнинская улица 8 Санкт-Петербург Россия', ...Мытнинская улица 8 Санкт-Петербург РоссияМытнинская улицаМытнинская улица, 8-я Советская улица, Пески, ...POINT (30.38073 59.93420)
32024-02-02 08:24:25327854Можно закидать меня тряпками, но я сторонник у...NaN13NaNcommentNaN327851.0[]...NaNNaNNaNNaNNaNNaNNaNNaNNaNNone
42024-02-02 08:31:37327855Некрасова 60, 4-ая парадная. Установлена новая...NaN7NaNcommentNaN327851.0[]...0.99260НекрасоваNoneулица Некрасова 60 Санкт-Петербург Россия['улица Некрасова 60 Санкт-Петербург Россия']улица Некрасова 60 Санкт-Петербург Россияулица Некрасова60, улица Некрасова, Пески, округ Смольнинское...POINT (30.36946 59.93851)
..................................................................
4772024-03-21 16:33:06333767Вредит не застройка, а косорукие строители и п...NaN1NaNcommentNaN333766.0[]...NaNNaNNaNNaNNaNNaNNaNNaNNaNNone
4782024-03-21 16:33:49333768Ужас, сквозные щели! Страшно жить в таком доме!NaN1NaNcommentNaN333766.0[]...NaNNaNNaNNaNNaNNaNNaNNaNNaNNone
4792024-03-21 16:36:27333770статью заменят на хулиганку?NaN0NaNcommentNaN333530.0[]...NaNNaNNaNNaNNaNNaNNaNNaNNaNNone
4802024-03-21 16:38:47333771Что и требовалось ожидать...NaN0NaNcommentNaN333766.0[]...NaNNaNNaNNaNNaNNaNNaNNaNNaNNone
4812024-03-21 16:40:20333772Становится очевидным что подобные \"истории\" пр...NaN0NaNcommentNaN333766.0[]...NaNГороховойNoneГороховая улица Санкт-Петербург Россия['Гороховая улица Санкт-Петербург Россия']Гороховая улица Санкт-Петербург РоссияГороховая улицаГороховая улица, Адмиралтейский округ, Санкт-П...POINT (30.31142 59.93560)
\n", + "

482 rows × 23 columns

\n", + "
" + ], + "text/plain": [ + " date id \\\n", + "0 2024-02-01 19:30:00 327734 \n", + "1 2024-02-02 10:30:00 327851 \n", + "2 2024-02-02 07:48:48 327853 \n", + "3 2024-02-02 08:24:25 327854 \n", + "4 2024-02-02 08:31:37 327855 \n", + ".. ... ... \n", + "477 2024-03-21 16:33:06 333767 \n", + "478 2024-03-21 16:33:49 333768 \n", + "479 2024-03-21 16:36:27 333770 \n", + "480 2024-03-21 16:38:47 333771 \n", + "481 2024-03-21 16:40:20 333772 \n", + "\n", + " text views.count \\\n", + "0 Как вам такая идея по обустройству широких газ... 3328.0 \n", + "1 Возвращение домофонов в парадные после капремо... 3746.0 \n", + "2 Повсеместно такая проблема.\\nУгол Мытнинской и... NaN \n", + "3 Можно закидать меня тряпками, но я сторонник у... NaN \n", + "4 Некрасова 60, 4-ая парадная. Установлена новая... NaN \n", + ".. ... ... \n", + "477 Вредит не застройка, а косорукие строители и п... NaN \n", + "478 Ужас, сквозные щели! Страшно жить в таком доме! NaN \n", + "479 статью заменят на хулиганку? NaN \n", + "480 Что и требовалось ожидать... NaN \n", + "481 Становится очевидным что подобные \"истории\" пр... NaN \n", + "\n", + " likes.count reposts.count type link post_id parents_stack ... \\\n", + "0 34 3.0 post NaN NaN NaN ... \n", + "1 58 4.0 post NaN NaN NaN ... \n", + "2 5 NaN comment NaN 327851.0 [] ... \n", + "3 13 NaN comment NaN 327851.0 [] ... \n", + "4 7 NaN comment NaN 327851.0 [] ... \n", + ".. ... ... ... ... ... ... ... \n", + "477 1 NaN comment NaN 333766.0 [] ... \n", + "478 1 NaN comment NaN 333766.0 [] ... \n", + "479 0 NaN comment NaN 333530.0 [] ... \n", + "480 0 NaN comment NaN 333766.0 [] ... \n", + "481 0 NaN comment NaN 333766.0 [] ... \n", + "\n", + " Score_y Numbers initial_street Toponims \\\n", + "0 0.961 Приморская None \n", + "1 0.987 60 Некрасова None \n", + "2 NaN 8 Мытнинской None \n", + "3 NaN NaN NaN NaN \n", + "4 0.992 60 Некрасова None \n", + ".. ... ... ... ... \n", + "477 NaN NaN NaN NaN \n", + "478 NaN NaN NaN NaN \n", + "479 NaN NaN NaN NaN \n", + "480 NaN NaN NaN NaN \n", + "481 NaN Гороховой None \n", + "\n", + " full_street_name \\\n", + "0 Приморская улица Санкт-Петербург Россия \n", + "1 улица Некрасова 60 Санкт-Петербург Россия \n", + "2 Мытнинская улица 8 Санкт-Петербург Россия,Мытн... \n", + "3 NaN \n", + "4 улица Некрасова 60 Санкт-Петербург Россия \n", + ".. ... \n", + "477 NaN \n", + "478 NaN \n", + "479 NaN \n", + "480 NaN \n", + "481 Гороховая улица Санкт-Петербург Россия \n", + "\n", + " location_options \\\n", + "0 ['Приморская улица Санкт-Петербург Россия'] \n", + "1 ['улица Некрасова 60 Санкт-Петербург Россия'] \n", + "2 ['Мытнинская улица 8 Санкт-Петербург Россия', ... \n", + "3 NaN \n", + "4 ['улица Некрасова 60 Санкт-Петербург Россия'] \n", + ".. ... \n", + "477 NaN \n", + "478 NaN \n", + "479 NaN \n", + "480 NaN \n", + "481 ['Гороховая улица Санкт-Петербург Россия'] \n", + "\n", + " addr_to_geocode only_full_street_name \\\n", + "0 Приморская улица Санкт-Петербург Россия Приморская улица \n", + "1 улица Некрасова 60 Санкт-Петербург Россия улица Некрасова \n", + "2 Мытнинская улица 8 Санкт-Петербург Россия Мытнинская улица \n", + "3 NaN NaN \n", + "4 улица Некрасова 60 Санкт-Петербург Россия улица Некрасова \n", + ".. ... ... \n", + "477 NaN NaN \n", + "478 NaN NaN \n", + "479 NaN NaN \n", + "480 NaN NaN \n", + "481 Гороховая улица Санкт-Петербург Россия Гороховая улица \n", + "\n", + " Location \\\n", + "0 Приморская улица, Просвещение, Петергоф, Санкт... \n", + "1 60, улица Некрасова, Пески, округ Смольнинское... \n", + "2 Мытнинская улица, 8-я Советская улица, Пески, ... \n", + "3 NaN \n", + "4 60, улица Некрасова, Пески, округ Смольнинское... \n", + ".. ... \n", + "477 NaN \n", + "478 NaN \n", + "479 NaN \n", + "480 NaN \n", + "481 Гороховая улица, Адмиралтейский округ, Санкт-П... \n", + "\n", + " geometry \n", + "0 POINT (29.86224 59.89660) \n", + "1 POINT (30.36946 59.93851) \n", + "2 POINT (30.38073 59.93420) \n", + "3 None \n", + "4 POINT (30.36946 59.93851) \n", + ".. ... \n", + "477 None \n", + "478 None \n", + "479 None \n", + "480 None \n", + "481 POINT (30.31142 59.93560) \n", + "\n", + "[482 rows x 23 columns]" + ] }, "execution_count": 14, "metadata": {}, @@ -433,18 +1099,20 @@ "test_gdf = test_gdf.drop(columns=['date', 'text', 'views.count', 'likes.count', 'reposts.count', 'type', 'link', 'post_id', 'parents_stack'])\n", "test_gdf = pd.merge(result_df, test_gdf, on='id', how='outer')\n", "test_gdf" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": 20, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { - "text/plain": "720" + "text/plain": [ + "720" + ] }, "execution_count": 20, "metadata": {}, @@ -453,18 +1121,69 @@ ], "source": [ "len(result_df)" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": 22, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { - "text/plain": "[211,\n 141,\n 169,\n 252,\n 158,\n 442,\n 1,\n 73,\n 447,\n 690,\n 490,\n 277,\n 29,\n 163,\n 559,\n 520,\n 621,\n 612,\n 114,\n 422,\n 183,\n 358,\n 270,\n 526,\n 135,\n 601,\n 631,\n 639,\n 587,\n 345,\n 266,\n 476,\n 620,\n 211,\n 682,\n 141,\n 416,\n 170,\n 355,\n 242,\n 442,\n 717,\n 206,\n 85,\n 137,\n 579,\n 451,\n 487,\n 223,\n 636]" + "text/plain": [ + "[211,\n", + " 141,\n", + " 169,\n", + " 252,\n", + " 158,\n", + " 442,\n", + " 1,\n", + " 73,\n", + " 447,\n", + " 690,\n", + " 490,\n", + " 277,\n", + " 29,\n", + " 163,\n", + " 559,\n", + " 520,\n", + " 621,\n", + " 612,\n", + " 114,\n", + " 422,\n", + " 183,\n", + " 358,\n", + " 270,\n", + " 526,\n", + " 135,\n", + " 601,\n", + " 631,\n", + " 639,\n", + " 587,\n", + " 345,\n", + " 266,\n", + " 476,\n", + " 620,\n", + " 211,\n", + " 682,\n", + " 141,\n", + " 416,\n", + " 170,\n", + " 355,\n", + " 242,\n", + " 442,\n", + " 717,\n", + " 206,\n", + " 85,\n", + " 137,\n", + " 579,\n", + " 451,\n", + " 487,\n", + " 223,\n", + " 636]" + ] }, "execution_count": 22, "metadata": {}, @@ -482,44 +1201,302 @@ " index_list.append(random.randint(0, len(result_df)))\n", "\n", "index_list" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": 23, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "test_gdf = result_df.copy()\n", "\n", "for i in index_list:\n", " test_gdf.at[i, 'toponim'] = f'toponim_{i}'" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": 135, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "test_gdf.at[121, 'toponim'] = 'toponim_implemented_1'" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": 136, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { - "text/plain": " date id \\\n0 2024-03-21 19:30:00 333766 \n1 2024-03-21 10:30:00 333530 \n2 2024-03-20 19:30:00 333376 \n3 2024-03-20 10:30:00 333265 \n4 2024-03-19 19:30:00 333226 \n.. ... ... \n715 2024-02-07 10:42:30 328420 \n716 2024-02-07 12:57:57 328422 \n717 2024-02-07 14:03:22 328427 \n718 2024-02-07 15:26:01 328438 \n719 2024-02-07 18:56:25 328462 \n\n text views.count \\\n0 В доме на Синопской набережной 32/35 из-за поз... 795.0 \n1 Сделайте сад Сан-Галли безопасным для граждан,... 3527.0 \n2 «Электросамокатная саранча» вновь повылезала н... 9866.0 \n3 Вот и отгуляла широкая Масленица! Наша команда... 4599.0 \n4 На том же месте сквозь года Вот так каланча ве... 4355.0 \n.. ... ... \n715 Люди десятилетиями ждут замены лифтов, чтоб он... NaN \n716 С вопросом капремонта лифтов, к сожалению, ст... NaN \n717 Думаю что лифты в порядке. Просто их выключили... NaN \n718 [id10835085|Андрей], очень даже может быть NaN \n719 С лифтами ныне у многих проблемы. У меня в дом... NaN \n\n likes.count reposts.count type \\\n0 28 8.0 post \n1 126 23.0 post \n2 188 24.0 post \n3 54 8.0 post \n4 51 11.0 post \n.. ... ... ... \n715 2 NaN comment \n716 4 NaN comment \n717 1 NaN comment \n718 1 NaN reply \n719 0 NaN comment \n\n link post_id \\\n0 NaN NaN \n1 NaN NaN \n2 NaN NaN \n3 https://vk.com/wall-129354225_332885|ярко, NaN \n4 https://vk.com/wall-129354225_325408|уплотните... NaN \n.. ... ... \n715 NaN 328388.0 \n716 NaN 328388.0 \n717 NaN 328388.0 \n718 NaN 328388.0 \n719 NaN 328388.0 \n\n parents_stack toponim \n0 NaN NaN \n1 NaN toponim_1 \n2 NaN NaN \n3 NaN NaN \n4 NaN NaN \n.. ... ... \n715 NaN NaN \n716 NaN NaN \n717 NaN toponim_717 \n718 328427 NaN \n719 NaN NaN \n\n[720 rows x 11 columns]", - "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
dateidtextviews.countlikes.countreposts.counttypelinkpost_idparents_stacktoponim
02024-03-21 19:30:00333766В доме на Синопской набережной 32/35 из-за поз...795.0288.0postNaNNaNNaNNaN
12024-03-21 10:30:00333530Сделайте сад Сан-Галли безопасным для граждан,...3527.012623.0postNaNNaNNaNtoponim_1
22024-03-20 19:30:00333376«Электросамокатная саранча» вновь повылезала н...9866.018824.0postNaNNaNNaNNaN
32024-03-20 10:30:00333265Вот и отгуляла широкая Масленица! Наша команда...4599.0548.0posthttps://vk.com/wall-129354225_332885|ярко,NaNNaNNaN
42024-03-19 19:30:00333226На том же месте сквозь года Вот так каланча ве...4355.05111.0posthttps://vk.com/wall-129354225_325408|уплотните...NaNNaNNaN
....................................
7152024-02-07 10:42:30328420Люди десятилетиями ждут замены лифтов, чтоб он...NaN2NaNcommentNaN328388.0NaNNaN
7162024-02-07 12:57:57328422С вопросом капремонта лифтов, к сожалению, ст...NaN4NaNcommentNaN328388.0NaNNaN
7172024-02-07 14:03:22328427Думаю что лифты в порядке. Просто их выключили...NaN1NaNcommentNaN328388.0NaNtoponim_717
7182024-02-07 15:26:01328438[id10835085|Андрей], очень даже может бытьNaN1NaNreplyNaN328388.0328427NaN
7192024-02-07 18:56:25328462С лифтами ныне у многих проблемы. У меня в дом...NaN0NaNcommentNaN328388.0NaNNaN
\n

720 rows × 11 columns

\n
" + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
dateidtextviews.countlikes.countreposts.counttypelinkpost_idparents_stacktoponim
02024-03-21 19:30:00333766В доме на Синопской набережной 32/35 из-за поз...795.0288.0postNaNNaNNaNNaN
12024-03-21 10:30:00333530Сделайте сад Сан-Галли безопасным для граждан,...3527.012623.0postNaNNaNNaNtoponim_1
22024-03-20 19:30:00333376«Электросамокатная саранча» вновь повылезала н...9866.018824.0postNaNNaNNaNNaN
32024-03-20 10:30:00333265Вот и отгуляла широкая Масленица! Наша команда...4599.0548.0posthttps://vk.com/wall-129354225_332885|ярко,NaNNaNNaN
42024-03-19 19:30:00333226На том же месте сквозь года Вот так каланча ве...4355.05111.0posthttps://vk.com/wall-129354225_325408|уплотните...NaNNaNNaN
....................................
7152024-02-07 10:42:30328420Люди десятилетиями ждут замены лифтов, чтоб он...NaN2NaNcommentNaN328388.0NaNNaN
7162024-02-07 12:57:57328422С вопросом капремонта лифтов, к сожалению, ст...NaN4NaNcommentNaN328388.0NaNNaN
7172024-02-07 14:03:22328427Думаю что лифты в порядке. Просто их выключили...NaN1NaNcommentNaN328388.0NaNtoponim_717
7182024-02-07 15:26:01328438[id10835085|Андрей], очень даже может бытьNaN1NaNreplyNaN328388.0328427NaN
7192024-02-07 18:56:25328462С лифтами ныне у многих проблемы. У меня в дом...NaN0NaNcommentNaN328388.0NaNNaN
\n", + "

720 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " date id \\\n", + "0 2024-03-21 19:30:00 333766 \n", + "1 2024-03-21 10:30:00 333530 \n", + "2 2024-03-20 19:30:00 333376 \n", + "3 2024-03-20 10:30:00 333265 \n", + "4 2024-03-19 19:30:00 333226 \n", + ".. ... ... \n", + "715 2024-02-07 10:42:30 328420 \n", + "716 2024-02-07 12:57:57 328422 \n", + "717 2024-02-07 14:03:22 328427 \n", + "718 2024-02-07 15:26:01 328438 \n", + "719 2024-02-07 18:56:25 328462 \n", + "\n", + " text views.count \\\n", + "0 В доме на Синопской набережной 32/35 из-за поз... 795.0 \n", + "1 Сделайте сад Сан-Галли безопасным для граждан,... 3527.0 \n", + "2 «Электросамокатная саранча» вновь повылезала н... 9866.0 \n", + "3 Вот и отгуляла широкая Масленица! Наша команда... 4599.0 \n", + "4 На том же месте сквозь года Вот так каланча ве... 4355.0 \n", + ".. ... ... \n", + "715 Люди десятилетиями ждут замены лифтов, чтоб он... NaN \n", + "716 С вопросом капремонта лифтов, к сожалению, ст... NaN \n", + "717 Думаю что лифты в порядке. Просто их выключили... NaN \n", + "718 [id10835085|Андрей], очень даже может быть NaN \n", + "719 С лифтами ныне у многих проблемы. У меня в дом... NaN \n", + "\n", + " likes.count reposts.count type \\\n", + "0 28 8.0 post \n", + "1 126 23.0 post \n", + "2 188 24.0 post \n", + "3 54 8.0 post \n", + "4 51 11.0 post \n", + ".. ... ... ... \n", + "715 2 NaN comment \n", + "716 4 NaN comment \n", + "717 1 NaN comment \n", + "718 1 NaN reply \n", + "719 0 NaN comment \n", + "\n", + " link post_id \\\n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 https://vk.com/wall-129354225_332885|ярко, NaN \n", + "4 https://vk.com/wall-129354225_325408|уплотните... NaN \n", + ".. ... ... \n", + "715 NaN 328388.0 \n", + "716 NaN 328388.0 \n", + "717 NaN 328388.0 \n", + "718 NaN 328388.0 \n", + "719 NaN 328388.0 \n", + "\n", + " parents_stack toponim \n", + "0 NaN NaN \n", + "1 NaN toponim_1 \n", + "2 NaN NaN \n", + "3 NaN NaN \n", + "4 NaN NaN \n", + ".. ... ... \n", + "715 NaN NaN \n", + "716 NaN NaN \n", + "717 NaN toponim_717 \n", + "718 328427 NaN \n", + "719 NaN NaN \n", + "\n", + "[720 rows x 11 columns]" + ] }, "execution_count": 136, "metadata": {}, @@ -528,24 +1505,24 @@ ], "source": [ "test_gdf" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": 119, + "metadata": { + "collapsed": false + }, "outputs": [ { "ename": "TypeError", "evalue": "object of type 'int' has no len()", "output_type": "error", "traceback": [ - "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m", - "\u001B[1;31mTypeError\u001B[0m Traceback (most recent call last)", - "Cell \u001B[1;32mIn[119], line 5\u001B[0m\n\u001B[0;32m 3\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m i \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mrange\u001B[39m(\u001B[38;5;28mlen\u001B[39m(test_gdf)):\n\u001B[0;32m 4\u001B[0m tmp \u001B[38;5;241m=\u001B[39m test_gdf[\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mparents_stack\u001B[39m\u001B[38;5;124m'\u001B[39m]\u001B[38;5;241m.\u001B[39miloc[i]\n\u001B[1;32m----> 5\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mtype\u001B[39m(tmp) \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28mfloat\u001B[39m \u001B[38;5;129;01mand\u001B[39;00m \u001B[38;5;28;43mlen\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43mtmp\u001B[49m\u001B[43m)\u001B[49m \u001B[38;5;241m>\u001B[39m \u001B[38;5;241m0\u001B[39m:\n\u001B[0;32m 6\u001B[0m test_gdf\u001B[38;5;241m.\u001B[39mat[i, \u001B[38;5;124m'\u001B[39m\u001B[38;5;124mparents_stack\u001B[39m\u001B[38;5;124m'\u001B[39m] \u001B[38;5;241m=\u001B[39m tmp[\u001B[38;5;241m0\u001B[39m]\n\u001B[0;32m 7\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n", - "\u001B[1;31mTypeError\u001B[0m: object of type 'int' has no len()" + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[119], line 5\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(\u001b[38;5;28mlen\u001b[39m(test_gdf)):\n\u001b[0;32m 4\u001b[0m tmp \u001b[38;5;241m=\u001b[39m test_gdf[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mparents_stack\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39miloc[i]\n\u001b[1;32m----> 5\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mtype\u001b[39m(tmp) \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mfloat\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mtmp\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[0;32m 6\u001b[0m test_gdf\u001b[38;5;241m.\u001b[39mat[i, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mparents_stack\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m tmp[\u001b[38;5;241m0\u001b[39m]\n\u001b[0;32m 7\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n", + "\u001b[1;31mTypeError\u001b[0m: object of type 'int' has no len()" ] } ], @@ -558,18 +1535,20 @@ " test_gdf.at[i, 'parents_stack'] = tmp[0]\n", " else:\n", " test_gdf.at[i, 'parents_stack'] = np.nan" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": 120, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { - "text/plain": "True" + "text/plain": [ + "True" + ] }, "execution_count": 120, "metadata": {}, @@ -579,27 +1558,29 @@ "source": [ "check = test_gdf['toponim'].loc[test_gdf['id'] == 333766][0]\n", "type(check) is float" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "markdown", - "source": [ - "Список id постов с топонимами" - ], "metadata": { "collapsed": false - } + }, + "source": [ + "Список id постов с топонимами" + ] }, { "cell_type": "code", "execution_count": 137, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { - "text/plain": "[333530, 331686, 329242, 328572]" + "text/plain": [ + "[333530, 331686, 329242, 328572]" + ] }, "execution_count": 137, "metadata": {}, @@ -611,27 +1592,56 @@ "post_top_gdf = post_top_gdf.dropna(subset='toponim')\n", "post_toponim_list = list(post_top_gdf['id'])\n", "post_toponim_list" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "markdown", - "source": [ - "Список id комментариев с топонимами" - ], "metadata": { "collapsed": false - } + }, + "source": [ + "Список id комментариев с топонимами" + ] }, { "cell_type": "code", "execution_count": 138, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { - "text/plain": "[333775,\n 333532,\n 333535,\n 333565,\n 333596,\n 333698,\n 332892,\n 332895,\n 332927,\n 332598,\n 332283,\n 330212,\n 330217,\n 329443,\n 329448,\n 329451,\n 329528,\n 329550,\n 329561,\n 329752,\n 328941,\n 328977,\n 328984,\n 329005,\n 329014,\n 329025,\n 328833,\n 328427]" + "text/plain": [ + "[333775,\n", + " 333532,\n", + " 333535,\n", + " 333565,\n", + " 333596,\n", + " 333698,\n", + " 332892,\n", + " 332895,\n", + " 332927,\n", + " 332598,\n", + " 332283,\n", + " 330212,\n", + " 330217,\n", + " 329443,\n", + " 329448,\n", + " 329451,\n", + " 329528,\n", + " 329550,\n", + " 329561,\n", + " 329752,\n", + " 328941,\n", + " 328977,\n", + " 328984,\n", + " 329005,\n", + " 329014,\n", + " 329025,\n", + " 328833,\n", + " 328427]" + ] }, "execution_count": 138, "metadata": {}, @@ -643,27 +1653,44 @@ "comment_top_gdf = comment_top_gdf.dropna(subset='toponim')\n", "comment_toponim_list = list(comment_top_gdf['id'])\n", "comment_toponim_list" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "markdown", - "source": [ - "Список id реплаев с топонимами" - ], "metadata": { "collapsed": false - } + }, + "source": [ + "Список id реплаев с топонимами" + ] }, { "cell_type": "code", "execution_count": 139, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { - "text/plain": "[333540,\n 333606,\n 333669,\n 333717,\n 333765,\n 333782,\n 333604,\n 332973,\n 332957,\n 332744,\n 331181,\n 331194,\n 330246,\n 328975,\n 328991,\n 328404]" + "text/plain": [ + "[333540,\n", + " 333606,\n", + " 333669,\n", + " 333717,\n", + " 333765,\n", + " 333782,\n", + " 333604,\n", + " 332973,\n", + " 332957,\n", + " 332744,\n", + " 331181,\n", + " 331194,\n", + " 330246,\n", + " 328975,\n", + " 328991,\n", + " 328404]" + ] }, "execution_count": 139, "metadata": {}, @@ -675,25 +1702,25 @@ "reply_top_gdf = reply_top_gdf.dropna(subset='toponim')\n", "reply_toponim_list = list(reply_top_gdf['id'])\n", "reply_toponim_list" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": 140, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "exclude_list = reply_toponim_list + comment_toponim_list" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": 147, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stdout", @@ -715,14 +1742,14 @@ " df_to_extract.append(test_gdf['text'].loc[test_gdf['id'] == i])\n", "\n", " print(i, len(df_to_extract))" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": 151, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stdout", @@ -766,14 +1793,14 @@ " df_to_extract.append(test_gdf['text'].loc[test_gdf['id'] == i])\n", "\n", " print(i, len(df_to_extract))" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": 152, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stdout", @@ -803,19 +1830,277 @@ " df_to_extract = test_gdf['text'].loc[test_gdf['id'] == i]\n", "\n", " print(i, len(df_to_extract))" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": 127, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { - "text/plain": " date id \\\n0 2024-03-21 19:30:00 333766 \n1 2024-03-21 10:30:00 333530 \n2 2024-03-20 19:30:00 333376 \n3 2024-03-20 10:30:00 333265 \n4 2024-03-19 19:30:00 333226 \n.. ... ... \n715 2024-02-07 10:42:30 328420 \n716 2024-02-07 12:57:57 328422 \n717 2024-02-07 14:03:22 328427 \n718 2024-02-07 15:26:01 328438 \n719 2024-02-07 18:56:25 328462 \n\n text views.count \\\n0 В доме на Синопской набережной 32/35 из-за поз... 795.0 \n1 Сделайте сад Сан-Галли безопасным для граждан,... 3527.0 \n2 «Электросамокатная саранча» вновь повылезала н... 9866.0 \n3 Вот и отгуляла широкая Масленица! Наша команда... 4599.0 \n4 На том же месте сквозь года Вот так каланча ве... 4355.0 \n.. ... ... \n715 Люди десятилетиями ждут замены лифтов, чтоб он... NaN \n716 С вопросом капремонта лифтов, к сожалению, ст... NaN \n717 Думаю что лифты в порядке. Просто их выключили... NaN \n718 [id10835085|Андрей], очень даже может быть NaN \n719 С лифтами ныне у многих проблемы. У меня в дом... NaN \n\n likes.count reposts.count type \\\n0 28 8.0 post \n1 126 23.0 post \n2 188 24.0 post \n3 54 8.0 post \n4 51 11.0 post \n.. ... ... ... \n715 2 NaN comment \n716 4 NaN comment \n717 1 NaN comment \n718 1 NaN reply \n719 0 NaN comment \n\n link post_id \\\n0 NaN NaN \n1 NaN NaN \n2 NaN NaN \n3 https://vk.com/wall-129354225_332885|ярко, NaN \n4 https://vk.com/wall-129354225_325408|уплотните... NaN \n.. ... ... \n715 NaN 328388.0 \n716 NaN 328388.0 \n717 NaN 328388.0 \n718 NaN 328388.0 \n719 NaN 328388.0 \n\n parents_stack toponim \n0 NaN NaN \n1 NaN toponim_1 \n2 NaN NaN \n3 NaN NaN \n4 NaN NaN \n.. ... ... \n715 NaN NaN \n716 NaN NaN \n717 NaN toponim_717 \n718 328427 NaN \n719 NaN NaN \n\n[720 rows x 11 columns]", - "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
dateidtextviews.countlikes.countreposts.counttypelinkpost_idparents_stacktoponim
02024-03-21 19:30:00333766В доме на Синопской набережной 32/35 из-за поз...795.0288.0postNaNNaNNaNNaN
12024-03-21 10:30:00333530Сделайте сад Сан-Галли безопасным для граждан,...3527.012623.0postNaNNaNNaNtoponim_1
22024-03-20 19:30:00333376«Электросамокатная саранча» вновь повылезала н...9866.018824.0postNaNNaNNaNNaN
32024-03-20 10:30:00333265Вот и отгуляла широкая Масленица! Наша команда...4599.0548.0posthttps://vk.com/wall-129354225_332885|ярко,NaNNaNNaN
42024-03-19 19:30:00333226На том же месте сквозь года Вот так каланча ве...4355.05111.0posthttps://vk.com/wall-129354225_325408|уплотните...NaNNaNNaN
....................................
7152024-02-07 10:42:30328420Люди десятилетиями ждут замены лифтов, чтоб он...NaN2NaNcommentNaN328388.0NaNNaN
7162024-02-07 12:57:57328422С вопросом капремонта лифтов, к сожалению, ст...NaN4NaNcommentNaN328388.0NaNNaN
7172024-02-07 14:03:22328427Думаю что лифты в порядке. Просто их выключили...NaN1NaNcommentNaN328388.0NaNtoponim_717
7182024-02-07 15:26:01328438[id10835085|Андрей], очень даже может бытьNaN1NaNreplyNaN328388.0328427NaN
7192024-02-07 18:56:25328462С лифтами ныне у многих проблемы. У меня в дом...NaN0NaNcommentNaN328388.0NaNNaN
\n

720 rows × 11 columns

\n
" + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
dateidtextviews.countlikes.countreposts.counttypelinkpost_idparents_stacktoponim
02024-03-21 19:30:00333766В доме на Синопской набережной 32/35 из-за поз...795.0288.0postNaNNaNNaNNaN
12024-03-21 10:30:00333530Сделайте сад Сан-Галли безопасным для граждан,...3527.012623.0postNaNNaNNaNtoponim_1
22024-03-20 19:30:00333376«Электросамокатная саранча» вновь повылезала н...9866.018824.0postNaNNaNNaNNaN
32024-03-20 10:30:00333265Вот и отгуляла широкая Масленица! Наша команда...4599.0548.0posthttps://vk.com/wall-129354225_332885|ярко,NaNNaNNaN
42024-03-19 19:30:00333226На том же месте сквозь года Вот так каланча ве...4355.05111.0posthttps://vk.com/wall-129354225_325408|уплотните...NaNNaNNaN
....................................
7152024-02-07 10:42:30328420Люди десятилетиями ждут замены лифтов, чтоб он...NaN2NaNcommentNaN328388.0NaNNaN
7162024-02-07 12:57:57328422С вопросом капремонта лифтов, к сожалению, ст...NaN4NaNcommentNaN328388.0NaNNaN
7172024-02-07 14:03:22328427Думаю что лифты в порядке. Просто их выключили...NaN1NaNcommentNaN328388.0NaNtoponim_717
7182024-02-07 15:26:01328438[id10835085|Андрей], очень даже может бытьNaN1NaNreplyNaN328388.0328427NaN
7192024-02-07 18:56:25328462С лифтами ныне у многих проблемы. У меня в дом...NaN0NaNcommentNaN328388.0NaNNaN
\n", + "

720 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " date id \\\n", + "0 2024-03-21 19:30:00 333766 \n", + "1 2024-03-21 10:30:00 333530 \n", + "2 2024-03-20 19:30:00 333376 \n", + "3 2024-03-20 10:30:00 333265 \n", + "4 2024-03-19 19:30:00 333226 \n", + ".. ... ... \n", + "715 2024-02-07 10:42:30 328420 \n", + "716 2024-02-07 12:57:57 328422 \n", + "717 2024-02-07 14:03:22 328427 \n", + "718 2024-02-07 15:26:01 328438 \n", + "719 2024-02-07 18:56:25 328462 \n", + "\n", + " text views.count \\\n", + "0 В доме на Синопской набережной 32/35 из-за поз... 795.0 \n", + "1 Сделайте сад Сан-Галли безопасным для граждан,... 3527.0 \n", + "2 «Электросамокатная саранча» вновь повылезала н... 9866.0 \n", + "3 Вот и отгуляла широкая Масленица! Наша команда... 4599.0 \n", + "4 На том же месте сквозь года Вот так каланча ве... 4355.0 \n", + ".. ... ... \n", + "715 Люди десятилетиями ждут замены лифтов, чтоб он... NaN \n", + "716 С вопросом капремонта лифтов, к сожалению, ст... NaN \n", + "717 Думаю что лифты в порядке. Просто их выключили... NaN \n", + "718 [id10835085|Андрей], очень даже может быть NaN \n", + "719 С лифтами ныне у многих проблемы. У меня в дом... NaN \n", + "\n", + " likes.count reposts.count type \\\n", + "0 28 8.0 post \n", + "1 126 23.0 post \n", + "2 188 24.0 post \n", + "3 54 8.0 post \n", + "4 51 11.0 post \n", + ".. ... ... ... \n", + "715 2 NaN comment \n", + "716 4 NaN comment \n", + "717 1 NaN comment \n", + "718 1 NaN reply \n", + "719 0 NaN comment \n", + "\n", + " link post_id \\\n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 https://vk.com/wall-129354225_332885|ярко, NaN \n", + "4 https://vk.com/wall-129354225_325408|уплотните... NaN \n", + ".. ... ... \n", + "715 NaN 328388.0 \n", + "716 NaN 328388.0 \n", + "717 NaN 328388.0 \n", + "718 NaN 328388.0 \n", + "719 NaN 328388.0 \n", + "\n", + " parents_stack toponim \n", + "0 NaN NaN \n", + "1 NaN toponim_1 \n", + "2 NaN NaN \n", + "3 NaN NaN \n", + "4 NaN NaN \n", + ".. ... ... \n", + "715 NaN NaN \n", + "716 NaN NaN \n", + "717 NaN toponim_717 \n", + "718 328427 NaN \n", + "719 NaN NaN \n", + "\n", + "[720 rows x 11 columns]" + ] }, "execution_count": 127, "metadata": {}, @@ -824,29 +2109,29 @@ ], "source": [ "test_gdf" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": 15, + "metadata": { + "collapsed": false + }, "outputs": [ { "ename": "ValueError", "evalue": "('Lengths must match to compare', (482,), (0,))", "output_type": "error", "traceback": [ - "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m", - "\u001B[1;31mValueError\u001B[0m Traceback (most recent call last)", - "Cell \u001B[1;32mIn[15], line 38\u001B[0m\n\u001B[0;32m 36\u001B[0m \u001B[38;5;66;03m# Применяем функцию к DataFrame\u001B[39;00m\n\u001B[0;32m 37\u001B[0m df \u001B[38;5;241m=\u001B[39m test_gdf \u001B[38;5;66;03m# Замените это на ваш DataFrame\u001B[39;00m\n\u001B[1;32m---> 38\u001B[0m df \u001B[38;5;241m=\u001B[39m \u001B[43mlink_comments\u001B[49m\u001B[43m(\u001B[49m\u001B[43mdf\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 40\u001B[0m \u001B[38;5;66;03m# Выводим обновленный DataFrame\u001B[39;00m\n\u001B[0;32m 41\u001B[0m df\n", - "Cell \u001B[1;32mIn[15], line 27\u001B[0m, in \u001B[0;36mlink_comments\u001B[1;34m(df)\u001B[0m\n\u001B[0;32m 24\u001B[0m comments \u001B[38;5;241m=\u001B[39m df[(df[\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mtype\u001B[39m\u001B[38;5;124m'\u001B[39m] \u001B[38;5;241m==\u001B[39m \u001B[38;5;124m'\u001B[39m\u001B[38;5;124mcomment\u001B[39m\u001B[38;5;124m'\u001B[39m) \u001B[38;5;241m&\u001B[39m (df[\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mpost_id\u001B[39m\u001B[38;5;124m'\u001B[39m] \u001B[38;5;241m==\u001B[39m post[\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mid\u001B[39m\u001B[38;5;124m'\u001B[39m])]\n\u001B[0;32m 25\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m _, comment \u001B[38;5;129;01min\u001B[39;00m comments\u001B[38;5;241m.\u001B[39miterrows():\n\u001B[0;32m 26\u001B[0m \u001B[38;5;66;03m# Добавляем id комментариев, если у них нет топонима и они не ссылаются на комментарий с топонимом\u001B[39;00m\n\u001B[1;32m---> 27\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m pd\u001B[38;5;241m.\u001B[39misnull(comment[\u001B[38;5;124m'\u001B[39m\u001B[38;5;124monly_full_street_name\u001B[39m\u001B[38;5;124m'\u001B[39m]) \u001B[38;5;129;01mand\u001B[39;00m pd\u001B[38;5;241m.\u001B[39misnull(df[\u001B[43mdf\u001B[49m\u001B[43m[\u001B[49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43mid\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[43m]\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m==\u001B[39;49m\u001B[43m \u001B[49m\u001B[43mcomment\u001B[49m\u001B[43m[\u001B[49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43mparents_stack\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[43m]\u001B[49m][\u001B[38;5;124m'\u001B[39m\u001B[38;5;124monly_full_street_name\u001B[39m\u001B[38;5;124m'\u001B[39m])\u001B[38;5;241m.\u001B[39mall():\n\u001B[0;32m 28\u001B[0m linked_ids\u001B[38;5;241m.\u001B[39mappend(comment[\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mid\u001B[39m\u001B[38;5;124m'\u001B[39m])\n\u001B[0;32m 30\u001B[0m \u001B[38;5;66;03m# Записываем связанные id комментариев в новую ячейку\u001B[39;00m\n", - "File \u001B[1;32mI:\\sloyka\\venv\\lib\\site-packages\\pandas\\core\\ops\\common.py:76\u001B[0m, in \u001B[0;36m_unpack_zerodim_and_defer..new_method\u001B[1;34m(self, other)\u001B[0m\n\u001B[0;32m 72\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mNotImplemented\u001B[39m\n\u001B[0;32m 74\u001B[0m other \u001B[38;5;241m=\u001B[39m item_from_zerodim(other)\n\u001B[1;32m---> 76\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mmethod\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mother\u001B[49m\u001B[43m)\u001B[49m\n", - "File \u001B[1;32mI:\\sloyka\\venv\\lib\\site-packages\\pandas\\core\\arraylike.py:40\u001B[0m, in \u001B[0;36mOpsMixin.__eq__\u001B[1;34m(self, other)\u001B[0m\n\u001B[0;32m 38\u001B[0m \u001B[38;5;129m@unpack_zerodim_and_defer\u001B[39m(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m__eq__\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[0;32m 39\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21m__eq__\u001B[39m(\u001B[38;5;28mself\u001B[39m, other):\n\u001B[1;32m---> 40\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_cmp_method\u001B[49m\u001B[43m(\u001B[49m\u001B[43mother\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43moperator\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43meq\u001B[49m\u001B[43m)\u001B[49m\n", - "File \u001B[1;32mI:\\sloyka\\venv\\lib\\site-packages\\pandas\\core\\series.py:6110\u001B[0m, in \u001B[0;36mSeries._cmp_method\u001B[1;34m(self, other, op)\u001B[0m\n\u001B[0;32m 6107\u001B[0m lvalues \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_values\n\u001B[0;32m 6108\u001B[0m rvalues \u001B[38;5;241m=\u001B[39m extract_array(other, extract_numpy\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mTrue\u001B[39;00m, extract_range\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mTrue\u001B[39;00m)\n\u001B[1;32m-> 6110\u001B[0m res_values \u001B[38;5;241m=\u001B[39m \u001B[43mops\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mcomparison_op\u001B[49m\u001B[43m(\u001B[49m\u001B[43mlvalues\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mrvalues\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mop\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 6112\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_construct_result(res_values, name\u001B[38;5;241m=\u001B[39mres_name)\n", - "File \u001B[1;32mI:\\sloyka\\venv\\lib\\site-packages\\pandas\\core\\ops\\array_ops.py:321\u001B[0m, in \u001B[0;36mcomparison_op\u001B[1;34m(left, right, op)\u001B[0m\n\u001B[0;32m 316\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28misinstance\u001B[39m(rvalues, (np\u001B[38;5;241m.\u001B[39mndarray, ABCExtensionArray)):\n\u001B[0;32m 317\u001B[0m \u001B[38;5;66;03m# TODO: make this treatment consistent across ops and classes.\u001B[39;00m\n\u001B[0;32m 318\u001B[0m \u001B[38;5;66;03m# We are not catching all listlikes here (e.g. frozenset, tuple)\u001B[39;00m\n\u001B[0;32m 319\u001B[0m \u001B[38;5;66;03m# The ambiguous case is object-dtype. See GH#27803\u001B[39;00m\n\u001B[0;32m 320\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mlen\u001B[39m(lvalues) \u001B[38;5;241m!=\u001B[39m \u001B[38;5;28mlen\u001B[39m(rvalues):\n\u001B[1;32m--> 321\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mValueError\u001B[39;00m(\n\u001B[0;32m 322\u001B[0m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mLengths must match to compare\u001B[39m\u001B[38;5;124m\"\u001B[39m, lvalues\u001B[38;5;241m.\u001B[39mshape, rvalues\u001B[38;5;241m.\u001B[39mshape\n\u001B[0;32m 323\u001B[0m )\n\u001B[0;32m 325\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m should_extension_dispatch(lvalues, rvalues) \u001B[38;5;129;01mor\u001B[39;00m (\n\u001B[0;32m 326\u001B[0m (\u001B[38;5;28misinstance\u001B[39m(rvalues, (Timedelta, BaseOffset, Timestamp)) \u001B[38;5;129;01mor\u001B[39;00m right \u001B[38;5;129;01mis\u001B[39;00m NaT)\n\u001B[0;32m 327\u001B[0m \u001B[38;5;129;01mand\u001B[39;00m lvalues\u001B[38;5;241m.\u001B[39mdtype \u001B[38;5;241m!=\u001B[39m \u001B[38;5;28mobject\u001B[39m\n\u001B[0;32m 328\u001B[0m ):\n\u001B[0;32m 329\u001B[0m \u001B[38;5;66;03m# Call the method on lvalues\u001B[39;00m\n\u001B[0;32m 330\u001B[0m res_values \u001B[38;5;241m=\u001B[39m op(lvalues, rvalues)\n", - "\u001B[1;31mValueError\u001B[0m: ('Lengths must match to compare', (482,), (0,))" + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[15], line 38\u001b[0m\n\u001b[0;32m 36\u001b[0m \u001b[38;5;66;03m# Применяем функцию к DataFrame\u001b[39;00m\n\u001b[0;32m 37\u001b[0m df \u001b[38;5;241m=\u001b[39m test_gdf \u001b[38;5;66;03m# Замените это на ваш DataFrame\u001b[39;00m\n\u001b[1;32m---> 38\u001b[0m df \u001b[38;5;241m=\u001b[39m \u001b[43mlink_comments\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 40\u001b[0m \u001b[38;5;66;03m# Выводим обновленный DataFrame\u001b[39;00m\n\u001b[0;32m 41\u001b[0m df\n", + "Cell \u001b[1;32mIn[15], line 27\u001b[0m, in \u001b[0;36mlink_comments\u001b[1;34m(df)\u001b[0m\n\u001b[0;32m 24\u001b[0m comments \u001b[38;5;241m=\u001b[39m df[(df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtype\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcomment\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;241m&\u001b[39m (df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mpost_id\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m==\u001b[39m post[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mid\u001b[39m\u001b[38;5;124m'\u001b[39m])]\n\u001b[0;32m 25\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m _, comment \u001b[38;5;129;01min\u001b[39;00m comments\u001b[38;5;241m.\u001b[39miterrows():\n\u001b[0;32m 26\u001b[0m \u001b[38;5;66;03m# Добавляем id комментариев, если у них нет топонима и они не ссылаются на комментарий с топонимом\u001b[39;00m\n\u001b[1;32m---> 27\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m pd\u001b[38;5;241m.\u001b[39misnull(comment[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124monly_full_street_name\u001b[39m\u001b[38;5;124m'\u001b[39m]) \u001b[38;5;129;01mand\u001b[39;00m pd\u001b[38;5;241m.\u001b[39misnull(df[\u001b[43mdf\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mid\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m==\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mcomment\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mparents_stack\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m][\u001b[38;5;124m'\u001b[39m\u001b[38;5;124monly_full_street_name\u001b[39m\u001b[38;5;124m'\u001b[39m])\u001b[38;5;241m.\u001b[39mall():\n\u001b[0;32m 28\u001b[0m linked_ids\u001b[38;5;241m.\u001b[39mappend(comment[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mid\u001b[39m\u001b[38;5;124m'\u001b[39m])\n\u001b[0;32m 30\u001b[0m \u001b[38;5;66;03m# Записываем связанные id комментариев в новую ячейку\u001b[39;00m\n", + "File \u001b[1;32mI:\\sloyka\\venv\\lib\\site-packages\\pandas\\core\\ops\\common.py:76\u001b[0m, in \u001b[0;36m_unpack_zerodim_and_defer..new_method\u001b[1;34m(self, other)\u001b[0m\n\u001b[0;32m 72\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mNotImplemented\u001b[39m\n\u001b[0;32m 74\u001b[0m other \u001b[38;5;241m=\u001b[39m item_from_zerodim(other)\n\u001b[1;32m---> 76\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mmethod\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mother\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32mI:\\sloyka\\venv\\lib\\site-packages\\pandas\\core\\arraylike.py:40\u001b[0m, in \u001b[0;36mOpsMixin.__eq__\u001b[1;34m(self, other)\u001b[0m\n\u001b[0;32m 38\u001b[0m \u001b[38;5;129m@unpack_zerodim_and_defer\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m__eq__\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 39\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__eq__\u001b[39m(\u001b[38;5;28mself\u001b[39m, other):\n\u001b[1;32m---> 40\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_cmp_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mother\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moperator\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43meq\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32mI:\\sloyka\\venv\\lib\\site-packages\\pandas\\core\\series.py:6110\u001b[0m, in \u001b[0;36mSeries._cmp_method\u001b[1;34m(self, other, op)\u001b[0m\n\u001b[0;32m 6107\u001b[0m lvalues \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_values\n\u001b[0;32m 6108\u001b[0m rvalues \u001b[38;5;241m=\u001b[39m extract_array(other, extract_numpy\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, extract_range\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m-> 6110\u001b[0m res_values \u001b[38;5;241m=\u001b[39m \u001b[43mops\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcomparison_op\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mop\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 6112\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_construct_result(res_values, name\u001b[38;5;241m=\u001b[39mres_name)\n", + "File \u001b[1;32mI:\\sloyka\\venv\\lib\\site-packages\\pandas\\core\\ops\\array_ops.py:321\u001b[0m, in \u001b[0;36mcomparison_op\u001b[1;34m(left, right, op)\u001b[0m\n\u001b[0;32m 316\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(rvalues, (np\u001b[38;5;241m.\u001b[39mndarray, ABCExtensionArray)):\n\u001b[0;32m 317\u001b[0m \u001b[38;5;66;03m# TODO: make this treatment consistent across ops and classes.\u001b[39;00m\n\u001b[0;32m 318\u001b[0m \u001b[38;5;66;03m# We are not catching all listlikes here (e.g. frozenset, tuple)\u001b[39;00m\n\u001b[0;32m 319\u001b[0m \u001b[38;5;66;03m# The ambiguous case is object-dtype. See GH#27803\u001b[39;00m\n\u001b[0;32m 320\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(lvalues) \u001b[38;5;241m!=\u001b[39m \u001b[38;5;28mlen\u001b[39m(rvalues):\n\u001b[1;32m--> 321\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 322\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mLengths must match to compare\u001b[39m\u001b[38;5;124m\"\u001b[39m, lvalues\u001b[38;5;241m.\u001b[39mshape, rvalues\u001b[38;5;241m.\u001b[39mshape\n\u001b[0;32m 323\u001b[0m )\n\u001b[0;32m 325\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m should_extension_dispatch(lvalues, rvalues) \u001b[38;5;129;01mor\u001b[39;00m (\n\u001b[0;32m 326\u001b[0m (\u001b[38;5;28misinstance\u001b[39m(rvalues, (Timedelta, BaseOffset, Timestamp)) \u001b[38;5;129;01mor\u001b[39;00m right \u001b[38;5;129;01mis\u001b[39;00m NaT)\n\u001b[0;32m 327\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m lvalues\u001b[38;5;241m.\u001b[39mdtype \u001b[38;5;241m!=\u001b[39m \u001b[38;5;28mobject\u001b[39m\n\u001b[0;32m 328\u001b[0m ):\n\u001b[0;32m 329\u001b[0m \u001b[38;5;66;03m# Call the method on lvalues\u001b[39;00m\n\u001b[0;32m 330\u001b[0m res_values \u001b[38;5;241m=\u001b[39m op(lvalues, rvalues)\n", + "\u001b[1;31mValueError\u001b[0m: ('Lengths must match to compare', (482,), (0,))" ] } ], @@ -892,42 +2177,39 @@ "\n", "# Выводим обновленный DataFrame\n", "df" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "df" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "df = df.drop_duplicates(subset='id')\n", "df" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": null, - "outputs": [], - "source": [], "metadata": { "collapsed": false - } + }, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/examples/parsers_example.ipynb b/examples/parsers_example.ipynb index ae0fabb..d48a2f0 100644 --- a/examples/parsers_example.ipynb +++ b/examples/parsers_example.ipynb @@ -43,7 +43,7 @@ "outputs": [], "source": [ "# owner_id='-129354225' # группа \"центральный район за комфортную среду обитания\"\n", - "token='492f35ec492f35ec492f35ecdd4a3926724492f492f35ec2c17df616b6994a83242f520'" + "token='...'" ] }, { diff --git a/examples/poetry_env.py b/examples/poetry_env.py deleted file mode 100644 index 92fd41d..0000000 --- a/examples/poetry_env.py +++ /dev/null @@ -1,7 +0,0 @@ -import os -import sys - -sys.path.append(os.path.join(os.environ["POETRY_HOME"], "bin")) -os.system(f"poetry shell") - - diff --git a/sloyka/src/geocoder/city_objects_getter.py b/sloyka/src/geocoder/city_objects_getter.py index 0b7cf8a..f0be14e 100644 --- a/sloyka/src/geocoder/city_objects_getter.py +++ b/sloyka/src/geocoder/city_objects_getter.py @@ -1,5 +1,5 @@ from typing import List - +import re import pandas as pd import osmnx as ox from shapely.geometry import Point, Polygon, MultiPolygon @@ -7,6 +7,7 @@ from natasha import MorphVocab from sloyka.src.utils.constants import NUM_CITY_OBJ from sloyka.src.geocoder.address_extractor_titles import AddrNEWExtractor +from rapidfuzz import fuzz import numpy as np diff --git a/sloyka/src/geocoder/geocoder.py b/sloyka/src/geocoder/geocoder.py index dd61789..7ac1a97 100644 --- a/sloyka/src/geocoder/geocoder.py +++ b/sloyka/src/geocoder/geocoder.py @@ -72,6 +72,9 @@ from .city_objects_getter import OtherGeoObjects from .street_getter import Streets +from .location_getter import Location + + stemmer = SnowballStemmer("russian") @@ -543,7 +546,7 @@ def match_group_to_area(self, group_name, df_areas): return best_match, admin_level - def run(self, osm_id, tags, date, df: pd.DataFrame, text_column: str = "text", group_column: str = "group_name"): + def run(self, osm_id, tags, date, df: pd.DataFrame, text_column: str = "text", group_column: str | None = "group_name"): """ Runs the data processing pipeline on the input DataFrame. @@ -569,6 +572,7 @@ def run(self, osm_id, tags, date, df: pd.DataFrame, text_column: str = "text", g df_areas = self.get_df_areas(osm_id, tags, date) df_areas = self.preprocess_area_names(df_areas) + # if group_column: for i, group_name in enumerate(df[group_column]): processed_group_name = self.preprocess_group_name(group_name) best_match, admin_level = self.match_group_to_area(processed_group_name, df_areas)