diff --git a/.gitignore b/.gitignore index 608cc8f..f5048df 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,11 @@ users.csv .ipynb_checkpoints/ vk-api-saved.tar.xz + +lib/ +lib64* +bin/ +share/ +include/ +pyvenv.cfg +geckodriver.log diff --git a/README.md b/README.md index be87a30..1eb034b 100644 --- a/README.md +++ b/README.md @@ -2,19 +2,28 @@ ## WebScraping for the Leadership research project. -## Install +## Installation -LoL, only python +I suppose you have **Python 3.6+** and some kind of **Unix Terminal** installed. + +## Ubuntu: + +```shell +python3 -m venv spider +cd spider +source bin/activate +pip3 install -r requirements.txt +``` ## VK-API scraping ### Create your own VK App -1. Create your VK application here `https://vk.com/apps?act=manage` . +1. Create your VK application [here](https://vk.com/apps?act=manage). You need a Standalone app. Get your `APP_ID` in the app's settings. -2. You have to create `execute stored procedures` in your App (https://vk.com/apps?act=manage). -Find in the app's settings `Stored procedures`. Create new procedure `execute.singleLeader` : +2. Create `execute stored procedures` in your app. +Find in the app's settings `Stored procedures`. 
Create new procedure `execute.singleLeader`: ```Javascript var user = API.users.get({"user_ids": [Args.user], "fields": ["photo_id", "verified", "sex", "bdate", "city", "country", "home_town", "has_photo", "photo_50", "photo_100", "photo_200_orig", "photo_200", "photo_400_orig", "photo_max", "photo_max_orig", "online", "domain", "has_mobile", "contacts", "site", "education", "universities", "schools", "status", "last_seen", "followers_count", "common_count", "occupation", "nickname", "relatives", "relation", "personal", "connections", "exports", "activities", "interests", "music", "movies", "tv", "books", "games", "about", "quotes", "can_post", "can_see_all_posts", "can_see_audio", "can_write_private_message", "can_send_friend_request", "is_favorite", "is_hidden_from_feed", "timezone", "screen_name", "maiden_name", "crop_photo", "is_friend", "friend_status", "career", "military", "blacklisted", "blacklisted_by_me", "can_be_invited_group"]}); @@ -28,40 +37,9 @@ return [user, groups, wall]; ### Use existing VK App -3. Get the `ACCESS_TOKEN` for your app. -Open in your browser (specify `` from 1 step in the request) - `https://oauth.vk.com/authorize?client_id=&display=page&redirect_uri=https://oauth.vk.com/blank.html&scope=friends&response_type=token&v=5.103&state=123456` - . You will be redirected to another page, in the browser's search string you will find you `ACCESS_TOKEN`. - - -4. Set this `ACCESS_TOKEN` inside of the notebook - -`!!! Don't commit the token to the repo. You can lose an access to your account !!!` - -5. Get `users.csv` file - -6. Launch notebook `vk_api_scrap.ipynb` - - - -### Usefull links - -https://vk.com/dev/authcode_flow_user - -https://vk.com/dev/methods - -https://vk.com/dev/execute - -https://vk.com/dev/users - -https://vk.com/dev/groups - -https://vk.com/dev/wall - -https://vk.com/dev/likes +3. Make sure you've downloaded `users.csv` file -STREAMING API? https://vk.com/dev/streaming_api_docs +4. 
```python3 spider.py``` and follow the input instructions. -## No-API scraping [deprecated] -scrap.ipynb +

Make sure the latest Firefox is installed on your machine and that the directory containing the latest geckodriver is listed in your PATH environment variable.

diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..fea6cdb --- /dev/null +++ b/requirements.txt @@ -0,0 +1,16 @@ +beautifulsoup4==4.8.2 +bs4==0.0.1 +certifi==2019.11.28 +chardet==3.0.4 +idna==2.9 +numpy==1.18.1 +pandas==1.0.1 +pkg-resources==0.0.0 +python-dateutil==2.8.1 +pytz==2019.3 +requests==2.23.0 +selenium==3.141.0 +six==1.14.0 +soupsieve==2.0 +tqdm==4.43.0 +urllib3==1.25.8 diff --git a/spider.py b/spider.py new file mode 100644 index 0000000..853c693 --- /dev/null +++ b/spider.py @@ -0,0 +1,78 @@ +import requests +from bs4 import BeautifulSoup +from selenium import webdriver +from urllib.parse import urlsplit +from tqdm import tqdm +from tqdm.notebook import tqdm as tqdm_notebook +import os +import json +import pandas as pd +import time + +class OAuthHandler: + def __init__(self, login: str, password: str) -> str: + self.driver = webdriver.Firefox() + self.login_xpath = '/html/body/div/div/div/div[2]/form/div/div/input[6]' + self.password_xpath = '/html/body/div/div/div/div[2]/form/div/div/input[7]' + self.button_xpath = '//*[@id="install_allow"]' + self.login = login + self.password = password + + def auth(self, link): + self.driver.get(link) + try: + email_field = self.driver.find_element_by_xpath(self.login_xpath) + password_field = self.driver.find_element_by_xpath(self.password_xpath) + + email_field.send_keys(self.login) + password_field.send_keys(self.password) + + self.driver.find_element_by_xpath(self.button_xpath).click() + except: + pass + return self.driver.current_url + +def get_token(APP_ID: int) -> str: + login = input('Phone or email: ') + password = input('Password: ') + AUTH_URL = "https://oauth.vk.com/authorize?client_id={APP_ID}&display=page&response_type=token&v=5.103" + response = OAuthHandler(login, password).auth(AUTH_URL.format(APP_ID=APP_ID)) + token = urlsplit(response, scheme='https').fragment.split('&')[0].split('=')[-1] + return token + +def get_single_leader(uid: str, access_token: str, 
attempt: int=5) -> dict: + url_single_execute = f"https://api.vk.com/method/execute.singleLeader?user={{}}&access_token={access_token}&v=5.103" + + for i in range(attempt): + response = requests.get(url_single_execute.format(uid)).json() + if response.get('response'): + return response + print("Sleep") + time.sleep(1) + raise Exception(f"After {attempt} attempts no response!!!") + +def save_single_leader(uid: str, path: str, access_token: str): + data = get_single_leader(uid, access_token) + assert len(data["response"]) == 3 + with open(f"{path}/{uid}.json", 'w') as f: + f.write(json.dumps(data)) + +def main(): + savepath = input('Absolute path to directory to save to: ') + userscsv_path = input('Absolute path to users.csv file: ') + APP_ID = int(input('VK Standalone App ID: ')) + TOKEN = get_token(APP_ID) + + users = pd.read_csv(userscsv_path).uid + debug_data = [] + for uid in tqdm(users): + if not os.path.exists(savepath): + os.makedirs(savepath) + debug_data.append(save_single_leader(str(uid), savepath, TOKEN)) + + with open('./debug_data.txt', encoding='utf-8', mode='w') as f: + for dd in debug_data: + f.write(dd+'\n') + +if __name__=='__main__': + main() diff --git a/vk_api_scrap.ipynb b/vk_api_scrap.ipynb deleted file mode 100644 index b13c3ae..0000000 --- a/vk_api_scrap.ipynb +++ /dev/null @@ -1,270 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "pycharm": { - "is_executing": false - } - }, - "outputs": [], - "source": [ - "import re\n", - "import requests\n", - "from requests.utils import requote_uri\n", - "\n", - "\n", - "import json\n", - "import pandas as pd\n", - "\n", - "import os\n", - "import time\n", - "import logging" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from IPython.core.display import display, HTML\n", - "display(HTML(\"\"))\n", - "\n", - "save_folder = \"vk-api-saved\"\n", - "os.makedirs(save_folder, exist_ok=True)" - ] - 
}, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# VK API WebScraping" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def get_single_leader(uid: str, attempt: int=5) -> dict:\n", - " access_token = os.environ[\"ACCESS_TOKEN\"]\n", - " url_single_execute = f\"https://api.vk.com/method/execute.singleLeader?user={{}}&access_token={access_token}&v=5.103\"\n", - " for i in range(attempt): \n", - " response = requests.get(url_single_execute.format(uid)).json()\n", - " if response.get(\"response\"):\n", - " return response\n", - " print(\"Sleep\")\n", - " time.sleep(1) \n", - " raise Exception(f\"After {attempt} attempts no response!!!\")\n", - "\n", - "def save_single_leader(uid: str, path: str):\n", - " data = get_single_leader(uid)\n", - " assert len(data[\"response\"]) == 3\n", - " with open(f\"{path}/{uid}.json\", 'w') as f:\n", - " f.write(json.dumps(data))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Firstly, check the README" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "try:\n", - " os.environ[\"ACCESS_TOKEN\"]\n", - "except:\n", - " raise Exception(\"Set ACCESS_TOKEN env\") \n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "users = pd.read_csv(\"users.csv\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# VK-APP execute stored procedure\n", - "\n", - "execute.singleLeader code:\n", - "\n", - "###\n", - "\n", - "var user = API.users.get({\"user_ids\": [Args.user], \"fields\": [\"photo_id\", \"verified\", \"sex\", \"bdate\", \"city\", \"country\", \"home_town\", \"has_photo\", \"photo_50\", \"photo_100\", \"photo_200_orig\", \"photo_200\", \"photo_400_orig\", \"photo_max\", \"photo_max_orig\", \"online\", \"domain\", \"has_mobile\", \"contacts\", \"site\", \"education\", 
\"universities\", \"schools\", \"status\", \"last_seen\", \"followers_count\", \"common_count\", \"occupation\", \"nickname\", \"relatives\", \"relation\", \"personal\", \"connections\", \"exports\", \"activities\", \"interests\", \"music\", \"movies\", \"tv\", \"books\", \"games\", \"about\", \"quotes\", \"can_post\", \"can_see_all_posts\", \"can_see_audio\", \"can_write_private_message\", \"can_send_friend_request\", \"is_favorite\", \"is_hidden_from_feed\", \"timezone\", \"screen_name\", \"maiden_name\", \"crop_photo\", \"is_friend\", \"friend_status\", \"career\", \"military\", \"blacklisted\", \"blacklisted_by_me\", \"can_be_invited_group\"]});\n", - "\n", - "var groups = API.groups.get({\"user_id\": Args.user, \"extended\": 1});\n", - "\n", - "var wall = API.wall.get({\"owner_id\": Args.user, \"count\": 100, \"extended\": 1});\n", - " \n", - "return [user, groups, wall];" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Scrap it harder !!! (Prod section)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "users.uid[:10]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "for uid in users.uid:\n", - " print(uid)\n", - " debug_data = save_single_leader(str(uid), \"vk-api-saved\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Wall Batch Scrap (# TODO: Test)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "execute.batchLeader procedure:\n", - "\n", - "____\n", - "\n", - "var users = Args.users;\n", - "\n", - "var walls = [];\n", - "\n", - "var i = 0;\n", - "\n", - "while (i < 25) {\n", - "\n", - " walls = walls + [ API.wall.get({\"owner_id\": users[i], \"count\": 100, \"extended\": 1}) ];\n", - " \n", - " i = i + 1;\n", - " \n", - "};\n", - "\n", - "return walls;" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - 
"outputs": [], - "source": [ - "def get_batch_wall_leader(uids: list, attempt: int=5, batch: int=25) -> list:\n", - " access_token = os.environ[\"ACCESS_TOKEN\"]\n", - " url_batch_execute = f\"https://api.vk.com/method/execute.batchLeader?users={{}}&access_token={access_token}&v=5.103\"\n", - " uids_list = list(map(lambda x: str(x), uids))\n", - " uids_str = f\"'[{','.join(uids_list)}]'\"\n", - "# url = requote_uri(url_batch_execute.format(uids))\n", - " for i in range(attempt): \n", - " response = requests.get(url_batch_execute.format(uids_str)).json()\n", - " if response.get(\"response\"):\n", - " return response\n", - " print(\"Sleep\")\n", - " time.sleep(1) \n", - " raise Exception(f\"After {attempt} attempts no response!!!\")\n", - " " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Test and Debug Section" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "uid_test = users.uid[666]\n", - "print(f\"OMG (Мф. 
24:4)\")\n", - "get_single_leader(uid_test)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "save_single_leader(uid_test, \"vk-api-saved\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# os.environ[\"ACCESS_TOKEN\"] = \"<>\"\n", - "access_token = os.environ[\"ACCESS_TOKEN\"]\n", - "\n", - "url_single_execute = f\"https://api.vk.com/method/execute.singleLeader?user={{}}&access_token={access_token}&v=5.103\"\n", - "url_batch_execute = f\"https://api.vk.com/method/execute.batchLeader?user={{}}&access_token={access_token}&v=5.103\" # TODO\n", - "requests.get(url_single_execute.format(uid_test)).json()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.0" - }, - "pycharm": { - "stem_cell": { - "cell_type": "raw", - "source": [], - "metadata": { - "collapsed": false - } - } - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} \ No newline at end of file