igormusinov · kimaril · Mar 13, 2020 · Mar 13, 2020 · Mar 13, 2020 · Mar 13, 2020
diff --git a/.gitignore b/.gitignore
@@ -18,3 +18,11 @@ users.csv
 .ipynb_checkpoints/
 
 vk-api-saved.tar.xz
+
+lib/
+lib64*
+bin/
+share/
+include/
+pyvenv.cfg
+geckodriver.log
diff --git a/README.md b/README.md
@@ -2,19 +2,28 @@
 
 ## WebScraping for the Leadership research project.
 
-## Install
+## Installation
 
-LoL, only python
+These instructions assume you have **Python 3.6+** and a **Unix terminal** available.
+
+## Ubuntu:
+
+```shell
+python3 -m venv spider 
+cd spider
+source bin/activate
+pip3 install -r requirements.txt
+```
 
 ## VK-API scraping
 
 ### Create your own VK App
 
-1. Create your VK application here `https://vk.com/apps?act=manage` . 
+1. Create your VK application [here](https://vk.com/apps?act=manage). 
 You need a Standalone app. Get your `APP_ID` in the app's settings.
 
-2. You have to create `execute stored procedures` in your App (https://vk.com/apps?act=manage).
-Find in the app's settings `Stored procedures`. Create new procedure `execute.singleLeader` :
+2. Create a stored procedure for the `execute` method in your app.
+In the app's settings, open `Stored procedures` and create a new procedure named `execute.singleLeader`:
 
 ```Javascript
 var user = API.users.get({"user_ids": [Args.user], "fields": ["photo_id", "verified", "sex", "bdate", "city", "country", "home_town", "has_photo", "photo_50", "photo_100", "photo_200_orig", "photo_200", "photo_400_orig", "photo_max", "photo_max_orig", "online", "domain", "has_mobile", "contacts", "site", "education", "universities", "schools", "status", "last_seen", "followers_count", "common_count", "occupation", "nickname", "relatives", "relation", "personal", "connections", "exports", "activities", "interests", "music", "movies", "tv", "books", "games", "about", "quotes", "can_post", "can_see_all_posts", "can_see_audio", "can_write_private_message", "can_send_friend_request", "is_favorite", "is_hidden_from_feed", "timezone", "screen_name", "maiden_name", "crop_photo", "is_friend", "friend_status", "career", "military", "blacklisted", "blacklisted_by_me", "can_be_invited_group"]});
@@ -28,40 +37,9 @@ return [user, groups, wall];
 
 ### Use existing VK App
 
-3. Get the `ACCESS_TOKEN` for your app. 
-Open in your browser (specify `<APP_ID>` from 1 step in the request)
- `https://oauth.vk.com/authorize?client_id=<APP_ID>&display=page&redirect_uri=https://oauth.vk.com/blank.html&scope=friends&response_type=token&v=5.103&state=123456`
- . You will be redirected to another page, in the browser's search string you will find you `ACCESS_TOKEN`. 
-
-
-4. Set this `ACCESS_TOKEN` inside of the notebook 
-
-`!!! Don't commit the token to the repo. You can lose an access to your account !!!`
-
-5. Get `users.csv` file
-
-6. Launch notebook `vk_api_scrap.ipynb`
-
-
-
-### Usefull links
-
-https://vk.com/dev/authcode_flow_user
-
-https://vk.com/dev/methods
-
-https://vk.com/dev/execute
-
-https://vk.com/dev/users
-
-https://vk.com/dev/groups
-
-https://vk.com/dev/wall
-
-https://vk.com/dev/likes 
+3. Make sure you've downloaded the `users.csv` file.
 
-STREAMING API? https://vk.com/dev/streaming_api_docs
+4. Run `python3 spider.py` and follow the prompts.
 
-## No-API scraping [deprecated]
 
-scrap.ipynb 
+<h1><center>Make sure the latest Firefox is installed on your machine and the directory containing the latest geckodriver is listed in your PATH variable!</center></h1>
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,16 @@
+beautifulsoup4==4.8.2
+bs4==0.0.1
+certifi==2019.11.28
+chardet==3.0.4
+idna==2.9
+numpy==1.18.1
+pandas==1.0.1
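+# pkg-resources==0.0.0  # `pip freeze` artifact on Debian/Ubuntu; not installable from PyPI, so it breaks `pip install -r`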
+python-dateutil==2.8.1
+pytz==2019.3
+requests==2.23.0
+selenium==3.141.0
+six==1.14.0
+soupsieve==2.0
+tqdm==4.43.0
+urllib3==1.25.8
diff --git a/spider.py b/spider.py
@@ -0,0 +1,78 @@
+import requests
+from bs4 import BeautifulSoup
+from selenium import webdriver
+from urllib.parse import urlsplit
+from tqdm import tqdm
+from tqdm.notebook import tqdm as tqdm_notebook
+import os
+import json
+import pandas as pd
+import time
+
+class OAuthHandler:
+    """Drive a Firefox window through the VK OAuth login form.
+
+    Creating an instance launches Firefox through Selenium, so call
+    ``close()`` once the handler is no longer needed.
+    """
+
+    def __init__(self, login: str, password: str) -> None:
+        """Start the browser and remember the credentials.
+
+        Args:
+            login: Value typed into the login (phone/e-mail) field.
+            password: Value typed into the password field.
+        """
+        self.driver = webdriver.Firefox()
+        # Locators for the login field, the password field and the button
+        # that is clicked once both fields are filled in.
+        self.login_xpath = '/html/body/div/div/div/div[2]/form/div/div/input[6]'
+        self.password_xpath = '/html/body/div/div/div/div[2]/form/div/div/input[7]'
+        self.button_xpath = '//*[@id="install_allow"]'
+        self.login = login
+        self.password = password
+
+    def auth(self, link):
+        """Open *link*, try to submit the credentials, and return the final URL.
+
+        Filling in the form is best-effort: if Selenium cannot find or use
+        one of the elements, the failure is reported and the URL the browser
+        currently shows is still returned.
+
+        Args:
+            link: URL of the authorization page to open.
+
+        Returns:
+            The browser's current URL after the attempt.
+        """
+        # Local import: the module only imports `webdriver` at the top level.
+        from selenium.common.exceptions import WebDriverException
+
+        self.driver.get(link)
+        try:
+            email_field = self.driver.find_element_by_xpath(self.login_xpath)
+            password_field = self.driver.find_element_by_xpath(self.password_xpath)
+
+            email_field.send_keys(self.login)
+            password_field.send_keys(self.password)
+
+            self.driver.find_element_by_xpath(self.button_xpath).click()
+        except WebDriverException as err:
+            # Only Selenium failures are tolerated; KeyboardInterrupt and
+            # programming errors must still propagate.
+            print(f"Automatic login skipped: {type(err).__name__}")
+        return self.driver.current_url
+
+    def close(self):
+        """Shut down the browser session started in ``__init__``."""
+        self.driver.quit()
+
+def get_token(APP_ID: int) -> str:
+    """Prompt for VK credentials and obtain an OAuth access token.
+
+    Opens the VK authorization page for the given application in a browser,
+    submits the credentials, and reads the token from the fragment of the
+    URL the browser ends up on.
+
+    Args:
+        APP_ID: ID of the VK application to authorize.
+
+    Returns:
+        The value of the ``access_token`` fragment parameter.
+
+    Raises:
+        RuntimeError: If the final URL carries no ``access_token``.
+    """
+    # Local imports: neither name is imported at module level.
+    from getpass import getpass
+    from urllib.parse import parse_qs
+
+    login = input('Phone or email: ')
+    # getpass keeps the password from being echoed to the terminal.
+    password = getpass('Password: ')
+    AUTH_URL = "https://oauth.vk.com/authorize?client_id={APP_ID}&display=page&response_type=token&v=5.103"
+    handler = OAuthHandler(login, password)
+    try:
+        response = handler.auth(AUTH_URL.format(APP_ID=APP_ID))
+    finally:
+        # The browser is only needed for the login round-trip; without this
+        # the Firefox process outlives the script.
+        handler.driver.quit()
+
+    # Look the token up by name: taking the first fragment value would turn
+    # an "error=..." redirect into a bogus token.
+    fragment = urlsplit(response, scheme='https').fragment
+    tokens = parse_qs(fragment).get('access_token')
+    if not tokens:
+        raise RuntimeError(
+            "VK did not return an access token; check the credentials and the app ID"
+        )
+    return tokens[0]
+
+def get_single_leader(uid: str, access_token: str, attempt: int=5) -> dict:
+    """Call the ``execute.singleLeader`` VK procedure for one user, with retries.
+
+    Args:
+        uid: User identifier passed as the ``user`` argument.
+        access_token: VK API access token.
+        attempt: Maximum number of requests to make.
+
+    Returns:
+        The decoded JSON reply, once it contains a truthy ``response`` key.
+
+    Raises:
+        Exception: If no attempt produced a usable reply. The last transport
+            or decoding error, if any, is attached as the cause.
+    """
+    url_single_execute = "https://api.vk.com/method/execute.singleLeader"
+    # Passing the query as `params` lets requests URL-encode the values.
+    query = {"user": uid, "access_token": access_token, "v": "5.103"}
+    last_error = None
+
+    for i in range(attempt):
+        try:
+            # The timeout keeps a stalled connection from hanging the run.
+            response = requests.get(url_single_execute, params=query, timeout=30).json()
+        except (requests.RequestException, ValueError) as err:
+            # Network failures and non-JSON bodies count as a failed attempt.
+            last_error = err
+        else:
+            if response.get('response'):
+                return response
+        # No point in waiting after the final attempt.
+        if i < attempt - 1:
+            print("Sleep")
+            time.sleep(1)
+    raise Exception(f"After {attempt} attempts no response!!!") from last_error
+
+def save_single_leader(uid: str, path: str, access_token: str) -> str:
+    """Fetch one user's data and store it as ``<path>/<uid>.json``.
+
+    Args:
+        uid: User identifier; also used as the file name.
+        path: Existing directory to write into.
+        access_token: VK API access token.
+
+    Returns:
+        Path of the JSON file that was written.
+
+    Raises:
+        ValueError: If the ``response`` payload does not have three elements.
+    """
+    data = get_single_leader(uid, access_token)
+    # Explicit check instead of `assert`, which is stripped under `python -O`.
+    if len(data["response"]) != 3:
+        raise ValueError(
+            f"Expected 3 items in the response for user {uid}, "
+            f"got {len(data['response'])}"
+        )
+    target = os.path.join(path, f"{uid}.json")
+    with open(target, 'w', encoding='utf-8') as f:
+        json.dump(data, f)
+    # Returning the path gives callers something meaningful to log.
+    return target
+
+def main():
+    """Interactively scrape every user listed in ``users.csv``.
+
+    Asks for the output directory, the CSV path and the VK app ID, obtains
+    an access token, saves one JSON file per ``uid`` and finally writes one
+    line per processed user to ``./debug_data.txt``.
+    """
+    savepath = input('Absolute path to directory to save to: ')
+    userscsv_path = input('Absolute path to users.csv file: ')
+    APP_ID = int(input('VK Standalone App ID: '))
+
+    # Validate the inputs before the browser login so a wrong path fails
+    # fast, and create the output directory once instead of per user.
+    users = pd.read_csv(userscsv_path).uid
+    os.makedirs(savepath, exist_ok=True)
+
+    TOKEN = get_token(APP_ID)
+
+    debug_data = []
+    for uid in tqdm(users):
+        uid = str(uid)
+        result = save_single_leader(uid, savepath, TOKEN)
+        # Always record a string: fall back to the uid when the helper
+        # returns nothing, so the write below cannot fail on None.
+        debug_data.append(uid if result is None else str(result))
+
+    with open('./debug_data.txt', encoding='utf-8', mode='w') as f:
+        f.writelines(f"{entry}\n" for entry in debug_data)
+
+# Run the scraper only when this file is executed as a script.
+if __name__=='__main__':
+    main()