Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,11 @@ users.csv
.ipynb_checkpoints/

vk-api-saved.tar.xz

lib/
lib64*
bin/
share/
include/
pyvenv.cfg
geckodriver.log
56 changes: 17 additions & 39 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,28 @@

## WebScraping for the Leadership research project.

## Install
## Installation

LoL, only python
This guide assumes you have **Python 3.6+** and a **Unix terminal** available.

## Ubuntu:

```shell
python3 -m venv spider
cd spider
source bin/activate
pip3 install -r requirements.txt
```

## VK-API scraping

### Create your own VK App

1. Create your VK application here `https://vk.com/apps?act=manage` .
1. Create your VK application [here](https://vk.com/apps?act=manage).
You need a Standalone app. Get your `APP_ID` in the app's settings.

2. You have to create `execute stored procedures` in your App (https://vk.com/apps?act=manage).
Find in the app's settings `Stored procedures`. Create new procedure `execute.singleLeader` :
2. Create `execute stored procedures` in your app.
Find in the app's settings `Stored procedures`. Create new procedure `execute.singleLeader`:

```Javascript
var user = API.users.get({"user_ids": [Args.user], "fields": ["photo_id", "verified", "sex", "bdate", "city", "country", "home_town", "has_photo", "photo_50", "photo_100", "photo_200_orig", "photo_200", "photo_400_orig", "photo_max", "photo_max_orig", "online", "domain", "has_mobile", "contacts", "site", "education", "universities", "schools", "status", "last_seen", "followers_count", "common_count", "occupation", "nickname", "relatives", "relation", "personal", "connections", "exports", "activities", "interests", "music", "movies", "tv", "books", "games", "about", "quotes", "can_post", "can_see_all_posts", "can_see_audio", "can_write_private_message", "can_send_friend_request", "is_favorite", "is_hidden_from_feed", "timezone", "screen_name", "maiden_name", "crop_photo", "is_friend", "friend_status", "career", "military", "blacklisted", "blacklisted_by_me", "can_be_invited_group"]});
Expand All @@ -28,40 +37,9 @@ return [user, groups, wall];

### Use existing VK App

3. Get the `ACCESS_TOKEN` for your app.
Open in your browser (specify `<APP_ID>` from 1 step in the request)
`https://oauth.vk.com/authorize?client_id=<APP_ID>&display=page&redirect_uri=https://oauth.vk.com/blank.html&scope=friends&response_type=token&v=5.103&state=123456`
. You will be redirected to another page; your `ACCESS_TOKEN` will be in the URL shown in the browser's address bar.


4. Set this `ACCESS_TOKEN` inside of the notebook

`!!! Don't commit the token to the repo. You could lose access to your account !!!`

5. Get `users.csv` file

6. Launch notebook `vk_api_scrap.ipynb`



### Useful links

https://vk.com/dev/authcode_flow_user

https://vk.com/dev/methods

https://vk.com/dev/execute

https://vk.com/dev/users

https://vk.com/dev/groups

https://vk.com/dev/wall

https://vk.com/dev/likes
3. Make sure you've downloaded the `users.csv` file.

STREAMING API? https://vk.com/dev/streaming_api_docs
4. Run `python3 spider.py` and follow the input prompts.

## No-API scraping [deprecated]

scrap.ipynb
<h1><center>Make sure the latest Firefox is installed on your machine and that the directory containing the latest geckodriver is listed in your PATH variable!</center></h1>
16 changes: 16 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
beautifulsoup4==4.8.2
bs4==0.0.1
certifi==2019.11.28
chardet==3.0.4
idna==2.9
numpy==1.18.1
pandas==1.0.1
pkg-resources==0.0.0
python-dateutil==2.8.1
pytz==2019.3
requests==2.23.0
selenium==3.141.0
six==1.14.0
soupsieve==2.0
tqdm==4.43.0
urllib3==1.25.8
78 changes: 78 additions & 0 deletions spider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from urllib.parse import urlsplit
from tqdm import tqdm
from tqdm.notebook import tqdm as tqdm_notebook
import os
import json
import pandas as pd
import time

class OAuthHandler:
    """Fill in and submit a login form in a Firefox session driven by Selenium.

    The handler opens a Firefox window on construction and keeps it in
    ``self.driver``; the caller owns that driver and is responsible for
    quitting it when done.
    """

    def __init__(self, login: str, password: str) -> None:
        """Start Firefox and remember the credentials to type into the form.

        Args:
            login: Value typed into the login (phone/e-mail) field.
            password: Value typed into the password field.
        """
        self.driver = webdriver.Firefox()
        # Locators of the two credential inputs and the submit button on the
        # authorization page. The absolute paths are tied to the current
        # page layout and will need updating if the markup changes.
        self.login_xpath = '/html/body/div/div/div/div[2]/form/div/div/input[6]'
        self.password_xpath = '/html/body/div/div/div/div[2]/form/div/div/input[7]'
        self.button_xpath = '//*[@id="install_allow"]'
        self.login = login
        self.password = password

    def auth(self, link: str) -> str:
        """Open ``link``, try to submit the credentials, return the final URL.

        Submitting the form is best effort: if a field or the button cannot
        be located (or interacting with it fails) the problem is reported
        and the browser's current URL is still returned.

        Args:
            link: Authorization URL to open in the browser.

        Returns:
            The URL the browser is on after the attempt.
        """
        self.driver.get(link)
        try:
            email_field = self.driver.find_element_by_xpath(self.login_xpath)
            password_field = self.driver.find_element_by_xpath(self.password_xpath)

            email_field.send_keys(self.login)
            password_field.send_keys(self.password)

            self.driver.find_element_by_xpath(self.button_xpath).click()
        except Exception as err:
            # Only ordinary errors are tolerated here; KeyboardInterrupt and
            # SystemExit must propagate so the user can abort the script.
            print(f"Could not submit the login form automatically: {err!r}")
        return self.driver.current_url

def get_token(APP_ID: int) -> str:
    """Interactively log in through the browser and return the access token.

    Prompts for the login and password, drives the OAuth authorization page
    with ``OAuthHandler`` and reads the ``access_token`` parameter from the
    fragment of the URL the browser ends up on.

    Args:
        APP_ID: Identifier of the application, sent as ``client_id``.

    Returns:
        The access token string.

    Raises:
        RuntimeError: If the final URL carries no ``access_token`` parameter
            (for example, the login failed or access was denied).
    """
    from getpass import getpass
    from urllib.parse import parse_qs

    login = input('Phone or email: ')
    # getpass keeps the password from being echoed to the terminal.
    password = getpass('Password: ')
    auth_url = (
        "https://oauth.vk.com/authorize"
        f"?client_id={APP_ID}&display=page&response_type=token&v=5.103"
    )

    handler = OAuthHandler(login, password)
    try:
        redirect_url = handler.auth(auth_url)
    finally:
        # The browser is only needed to obtain the redirect URL; always
        # close it so no Firefox/geckodriver process is left behind.
        handler.driver.quit()

    # Look the token up by name instead of by position so an error redirect
    # (or a reordered fragment) is not mistaken for a token.
    fragment = urlsplit(redirect_url, scheme='https').fragment
    tokens = parse_qs(fragment).get('access_token')
    if not tokens:
        raise RuntimeError(
            "Authorization did not return an access token; "
            "check the credentials and the App ID."
        )
    return tokens[0]

def get_single_leader(uid: str, access_token: str, attempt: int=5) -> dict:
    """Call the ``execute.singleLeader`` API method for one user, with retries.

    The request is repeated up to ``attempt`` times, one second apart, until
    the JSON reply contains a truthy ``"response"`` entry.

    Args:
        uid: User identifier passed as the ``user`` query parameter.
        access_token: Token passed as the ``access_token`` query parameter.
        attempt: Maximum number of requests to make.

    Returns:
        The full decoded JSON reply (the dict that contains ``"response"``).

    Raises:
        Exception: If no attempt produced a ``"response"`` entry.
        requests.exceptions.RequestException: Propagated from ``requests``
            on network failure or when the 30 second timeout expires.
    """
    url_single_execute = "https://api.vk.com/method/execute.singleLeader"
    # Passing the values via ``params`` lets requests URL-encode them, so
    # unusual characters in the uid or token cannot corrupt the query.
    query = {"user": uid, "access_token": access_token, "v": "5.103"}

    for i in range(attempt):
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url_single_execute, params=query, timeout=30).json()
        if response.get('response'):
            return response
        if i + 1 < attempt:
            # Pause only when another attempt will actually follow.
            print("Sleep")
            time.sleep(1)
    raise Exception(f"After {attempt} attempts no response!!!")

def save_single_leader(uid: str, path: str, access_token: str) -> str:
    """Fetch the data for one user and store it as ``<path>/<uid>.json``.

    Args:
        uid: User identifier; also used as the output file name.
        path: Existing directory to write the JSON file into.
        access_token: Token forwarded to ``get_single_leader``.

    Returns:
        The path of the file that was written, so the caller can log it.

    Raises:
        ValueError: If the reply's ``"response"`` does not hold exactly three
            items (the stored procedure returns ``[user, groups, wall]``).
    """
    data = get_single_leader(uid, access_token)
    # Validate explicitly: an ``assert`` would vanish under ``python -O``.
    if len(data["response"]) != 3:
        raise ValueError(
            f"Unexpected response for user {uid}: expected 3 items, "
            f"got {len(data['response'])}"
        )
    target = os.path.join(path, f"{uid}.json")
    with open(target, 'w', encoding='utf-8') as f:
        json.dump(data, f)
    return target

def main():
    """Prompt for settings, download every user from the CSV and log results.

    Asks for the output directory, the path of the users CSV (its ``uid``
    column is read) and the application ID, obtains an access token, saves
    one JSON file per user and finally writes whatever the save step
    reported to ``./debug_data.txt``, one entry per line.
    """
    savepath = input('Absolute path to directory to save to: ')
    userscsv_path = input('Absolute path to users.csv file: ')
    APP_ID = int(input('VK Standalone App ID: '))
    TOKEN = get_token(APP_ID)

    users = pd.read_csv(userscsv_path).uid
    # Create the output directory once rather than checking on every user.
    os.makedirs(savepath, exist_ok=True)

    debug_data = []
    for uid in tqdm(users):
        result = save_single_leader(str(uid), savepath, TOKEN)
        # The save step may return nothing; only keep real entries, as text,
        # so writing the log below cannot fail on a non-string value.
        if result is not None:
            debug_data.append(str(result))

    with open('./debug_data.txt', encoding='utf-8', mode='w') as f:
        f.writelines(f"{entry}\n" for entry in debug_data)

# Run the interactive scraper only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Loading