Commit

docker/docker-compose working
Former-commit-id: ea90323
eracle committed Oct 26, 2017
1 parent d2261af commit 0fb6451
Showing 13 changed files with 750 additions and 49 deletions.
596 changes: 596 additions & 0 deletions LICENCE.md

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion LICENCE.md.REMOVED.git-id

This file was deleted.

67 changes: 50 additions & 17 deletions README.md
@@ -1,44 +1,77 @@
# Linkedin Scraping
Ubuntu, python3.

[![built with Selenium](https://img.shields.io/badge/built%20with-Selenium-yellow.svg)](https://github.com/SeleniumHQ/selenium)
[![built with Python3](https://img.shields.io/badge/built%20with-Python3-red.svg)](https://www.python.org/)


Scraping software that visits as many LinkedIn user pages as possible; the purpose is to gain visibility, since LinkedIn notifies users when you visit their page.

-Uses: Scrapy, Selenium web driver, Chromium headless and python3.
+Uses: Scrapy, Selenium web driver, Chromium headless, Docker and python3.

Tested on Ubuntu 16.04.2 LTS


# Install
+Docker makes running the scraper easy and fast, without pain and tears.

-```bash
-virtualenv -p python3 .venv
-source .venv/bin/activate
-
-pip install -r requirements.txt
-
-```
-On Ubuntu, sometimes:
+### 0. Preparations
+
+Install Docker from the official website: [https://www.docker.com/](https://www.docker.com/)
+
+Install a VNC viewer if you do not have one.
+On Ubuntu, go for Vinagre:
+
```bash
-sudo apt-get install python3-dev
+sudo apt-get update
+sudo apt-get install vinagre
```

-# Usage:
-Rename conf_template.py to conf.py, modify it with your LinkedIn username and password, and type:
+Then connect to `localhost:5900` (password: `secret`).
+
+### 1. Set your LinkedIn login and password
+
+Open `conf.py` and fill the quotes with your credentials.
+
+### 2. Run and build the containers with docker-compose
+
+Open your terminal, move to the project's root folder (usually with the `cd` command) and type:

```bash
-scrapy crawl linkedin
+docker-compose up -d --build
```

-Instead, to use Chrome headless:
+### 3. See what your bot can do right now
+
+Run your VNC viewer and connect to `localhost:5900`. The password is `secret`.
+
+### 4. Stop the scraper
+
+Back in the same terminal window, type:

```bash
-scrapy crawl linkedin -a headless=True
+docker-compose down
```


##### Test:

+Create the selenium server:
+```bash
+docker run --name selenium -p 4444:4444 -p 5900:5900 --publish-all --shm-size="128M" selenium/standalone-chrome-debug
+```
+
-python -m unittest selenium_chromium/test.py
+```bash
+virtualenv -p python3 .venv
+source .venv/bin/activate
+pip install -r requirements.txt
+
+python -m unittest test.py
+```
+
+Stop and delete the selenium server:
+```bash
+docker stop $(docker ps -aq --filter name=selenium)
+
+docker rm $(docker ps -aq --filter name=selenium)
+```
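Step 1 above assumes a `conf.py` in the project root. Given the spider's `from conf import EMAIL, PASSWORD` import (see linkedin/spiders/linkedin.py below), a minimal file would look like this sketch, placeholder values included:

```python
# conf.py - placeholder credentials; replace with your own LinkedIn login.
EMAIL = "your.address@example.com"
PASSWORD = "your_password"
```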
19 changes: 19 additions & 0 deletions docker-compose.yml
@@ -0,0 +1,19 @@
version: '3'
services:
  web:
    command: ["./wait-for-selenium.sh", "http://selenium:4444/wd/hub", "--", "scrapy", "crawl", "linkedin"]
    environment:
      - PYTHONUNBUFFERED=0
    build:
      context: .
      dockerfile: ./docker_conf/prod/Dockerfile
    depends_on:
      - selenium
    volumes:
      - ./logs:/code/logs
  selenium:
    container_name: selenium
    image: selenium/standalone-chrome-debug
    ports:
      - "5900:5900"
    shm_size: 128M
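Once the stack is up, the state of both services and the scraper's output can be checked with standard docker-compose commands (nothing project-specific here):

```bash
docker-compose ps           # shows the web and selenium containers and their status
docker-compose logs -f web  # follows the scraper's log output as it crawls
```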
7 changes: 7 additions & 0 deletions docker_conf/prod/Dockerfile
@@ -0,0 +1,7 @@
FROM python:3.5
RUN mkdir /code
RUN mkdir /config
WORKDIR /code
COPY ./requirements.txt /config/
RUN pip install -r /config/requirements.txt
COPY ./ /code/
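docker-compose builds this image automatically, but it can also be built and inspected by hand; the tag name below is an arbitrary choice, not something the repository defines:

```bash
docker build -f docker_conf/prod/Dockerfile -t linkedin-scraping .
docker run --rm -it linkedin-scraping bash  # open a shell in /code inside the image
```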
4 changes: 2 additions & 2 deletions linkedin/middlewares.py
@@ -1,8 +1,8 @@
from scrapy.http import HtmlResponse
from scrapy.utils.python import to_bytes
from selenium.webdriver.common.keys import Keys

-from selenium_utils import get_by_xpath
+from .selenium_utils import get_by_xpath


class Selenium(object):
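The body of the `Selenium` middleware is collapsed in this diff. For orientation only, a Selenium-backed Scrapy downloader middleware with these imports typically follows the shape below; this is a sketch, not the file's actual contents:

```python
from scrapy.http import HtmlResponse
from scrapy.utils.python import to_bytes


class Selenium(object):
    """Downloader middleware sketch: render pages with the spider's driver."""

    def process_request(self, request, spider):
        # Load the page in the Selenium-driven browser...
        spider.driver.get(request.url)
        # ...then short-circuit Scrapy's downloader by returning the
        # rendered HTML as a regular response.
        return HtmlResponse(spider.driver.current_url,
                            body=to_bytes(spider.driver.page_source),
                            encoding='utf-8',
                            request=request)
```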
41 changes: 41 additions & 0 deletions linkedin/selenium_utils.py
@@ -0,0 +1,41 @@
from selenium import webdriver
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait

"""
Number of seconds to wait for a web page to load.
"""
WAIT_TIMEOUT = 10


def get_by_xpath(driver, xpath):
    """
    Get a web element through the given xpath, performing a Wait on it.
    :param driver: Selenium web driver to use.
    :param xpath: xpath to use.
    :return: The web element.
    """
    return WebDriverWait(driver, WAIT_TIMEOUT).until(
        ec.presence_of_element_located(
            (By.XPATH, xpath)
        ))


def init_chromium(selenium_host):
    selenium_url = 'http://%s:4444/wd/hub' % selenium_host

    print('Initializing chromium, remote url: %s' % selenium_url)

    chrome_options = DesiredCapabilities.CHROME
    # chrome_options.add_argument('--disable-notifications')

    prefs = {"credentials_enable_service": False,
             "profile.password_manager_enabled": False}

    chrome_options['prefs'] = prefs

    driver = webdriver.Remote(command_executor=selenium_url,
                              desired_capabilities=chrome_options)
    return driver
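A quick interactive check of these helpers, assuming the `selenium/standalone-chrome-debug` container from the README is running with port 4444 published; the xpath is the login-submit button the spider itself waits on:

```python
from linkedin.selenium_utils import init_chromium, get_by_xpath

# Attach to the standalone Chrome exposed at http://localhost:4444/wd/hub.
driver = init_chromium('localhost')
driver.get('https://www.linkedin.com/')

# Blocks for up to WAIT_TIMEOUT seconds until the element is present.
button = get_by_xpath(driver, '//*[@id="login-submit"]')
print(button.tag_name)

driver.quit()
```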
10 changes: 3 additions & 7 deletions linkedin/spiders/linkedin.py
@@ -3,8 +3,7 @@
from scrapy.spiders import Rule

from conf import EMAIL, PASSWORD
-from selenium_chromium import init_chromium
-from selenium_utils import get_by_xpath
+from linkedin.selenium_utils import get_by_xpath, init_chromium

LINKEDIN_DOMAIN_URL = 'https://it.linkedin.com/'
NETWORK_URL = 'https://www.linkedin.com/mynetwork/invite-connect/connections/'
@@ -24,10 +23,8 @@ class Linkedin(CrawlSpider):
        ),
    )

-    def __init__(self, headless=False, *a, **kw):
-        if headless:
-            headless = True
-        self.driver = init_chromium(headless)
+    def __init__(self, host='selenium', *a, **kw):
+        self.driver = init_chromium(host)

        # Stop the web page from asking if I really want to leave - past implementation, FIREFOX
        # profile = webdriver.FirefoxProfile()
@@ -46,4 +43,3 @@ def __init__(self, headless=False, *a, **kw):
        get_by_xpath(self.driver, '//*[@id="login-submit"]').click()

        super().__init__(*a, **kw)
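Because Scrapy passes `-a` arguments straight to the spider's `__init__`, the Selenium host can be overridden at crawl time; the default `selenium` matches the docker-compose service name, and `localhost` below is just an illustrative override:

```bash
scrapy crawl linkedin -a host=localhost
```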

Empty file added logs/users.txt
Empty file.
2 changes: 1 addition & 1 deletion requirements.txt
@@ -17,7 +17,7 @@ PyDispatcher==2.0.5
pyOpenSSL==17.2.0
queuelib==1.4.2
Scrapy==1.4.0
-selenium==3.4.3
+selenium==3.6
service-identity==17.0.0
six==1.10.0
Twisted==17.5.0
21 changes: 0 additions & 21 deletions selenium_utils.py

This file was deleted.

15 changes: 15 additions & 0 deletions test.py
@@ -0,0 +1,15 @@
import unittest

from linkedin.selenium_utils import init_chromium


class TestChromium(unittest.TestCase):

    def test_init(self):
        webdriver = init_chromium('localhost')
        self.assertIsNotNone(webdriver)
        print("type: %s" % type(webdriver))
        webdriver.close()


if __name__ == '__main__':
    unittest.main()
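With the selenium container up, the single test can also be targeted by its dotted path using the standard unittest CLI:

```bash
python -m unittest test.TestChromium.test_init
```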
16 changes: 16 additions & 0 deletions wait-for-selenium.sh
@@ -0,0 +1,16 @@
#!/bin/bash
# wait-for-selenium.sh

set -e

url="$1"
shift
cmd="$@"

until wget -O- "$url"; do
  >&2 echo "Selenium is unavailable - sleeping"
  sleep 1
done

>&2 echo "Selenium is up - executing command"
exec $cmd
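This is the entrypoint wrapper the `web` service uses: it polls the hub URL until wget succeeds, then `exec`s the wrapped command (the `--` survives the unquoted expansion and `exec` treats it as end-of-options). Invoked the way docker-compose.yml wires it:

```bash
# The hostname "selenium" only resolves inside the compose network.
./wait-for-selenium.sh http://selenium:4444/wd/hub -- scrapy crawl linkedin
```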
