Commit 5509109: first commit

Former-commit-id: 184835d

eracle committed Oct 7, 2016 (0 parents)
Showing 23 changed files with 319 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -0,0 +1,4 @@
conf.py

__pycache__/
selenium_prototype.py
11 changes: 11 additions & 0 deletions .idea/linkedin.iml

Some generated files are not rendered by default.

14 changes: 14 additions & 0 deletions .idea/misc.xml

Some generated files are not rendered by default.

8 changes: 8 additions & 0 deletions .idea/modules.xml

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions .idea/workspace.xml.REMOVED.git-id

Some generated files are not rendered by default.

12 changes: 12 additions & 0 deletions conf_template.py
@@ -0,0 +1,12 @@
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Keep the noisy third-party loggers quiet.
logging.getLogger('scrapy').setLevel(logging.WARNING)
logging.getLogger('selenium').setLevel(logging.WARNING)

# LinkedIn credentials: copy this file to conf.py and fill these in.
email = ''
password = ''
Empty file added linkedin/__init__.py
Empty file.
Binary file added linkedin/__pycache__/__init__.cpython-34.pyc
Binary file not shown.
Binary file added linkedin/__pycache__/middlewares.cpython-34.pyc
Binary file not shown.
Binary file added linkedin/__pycache__/settings.cpython-34.pyc
Binary file not shown.
14 changes: 14 additions & 0 deletions linkedin/items.py
@@ -0,0 +1,14 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class LinkedinItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
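
For illustration, a hedged sketch of what a filled-in item might look like, following the template's own hint; the class and field names are hypothetical and not part of this commit:

# Hypothetical sketch, not in this commit: fields for a scraped profile.
class LinkedinUserItem(scrapy.Item):
    name = scrapy.Field()
    headline = scrapy.Field()
    profile_url = scrapy.Field()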
27 changes: 27 additions & 0 deletions linkedin/middlewares.py
@@ -0,0 +1,27 @@
import logging

from scrapy.http import HtmlResponse
from scrapy.utils.python import to_bytes
from selenium.webdriver.common.keys import Keys

from selenium_utils import get_by_xpath

logger = logging.getLogger(__name__)


class Selenium(object):
    """Downloader middleware that renders pages with the spider's Selenium driver."""

    def process_request(self, request, spider):
        driver = spider.driver

        logger.info('SeleniumMiddleware - getting the page')
        driver.get(request.url)

        logger.info('SeleniumMiddleware - click more options')
        more_option = get_by_xpath(driver, '//div/div/button[@class="more-options dropdown-caret"]')
        more_option.send_keys(Keys.NULL)  # focus the element without typing anything
        more_option.click()

        logger.info('SeleniumMiddleware - wait for names')
        name = get_by_xpath(driver, '//ul[@class="browse-map-list"]/li/h4/a')
        name.send_keys(Keys.NULL)

        # request.meta['driver'] = driver  # to access the driver from the response

        logger.info('SeleniumMiddleware - retrieving body')
        body = to_bytes(driver.page_source)  # body must be of type bytes
        return HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=request)
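
The commented-out meta line above hints at exposing the live driver to spider callbacks. A minimal sketch of the consuming side, assuming that line is uncommented; the callback name is hypothetical, and response.meta mirrors the request's meta in Scrapy:

# Hypothetical spider callback, not part of this commit:
def parse_profile(self, response):
    driver = response.meta['driver']  # the same driver the middleware used
    # Besides parsing the rendered HTML in `response`, the callback can
    # keep driving the live browser:
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')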

11 changes: 11 additions & 0 deletions linkedin/pipelines.py
@@ -0,0 +1,11 @@
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class LinkedinPipeline(object):
def process_item(self, item, spider):
return item
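
For illustration only, a hedged sketch of a non-trivial pipeline (hypothetical, not part of this commit): it drops falsy items and counts the rest.

from scrapy.exceptions import DropItem


class CountingPipeline(object):  # hypothetical example
    def open_spider(self, spider):
        self.count = 0

    def process_item(self, item, spider):
        if not item:
            raise DropItem('refusing to store an empty item')
        self.count += 1
        return item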
94 changes: 94 additions & 0 deletions linkedin/settings.py
@@ -0,0 +1,94 @@
# -*- coding: utf-8 -*-

# Scrapy settings for linkedin project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'linkedin'

SPIDER_MODULES = ['linkedin.spiders']
NEWSPIDER_MODULE = 'linkedin.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'linkedin (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'linkedin.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'linkedin.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'linkedin.pipelines.SomePipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

DOWNLOADER_MIDDLEWARES = {
'linkedin.middlewares.Selenium': 200
}
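
A note on the value 200: lower numbers run their process_request earlier, so this middleware sees requests before most built-ins (Scrapy's UserAgentMiddleware sits at 400 by default). Because Selenium.process_request returns an HtmlResponse, the rest of the downloader chain, including the real HTTP download, is skipped. An annotated sketch of the same setting; the disabled built-in is only an example:

DOWNLOADER_MIDDLEWARES = {
    'linkedin.middlewares.Selenium': 200,
    # Example: a built-in can be switched off by mapping it to None.
    # 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}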
4 changes: 4 additions & 0 deletions linkedin/spiders/__init__.py
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
Binary file added linkedin/spiders/__pycache__/__init__.cpython-34.pyc
Binary file not shown.
Binary file not shown.
47 changes: 47 additions & 0 deletions linkedin/spiders/linkedin.py
@@ -0,0 +1,47 @@
import logging

import scrapy
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

from conf import email, password
from selenium_utils import get_by_xpath

logger = logging.getLogger(__name__)


class Linkedin(scrapy.Spider):
    name = "linkedin"
    start_urls = ['https://www.linkedin.com/in/ludovica-rain%C3%B2-8a1055113?authType=NAME_SEARCH&authToken=E2lZ&trk=tyah&trkInfo=clickedVertical%3Amynetwork%2CentityType%3AentityHistoryName%2CclickedEntityId%3Amynetwork_474885049%2Cidx%3A8']

    def __init__(self):
        super().__init__()
        logger.info('Init Firefox Browser')
        profile = webdriver.FirefoxProfile()
        profile.set_preference('dom.disable_beforeunload', True)
        self.driver = webdriver.Firefox(profile)

        self.driver.get('https://it.linkedin.com/')

        logger.info('Filling in the login email field')
        get_by_xpath(self.driver, '//*[@class="login-email"]').send_keys(email)

        logger.info('Filling in the password field')
        get_by_xpath(self.driver, '//*[@class="login-password"]').send_keys(password)

        logger.info('Clicking the submit button')
        get_by_xpath(self.driver, '//*[@id="login-submit"]').click()

    def parse(self, response):
        driver = self.driver

        logger.info('Scrapy parse - get the names list')
        names = driver.find_elements_by_xpath('//ul[@class="browse-map-list"]/li/h4/a')

        # Collect the links first: navigating away while iterating would make
        # the elements go stale.
        frontier = []
        for name in names:
            name.send_keys(Keys.NULL)  # focus the element without typing anything
            link = name.get_attribute('href')
            frontier.append(scrapy.Request(link, callback=self.parse))

        for request in frontier:
            yield request
13 changes: 13 additions & 0 deletions readme.md
@@ -0,0 +1,13 @@
# Linkedin Scraping

Scraping software that simply visits the pages of LinkedIn users. The point is visibility: LinkedIn notifies a user whenever you view their profile page.

Uses Scrapy, Selenium WebDriver, and Firefox 45.

# Install

    pip install -r requirements.txt

# Usage

Copy conf_template.py to conf.py, fill in your LinkedIn credentials, and run:

    scrapy crawl linkedin
20 changes: 20 additions & 0 deletions requirements.txt
@@ -0,0 +1,20 @@
attrs==16.2.0
cffi==1.8.3
cryptography==1.5.2
cssselect==0.9.2
idna==2.1
lxml==3.6.4
parsel==1.0.3
pyasn1==0.1.9
pyasn1-modules==0.0.8
pycparser==2.14
PyDispatcher==2.0.5
pyOpenSSL==16.1.0
queuelib==1.4.2
Scrapy==1.2.0
selenium==2.53.6
service-identity==16.0.0
six==1.10.0
Twisted==16.4.1
w3lib==1.15.0
zope.interface==4.3.2
Empty file added scraper.py
Empty file.
11 changes: 11 additions & 0 deletions scrapy.cfg
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = linkedin.settings

[deploy]
#url = http://localhost:6800/
project = linkedin
28 changes: 28 additions & 0 deletions selenium_utils.py
@@ -0,0 +1,28 @@
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException, NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait

from conf import *  # configures logging and exposes the LinkedIn credentials

# Number of seconds to wait for the web page (or an element) to load.
WAIT_TIMEOUT = 10


def get_by_xpath(driver, xpath):
    """
    Return the web element matching the given XPath, waiting up to
    WAIT_TIMEOUT seconds for it to be present.

    :param driver: Selenium web driver to use.
    :param xpath: XPath of the element to locate.
    :return: the located web element.
    """
    return WebDriverWait(driver, WAIT_TIMEOUT).until(
        ec.presence_of_element_located(
            (By.XPATH, xpath)
        ))
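
A short usage sketch (the URL and XPath are placeholders, not from this commit). If no match appears within WAIT_TIMEOUT seconds, WebDriverWait.until raises TimeoutException, already imported above:

driver = webdriver.Firefox()
driver.get('https://example.com/login')  # placeholder URL
try:
    field = get_by_xpath(driver, '//input[@id="username"]')  # placeholder XPath
    field.send_keys('hello')
except TimeoutException:
    # get_by_xpath waited WAIT_TIMEOUT seconds without finding the element.
    print('element not found in time')
finally:
    driver.quit()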
