diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..8ad36ac
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+conf.py
+
+__pycache__/
+selenium_prototype.py
diff --git a/.idea/linkedin.iml b/.idea/linkedin.iml
new file mode 100644
index 0000000..24eeee8
--- /dev/null
+++ b/.idea/linkedin.iml
@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+  <component name="TestRunnerService">
+    <option name="PROJECT_TEST_RUNNER" value="Unittests" />
+  </component>
+</module>
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..d3c9685
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,14 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectLevelVcsManager" settingsEditedManually="false">
+    <OptionsSetting value="true" id="Add" />
+    <OptionsSetting value="true" id="Remove" />
+    <OptionsSetting value="true" id="Checkout" />
+    <OptionsSetting value="true" id="Update" />
+    <OptionsSetting value="true" id="Status" />
+    <OptionsSetting value="true" id="Edit" />
+    <ConfirmationsSetting value="0" id="Add" />
+    <ConfirmationsSetting value="0" id="Remove" />
+  </component>
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.4" project-jdk-type="Python SDK" />
+</project>
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..b0d6a75
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/linkedin.iml" filepath="$PROJECT_DIR$/.idea/linkedin.iml" />
+    </modules>
+  </component>
+</project>
\ No newline at end of file
diff --git a/.idea/workspace.xml.REMOVED.git-id b/.idea/workspace.xml.REMOVED.git-id
new file mode 100644
index 0000000..f2febc0
--- /dev/null
+++ b/.idea/workspace.xml.REMOVED.git-id
@@ -0,0 +1 @@
+495ac8de055a5ef5c301d3562b32de876843e05d
\ No newline at end of file
diff --git a/conf_template.py b/conf_template.py
new file mode 100644
index 0000000..68e637f
--- /dev/null
+++ b/conf_template.py
@@ -0,0 +1,12 @@
+import logging
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+logging.getLogger('scrapy').setLevel(logging.WARNING)
+
+logging.getLogger('selenium').setLevel(logging.WARNING)
+
+
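+# Copy this file to conf.py (conf.py is listed in .gitignore) and fill in
+# your LinkedIn credentials.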
+email = ''
+password = ''
diff --git a/linkedin/__init__.py b/linkedin/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/linkedin/__pycache__/__init__.cpython-34.pyc b/linkedin/__pycache__/__init__.cpython-34.pyc
new file mode 100644
index 0000000..a02c717
Binary files /dev/null and b/linkedin/__pycache__/__init__.cpython-34.pyc differ
diff --git a/linkedin/__pycache__/middlewares.cpython-34.pyc b/linkedin/__pycache__/middlewares.cpython-34.pyc
new file mode 100644
index 0000000..cc9e832
Binary files /dev/null and b/linkedin/__pycache__/middlewares.cpython-34.pyc differ
diff --git a/linkedin/__pycache__/settings.cpython-34.pyc b/linkedin/__pycache__/settings.cpython-34.pyc
new file mode 100644
index 0000000..8d3f06d
Binary files /dev/null and b/linkedin/__pycache__/settings.cpython-34.pyc differ
diff --git a/linkedin/items.py b/linkedin/items.py
new file mode 100644
index 0000000..6c3f645
--- /dev/null
+++ b/linkedin/items.py
@@ -0,0 +1,14 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your scraped items
+#
+# See documentation in:
+# http://doc.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class LinkedinItem(scrapy.Item):
+ # define the fields for your item here like:
+ # name = scrapy.Field()
+ pass
diff --git a/linkedin/middlewares.py b/linkedin/middlewares.py
new file mode 100644
index 0000000..a2f5a5d
--- /dev/null
+++ b/linkedin/middlewares.py
@@ -0,0 +1,27 @@
+from scrapy.http import HtmlResponse
+from scrapy.utils.python import to_bytes
+
+from selenium_utils import *
+
+class Selenium(object):
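+    """
+    Downloader middleware that fetches every request with the spider's
+    Selenium driver. Returning an HtmlResponse from process_request
+    short-circuits Scrapy's downloader, so the spider parses the fully
+    rendered page instead of the raw HTTP body.
+    """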
+ def process_request(self, request, spider):
+ driver = spider.driver
+
+ logger.info('SeleniumMiddleware - getting the page')
+ driver.get(request.url)
+
+ logger.info('SeleniumMiddleware - click more options')
+ more_option = get_by_xpath(driver, '//div/div/button[@class="more-options dropdown-caret"]')
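+        # Keys.NULL is a no-op keystroke: it scrolls the element into view
+        # and raises if the element has gone stale before the click.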
+ more_option.send_keys(Keys.NULL)
+ more_option.click()
+
+ logger.info('SeleniumMiddleware - wait for names')
+ name = get_by_xpath(driver, '//ul[@class="browse-map-list"]/li/h4/a')
+ name.send_keys(Keys.NULL)
+
+        # request.meta['driver'] = driver  # uncomment to access the driver from the response
+
+        logger.info('SeleniumMiddleware - retrieving body')
+ body = to_bytes(driver.page_source) # body must be of type bytes
+ return HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=request)
+
diff --git a/linkedin/pipelines.py b/linkedin/pipelines.py
new file mode 100644
index 0000000..57df283
--- /dev/null
+++ b/linkedin/pipelines.py
@@ -0,0 +1,11 @@
+# -*- coding: utf-8 -*-
+
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+
+
+class LinkedinPipeline(object):
+ def process_item(self, item, spider):
+ return item
diff --git a/linkedin/settings.py b/linkedin/settings.py
new file mode 100644
index 0000000..18684c3
--- /dev/null
+++ b/linkedin/settings.py
@@ -0,0 +1,94 @@
+# -*- coding: utf-8 -*-
+
+# Scrapy settings for linkedin project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+# http://doc.scrapy.org/en/latest/topics/settings.html
+# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
+# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
+
+BOT_NAME = 'linkedin'
+
+SPIDER_MODULES = ['linkedin.spiders']
+NEWSPIDER_MODULE = 'linkedin.spiders'
+
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = 'linkedin (+http://www.yourdomain.com)'
+
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = True
+
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+
+# Configure a delay for requests for the same website (default: 0)
+# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+#DOWNLOAD_DELAY = 3
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+
+# Disable cookies (enabled by default)
+#COOKIES_ENABLED = False
+
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+# 'Accept-Language': 'en',
+#}
+
+# Enable or disable spider middlewares
+# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+# 'linkedin.middlewares.MyCustomSpiderMiddleware': 543,
+#}
+
+# Enable or disable downloader middlewares
+# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
+#DOWNLOADER_MIDDLEWARES = {
+# 'linkedin.middlewares.MyCustomDownloaderMiddleware': 543,
+#}
+
+# Enable or disable extensions
+# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+# 'scrapy.extensions.telnet.TelnetConsole': None,
+#}
+
+# Configure item pipelines
+# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
+#ITEM_PIPELINES = {
+# 'linkedin.pipelines.SomePipeline': 300,
+#}
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+
+# Enable and configure HTTP caching (disabled by default)
+# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = 'httpcache'
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
+
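+# Hand every request to the Selenium middleware. Priority 200 runs it after
+# RobotsTxtMiddleware (100) but before the remaining built-in downloader
+# middlewares.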
+DOWNLOADER_MIDDLEWARES = {
+ 'linkedin.middlewares.Selenium': 200
+}
diff --git a/linkedin/spiders/__init__.py b/linkedin/spiders/__init__.py
new file mode 100644
index 0000000..ebd689a
--- /dev/null
+++ b/linkedin/spiders/__init__.py
@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
diff --git a/linkedin/spiders/__pycache__/__init__.cpython-34.pyc b/linkedin/spiders/__pycache__/__init__.cpython-34.pyc
new file mode 100644
index 0000000..7ffeda6
Binary files /dev/null and b/linkedin/spiders/__pycache__/__init__.cpython-34.pyc differ
diff --git a/linkedin/spiders/__pycache__/linkedin.cpython-34.pyc b/linkedin/spiders/__pycache__/linkedin.cpython-34.pyc
new file mode 100644
index 0000000..517c109
Binary files /dev/null and b/linkedin/spiders/__pycache__/linkedin.cpython-34.pyc differ
diff --git a/linkedin/spiders/linkedin.py b/linkedin/spiders/linkedin.py
new file mode 100644
index 0000000..e2a4d17
--- /dev/null
+++ b/linkedin/spiders/linkedin.py
@@ -0,0 +1,47 @@
+import scrapy
+
+from selenium_utils import *
+import random
+
+
+class Linkedin(scrapy.Spider):
+ name = "linkedin"
+ start_urls = ['https://www.linkedin.com/in/ludovica-rain%C3%B2-8a1055113?authType=NAME_SEARCH&authToken=E2lZ&trk=tyah&trkInfo=clickedVertical%3Amynetwork%2CentityType%3AentityHistoryName%2CclickedEntityId%3Amynetwork_474885049%2Cidx%3A8']
+
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        logger.info('Init Firefox Browser')
+        profile = webdriver.FirefoxProfile()
+        profile.set_preference('dom.disable_beforeunload', True)
+        self.driver = webdriver.Firefox(profile)
+
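+        # Log in once through the public homepage form; the authenticated
+        # session is then reused for every profile the spider visits.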
+ self.driver.get('https://it.linkedin.com/')
+
+ logger.info('Searching for the Login btn')
+ get_by_xpath(self.driver, '//*[@class="login-email"]').send_keys(email)
+
+ logger.info('Searching for the password btn')
+ get_by_xpath(self.driver, '//*[@class="login-password"]').send_keys(password)
+
+ logger.info('Searching for the submit')
+ get_by_xpath(self.driver, '//*[@id="login-submit"]').click()
+
+
+ def parse(self, response):
+ driver = self.driver
+
+ logger.info('Scrapy parse - get the names list')
+ names = driver.find_elements_by_xpath('//ul[@class="browse-map-list"]/li/h4/a')
+
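+        # Collect every profile link before yielding any request: following a
+        # request navigates the shared driver, which would leave the remaining
+        # WebElements stale. Keys.NULL is a no-op keystroke that scrolls each
+        # element into view and fails fast if it is already stale.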
+ frontier = []
+ for name in names:
+ name.send_keys(Keys.NULL)
+ link = name.get_attribute('href')
+ frontier.append(scrapy.Request(link, callback=self.parse))
+
+ for f in frontier:
+ yield f
+
+
+
+
diff --git a/readme.md b/readme.md
new file mode 100644
index 0000000..4de118e
--- /dev/null
+++ b/readme.md
@@ -0,0 +1,13 @@
+# LinkedIn Scraping
+
+Scraping software aimed at simply visiting the pages of LinkedIn users: the purpose is to gain visibility, since LinkedIn notifies a user whenever you view their profile page.
+
+Uses Scrapy, Selenium WebDriver, and Firefox 45.
+
+# Install
+ pip install -r requirements.txt
+
+# Usage
+Copy conf_template.py to conf.py and fill in your LinkedIn credentials:
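+
+    cp conf_template.py conf.py
+
+Then start the crawl: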
+
+ scrapy crawl linkedin
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..a138bcc
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,20 @@
+attrs==16.2.0
+cffi==1.8.3
+cryptography==1.5.2
+cssselect==0.9.2
+idna==2.1
+lxml==3.6.4
+parsel==1.0.3
+pyasn1==0.1.9
+pyasn1-modules==0.0.8
+pycparser==2.14
+PyDispatcher==2.0.5
+pyOpenSSL==16.1.0
+queuelib==1.4.2
+Scrapy==1.2.0
+selenium==2.53.6
+service-identity==16.0.0
+six==1.10.0
+Twisted==16.4.1
+w3lib==1.15.0
+zope.interface==4.3.2
diff --git a/scraper.py b/scraper.py
new file mode 100644
index 0000000..e69de29
diff --git a/scrapy.cfg b/scrapy.cfg
new file mode 100644
index 0000000..f3f1965
--- /dev/null
+++ b/scrapy.cfg
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.org/en/latest/deploy.html
+
+[settings]
+default = linkedin.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = linkedin
diff --git a/selenium_utils.py b/selenium_utils.py
new file mode 100644
index 0000000..1ff668b
--- /dev/null
+++ b/selenium_utils.py
@@ -0,0 +1,28 @@
+from selenium.common.exceptions import TimeoutException, StaleElementReferenceException, NoSuchElementException
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as ec
+from selenium.webdriver.common.by import By
+from selenium.webdriver.common.keys import Keys
+from selenium import webdriver
+from conf import *
+
+
+
+"""
+number of seconds used to wait the web page's loading.
+"""
+WAIT_TIMEOUT = 10
+
+
+def get_by_xpath(driver, xpath):
+ """
+    Wait for the web element located by the given XPath and return it.
+    :param driver: Selenium web driver to use.
+    :param xpath: XPath expression locating the element.
+    :return: The web element.
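+    :raises TimeoutException: if no matching element appears within WAIT_TIMEOUT seconds.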
+ """
+ return WebDriverWait(driver, WAIT_TIMEOUT).until(
+ ec.presence_of_element_located(
+ (By.XPATH, xpath)
+ ))