diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..8ad36ac
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+conf.py
+
+__pycache__/
+selenium_prototype.py
diff --git a/.idea/linkedin.iml b/.idea/linkedin.iml
new file mode 100644
index 0000000..24eeee8
--- /dev/null
+++ b/.idea/linkedin.iml
@@ -0,0 +1,11 @@
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..d3c9685
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,14 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..b0d6a75
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/workspace.xml.REMOVED.git-id b/.idea/workspace.xml.REMOVED.git-id
new file mode 100644
index 0000000..f2febc0
--- /dev/null
+++ b/.idea/workspace.xml.REMOVED.git-id
@@ -0,0 +1 @@
+495ac8de055a5ef5c301d3562b32de876843e05d
\ No newline at end of file
diff --git a/conf_template.py b/conf_template.py
new file mode 100644
index 0000000..68e637f
--- /dev/null
+++ b/conf_template.py
@@ -0,0 +1,12 @@
+import logging
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+logging.getLogger('scrapy').setLevel(logging.WARNING)
+
+logging.getLogger('selenium').setLevel(logging.WARNING)
+
+
+email = ''
+password = ''
diff --git a/linkedin/__init__.py b/linkedin/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/linkedin/__pycache__/__init__.cpython-34.pyc b/linkedin/__pycache__/__init__.cpython-34.pyc
new file mode 100644
index 0000000..a02c717
Binary files /dev/null and b/linkedin/__pycache__/__init__.cpython-34.pyc differ
diff --git a/linkedin/__pycache__/middlewares.cpython-34.pyc b/linkedin/__pycache__/middlewares.cpython-34.pyc
new file mode 100644
index 0000000..cc9e832
Binary files /dev/null and b/linkedin/__pycache__/middlewares.cpython-34.pyc differ
diff --git a/linkedin/__pycache__/settings.cpython-34.pyc b/linkedin/__pycache__/settings.cpython-34.pyc
new file mode 100644
index 0000000..8d3f06d
Binary files /dev/null and b/linkedin/__pycache__/settings.cpython-34.pyc differ
diff --git a/linkedin/items.py b/linkedin/items.py
new file mode 100644
index 0000000..6c3f645
--- /dev/null
+++ b/linkedin/items.py
@@ -0,0 +1,14 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your scraped items
+#
+# See documentation in:
+# http://doc.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class LinkedinItem(scrapy.Item):
+    # define the fields for your item here like:
+    # name = scrapy.Field()
+    pass
diff --git a/linkedin/middlewares.py b/linkedin/middlewares.py
new file mode 100644
index 0000000..a2f5a5d
--- /dev/null
+++ b/linkedin/middlewares.py
@@ -0,0 +1,27 @@
+from scrapy.http import HtmlResponse
+from scrapy.utils.python import to_bytes
+
+from selenium_utils import *
+
+
+class Selenium(object):
+    def process_request(self, request, spider):
+        driver = spider.driver
+
+        logger.info('SeleniumMiddleware - getting the page')
+        driver.get(request.url)
+
+        logger.info('SeleniumMiddleware - click more options')
+        more_option = get_by_xpath(driver, '//div/div/button[@class="more-options dropdown-caret"]')
+        more_option.send_keys(Keys.NULL)
+        more_option.click()
+
+        logger.info('SeleniumMiddleware - wait for names')
+        name = get_by_xpath(driver, '//ul[@class="browse-map-list"]/li/h4/a')
+        name.send_keys(Keys.NULL)
+
+        # request.meta['driver'] = driver  # uncomment to access the driver from the response
+
+        logger.info('SeleniumMiddleware - retrieving body')
+        body = to_bytes(driver.page_source)  # HtmlResponse body must be of type bytes
+        return HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=request)
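The commented-out `request.meta['driver']` line above hints at a handy pattern: stashing the live WebDriver on the request so spider callbacks can reach it. A minimal sketch of what a callback could do if that line were uncommented; the `parse` body here is illustrative and not part of this diff:

```python
# Hypothetical callback, assuming the middleware sets request.meta['driver'].
def parse(self, response):
    # The same WebDriver instance that rendered this page.
    driver = response.meta['driver']
    # Selenium calls can now be mixed with Scrapy selectors on this response.
    logger.info('Rendered URL: %s', driver.current_url)
```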
diff --git a/linkedin/pipelines.py b/linkedin/pipelines.py
new file mode 100644
index 0000000..57df283
--- /dev/null
+++ b/linkedin/pipelines.py
@@ -0,0 +1,11 @@
+# -*- coding: utf-8 -*-
+
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+
+
+class LinkedinPipeline(object):
+    def process_item(self, item, spider):
+        return item
diff --git a/linkedin/settings.py b/linkedin/settings.py
new file mode 100644
index 0000000..18684c3
--- /dev/null
+++ b/linkedin/settings.py
@@ -0,0 +1,94 @@
+# -*- coding: utf-8 -*-
+
+# Scrapy settings for linkedin project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     http://doc.scrapy.org/en/latest/topics/settings.html
+#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
+#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
+
+BOT_NAME = 'linkedin'
+
+SPIDER_MODULES = ['linkedin.spiders']
+NEWSPIDER_MODULE = 'linkedin.spiders'
+
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = 'linkedin (+http://www.yourdomain.com)'
+
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = True
+
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+
+# Configure a delay for requests for the same website (default: 0)
+# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+#DOWNLOAD_DELAY = 3
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+
+# Disable cookies (enabled by default)
+#COOKIES_ENABLED = False
+
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#   'Accept-Language': 'en',
+#}
+
+# Enable or disable spider middlewares
+# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    'linkedin.middlewares.MyCustomSpiderMiddleware': 543,
+#}
+
+# Enable or disable downloader middlewares
+# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
+#DOWNLOADER_MIDDLEWARES = {
+#    'linkedin.middlewares.MyCustomDownloaderMiddleware': 543,
+#}
+
+# Enable or disable extensions
+# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    'scrapy.extensions.telnet.TelnetConsole': None,
+#}
+
+# Configure item pipelines
+# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
+#ITEM_PIPELINES = {
+#    'linkedin.pipelines.SomePipeline': 300,
+#}
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+
+# Enable and configure HTTP caching (disabled by default)
+# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = 'httpcache'
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
+
+DOWNLOADER_MIDDLEWARES = {
+    'linkedin.middlewares.Selenium': 200
+}
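The only live additions on top of the generated template are `ROBOTSTXT_OBEY = True` and the `DOWNLOADER_MIDDLEWARES` entry; because the middleware returns an `HtmlResponse` from `process_request`, Scrapy skips the remaining downloader middlewares and the download itself. As a minimal sketch (not part of this diff), the same middleware could also be enabled for a single spider through `custom_settings`:

```python
import scrapy


class ExampleSpider(scrapy.Spider):
    """Hypothetical spider that enables the Selenium middleware only for itself."""
    name = 'example'  # illustrative name, not part of this project

    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {'linkedin.middlewares.Selenium': 200},
    }
```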
diff --git a/linkedin/spiders/__init__.py b/linkedin/spiders/__init__.py
new file mode 100644
index 0000000..ebd689a
--- /dev/null
+++ b/linkedin/spiders/__init__.py
@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
diff --git a/linkedin/spiders/__pycache__/__init__.cpython-34.pyc b/linkedin/spiders/__pycache__/__init__.cpython-34.pyc
new file mode 100644
index 0000000..7ffeda6
Binary files /dev/null and b/linkedin/spiders/__pycache__/__init__.cpython-34.pyc differ
diff --git a/linkedin/spiders/__pycache__/linkedin.cpython-34.pyc b/linkedin/spiders/__pycache__/linkedin.cpython-34.pyc
new file mode 100644
index 0000000..517c109
Binary files /dev/null and b/linkedin/spiders/__pycache__/linkedin.cpython-34.pyc differ
diff --git a/linkedin/spiders/linkedin.py b/linkedin/spiders/linkedin.py
new file mode 100644
index 0000000..e2a4d17
--- /dev/null
+++ b/linkedin/spiders/linkedin.py
@@ -0,0 +1,44 @@
+import scrapy
+
+from selenium_utils import *
+
+
+class Linkedin(scrapy.Spider):
+    name = "linkedin"
+    start_urls = ['https://www.linkedin.com/in/ludovica-rain%C3%B2-8a1055113?authType=NAME_SEARCH&authToken=E2lZ&trk=tyah&trkInfo=clickedVertical%3Amynetwork%2CentityType%3AentityHistoryName%2CclickedEntityId%3Amynetwork_474885049%2Cidx%3A8']
+
+    def __init__(self, *args, **kwargs):
+        super(Linkedin, self).__init__(*args, **kwargs)
+
+        logger.info('Init Firefox Browser')
+        profile = webdriver.FirefoxProfile()
+        profile.set_preference('dom.disable_beforeunload', True)
+        self.driver = webdriver.Firefox(profile)
+
+        self.driver.get('https://it.linkedin.com/')
+
+        logger.info('Filling in the email field')
+        get_by_xpath(self.driver, '//*[@class="login-email"]').send_keys(email)
+
+        logger.info('Filling in the password field')
+        get_by_xpath(self.driver, '//*[@class="login-password"]').send_keys(password)
+
+        logger.info('Clicking the login submit button')
+        get_by_xpath(self.driver, '//*[@id="login-submit"]').click()
+
+    def parse(self, response):
+        driver = self.driver
+
+        logger.info('Scrapy parse - get the names list')
+        names = driver.find_elements_by_xpath('//ul[@class="browse-map-list"]/li/h4/a')
+
+        # Read all profile links before yielding any request: following a link
+        # navigates the shared browser and makes these elements stale.
+        frontier = []
+        for name in names:
+            name.send_keys(Keys.NULL)
+            link = name.get_attribute('href')
+            frontier.append(scrapy.Request(link, callback=self.parse))
+
+        for f in frontier:
+            yield f
diff --git a/readme.md b/readme.md
new file mode 100644
index 0000000..4de118e
--- /dev/null
+++ b/readme.md
@@ -0,0 +1,13 @@
+# Linkedin Scraping
+
+Scraping software that visits the pages of LinkedIn users. The purpose is to gain visibility: LinkedIn notifies users when someone views their profile.
+
+Uses Scrapy, Selenium WebDriver and Firefox 45.
+
+# Install
+    pip install -r requirements.txt
+
+# Usage:
+Copy conf_template.py to conf.py, fill in your LinkedIn credentials, and run:
+
+    scrapy crawl linkedin
\ No newline at end of file
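For reference, a filled-in conf.py mirrors conf_template.py; the credentials below are placeholders, not real values:

```python
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Silence the more verbose third-party loggers.
logging.getLogger('scrapy').setLevel(logging.WARNING)
logging.getLogger('selenium').setLevel(logging.WARNING)

# Placeholder credentials -- replace with your own LinkedIn login.
email = 'you@example.com'
password = 'your-password'
```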
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..a138bcc
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,20 @@
+attrs==16.2.0
+cffi==1.8.3
+cryptography==1.5.2
+cssselect==0.9.2
+idna==2.1
+lxml==3.6.4
+parsel==1.0.3
+pyasn1==0.1.9
+pyasn1-modules==0.0.8
+pycparser==2.14
+PyDispatcher==2.0.5
+pyOpenSSL==16.1.0
+queuelib==1.4.2
+Scrapy==1.2.0
+selenium==2.53.6
+service-identity==16.0.0
+six==1.10.0
+Twisted==16.4.1
+w3lib==1.15.0
+zope.interface==4.3.2
diff --git a/scraper.py b/scraper.py
new file mode 100644
index 0000000..e69de29
diff --git a/scrapy.cfg b/scrapy.cfg
new file mode 100644
index 0000000..f3f1965
--- /dev/null
+++ b/scrapy.cfg
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.org/en/latest/deploy.html
+
+[settings]
+default = linkedin.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = linkedin
diff --git a/selenium_utils.py b/selenium_utils.py
new file mode 100644
index 0000000..1ff668b
--- /dev/null
+++ b/selenium_utils.py
@@ -0,0 +1,28 @@
+from selenium import webdriver
+from selenium.common.exceptions import TimeoutException, StaleElementReferenceException, NoSuchElementException
+from selenium.webdriver.common.by import By
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.support import expected_conditions as ec
+from selenium.webdriver.support.ui import WebDriverWait
+
+from conf import *
+
+
+"""
+Number of seconds to wait for a web page to load.
+"""
+WAIT_TIMEOUT = 10
+
+
+def get_by_xpath(driver, xpath):
+    """
+    Return the web element matched by the given XPath, waiting up to
+    WAIT_TIMEOUT seconds for it to be present in the DOM.
+    :param driver: Selenium web driver to use.
+    :param xpath: XPath expression to match.
+    :return: The web element.
+    """
+    return WebDriverWait(driver, WAIT_TIMEOUT).until(
+        ec.presence_of_element_located(
+            (By.XPATH, xpath)
+        ))
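To round things off, a minimal standalone sketch of the `get_by_xpath` helper. The URL and XPath are the ones the spider itself uses; running this assumes conf.py exists, since importing selenium_utils pulls in conf:

```python
from selenium import webdriver

from selenium_utils import get_by_xpath

driver = webdriver.Firefox()
driver.get('https://it.linkedin.com/')

# Blocks for up to WAIT_TIMEOUT seconds until the element is in the DOM,
# raising TimeoutException if it never appears.
email_field = get_by_xpath(driver, '//*[@class="login-email"]')
email_field.send_keys('you@example.com')  # placeholder address

driver.quit()
```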