forked from eracle/linkedin
Showing 23 changed files with 319 additions and 0 deletions.
.gitignore
@@ -0,0 +1,4 @@
conf.py

__pycache__/
selenium_prototype.py
conf.py
@@ -0,0 +1,12 @@
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# silence the chatty third-party loggers
logging.getLogger('scrapy').setLevel(logging.WARNING)

logging.getLogger('selenium').setLevel(logging.WARNING)


email = ''
password = ''
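Everything downstream picks these names up transitively (selenium_utils.py does `from conf import *`, and the middleware and spider star-import selenium_utils), so importing conf also configures root logging as a side effect. A hypothetical standalone consumer, for illustration only:

# hypothetical consumer of conf.py, not part of this commit
from conf import email, password, logger

logger.info('credentials configured for: %s', email or '<unset>')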
Empty file.
Binary file not shown.
Binary file not shown.
Binary file not shown.
linkedin/items.py
@@ -0,0 +1,14 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class LinkedinItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
linkedin/middlewares.py
@@ -0,0 +1,27 @@
from scrapy.http import HtmlResponse
from selenium_utils import *
import linkedin
from scrapy.utils.python import to_bytes


class Selenium(object):
    def process_request(self, request, spider):
        driver = spider.driver

        logger.info('SeleniumMiddleware - getting the page')
        driver.get(request.url)

        logger.info('SeleniumMiddleware - click more options')
        more_option = get_by_xpath(driver, '//div/div/button[@class="more-options dropdown-caret"]')
        # Keys.NULL types nothing; sending it makes Selenium scroll the
        # element into view and focus it before the click
        more_option.send_keys(Keys.NULL)
        more_option.click()

        logger.info('SeleniumMiddleware - wait for names')
        name = get_by_xpath(driver, '//ul[@class="browse-map-list"]/li/h4/a')
        name.send_keys(Keys.NULL)

        # request.meta['driver'] = self.driver  # to access driver from response

        logger.info('SeleniumMiddleware - retrieving body')
        body = to_bytes(driver.page_source)  # body must be of type bytes
        return HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=request)
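In Scrapy, a downloader middleware whose process_request returns a Response object short-circuits the actual download, so the spider ends up parsing the Selenium-rendered DOM instead of the raw HTTP body. A stripped-down sketch of that pattern (the class name is hypothetical; the spider is assumed to expose its browser as spider.driver, as above):

# minimal sketch of the Selenium downloader-middleware pattern
from scrapy.http import HtmlResponse
from scrapy.utils.python import to_bytes


class SeleniumDownloaderSketch(object):
    def process_request(self, request, spider):
        spider.driver.get(request.url)
        # returning a Response here stops Scrapy from fetching the URL
        # itself; the browser-rendered page source is used instead
        body = to_bytes(spider.driver.page_source)
        return HtmlResponse(spider.driver.current_url, body=body,
                            encoding='utf-8', request=request)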
linkedin/pipelines.py
@@ -0,0 +1,11 @@
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class LinkedinPipeline(object):
    def process_item(self, item, spider):
        return item
linkedin/settings.py
@@ -0,0 +1,94 @@
# -*- coding: utf-8 -*-

# Scrapy settings for linkedin project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'linkedin'

SPIDER_MODULES = ['linkedin.spiders']
NEWSPIDER_MODULE = 'linkedin.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'linkedin (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'linkedin.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'linkedin.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'linkedin.pipelines.SomePipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

DOWNLOADER_MIDDLEWARES = {
    'linkedin.middlewares.Selenium': 200
}
linkedin/spiders/__init__.py
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
Binary file not shown.
Binary file not shown.
linkedin/spiders/linkedin.py
@@ -0,0 +1,47 @@
import scrapy

from selenium_utils import *
import random


class Linkedin(scrapy.Spider):
    name = "linkedin"
    start_urls = ['https://www.linkedin.com/in/ludovica-rain%C3%B2-8a1055113?authType=NAME_SEARCH&authToken=E2lZ&trk=tyah&trkInfo=clickedVertical%3Amynetwork%2CentityType%3AentityHistoryName%2CclickedEntityId%3Amynetwork_474885049%2Cidx%3A8']

    def __init__(self):
        logger.info('Init Firefox Browser')
        profile = webdriver.FirefoxProfile()
        profile.set_preference('dom.disable_beforeunload', True)
        self.driver = webdriver.Firefox(profile)

        # log in once, up front; the session is then reused for every request
        self.driver.get('https://it.linkedin.com/')

        logger.info('Searching for the Login btn')
        get_by_xpath(self.driver, '//*[@class="login-email"]').send_keys(email)

        logger.info('Searching for the password btn')
        get_by_xpath(self.driver, '//*[@class="login-password"]').send_keys(password)

        logger.info('Searching for the submit')
        get_by_xpath(self.driver, '//*[@id="login-submit"]').click()

    def parse(self, response):
        driver = self.driver

        logger.info('Scrapy parse - get the names list')
        names = driver.find_elements_by_xpath('//ul[@class="browse-map-list"]/li/h4/a')

        # each related profile becomes a new request parsed by this same
        # method, so the crawl walks outward from the start profile
        frontier = []
        for name in names:
            name.send_keys(Keys.NULL)
            link = name.get_attribute('href')
            frontier.append(scrapy.Request(link, callback=self.parse))

        for f in frontier:
            yield f
README.md
@@ -0,0 +1,13 @@
# Linkedin Scraping

Scraping software that simply visits the pages of LinkedIn users. The purpose is to gain visibility, since LinkedIn notifies a user when you view their profile page.

Uses: Scrapy, Selenium WebDriver and Firefox 45.

# Install

    pip install -r requirements.txt

# Usage

Modify the conf.py file and type:

    scrapy crawl linkedin
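For reference, a filled-in conf.py would look something like the sketch below; the credential values are placeholders, not defaults shipped with the project:

# conf.py -- placeholder credentials, for illustration only
email = 'you@example.com'
password = 'your-linkedin-password'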
requirements.txt
@@ -0,0 +1,20 @@
attrs==16.2.0
cffi==1.8.3
cryptography==1.5.2
cssselect==0.9.2
idna==2.1
lxml==3.6.4
parsel==1.0.3
pyasn1==0.1.9
pyasn1-modules==0.0.8
pycparser==2.14
PyDispatcher==2.0.5
pyOpenSSL==16.1.0
queuelib==1.4.2
Scrapy==1.2.0
selenium==2.53.6
service-identity==16.0.0
six==1.10.0
Twisted==16.4.1
w3lib==1.15.0
zope.interface==4.3.2
Empty file.
scrapy.cfg
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = linkedin.settings

[deploy]
#url = http://localhost:6800/
project = linkedin
selenium_utils.py
@@ -0,0 +1,28 @@
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException, NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from conf import *


# number of seconds to wait for the web page to load
WAIT_TIMEOUT = 10


def get_by_xpath(driver, xpath):
    """
    Get a web element through the passed xpath, performing a Wait on it.
    :param driver: Selenium web driver to use.
    :param xpath: xpath to use.
    :return: The web element.
    """
    return WebDriverWait(driver, WAIT_TIMEOUT).until(
        ec.presence_of_element_located(
            (By.XPATH, xpath)
        ))