Commit 5509109: first commit

Former-commit-id: 184835d

eracle committed Oct 7, 2016 (0 parents)
Showing 23 changed files with 319 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -0,0 +1,4 @@
conf.py

__pycache__/
selenium_prototype.py
11 changes: 11 additions & 0 deletions .idea/linkedin.iml

Some generated files are not rendered by default.

14 changes: 14 additions & 0 deletions .idea/misc.xml

Some generated files are not rendered by default.

8 changes: 8 additions & 0 deletions .idea/modules.xml

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions .idea/workspace.xml.REMOVED.git-id

Some generated files are not rendered by default.

12 changes: 12 additions & 0 deletions conf_template.py
@@ -0,0 +1,12 @@
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Keep the noisy third-party loggers quiet.
logging.getLogger('scrapy').setLevel(logging.WARNING)
logging.getLogger('selenium').setLevel(logging.WARNING)

# LinkedIn credentials: copy this file to conf.py and fill these in.
email = ''
password = ''
Empty file added linkedin/__init__.py
Empty file.
Binary file added linkedin/__pycache__/__init__.cpython-34.pyc
Binary file not shown.
Binary file added linkedin/__pycache__/middlewares.cpython-34.pyc
Binary file not shown.
Binary file added linkedin/__pycache__/settings.cpython-34.pyc
Binary file not shown.
14 changes: 14 additions & 0 deletions linkedin/items.py
@@ -0,0 +1,14 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class LinkedinItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
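
For illustration, a hedged sketch of what a filled-in item might look like, following the template's own hint; the class and field names are hypothetical and not part of this commit:

# Hypothetical sketch, not in this commit: fields for a scraped profile.
class LinkedinUserItem(scrapy.Item):
    name = scrapy.Field()
    headline = scrapy.Field()
    profile_url = scrapy.Field()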
27 changes: 27 additions & 0 deletions linkedin/middlewares.py
@@ -0,0 +1,27 @@
import logging

from scrapy.http import HtmlResponse
from scrapy.utils.python import to_bytes
from selenium.webdriver.common.keys import Keys

from selenium_utils import get_by_xpath

logger = logging.getLogger(__name__)


class Selenium(object):
    """Downloader middleware that renders pages with the spider's Selenium driver."""

    def process_request(self, request, spider):
        driver = spider.driver

        logger.info('SeleniumMiddleware - getting the page')
        driver.get(request.url)

        logger.info('SeleniumMiddleware - click more options')
        more_option = get_by_xpath(driver, '//div/div/button[@class="more-options dropdown-caret"]')
        more_option.send_keys(Keys.NULL)  # focus the element without typing anything
        more_option.click()

        logger.info('SeleniumMiddleware - wait for names')
        name = get_by_xpath(driver, '//ul[@class="browse-map-list"]/li/h4/a')
        name.send_keys(Keys.NULL)

        # request.meta['driver'] = driver  # to access the driver from the response

        logger.info('SeleniumMiddleware - retrieving body')
        body = to_bytes(driver.page_source)  # body must be of type bytes
        return HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=request)
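
The commented-out meta line above hints at exposing the live driver to spider callbacks. A minimal sketch of the consuming side, assuming that line is uncommented; the callback name is hypothetical, and response.meta mirrors the request's meta in Scrapy:

# Hypothetical spider callback, not part of this commit:
def parse_profile(self, response):
    driver = response.meta['driver']  # the same driver the middleware used
    # Besides parsing the rendered HTML in `response`, the callback can
    # keep driving the live browser:
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')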

11 changes: 11 additions & 0 deletions linkedin/pipelines.py
@@ -0,0 +1,11 @@
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class LinkedinPipeline(object):
def process_item(self, item, spider):
return item
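
For illustration only, a hedged sketch of a non-trivial pipeline (hypothetical, not part of this commit): it drops falsy items and counts the rest.

from scrapy.exceptions import DropItem


class CountingPipeline(object):  # hypothetical example
    def open_spider(self, spider):
        self.count = 0

    def process_item(self, item, spider):
        if not item:
            raise DropItem('refusing to store an empty item')
        self.count += 1
        return item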
94 changes: 94 additions & 0 deletions linkedin/settings.py
@@ -0,0 +1,94 @@
# -*- coding: utf-8 -*-

# Scrapy settings for linkedin project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'linkedin'

SPIDER_MODULES = ['linkedin.spiders']
NEWSPIDER_MODULE = 'linkedin.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'linkedin (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'linkedin.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'linkedin.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'linkedin.pipelines.SomePipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

DOWNLOADER_MIDDLEWARES = {
'linkedin.middlewares.Selenium': 200
}
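
A note on the value 200: lower numbers run their process_request earlier, so this middleware sees requests before most built-ins (Scrapy's UserAgentMiddleware sits at 400 by default). Because Selenium.process_request returns an HtmlResponse, the rest of the downloader chain, including the real HTTP download, is skipped. An annotated sketch of the same setting; the disabled built-in is only an example:

DOWNLOADER_MIDDLEWARES = {
    'linkedin.middlewares.Selenium': 200,
    # Example: a built-in can be switched off by mapping it to None.
    # 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}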
4 changes: 4 additions & 0 deletions linkedin/spiders/__init__.py
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
Binary file added linkedin/spiders/__pycache__/__init__.cpython-34.pyc
Binary file not shown.
Binary file not shown.
47 changes: 47 additions & 0 deletions linkedin/spiders/linkedin.py
@@ -0,0 +1,47 @@
import logging

import scrapy
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

from conf import email, password
from selenium_utils import get_by_xpath

logger = logging.getLogger(__name__)


class Linkedin(scrapy.Spider):
    name = "linkedin"
    start_urls = ['https://www.linkedin.com/in/ludovica-rain%C3%B2-8a1055113?authType=NAME_SEARCH&authToken=E2lZ&trk=tyah&trkInfo=clickedVertical%3Amynetwork%2CentityType%3AentityHistoryName%2CclickedEntityId%3Amynetwork_474885049%2Cidx%3A8']

    def __init__(self):
        super().__init__()
        logger.info('Init Firefox Browser')
        profile = webdriver.FirefoxProfile()
        profile.set_preference('dom.disable_beforeunload', True)
        self.driver = webdriver.Firefox(profile)

        self.driver.get('https://it.linkedin.com/')

        logger.info('Filling in the login email field')
        get_by_xpath(self.driver, '//*[@class="login-email"]').send_keys(email)

        logger.info('Filling in the password field')
        get_by_xpath(self.driver, '//*[@class="login-password"]').send_keys(password)

        logger.info('Clicking the submit button')
        get_by_xpath(self.driver, '//*[@id="login-submit"]').click()

    def parse(self, response):
        driver = self.driver

        logger.info('Scrapy parse - get the names list')
        names = driver.find_elements_by_xpath('//ul[@class="browse-map-list"]/li/h4/a')

        # Collect the links first: navigating away while iterating would make
        # the elements go stale.
        frontier = []
        for name in names:
            name.send_keys(Keys.NULL)  # focus the element without typing anything
            link = name.get_attribute('href')
            frontier.append(scrapy.Request(link, callback=self.parse))

        for request in frontier:
            yield request
13 changes: 13 additions & 0 deletions readme.md
@@ -0,0 +1,13 @@
# Linkedin Scraping

Scraping software that simply visits the pages of LinkedIn users. The point is visibility: LinkedIn notifies a user whenever you view their profile page.

Uses Scrapy, Selenium WebDriver, and Firefox 45.

# Install

    pip install -r requirements.txt

# Usage

Copy conf_template.py to conf.py, fill in your LinkedIn credentials, and run:

    scrapy crawl linkedin
20 changes: 20 additions & 0 deletions requirements.txt
@@ -0,0 +1,20 @@
attrs==16.2.0
cffi==1.8.3
cryptography==1.5.2
cssselect==0.9.2
idna==2.1
lxml==3.6.4
parsel==1.0.3
pyasn1==0.1.9
pyasn1-modules==0.0.8
pycparser==2.14
PyDispatcher==2.0.5
pyOpenSSL==16.1.0
queuelib==1.4.2
Scrapy==1.2.0
selenium==2.53.6
service-identity==16.0.0
six==1.10.0
Twisted==16.4.1
w3lib==1.15.0
zope.interface==4.3.2
Empty file added scraper.py
Empty file.
11 changes: 11 additions & 0 deletions scrapy.cfg
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = linkedin.settings

[deploy]
#url = http://localhost:6800/
project = linkedin
28 changes: 28 additions & 0 deletions selenium_utils.py
@@ -0,0 +1,28 @@
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException, NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait

from conf import *  # configures logging and exposes the LinkedIn credentials

# Number of seconds to wait for the web page (or an element) to load.
WAIT_TIMEOUT = 10


def get_by_xpath(driver, xpath):
    """
    Return the web element matching the given XPath, waiting up to
    WAIT_TIMEOUT seconds for it to be present.

    :param driver: Selenium web driver to use.
    :param xpath: XPath of the element to locate.
    :return: the located web element.
    """
    return WebDriverWait(driver, WAIT_TIMEOUT).until(
        ec.presence_of_element_located(
            (By.XPATH, xpath)
        ))
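
A short usage sketch (the URL and XPath are placeholders, not from this commit). If no match appears within WAIT_TIMEOUT seconds, WebDriverWait.until raises TimeoutException, already imported above:

driver = webdriver.Firefox()
driver.get('https://example.com/login')  # placeholder URL
try:
    field = get_by_xpath(driver, '//input[@id="username"]')  # placeholder XPath
    field.send_keys('hello')
except TimeoutException:
    # get_by_xpath waited WAIT_TIMEOUT seconds without finding the element.
    print('element not found in time')
finally:
    driver.quit()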
