From 55091090a4d10a3ebce243a6a2e0af8b9f6120e6 Mon Sep 17 00:00:00 2001
From: eracle <ercole.adeluca@gmail.com>
Date: Fri, 7 Oct 2016 20:09:43 +0200
Subject: [PATCH] first commit

Former-commit-id: 184835d87ff044b12c034a133b3e821ff92d5b1c
---
 .gitignore                                    |   4 +
 .idea/linkedin.iml                            |  11 ++
 .idea/misc.xml                                |  14 +++
 .idea/modules.xml                             |   8 ++
 .idea/workspace.xml.REMOVED.git-id            |   1 +
 conf_template.py                              |  12 +++
 linkedin/__init__.py                          |   0
 linkedin/__pycache__/__init__.cpython-34.pyc  | Bin 0 -> 141 bytes
 .../__pycache__/middlewares.cpython-34.pyc    | Bin 0 -> 1270 bytes
 linkedin/__pycache__/settings.cpython-34.pyc  | Bin 0 -> 337 bytes
 linkedin/items.py                             |  14 +++
 linkedin/middlewares.py                       |  27 +++++
 linkedin/pipelines.py                         |  11 ++
 linkedin/settings.py                          |  94 ++++++++++++++++++
 linkedin/spiders/__init__.py                  |   4 +
 .../__pycache__/__init__.cpython-34.pyc       | Bin 0 -> 149 bytes
 .../__pycache__/linkedin.cpython-34.pyc       | Bin 0 -> 1879 bytes
 linkedin/spiders/linkedin.py                  |  47 +++++++++
 readme.md                                     |  13 +++
 requirements.txt                              |  20 ++++
 scraper.py                                    |   0
 scrapy.cfg                                    |  11 ++
 selenium_utils.py                             |  28 ++++++
 23 files changed, 319 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 .idea/linkedin.iml
 create mode 100644 .idea/misc.xml
 create mode 100644 .idea/modules.xml
 create mode 100644 .idea/workspace.xml.REMOVED.git-id
 create mode 100644 conf_template.py
 create mode 100644 linkedin/__init__.py
 create mode 100644 linkedin/__pycache__/__init__.cpython-34.pyc
 create mode 100644 linkedin/__pycache__/middlewares.cpython-34.pyc
 create mode 100644 linkedin/__pycache__/settings.cpython-34.pyc
 create mode 100644 linkedin/items.py
 create mode 100644 linkedin/middlewares.py
 create mode 100644 linkedin/pipelines.py
 create mode 100644 linkedin/settings.py
 create mode 100644 linkedin/spiders/__init__.py
 create mode 100644 linkedin/spiders/__pycache__/__init__.cpython-34.pyc
 create mode 100644 linkedin/spiders/__pycache__/linkedin.cpython-34.pyc
 create mode 100644 linkedin/spiders/linkedin.py
 create mode 100644 readme.md
 create mode 100644 requirements.txt
 create mode 100644 scraper.py
 create mode 100644 scrapy.cfg
 create mode 100644 selenium_utils.py
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..8ad36ac
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+conf.py
+
+__pycache__/
+selenium_prototype.py
diff --git a/.idea/linkedin.iml b/.idea/linkedin.iml
new file mode 100644
index 0000000..24eeee8
--- /dev/null
+++ b/.idea/linkedin.iml
@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="jdk" jdkName="Python 3.4.3 virtualenv at ~/python/virtualenvs/linkedin" jdkType="Python SDK" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+  <component name="TestRunnerService">
+    <option name="PROJECT_TEST_RUNNER" value="Unittests" />
+  </component>
+</module>
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..d3c9685
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,14 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectLevelVcsManager" settingsEditedManually="false">
+    <OptionsSetting value="true" id="Add" />
+    <OptionsSetting value="true" id="Remove" />
+    <OptionsSetting value="true" id="Checkout" />
+    <OptionsSetting value="true" id="Update" />
+    <OptionsSetting value="true" id="Status" />
+    <OptionsSetting value="true" id="Edit" />
+    <ConfirmationsSetting value="0" id="Add" />
+    <ConfirmationsSetting value="0" id="Remove" />
+  </component>
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.4.3 virtualenv at ~/python/virtualenvs/linkedin" project-jdk-type="Python SDK" />
+</project>
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..b0d6a75
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/linkedin.iml" filepath="$PROJECT_DIR$/.idea/linkedin.iml" />
+    </modules>
+  </component>
+</project>
\ No newline at end of file
diff --git a/.idea/workspace.xml.REMOVED.git-id b/.idea/workspace.xml.REMOVED.git-id
new file mode 100644
index 0000000..f2febc0
--- /dev/null
+++ b/.idea/workspace.xml.REMOVED.git-id
@@ -0,0 +1 @@
+495ac8de055a5ef5c301d3562b32de876843e05d
\ No newline at end of file
diff --git a/conf_template.py b/conf_template.py
new file mode 100644
index 0000000..68e637f
--- /dev/null
+++ b/conf_template.py
@@ -0,0 +1,12 @@
+import logging
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+# Only show warnings and errors from the scrapy and selenium loggers.
+logging.getLogger('scrapy').setLevel(logging.WARNING)
+logging.getLogger('selenium').setLevel(logging.WARNING)
+
+# Credentials the spider types into the LinkedIn login form; fill in your own.
+email = ''
+password = ''
diff --git a/linkedin/__init__.py b/linkedin/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/linkedin/__pycache__/__init__.cpython-34.pyc b/linkedin/__pycache__/__init__.cpython-34.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a02c717df34a168d182efb17fa36f05afccc78f0
GIT binary patch
literal 141
zcmaFI!^<U7^DUeK2p)q77+?f49Dul(1xTbYFa&Ed`mJOr0tq9CUsn1V`MIh3sYQv&
zIjQ;ul_eSZdHMxK`B|ySCB^zVnR(f%DVcc)Mtpo`US>&ryk0@&Ee@O9{FKt1R6CHF
I#X!se08!#2k^lez

literal 0
HcmV?d00001

diff --git a/linkedin/__pycache__/middlewares.cpython-34.pyc b/linkedin/__pycache__/middlewares.cpython-34.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cc9e832ec185bcfeab5e42c0fb3780034bae86ea
GIT binary patch
literal 1270
zcmZuwOK;Oa5FW>mIFC{!F1!vAPX#Hpgb?BYs1gT|fC^C%QV_Bn?>2F|_L|*wOQQ4?
z{tUl^f3XKHoVajB;>3)TmV&zS%<k;Y%s2Dx`e%2eefQh1cbgu-cQ`fBhHVu63Khf8
z0RfOZFm%8ny8)sBZ)oL$aAD}yULFV!avz31I8T`mEP&7G5&#ciiC#@|-|aVlU{>fm
zw)@QF;x*T$EHp>kxwC^hG%_B|4A&{S9ghYswo&v|R2--hgdaAjHY^8ms0d2$*>IN&
zUSw6Cy68-!i4vkbLD64PEgCQdIC8-_Fs+j;v9*N@Qx_Jv<pJ62!^X<+0nZU)*?=Pt
z79LCkFin_-^)cBkSi(uvI>jo=ucL77{~Kl6H9Yb|a2-OJcF8!x+#9ud4p<Ww0k9BQ
z3s{@#TRQG;zq2#HdEDvCj4{Cv6UBRj-k2Mc72}?n@LrjW`TX{oZmG!9{a!9HMwTYS
zOPJq}B9<N0GOCP`#ry44Bw9bZMs9<(TaPJOGI>}GQUo;DKFqJ532>NXrne_muSjyP
z=eMG$5+?~qN*-!H$dhs)GL1<hkwuexQ8K@AW-<g)89$)DMv~2}SL=`$sbsi0_banE
zcwn1~f2_DR{Y@J%l^t+p10ly_oV~2rlh#FctXnBzyST!a7Jn)eGqEkr3l{J5nYP~Z
zTJOAl@xuBwOSVZl5Jq|Ucx;DQjkT;)$`!p#+es^>cwu5p+;3V>bFruB#o0iY8RJT|
z&=_`z#&|`e^@tV~;Qkj-mvH+1D4NKeM_eVT;88g<6In#1l2e`<jr<h*9DDt{$WLxv
zZ<RB<iLNQvI$m=?uU=n91x~B6;haa=bj~~N)$EZqzj#dc)_=h}^drSF)fmUN9mlz3
zm7sMujz3n3Se>XVn7*$5lp;*!qoU?C5!7{RyC2$)PE}IQwkF1uEAGxAS@rK4S0)qM
uhR2QC3v1tcA3G#IRyPpH5s^|SJq1iGDnhs6!&SYH9V&rt0vmU_NWvdU>{J2(

literal 0
HcmV?d00001

diff --git a/linkedin/__pycache__/settings.cpython-34.pyc b/linkedin/__pycache__/settings.cpython-34.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8d3f06d1be302bdda0fc6b9c2164198b4a3c2c3a
GIT binary patch
literal 337
zcmaFI!^<^w_xJFA9tMWT3`l?($aVnYVjUon!oUy(BpDgf85yD&L5wISh7=}-C}xIC
zAeWgTg@wT^iiIJCl_6M@Es7&2GcP+eB{Q!|0LIcQF33ztEh-Lqc>-u&l`LF5H!~$A
zC$&7WD79EGI5j6VFS9h)Pm}c)hm(Iuyq}}5>n*<E08ba!pm<+@mrx(q;9CNIuHh&`
zxA=nmfvST;B0}Q*om?YtiMjZP`}z1gf^~VixcIn+I|jK1uVg4<2U-p$ep%^f<maa9
zrxqn9=cMWvRF-7q=jj&|<!7ZPmlW&6oQGf(r<Rmt=A{?w6;$5hu*uC&Da}c>0|pvH
SF&B_vW8!3FV`p<?W&{A)zFwLD

literal 0
HcmV?d00001

diff --git a/linkedin/items.py b/linkedin/items.py
new file mode 100644
index 0000000..6c3f645
--- /dev/null
+++ b/linkedin/items.py
@@ -0,0 +1,14 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your scraped items
+#
+# See documentation in:
+# http://doc.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class LinkedinItem(scrapy.Item):
+    """Placeholder item: no fields are declared yet.
+
+    Declare fields as class attributes, e.g. ``name = scrapy.Field()``."""
diff --git a/linkedin/middlewares.py b/linkedin/middlewares.py
new file mode 100644
index 0000000..a2f5a5d
--- /dev/null
+++ b/linkedin/middlewares.py
@@ -0,0 +1,27 @@
+from scrapy.http import HtmlResponse
+from selenium_utils import *
+import linkedin
+from scrapy.utils.python import to_bytes
+
+class Selenium(object):
+    """Downloader middleware that fetches every request with the spider's Selenium driver."""
+
+    def process_request(self, request, spider):
+        """Load ``request.url`` in ``spider.driver`` and return the rendered page.
+
+        :return: an HtmlResponse built from the driver's current URL and page source.
+        """
+        driver = spider.driver
+        logger.info('SeleniumMiddleware - getting the page')
+        driver.get(request.url)
+        logger.info('SeleniumMiddleware - click more options')
+        more_option = get_by_xpath(driver, '//div/div/button[@class="more-options dropdown-caret"]')
+        more_option.send_keys(Keys.NULL)  # presumably focuses the button before the click - confirm
+        more_option.click()
+        logger.info('SeleniumMiddleware - wait for names')
+        name = get_by_xpath(driver, '//ul[@class="browse-map-list"]/li/h4/a')
+        name.send_keys(Keys.NULL)
+        logger.info('SeleniumMiddleware - retrieving body')  # same logger as the messages above
+        body = to_bytes(driver.page_source)  # body must be of type bytes
+        return HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=request)
+
diff --git a/linkedin/pipelines.py b/linkedin/pipelines.py
new file mode 100644
index 0000000..57df283
--- /dev/null
+++ b/linkedin/pipelines.py
@@ -0,0 +1,11 @@
+# -*- coding: utf-8 -*-
+
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+
+
+class LinkedinPipeline(object):  # pass-through pipeline: performs no processing
+    def process_item(self, item, spider):  # receives each scraped item
+        return item  # hand the item on unchanged
diff --git a/linkedin/settings.py b/linkedin/settings.py
new file mode 100644
index 0000000..18684c3
--- /dev/null
+++ b/linkedin/settings.py
@@ -0,0 +1,94 @@
+# -*- coding: utf-8 -*-
+
+# Scrapy settings for linkedin project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     http://doc.scrapy.org/en/latest/topics/settings.html
+#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
+#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
+
+BOT_NAME = 'linkedin'  # same name as `project` in scrapy.cfg
+
+SPIDER_MODULES = ['linkedin.spiders']  # packages Scrapy searches for spiders
+NEWSPIDER_MODULE = 'linkedin.spiders'  # where `scrapy genspider` creates new spiders
+
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = 'linkedin (+http://www.yourdomain.com)'
+
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = True
+
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+
+# Configure a delay for requests for the same website (default: 0)
+# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+#DOWNLOAD_DELAY = 3
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+
+# Disable cookies (enabled by default)
+#COOKIES_ENABLED = False
+
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#   'Accept-Language': 'en',
+#}
+
+# Enable or disable spider middlewares
+# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    'linkedin.middlewares.MyCustomSpiderMiddleware': 543,
+#}
+
+# Enable or disable downloader middlewares
+# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
+#DOWNLOADER_MIDDLEWARES = {
+#    'linkedin.middlewares.MyCustomDownloaderMiddleware': 543,
+#}
+
+# Enable or disable extensions
+# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    'scrapy.extensions.telnet.TelnetConsole': None,
+#}
+
+# Configure item pipelines
+# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
+#ITEM_PIPELINES = {
+#    'linkedin.pipelines.SomePipeline': 300,
+#}
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+
+# Enable and configure HTTP caching (disabled by default)
+# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = 'httpcache'
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
+
+DOWNLOADER_MIDDLEWARES = {
+    'linkedin.middlewares.Selenium': 200  # order value of the Selenium-backed middleware
+}
diff --git a/linkedin/spiders/__init__.py b/linkedin/spiders/__init__.py
new file mode 100644
index 0000000..ebd689a
--- /dev/null
+++ b/linkedin/spiders/__init__.py
@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
diff --git a/linkedin/spiders/__pycache__/__init__.cpython-34.pyc b/linkedin/spiders/__pycache__/__init__.cpython-34.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7ffeda6389e5f5ca4149856d1f486ebbd2d04b82
GIT binary patch
literal 149
zcmaFI!^<U7^DTTK0|UcjAcg}*Aj<)Wi&=m~3Ijv1CZpd<h9ZzKg81d6pOK%Ns-Iev
zn4FWUUr<?+k)NkuP?VpQnp{$>pOcxFotl!FhhP*JWTvDR73;^xXXa&=#K-FuRNmsS
R$<0qG%}KQbnOzLT3;^=gCu9Hs

literal 0
HcmV?d00001

diff --git a/linkedin/spiders/__pycache__/linkedin.cpython-34.pyc b/linkedin/spiders/__pycache__/linkedin.cpython-34.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..517c109dce75a2d7ef5a93d6f1a0b577a0566cd0
GIT binary patch
literal 1879
zcmZuxUvJ|?5TA7%$4S#(IgYy?pi{1>m2y(+{?!#kx&DQ#C=t4pt7KGJj=gC%acp;Y
zb8Vvb0hN!ym*7kAF?d*s7kJ`@S0tX8b&_7WVr3`e`M2Yl-^^d-$<osAe;@yv2lxYS
zJtp#7c*F}NA^r$N0Pc`CAR=%yk{aL|@Ku<D(1agRZvq8q=ioXAuBrX%)%>3r1Zq{H
z$g@DR<HtZ|3y*k?1l?Ss@4|TK=3+zxPLaVV<LGW4Li}0!Cf9<nfYEJbAF{YZLl!GL
z8gYZ)T2iLsGsihUKVKY67lWkhu-J)uVe*Cr{sQ+|Jh!_#x3jXa;V&<(uP-mJzVLgp
zb<|JkW^MbMeXqX1{c88+d?rjfG~V1_iQIX~JDalaw={DQH<QgEVgcsxnsSMOqq)`X
zZa=2-JmH<Wm0cQ3Ci^-lDuypFnUD$Z*ZeL;gOU4wW_J+YcK6nvuWf9sFRguwb}YO=
z-9|vLVEWM&%;DL^BYr_5vG`a;0@nnHG_p<rToJe=(8fw)ZLv4TP?y(IG`s}xEKS=M
zv<te2nfEYMe)wXK^jLg!Zx=z9Zd(=Jv9iZj6Su93?^t07j8%qf0D_&bS0`$N2L~~e
zl`k2m&E%r8!;^DC`CukYx{Dzbej}n@167>%;wbUMqruF$)lB}k(}R!d)aOBq#itc?
zsK^$r944nMt~6vkc<eaOzTXNWUx>}wNQ)O}*Jsh}$>7OdzZA`|nPKMo@m<%k9=&Jo
zdW&A8%jDou=EuVEeM9l=Np(_Ln87LK%4CQpw|Gt)A!lzWcc(^`_?9P47EyOfQ0b*u
z9Li}NP`5DBD{JVWa;H?fWh7ps?_H$6Y^kE4ap-kuU#NWcjmjS9>*h1)RSjhd8Z}iR
z9p3XZ@F|4TKRZq<=~9PsKZvN4_GK%H9kglFKniC(SZ^7TvXJt(n#HuQY|mrZQ_m}F
zls-fPWRjH0G`Vj~8<XVm=nUQ)5Ax#JBhUcXkB!H|qxXmS35iB40^0zIz|J;?#_VN=
zctknAT;<>tEljvLgv%W4o$SFCB0jI@Kq5sTYY*fQ@;DUefkR+I+Xh*Lb_p(V!<gAc
z8pK3fpRK_0#rxW7?_>h53UFC~*W~O6IL_dkuMM8o1Ma8&O6qe#D+?8DL<YPV2VGPm
zoch7M<Mg5%=ruBgFLeEML2GAEu$WG3&GD7l!Zs@#*GbgygHE-~3;5w2cQtTb3-@7@
zVP9!PySP)tO}}|jE6uNUzs=frhllQ@hJ!C9XN{hu%JS0`5uq%R<*Ev==vj{nsq$IY
zRh#QhDVtM~CNYkmzE|b5PjKxfPjF3XtZIQWH3$@GW_THrduYxdA$fvFJU{}Zh)up{
zSlIY!a*uq1Z-c3uc4ybtss#-d9n|ww$@98N*u!b}RN3>+dVVz2@DDMZDhcUx>GgOd
z_%sT#>u#l>!OOEe^m^3!3k{(%u}CpjzR$I^n&%(jpB|}*#;n)%dXhy#S=^6t-6*S`
afjP7NztGD5e3;@FdL0YhlCn`E<=nr+F7ht`

literal 0
HcmV?d00001

diff --git a/linkedin/spiders/linkedin.py b/linkedin/spiders/linkedin.py
new file mode 100644
index 0000000..e2a4d17
--- /dev/null
+++ b/linkedin/spiders/linkedin.py
@@ -0,0 +1,47 @@
+import scrapy
+
+from selenium_utils import *
+import random
+
+
+class Linkedin(scrapy.Spider):
+    """Spider that logs into LinkedIn through Selenium and follows browse-map profile links."""
+
+    name = "linkedin"
+    start_urls = ['https://www.linkedin.com/in/ludovica-rain%C3%B2-8a1055113?authType=NAME_SEARCH&authToken=E2lZ&trk=tyah&trkInfo=clickedVertical%3Amynetwork%2CentityType%3AentityHistoryName%2CclickedEntityId%3Amynetwork_474885049%2Cidx%3A8']
+
+    def __init__(self, *args, **kwargs):
+        """Start a Firefox session and log in with the configured credentials."""
+        # Let scrapy.Spider handle its own arguments (e.g. ``-a`` options).
+        super().__init__(*args, **kwargs)
+
+        logger.info('Init Firefox Browser')
+        profile = webdriver.FirefoxProfile()
+        profile.set_preference('dom.disable_beforeunload', True)
+        self.driver = webdriver.Firefox(profile)
+        self.driver.get('https://it.linkedin.com/')
+
+        logger.info('Searching for the Login btn')
+        get_by_xpath(self.driver, '//*[@class="login-email"]').send_keys(email)
+
+        logger.info('Searching for the password btn')
+        get_by_xpath(self.driver, '//*[@class="login-password"]').send_keys(password)
+
+        logger.info('Searching for the submit')
+        get_by_xpath(self.driver, '//*[@id="login-submit"]').click()
+
+    def closed(self, reason):
+        """Quit the browser when the spider closes so Firefox is not left running."""
+        self.driver.quit()
+
+    def parse(self, response):
+        """Return a request for every profile link in the page's browse-map list."""
+        logger.info('Scrapy parse - get the names list')
+        names = self.driver.find_elements_by_xpath('//ul[@class="browse-map-list"]/li/h4/a')
+        # Read every href up front: the middleware reuses this driver for the
+        # next page, after which these elements may have gone stale.
+        links = []
+        for name in names:
+            name.send_keys(Keys.NULL)
+            links.append(name.get_attribute('href'))
+        return [scrapy.Request(link, callback=self.parse) for link in links if link]
diff --git a/readme.md b/readme.md
new file mode 100644
index 0000000..4de118e
--- /dev/null
+++ b/readme.md
@@ -0,0 +1,13 @@
+# Linkedin Scraping
+
+Scraping software that simply visits the pages of LinkedIn users. The purpose is to gain visibility, because LinkedIn notifies users when you view their profile page.
+
+Uses: Scrapy, Selenium web driver and Firefox 45
+
+# Install
+    pip install -r requirements.txt
+    
+# Usage:
+Copy conf_template.py to conf.py, fill in your LinkedIn email and password, and type:
+
+    scrapy crawl linkedin
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..a138bcc
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,20 @@
+attrs==16.2.0
+cffi==1.8.3
+cryptography==1.5.2
+cssselect==0.9.2
+idna==2.1
+lxml==3.6.4
+parsel==1.0.3
+pyasn1==0.1.9
+pyasn1-modules==0.0.8
+pycparser==2.14
+PyDispatcher==2.0.5
+pyOpenSSL==16.1.0
+queuelib==1.4.2
+Scrapy==1.2.0
+selenium==2.53.6
+service-identity==16.0.0
+six==1.10.0
+Twisted==16.4.1
+w3lib==1.15.0
+zope.interface==4.3.2
diff --git a/scraper.py b/scraper.py
new file mode 100644
index 0000000..e69de29
diff --git a/scrapy.cfg b/scrapy.cfg
new file mode 100644
index 0000000..f3f1965
--- /dev/null
+++ b/scrapy.cfg
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.org/en/latest/deploy.html
+
+[settings]
+default = linkedin.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = linkedin
diff --git a/selenium_utils.py b/selenium_utils.py
new file mode 100644
index 0000000..1ff668b
--- /dev/null
+++ b/selenium_utils.py
@@ -0,0 +1,28 @@
+from selenium.common.exceptions import TimeoutException, StaleElementReferenceException, NoSuchElementException
+from selenium.webdriver.firefox import webdriver
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as ec
+from selenium.webdriver.common.by import By
+from selenium.webdriver.common.keys import Keys
+from selenium import webdriver
+from conf import *
+
+
+
+# Default number of seconds to wait for an element before giving up.
+WAIT_TIMEOUT = 10
+
+
+def get_by_xpath(driver, xpath, timeout=WAIT_TIMEOUT):
+    """
+    Wait until an element matching the xpath is present, then return it.
+
+    :param driver: Selenium web driver to use.
+    :param xpath: xpath to use.
+    :param timeout: maximum seconds to wait; defaults to WAIT_TIMEOUT.
+    :return: The web element
+    :raises TimeoutException: if no element is found within the timeout.
+    """
+    return WebDriverWait(driver, timeout).until(
+        ec.presence_of_element_located((By.XPATH, xpath))
+    )