From 55091090a4d10a3ebce243a6a2e0af8b9f6120e6 Mon Sep 17 00:00:00 2001 From: eracle Date: Fri, 7 Oct 2016 20:09:43 +0200 Subject: [PATCH] first commit Former-commit-id: 184835d87ff044b12c034a133b3e821ff92d5b1c --- .gitignore | 4 + .idea/linkedin.iml | 11 ++ .idea/misc.xml | 14 +++ .idea/modules.xml | 8 ++ .idea/workspace.xml.REMOVED.git-id | 1 + conf_template.py | 12 +++ linkedin/__init__.py | 0 linkedin/__pycache__/__init__.cpython-34.pyc | Bin 0 -> 141 bytes .../__pycache__/middlewares.cpython-34.pyc | Bin 0 -> 1270 bytes linkedin/__pycache__/settings.cpython-34.pyc | Bin 0 -> 337 bytes linkedin/items.py | 14 +++ linkedin/middlewares.py | 27 +++++ linkedin/pipelines.py | 11 ++ linkedin/settings.py | 94 ++++++++++++++++++ linkedin/spiders/__init__.py | 4 + .../__pycache__/__init__.cpython-34.pyc | Bin 0 -> 149 bytes .../__pycache__/linkedin.cpython-34.pyc | Bin 0 -> 1879 bytes linkedin/spiders/linkedin.py | 47 +++++++++ readme.md | 13 +++ requirements.txt | 20 ++++ scraper.py | 0 scrapy.cfg | 11 ++ selenium_utils.py | 28 ++++++ 23 files changed, 319 insertions(+) create mode 100644 .gitignore create mode 100644 .idea/linkedin.iml create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/workspace.xml.REMOVED.git-id create mode 100644 conf_template.py create mode 100644 linkedin/__init__.py create mode 100644 linkedin/__pycache__/__init__.cpython-34.pyc create mode 100644 linkedin/__pycache__/middlewares.cpython-34.pyc create mode 100644 linkedin/__pycache__/settings.cpython-34.pyc create mode 100644 linkedin/items.py create mode 100644 linkedin/middlewares.py create mode 100644 linkedin/pipelines.py create mode 100644 linkedin/settings.py create mode 100644 linkedin/spiders/__init__.py create mode 100644 linkedin/spiders/__pycache__/__init__.cpython-34.pyc create mode 100644 linkedin/spiders/__pycache__/linkedin.cpython-34.pyc create mode 100644 linkedin/spiders/linkedin.py create mode 100644 readme.md create mode 100644 requirements.txt create mode 100644 scraper.py create mode 100644 scrapy.cfg create mode 100644 selenium_utils.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..8ad36ac --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +conf.py + +__pycache__/ +selenium_prototype.py diff --git a/.idea/linkedin.iml b/.idea/linkedin.iml new file mode 100644 index 0000000..24eeee8 --- /dev/null +++ b/.idea/linkedin.iml @@ -0,0 +1,11 @@ + + + + + + + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..d3c9685 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..b0d6a75 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/workspace.xml.REMOVED.git-id b/.idea/workspace.xml.REMOVED.git-id new file mode 100644 index 0000000..f2febc0 --- /dev/null +++ b/.idea/workspace.xml.REMOVED.git-id @@ -0,0 +1 @@ +495ac8de055a5ef5c301d3562b32de876843e05d \ No newline at end of file diff --git a/conf_template.py b/conf_template.py new file mode 100644 index 0000000..68e637f --- /dev/null +++ b/conf_template.py @@ -0,0 +1,12 @@ +import logging + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +logging.getLogger('scrapy').setLevel(logging.WARNING) + +logging.getLogger('selenium').setLevel(logging.WARNING) + + +email = '' +password = '' diff --git a/linkedin/__init__.py b/linkedin/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/linkedin/__pycache__/__init__.cpython-34.pyc b/linkedin/__pycache__/__init__.cpython-34.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a02c717df34a168d182efb17fa36f05afccc78f0 GIT binary patch literal 141 zcmaFI!^&ryk0@&Ee@O9{FKt1R6CHF I#X!se08!#2k^lez literal 0 HcmV?d00001 diff --git a/linkedin/__pycache__/middlewares.cpython-34.pyc b/linkedin/__pycache__/middlewares.cpython-34.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cc9e832ec185bcfeab5e42c0fb3780034bae86ea GIT binary patch literal 1270 zcmZuwOK;Oa5FW>mIFC{!F1!vAPX#Hpgb?BYs1gT|fC^C%QV_Bn?>2F|_L|*wOQQ4? z{tUl^f3XKHoVajB;>3)TmV&zS%fm zw)@QF;x*T$EHp>kxwC^hG%_B|4A&{S9ghYswo&v|R2--hgdaAjHY^8ms0d2$*>IN& zUSw6Cy68-!i4vkbLD64PEgCQdIC8-_Fs+j;v9*N@Qx_Jvjo=ucL77{~Kl6H9Yb|a2-OJcF8!x+#9ud4p}GQUo;DKFqJ532>NXrne_muSjyP z=eMG$5+?~qN*-!H$dhs)GL1cwu5p+;3V>bFruB#o0iY8RJT| z&=_`z#&|`e^@tV~;Qkj-mvH+1D4NKeM_eVT;88g<6In#1l2e`XVn7*$5lp;*!qoU?C5!7{RyC2$)PE}IQwkF1uEAGxAS@rK4S0)qM uhR2QC3v1tcA3G#IRyPpH5s^|SJq1iGDnhs6!&SYH9V&rt0vmU_NWvdU>{J2( literal 0 HcmV?d00001 diff --git a/linkedin/__pycache__/settings.cpython-34.pyc b/linkedin/__pycache__/settings.cpython-34.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8d3f06d1be302bdda0fc6b9c2164198b4a3c2c3a GIT binary patch literal 337 zcmaFI!^<^w_xJFA9tMWT3`l?($aVnYVjUon!oUy(BpDgf85yD&L5wISh7=}-C}xIC zAeWgTg@wT^iiIJCl_6M@Es7&2GcP+eB{Q!|0LIcQF33ztEh-Lqc>-u&l`LF5H!~$A zC$&7WD79EGI5j6VFS9h)Pm}c)hm(Iuyq}}5>n*0|pvH SF&B_vW8!3FV`ppOcxFotl!FhhP*JWTvDR73;^xXXa&=#K-FuRNmsS R$<0qG%}KQbnOzLT3;^=gCu9Hs literal 0 HcmV?d00001 diff --git a/linkedin/spiders/__pycache__/linkedin.cpython-34.pyc b/linkedin/spiders/__pycache__/linkedin.cpython-34.pyc new file mode 100644 index 0000000000000000000000000000000000000000..517c109dce75a2d7ef5a93d6f1a0b577a0566cd0 GIT binary patch literal 1879 zcmZuxUvJ|?5TA7%$4S#(IgYy?pi{1>m2y(+{?!#kx&DQ#C=t4pt7KGJj=gC%acp;Y zb8Vvb0hN!ym*7kAF?d*s7kJ`@S0tX8b&_7WVr3`e`M2Yl-^^d-$3r1Zq{H z$g@DR%;wbUMqruF$)lB}k(}R!d)aOBq#itc? zsK^$r944nMt~6vkc}wNQ)O}*Jsh}$>7OdzZA`|nPKMo@m<%k9=&Jo zdW&A8%jDou=EuVEeM9l=Np(_Ln87LK%4CQpw|Gt)A!lzWcc(^`_?9P47EyOfQ0b*u z9Li}NP`5DBD{JVWa;H?fWh7ps?_H$6Y^kE4ap-kuU#NWcjmjS9>*h1)RSjhd8Z}iR z9p3XZ@F|4TKRZq<=~9PsKZvN4_GK%H9kglFKniC(SZ^7TvXJt(n#HuQY|mrZQ_m}F zls-fPWRjH0G`Vj~8tEljvLgv%W4o$SFCB0jI@Kq5sTYY*fQ@;DUefkR+I+Xh*Lb_p(V!h53UFC~*W~O6IL_dkuMM8o1Ma8&O6qe#D+?8DL+dVVz2@DDMZDhcUx>GgOd z_%sT#>u#l>!OOEe^m^3!3k{(%u}CpjzR$I^n&%(jpB|}*#;n)%dXhy#S=^6t-6*S` afjP7NztGD5e3;@FdL0YhlCn`E<=nr+F7ht` literal 0 HcmV?d00001 diff --git a/linkedin/spiders/linkedin.py b/linkedin/spiders/linkedin.py new file mode 100644 index 0000000..e2a4d17 --- /dev/null +++ b/linkedin/spiders/linkedin.py @@ -0,0 +1,47 @@ +import scrapy + +from selenium_utils import * +import random + + +class Linkedin(scrapy.Spider): + name = "linkedin" + start_urls = ['https://www.linkedin.com/in/ludovica-rain%C3%B2-8a1055113?authType=NAME_SEARCH&authToken=E2lZ&trk=tyah&trkInfo=clickedVertical%3Amynetwork%2CentityType%3AentityHistoryName%2CclickedEntityId%3Amynetwork_474885049%2Cidx%3A8'] + + + def __init__(self): + logger.info('Init Firefox Browser') + profile = webdriver.FirefoxProfile() + profile.set_preference('dom.disable_beforeunload', True) + self.driver = webdriver.Firefox(profile) + + self.driver.get('https://it.linkedin.com/') + + logger.info('Searching for the Login btn') + get_by_xpath(self.driver, '//*[@class="login-email"]').send_keys(email) + + logger.info('Searching for the password btn') + get_by_xpath(self.driver, '//*[@class="login-password"]').send_keys(password) + + logger.info('Searching for the submit') + get_by_xpath(self.driver, '//*[@id="login-submit"]').click() + + + def parse(self, response): + driver = self.driver + + logger.info('Scrapy parse - get the names list') + names = driver.find_elements_by_xpath('//ul[@class="browse-map-list"]/li/h4/a') + + frontier = [] + for name in names: + name.send_keys(Keys.NULL) + link = name.get_attribute('href') + frontier.append(scrapy.Request(link, callback=self.parse)) + + for f in frontier: + yield f + + + + diff --git a/readme.md b/readme.md new file mode 100644 index 0000000..4de118e --- /dev/null +++ b/readme.md @@ -0,0 +1,13 @@ +# Linkedin Scraping + +Scraping software aimed to simply visit the pages of the linkedin users, the purpose is to gain visibility, because linkedin notifies when you watch another user page. + +Uses: Scrapy, Selenium web driver and Firefox 45 + +# Install + pip install -r requirements.txt + +# Usage: +Modify the conf.py file and type: + + scrapy crawl linkedin \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..a138bcc --- /dev/null +++ b/requirements.txt @@ -0,0 +1,20 @@ +attrs==16.2.0 +cffi==1.8.3 +cryptography==1.5.2 +cssselect==0.9.2 +idna==2.1 +lxml==3.6.4 +parsel==1.0.3 +pyasn1==0.1.9 +pyasn1-modules==0.0.8 +pycparser==2.14 +PyDispatcher==2.0.5 +pyOpenSSL==16.1.0 +queuelib==1.4.2 +Scrapy==1.2.0 +selenium==2.53.6 +service-identity==16.0.0 +six==1.10.0 +Twisted==16.4.1 +w3lib==1.15.0 +zope.interface==4.3.2 diff --git a/scraper.py b/scraper.py new file mode 100644 index 0000000..e69de29 diff --git a/scrapy.cfg b/scrapy.cfg new file mode 100644 index 0000000..f3f1965 --- /dev/null +++ b/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.org/en/latest/deploy.html + +[settings] +default = linkedin.settings + +[deploy] +#url = http://localhost:6800/ +project = linkedin diff --git a/selenium_utils.py b/selenium_utils.py new file mode 100644 index 0000000..1ff668b --- /dev/null +++ b/selenium_utils.py @@ -0,0 +1,28 @@ +from selenium.common.exceptions import TimeoutException, StaleElementReferenceException, NoSuchElementException +from selenium.webdriver.firefox import webdriver +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as ec +from selenium.webdriver.common.by import By +from selenium.webdriver.common.keys import Keys +from selenium import webdriver +from conf import * + + + +""" +number of seconds used to wait the web page's loading. +""" +WAIT_TIMEOUT = 10 + + +def get_by_xpath(driver, xpath): + """ + Get a web element through the xpath passed by performing a Wait on it. + :param driver: Selenium web driver to use. + :param xpath: xpath to use. + :return: The web element + """ + return WebDriverWait(driver, WAIT_TIMEOUT).until( + ec.presence_of_element_located( + (By.XPATH, xpath) + ))