# -*- coding: utf-8 -*-
import urllib.parse

import scrapy


# scrapy crawl Glassdoor -a keyword="data engineer" -a count=10
class GlassdoorSpider(scrapy.Spider):
    """Scrape job listings from the first Glassdoor search-results page.

    Each yielded item is a dict with the keys ``title``, ``location``,
    ``employer`` and ``job-link``. A value is ``None`` when the matching
    attribute or element is absent from a listing.

    NOTE(review): the CSS selectors and ``data-*`` attribute names below
    reflect the page markup this spider was written against; confirm they
    still match the live site.
    """

    name = 'Glassdoor'
    allowed_domains = ['glassdoor.com']
    # Site root: used to build the search URL and to make job links absolute.
    url = 'https://www.glassdoor.com'

    def __init__(self, keyword=None, count=20, **kwargs):
        """Store the search parameters.

        Args:
            keyword: Search phrase placed in the ``sc.keyword`` query
                parameter. ``None`` results in an empty keyword.
            count: Maximum number of listings to yield. Converted with
                ``int()`` so string values (e.g. from the command line)
                are accepted; negative values are treated as 0.
            **kwargs: Any further spider arguments, forwarded to
                ``scrapy.Spider.__init__``.

        Raises:
            ValueError: If ``count`` cannot be converted to an integer.
        """
        super().__init__(**kwargs)
        self.keyword = keyword
        # A negative slice bound would drop listings from the end instead of
        # limiting the total, so clamp at zero.
        self.count = max(int(count), 0)

    def start_requests(self):
        """Yield the single request for the keyword search page."""
        # urlencode escapes spaces, '&', '#', etc. so the keyword cannot
        # corrupt the query string; None becomes an empty keyword rather
        # than the literal text "None".
        query = urllib.parse.urlencode({'sc.keyword': self.keyword or ''})
        search_url = '{}/Job/jobs.htm?{}'.format(self.url, query)
        yield scrapy.Request(url=search_url, callback=self.parse)

    def parse(self, response):
        """Yield one item dict per job listing, up to ``self.count`` items.

        Args:
            response: The response for the search-results page.
        """
        listings = response.css('ul.jlGrid li.react-job-listing')

        for listing in listings[:self.count]:
            href = listing.css('div div.jobHeader a::attr(href)').get()
            yield {
                # .get() keeps one listing with a missing attribute from
                # aborting the whole page with a KeyError.
                'title': listing.attrib.get('data-normalize-job-title'),
                'location': listing.attrib.get('data-job-loc'),
                # The employer is plain text; the site root belongs on the
                # link, not on the name.
                'employer': listing.css(
                    'div div.jobHeader a span::text').get(),
                'job-link': (
                    urllib.parse.urljoin(self.url, href) if href else None
                ),
            }