
Commit bd761a1

init
0 parents  commit bd761a1

24 files changed: +557 -0 lines changed

.gitignore

+16
@@ -0,0 +1,16 @@
gaia/
bkb/
.DS_Store
venv/
.vscode/
.scrapy/
dev/
__pycache__/
.env
dump/
credentials.json
R/
pipeline/
achieve/
waurzenczak.json
andryas.json

README.md

+17
@@ -0,0 +1,17 @@
# Quick tutorial for scraping on GCP with no database

Long story short, the idea is to schedule a machine that starts, runs the web scraping, and then shuts itself down. To avoid having a database, we write each crawled page into a JSON Lines file and, once the collection finishes, send that file to a bucket.

- create a VM
- create a schedule for the VM using crontab
- create a bucket, note its name, and use it in the next step
- create a `.env` file in this repository and set `BUCKET=bucket-name` (a small sanity-check sketch follows after this README)
- send this repository to the VM (see `deploy.sh`)
- run `sudo bash setup.sh`
- configure crontab, for instance `15 12 * * MON bash /home/wavrzenczak/scraping/crawl.sh test`
- note that the schedule that starts the machine needs to be close to the crontab entry inside the VM; I'd put them about 15 minutes apart, just in case

Now, look at how `src/spiders/test.py` is built, create another spider following
the same structure together with `pipeline.py`, and be happy.
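
For the `.env` and bucket step, here is a minimal sanity-check sketch you could run on the VM before scheduling anything. It only assumes the packages already listed in requirements.txt (python-dotenv and google-cloud-storage) and the VM's default credentials; the file name check_bucket.py is hypothetical and not part of this commit.

# check_bucket.py (hypothetical helper, not part of this commit)
from os import getenv
from dotenv import load_dotenv
from google.cloud import storage

load_dotenv()                      # reads BUCKET from the .env file next to this script
bucket_name = getenv("BUCKET")

client = storage.Client()          # uses the VM's default service account credentials
print(bucket_name, "exists:", client.bucket(bucket_name).exists())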

connect.sh

+1
@@ -0,0 +1 @@
gcloud compute ssh --zone "us-central1-c" "scraping" --project "waurzenczak"

crawl.sh

+18
@@ -0,0 +1,18 @@
#!/bin/bash
cd /home/wavrzenczak/scraping
source venv/bin/activate

# get the start date and time
start_datetime=$(date '+%m_%d_%Y_%H_%M_%S')
echo "${start_datetime} - starting spider"

# prevent click, which pipenv relies on, from freaking out due to lack of locale info https://click.palletsprojects.com/en/7.x/python3/
export LC_ALL=en_US.utf-8

scrapy crawl "$1"

# -a debug=$DEBUG &> "logs/log_${start_datetime}.txt"
# get the end date and time
end_datetime=$(date '+%m_%d_%Y_%H_%M_%S')
echo "${end_datetime} - spider finished successfully"
sudo shutdown -h now   # shut the VM down once the crawl is done (assumes passwordless sudo for shutdown)

deploy.sh

+4
@@ -0,0 +1,4 @@

rsync -rv --exclude=.gitignore --exclude=.git --exclude=.vscode --exclude=venv/ . scraping/
gcloud compute scp --recurse scraping/ scraping:/home/wavrzenczak/scraping --zone us-central1-c
sudo rm -rf scraping/

requirements.txt

+21
@@ -0,0 +1,21 @@
wheel
Scrapy==2.6.2
scrapy-splash==0.8.0
scrapy-user-agents==0.1.1

python-dotenv==0.19.2
python-decouple==3.0

lxml==4.9.1
requests==2.28.1

pyopenssl==22.0.0
cryptography==36.0.0
dateparser==1.1.2
unidecode==1.3.4

psycopg2-binary==2.9.5
random_user_agent==1.0.1

jsonlines==3.1.0
google-cloud-storage

scrapy.cfg

+11
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = src.settings

[deploy]
#url = http://localhost:6800/
project = src.settings

setup.sh

+9
@@ -0,0 +1,9 @@
apt update && apt -y upgrade
# apt -y install python3-venv
apt -y install python3.9
apt -y install python3.9-venv
# apt -y install python3-virtualenv
# apt -y install python3 python3-pip
python3.9 -m venv venv   # python3.9-venv provides the venv module; the virtualenv command is not installed above
source venv/bin/activate
pip install -r requirements.txt

src/__init__.py

+4
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

src/middlewares.py

+102
@@ -0,0 +1,102 @@
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class ScrSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class ScrDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
src/pipeline.py

+26
@@ -0,0 +1,26 @@
import json
from itemadapter import ItemAdapter
from src.utils.gcp import *
from src.utils.destymd import destymd
from src.settings import *


class JsonWriterPipeline:

    def __init__(self):
        pass

    def open_spider(self, spider):
        self.file = open(spider.name + ".jsonl", "w")

    def close_spider(self, spider):
        self.file.close()
        upload_blob(
            BUCKET,
            spider.name + ".jsonl",
            destymd(spider.name, "jsonl")
        )

    def process_item(self, item, spider):
        line = json.dumps(ItemAdapter(item).asdict()) + "\n"
        self.file.write(line)
        return item
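
The pipeline above relies on two helpers that are part of this commit but not shown in this view: upload_blob from src/utils/gcp.py and destymd from src/utils/destymd.py. Here is a rough sketch of what they might look like, assuming upload_blob wraps google-cloud-storage (which is in requirements.txt) and destymd builds a dated destination path for the bucket; the signatures and the path layout are assumptions, not the committed code.

# src/utils/gcp.py (sketch, assuming a thin wrapper around google-cloud-storage)
from google.cloud import storage

def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Upload a local file to the given GCS bucket."""
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    blob.upload_from_filename(source_file_name)


# src/utils/destymd.py (sketch; assumed to build a "name/YYYY/MM/DD/name.ext" path,
# the real helper may differ)
from datetime import date

def destymd(name, ext):
    """Build a dated destination path such as test_gcp/2022/11/07/test_gcp.jsonl."""
    today = date.today()
    return f"{name}/{today:%Y}/{today:%m}/{today:%d}/{name}.{ext}"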

src/settings.py

+103
@@ -0,0 +1,103 @@
from dotenv import load_dotenv
from os import getenv

load_dotenv()

BUCKET = getenv("BUCKET")

BOT_NAME = 'src'

SPIDER_MODULES = ['src.spiders']
NEWSPIDER_MODULE = 'src.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'scr (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.25
# The download delay setting will honor only one of:
CONCURRENT_REQUESTS_PER_DOMAIN = 32
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = True

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'scr.middlewares.ScrSpiderMiddleware': 543,
#}

DOWNLOAD_HANDLERS = {
    'http': 'scrapy.core.downloader.handlers.http.HTTPDownloadHandler',
    'https': 'scrapy.core.downloader.handlers.http.HTTPDownloadHandler',
    'ftp': None,
    'file': None,
    's3': None
}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 720,
    'scrapy_splash.SplashMiddleware': 730,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,

    # USER-AGENT
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'scrapy_user_agents.middlewares.RandomUserAgentMiddleware': 400,
}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# ITEM_PIPELINES = {
#     'src.pipelines.Pipeline': 300,
# }

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
# The initial download delay
AUTOTHROTTLE_START_DELAY = 2
# The maximum download delay to be set in case of high latencies
AUTOTHROTTLE_MAX_DELAY = 20
# The average number of requests Scrapy should be sending in parallel to
# each remote server
AUTOTHROTTLE_TARGET_CONCURRENCY = 20
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
HTTPCACHE_ENABLED = False
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# FEEDS = {
#     'data.json': {'format': 'json', 'overwrite': True}
#     # 'data.jsonl': {'format': 'jsonlines', 'overwrite': True}
# }

src/spiders/__init__.py

+4
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

src/spiders/test.py

+34
@@ -0,0 +1,34 @@
# -*- coding: utf-8 -*-
import scrapy

from src.settings import *
from src.utils.lubridate import now


class TestGCPSpider(scrapy.Spider):
    name = 'test_gcp'

    custom_settings = {
        'ITEM_PIPELINES': {
            'src.pipeline.JsonWriterPipeline': 300
        }
    }

    def start_requests(self):
        urls = [
            'https://quotes.toscrape.com/page/1/',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'created_at': now(False),
                'text': quote.css('span.text::text').get(),
                'author': quote.css('small.author::text').get(),
                'tags': quote.css('div.tags a.tag::text').getall()
            }
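
The spider imports now from src/utils/lubridate.py, another helper included in the commit but not displayed here. A minimal sketch follows, assuming the boolean argument switches between a date-only string and a full timestamp; the real signature and format may differ.

# src/utils/lubridate.py (sketch; named after the R lubridate package, behaviour assumed)
from datetime import datetime

def now(date_only=True):
    """Return the current time as a string; date only when date_only is True."""
    if date_only:
        return datetime.now().strftime("%Y-%m-%d")
    return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

Under that assumption, the now(False) call in the spider stamps each scraped item with a full timestamp.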

src/utils/__init__.py

Whitespace-only changes.

src/utils/chunks.py

+5
@@ -0,0 +1,5 @@
# https://www.codegrepper.com/code-examples/python/split+a+list+in+100+each
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]
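
A quick usage example for the helper above, for instance to batch a list of URLs before yielding requests; the batching use case is an assumption, since chunks is not referenced elsewhere in this commit.

from src.utils.chunks import chunks

urls = [f"https://quotes.toscrape.com/page/{i}/" for i in range(1, 11)]
for batch in chunks(urls, 3):
    print(batch)  # three URLs at a time; the last batch may be shorter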
