
Commit bd761a1

init
0 parents  commit bd761a1

24 files changed: +557 -0 lines changed

.gitignore

+16
@@ -0,0 +1,16 @@
gaia/
bkb/
.DS_Store
venv/
.vscode/
.scrapy/
dev/
__pycache__/
.env
dump/
credentials.json
R/
pipeline/
achieve/
waurzenczak.json
andryas.json

README.md

+17
@@ -0,0 +1,17 @@
# Quick tutorial for scraping on GCP with no database

Long story short, the idea is to schedule a machine that starts, runs the web scraping, and then shuts itself down. To avoid having a database, we write each crawled page into a JSON Lines file and, once the collection finishes, send that file to a bucket.

- create a VM
- create a schedule for the VM using crontab
- create a bucket, note its name, and use it in the next step
- create a `.env` file in this repository and set `BUCKET=bucket-name` (a small sanity-check sketch follows after this README)
- send this repository to the VM (see `deploy.sh`)
- run `sudo bash setup.sh`
- configure crontab, for instance `15 12 * * MON bash /home/wavrzenczak/scraping/crawl.sh test`
- note that the schedule that starts the machine needs to be close to the crontab entry inside the VM; I'd put them about 15 minutes apart, just in case

Now, look at how `src/spiders/test.py` is built, create another spider following
the same structure together with `pipeline.py`, and be happy.
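
For the `.env` and bucket step, here is a minimal sanity-check sketch you could run on the VM before scheduling anything. It only assumes the packages already listed in requirements.txt (python-dotenv and google-cloud-storage) and the VM's default credentials; the file name check_bucket.py is hypothetical and not part of this commit.

# check_bucket.py (hypothetical helper, not part of this commit)
from os import getenv
from dotenv import load_dotenv
from google.cloud import storage

load_dotenv()                      # reads BUCKET from the .env file next to this script
bucket_name = getenv("BUCKET")

client = storage.Client()          # uses the VM's default service account credentials
print(bucket_name, "exists:", client.bucket(bucket_name).exists())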

connect.sh

+1
@@ -0,0 +1 @@
gcloud compute ssh --zone "us-central1-c" "scraping" --project "waurzenczak"

crawl.sh

+18
@@ -0,0 +1,18 @@
#!/bin/bash
cd /home/wavrzenczak/scraping
source venv/bin/activate

# get the start date and time
start_datetime=$(date '+%m_%d_%Y_%H_%M_%S')
echo "${start_datetime} - starting spider"

# prevent click, which pipenv relies on, from freaking out due to lack of locale info https://click.palletsprojects.com/en/7.x/python3/
export LC_ALL=en_US.utf-8

scrapy crawl "$1"

# -a debug=$DEBUG &> "logs/log_${start_datetime}.txt"
# get the end date and time
end_datetime=$(date '+%m_%d_%Y_%H_%M_%S')
echo "${end_datetime} - spider finished successfully"
sudo shutdown -h now   # shut the VM down once the crawl is done (assumes passwordless sudo for shutdown)

deploy.sh

+4
@@ -0,0 +1,4 @@

rsync -rv --exclude=.gitignore --exclude=.git --exclude=.vscode --exclude=venv/ . scraping/
gcloud compute scp --recurse scraping/ scraping:/home/wavrzenczak/scraping --zone us-central1-c
sudo rm -rf scraping/

requirements.txt

+21
@@ -0,0 +1,21 @@
wheel
Scrapy==2.6.2
scrapy-splash==0.8.0
scrapy-user-agents==0.1.1

python-dotenv==0.19.2
python-decouple==3.0

lxml==4.9.1
requests==2.28.1

pyopenssl==22.0.0
cryptography==36.0.0
dateparser==1.1.2
unidecode==1.3.4

psycopg2-binary==2.9.5
random_user_agent==1.0.1

jsonlines==3.1.0
google-cloud-storage

scrapy.cfg

+11
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = src.settings

[deploy]
#url = http://localhost:6800/
project = src.settings

setup.sh

+9
@@ -0,0 +1,9 @@
apt update && apt -y upgrade
# apt -y install python3-venv
apt -y install python3.9
apt -y install python3.9-venv
# apt -y install python3-virtualenv
# apt -y install python3 python3-pip
python3.9 -m venv venv   # python3.9-venv provides the venv module; the virtualenv command is not installed above
source venv/bin/activate
pip install -r requirements.txt

src/__init__.py

+4
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

src/middlewares.py

+102
@@ -0,0 +1,102 @@
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class ScrSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class ScrDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
src/pipeline.py

+26
@@ -0,0 +1,26 @@
import json
from itemadapter import ItemAdapter
from src.utils.gcp import *
from src.utils.destymd import destymd
from src.settings import *


class JsonWriterPipeline:

    def __init__(self):
        pass

    def open_spider(self, spider):
        self.file = open(spider.name + ".jsonl", "w")

    def close_spider(self, spider):
        self.file.close()
        upload_blob(
            BUCKET,
            spider.name + ".jsonl",
            destymd(spider.name, "jsonl")
        )

    def process_item(self, item, spider):
        line = json.dumps(ItemAdapter(item).asdict()) + "\n"
        self.file.write(line)
        return item
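
The pipeline above relies on two helpers that are part of this commit but not shown in this view: upload_blob from src/utils/gcp.py and destymd from src/utils/destymd.py. Here is a rough sketch of what they might look like, assuming upload_blob wraps google-cloud-storage (which is in requirements.txt) and destymd builds a dated destination path for the bucket; the signatures and the path layout are assumptions, not the committed code.

# src/utils/gcp.py (sketch, assuming a thin wrapper around google-cloud-storage)
from google.cloud import storage

def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Upload a local file to the given GCS bucket."""
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    blob.upload_from_filename(source_file_name)


# src/utils/destymd.py (sketch; assumed to build a "name/YYYY/MM/DD/name.ext" path,
# the real helper may differ)
from datetime import date

def destymd(name, ext):
    """Build a dated destination path such as test_gcp/2022/11/07/test_gcp.jsonl."""
    today = date.today()
    return f"{name}/{today:%Y}/{today:%m}/{today:%d}/{name}.{ext}"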

src/settings.py

+103
@@ -0,0 +1,103 @@
from dotenv import load_dotenv
from os import getenv

load_dotenv()

BUCKET = getenv("BUCKET")

BOT_NAME = 'src'

SPIDER_MODULES = ['src.spiders']
NEWSPIDER_MODULE = 'src.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'scr (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.25
# The download delay setting will honor only one of:
CONCURRENT_REQUESTS_PER_DOMAIN = 32
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = True

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'scr.middlewares.ScrSpiderMiddleware': 543,
#}

DOWNLOAD_HANDLERS = {
    'http': 'scrapy.core.downloader.handlers.http.HTTPDownloadHandler',
    'https': 'scrapy.core.downloader.handlers.http.HTTPDownloadHandler',
    'ftp': None,
    'file': None,
    's3': None
}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 720,
    'scrapy_splash.SplashMiddleware': 730,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,

    # USER-AGENT
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'scrapy_user_agents.middlewares.RandomUserAgentMiddleware': 400,
}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# ITEM_PIPELINES = {
#     'src.pipelines.Pipeline': 300,
# }

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
# The initial download delay
AUTOTHROTTLE_START_DELAY = 2
# The maximum download delay to be set in case of high latencies
AUTOTHROTTLE_MAX_DELAY = 20
# The average number of requests Scrapy should be sending in parallel to
# each remote server
AUTOTHROTTLE_TARGET_CONCURRENCY = 20
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
HTTPCACHE_ENABLED = False
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# FEEDS = {
#     'data.json': {'format': 'json', 'overwrite': True}
#     # 'data.jsonl': {'format': 'jsonlines', 'overwrite': True}
# }

src/spiders/__init__.py

+4
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

src/spiders/test.py

+34
@@ -0,0 +1,34 @@
# -*- coding: utf-8 -*-
import scrapy

from src.settings import *
from src.utils.lubridate import now


class TestGCPSpider(scrapy.Spider):
    name = 'test_gcp'

    custom_settings = {
        'ITEM_PIPELINES': {
            'src.pipeline.JsonWriterPipeline': 300
        }
    }

    def start_requests(self):
        urls = [
            'https://quotes.toscrape.com/page/1/',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'created_at': now(False),
                'text': quote.css('span.text::text').get(),
                'author': quote.css('small.author::text').get(),
                'tags': quote.css('div.tags a.tag::text').getall()
            }
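
The spider imports now from src/utils/lubridate.py, another helper included in the commit but not displayed here. A minimal sketch follows, assuming the boolean argument switches between a date-only string and a full timestamp; the real signature and format may differ.

# src/utils/lubridate.py (sketch; named after the R lubridate package, behaviour assumed)
from datetime import datetime

def now(date_only=True):
    """Return the current time as a string; date only when date_only is True."""
    if date_only:
        return datetime.now().strftime("%Y-%m-%d")
    return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

Under that assumption, the now(False) call in the spider stamps each scraped item with a full timestamp.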

src/utils/__init__.py

Whitespace-only changes.

src/utils/chunks.py

+5
@@ -0,0 +1,5 @@
# https://www.codegrepper.com/code-examples/python/split+a+list+in+100+each
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]
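
A quick usage example for the helper above, for instance to batch a list of URLs before yielding requests; the batching use case is an assumption, since chunks is not referenced elsewhere in this commit.

from src.utils.chunks import chunks

urls = [f"https://quotes.toscrape.com/page/{i}/" for i in range(1, 11)]
for batch in chunks(urls, 3):
    print(batch)  # three URLs at a time; the last batch may be shorter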
