diff --git a/data/textdata/scrapy.cfg b/data/textdata/scrapy.cfg
deleted file mode 100644
index fcc4ecb..0000000
--- a/data/textdata/scrapy.cfg
+++ /dev/null
@@ -1,11 +0,0 @@
-# Automatically created by: scrapy startproject
-#
-# For more information about the [deploy] section see:
-# https://scrapyd.readthedocs.org/en/latest/deploy.html
-
-[settings]
-default = textdata.settings
-
-[deploy]
-#url = http://localhost:6800/
-project = textdata
diff --git a/data/textdata/textdata/__init__.py b/data/textdata/textdata/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/data/textdata/textdata/items.py b/data/textdata/textdata/items.py
deleted file mode 100644
index 1c7a813..0000000
--- a/data/textdata/textdata/items.py
+++ /dev/null
@@ -1,14 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Define here the models for your scraped items
-#
-# See documentation in:
-# http://doc.scrapy.org/en/latest/topics/items.html
-
-import scrapy
-
-
-class TextdataItem(scrapy.Item):
-    # define the fields for your item here like:
-    # name = scrapy.Field()
-    pass
diff --git a/data/textdata/textdata/middlewares.py b/data/textdata/textdata/middlewares.py
deleted file mode 100644
index 5c87492..0000000
--- a/data/textdata/textdata/middlewares.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Define here the models for your spider middleware
-#
-# See documentation in:
-# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
-
-from scrapy import signals
-
-
-class TextdataSpiderMiddleware(object):
-    # Not all methods need to be defined. If a method is not defined,
-    # scrapy acts as if the spider middleware does not modify the
-    # passed objects.
-
-    @classmethod
-    def from_crawler(cls, crawler):
-        # This method is used by Scrapy to create your spiders.
-        s = cls()
-        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
-        return s
-
-    def process_spider_input(self, response, spider):
-        # Called for each response that goes through the spider
-        # middleware and into the spider.
-
-        # Should return None or raise an exception.
-        return None
-
-    def process_spider_output(self, response, result, spider):
-        # Called with the results returned from the Spider, after
-        # it has processed the response.
-
-        # Must return an iterable of Request, dict or Item objects.
-        for i in result:
-            yield i
-
-    def process_spider_exception(self, response, exception, spider):
-        # Called when a spider or process_spider_input() method
-        # (from other spider middleware) raises an exception.
-
-        # Should return either None or an iterable of Response, dict
-        # or Item objects.
-        pass
-
-    def process_start_requests(self, start_requests, spider):
-        # Called with the start requests of the spider, and works
-        # similarly to the process_spider_output() method, except
-        # that it doesn’t have a response associated.
-
-        # Must return only requests (not items).
-        for r in start_requests:
-            yield r
-
-    def spider_opened(self, spider):
-        spider.logger.info('Spider opened: %s' % spider.name)
diff --git a/data/textdata/textdata/pipelines.py b/data/textdata/textdata/pipelines.py
deleted file mode 100644
index cc9bd49..0000000
--- a/data/textdata/textdata/pipelines.py
+++ /dev/null
@@ -1,11 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Define your item pipelines here
-#
-# Don't forget to add your pipeline to the ITEM_PIPELINES setting
-# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
-
-
-class TextdataPipeline(object):
-    def process_item(self, item, spider):
-        return item
diff --git a/data/textdata/textdata/settings.py b/data/textdata/textdata/settings.py
deleted file mode 100644
index 2b0f1a1..0000000
--- a/data/textdata/textdata/settings.py
+++ /dev/null
@@ -1,90 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Scrapy settings for textdata project
-#
-# For simplicity, this file contains only settings considered important or
-# commonly used. You can find more settings consulting the documentation:
-#
-#     http://doc.scrapy.org/en/latest/topics/settings.html
-#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
-#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
-
-BOT_NAME = 'textdata'
-
-SPIDER_MODULES = ['textdata.spiders']
-NEWSPIDER_MODULE = 'textdata.spiders'
-
-
-# Crawl responsibly by identifying yourself (and your website) on the user-agent
-#USER_AGENT = 'textdata (+http://www.yourdomain.com)'
-
-# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
-
-# Configure maximum concurrent requests performed by Scrapy (default: 16)
-#CONCURRENT_REQUESTS = 32
-
-# Configure a delay for requests for the same website (default: 0)
-# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
-# See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
-# The download delay setting will honor only one of:
-#CONCURRENT_REQUESTS_PER_DOMAIN = 16
-#CONCURRENT_REQUESTS_PER_IP = 16
-
-# Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
-
-# Disable Telnet Console (enabled by default)
-#TELNETCONSOLE_ENABLED = False
-
-# Override the default request headers:
-#DEFAULT_REQUEST_HEADERS = {
-#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-#   'Accept-Language': 'en',
-#}
-
-# Enable or disable spider middlewares
-# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
-#SPIDER_MIDDLEWARES = {
-#    'textdata.middlewares.TextdataSpiderMiddleware': 543,
-#}
-
-# Enable or disable downloader middlewares
-# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
-#DOWNLOADER_MIDDLEWARES = {
-#    'textdata.middlewares.MyCustomDownloaderMiddleware': 543,
-#}
-
-# Enable or disable extensions
-# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
-#EXTENSIONS = {
-#    'scrapy.extensions.telnet.TelnetConsole': None,
-#}
-
-# Configure item pipelines
-# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'textdata.pipelines.TextdataPipeline': 300,
-#}
-
-# Enable and configure the AutoThrottle extension (disabled by default)
-# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
-#AUTOTHROTTLE_ENABLED = True
-# The initial download delay
-#AUTOTHROTTLE_START_DELAY = 5
-# The maximum download delay to be set in case of high latencies
-#AUTOTHROTTLE_MAX_DELAY = 60
-# The average number of requests Scrapy should be sending in parallel to
-# each remote server
-#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
-# Enable showing throttling stats for every response received:
-#AUTOTHROTTLE_DEBUG = False
-
-# Enable and configure HTTP caching (disabled by default)
-# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
-#HTTPCACHE_ENABLED = True
-#HTTPCACHE_EXPIRATION_SECS = 0
-#HTTPCACHE_DIR = 'httpcache'
-#HTTPCACHE_IGNORE_HTTP_CODES = []
-#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'