Added Python Scrapy parser #10

Open. Wants to merge 2 commits into master.
25 changes: 14 additions & 11 deletions README.md
@@ -26,6 +26,9 @@ Competitors
 * html5lib
   http://code.google.com/p/html5lib/
   Pure python DOM parser oriented to HTML5.
+* Scrapy
+  http://scrapy.org/
+  High-level screen scraping and web crawling framework.
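The entry above describes Scrapy by the kind of selection it offers. As a rough illustration of XPath-style selection, here is a standard-library sketch on well-formed markup (this is `xml.etree`, not Scrapy's actual selector API, and the sample document is made up):

```python
import xml.etree.ElementTree as ET

# Well-formed sample markup; Scrapy's HtmlXPathSelector applies the same
# XPath idea, but tolerates real-world, possibly malformed HTML.
doc = ET.fromstring("<html><body><p>one</p><p>two</p></body></html>")

# Select every <p> anywhere in the document and pull out its text.
texts = [p.text for p in doc.findall(".//p")]
print(texts)  # ['one', 'two']
```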

### PyPi

@@ -105,17 +108,17 @@ Install OS dependencies python-virtualenv, erlang, pypy, C compiler and libxml2
dev packages

sudo apt-get install ...
-libxml2-dev libxslt1-dev build-essential # common
-python-virtualenv python-lxml # python
-erlang-base # erlang
-pypy # python PyPy
-nodejs npm # NodeJS
-cabal-install libicu-dev # Haskell
-php5-cli php5-tidy # PHP
-golang # Go
-ruby1.9.1 ruby1.9.1-dev rubygems1.9.1 # Ruby
-maven2 default-jdk # Java
-mono-runtime mono-dmcs # Mono
+libxml2-dev libxslt1-dev build-essential # common
+python-virtualenv python-lxml python-scrapy # python
+erlang-base # erlang
+pypy # python PyPy
+nodejs npm # NodeJS
+cabal-install libicu-dev # Haskell
+php5-cli php5-tidy # PHP
+golang # Go
+ruby1.9.1 ruby1.9.1-dev rubygems1.9.1 # Ruby
+maven2 default-jdk # Java
+mono-runtime mono-dmcs # Mono

Then run (it will prepare virtual environments, fetch dependencies, compile sources, etc.)

7 changes: 6 additions & 1 deletion lib.sh
@@ -4,7 +4,12 @@ timeit () {
     # XXX: how to redirect time's output to stdout, but leave command's
     # errors on stderr? -o /dev/tty is ok in general, but causes problems
     # with GNU parallel
-    /usr/bin/time --format="real:%e user:%U sys:%S max RSS:%M" $@ 2>&1
+    if [ "${OSTYPE//[0-9.]/}" == 'darwin' ]; then
+        # --format is not supported by the BSD time shipped with Mac OS
+        /usr/bin/time "$@" 2>&1
+    else
+        /usr/bin/time --format="real:%e user:%U sys:%S max RSS:%M" "$@" 2>&1
+    fi
 }
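The `${OSTYPE//[0-9.]/}` expansion in the branch above deletes every digit and dot, so a version-suffixed value compares equal to plain `darwin`. A quick sketch of that substitution (the sample values are illustrative, not exhaustive):

```shell
#!/bin/bash
# Sample OSTYPE values: bash reports e.g. "darwin12.0" on Mac OS
# and "linux-gnu" on Linux.
for os in darwin12.0 linux-gnu; do
    # ${os//[0-9.]/} deletes every digit and dot from the value.
    echo "$os -> ${os//[0-9.]/}"
done
# prints:
# darwin12.0 -> darwin
# linux-gnu -> linux-gnu
```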

print_header() {
2 changes: 1 addition & 1 deletion python/prepare.sh
@@ -3,4 +3,4 @@
virtualenv --system-site-packages env
source env/bin/activate

-pip install lxml beautifulsoup4 BeautifulSoup html5lib
+pip install lxml beautifulsoup4 BeautifulSoup html5lib scrapy
43 changes: 43 additions & 0 deletions python/scrapy_parser.py
@@ -0,0 +1,43 @@
# -*- coding: utf-8 -*-
'''
Created on 2013-01-30

@author: Pavel Shpilev <[email protected]>
'''
from scrapy.crawler import Crawler
from scrapy.settings import Settings
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request

import sys
import time
import os


class BenchmarkSpider(BaseSpider):
    def parse(self, response):
        # Building the selector and extracting forces Scrapy to parse the page.
        hxs = HtmlXPathSelector(response)
        yield hxs.extract()
        # Schedule the same URL again; dont_filter bypasses the duplicate
        # filter so the page is fetched and parsed repeatedly.
        yield Request(response.url, callback=self.parse, dont_filter=True)


def main():
    # argv[1]: path to the HTML file to parse, argv[2]: number of iterations.
    do_parse_test(os.path.join('file://127.0.0.1', sys.argv[1]), int(sys.argv[2]))


def do_parse_test(html, n):
    start = time.time()
    spider = BenchmarkSpider(name="benchmark", start_urls=[html])
    # Disable the telnet console so repeated runs do not clash over its port.
    crawler = Crawler(Settings(values={'TELNETCONSOLE_PORT': None}))
    crawler.configure()
    crawler.crawl(spider)
    for i in xrange(n):
        crawler.start()
    crawler.stop()
    stop = time.time()
    print stop - start, "s"


if __name__ == '__main__':
    main()
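The benchmark above takes one wall-clock reading around n full parses. The same timing pattern, sketched with only the standard library's `HTMLParser` standing in for Scrapy so it runs without a Scrapy install (the `TagCounter` and `time_parses` names are illustrative, not part of the benchmark suite):

```python
import time
from html.parser import HTMLParser  # Python 3 stdlib; the benchmark itself targets Python 2


class TagCounter(HTMLParser):
    """Minimal parser that just counts start tags as it consumes the document."""
    def __init__(self):
        HTMLParser.__init__(self)
        self.tags = 0

    def handle_starttag(self, tag, attrs):
        self.tags += 1


def time_parses(html, n):
    # Same shape as do_parse_test: one clock read around n complete parses.
    start = time.time()
    for _ in range(n):
        parser = TagCounter()
        parser.feed(html)
        parser.close()
    return time.time() - start


elapsed = time_parses("<html><body><p>hi</p></body></html>", 10)
print("%g s" % elapsed)
```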