From 5eed9cf4566fb3df2dde6b12292d9298e0690bbc Mon Sep 17 00:00:00 2001 From: Pascal Zumkehr Date: Tue, 2 Apr 2013 14:31:04 +0200 Subject: [PATCH 1/5] add delete requests to the end of the queue --- lib/relevance/tarantula/crawler.rb | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/lib/relevance/tarantula/crawler.rb b/lib/relevance/tarantula/crawler.rb index 8cd18b6..e3d4b1d 100644 --- a/lib/relevance/tarantula/crawler.rb +++ b/lib/relevance/tarantula/crawler.rb @@ -63,7 +63,7 @@ def crawl(url = "/") @times_to_crawl.times do |num| queue_link url - begin + begin do_crawl num rescue CrawlTimeout => e puts @@ -140,7 +140,7 @@ def grab_log! def make_result(options) defaults = { :log => grab_log!, - :test_name => test_name + :test_name => test_name } Result.new(defaults.merge(options)).freeze end @@ -189,7 +189,7 @@ def transform_url(url) def queue_link(dest, referrer = nil) dest = Link.new(dest, self, referrer) return if should_skip_link?(dest) - @crawl_queue << dest + append_to_queue(dest) @links_queued << dest dest end @@ -201,12 +201,21 @@ def queue_form(form, referrer = nil) fs.action = transform_url(fs.action) return if should_skip_form_submission?(fs) @referrers[fs.action] = referrer if referrer - @crawl_queue << fs + append_to_queue(fs) @form_signatures_queued << fs.signature end end end + # append delete requests to the end of the queue, all others just before the first delete request + def append_to_queue(request) + if request.method != 'delete' && index = @crawl_queue.index {|r| r.method == 'delete' } + @crawl_queue.insert(index, request) + else + @crawl_queue << request + end + end + def report_dir File.join(rails_root, "tmp", "tarantula") end From e1c915c179a1531e57dfc5fa354d0c45997be9c8 Mon Sep 17 00:00:00 2001 From: Pascal Zumkehr Date: Tue, 2 Apr 2013 15:33:03 +0200 Subject: [PATCH 2/5] process queue fifo style --- lib/relevance/tarantula/crawler.rb | 2 +- lib/relevance/tarantula/link.rb | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/relevance/tarantula/crawler.rb b/lib/relevance/tarantula/crawler.rb index e3d4b1d..5c4ad0b 100644 --- a/lib/relevance/tarantula/crawler.rb +++ b/lib/relevance/tarantula/crawler.rb @@ -98,7 +98,7 @@ def do_crawl(number) end def crawl_the_queue(number = 0) - while (request = @crawl_queue.pop) + while (request = @crawl_queue.shift) request.crawl blip(number) end diff --git a/lib/relevance/tarantula/link.rb b/lib/relevance/tarantula/link.rb index da47d9e..2c6b658 100644 --- a/lib/relevance/tarantula/link.rb +++ b/lib/relevance/tarantula/link.rb @@ -74,7 +74,7 @@ def meth @tag['data-method'] == m.to_s.downcase end) || :get - end + end.to_s end def transform_url(link) From 4382146fb2a27bc394a311ee793f3210944ff3b1 Mon Sep 17 00:00:00 2001 From: Pascal Zumkehr Date: Wed, 3 Apr 2013 10:26:56 +0200 Subject: [PATCH 3/5] consistently use strings for method values --- lib/relevance/tarantula/link.rb | 6 +++--- spec/relevance/tarantula/link_spec.rb | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/relevance/tarantula/link.rb b/lib/relevance/tarantula/link.rb index 2c6b658..00b4f76 100644 --- a/lib/relevance/tarantula/link.rb +++ b/lib/relevance/tarantula/link.rb @@ -17,7 +17,7 @@ def method_javascript_function(method, url = '', href = nil) "var f = document.createElement('form'); f.style.display = 'none'; " + "this.parentNode.appendChild(f); f.method = 'POST'; f.action = #{action};" - unless method == :post + unless method == 'post' submit_function << "var m = document.createElement('input'); m.setAttribute('type', 'hidden'); " submit_function << "m.setAttribute('name', '_method'); m.setAttribute('value', '#{method}'); f.appendChild(m);" end @@ -71,10 +71,10 @@ def meth (@tag && [:put, :delete, :post, :patch].detect do |m| # post should be last since it's least specific @tag['onclick'] =~ METHOD_REGEXPS[m] || - @tag['data-method'] == m.to_s.downcase + @tag['data-method'] == m.to_s end) || :get - end.to_s + end end def transform_url(link) diff --git a/spec/relevance/tarantula/link_spec.rb b/spec/relevance/tarantula/link_spec.rb index 3114863..825bb3a 100644 --- a/spec/relevance/tarantula/link_spec.rb +++ b/spec/relevance/tarantula/link_spec.rb @@ -16,7 +16,7 @@ end it "parses anchor tags with POST 'method'" do - link = make_link(Hpricot(%Q{foo}).at('a')) + link = make_link(Hpricot(%Q{foo}).at('a')) link.href.should == '/foo' link.meth.should == :post end @@ -28,7 +28,7 @@ end it "parses anchor tags with PUT 'method'" do - link = make_link(Hpricot(%Q{foo}).at('a')) + link = make_link(Hpricot(%Q{foo}).at('a')) link.href.should == '/foo' link.meth.should == :put end @@ -46,7 +46,7 @@ end it "parses anchor tags with DELETE 'method'" do - link = make_link(Hpricot(%Q{foo}).at('a')) + link = make_link(Hpricot(%Q{foo}).at('a')) link.href.should == '/foo' link.meth.should == :delete end From b6db1186e232b5acf6388d0647272a3fa087b3ce Mon Sep 17 00:00:00 2001 From: Pascal Zumkehr Date: Thu, 18 Apr 2013 11:34:21 +0200 Subject: [PATCH 4/5] test queue ordering --- spec/relevance/tarantula/crawler_spec.rb | 33 ++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/spec/relevance/tarantula/crawler_spec.rb b/spec/relevance/tarantula/crawler_spec.rb index a48acff..1c392b4 100644 --- a/spec/relevance/tarantula/crawler_spec.rb +++ b/spec/relevance/tarantula/crawler_spec.rb @@ -111,6 +111,39 @@ crawler.queue_link('/url', '/some-referrer') end + it "queues DELETE requests at the end, everything else before" do + crawler = Relevance::Tarantula::Crawler.new + create_link = Hpricot('Create').at('a') + crawler.queue_link(create_link) + create_link = Relevance::Tarantula::Link.new(create_link, crawler, nil) + crawler.crawl_queue.should == [create_link] + delete_link = Hpricot('Destroy').at('a') + crawler.queue_link(delete_link) + delete_link = Relevance::Tarantula::Link.new(delete_link, crawler, nil) + crawler.crawl_queue.should == [create_link, delete_link] + get_link = Hpricot('Show').at('a') + crawler.queue_link(get_link) + get_link = Relevance::Tarantula::Link.new(get_link, crawler, nil) + crawler.crawl_queue.should == [create_link, get_link, delete_link] + end + + it "queues is crawled from tip not tail" do + crawler = Relevance::Tarantula::Crawler.new + + create_link = Hpricot('Create').at('a') + crawler.queue_link(create_link) + delete_link = Hpricot('Destroy').at('a') + crawler.queue_link(delete_link) + get_link = Hpricot('Show').at('a') + crawler.queue_link(get_link) + + q = sequence('queue') + response = stub(:code => "200") + crawler.expects(:follow).with('post', '/create').returns(response).in_sequence(q) + crawler.expects(:follow).with('get', '/read').returns(response).in_sequence(q) + crawler.expects(:follow).with('delete', '/destroy').returns(response).in_sequence(q) + crawler.do_crawl(0) + end end describe "crawling" do From 8ce1e2f15caf942df2a6b8c97e9ee07749e52ba0 Mon Sep 17 00:00:00 2001 From: Pascal Zumkehr Date: Tue, 28 May 2013 17:08:00 +0200 Subject: [PATCH 5/5] use #meth instead of #method in FormSubmission --- lib/relevance/tarantula/crawler.rb | 4 +- lib/relevance/tarantula/form_submission.rb | 8 +- spec/relevance/tarantula/crawler_spec.rb | 127 +++++++++++---------- spec/relevance/tarantula/link_spec.rb | 6 +- 4 files changed, 73 insertions(+), 72 deletions(-) diff --git a/lib/relevance/tarantula/crawler.rb b/lib/relevance/tarantula/crawler.rb index 5c4ad0b..126ed6c 100644 --- a/lib/relevance/tarantula/crawler.rb +++ b/lib/relevance/tarantula/crawler.rb @@ -147,7 +147,7 @@ def make_result(options) def handle_form_results(form, response) handlers.each do |h| - save_result h.handle(Result.new(:method => form.method, + save_result h.handle(Result.new(:method => form.meth, :url => form.action, :response => response, :log => grab_log!, @@ -209,7 +209,7 @@ def queue_form(form, referrer = nil) # append delete requests to the end of the queue, all others just before the first delete request def append_to_queue(request) - if request.method != 'delete' && index = @crawl_queue.index {|r| r.method == 'delete' } + if request.meth.to_s != 'delete' && index = @crawl_queue.index {|r| r.meth.to_s == 'delete' } @crawl_queue.insert(index, request) else @crawl_queue << request diff --git a/lib/relevance/tarantula/form_submission.rb b/lib/relevance/tarantula/form_submission.rb index 76cb7e7..6f87b2f 100644 --- a/lib/relevance/tarantula/form_submission.rb +++ b/lib/relevance/tarantula/form_submission.rb @@ -3,7 +3,7 @@ module Tarantula class FormSubmission include Relevance::Tarantula - attr_accessor :method, :action, :data, :attack, :form + attr_accessor :meth, :action, :data, :attack, :form class << self def attacks @@ -24,7 +24,7 @@ def attacks=(atts) def initialize(form, attack = Relevance::Tarantula::BasicAttack.new) @form = form - @method = form.method + @meth = form.method @action = form.action @attack = attack @data = mutate_selects(form).merge(mutate_text_areas(form)).merge(mutate_inputs(form)) @@ -32,7 +32,7 @@ def initialize(form, attack = Relevance::Tarantula::BasicAttack.new) def crawl begin - response = form.crawler.submit(method, action, data) + response = form.crawler.submit(meth, action, data) log "Response #{response.code} for #{self}" rescue ActiveRecord::RecordNotFound => e log "Skipping #{action}, presumed ok that record is missing" @@ -47,7 +47,7 @@ def self.mutate(form) end def to_s - "#{action} #{method} #{data.inspect} #{attack.inspect}" + "#{action} #{meth} #{data.inspect} #{attack.inspect}" end # a form's signature is what makes it unique (e.g. action + fields) diff --git a/spec/relevance/tarantula/crawler_spec.rb b/spec/relevance/tarantula/crawler_spec.rb index 1c392b4..488af9e 100644 --- a/spec/relevance/tarantula/crawler_spec.rb +++ b/spec/relevance/tarantula/crawler_spec.rb @@ -1,7 +1,7 @@ require "spec_helper" describe Relevance::Tarantula::Crawler do - + describe "transform_url" do before { @crawler = Relevance::Tarantula::Crawler.new } @@ -10,13 +10,13 @@ obfuscated_mailto = "mailto:" @crawler.transform_url(obfuscated_mailto).should == "mailto:" end - + it "strips the trailing name portion of a link" do @crawler.transform_url('http://host/path#name').should == 'http://host/path' end end - - + + describe "log grabbing" do it "returns nil if no grabber is specified" do @@ -29,11 +29,11 @@ crawler.log_grabber = stub(:grab! => "fake log entry") crawler.grab_log!.should == "fake log entry" end - + end - + describe "interrupt" do - + it 'catches interruption and writes the partial report' do crawler = Relevance::Tarantula::Crawler.new crawler.stubs(:queue_link) @@ -42,31 +42,31 @@ $stderr.expects(:puts).with("CTRL-C") crawler.crawl end - + end - + describe 'handle_form_results' do - + it 'captures the result values (bugfix)' do response = stub_everything - result_args = {:url => :action_stub, - :data => 'nil', - :response => response, - :referrer => :action_stub, - :log => nil, + result_args = {:url => :action_stub, + :data => 'nil', + :response => response, + :referrer => :action_stub, + :log => nil, :method => :stub_method, :test_name => nil} result = Relevance::Tarantula::Result.new(result_args) Relevance::Tarantula::Result.expects(:new).with(result_args).returns(result) crawler = Relevance::Tarantula::Crawler.new - crawler.handle_form_results(stub_everything(:method => :stub_method, :action => :action_stub), + crawler.handle_form_results(stub_everything(:meth => :stub_method, :action => :action_stub), response) end - + end - + describe "crawl" do - + it 'queues the first url, does crawl, and then reports results' do crawler = Relevance::Tarantula::Crawler.new crawler.expects(:queue_link).with("/foobar") @@ -81,9 +81,9 @@ crawler.expects(:report_results) lambda {crawler.crawl('/')}.should raise_error(RuntimeError) end - + end - + describe "queueing" do it 'queues and remembers links' do @@ -108,9 +108,10 @@ crawler = Relevance::Tarantula::Crawler.new Relevance::Tarantula::Link.expects(:new).with('/url', crawler, '/some-referrer') crawler.stubs(:should_skip_link?) + crawler.stubs(:append_to_queue) crawler.queue_link('/url', '/some-referrer') end - + it "queues DELETE requests at the end, everything else before" do crawler = Relevance::Tarantula::Crawler.new create_link = Hpricot('Create').at('a') @@ -126,39 +127,39 @@ get_link = Relevance::Tarantula::Link.new(get_link, crawler, nil) crawler.crawl_queue.should == [create_link, get_link, delete_link] end - + it "queues is crawled from tip not tail" do crawler = Relevance::Tarantula::Crawler.new - + create_link = Hpricot('Create').at('a') crawler.queue_link(create_link) delete_link = Hpricot('Destroy').at('a') crawler.queue_link(delete_link) get_link = Hpricot('Show').at('a') crawler.queue_link(get_link) - + q = sequence('queue') response = stub(:code => "200") - crawler.expects(:follow).with('post', '/create').returns(response).in_sequence(q) - crawler.expects(:follow).with('get', '/read').returns(response).in_sequence(q) - crawler.expects(:follow).with('delete', '/destroy').returns(response).in_sequence(q) + crawler.expects(:follow).with(:post, '/create').returns(response).in_sequence(q) + crawler.expects(:follow).with(:get, '/read').returns(response).in_sequence(q) + crawler.expects(:follow).with(:delete, '/destroy').returns(response).in_sequence(q) crawler.do_crawl(0) end end - + describe "crawling" do before do @form = Hpricot('
').at('form') end - + it "does two things with each link: crawl and blip" do crawler = Relevance::Tarantula::Crawler.new crawler.proxy = stub crawler.crawl_queue = links = [make_link("/foo1", crawler), make_link("/foo2", crawler)] - + links.each{|link| link.expects(:crawl)} crawler.expects(:blip).times(2) - + crawler.crawl_the_queue crawler.crawl_queue.should == [] end @@ -170,7 +171,7 @@ crawler.expects(:blip) crawler.crawl_the_queue end - + # TODO this is the same as "resets to the initial links/forms ..." and doesn't appear to test anything related to a timeout. it "breaks out early if a timeout is set" @@ -186,9 +187,9 @@ crawler.times_to_crawl = 2 crawler.crawl end - + end - + describe "report_results" do it "prints a final summary line" do crawler = Relevance::Tarantula::Crawler.new @@ -197,16 +198,16 @@ crawler.expects(:puts).with("Crawled 42 links and forms.") crawler.report_results end - + it "delegates to generate_reports" do crawler = Relevance::Tarantula::Crawler.new crawler.stubs(:puts) crawler.expects(:generate_reports) crawler.report_results end - + end - + describe "blip" do it "blips the current progress if !verbose" do @@ -217,7 +218,7 @@ crawler.expects(:print).with("\r 0 of 0 links completed ") crawler.blip end - + it "suppresses the blip message if not writing to a tty" do $stdout.stubs(:tty?).returns(false) crawler = Relevance::Tarantula::Crawler.new @@ -226,7 +227,7 @@ crawler.expects(:print).never crawler.blip end - + it "blips nothing if verbose" do $stdout.stubs(:tty?).returns(true) crawler = Relevance::Tarantula::Crawler.new @@ -234,9 +235,9 @@ crawler.expects(:print).never crawler.blip end - + end - + describe "finished?" do it "is finished when the links and forms are crawled" do @@ -255,7 +256,7 @@ crawler.crawl_queue = [:stub_form] crawler.finished?.should == false end - + end it "crawls links and forms again and again until finished?==true" do @@ -264,7 +265,7 @@ crawler.expects(:crawl_the_queue).times(2) crawler.do_crawl(1) end - + it "asks each reporter to write its report in report_dir" do crawler = Relevance::Tarantula::Crawler.new crawler.stubs(:report_dir).returns(test_output_dir) @@ -275,31 +276,31 @@ crawler.save_result stub(:code => "404", :url => "/uh-oh") crawler.generate_reports end - + it "builds a report dir relative to rails root" do crawler = Relevance::Tarantula::Crawler.new crawler.expects(:rails_root).returns("faux_rails_root") crawler.report_dir.should == "faux_rails_root/tmp/tarantula" end - + it "skips links that are already queued" do crawler = Relevance::Tarantula::Crawler.new crawler.should_skip_link?(make_link("/foo")).should == false crawler.queue_link("/foo").should == make_link("/foo") crawler.should_skip_link?(make_link("/foo")).should == true end - + describe "link skipping" do before { @crawler = Relevance::Tarantula::Crawler.new } - + it "skips links that are too long" do @crawler.should_skip_link?(make_link("/foo")).should == false @crawler.max_url_length = 2 @crawler.expects(:log).with("Skipping long url /foo") @crawler.should_skip_link?(make_link("/foo")).should == true end - + it "skips outbound links (those that begin with http)" do @crawler.expects(:log).with("Skipping http-anything") @crawler.should_skip_link?(make_link("http-anything")).should == true @@ -314,29 +315,29 @@ @crawler.expects(:log).with("Skipping mailto-anything") @crawler.should_skip_link?(make_link("mailto-anything")).should == true end - + it 'skips blank links' do @crawler.queue_link(nil) @crawler.crawl_queue.should == [] @crawler.queue_link("") @crawler.crawl_queue.should == [] end - + it "logs and skips links that match a pattern" do @crawler.expects(:log).with("Skipping /the-red-button") @crawler.skip_uri_patterns << /red-button/ @crawler.queue_link("/blue-button").should == make_link("/blue-button") @crawler.queue_link("/the-red-button").should == nil - end - + end + it "logs and skips form submissions that match a pattern" do @crawler.expects(:log).with("Skipping /reset-password-form") - @crawler.skip_uri_patterns << /reset-password/ + @crawler.skip_uri_patterns << /reset-password/ fs = stub_everything(:action => "/reset-password-form") @crawler.should_skip_form_submission?(fs).should == true end end - + describe "allow_nnn_for" do it "installs result as a response_code_handler" do @@ -355,9 +356,9 @@ crawler = Relevance::Tarantula::Crawler.new lambda{crawler.foo}.should raise_error(NoMethodError) end - + end - + describe "timeouts" do it "sets start and end times for a single crawl" do @@ -372,7 +373,7 @@ crawler.crawl_start_times.first.should == start_time crawler.crawl_end_times.first.should == end_time end - + it "has elasped time for a crawl" do start_time = Time.parse("March 1st, 2008 10:00am") elasped_time_check = Time.parse("March 1st, 2008, 10:10:00am") @@ -384,7 +385,7 @@ crawler.crawl crawler.elasped_time_for_pass(0).should == 600.seconds end - + it "raises out of the crawl if elasped time is greater then the crawl timeout" do start_time = Time.parse("March 1st, 2008 10:00am") elasped_time_check = Time.parse("March 1st, 2008, 10:35:00am") @@ -392,17 +393,17 @@ crawler = Relevance::Tarantula::Crawler.new crawler.crawl_timeout = 5.minutes - + crawler.crawl_queue = [stub(:href => "/foo1", :method => :get), stub(:href => "/foo2", :method => :get)] crawler.proxy = stub crawler.proxy.stubs(:get).returns(response = stub(:code => "200")) - + stub_puts_and_print(crawler) lambda { crawler.do_crawl(0) }.should raise_error end - + end - + end diff --git a/spec/relevance/tarantula/link_spec.rb b/spec/relevance/tarantula/link_spec.rb index 825bb3a..3114863 100644 --- a/spec/relevance/tarantula/link_spec.rb +++ b/spec/relevance/tarantula/link_spec.rb @@ -16,7 +16,7 @@ end it "parses anchor tags with POST 'method'" do - link = make_link(Hpricot(%Q{foo}).at('a')) + link = make_link(Hpricot(%Q{foo}).at('a')) link.href.should == '/foo' link.meth.should == :post end @@ -28,7 +28,7 @@ end it "parses anchor tags with PUT 'method'" do - link = make_link(Hpricot(%Q{foo}).at('a')) + link = make_link(Hpricot(%Q{foo}).at('a')) link.href.should == '/foo' link.meth.should == :put end @@ -46,7 +46,7 @@ end it "parses anchor tags with DELETE 'method'" do - link = make_link(Hpricot(%Q{foo}).at('a')) + link = make_link(Hpricot(%Q{foo}).at('a')) link.href.should == '/foo' link.meth.should == :delete end