Merge pull request #95 from ElixirTeSS/fix_portugal_scraper

Fix Portugal scraper
ElixirTeSS · Oct 22, 2020 · 9ce9af8 · 9ce9af8
2 parents 2ba3812 + 53b174a
commit 9ce9af8
Show file tree

Hide file tree

Showing 6 changed files with 82 additions and 54 deletions.
diff --git a/Gemfile b/Gemfile
@@ -2,7 +2,7 @@ source 'https://rubygems.org'
 
 gem 'geocoder'
 gem 'inifile'
-gem 'nokogiri', '~> 1.8.1'
+gem 'nokogiri', '~> 1.10.9'
 gem 'redcarpet'
 gem 'simple-rss'
 gem 'google_places'

diff --git a/Gemfile.lock b/Gemfile.lock
@@ -19,11 +19,12 @@ GIT
 GEM
   remote: https://rubygems.org/
   specs:
-    activesupport (5.2.1)
+    activesupport (6.0.3.1)
       concurrent-ruby (~> 1.0, >= 1.0.2)
       i18n (>= 0.7, < 2)
       minitest (~> 5.1)
       tzinfo (~> 1.1)
+      zeitwerk (~> 2.2, >= 2.2.2)
     addressable (2.7.0)
       public_suffix (>= 2.0.2, < 5.0)
     bcp47 (0.3.3)
@@ -52,7 +53,7 @@ GEM
       domain_name (~> 0.5)
     httparty (0.14.0)
       multi_xml (>= 0.5.2)
-    i18n (1.5.1)
+    i18n (1.8.2)
       concurrent-ruby (~> 1.0)
     icalendar (2.4.1)
     inifile (3.0.0)
@@ -98,17 +99,17 @@ GEM
     mime-types (3.2.2)
       mime-types-data (~> 3.2015)
     mime-types-data (3.2018.0812)
-    mini_portile2 (2.3.0)
-    minitest (5.11.3)
+    mini_portile2 (2.4.0)
+    minitest (5.14.1)
     multi_json (1.14.1)
     multi_xml (0.5.5)
     net (0.3.3)
       activesupport
     net-http-persistent (3.1.0)
       connection_pool (~> 2.2)
     netrc (0.11.0)
-    nokogiri (1.8.5)
-      mini_portile2 (~> 2.3.0)
+    nokogiri (1.10.9)
+      mini_portile2 (~> 2.4.0)
     nokogumbo (1.5.0)
       nokogiri
     public_suffix (3.1.1)
@@ -201,11 +202,12 @@ GEM
     temple (0.8.2)
     thread_safe (0.3.6)
     tilt (2.0.10)
-    tzinfo (1.2.5)
+    tzinfo (1.2.7)
       thread_safe (~> 0.1)
     unf (0.1.4)
       unf_ext
     unf_ext (0.0.7.5)
+    zeitwerk (2.3.0)
 
 PLATFORMS
   ruby
@@ -217,7 +219,7 @@ DEPENDENCIES
   icalendar
   inifile
   linkeddata (~> 2.0)
-  nokogiri (~> 1.8.1)
+  nokogiri (~> 1.10.9)
   redcarpet
   sanitize
   simple-rss

diff --git a/app/scrapers/edinburgh_scraper.rb b/app/scrapers/edinburgh_scraper.rb
@@ -34,6 +34,7 @@ def scrape
                 json = json[1].gsub('<!--//--><![CDATA[// ><!--', '')
                 json = json.gsub('//--><!]]>', '')
                 json = json.gsub('<br />', '')
+                json = json.gsub("\n", "")
                 event = Tess::Rdf::EventExtractor.new(json, :jsonld).extract { |p| Tess::API::Event.new(p) }.first
                 event.content_provider = cp
                 event.event_types = [:workshops_and_courses]

diff --git a/app/scrapers/intermine_scraper.rb b/app/scrapers/intermine_scraper.rb
@@ -1,40 +1,64 @@
-
 class IntermineScraper < Tess::Scrapers::Scraper
-
   def self.config
     {
-        name: 'Intermine Scraper',
-		index_page: 'http://intermine.org/tutorials/'
+      name: "Intermine Scraper",
+      index_page: "http://intermine.org/tutorials/",
     }
   end
 
   def scrape
-      cp = add_content_provider(Tess::API::ContentProvider.new(
-                                  title: "InterMine", #name
-                                  url: "http://intermine.org", #url
-                                  image_url: "https://cdn.rawgit.com/intermine/design-materials/ff1ec6bf/logos/intermine/intermine-300x37.png", #logo
-                                  description: "nterMine integrates biological data sources, making it easy to query and analyse data.", #description
-                                  content_provider_type: :project,
-                                  node_name: :'UK'
-                                ))  	
+    cp = add_content_provider(Tess::API::ContentProvider.new(
+      title: "InterMine", #name
+      url: "http://intermine.org", #url
+      image_url: "https://cdn.rawgit.com/intermine/design-materials/ff1ec6bf/logos/intermine/intermine-300x37.png", #logo
+      description: "nterMine integrates biological data sources, making it easy to query and analyse data.", #description
+      content_provider_type: :project,
+      node_name: :'UK',
+    ))
+
+    #page = Nokogiri::HTML.parse(open(config[:index_page]).read)
+    index = "http://intermine.org/tutorials/"
+    page = Nokogiri::HTML.parse(open(index).read)
+    tutorials = page.xpath("/html/body/main/ul/li")
 
-      #page = Nokogiri::HTML.parse(open(config[:index_page]).read)
-      index = 'http://intermine.org/tutorials/'
-      page = Nokogiri::HTML.parse(open(index).read)
-      tutorials = page.xpath('/html/body/main/ul/li')
-
-      tutorials.each do |tutorial|
-	      	url = tutorial.children.first.attributes['href'].value
-	      	title = tutorial.text.split('-').first
-	      	description = tutorial.text
-	      	event = Tess::API::Material.new(
-	      		title: title,
-	            url: url,
-	            content_provider: cp,
-	            short_description: description,
-	            keywords: title.split(' ').first
-	       )
-			add_material(event)
-	  end
-	end
+    tutorials.each do |tutorial|
+      if !tutorial.children.first.attributes["href"].nil?
+        url = tutorial.children.first.attributes["href"].value
+        title = tutorial.text.split("-").first
+        description = tutorial.text
+        event = Tess::API::Material.new(
+          title: title,
+          url: url,
+          content_provider: cp,
+          short_description: description,
+          keywords: title.split(" ").first,
+        )
+        add_material(event)
+      end
+    end
+
+    #page = Nokogiri::HTML.parse(open(config[:index_page]).read)
+    index = "http://intermine.org/training-workshops/"
+    page = Nokogiri::HTML.parse(open(index).read)
+    tutorials = page.xpath("/html/body/div/section/ul/li/a")
+    tutorials.each do |tutorial|
+      if !tutorial.attributes["href"].nil?
+        url = tutorial.attributes["href"].value
+        # Hrefs that do not contain "http" are treated as relative and prefixed with the index URL
+        if !url.include? "http"
+          url = index + url
+        end
+        title = tutorial.children.text
+        description = title
+        event = Tess::API::Material.new(
+          title: title,
+          url: url,
+          content_provider: cp,
+          short_description: description,
+          keywords: title.split(" ").first,
+        )
+        add_material(event)
+      end
+    end
+  end
 end
diff --git a/app/scrapers/portugal_events_scraper.rb b/app/scrapers/portugal_events_scraper.rb
@@ -5,15 +5,16 @@ class PortugalEventsScraper < Tess::Scrapers::Scraper
   def self.config
     {
         name: 'ELIXIR Portugal',
-        root_url: 'http://elixir-portugal.org/events',
-        base_url: 'http://elixir-portugal.org'
+        # TODO: switch root_url from /past-events to https://biodata.pt/events before deploying
+        root_url: 'https://biodata.pt/past-events',
+        base_url: 'https://biodata.pt'
     }
   end
 
   def scrape
     cp = add_content_provider(Tess::API::ContentProvider.new(
         { title: "ELIXIR Portugal",
-          url: "https://elixir-portugal.org",
+          url: "https://biodata.pt",
           image_url: "https://tess.elixir-europe.org/assets/nodes/logos/PT-9f2611b1953f3109fa81668d960e7068390f4ef69be8f4b950ec0e8d7b106503.png",
           description: "ELIXIR Portugal is organized as a consortium of Portuguese research institutions which are part of the national biological information network, BioData.pt. Like ELIXIR itself, the Portuguese node is a decentralized network of specialized centers, under a common hardware and software infrastructure, and with shared training and industry/entrepreneurship programmes.",
           content_provider_type: :organisation,
@@ -22,21 +23,21 @@ def scrape
 
 
 
-	index = Nokogiri::HTML(open(config[:root_url]))
-    urls = index.search('tbody a').map{|x| x.values}.flatten
+    index = Nokogiri::HTML(open(config[:root_url]))
+    urls = index.search('tbody td.views-field-title a').map{|x| x.values}.flatten
 
     urls.each do |url|
-	   	#url = 
-	    html = open(config[:base_url] + url).read
-		 #extract JSON with regex because passing whole JSON to RDFEventExtractor throws errors up.
-	    a = /<script type="application\/ld\+json">(.*)<\/script>/m.match(html)
+      #url =
+      html = open(config[:base_url] + url).read
+      # Extract the JSON-LD with a regex, because passing the whole JSON to RDFEventExtractor raises errors.
+      a = /<script type="application\/ld\+json">(.*?)<\/script>/m.match(html)
 
       if a
-	      events = Tess::Rdf::EventExtractor.new(a[1], :jsonld).extract { |p| Tess::API::Event.new(p) }
+        events = Tess::Rdf::EventExtractor.new(a[1], :jsonld).extract { |p| Tess::API::Event.new(p) }
 
-	      events.each do |event|
-	        event.content_provider = cp
-	        add_event(event)
+        events.each do |event|
+          event.content_provider = cp
+          add_event(event)
         end
       end
     end

diff --git a/run_scrapers.rb b/run_scrapers.rb
@@ -45,7 +45,7 @@
    NbisEventsScraper,
    NgsRegistryScraper,
    OpenTargetJsonScraper,
-   PortugalEventsScraper,
+   #PortugalEventsScraper,
    PraceEventsScraper,
    RssScraper,
    SheffieldScraper,