Skip to content

Commit

Permalink
Merge pull request #95 from ElixirTeSS/fix_portugal_scraper
Browse files (browse the repository at this point in the history)
Fix Portugal scraper
  • Loading branch information
03c authored Oct 22, 2020
2 parents 2ba3812 + 53b174a commit 9ce9af8
Show file tree
Hide file tree
Showing 6 changed files with 82 additions and 54 deletions.
2 changes: 1 addition & 1 deletion Gemfile
Original file line number | Diff line number | Diff line change
Expand Up @@ -2,7 +2,7 @@ source 'https://rubygems.org'

gem 'geocoder'
gem 'inifile'
gem 'nokogiri', '~> 1.8.1'
gem 'nokogiri', '~> 1.10.9'
gem 'redcarpet'
gem 'simple-rss'
gem 'google_places'
Expand Down
18 changes: 10 additions & 8 deletions Gemfile.lock
Original file line number | Diff line number | Diff line change
Expand Up @@ -19,11 +19,12 @@ GIT
GEM
remote: https://rubygems.org/
specs:
activesupport (5.2.1)
activesupport (6.0.3.1)
concurrent-ruby (~> 1.0, >= 1.0.2)
i18n (>= 0.7, < 2)
minitest (~> 5.1)
tzinfo (~> 1.1)
zeitwerk (~> 2.2, >= 2.2.2)
addressable (2.7.0)
public_suffix (>= 2.0.2, < 5.0)
bcp47 (0.3.3)
Expand Down Expand Up @@ -52,7 +53,7 @@ GEM
domain_name (~> 0.5)
httparty (0.14.0)
multi_xml (>= 0.5.2)
i18n (1.5.1)
i18n (1.8.2)
concurrent-ruby (~> 1.0)
icalendar (2.4.1)
inifile (3.0.0)
Expand Down Expand Up @@ -98,17 +99,17 @@ GEM
mime-types (3.2.2)
mime-types-data (~> 3.2015)
mime-types-data (3.2018.0812)
mini_portile2 (2.3.0)
minitest (5.11.3)
mini_portile2 (2.4.0)
minitest (5.14.1)
multi_json (1.14.1)
multi_xml (0.5.5)
net (0.3.3)
activesupport
net-http-persistent (3.1.0)
connection_pool (~> 2.2)
netrc (0.11.0)
nokogiri (1.8.5)
mini_portile2 (~> 2.3.0)
nokogiri (1.10.9)
mini_portile2 (~> 2.4.0)
nokogumbo (1.5.0)
nokogiri
public_suffix (3.1.1)
Expand Down Expand Up @@ -201,11 +202,12 @@ GEM
temple (0.8.2)
thread_safe (0.3.6)
tilt (2.0.10)
tzinfo (1.2.5)
tzinfo (1.2.7)
thread_safe (~> 0.1)
unf (0.1.4)
unf_ext
unf_ext (0.0.7.5)
zeitwerk (2.3.0)

PLATFORMS
ruby
Expand All @@ -217,7 +219,7 @@ DEPENDENCIES
icalendar
inifile
linkeddata (~> 2.0)
nokogiri (~> 1.8.1)
nokogiri (~> 1.10.9)
redcarpet
sanitize
simple-rss
Expand Down
1 change: 1 addition & 0 deletions app/scrapers/edinburgh_scraper.rb
Original file line number | Diff line number | Diff line change
Expand Up @@ -34,6 +34,7 @@ def scrape
json = json[1].gsub('<!--//--><![CDATA[// ><!--', '')
json = json.gsub('//--><!]]>', '')
json = json.gsub('<br />', '')
json = json.gsub("\n", "")
event = Tess::Rdf::EventExtractor.new(json, :jsonld).extract { |p| Tess::API::Event.new(p) }.first
event.content_provider = cp
event.event_types = [:workshops_and_courses]
Expand Down
86 changes: 55 additions & 31 deletions app/scrapers/intermine_scraper.rb
Original file line number | Diff line number | Diff line change
@@ -1,40 +1,64 @@

class IntermineScraper < Tess::Scrapers::Scraper

def self.config
{
name: 'Intermine Scraper',
index_page: 'http://intermine.org/tutorials/'
name: "Intermine Scraper",
index_page: "http://intermine.org/tutorials/",
}
end

def scrape
cp = add_content_provider(Tess::API::ContentProvider.new(
title: "InterMine", #name
url: "http://intermine.org", #url
image_url: "https://cdn.rawgit.com/intermine/design-materials/ff1ec6bf/logos/intermine/intermine-300x37.png", #logo
description: "nterMine integrates biological data sources, making it easy to query and analyse data.", #description
content_provider_type: :project,
node_name: :'UK'
))
cp = add_content_provider(Tess::API::ContentProvider.new(
title: "InterMine", #name
url: "http://intermine.org", #url
image_url: "https://cdn.rawgit.com/intermine/design-materials/ff1ec6bf/logos/intermine/intermine-300x37.png", #logo
description: "nterMine integrates biological data sources, making it easy to query and analyse data.", #description
content_provider_type: :project,
node_name: :'UK',
))

#page = Nokogiri::HTML.parse(open(config[:index_page]).read)
index = "http://intermine.org/tutorials/"
page = Nokogiri::HTML.parse(open(index).read)
tutorials = page.xpath("/html/body/main/ul/li")

#page = Nokogiri::HTML.parse(open(config[:index_page]).read)
index = 'http://intermine.org/tutorials/'
page = Nokogiri::HTML.parse(open(index).read)
tutorials = page.xpath('/html/body/main/ul/li')

tutorials.each do |tutorial|
url = tutorial.children.first.attributes['href'].value
title = tutorial.text.split('-').first
description = tutorial.text
event = Tess::API::Material.new(
title: title,
url: url,
content_provider: cp,
short_description: description,
keywords: title.split(' ').first
)
add_material(event)
end
end
tutorials.each do |tutorial|
if !tutorial.children.first.attributes["href"].nil?
url = tutorial.children.first.attributes["href"].value
title = tutorial.text.split("-").first
description = tutorial.text
event = Tess::API::Material.new(
title: title,
url: url,
content_provider: cp,
short_description: description,
keywords: title.split(" ").first,
)
add_material(event)
end
end

#page = Nokogiri::HTML.parse(open(config[:index_page]).read)
index = "http://intermine.org/training-workshops/"
page = Nokogiri::HTML.parse(open(index).read)
tutorials = page.xpath("/html/body/div/section/ul/li/a")
tutorials.each do |tutorial|
if !tutorial.attributes["href"].nil?
url = tutorial.attributes["href"].value
# Complete relative URL paths
if !url.include? "http"
url = index + url
end
title = tutorial.children.text
description = title
event = Tess::API::Material.new(
title: title,
url: url,
content_provider: cp,
short_description: description,
keywords: title.split(" ").first,
)
add_material(event)
end
end
end
end
27 changes: 14 additions & 13 deletions app/scrapers/portugal_events_scraper.rb
Original file line number | Diff line number | Diff line change
Expand Up @@ -5,15 +5,16 @@ class PortugalEventsScraper < Tess::Scrapers::Scraper
def self.config
{
name: 'ELIXIR Portugal',
root_url: 'http://elixir-portugal.org/events',
base_url: 'http://elixir-portugal.org'
# TODO: change URL to https://biodata.pt/events before deploying
root_url: 'https://biodata.pt/past-events',
base_url: 'https://biodata.pt'
}
end

def scrape
cp = add_content_provider(Tess::API::ContentProvider.new(
{ title: "ELIXIR Portugal",
url: "https://elixir-portugal.org",
url: "https://biodata.pt",
image_url: "https://tess.elixir-europe.org/assets/nodes/logos/PT-9f2611b1953f3109fa81668d960e7068390f4ef69be8f4b950ec0e8d7b106503.png",
description: "ELIXIR Portugal is organized as a consortium of Portuguese research institutions which are part of the national biological information network, BioData.pt. Like ELIXIR itself, the Portuguese node is a decentralized network of specialized centers, under a common hardware and software infrastructure, and with shared training and industry/entrepreneurship programmes.",
content_provider_type: :organisation,
Expand All @@ -22,21 +23,21 @@ def scrape



index = Nokogiri::HTML(open(config[:root_url]))
urls = index.search('tbody a').map{|x| x.values}.flatten
index = Nokogiri::HTML(open(config[:root_url]))
urls = index.search('tbody td.views-field-title a').map{|x| x.values}.flatten

urls.each do |url|
#url =
html = open(config[:base_url] + url).read
#extract JSON with regex because passing whole JSON to RDFEventExtractor throws errors up.
a = /<script type="application\/ld\+json">(.*)<\/script>/m.match(html)
#url =
html = open(config[:base_url] + url).read
#extract JSON with regex because passing whole JSON to RDFEventExtractor throws errors up.
a = /<script type="application\/ld\+json">(.*?)<\/script>/m.match(html)

if a
events = Tess::Rdf::EventExtractor.new(a[1], :jsonld).extract { |p| Tess::API::Event.new(p) }
events = Tess::Rdf::EventExtractor.new(a[1], :jsonld).extract { |p| Tess::API::Event.new(p) }

events.each do |event|
event.content_provider = cp
add_event(event)
events.each do |event|
event.content_provider = cp
add_event(event)
end
end
end
Expand Down
2 changes: 1 addition & 1 deletion run_scrapers.rb
Original file line number | Diff line number | Diff line change
Expand Up @@ -45,7 +45,7 @@
NbisEventsScraper,
NgsRegistryScraper,
OpenTargetJsonScraper,
PortugalEventsScraper,
#PortugalEventsScraper,
PraceEventsScraper,
RssScraper,
SheffieldScraper,
Expand Down

0 comments on commit 9ce9af8

Please sign in to comment.