Skip to content

Commit 0228c62

Browse files
authored
Merge pull request #3730 from mlibrary/HELIO-4125/index_epub_accessibility_metadata
HELIO-4125 - index EPUB accessibility metadata
2 parents 9e50892 + b0b5eaf commit 0228c62

5 files changed

+446
-4
lines changed

app/indexers/monograph_indexer.rb

+9
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,9 @@ def generate_solr_document # rubocop:disable Metrics/PerceivedComplexity, Metric
8686

8787
# HELIO-2428 index the "full" doi url if there's a doi
8888
solr_doc['doi_url_ssim'] = "https://doi.org/" + object.doi if object.doi.present?
89+
90+
# HELIO-4125 - Extract EPUB metadata on ingest and store in Solr
91+
maybe_index_accessibility_metadata(solr_doc)
8992
end
9093
end
9194

@@ -170,4 +173,10 @@ def all_product_names_for_monograph(obj)
170173
all_product_names << "Unrestricted" if all_product_ids.include?(0)
171174
all_product_names + Greensub::Product.where(id: all_product_ids).map { |product| product.name || product.identifier }
172175
end
176+
177+
# HELIO-4125: when this Monograph has an 'epub' FeaturedRepresentative, hand its FileSet id and the
# in-progress Solr document to EpubAccessibilityMetadataIndexingService. Does nothing (returns nil) otherwise.
def maybe_index_accessibility_metadata(solr_doc)
  epub_representative = FeaturedRepresentative.where(work_id: object.id, kind: 'epub').first
  EpubAccessibilityMetadataIndexingService.index(epub_representative.file_set_id, solr_doc) if epub_representative.present?
end
173182
end
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
# frozen_string_literal: true
2+
3+
class EpubAccessibilityMetadataIndexingService
  class << self
    # Copies the EPUB version and accessibility metadata found in an unpacked EPUB's OPF package file onto a
    # Monograph Solr document. Called from MonographIndexer.
    #
    # Nothing indexed here is crucial to Fulcrum's core features, so this bails out quietly (returning nil and
    # leaving solr_doc untouched, or only partially populated) whenever an expected directory, file or element
    # is missing, rather than getting in the way of the Monograph document being created.
    #
    # All fields are strings (not English text), stored and indexed, with the expected cardinality, which should
    # allow any/all of them to be used in Blacklight facets.
    #
    # @param epub_file_set_id [String] id (NOID) of the EPUB FileSet, used to locate the unpacked EPUB directory
    # @param solr_doc [Hash] the Solr document being built; mutated in place
    # @return [String, nil] the derived screen-reader-friendly value when indexing ran to completion, else nil
    def index(epub_file_set_id, solr_doc)
      root_path = UnpackService.root_path_from_noid(epub_file_set_id, 'epub')
      return unless Dir.exist? root_path

      container_file = File.join(root_path, "META-INF/container.xml")
      # File.file? rather than File.exist? so that a directory is never handed to the XML parser
      return unless File.file? container_file

      container = parse_xml_file(container_file)
      # Accessibility metadata extraction requires one, and only one, rendition to be present.
      return unless container.xpath("//rootfile").length == 1

      # An empty full-path would resolve to root_path itself (a directory), hence File.file? again.
      content_file = File.join(root_path, container.xpath("//rootfile/@full-path").text)
      return unless File.file? content_file

      content = parse_xml_file(content_file)
      package_element = content.at_css('package')
      epub_version = package_element['version']&.strip if package_element.present?
      return if epub_version.blank?

      # Indexing this for no particular reason. Might be useful at some point.
      solr_doc['epub_version_ssi'] = epub_version

      # This should allow the relevant values to be detected for both EPUB 2 and EPUB 3, though only the latter
      # should be present on Fulcrum. EPUB 2 puts the key in `name` and the value in `content`; otherwise the key
      # is in `property` and the value is the element's text.
      @epub_2 = epub_version.start_with?('2')
      @meta_attribute = @epub_2 ? 'name' : 'property'

      @content_metadata = content.at_css('metadata')
      return if @content_metadata.blank?

      # for these schema values see https://kb.daisy.org/publishing/docs/metadata/schema.org/
      solr_doc['epub_a11y_accessibility_summary_ssi'] = accessibility_summary
      solr_doc['epub_a11y_accessibility_features_ssim'] = accessibility_features
      solr_doc['epub_a11y_access_mode_ssim'] = access_mode
      # kept in an instance variable because screen_reader_friendly derives its value from it
      @access_mode_sufficient = access_mode_sufficient
      solr_doc['epub_a11y_access_mode_sufficient_ssim'] = @access_mode_sufficient

      # for these evaluation values see https://kb.daisy.org/publishing/docs/metadata/evaluation.html
      solr_doc['epub_a11y_conforms_to_ssi'] = conforms_to
      solr_doc['epub_a11y_certified_by_ssi'] = certified_by
      solr_doc['epub_a11y_certifier_credential_ssi'] = certifier_credential

      # This is a derived value for convenience in Fulcrum UI use.
      solr_doc['epub_a11y_screen_reader_friendly_ssi'] = screen_reader_friendly
    end

    # @return [String, nil] stripped value of the first schema:accessibilitySummary meta tag, nil if absent
    def accessibility_summary
      single_meta_value('schema:accessibilitySummary')
    end

    # @return [Array<String>, nil] values of all schema:accessibilityFeature meta tags, nil if there are none
    def accessibility_features
      multiple_meta_values('schema:accessibilityFeature')
    end

    # @return [Array<String>, nil] values of all schema:accessMode meta tags, nil if there are none
    def access_mode
      multiple_meta_values('schema:accessMode')
    end

    # This one has multiple entries in one value, comma separated. Only the first matching meta tag is read.
    #
    # @return [Array<String>, nil] the stripped, non-empty entries; nil (never []) when there are none, which
    #   keeps the field off the doc entirely
    def access_mode_sufficient
      node = @content_metadata.at_css("meta[#{@meta_attribute}='schema:accessModeSufficient']")
      return nil if node.blank?

      raw_value = @epub_2 ? node['content'] : node.text
      raw_value.to_s.split(',').map(&:strip).reject(&:empty?).presence
    end

    # @return [String, nil] stripped value of the first dcterms:conformsTo meta tag, nil if absent
    def conforms_to
      single_meta_value('dcterms:conformsTo')
    end

    # @return [String, nil] stripped value of the first a11y:certifiedBy meta tag, nil if absent
    def certified_by
      single_meta_value('a11y:certifiedBy')
    end

    # @return [String, nil] stripped value of the first a11y:certifierCredential meta tag, nil if absent
    def certifier_credential
      single_meta_value('a11y:certifierCredential')
    end

    # Derived from @access_mode_sufficient, which `index` sets before calling this.
    #
    # @return [String] 'true' when the only sufficient access mode is 'textual', 'false' for any other non-empty
    #   list, and 'unknown' when nothing was found (so this always has a value, even if all the other a11y
    #   metadata is missing)
    def screen_reader_friendly
      return 'unknown' if @access_mode_sufficient.blank?

      @access_mode_sufficient == ['textual'] ? 'true' : 'false'
    end

    private

      # Parses the XML file at `path` with namespaces removed. The block form of File.open closes the handle
      # as soon as parsing is done.
      def parse_xml_file(path)
        File.open(path) { |file| Nokogiri::XML(file) }.remove_namespaces!
      end

      # Stripped value carried by a single meta node: the `content` attribute for EPUB 2, the text otherwise.
      def meta_text(node)
        @epub_2 ? node['content']&.strip : node.text&.strip
      end

      # Value of the first meta tag whose name/property attribute equals `key`, or nil when there is no such tag.
      def single_meta_value(key)
        node = @content_metadata.at_css("meta[#{@meta_attribute}='#{key}']")
        return nil if node.blank?

        meta_text(node)
      end

      # Values of every meta tag whose name/property attribute equals `key`, skipping tags with no usable value.
      # Returns nil, not [], when nothing is left, keeping the field off the doc entirely.
      def multiple_meta_values(key)
        nodes = @content_metadata.css("meta[#{@meta_attribute}='#{key}']")
        nodes.map { |node| meta_text(node) }.reject(&:blank?).presence
      end
  end
end

spec/indexers/monograph_indexer_spec.rb

+13-4
Original file line numberDiff line numberDiff line change
@@ -85,14 +85,23 @@
8585
end
8686
end
8787

88-
context "ebook representative table of contents" do
88+
context "ebook metadata indexed at the Monograph level" do
8989
before do
9090
create(:featured_representative, work_id: monograph.id, file_set_id: file_set.id, kind: "epub")
91-
UnpackJob.perform_now(file_set.id, "epub") # to index the epub's table of contents HELIO-3870
91+
UnpackJob.perform_now(file_set.id, "epub") # to index the epub's table of contents HELIO-3870 and a11y metadata HELIO-4125
9292
end
9393

94-
it "indexes the epub/pdf_ebook's ToC if there is one" do
95-
expect(subject['table_of_contents_tesim']).to include("Chapter 73. Stubb and Flask Kill a Right Whale; and Then Have a Talk")
94+
context 'ebook representative table of contents' do
95+
it "indexes the epub/pdf_ebook's ToC if there is one" do
96+
expect(subject['table_of_contents_tesim']).to include("Chapter 73. Stubb and Flask Kill a Right Whale; and Then Have a Talk")
97+
end
98+
end
99+
100+
context 'Runs EpubAccessibilityMetadataIndexingService' do
101+
it 'indexes fields from the EPUB OPF "package" file on the Monograph' do
102+
# checking that `version` was indexed will suffice, EpubAccessibilityMetadataIndexingServiceSpec covers them all
103+
expect(subject['epub_version_ssi']).to eq("3.0")
104+
end
96105
end
97106
end
98107

spec/jobs/unpack_job_spec.rb

+12
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,15 @@
2424
monograph.ordered_members << reflowable_epub
2525
monograph.save!
2626
reflowable_epub.save!
27+
# the `find` and `parent` stubs are necessary for the `update_index` stub to work
28+
allow(FileSet).to receive(:find).with(reflowable_epub.id).and_return(reflowable_epub)
29+
allow(reflowable_epub).to receive(:parent).and_return(monograph)
30+
allow(monograph).to receive(:update_index)
2731
end
2832

2933
it "unzips the epub, caches the ToC, creates the search database and doesn't make chapter files derivatives" do
34+
# check that we're reindexing to get the ToC and accessibility metadata on the parent Monograph's Solr doc
35+
expect(monograph).to receive(:update_index)
3036
described_class.perform_now(reflowable_epub.id, 'epub')
3137
expect(JSON.parse(EbookTableOfContentsCache.find_by(noid: reflowable_epub.id).toc).length).to eq 3
3238
expect(JSON.parse(EbookTableOfContentsCache.find_by(noid: reflowable_epub.id).toc)[0]["title"]).to eq "Damage report!"
@@ -48,9 +54,15 @@
4854
monograph.ordered_members << fixed_layout_epub
4955
monograph.save!
5056
fixed_layout_epub.save!
57+
# the `find` and `parent` stubs are necessary for the `update_index` stub to work
58+
allow(FileSet).to receive(:find).with(fixed_layout_epub.id).and_return(fixed_layout_epub)
59+
allow(fixed_layout_epub).to receive(:parent).and_return(monograph)
60+
allow(monograph).to receive(:update_index)
5161
end
5262

5363
it "unzips the epub, caches the ToC, creates the search database and makes the chapter files derivatives" do
64+
# check that we're reindexing to get the ToC and accessibility metadata on the parent Monograph's Solr doc
65+
expect(monograph).to receive(:update_index)
5466
described_class.perform_now(fixed_layout_epub.id, 'epub')
5567
expect(File.exist?(File.join(root_path, fixed_layout_epub.id + '.db'))).to be true
5668
expect(Dir.exist?(chapters_dir)).to be true

0 commit comments

Comments
 (0)