-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbtaa_validate_ebook
116 lines (98 loc) · 3.3 KB
/
btaa_validate_ebook
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#!/usr/bin/env ruby
# frozen_string_literal: true
# Script takes a Fulcrum monograph metadata CSV file and a
# list of ebook files (EPUB or PDF). For each ebook it logs the
# title found within the ebook next to the title in the metadata
# so that the two can be compared.
require 'optparse'
require 'ostruct'
require 'os'
# Locate the code base root: the parent of the directory holding this script.
script_dir = File.expand_path('..', __FILE__)
root_dir = File.expand_path('..', script_dir)

# Load the project logger from <root>/lib and create the script logger,
# handing it STDOUT as its output handle.
require_relative File.join(root_dir, 'lib', 'logger')
script_logger = UMPTG::Logger.create(logger_fp: STDOUT)
# Process the script parameters. The only option is -h/--help; the positional
# arguments are the metadata CSV file followed by one or more ebook files.
option_parser = OptionParser.new do |opts|
  opts.banner = "Usage: #{File.basename(__FILE__)} <csv_file> <ebook_file> [<ebook_file>..]"
  opts.on_tail('-h', '--help', 'Print this help message') do
    # Log the rendered help text (as the missing-argument branch below does)
    # rather than handing the parser object itself to the logger.
    script_logger.info(opts.help)
    exit 0
  end
end
option_parser.parse!(ARGV)

# A CSV file and at least one ebook file are required. Show the usage and
# exit with a non-zero status so callers can detect the bad invocation.
if ARGV.count < 2
  script_logger.info(option_parser.help)
  exit 1
end
csv_file = ARGV[0]
ebook_file_list = ARGV[1..-1]
require 'csv'
require 'origami'
require_relative File.join(root_dir, 'lib', 'epub')
# Resolve the CSV path and make sure it names a regular file before reading.
# File.file? (rather than File.exist?) also rejects a directory path, which
# would otherwise pass this check and then fail when the file is read.
csv_file = File.expand_path(csv_file)
unless File.file?(csv_file)
  script_logger.error("invalid CSV file path #{csv_file}.")
  exit 1
end
script_logger.info("*** processing #{File.basename(csv_file)} ***")
STDOUT.flush
=begin
# This should be removed once the metadata
# is updated.
isbn_map = {
"9780299316983" => "9780299316990",
"9780299150839" => "9780299150891",
"9780299311186" => "9780299311193"
}
=end
# Read the whole metadata CSV and parse it, treating the first line as the
# header row (the header row itself is not returned as data).
fulcrum_csv = CSV.parse(File.read(csv_file), headers: true, return_headers: false)
# For each ebook file, look up its metadata row by ISBN and log the metadata
# title next to the title embedded in the ebook so they can be compared.
ebook_file_list.each do |ebook_file|
  ebook_file = File.expand_path(ebook_file)
  unless File.file?(ebook_file)
    script_logger.error("ebook file \"#{File.basename(ebook_file)}\" does not exist.")
    next
  end
  script_logger.info("processing ebook file \"#{File.basename(ebook_file)}\"")

  # The file name without its extension is used as the ISBN search key.
  isbn = File.basename(ebook_file, '.*')
  #isbn = isbn_map[isbn] if isbn_map.include?(isbn)

  # Search the CSV for the monograph metadata. A row matches when its
  # ISBN_SEARCH value contains the ISBN; rows with no ISBN_SEARCH value
  # never match.
  fm_row_list = fulcrum_csv.select do |row|
    #(!row['ISBN(s)'].nil? and row['ISBN(s)'].gsub(/\-/,'').include?(isbn))
    row['ISBN_SEARCH']&.include?(isbn)
  end

  # Exactly one matching row is required; anything else is skipped.
  if fm_row_list.empty?
    script_logger.warn("no CSV row found for #{isbn}. Skipping.")
    next
  end
  if fm_row_list.count > 1
    script_logger.warn("multiple CSV rows found for #{isbn}. Skipping.")
    next
  end
  script_logger.info("Found CSV row for #{isbn}.")
  monograph_row = fm_row_list.first

  # Extract the title from the ebook. The extension is compared
  # case-insensitively so that e.g. ".EPUB" and ".PDF" are recognized.
  case File.extname(ebook_file).downcase
  when '.epub'
    epub = UMPTG::EPUB::Archive.new(epub_file: ebook_file)
    title_nodelist = epub.opf_doc.xpath("//*[local-name()='metadata']/*[name()='dc:title']")
    # Safe navigation: the OPF may contain no dc:title element at all.
    ebook_title = title_nodelist.first&.text
  when '.pdf'
    pdf = Origami::PDF.read(ebook_file, lazy: true)
    ebook_title = pdf.title
    if ebook_title.nil? || ebook_title.strip.empty?
      # Fall back to the 'title' entry of the PDF metadata.
      # NOTE(review): Origami's PDF#metadata appears to return nil when the
      # document has no metadata stream - confirm. The "|| {}" covers that.
      ebook_title = (pdf.metadata || {})['title']
    end
  else
    ebook_title = 'unknown'
  end

  script_logger.info("Metadata Title: #{monograph_row['Title']}")
  script_logger.info("EBook Title : #{ebook_title}")
  STDOUT.flush
end