winterberry/script/btaa_validate_ebook at master · mlibrary/winterberry · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#!/usr/bin/env ruby
# frozen_string_literal: true

# This script takes a Fulcrum monograph metadata CSV file and a
# list of ebook files (EPUB or PDF). For each ebook it logs the title
# found within the ebook next to the title in the metadata so the
# two can be compared.

require 'optparse'
require 'ostruct'
require 'os'

# Locate the code base root, which is the parent of the directory
# holding this script.
script_dir = File.dirname(File.expand_path(__FILE__))
root_dir = File.expand_path('..', script_dir)

require_relative File.join(root_dir, 'lib', 'logger')

# All script output goes through this logger, which writes to STDOUT.
script_logger = UMPTG::Logger.create(logger_fp: STDOUT)

# Process the script parameters.
#
# Usage: <csv_file> <ebook_file> [<ebook_file>..]
# The only supported option is -h/--help, which prints the usage and
# exits with status 0.
options = OpenStruct.new
options.write_mono_file = false
option_parser = OptionParser.new do |opts|
  opts.banner = "Usage: #{File.basename(__FILE__)} <csv_file> <ebook_file> [<ebook_file>..]"
  opts.on_tail('-h', '--help', 'Print this help message') do
    script_logger.info(opts)
    exit 0
  end
end

# An unrecognized option makes parse! raise; report it together with the
# usage text instead of letting the script die with a stack trace.
begin
  option_parser.parse!(ARGV)
rescue OptionParser::ParseError => e
  script_logger.error(e.message)
  script_logger.info(option_parser.help)
  exit 1
end

# A CSV file and at least one ebook file are required. A bad invocation
# is an error, so exit non-zero like the other failure paths in this script.
if ARGV.count < 2
  script_logger.info(option_parser.help)
  exit 1
end

# First positional argument is the metadata CSV; the rest are ebook files.
csv_file, *ebook_file_list = ARGV

require 'csv'
require 'origami'

require_relative File.join(root_dir, 'lib', 'epub')

# Resolve the CSV path to an absolute one; nothing else can be done
# without it, so a missing file ends the script with status 1.
csv_file = File.expand_path(csv_file)
if File.exist?(csv_file)
  script_logger.info("*** processing #{File.basename(csv_file)} ***")
  STDOUT.flush
else
  script_logger.error("invalid CSV file path #{csv_file}.")
  exit 1
end

=begin
# This should be removed once the metadata
# is updated.
isbn_map = {
        "9780299316983" => "9780299316990",
        "9780299150839" => "9780299150891",
        "9780299311186" => "9780299311193"
    }
=end

# Load the entire metadata file and parse it into a table whose rows
# are addressable by the column names from the header line.
fulcrum_body = File.read(csv_file)
fulcrum_csv = CSV.parse(fulcrum_body, headers: true, return_headers: false)

# For each ebook file, find its single matching CSV row and log the
# metadata title next to the title embedded in the ebook. Files that
# are missing, or that match zero or several CSV rows, are reported
# and skipped.
ebook_file_list.each do |ebook_file|
  ebook_file = File.expand_path(ebook_file)
  unless File.file?(ebook_file)
    script_logger.error("ebook file \"#{File.basename(ebook_file)}\" does not exist.")
    next
  end
  script_logger.info("processing ebook file \"#{File.basename(ebook_file)}\"")

  # The file name without its extension is the search key.
  isbn = File.basename(ebook_file, ".*")
  #isbn = isbn_map[isbn] if isbn_map.include?(isbn)

  # Search CSV for monograph metadata: keep the rows whose
  # ISBN_SEARCH field contains the key. Rows without that
  # field never match.
  fm_row_list = fulcrum_csv.select do |row|
    #(!row['ISBN(s)'].nil? and row['ISBN(s)'].gsub(/\-/,'').include?(isbn))
    row['ISBN_SEARCH']&.include?(isbn)
  end
  if fm_row_list.empty?
    script_logger.warn("no CSV row found for #{isbn}. Skipping.")
    next
  end
  if fm_row_list.count > 1
    script_logger.warn("multiple CSV rows found for #{isbn}. Skipping.")
    next
  end
  script_logger.info("Found CSV row for #{isbn}.")
  monograph_row = fm_row_list.first

  # Pull the title out of the ebook itself. The extension is compared
  # case-insensitively so that e.g. ".EPUB" and ".PDF" are recognized.
  case File.extname(ebook_file).downcase
  when '.epub'
    epub = UMPTG::EPUB::Archive.new(epub_file: ebook_file)
    title_nodelist = epub.opf_doc.xpath("//*[local-name()='metadata']/*[name()='dc:title']")
    # An OPF without a dc:title yields no nodes; leave the title nil
    # rather than raising on nil.
    ebook_title = title_nodelist.first&.text
  when '.pdf'
    pdf = Origami::PDF.read(ebook_file, lazy: true)
    ebook_title = pdf.title
    if ebook_title.nil? || ebook_title.strip.empty?
      # Fall back to the metadata title. NOTE(review): metadata appears
      # to be nil when the PDF carries no metadata stream, so guard the
      # lookup - confirm against the Origami documentation.
      ebook_title = (pdf.metadata || {})['title']
    end
    #puts pdf.metadata
  else
    ebook_title = 'unknown'
  end

  script_logger.info("Metadata Title: #{monograph_row['Title']}")
  script_logger.info("EBook Title   : #{ebook_title}")
  STDOUT.flush
end