33
44module OAI
55 module Harvester
6-
76 class Harvest
7+ DIRECTORY_LAYOUT = "%Y/%m" . freeze
88
# Sets up a harvest run.
#
# config    - configuration object; Config.load is used when nil.
# directory - storage directory; config.storage is used when nil.
# date      - optional value stored as @from (frozen in place).
# to        - optional value stored as @until (frozen in place); when set,
#             harvest uses it instead of the current time as the harvest time.
#
# The fourth argument is optional, so existing three-argument callers
# keep working unchanged.
def initialize(config = nil, directory = nil, date = nil, to = nil)
  @config = config || Config.load
  @directory = directory || @config.storage
  # freeze returns its receiver, so each ivar holds the caller's own object
  # (now frozen), or nil when the argument was omitted.
  @from = date.freeze
  @until = to.freeze
  # Pick the libxml parser only when XML::Document is already defined.
  @parser = defined?(XML::Document) ? 'libxml' : 'rexml'
end
1618
@@ -30,7 +32,11 @@ def start(sites = nil, interactive = false)
3032
3133 def harvest ( site )
3234 opts = build_options_hash ( @config . sites [ site ] )
33- harvest_time = Time . now . utc
35+ if @until
36+ harvest_time = @until . to_time . utc
37+ else
38+ harvest_time = Time . now . utc
39+ end
3440
3541 if "YYYY-MM-DD" == granularity ( opts [ :url ] )
3642 opts [ :until ] = harvest_time . strftime ( "%Y-%m-%d" )
@@ -43,22 +49,27 @@ def harvest(site)
4349 # Allow a from date to be passed in
4450 opts [ :from ] = earliest ( opts [ :url ] ) unless opts [ :from ]
4551 opts . delete ( :set ) if 'all' == opts [ :set ]
46-
4752 begin
4853 # Connect, and download
4954 file , records = call ( opts . delete ( :url ) , opts )
5055
51- # Move document to storage directory
52- dir = File . join ( @directory , date_based_directory ( harvest_time ) )
53- FileUtils . mkdir_p dir
54- FileUtils . mv ( file . path ,
55- File . join ( dir , "#{ site } -#{ filename ( Time . parse ( opts [ :from ] ) ,
56- harvest_time ) } .xml.gz") )
56+ # Move document to storage directory if configured
57+ if @directory
58+ directory_layout = @config . layouts [ site ] if @config . layouts
59+ dir = File . join ( @directory , date_based_directory ( harvest_time , directory_layout ) )
60+ FileUtils . mkdir_p dir
61+ FileUtils . mv ( file . path ,
62+ File . join ( dir , "#{ site } -#{ filename ( Time . parse ( opts [ :from ] ) ,
63+ harvest_time ) } .xml.gz") )
64+ else
65+ puts "no configured destination for temp file" if @interactive
66+ end
5767 @config . sites [ site ] [ 'last' ] = harvest_time
58- rescue
59- raise $! unless $!. respond_to? ( :code )
60- raise $! if not @interactive || "noRecordsMatch" != $!. code
61- puts "No new records available"
68+ rescue OAI ::NoMatchException
69+ puts "No new records available" if @interactive
70+ rescue OAI ::Exception => ex
71+ raise ex if not @interactive
72+ puts ex . message
6273 end
6374 end
6475
@@ -69,15 +80,15 @@ def call(url, opts)
6980 records = 0 ;
7081 client = OAI ::Client . new ( url , :parser => @parser )
7182 provider_config = client . identify
72-
83+
7384 file = Tempfile . new ( 'oai_data' )
7485 gz = Zlib ::GzipWriter . new ( file )
7586 gz << "<?xml version=\" 1.0\" encoding=\" UTF-8\" ?>\n "
7687 gz << "<records>"
7788 begin
7889 response = client . list_records ( options )
79- get_records ( response . doc ) . each do |rec |
80- gz << rec
90+ response . each do |rec |
91+ gz << rec . _source
8192 records += 1
8293 end
8394 puts "#{ records } records retrieved" if @interactive
@@ -89,8 +100,8 @@ def call(url, opts)
89100 puts "\n resumption token recieved, continuing" if @interactive
90101 response = client . list_records ( :resumption_token =>
91102 response . resumption_token )
92- get_records ( response . doc ) . each do |rec |
93- gz << rec
103+ response . each do |rec |
104+ gz << rec . _source
94105 records += 1
95106 end
96107 puts "#{ records } records retrieved" if @interactive
@@ -118,8 +129,9 @@ def build_options_hash(site)
118129 options
119130 end
120131
# Builds the date-derived subdirectory name used when storing a harvest.
#
# time             - object responding to strftime (harvest passes its
#                    UTC harvest time).
# directory_layout - optional strftime pattern; when nil the class default
#                    Harvest::DIRECTORY_LAYOUT is used.
#
# Returns the formatted directory string.
def date_based_directory(time, directory_layout = nil)
  # strftime already yields a String, so no interpolation wrapper is needed.
  time.strftime(directory_layout || Harvest::DIRECTORY_LAYOUT)
end
124136
125137 def filename ( from_time , until_time )
@@ -147,4 +159,4 @@ def earliest(url)
147159 end
148160
149161 end
150- end
162+ end
0 commit comments