#!/usr/bin/env python
# Move bitstreams from local assetstore to S3 bucket. If a single internal ID is provided, just moves that one bitstream to S3.
__author__ = 'daisie'
import os
import re
import sys
import json
import hashlib
from sql_utils import list_from_query, sql_query
ASSETSTORE_BUCKET = os.environ['ASSETSTORE_BUCKET']
ASSETSTORE_PATH = '/opt/dryad-data/assetstore/'
def get_assetstore_path(internal_id):
    """Return the local filesystem path for a bitstream's internal_id.

    DSpace-style assetstore layout: the first three byte-pairs of the
    internal_id name the nested subdirectories, so 'abcdef...' resolves
    to <ASSETSTORE_PATH>ab/cd/ef/abcdef...
    """
    subdirs = [internal_id[i:i + 2] for i in range(0, 6, 2)]
    return ASSETSTORE_PATH + '/'.join(subdirs) + '/' + internal_id
def validate_s3_file(bitstream):
    """Return True iff this bitstream already exists in S3 with matching
    byte size and md5 checksum (the md5 is kept in S3 object metadata)."""
    head_cmd = 'aws s3api head-object --bucket %s --key "%s"' % (ASSETSTORE_BUCKET, bitstream['internal_id'])
    output = os.popen(head_cmd).read()
    if output == "":
        # head-object printed nothing: object absent (or CLI failure).
        return False
    head = json.loads(output)
    if 'md5' not in head['Metadata']:
        return False
    sizes_match = long(bitstream['size_bytes']) == long(head['ContentLength'])
    checksums_match = bitstream['checksum'] == head['Metadata']['md5']
    return sizes_match and checksums_match
def update_database(bitstream_id):
    # Record a completed migration: flip the row's assetstore to
    # store_number=1 (the S3 store).  bitstream_id originates from the
    # bitstream table itself, so the %-interpolated SQL is tolerated here,
    # though a parameterized query would be safer.
    print "Updating database..."
    # NOTE(review): sql_query appears to return a file-like handle whose
    # .read() yields the command output — confirm against sql_utils.
    print sql_query("update bitstream set store_number=1 where bitstream_id=%s" % (bitstream_id)).read()
def main():
if len(sys.argv) == 2:
bitstream_id = int(sys.argv[1])
bitstreams = list_from_query("select bitstream_id, internal_id, checksum, size_bytes from bitstream where bitstream_id=%s" % (bitstream_id))
else:
print "Gathering bitstreams..."
bitstreams = list_from_query("select bitstream_id, internal_id, checksum, size_bytes from bitstream where deleted=false and store_number=0 order by bitstream_id ASC")
print "Processing %d local bitstreams" % (len(bitstreams))
for bitstream in bitstreams:
internal_id = bitstream['internal_id']
md5 = bitstream['checksum']
bitstream_id = bitstream['bitstream_id']
size = bitstream['size_bytes']
print "Checking to see if %s exists at S3..." % (internal_id)
if (validate_s3_file(bitstream)):
print "File %s already exists at S3" % (internal_id)
update_database(bitstream_id)
else:
print "Copying %s to s3..." % (internal_id)
sys.stdout.flush()
cmd = 'aws s3 cp "%s" "s3://%s/%s" --metadata md5=%s --expected-size=%s' % (get_assetstore_path(internal_id), ASSETSTORE_BUCKET, internal_id, md5, size)
if (os.popen(cmd).close() is None):
print "Verifying file size and md5 of %s..." % (internal_id)
if validate_s3_file(bitstream):
update_database(bitstream_id)
else:
print "S3 copy does not match local copy, skipping database update."
else:
print "AWS copy error, exiting"
exit(1)
sys.stdout.flush()
print "Done."
if __name__ == '__main__':
main()