-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathreplace_largefile_bitstream.py
More file actions
executable file
·169 lines (148 loc) · 5.85 KB
/
replace_largefile_bitstream.py
File metadata and controls
executable file
·169 lines (148 loc) · 5.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
#!/usr/bin/env python
__author__ = 'dan'
import re
import os
import sys
import shutil
import hashlib
import mimetypes
from sql_utils import dict_from_query, sql_query
# Root directory of the asset store on disk. Keep the trailing slash:
# get_assetstore_path() appends sub-directories to it by plain string concatenation.
ASSETSTORE_PATH = '/opt/dryad-data/assetstore/'
def check_write_access():
    '''
    Report whether the current process may write to ASSETSTORE_PATH.
    '''
    can_write = os.access(ASSETSTORE_PATH, os.W_OK)
    return can_write
def get_bitstream_id():
    '''
    Prompt repeatedly until the answer yields a bitstream ID, then return it as an int.

    An answer is accepted when it starts with "http" and contains either
    "bitstream/id/<digits>/" or "bitstreamID=<digits>", or when it simply
    starts with digits. Patterns are tried in that order.
    '''
    patterns = (
        r'http.*bitstream\/id\/(\d+)\/.*',
        r'http.*bitstreamID=(\d+).*',
        r'(\d+)',
    )
    while True:
        answer = raw_input('Enter the bitstream ID or URL: ')
        for pattern in patterns:
            found = re.match(pattern, answer)
            if found is not None:
                # First pattern that matches wins; its capture group is the ID.
                return int(found.group(1))
class bitstream_file(object):
    '''
    Basic facts about a file on disk: its path, size in bytes, base name and
    guessed MIME type, plus an MD5 digest computed on request.
    '''

    def __init__(self, path):
        self.path = path
        self.size = os.path.getsize(path)
        self.name = os.path.basename(path)
        # guess_type returns (type, encoding); only the type is kept.
        guessed_type, _encoding = mimetypes.guess_type(path)
        self.mimetype = guessed_type

    def md5(self):
        '''
        Return the hex MD5 digest of the file, read in 64 KiB chunks.

        Approach adapted from http://joelverhagen.com/blog/2011/02/md5-hash-of-file-in-python/
        '''
        digest = hashlib.md5()
        with open(self.path, 'rb') as handle:
            # iter() with a sentinel stops at the first empty read, i.e. end of file.
            for chunk in iter(lambda: handle.read(65536), b''):
                digest.update(chunk)
        return digest.hexdigest()

    def __unicode__(self):
        summary = (self.name, self.size, self.md5(), self.mimetype)
        return u'Name: %s, Size: %d, MD5: %s, Mime-Type: %s' % summary
def get_largefile_path():
    '''
    Prompt for the location of the large file and return the answer unchanged.
    '''
    prompt = 'Enter the path on the filesystem to the large file: '
    return raw_input(prompt)
def verify_file(bitstream_id, file):
    '''
    Check a file's size and MD5 digest against the bitstream row stored for bitstream_id.

    Prints a diagnostic and returns False when no row is found or at the first
    mismatch; returns True when both the size and the checksum agree.
    '''
    row = query_bitstream_table(bitstream_id)
    if row is None:
        print("No file found for bitstream_id %d" % bitstream_id)
        return False

    expected_size = int(row['size_bytes'])
    if expected_size != file.size:
        print("Size mismatch: %d / %d" % (expected_size, file.size))
        return False

    # The digest is only computed once the cheaper size comparison has passed.
    actual_md5 = file.md5()
    expected_md5 = row['checksum']
    if expected_md5 != actual_md5:
        print("MD5 mismatch: %s / %s" % (expected_md5, actual_md5))
        return False

    return True
def get_assetstore_path(bitstream_id):
    '''
    Build the on-disk location of a bitstream inside the asset store.

    The first six characters of the row's internal_id, taken two at a time,
    name three nested directories under ASSETSTORE_PATH; the file itself is
    named by the full internal_id. Raises Exception when no row is found.
    '''
    row = query_bitstream_table(bitstream_id)
    if row is None:
        raise Exception("Unable to get bitstream info from database")
    internal_id = row['internal_id']
    components = [internal_id[start:start + 2] for start in (0, 2, 4)]
    components.append(internal_id)
    return ASSETSTORE_PATH + '/'.join(components)
def place_largefile(bitstream_id, largefile):
    '''
    Overwrite the asset store file for bitstream_id with the contents of largefile.

    Raises Exception when os.access reports the destination as not writable
    (which is also the case when the destination does not exist).
    '''
    target = get_assetstore_path(bitstream_id)
    writable = os.access(target, os.W_OK)
    if not writable:
        raise Exception("Unable to get write access on the destination path")
    source = largefile.path
    print("Copying '%s' -> '%s'" % (source, target))
    shutil.copyfile(source, target)
def query_bitstream_table(bitstream_id):
    '''
    Select the bitstream row for bitstream_id and return what dict_from_query yields for it.
    '''
    template = (
        'SELECT bitstream_format_id, name, size_bytes, checksum, checksum_algorithm, source, internal_id '
        'FROM bitstream WHERE bitstream_id = %d'
    )
    return dict_from_query(template % bitstream_id)
def query_bitstream_format(large_file):
    '''
    Look up the bitstream format registered for the file's extension.

    The extension is the text after the last "." in large_file.name (empty
    when the name has none). Returns whatever dict_from_query yields for the
    lookup against bitstreamformatregistry joined with fileextension.
    '''
    extension = ""
    extension_match = re.match(r"^.*\.(.+)$", large_file.name)
    if extension_match is not None:
        extension = extension_match.group(1)
    # The extension comes from a user-supplied file name and is spliced into
    # the SQL text, so double any single quote to keep it inside the literal.
    # NOTE(review): assumes the server treats backslashes literally
    # (standard_conforming_strings on) - confirm for this database.
    # '%' and '_' in the extension still act as LIKE wildcards.
    quoted_extension = extension.replace("'", "''")
    sql = "SELECT bitstreamformatregistry.* FROM bitstreamformatregistry, fileextension WHERE fileextension.extension LIKE '%s' AND bitstreamformatregistry.bitstream_format_id=fileextension.bitstream_format_id" % quoted_extension
    return dict_from_query(sql)
def update_bitstream_table(bitstream_id, large_file):
    '''
    Point the bitstream row for bitstream_id at the replacement file.

    Sets size_bytes, name, source, checksum and bitstream_format_id from
    large_file, printing the SQL and the output of running it. Uses
    bitstream_format_id 1 when no format is found for the file's extension.
    '''
    format_dict = query_bitstream_format(large_file)
    # NOTE(review): "(0 rows)" presumably is what dict_from_query hands back
    # when the lookup matches nothing - verify against sql_utils.
    if format_dict is None or format_dict['bitstream_format_id'] == "(0 rows)":
        format_id = 1
    else:
        format_id = format_dict['bitstream_format_id']  # stays a string
    # File names can contain apostrophes (e.g. "O'Brien.csv"). Double them so
    # the name stays inside its quoted SQL literal instead of breaking the
    # UPDATE after the file has already been copied into place.
    # NOTE(review): assumes the server treats backslashes literally
    # (standard_conforming_strings on) - confirm for this database.
    quoted_name = large_file.name.replace("'", "''")
    sql = "UPDATE bitstream set size_bytes=%d, name='%s', source='%s' ,checksum='%s', bitstream_format_id=%s where bitstream_id = %d" % (
        large_file.size,
        quoted_name,
        quoted_name,
        large_file.md5(),
        format_id,
        bitstream_id
    )
    print("Executing SQL: %s" % sql)
    print(sql_query(sql).read())
def main():
    '''
    Replace the asset store file behind a bitstream with a large file and update its row.

    The bitstream ID and the large file's path are taken from the command
    line when exactly two arguments are given; otherwise both are prompted for.
    Exits with status 0 when the replaced file verifies against the database,
    1 when it does not, and -1 when the asset store is not writable or either
    file cannot be read.
    '''
    if not check_write_access():
        print("Cannot get write access to %s, check permissions or user account" % ASSETSTORE_PATH)
        # sys.exit rather than the site-provided exit(), which is not
        # guaranteed to exist (e.g. under "python -S").
        sys.exit(-1)

    if len(sys.argv) == 3:
        bitstream_id = int(sys.argv[1])
        largefile_path = sys.argv[2]
    else:
        bitstream_id = get_bitstream_id()
        largefile_path = get_largefile_path()
    print("Bitstream ID: %d" % bitstream_id)
    assetstore_path = get_assetstore_path(bitstream_id)

    try:
        largefile = bitstream_file(largefile_path)
        dummyfile = bitstream_file(assetstore_path)
    except (IOError, OSError) as e:
        # Only filesystem errors mean "unable to read"; Ctrl-C and
        # programming errors must not be reported as such.
        print("Unable to read file: %s" % e)
        sys.exit(-1)

    # Files are loaded.
    # Check the file currently in the asset store against the database.
    # NOTE(review): the result is ignored, so a mismatch is printed but does
    # not stop the replacement - confirm this is intended.
    verify_file(bitstream_id, dummyfile)

    # Copy the large file into place, then update the bitstream table.
    place_largefile(bitstream_id, largefile)
    update_bitstream_table(bitstream_id, largefile)

    # Verify the replaced file is right.
    replaced_file = bitstream_file(assetstore_path)
    if verify_file(bitstream_id, replaced_file):
        print("SUCCESS: %s can be deleted." % largefile_path)
        sys.exit(0)
    else:
        print("FAILURE: please report the output of this script to devs.")
        sys.exit(1)
# Run the replacement workflow only when executed as a script, not when imported.
if __name__ == '__main__':
    main()