diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..56dc842 Binary files /dev/null and b/.DS_Store differ diff --git a/README.md b/README.md index dbb6c4a..835e103 100644 --- a/README.md +++ b/README.md @@ -20,11 +20,22 @@ Requires Python >= 3.5 4. Edit configuration file. The file is annotated with descriptions of the configuration options. ## Running -To run batch-loader: +1. To run batch-loader with a CSV file as input: source ENV/bin/activate python batch_loader.py +2. To run batch-loader with an Excel file as input: + + source ENV/bin/activate + python batch_loader.py --xlsx <path to Excel file> + +## Adding, deleting, or modifying a required field name + +- The required field names are configurable. To add, delete, or modify a required field name, follow the steps below: + + 1. Open the config.py file + 2. In the 'required' tuple, add, delete, or modify the field names as needed ## Specification of CSV diff --git a/batch_loader.py b/batch_loader.py index f0cee6c..983c29b 100644 --- a/batch_loader.py +++ b/batch_loader.py @@ -7,21 +7,17 @@ import os import shutil import subprocess +import config + +import xlrd +from collections import OrderedDict log = logging.getLogger(__name__) -required_field_names = ( - 'files', - 'first_file', - 'resource_type1', - 'title1', - 'creator1', - 'license1', - 'rights_statement', - 'object_id' -) +required_field_names = config.required + def load_csv(filepath): """ Reads CSV and returns field names, rows @@ -31,6 +27,30 @@ def load_csv(filepath): reader = csv.DictReader(csvfile) return reader.fieldnames, list(reader) +def load_excel(filepath): + """ + Reads Excel and returns field names, rows + """ + log.debug('Loading excel') + wb = xlrd.open_workbook(filepath) + sheet = wb.sheet_by_index(0) + sheet.cell_value(0, 0) + + list_1 = [] + #list_1 is the list of field names passed in the first row + od = OrderedDict() + list_2 = [] + #list_2 is the list of rows; each entry appended to it is an OrderedDict + for i in
range(sheet.ncols): + list_1.insert(i, sheet.cell_value(0, i)) + + for j in range(1, sheet.nrows): + od = OrderedDict() # new dict per row; appending one shared dict would make every row alias the last row's values + for i in range(sheet.ncols): + od[sheet.cell_value(0, i)] = sheet.cell_value(j, i) + list_2.append(od) + + return list_1,list_2 def validate_field_names(field_names): log.debug('Validating field names') @@ -142,21 +162,36 @@ def repo_import(repo_metadata_filepath, title, first_file, other_files, reposito parser = argparse.ArgumentParser(description='Loads into GW Scholarspace from CSV') parser.add_argument('--debug', action='store_true') - parser.add_argument('csv', help='filepath of CSV file') + parser.add_argument('filepath', help='filepath of CSV/Excel file') + + parser.add_argument('--xlsx', action='store_true') args = parser.parse_args() + + if args.xlsx: + + #enter to load_excel method if --xlsx is passed + field_names, rows = load_excel(args.filepath) + + else: + + field_names, rows = load_csv(args.filepath) + + #else load_csv method is passed + logging.basicConfig( level=logging.DEBUG if args.debug else logging.INFO ) logging.basicConfig(level=logging.DEBUG) - field_names, rows = load_csv(args.csv) - log.info('Loading {} object from {}'.format(len(rows), args.csv)) + + log.info('Loading {} object from {}'.format(len(rows), args.filepath)) validate_field_names(field_names) singular_field_names, repeating_field_names = analyze_field_names(field_names) - base_filepath = os.path.dirname(os.path.abspath(args.csv)) + base_filepath = os.path.dirname(os.path.abspath(args.filepath)) + for row in rows: metadata = create_repository_metadata(row, singular_field_names, repeating_field_names) @@ -181,4 +216,4 @@ def repo_import(repo_metadata_filepath, title, first_file, other_files, reposito raise e finally: if (not config.debug_mode) and os.path.exists(metadata_filepath): - shutil.rmtree(metadata_temp_path, ignore_errors=True) + shutil.rmtree(metadata_temp_path, ignore_errors=True) \ No newline at end of file diff --git a/example.config.py b/example.config.py index
0ad1822..df8e183 100644 --- a/example.config.py +++ b/example.config.py @@ -1,4 +1,4 @@ -# GW ScholarSpace ingest configuration +"""# GW ScholarSpace ingest configuration ingest_path = "/opt/scholarspace/scholarspace-hyrax" ingest_command = "rvmsudo RAILS_ENV=production rake gwss:ingest_etd" @@ -6,9 +6,18 @@ # Example for fake_rake: """ -ingest_path = "example/" -# Command location is relative to ingest path -ingest_command = "python ../fake_rake.py" - +ingest_path = "./example/" +# Command location is relative to ingest path +ingest_command = "python3 ../fake_rake.py" +# debug_mode = True -""" + +# Required field names; add or delete entries below as needed +required = ('files', + 'first_file', + 'resource_type1', + 'title1', + 'creator1', + 'license1', + 'rights_statement', + 'object_id') diff --git a/example/.DS_Store b/example/.DS_Store new file mode 100644 index 0000000..ac62099 Binary files /dev/null and b/example/.DS_Store differ diff --git a/example/ExcelRead.xlsx b/example/ExcelRead.xlsx new file mode 100644 index 0000000..aa8a63d Binary files /dev/null and b/example/ExcelRead.xlsx differ diff --git a/example/example.csv b/example/example.csv index 57654b4..0e82a99 100644 --- a/example/example.csv +++ b/example/example.csv @@ -1,3 +1,3 @@ files,resource_type1,title1,creator1,creator2,resource_type2,license1,first_file,object_id,depositor,rights_statement -,Treatise,Our Treatise,"Littman, Justin","Trent, Rachel",Journal,http://creativecommons.org/licenses/by/3.0/us/,,d504rk891,kerchner@gwu.edu,http://rightsstatements.org/vocab/InC/1.0/ -my_book,book,My Book,"Littman, Justin",,Journal,http://creativecommons.org/licenses/by/3.0/us/,my_book/intro.txt,,openaccess@gwu.edu,http://rightsstatements.org/vocab/InC/1.0/ +,Treatise,Our Treatise,Littman-Justin,Trent-Rachel,Journal,http://creativecommons.org/licenses/by/3.0/us/,,d504rk891,kerchner@gwu.edu,http://rightsstatements.org/vocab/InC/1.0/ +my_book,book,My
Book,Littman-Justin,,Journal,http://creativecommons.org/licenses/by/3.0/us/,my_book/intro.txt,,openaccess@gwu.edu,http://rightsstatements.org/vocab/InC/1.0/ \ No newline at end of file diff --git a/example/example11.csv b/example/example11.csv new file mode 100644 index 0000000..ed5c0e9 Binary files /dev/null and b/example/example11.csv differ diff --git a/example/my_book/.DS_Store b/example/my_book/.DS_Store new file mode 100644 index 0000000..de89453 Binary files /dev/null and b/example/my_book/.DS_Store differ diff --git a/fake_rake.py b/fake_rake.py index 2d1814b..8f43549 100755 --- a/fake_rake.py +++ b/fake_rake.py @@ -5,4 +5,4 @@ if __name__ == '__main__': print(sys.argv, file=sys.stderr) - print(random.randint(1, 1000000)) + print(random.randint(1, 1000000)) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..bd208ae --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +xlrd==1.2.0