Skip to content
This repository was archived by the owner on Dec 12, 2024. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added .DS_Store
Binary file not shown.
13 changes: 12 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,22 @@ Requires Python >= 3.5
4. Edit configuration file. The file is annotated with descriptions of the configuration options.

## Running
To run batch-loader:
1. To run batch-loader with a CSV file as input:

source ENV/bin/activate
python batch_loader.py <path to csv>

2. To run batch-loader with an Excel file as input:

source ENV/bin/activate
python batch_loader.py <path to excel> --xlsx

## Adding/Deleting/modifying a required field name

- The required field names are configurable. To add, delete, or modify the required field names, follow the steps below:

1. Open the `config.py` file.
2. In the `required = (...)` tuple, add, delete, or modify the required field names.

## Specification of CSV

Expand Down
65 changes: 50 additions & 15 deletions batch_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,21 +7,17 @@
import os
import shutil
import subprocess
import config

import xlrd
from collections import OrderedDict

log = logging.getLogger(__name__)

required_field_names = (
'files',
'first_file',
'resource_type1',
'title1',
'creator1',
'license1',
'rights_statement',
'object_id'
)


required_field_names = config.required

def load_csv(filepath):
"""
Reads CSV and returns field names, rows
Expand All @@ -31,6 +27,30 @@ def load_csv(filepath):
reader = csv.DictReader(csvfile)
return reader.fieldnames, list(reader)

def load_excel(filepath):
    """
    Reads Excel and returns field names, rows

    The first row of the first worksheet supplies the field names. Every
    following row is returned as one OrderedDict mapping field name to the
    cell value in that column.

    :param filepath: path of the Excel workbook to read
    :return: tuple of (list of field names, list of OrderedDict rows)
    """
    log.debug('Loading excel')
    wb = xlrd.open_workbook(filepath)
    sheet = wb.sheet_by_index(0)

    # An empty sheet has no header row to read field names from.
    if sheet.nrows == 0:
        return [], []

    field_names = sheet.row_values(0)

    rows = []
    for row_index in range(1, sheet.nrows):
        # Build a fresh dict for each row. Appending one shared dict would
        # make every entry in rows alias the same object, so all rows would
        # end up holding the values of the last row.
        rows.append(OrderedDict(zip(field_names, sheet.row_values(row_index))))

    return field_names, rows

def validate_field_names(field_names):
log.debug('Validating field names')
Expand Down Expand Up @@ -142,21 +162,36 @@ def repo_import(repo_metadata_filepath, title, first_file, other_files, reposito

parser = argparse.ArgumentParser(description='Loads into GW Scholarspace from CSV')
parser.add_argument('--debug', action='store_true')
parser.add_argument('csv', help='filepath of CSV file')
parser.add_argument('filepath', help='filepath of CSV/Excel file')

parser.add_argument('--xlsx', action='store_true')

args = parser.parse_args()


if args.xlsx:

# Load the input with load_excel when --xlsx is passed
field_names, rows = load_excel(args.filepath)

else:

field_names, rows = load_csv(args.filepath)

# Otherwise the input is loaded with load_csv

logging.basicConfig(
level=logging.DEBUG if args.debug else logging.INFO
)
logging.basicConfig(level=logging.DEBUG)

field_names, rows = load_csv(args.csv)
log.info('Loading {} object from {}'.format(len(rows), args.csv))

#log.info('Loading {} object from {}'.format(len(rows), args.csv))
validate_field_names(field_names)
singular_field_names, repeating_field_names = analyze_field_names(field_names)

base_filepath = os.path.dirname(os.path.abspath(args.csv))
base_filepath = os.path.dirname(os.path.abspath(args.filepath))


for row in rows:
metadata = create_repository_metadata(row, singular_field_names, repeating_field_names)
Expand All @@ -181,4 +216,4 @@ def repo_import(repo_metadata_filepath, title, first_file, other_files, reposito
raise e
finally:
if (not config.debug_mode) and os.path.exists(metadata_filepath):
shutil.rmtree(metadata_temp_path, ignore_errors=True)
shutil.rmtree(metadata_temp_path, ignore_errors=True)
21 changes: 15 additions & 6 deletions example.config.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,23 @@
# GW ScholarSpace ingest configuration
"""# GW ScholarSpace ingest configuration
ingest_path = "/opt/scholarspace/scholarspace-hyrax"
ingest_command = "rvmsudo RAILS_ENV=production rake gwss:ingest_etd"

debug_mode = False

# Example for fake_rake:
"""
ingest_path = "example/"
# Command location is relative to ingest path
ingest_command = "python ../fake_rake.py"

ingest_path = "./example/"
# # Command location is relative to ingest path
ingest_command = "python3 ../fake_rake.py"
#
debug_mode = True
"""

# Add/Delete the required values below
required = ('files',
'first_file',
'resource_type1',
'title1',
'creator1',
'license1',
'rights_statement',
'object_id')
Binary file added example/.DS_Store
Binary file not shown.
Binary file added example/ExcelRead.xlsx
Binary file not shown.
4 changes: 2 additions & 2 deletions example/example.csv
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
files,resource_type1,title1,creator1,creator2,resource_type2,license1,first_file,object_id,depositor,rights_statement
,Treatise,Our Treatise,"Littman, Justin","Trent, Rachel",Journal,http://creativecommons.org/licenses/by/3.0/us/,,d504rk891,[email protected],http://rightsstatements.org/vocab/InC/1.0/
my_book,book,My Book,"Littman, Justin",,Journal,http://creativecommons.org/licenses/by/3.0/us/,my_book/intro.txt,,[email protected],http://rightsstatements.org/vocab/InC/1.0/
,Treatise,Our Treatise,Littman-Justin,Trent-Rachel,Journal,http://creativecommons.org/licenses/by/3.0/us/,,d504rk891,[email protected],http://rightsstatements.org/vocab/InC/1.0/
my_book,book,My Book,Littman-Justin,,Journal,http://creativecommons.org/licenses/by/3.0/us/,my_book/intro.txt,,[email protected],http://rightsstatements.org/vocab/InC/1.0/
Binary file added example/example11.csv
Binary file not shown.
Binary file added example/my_book/.DS_Store
Binary file not shown.
2 changes: 1 addition & 1 deletion fake_rake.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,4 @@

if __name__ == '__main__':
print(sys.argv, file=sys.stderr)
print(random.randint(1, 1000000))
print(random.randint(1, 1000000))
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
xlrd==1.2.0