Skip to content

Commit 0abf5fa

Browse files
committed
Allow matcher file
1 parent b03deb4 commit 0abf5fa

File tree

1 file changed

+134
-13
lines changed
  • dspace/src/main/edu/georgetown/library/fileAnalyzer/importer

1 file changed

+134
-13
lines changed

dspace/src/main/edu/georgetown/library/fileAnalyzer/importer/EAD2DAO.java

Lines changed: 134 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,9 @@
22

33
import java.io.File;
44
import java.io.IOException;
5-
import java.text.ParseException;
6-
import java.text.SimpleDateFormat;
7-
import java.util.Date;
85
import java.util.HashMap;
96
import java.util.TreeMap;
107
import java.util.Vector;
11-
import java.util.regex.Pattern;
128

139
import javax.xml.transform.TransformerException;
1410

@@ -19,6 +15,8 @@
1915
import gov.nara.nwts.ftapp.ActionResult;
2016
import gov.nara.nwts.ftapp.FTDriver;
2117
import gov.nara.nwts.ftapp.Timer;
18+
import gov.nara.nwts.ftapp.ftprop.FTPropEnum;
19+
import gov.nara.nwts.ftapp.ftprop.FTPropFile;
2220
import gov.nara.nwts.ftapp.ftprop.FTPropString;
2321
import gov.nara.nwts.ftapp.importer.DefaultImporter;
2422
import gov.nara.nwts.ftapp.importer.DelimitedFileReader;
@@ -37,12 +35,28 @@
3735
*/
3836
public class EAD2DAO extends DefaultImporter {
3937

40-
public static enum EAD2DCStatsItems implements StatsItemEnum {
41-
Record(StatsItem.makeStringStatsItem("Record", 100).setExport(false));
42-
38+
public static final String P_MATCHTYPE = "match-type";
39+
public static final String P_DCCSV = "csv-file";
40+
public static final String P_MATCH = "match-col";
41+
public static final String P_NAME = "name-col";
42+
public static final String P_DAOID = "daoid-col";
43+
public static final String P_LINK = "link-col";
44+
public static final String P_THUMB = "thumb-col";
45+
private FTPropFile dcFile;
46+
public static enum EAD2DAOStatsItems implements StatsItemEnum {
47+
Record(StatsItem.makeStringStatsItem("Record", 100).setExport(false)),
48+
Field_Name(StatsItem.makeStringStatsItem("Field Name").setInitVal("TBD")),
49+
EAD_ID(StatsItem.makeStringStatsItem("EAD ID", 100)),
50+
REF_ID(StatsItem.makeStringStatsItem("REF ID", 150)),
51+
DigitalObjectId(StatsItem.makeStringStatsItem("Digital Object ID", 150)),
52+
DigitalObjectTitle(StatsItem.makeStringStatsItem("Digital Object Title", 150)),
53+
PublishDAO(StatsItem.makeEnumStatsItem(TF.class, "Publish Digital Object Record")),
54+
DAOLink(StatsItem.makeStringStatsItem("File URL of Linked-to digital object", 150)),
55+
DAOThumbnail(StatsItem.makeStringStatsItem("File URL of Thumbnail", 150));
56+
;
4357
StatsItem si;
4458

45-
EAD2DCStatsItems(StatsItem si) {
59+
EAD2DAOStatsItems(StatsItem si) {
4660
this.si = si;
4761
}
4862

@@ -59,11 +73,41 @@ public Stats create(String key) {
5973
}
6074

6175
public static StatsItemConfig details = StatsItemConfig
62-
.create(EAD2DCStatsItems.class);
76+
.create(EAD2DAOStatsItems.class);
77+
78+
/** Value set backing the "Publish Digital Object Record" enum stats column. */
public static enum TF {
    TRUE,
    FALSE
}
79+
public static enum EAD_MATCHER {
80+
TITLE(4),
81+
AS_REFID(2);
82+
int index;
83+
EAD_MATCHER(int index) {
84+
this.index = index;
85+
}
86+
}
6387

6488

6589
public EAD2DAO(FTDriver dt) {
6690
super(dt);
91+
dcFile = new FTPropFile(this.dt, this.getClass().getSimpleName(), P_DCCSV, P_DCCSV, "CSV file containing columns to match, Optional", "");
92+
this.ftprops.add(new FTPropEnum(dt, this.getClass().getSimpleName(),
93+
P_MATCHTYPE, P_MATCHTYPE,
94+
"Name of EAD field to match in CSV file", EAD_MATCHER.values(), EAD_MATCHER.TITLE));
95+
ftprops.add(dcFile);
96+
this.ftprops.add(new FTPropString(dt, this.getClass().getSimpleName(),
97+
P_MATCH, P_MATCH,
98+
"Name of column to match","dc.title[en]"));
99+
this.ftprops.add(new FTPropString(dt, this.getClass().getSimpleName(),
100+
P_DAOID, P_DAOID,
101+
"Name of column to assign as a DAO identifier","dc.identifier.uri[en]"));
102+
this.ftprops.add(new FTPropString(dt, this.getClass().getSimpleName(),
103+
P_NAME, P_NAME,
104+
"Name of column to use as a DAO name","dc.title[en]"));
105+
this.ftprops.add(new FTPropString(dt, this.getClass().getSimpleName(),
106+
P_LINK, P_LINK,
107+
"Name of column to assign as a DAO link","dc.identifier.uri[en]"));
108+
this.ftprops.add(new FTPropString(dt, this.getClass().getSimpleName(),
109+
P_THUMB, P_THUMB,
110+
"Name of column to assign as a thumbnail url","thumbnail-link"));
67111
}
68112

69113
public String toString() {
@@ -77,8 +121,77 @@ public String getShortName() {
77121
return "EAD2DAO";
78122
}
79123

124+
private HashMap<String,Vector<String>> mapVals = new HashMap<>();
125+
private int i_match = -1;
126+
private int i_dao = -1;
127+
private int i_name = -1;
128+
private int i_link = -1;
129+
private int i_thumb = -1;
130+
131+
public void initMapFile() throws IOException {
132+
i_match = -1;
133+
i_dao = -1;
134+
i_name = -1;
135+
i_link = -1;
136+
i_thumb = -1;
137+
mapVals.clear();
138+
File f = dcFile.getFile();
139+
if (f == null) {
140+
return;
141+
}
142+
if (!f.exists()) {
143+
return;
144+
}
145+
DelimitedFileReader dfr = new DelimitedFileReader(f, ",");
146+
Vector<String> header = dfr.getRow();
147+
for(int i=0; i<header.size(); i++) {
148+
String s = header.get(i);
149+
if (this.getProperty(P_MATCH).equals(s)) {
150+
i_match = i;
151+
}
152+
if (this.getProperty(P_DAOID).equals(s)) {
153+
i_dao = i;
154+
}
155+
if (this.getProperty(P_NAME).equals(s)) {
156+
i_name = i;
157+
}
158+
if (this.getProperty(P_LINK).equals(s)) {
159+
i_link = i;
160+
}
161+
if (this.getProperty(P_THUMB).equals(s)) {
162+
i_thumb = i;
163+
}
164+
}
165+
if (i_match == -1) {
166+
return;
167+
}
168+
for(Vector<String>row=dfr.getRow(); row!=null; row=dfr.getRow()) {
169+
String key = normalizeKey(row.get(i_match));
170+
mapVals.put(key, row);
171+
}
172+
}
173+
174+
public String normalizeKey(String s) {
175+
return s.toLowerCase()
176+
.replaceAll("[^a-z0-9]", " ")
177+
.replaceAll(" +", " ");
178+
}
179+
180+
public String getMapValue(String key, int col, String def) {
181+
if (col < 0) {
182+
return def;
183+
}
184+
key = normalizeKey(key);
185+
if (!mapVals.containsKey(key)) {
186+
return def;
187+
}
188+
return mapVals.get(key).get(col);
189+
}
190+
80191
public ActionResult importFile(File selectedFile) throws IOException {
81-
details = StatsItemConfig.create(EAD2DCStatsItems.class);
192+
details = StatsItemConfig.create(EAD2DAOStatsItems.class);
193+
EAD_MATCHER matcher = (EAD_MATCHER)getProperty(P_MATCHTYPE);
194+
initMapFile();
82195
HashMap<String, Object> params = new HashMap<>();
83196
Timer timer = new Timer();
84197
TreeMap<String, Stats> types = new TreeMap<String, Stats>();
@@ -89,14 +202,22 @@ public ActionResult importFile(File selectedFile) throws IOException {
89202
XMLUtil.doTransform(d, csv, "edu/georgetown/library/fileAnalyzer/ead-dao.xsl", params);
90203
DelimitedFileReader dfr = new DelimitedFileReader(csv, ",");
91204
Vector<String> header = dfr.getRow();
92-
for(String col: header) {
93-
details.addStatsItem(col, StatsItem.makeStringStatsItem(col));
94-
}
95205
int rownum = 1_000_000;
96206
for(Vector<String>row=dfr.getRow(); row!=null; row=dfr.getRow()) {
97207
String key = ""+rownum++;
98208
Stats stats = Generator.INSTANCE.create(key);
99209
types.put(key, stats);
210+
if (row.size() >= 8) {
211+
String matchkey = row.get(matcher.index);
212+
stats.setVal(EAD2DAOStatsItems.Field_Name, row.get(0));
213+
stats.setVal(EAD2DAOStatsItems.EAD_ID, row.get(1));
214+
stats.setVal(EAD2DAOStatsItems.REF_ID, row.get(2));
215+
stats.setVal(EAD2DAOStatsItems.DigitalObjectId, getMapValue(matchkey, i_dao, row.get(3)));
216+
stats.setVal(EAD2DAOStatsItems.DigitalObjectTitle, getMapValue(matchkey, i_name, row.get(4)));
217+
stats.setVal(EAD2DAOStatsItems.PublishDAO, row.get(5));
218+
stats.setVal(EAD2DAOStatsItems.DAOLink, getMapValue(matchkey, i_link, row.get(6)));
219+
stats.setVal(EAD2DAOStatsItems.DAOThumbnail, getMapValue(matchkey, i_thumb, row.get(7)));
220+
}
100221
for(int i=0; i<header.size(); i++) {
101222
String s = row.size() > i ? row.get(i) : "";
102223
String col = header.get(i);

0 commit comments

Comments
 (0)