Skip to content

fix: create multiple dataframes from same CSVReader #44

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@
<dependency>
<groupId>de.siegmar</groupId>
<artifactId>fastcsv</artifactId>
<version>2.2.2</version>
<version>3.6.0</version>
</dependency>
<dependency>
<groupId>mysql</groupId>
Expand Down
120 changes: 42 additions & 78 deletions src/main/java/com/cefriel/template/io/csv/CSVReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,113 +16,77 @@

package com.cefriel.template.io.csv;

import com.cefriel.template.io.Reader;
import com.cefriel.template.utils.TemplateFunctions;
import de.siegmar.fastcsv.reader.NamedCsvReader;
import de.siegmar.fastcsv.reader.NamedCsvRow;
import de.siegmar.fastcsv.reader.CsvReader;
import de.siegmar.fastcsv.reader.NamedCsvRecord;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
import java.util.stream.Collectors;

public class CSVReader implements Reader {
public class CSVReader extends CSVReaderAbstract {

public NamedCsvReader document;
private boolean hashVariable;
private boolean onlyDistinct;
private final List<NamedCsvRecord> csvRecords;

public CSVReader(File file) throws IOException {
if (Files.exists(file.toPath()))
this.document = NamedCsvReader.builder().build(file.toPath());
else
if (Files.exists(file.toPath())) {
try (CsvReader<NamedCsvRecord> input = CsvReader.builder().ofNamedCsvRecord(file.toPath())) {
this.csvRecords = input.stream().collect(Collectors.toList());
}
headers = csvRecords.get(0).getHeader();
} else {
throw new IllegalArgumentException("File does not exist: " + file.getPath());
}
}

public CSVReader(String csv) {
this.document = NamedCsvReader.builder().build(csv);
}
@Override
public void setQueryHeader(String header) {

}

@Override
public void appendQueryHeader(String s) {

public CSVReader(String csv) throws IOException {
try (CsvReader<NamedCsvRecord> input = CsvReader.builder().ofNamedCsvRecord(csv)) {
this.csvRecords = input.stream().collect(Collectors.toList());
}
headers = csvRecords.get(0).getHeader();
}

public List<Map<String, String>> getDataframe() throws Exception {
Set<String> headers = this.document.getHeader();
String[] columns = headers.toArray(new String[0]);
return getDataframe(columns);
}

@Override
public List<Map<String, String>> getDataframe(String query) throws Exception {
String[] columns = query.split(",");
return getDataframe(columns);
if (csvRecords.isEmpty()) {
return Collections.emptyList();
}
return getDataframe(headers.toArray(new String[0]));
}

public List<Map<String, String>> getDataframe(String... columns) throws Exception {
Set<String> headers = this.document.getHeader();

// Return entire dataframe if no columns are provided or if empty string is provided
if ((columns == null || columns.length == 0) || (columns.length == 1 && columns[0].isEmpty()))
if (csvRecords.isEmpty()) {
return Collections.emptyList();
}

if ((columns == null || columns.length == 0) || (columns.length == 1 && columns[0].isEmpty())) {
return getDataframe();

}

int columnCount = 0;
for(String c : columns) {
if (!headers.contains(c))
for (String c : columns) {
if (!headers.contains(c)) {
throw new IllegalArgumentException("Column " + c + " not found");
columnCount += 1;
}
columnCount++;
}
// TODO Check if rowCount can be obtained to properly initialise the collection capacity
Collection<Map<String,String>> dataframe;
if (onlyDistinct)
dataframe = new ArrayList<>();
else
dataframe = new HashSet<>();
for (NamedCsvRow row : this.document) {
HashMap<String, String> map = new HashMap<>(columnCount);

// initialize collection with max possible size. Could be fewer rows if only distinct rows are requested in the dataframe.
int rowCount = csvRecords.size();
Collection<Map<String, String>> dataframe = onlyDistinct ? new HashSet<>(rowCount) : new ArrayList<>(rowCount);

for (NamedCsvRecord row : csvRecords) {
Map<String, String> map = new HashMap<>(columnCount);
for (String c : columns) {
if(hashVariable)
if (hashVariable) {
map.put(TemplateFunctions.literalHash(c), row.getField(c));
else
} else {
map.put(c, row.getField(c));
}
}
dataframe.add(map);
}
return new ArrayList<>(dataframe);
}

@Override
public void debugQuery(String query, Path destinationPath) throws Exception {

}
@Override
public void setVerbose(boolean verbose) {}

/**
* Not implemented for CSVReader yet.
* @param outputFormat String identifying the output format
*/
@Override
public void setOutputFormat(String outputFormat) { return;}

@Override
public void setHashVariable(boolean hashVariable) {
this.hashVariable = hashVariable;
}

@Override
public void setOnlyDistinct(boolean onlyDistinct) {
this.onlyDistinct = onlyDistinct;
}

@Override
public void shutDown() {

}
}
69 changes: 69 additions & 0 deletions src/main/java/com/cefriel/template/io/csv/CSVReaderAbstract.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
/*
* Copyright (c) 2019-2023 Cefriel.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.cefriel.template.io.csv;

import com.cefriel.template.io.Reader;
import com.cefriel.template.utils.TemplateFunctions;
import de.siegmar.fastcsv.reader.NamedCsvRecord;

import java.nio.file.Path;
import java.util.*;

/**
 * Shared base for the CSV-backed {@link Reader} implementations in this package.
 *
 * <p>It stores the configuration flags set through the {@link Reader} setters,
 * provides empty implementations for the {@link Reader} operations that the CSV
 * readers do not act on, and translates a comma-separated column query into a
 * call to {@link #getDataframe(String...)}. Subclasses supply the actual row
 * extraction.
 */
public abstract class CSVReaderAbstract implements Reader {

    /** Column names of the CSV input; assigned by the concrete subclass. */
    List<String> headers;
    /** Value last passed to {@link #setHashVariable(boolean)}; read by subclasses when building rows. */
    boolean hashVariable;
    /** Value last passed to {@link #setOnlyDistinct(boolean)}; read by subclasses when building rows. */
    boolean onlyDistinct;

    /** No-op: this base class does nothing with the query header. */
    @Override
    public void setQueryHeader(String header) {}

    /** No-op: this base class does nothing with the query header. */
    @Override
    public void appendQueryHeader(String s) {}

    /**
     * Returns the dataframe containing every column of the CSV input.
     *
     * @return one map per row, keyed by column
     * @throws Exception if the subclass fails to produce the rows
     */
    public abstract List<Map<String, String>> getDataframe() throws Exception ;

    /**
     * Returns the dataframe restricted to the columns named in {@code query}.
     *
     * <p>The query is split on {@code ","} as-is (column names are not trimmed)
     * and handed to {@link #getDataframe(String...)}. A {@code null} query is
     * treated as "no column selection" and yields the full dataframe instead of
     * failing with a {@link NullPointerException}.
     *
     * @param query comma-separated column names, or {@code null} for all columns
     * @return one map per row, keyed by column
     * @throws Exception if the subclass fails to produce the rows
     */
    @Override
    public List<Map<String, String>> getDataframe(String query) throws Exception {
        if (query == null) {
            return getDataframe();
        }
        String[] columns = query.split(",");
        return getDataframe(columns);
    }

    /**
     * Returns the dataframe restricted to the given columns.
     *
     * @param columns names of the columns to include
     * @return one map per row, keyed by column
     * @throws Exception if the subclass fails to produce the rows
     */
    public abstract List<Map<String, String>> getDataframe(String... columns) throws Exception ;

    /** No-op: this base class does not implement query debugging. */
    @Override
    public void debugQuery(String query, Path destinationPath) throws Exception {}

    /** No-op: the verbose flag is ignored by this base class. */
    @Override
    public void setVerbose(boolean verbose) {}

    /** No-op: the output format is ignored by this base class. */
    @Override
    public void setOutputFormat(String outputFormat) {}

    /**
     * Stores the flag in {@link #hashVariable}.
     *
     * @param hashVariable new value of the flag
     */
    @Override
    public void setHashVariable(boolean hashVariable) {
        this.hashVariable = hashVariable;
    }

    /**
     * Stores the flag in {@link #onlyDistinct}.
     *
     * @param onlyDistinct new value of the flag
     */
    @Override
    public void setOnlyDistinct(boolean onlyDistinct) {
        this.onlyDistinct = onlyDistinct;
    }

    /** No-op here; subclasses holding resources may override. */
    @Override
    public void shutDown() {}
}
82 changes: 82 additions & 0 deletions src/main/java/com/cefriel/template/io/csv/CSVStreamReader.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
/*
* Copyright (c) 2019-2023 Cefriel.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.cefriel.template.io.csv;

import com.cefriel.template.utils.TemplateFunctions;
import de.siegmar.fastcsv.reader.CsvReader;
import de.siegmar.fastcsv.reader.NamedCsvRecord;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.util.*;

public class CSVStreamReader extends CSVReaderAbstract {

public CsvReader<NamedCsvRecord> document;

public CSVStreamReader(File file) throws IOException {
if (Files.exists(file.toPath())) {
try (CsvReader<NamedCsvRecord> input = CsvReader.builder().ofNamedCsvRecord(file.toPath())) {
headers = input.stream().findFirst().orElseThrow().getHeader();
}
this.document = CsvReader.builder().ofNamedCsvRecord(file.toPath());
} else
throw new IllegalArgumentException("File does not exist: " + file.getPath());
}

public CSVStreamReader(String csv) throws IOException {
try (CsvReader<NamedCsvRecord> input = CsvReader.builder().ofNamedCsvRecord(csv)) {
headers = input.stream().findFirst().orElseThrow().getHeader();
}
this.document = CsvReader.builder().ofNamedCsvRecord(csv);
}

public List<Map<String, String>> getDataframe() throws Exception {
String[] columns = headers.toArray(new String[0]);
return getDataframe(columns);
}

public List<Map<String, String>> getDataframe(String... columns) throws Exception {
// Return entire dataframe if no columns are provided or if empty string is provided
if ((columns == null || columns.length == 0) || (columns.length == 1 && columns[0].isEmpty()))
return getDataframe();

int columnCount = 0;
for(String c : columns) {
if (!headers.contains(c))
throw new IllegalArgumentException("Column " + c + " not found");
columnCount += 1;
}
// For stream behaviour rowCount can not be obtained
Collection<Map<String, String>> dataframe = onlyDistinct ? new HashSet<>() : new ArrayList<>();

final int mapSize = columnCount;
this.document.stream().forEach(row -> {
HashMap<String, String> map = new HashMap<>(mapSize);
for (String c : columns) {
if(hashVariable)
map.put(TemplateFunctions.literalHash(c), row.getField(c));
else
map.put(c, row.getField(c));
}
dataframe.add(map);
});

return new ArrayList<>(dataframe);
}
}
6 changes: 1 addition & 5 deletions src/main/java/com/cefriel/template/io/rdf/RDFReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -152,11 +152,7 @@ public List<Map<String,Value>> executeQuery(String query) {
*/
private List<Map<String,String>> getQueryResultsStringValue(String query) {
List<Map<String,Value>> valueResults = executeQuery(query);
Collection<Map<String,String>> dataframe;
if (onlyDistinct)
dataframe = new ArrayList<>();
else
dataframe = new HashSet<>();
Collection<Map<String, String>> dataframe = onlyDistinct ? new HashSet<>() : new ArrayList<>();
for(Map<String,Value> row : valueResults) {
if (hashVariable)
dataframe.add(row.entrySet().stream()
Expand Down
8 changes: 2 additions & 6 deletions src/main/java/com/cefriel/template/io/sql/SQLReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -124,12 +124,8 @@ public ResultSet executeQuery(String query) {
private List<Map<String, String>> populateDataframe(int rowCount, ResultSet resultSet, String filterVariables) throws SQLException {
ResultSetMetaData metaData = resultSet.getMetaData();
int columnCount = metaData.getColumnCount();

Collection<Map<String,String>> dataframe;
if (onlyDistinct)
dataframe = new ArrayList<>(rowCount);
else
dataframe = new HashSet<>(rowCount);

Collection<Map<String, String>> dataframe = onlyDistinct ? new HashSet<>(rowCount) : new ArrayList<>(rowCount);

List<String> filters = null;
if (filterVariables != null)
Expand Down
6 changes: 1 addition & 5 deletions src/main/java/com/cefriel/template/io/xml/XMLReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -102,11 +102,7 @@ public List<Map<String, String>> getQueryResultsStringValue(String query) throws
XQueryExpression exp = sqc.compileQuery(query);
SequenceIterator iter = exp.iterator(dynamicContext);
// TODO Check if rowCount can be obtained to properly initialise the ArrayList capacity
Collection<Map<String,String>> dataframe;
if (onlyDistinct)
dataframe = new ArrayList<>();
else
dataframe = new HashSet<>();
Collection<Map<String, String>> dataframe = onlyDistinct ? new HashSet<>() : new ArrayList<>();

while (true) {
Item item = iter.next();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -550,11 +550,14 @@ public static String encodeURIComponent(String component) {
} else
component = URLEncoder.encode(component, StandardCharsets.UTF_8);

// TODO Check how to generalize this
for (char c : component.toCharArray()) {
if (c == '+')
builder.append("%20");
else if (c == '*')
builder.append("%2A");
else if (c == '/')
builder.append("%2F");
else
builder.append(c);
}
Expand All @@ -579,7 +582,7 @@ public void setBaseIRI(String baseIRI) {
public String resolveIRI(String s) throws Exception {
if(s != null) {
if (!isAbsoluteURI(s)) {
s = baseIRI + s;
s = baseIRI + encodeURIComponent(s);
s = new URI(s).toString();
} else {
URLComponents url = new URLComponents(s);
Expand Down
Loading