Skip to content

Commit 1eb0dc9

Browse files
committed
HeritrixGatherer: Ignore file not found exceptions while walking tree
1 parent ffe1974 commit 1eb0dc9

File tree

2 files changed

+53
-17
lines changed

2 files changed

+53
-17
lines changed

gatherer/src/pandas/gatherer/core/WorkingArea.java

+6
Original file line number | Diff line number | Diff line change
@@ -225,6 +225,12 @@ public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOEx
225225
Files.deleteIfExists(dir);
226226
return FileVisitResult.CONTINUE;
227227
}
228+
229+
/**
 * Called when a file could not be visited during the tree walk.
 * Logs a warning with the path and the exception, then lets the walk
 * continue instead of propagating the failure.
 *
 * @param file the path that could not be visited
 * @param exc  the I/O exception that prevented the visit
 * @return always {@link FileVisitResult#CONTINUE}
 */
@Override
230+
public FileVisitResult visitFileFailed(Path file, IOException exc) throws IOException {
231+
log.warn("Error visiting {}", file, exc);
232+
return FileVisitResult.CONTINUE;
233+
}
228234
});
229235
}
230236

gatherer/src/pandas/gatherer/heritrix/HeritrixGatherer.java

+47-17
Original file line number | Diff line number | Diff line change
@@ -10,10 +10,11 @@
1010
import pandas.gatherer.repository.Repository;
1111

1212
import java.io.IOException;
13-
import java.nio.file.Files;
14-
import java.nio.file.Path;
13+
import java.nio.file.*;
14+
import java.nio.file.attribute.BasicFileAttributes;
1515
import java.util.ArrayList;
1616
import java.util.List;
17+
import java.util.Set;
1718

1819
import static pandas.gatherer.heritrix.HeritrixClient.State.*;
1920

@@ -127,26 +128,55 @@ public void archive(Instance instance) throws IOException {
127128
Path jobDir = jobDir(instance);
128129
List<Path> warcs = new ArrayList<>();
129130
List<Artifact> artifacts = new ArrayList<>();
130-
for (Path file : Files.walk(jobDir).toList()) {
131-
Path relpath = jobDir.relativize(file);
132-
if (Files.isDirectory(file)) continue;
133-
if (Files.isSymbolicLink(file)) continue;
131+
var directoriesToIgnore = Set.of("scratch", "state", "action", "actions-done");
132+
Files.walkFileTree(jobDir, new FileVisitor<>() {
133+
/**
 * Decides whether to descend into a directory: a directory whose own name
 * is contained in directoriesToIgnore is skipped together with everything
 * beneath it; every other directory is walked normally.
 */
@Override
134+
public FileVisitResult preVisitDirectory(Path path,
135+
BasicFileAttributes basicFileAttributes) throws IOException {
136+
if (directoriesToIgnore.contains(path.getFileName().toString())) {
137+
return FileVisitResult.SKIP_SUBTREE;
138+
}
139+
return FileVisitResult.CONTINUE;
140+
}
141+
142+
/**
 * Classifies one visited file. Symbolic links and files whose name ends in
 * ".lck" are ignored. A file ending in ".warc.gz" is added to warcs; any
 * other file is added to artifacts as an Artifact built from its path
 * relative to jobDir and the file itself. The walk always continues.
 */
@Override
143+
public FileVisitResult visitFile(Path file, BasicFileAttributes basicFileAttributes) throws IOException {
144+
Path relpath = jobDir.relativize(file);
145+
if (Files.isSymbolicLink(file)) return FileVisitResult.CONTINUE;
134146

135-
String filename = relpath.getFileName().toString();
136-
if (filename.endsWith(".lck")) continue;
147+
String filename = relpath.getFileName().toString();
148+
if (filename.endsWith(".lck")) return FileVisitResult.CONTINUE;
137149

138-
String dirname = file.getParent().getFileName().toString();
139-
if (dirname.equals("scratch") || dirname.equals("state") || dirname.equals("action") || dirname.equals("actions-done")) {
140-
continue;
150+
log.debug("Artifact {}", filename);
151+
if (filename.endsWith(".warc.gz")) {
152+
warcs.add(file);
153+
} else {
154+
artifacts.add(new Artifact(relpath.toString(), file));
155+
}
156+
return FileVisitResult.CONTINUE;
141157
}
142158

143-
log.debug("Artifact {}", filename);
144-
if (filename.endsWith(".warc.gz")) {
145-
warcs.add(file);
146-
} else {
147-
artifacts.add(new Artifact(relpath.toString(), file));
159+
/**
 * Called when a file could not be visited. A NoSuchFileException is logged
 * as a warning and the walk continues; any other IOException is rethrown
 * to the caller of the walk.
 */
@Override
160+
public FileVisitResult visitFileFailed(Path path, IOException e) throws IOException {
161+
if (e instanceof NoSuchFileException) {
162+
log.warn("File not found while walking tree archiving {}", path);
163+
return FileVisitResult.CONTINUE;
164+
}
165+
throw e;
148166
}
149-
}
167+
168+
/**
 * Called after a directory's entries have been visited. The exception
 * argument is null when no error was reported for the directory, in which
 * case the walk simply continues. A NoSuchFileException is logged as a
 * warning and tolerated; any other IOException is rethrown.
 */
@Override
169+
public FileVisitResult postVisitDirectory(Path path, IOException e) throws IOException {
170+
if (e != null) {
171+
if (e instanceof NoSuchFileException) {
172+
log.warn("File not found while walking tree archiving {}", path);
173+
return FileVisitResult.CONTINUE;
174+
}
175+
throw e;
176+
}
177+
return FileVisitResult.CONTINUE;
178+
}
179+
});
150180

151181
repository.storeWarcs(instance, warcs);
152182
repository.storeArtifacts(instance, artifacts);

0 commit comments

Comments
 (0)