|
10 | 10 | import pandas.gatherer.repository.Repository;
|
11 | 11 |
|
12 | 12 | import java.io.IOException;
|
13 |
| -import java.nio.file.Files; |
14 |
| -import java.nio.file.Path; |
| 13 | +import java.nio.file.*; |
| 14 | +import java.nio.file.attribute.BasicFileAttributes; |
15 | 15 | import java.util.ArrayList;
|
16 | 16 | import java.util.List;
|
| 17 | +import java.util.Set; |
17 | 18 |
|
18 | 19 | import static pandas.gatherer.heritrix.HeritrixClient.State.*;
|
19 | 20 |
|
@@ -127,26 +128,55 @@ public void archive(Instance instance) throws IOException {
|
127 | 128 | Path jobDir = jobDir(instance);
|
128 | 129 | List<Path> warcs = new ArrayList<>();
|
129 | 130 | List<Artifact> artifacts = new ArrayList<>();
|
130 |
| - for (Path file : Files.walk(jobDir).toList()) { |
131 |
| - Path relpath = jobDir.relativize(file); |
132 |
| - if (Files.isDirectory(file)) continue; |
133 |
| - if (Files.isSymbolicLink(file)) continue; |
| 131 | + var directoriesToIgnore = Set.of("scratch", "state", "action", "actions-done"); |
| 132 | + Files.walkFileTree(jobDir, new FileVisitor<>() { |
| 133 | + @Override |
| 134 | + public FileVisitResult preVisitDirectory(Path path, |
| 135 | + BasicFileAttributes basicFileAttributes) throws IOException { |
| 136 | + if (directoriesToIgnore.contains(path.getFileName().toString())) { |
| 137 | + return FileVisitResult.SKIP_SUBTREE; |
| 138 | + } |
| 139 | + return FileVisitResult.CONTINUE; |
| 140 | + } |
| 141 | + |
| 142 | + @Override |
| 143 | + public FileVisitResult visitFile(Path file, BasicFileAttributes basicFileAttributes) throws IOException { |
| 144 | + Path relpath = jobDir.relativize(file); |
| 145 | + if (Files.isSymbolicLink(file)) return FileVisitResult.CONTINUE; |
134 | 146 |
|
135 |
| - String filename = relpath.getFileName().toString(); |
136 |
| - if (filename.endsWith(".lck")) continue; |
| 147 | + String filename = relpath.getFileName().toString(); |
| 148 | + if (filename.endsWith(".lck")) return FileVisitResult.CONTINUE; |
137 | 149 |
|
138 |
| - String dirname = file.getParent().getFileName().toString(); |
139 |
| - if (dirname.equals("scratch") || dirname.equals("state") || dirname.equals("action") || dirname.equals("actions-done")) { |
140 |
| - continue; |
| 150 | + log.debug("Artifact {}", filename); |
| 151 | + if (filename.endsWith(".warc.gz")) { |
| 152 | + warcs.add(file); |
| 153 | + } else { |
| 154 | + artifacts.add(new Artifact(relpath.toString(), file)); |
| 155 | + } |
| 156 | + return FileVisitResult.CONTINUE; |
141 | 157 | }
|
142 | 158 |
|
143 |
| - log.debug("Artifact {}", filename); |
144 |
| - if (filename.endsWith(".warc.gz")) { |
145 |
| - warcs.add(file); |
146 |
| - } else { |
147 |
| - artifacts.add(new Artifact(relpath.toString(), file)); |
| 159 | + @Override |
| 160 | + public FileVisitResult visitFileFailed(Path path, IOException e) throws IOException { |
| 161 | + if (e instanceof NoSuchFileException) { |
| 162 | + log.warn("File not found while walking tree archiving {}", path); |
| 163 | + return FileVisitResult.CONTINUE; |
| 164 | + } |
| 165 | + throw e; |
148 | 166 | }
|
149 |
| - } |
| 167 | + |
| 168 | + @Override |
| 169 | + public FileVisitResult postVisitDirectory(Path path, IOException e) throws IOException { |
| 170 | + if (e != null) { |
| 171 | + if (e instanceof NoSuchFileException) { |
| 172 | + log.warn("File not found while walking tree archiving {}", path); |
| 173 | + return FileVisitResult.CONTINUE; |
| 174 | + } |
| 175 | + throw e; |
| 176 | + } |
| 177 | + return FileVisitResult.CONTINUE; |
| 178 | + } |
| 179 | + }); |
150 | 180 |
|
151 | 181 | repository.storeWarcs(instance, warcs);
|
152 | 182 | repository.storeArtifacts(instance, artifacts);
|
|
0 commit comments