Skip to content
This repository has been archived by the owner on Jul 7, 2023. It is now read-only.

Commit

Permalink
Added tests and initial fix for opening uncompressed WARCs
Browse files Browse the repository at this point in the history
This initial unit test just tests that the first record from
compressed and uncompressed WARCs can be accessed.
  • Loading branch information
anjackson committed Mar 7, 2014
1 parent 17af411 commit 08b9442
Show file tree
Hide file tree
Showing 3 changed files with 3,197 additions and 3 deletions.
10 changes: 7 additions & 3 deletions src/main/java/org/archive/io/warc/WARCReaderFactory.java
Original file line number Diff line number Diff line change
Expand Up @@ -103,9 +103,13 @@ public static ArchiveReader get(final String s, final InputStream is,
/**
 * Creates the reader for a WARC stream, picking the compressed or the
 * uncompressed implementation based on the supplied name.
 *
 * @param f name (typically a file name or URL) identifying the stream; a
 *        name ending in ".gz" selects the compressed reader
 * @param is stream to read the WARC content from
 * @param atFirstRecord handed to the compressed reader only; the
 *        uncompressed reader is constructed without it
 * @return a CompressedWARCReader for ".gz" names (or when no name is
 *         given), otherwise an UncompressedWARCReader
 * @throws IOException declared for the reader constructors; presumably
 *         raised when the stream cannot be read -- confirm against them
 */
protected ArchiveReader getArchiveReader(final String f,
        final InputStream is, final boolean atFirstRecord)
        throws IOException {
    // TODO Currently relies on the file extension, but this should all
    // really sniff the content properly.
    // With no name there is nothing to inspect, so fall back to the
    // previous default of assuming a compressed stream instead of
    // failing with a NullPointerException on f.endsWith().
    if (f == null || f.endsWith(".gz")) {
        return new CompressedWARCReader(f, is, atFirstRecord);
    }
    return new UncompressedWARCReader(f, is);
}

public static WARCReader get(final URL arcUrl, final long offset)
Expand Down
34 changes: 34 additions & 0 deletions src/test/java/org/archive/io/warc/WARCReaderFactoryTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
package org.archive.io.warc;

import java.io.FileInputStream;
import java.io.IOException;

import org.archive.format.warc.WARCConstants;
import org.archive.format.warc.WARCConstants.WARCRecordType;
import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveRecord;

import junit.framework.TestCase;

/**
 * Checks that {@link WARCReaderFactory} can open both a compressed and an
 * uncompressed WARC file and return the first record of each.
 */
public class WARCReaderFactoryTest extends TestCase {

    // Test files: the same capture as a gzipped WARC and as a plain WARC.
    String[] files = new String[] {
        "src/test/resources/org/archive/format/gzip/IAH-urls-wget.warc.gz",
        "src/test/resources/org/archive/format/warc/IAH-urls-wget.warc"
    };

    /**
     * Opens every test file through
     * {@code WARCReaderFactory.get(String, InputStream, boolean)} and
     * verifies that the first record read is a 'warcinfo' record.
     *
     * @throws IOException if a test file cannot be opened or read
     */
    public void testGetStringInputstreamBoolean() throws IOException {
        // Check the test files can be opened:
        for (String file : files) {
            FileInputStream is = new FileInputStream(file);
            try {
                ArchiveReader ar = WARCReaderFactory.get(file, is, true);
                ArchiveRecord r = ar.get();
                // Fail with a readable message rather than an NPE below.
                assertNotNull("No first record read from " + file, r);
                String type = (String) r.getHeader()
                        .getHeaderValue(WARCConstants.HEADER_KEY_TYPE);
                // Check the first record comes out as a 'warcinfo' record.
                assertEquals("Unexpected first record type in " + file,
                        WARCRecordType.warcinfo.name(), type);
            } finally {
                // Release the file handle even when an assertion fails,
                // so one bad file does not leak descriptors.
                is.close();
            }
        }
    }

}
Loading

0 comments on commit 08b9442

Please sign in to comment.