File tree Expand file tree Collapse file tree 3 files changed +16
-8
lines changed
main/java/com/github/bottomlessarchive/warc/service
test/java/com/github/bottomlessarchive/warc/test Expand file tree Collapse file tree 3 files changed +16
-8
lines changed Original file line number Diff line number Diff line change @@ -24,6 +24,16 @@ public static <T extends WarcContentBlock> Stream<WarcRecord<T>> streamOf(@NotNu
24
24
return WarcRecordStreamFactory .streamOf (url , EVERY_WARC_RECORD_TYPE );
25
25
}
26
26
27
+ public static <T extends WarcContentBlock > Stream <WarcRecord <T >> streamOf (@ NotNull @ NonNull final URL url ,
28
+ @ NotNull @ NonNull final WarcRecordType ... requiredRecordTypes ) {
29
+ try {
30
+ return streamOf (new AvailableInputStream (new BufferedInputStream (url .openStream ())),
31
+ WarcReader .DEFAULT_CHARSET , true , List .of (requiredRecordTypes ));
32
+ } catch (IOException e ) {
33
+ throw new WarcNetworkException ("Unable to open WARC location: " + url + "!" , e );
34
+ }
35
+ }
36
+
27
37
public static <T extends WarcContentBlock > Stream <WarcRecord <T >> streamOf (@ NotNull @ NonNull final URL url ,
28
38
@ NotNull @ NonNull final List <WarcRecordType > requiredRecordTypes ) {
29
39
try {
Original file line number Diff line number Diff line change 6
6
import com .github .bottomlessarchive .warc .service .content .response .domain .ResponseContentBlock ;
7
7
import com .github .bottomlessarchive .warc .service .record .domain .WarcRecord ;
8
8
9
- import java .io .File ;
10
9
import java .io .FileInputStream ;
11
10
import java .util .Optional ;
12
11
13
12
public class TestFileWarcReader {
14
13
15
14
public static void main (final String ... arg ) throws Exception {
16
15
final WarcReader warcReader = new WarcReader (new FileInputStream (
17
- new File ( "C:\\ warc-test\\ CC-MAIN-20180716232549-20180717012549-00001.warc.gz" ) ));
16
+ "C:\\ warc-test\\ CC-MAIN-20180716232549-20180717012549-00001.warc.gz" ));
18
17
19
18
boolean hasNext = true ;
20
19
while (hasNext ) {
@@ -23,8 +22,7 @@ public static void main(final String... arg) throws Exception {
23
22
24
23
optionalWarcRecord
25
24
.filter (WarcRecord ::isResponse )
26
- .map (warcRecord -> ((ResponseContentBlock ) warcRecord .getWarcContentBlock ())
27
- .getPayloadAsString ())
25
+ .map (warcRecord -> ((ResponseContentBlock ) warcRecord .getContentBlock ()).getPayloadAsString ())
28
26
.ifPresent (System .out ::println );
29
27
30
28
hasNext = optionalWarcRecord .isPresent ();
Original file line number Diff line number Diff line change 2
2
3
3
import com .github .bottomlessarchive .warc .service .WarcRecordStreamFactory ;
4
4
import com .github .bottomlessarchive .warc .service .content .response .domain .ResponseContentBlock ;
5
- import com .github .bottomlessarchive .warc .service .record .domain .WarcRecord ;
5
+ import com .github .bottomlessarchive .warc .service .record .domain .WarcRecordType ;
6
+
6
7
import java .net .URL ;
7
8
8
9
public class TestUrlWarcReader {
@@ -11,9 +12,8 @@ public static void main(final String... arg) throws Exception {
11
12
final URL warcUrl = new URL (
12
13
"https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2018-43/segments/1539583508988.18/warc/CC-MAIN-20181015080248-20181015101748-00000.warc.gz" );
13
14
14
- WarcRecordStreamFactory .streamOf (warcUrl )
15
- .filter (WarcRecord ::isResponse )
16
- .map (entry -> ((ResponseContentBlock ) entry .getWarcContentBlock ()).getPayloadAsString ())
15
+ WarcRecordStreamFactory .streamOf (warcUrl , WarcRecordType .RESPONSE )
16
+ .map (entry -> ((ResponseContentBlock ) entry .getContentBlock ()).getPayloadAsString ())
17
17
.forEach (System .out ::println );
18
18
}
19
19
}
You can’t perform that action at this time.
0 commit comments