Skip to content

Commit 8a91daa

Browse files
author
Josh Devins
committed
Final fixes for demo
1 parent 2baaac9 commit 8a91daa

File tree

5 files changed

+41
-10
lines changed

5 files changed

+41
-10
lines changed

README.md

+15
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,18 @@ Unit testing of Pig is currently done using PigUnit, a new xUnit style testing h
2121
* heap space is at a premium when running the Pig scripts, so max heap space JVM parameters need to be set (to something like `-Xmx1024m`)
2222
* running tests from Maven: configured in the Maven `pom` to run the Surefire plugin with a fixed max heap space setting
2323
* running tests from Eclipse: use the [JUnit Launch Fixer](http://code.google.com/p/junitlaunchfixer) plugin and set the max heap space for all JUnit executions automatically (note that if you have previous failed launches, you should delete them before running again)
24+
25+
Running
26+
---
27+
28+
Check out the source and build it with the Maven assembly plugin.
29+
30+
mvn assembly:assembly
31+
32+
The demo is done using the Cloudera [training VM v0.3.5](http://cloudera-vm.s3.amazonaws.com/cloudera-demo-0.3.5.tar.bz2?downloads) with CDH3b3. Here are the steps to run the demo. This assumes that you move/copy/mount the Git directory/checkout onto the VM.
33+
34+
hadoop fs -rmr access-log-throughput-mr; hadoop jar target/hadoop-getting-started-1-SNAPSHOT-jar-with-dependencies.jar net.joshdevins.talks.hadoopstart.mr.AccessLogThroughputDriver
35+
hadoop fs -cat access-log-throughput-mr/part-* | more
36+
37+
hadoop fs -rmr access-log-throughput; java -cp target/hadoop-getting-started-1-SNAPSHOT-jar-with-dependencies.jar:/etc/hadoop/conf org.apache.pig.Main -logfile target/pig.log src/main/pig/access-log-throughput.pig
38+
hadoop fs -cat access-log-throughput/part-* | more

src/main/java/net/joshdevins/talks/hadoopstart/mr/AccessLogThroughputDriver.java

+4-5
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,8 @@ public class AccessLogThroughputDriver extends Configured implements Tool {
2020
@Override
2121
public int run(final String[] args) throws Exception {
2222

23-
if (args.length != 2) {
24-
System.err.printf("Usage: %s [generic options] <input> <output>\n", AccessLogThroughputDriver.class
25-
.getSimpleName()); // NOPMD
23+
if (args.length != 0) {
24+
System.err.printf("Usage: %s [generic options]\n", AccessLogThroughputDriver.class.getSimpleName()); // NOPMD
2625
ToolRunner.printGenericCommandUsage(System.err);
2726
return -1;
2827
}
@@ -31,8 +30,8 @@ public int run(final String[] args) throws Exception {
3130
job.setJarByClass(AccessLogThroughputDriver.class);
3231

3332
// set the input and output paths
34-
FileInputFormat.setInputPaths(job, args[0]);
35-
FileOutputFormat.setOutputPath(job, new Path(args[1]));
33+
FileInputFormat.setInputPaths(job, "data/logs/*-access.log");
34+
FileOutputFormat.setOutputPath(job, new Path("access-log-throughput-mr"));
3635

3736
// submit job and wait for completion
3837
boolean success = job.waitForCompletion(true);

src/main/java/net/joshdevins/talks/hadoopstart/mr/ApacheCombinedAccessLogParser.java

+7-1
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,13 @@ public static ApacheCombinedAccessLogEntry parse(final String raw) {
3838
entry.setUri(matcher.group(6));
3939
entry.setProtocol(matcher.group(7));
4040
entry.setStatusCode(Integer.parseInt(matcher.group(8)));
41-
entry.setBytes(Long.parseLong(matcher.group(9)));
41+
42+
try {
43+
entry.setBytes(Long.parseLong(matcher.group(9)));
44+
} catch (NumberFormatException nfe) {
45+
// just ignore this and don't set the bytes
46+
}
47+
4248
entry.setReferrer(matcher.group(10));
4349
entry.setUserAgent(matcher.group(11));
4450

src/main/pig/access-log-throughput.pig

+2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
set job.name 'Apache Access Log Throughput'
22
set default_parallel 2
33

4+
REGISTER target/hadoop-getting-started-1-SNAPSHOT-jar-with-dependencies.jar
5+
46
-- PiggyBank UDFs
57
DEFINE ApacheCombinedLogLoader org.apache.pig.piggybank.storage.apachelog.CombinedLogLoader();
68
DEFINE SUBSTRING org.apache.pig.piggybank.evaluation.string.SUBSTRING();

src/test/java/net/joshdevins/talks/hadoopstart/mr/ApacheCombinedAccessLogParserTest.java

+13-4
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,18 @@ public void testParseValidLine() throws Exception {
3030
Assert.assertEquals(200, entry.getStatusCode());
3131
Assert.assertEquals(3190L, entry.getBytes());
3232
Assert.assertEquals("-", entry.getReferrer());
33-
Assert
34-
.assertEquals(
35-
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_4; en-us) AppleWebKit/525.18 (KHTML, like Gecko) Version/3.1.2 Safari/525.20.1",
36-
entry.getUserAgent());
33+
Assert.assertEquals(
34+
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_4; en-us) AppleWebKit/525.18 (KHTML, like Gecko) Version/3.1.2 Safari/525.20.1",
35+
entry.getUserAgent());
36+
}
37+
38+
@Test
39+
public void testParseValidLine_NoBytes() {
40+
41+
ApacheCombinedAccessLogEntry entry = ApacheCombinedAccessLogParser
42+
.parse("1.2.3.4 - - [30/Sep/2008:15:07:53 -0400] \"GET / HTTP/1.1\" 200 - \"-\" \"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_4; en-us) AppleWebKit/525.18 (KHTML, like Gecko) Version/3.1.2 Safari/525.20.1\"");
43+
44+
Assert.assertNotNull(entry);
45+
Assert.assertEquals(0L, entry.getBytes());
3746
}
3847
}

0 commit comments

Comments
 (0)