Commit

Add files via upload
UdeshikaDissa authored Jun 26, 2020
1 parent 4c4e530 commit e6439b5
Showing 25 changed files with 934 additions and 0 deletions.
43 changes: 43 additions & 0 deletions Java Source Code/dependency-reduced-pom.xml
@@ -0,0 +1,43 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>edu.rmit.cosc2637.s3400652</groupId>
  <artifactId>Assignment</artifactId>
  <name>Assignment</name>
  <version>0.0.1-SNAPSHOT</version>
  <url>http://maven.apache.org</url>
  <build>
    <plugins>
      <plugin>
        <artifactId>maven-shade-plugin</artifactId>
        <version>3.2.1</version>
        <executions>
          <execution>
            <phase>package</phase>
            <goals>
              <goal>shade</goal>
            </goals>
            <configuration>
              <transformers>
                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                  <mainClass>edu.rmit.cosc2637.s3400652.Assignment.NYCDriver</mainClass>
                </transformer>
              </transformers>
            </configuration>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>3.8.1</version>
      <scope>test</scope>
    </dependency>
  </dependencies>
  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>
</project>
23 changes: 23 additions & 0 deletions Java Source Code/note.txt
@@ -0,0 +1,23 @@
hadoop fs -copyToLocal /user/s3400652/Assignment-0.0.1-SNAPSHOT.jar /home/hadoop/
hadoop jar Assignment-0.0.1-SNAPSHOT.jar /user/s3400652/green_tripdata_2019-01_sample.csv /user/s3400652/outputNYC3 -Dmapred.map.tasks=2 -Dmapred.reduce.tasks=4
hadoop jar Assignment-0.0.1-SNAPSHOT.jar /user/s3400652/fhv_tripdata_2015-01.csv /user/s3400652/outputNYC55

hadoop jar Assignment-0.0.1-SNAPSHOT.jar arn:aws:s3:::nyc-tlc/trip+data/yellow_tripdata_2019-01.csv /user/s3400652/outputNYC5


arn:aws:s3:::nyc-tlc/trip+data/yellow_tripdata_2019-01.csv

hadoop fs -cat /user/s3400652/outputNYC/part-r-00000 | sort -n -k2 -r | head -n3
hadoop fs -cat /user/s3400652/outputNYC/part-r-0000* | sort -n -k2 -r | head -n5 //concatenate output

+++++++++++++++++++++++++++++

transfer dataset to HDFS from AWS S3 bucket
hadoop distcp s3a://nyc-tlc/"trip data"/yellow_tripdata_2018-12.csv /user/s3400652/

copy jar file to master node
hadoop fs -copyToLocal /user/s3400652/Assignment-0.0.1-SNAPSHOT.jar /home/hadoop/

Deploy jar file
hadoop jar Assignment-0.0.1-SNAPSHOT.jar edu.rmit.cosc2637.s3400652.Assignment.NYCDriver
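
The notes above start from a jar that already sits in HDFS. A minimal sketch of the preceding build-and-upload step, assuming the jar is built with Maven on a machine that can reach the cluster (these two commands are not part of the committed notes; the HDFS path follows the ones used above):

mvn clean package
hadoop fs -copyFromLocal target/Assignment-0.0.1-SNAPSHOT.jar /user/s3400652/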

68 changes: 68 additions & 0 deletions Java Source Code/pom.xml
@@ -0,0 +1,68 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>edu.rmit.cosc2637.s3400652</groupId>
  <artifactId>Assignment</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>

  <name>Assignment</name>
  <url>http://maven.apache.org</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>

  <dependencies>

    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-core -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-client-core</artifactId>
      <version>3.2.0</version>
    </dependency>

    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <version>3.2.0</version>
    </dependency>

    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>3.8.1</version>
      <scope>test</scope>
    </dependency>
  </dependencies>

  <build>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-shade-plugin</artifactId>
        <version>3.2.1</version>
        <executions>
          <execution>
            <phase>package</phase>
            <goals>
              <goal>shade</goal>
            </goals>
            <configuration>
              <transformers>
                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                  <mainClass>edu.rmit.cosc2637.s3400652.Assignment.NYCDriver</mainClass>
                </transformer>
              </transformers>
            </configuration>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>

</project>
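
Because the shade plugin is bound to the package phase with a ManifestResourceTransformer, mvn package produces a single runnable jar whose manifest names NYCDriver as the main class. One way to confirm this after a build (a sketch assuming a Unix shell with unzip available; not part of the committed files):

unzip -p target/Assignment-0.0.1-SNAPSHOT.jar META-INF/MANIFEST.MF | grep Main-Class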
@@ -0,0 +1,73 @@
/*
 * --------------------------------------------------------------------
 * Developer Name : Udeshika Dissanayake
 * Subject        : COSC2637 Big Data Processing
 * Assignment     : Assignment 1 - Semester 2, 2019
 * Student Number : s3400652
 * Date           : 12/10/2019
 * --------------------------------------------------------------------
 */

package edu.rmit.cosc2637.s3400652.Assignment;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class NYCDriver
{
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException
    {
        /* // for hardcoding the data and output paths instead of taking them as arguments
        Path dataPath = new Path("/user/s3400652/green_tripdata_2019-01_sample.csv");
        Path outputDir = new Path("/user/s3400652/OutputNYC");
        */

        // Define the configuration for the MapReduce driver
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "NYC Taxi Analysis");

        job.setJarByClass(NYCDriver.class);

        // ***** Selection of Mapper class [keep exactly one of the two lines below active] *****
        //job.setMapperClass(NYCMapper.class);   // standard Mapper
        job.setMapperClass(NYCMapper_IMC.class); // In-Mapper Combiner
        // **************************************************************************************

        // ***** Selection of Combiner class [uncomment only if the standard combiner is used] *****
        //job.setCombinerClass(NYCReducer.class);
        // **************************************************************************************

        // ***** Selection of Reducer class *****
        job.setReducerClass(NYCReducer.class);
        // **************************************

        job.setOutputKeyClass(Text.class);
        //job.setOutputValueClass(IntWritable.class);   // for the PULocation counter
        job.setOutputValueClass(FloatWritable.class);   // for the total fare
        job.setMapOutputKeyClass(Text.class);
        //job.setMapOutputValueClass(IntWritable.class); // for the PULocation counter
        job.setMapOutputValueClass(FloatWritable.class); // for the total fare

        // Input and output paths are taken from the command-line arguments
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
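
The driver registers NYCReducer, which is part of this commit but not shown in this section. A minimal sketch of such a reducer, assuming it simply sums the per-location fare values emitted by the mapper (class name and package come from the driver; the body is an assumption, not the committed code):

package edu.rmit.cosc2637.s3400652.Assignment;

import java.io.IOException;

import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Hypothetical sketch: sums every fare value emitted for one pick-up location key.
public class NYCReducer extends Reducer<Text, FloatWritable, Text, FloatWritable>
{
    private FloatWritable result = new FloatWritable();

    @Override
    public void reduce(Text key, Iterable<FloatWritable> values, Context context)
            throws IOException, InterruptedException
    {
        float sum = 0;
        for (FloatWritable val : values)  // add up all fares for this PULoc_* key
        {
            sum += val.get();
        }
        result.set(sum);
        context.write(key, result);       // e.g. (PULoc_264, total fare for that location)
    }
}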
@@ -0,0 +1,84 @@
/*
 * --------------------------------------------------------------------
 * Developer Name : Udeshika Dissanayake
 * Subject        : COSC2637 Big Data Processing
 * Assignment     : Assignment 1 - Semester 2, 2019
 * Student Number : s3400652
 * Date           : 12/10/2019
 * --------------------------------------------------------------------
 */

package edu.rmit.cosc2637.s3400652.Assignment;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Mapper class

// ------ Data pattern of Green Taxi data ------
// 2,21/12/2018 15:17,21/12/2018 15:18,N,1,264,264,5,0,3,0.5,0.5,0,0,,0.3,4.3,2,1,
// 6th column = pick-up location; 7th column = drop-off location
// 17th column = total fare; 8th column = number of passengers

// ------ Data pattern of Yellow Taxi data ------
// 1 01-02-19 0:59 01-02-19 1:07 1 2.1 1 N 48 234 1 9 0.5 0.5 2 0 0.3 12.3 0
// 8th column = pick-up location; 9th column = drop-off location
// 17th column = total fare; 4th column = number of passengers

//public class NYCMapper extends Mapper<LongWritable, Text, Text, IntWritable>  // for counting pick-up locations
public class NYCMapper extends Mapper<LongWritable, Text, Text, FloatWritable>  // for total fare
{
    private final static IntWritable one = new IntWritable(1);
    private FloatWritable Total_fare = new FloatWritable();

    // ---- mapper to count the number of trips per pick-up location ----
    // Out - (PULoc_264, 1)
    /*
    @Override
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException
    {
        String line = value.toString();
        String[] fields = line.split(",");
        if (fields.length > 6)
        {
            Text PULocation = new Text("PULoc_" + fields[5]);
            context.write(PULocation, one);
        }
    }
    */

    // ---- mapper to get the total fare for each pick-up location ----
    // Out - (PULoc_264, 4.3)
    @Override
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException
    {
        String line = value.toString();
        String[] fields = line.split(",");

        if (fields.length > 16)
        {
            Text PULocation = new Text("PULoc_" + fields[7]);
            if (fields[16].matches("\\d+(\\.\\d+)?"))  // check whether the fare field is numeric
            {
                float f = Float.parseFloat(fields[16]);
                Total_fare.set(f);
                context.write(PULocation, Total_fare); // emit only when the fare is valid
            }
        }
    }
}
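
The mapper the driver actually activates is NYCMapper_IMC, the in-mapper-combiner variant, which is also not shown in this section. A minimal sketch of the usual in-mapper-combining pattern under the same field layout as NYCMapper above (the class body is an assumption, not the committed code): partial fare sums are kept in a HashMap during map() and flushed once in cleanup(), which reduces the number of intermediate records shuffled to the reducers.

package edu.rmit.cosc2637.s3400652.Assignment;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical sketch of the in-mapper-combiner variant selected by the driver.
public class NYCMapper_IMC extends Mapper<LongWritable, Text, Text, FloatWritable>
{
    private final Map<String, Float> fareByLocation = new HashMap<String, Float>();

    @Override
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException
    {
        String[] fields = value.toString().split(",");
        if (fields.length > 16 && fields[16].matches("\\d+(\\.\\d+)?"))
        {
            String location = "PULoc_" + fields[7];
            float fare = Float.parseFloat(fields[16]);
            // Aggregate in memory instead of emitting one record per input line
            Float running = fareByLocation.get(location);
            fareByLocation.put(location, running == null ? fare : running + fare);
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException
    {
        // Emit one partial sum per pick-up location seen by this map task
        for (Map.Entry<String, Float> entry : fareByLocation.entrySet())
        {
            context.write(new Text(entry.getKey()), new FloatWritable(entry.getValue()));
        }
    }
}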

