3.0.1. Components/steps/tasks:
1. Map output key
The key will be empNo, as it is the join key for the employee and salary datasets
[Implementation: in the mapper]
2. Tagging the data with the dataset identity
Add an attribute called srcIndex to tag the identity of the data (1=employee, 2=salary, 3=salary history)
[Implementation: in the mapper]
3. Discarding unwanted attributes
[Implementation: in the mapper]
4. Composite key
Make the map output key a composite of empNo and srcIndex (a sketch of such a writable follows this list)
[Implementation: create custom writable]
5. Partitioner
Partition the data on the natural key, empNo (see the sketch after the mapper listing)
[Implementation: create custom partitioner class]
6. Sorting
Sort the data on empNo first, and then on source index (see the sketch after the mapper listing)
[Implementation: create custom sorting comparator class]
7. Grouping
Group the data based on the natural key
[Implementation: create custom grouping comparator class]
8. Joining
Iterate through the values for a key, complete the join of the employee and salary data, and perform a lookup
of department to include the department name in the output
[Implementation: in the reducer]
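The custom writable from step 4 is not reproduced in this excerpt, but the mapper, comparators, and driver below all rely on it. A minimal sketch of what CompositeKeyWritableRSJ might look like, assuming a String join key, an int source index, and the accessor names (getjoinKey/setjoinKey, getsourceIndex/setsourceIndex) used by the other classes:

// Sketch only - assumes the accessor names used by the classes in this post.
package khanolkar.mapreduce.join.samples.reducesidejoin;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class CompositeKeyWritableRSJ implements
        WritableComparable<CompositeKeyWritableRSJ> {

    private String joinKey;  // natural key - empNo
    private int sourceIndex; // 1=employee, 2=salary, 3=salary history

    public CompositeKeyWritableRSJ() {
    }

    public String getjoinKey() {
        return joinKey;
    }

    public void setjoinKey(String joinKey) {
        this.joinKey = joinKey;
    }

    public int getsourceIndex() {
        return sourceIndex;
    }

    public void setsourceIndex(int sourceIndex) {
        this.sourceIndex = sourceIndex;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(joinKey);
        out.writeInt(sourceIndex);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        joinKey = in.readUTF();
        sourceIndex = in.readInt();
    }

    @Override
    public int compareTo(CompositeKeyWritableRSJ other) {
        // Natural key first, then source index (secondary sort)
        int result = joinKey.compareTo(other.joinKey);
        if (result == 0) {
            result = Integer.compare(sourceIndex, other.sourceIndex);
        }
        return result;
    }
}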
3.0.2a. Data pipeline for cardinality of 1..1 between employee and salary data:
3.0.2b. Data pipeline for cardinality of 1..many between employee and salary data:
//********************************************************************************
//Class:   MapperRSJ
//Purpose: Mapper
//Author:  Anagha Khanolkar
//********************************************************************************

package khanolkar.mapreduce.join.samples.reducesidejoin;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class MapperRSJ extends
        Mapper<LongWritable, Text, CompositeKeyWritableRSJ, Text> {

    CompositeKeyWritableRSJ ckwKey = new CompositeKeyWritableRSJ();
    Text txtValue = new Text("");
    int intSrcIndex = 0;
    StringBuilder strMapValueBuilder = new StringBuilder("");
    List<Integer> lstRequiredAttribList = new ArrayList<Integer>();

    @Override
    protected void setup(Context context) throws IOException,
            InterruptedException {
        // {{
        // Get the source index; (employee = 1, salary = 2)
        // Added as configuration in driver
        FileSplit fsFileSplit = (FileSplit) context.getInputSplit();
        intSrcIndex = Integer.parseInt(context.getConfiguration().get(
                fsFileSplit.getPath().getName()));
        // }}

        // {{
        // Initialize the list of fields to emit as output based on
        // intSrcIndex (1=employee, 2=current salary, 3=historical salary)
        if (intSrcIndex == 1) // employee
        {
            lstRequiredAttribList.add(2); // FName
            lstRequiredAttribList.add(3); // LName
            lstRequiredAttribList.add(4); // Gender
            lstRequiredAttribList.add(6); // DeptNo
        } else // salary
        {
            lstRequiredAttribList.add(1); // Salary
            lstRequiredAttribList.add(3); // Effective-to-date (value of
                                          // 9999-01-01 indicates current
                                          // salary)
        }
        // }}
    }

    private String buildMapValue(String arrEntityAttributesList[]) {
        // This method returns a csv list of values to emit based on the data entity

        strMapValueBuilder.setLength(0); // Initialize

        // Build list of attributes to output based on source - employee/salary
        for (int i = 1; i < arrEntityAttributesList.length; i++) {
            // If the field is in the list of required output,
            // append to stringbuilder
            if (lstRequiredAttribList.contains(i)) {
                strMapValueBuilder.append(arrEntityAttributesList[i]).append(
                        ",");
            }
        }
        if (strMapValueBuilder.length() > 0) {
            // Drop last comma
            strMapValueBuilder.setLength(strMapValueBuilder.length() - 1);
        }

        return strMapValueBuilder.toString();
    }

    @Override
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {

        if (value.toString().length() > 0) {
            String arrEntityAttributes[] = value.toString().split(",");

            ckwKey.setjoinKey(arrEntityAttributes[0].toString());
            ckwKey.setsourceIndex(intSrcIndex);
            txtValue.set(buildMapValue(arrEntityAttributes));

            context.write(ckwKey, txtValue);
        }
    }
}
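The driver further down also references PartitionerRSJ and SortingComparatorRSJ, which are not reproduced in this excerpt. Minimal sketches of what they might look like, assuming the CompositeKeyWritableRSJ accessors used above (partition on the natural key empNo; sort on empNo first, then on source index):

// Sketch only - partitions on the natural key (empNo) so all records for an
// employee reach the same reducer, regardless of source index.
package khanolkar.mapreduce.join.samples.reducesidejoin;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class PartitionerRSJ extends Partitioner<CompositeKeyWritableRSJ, Text> {

    @Override
    public int getPartition(CompositeKeyWritableRSJ key, Text value,
            int numReduceTasks) {
        return (key.getjoinKey().hashCode() & Integer.MAX_VALUE)
                % numReduceTasks;
    }
}

// Sketch only - orders records by empNo, and within an empNo by source index,
// so employee data reaches the reducer before the salary data for that key.
package khanolkar.mapreduce.join.samples.reducesidejoin;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class SortingComparatorRSJ extends WritableComparator {

    protected SortingComparatorRSJ() {
        super(CompositeKeyWritableRSJ.class, true);
    }

    @Override
    public int compare(WritableComparable w1, WritableComparable w2) {
        CompositeKeyWritableRSJ key1 = (CompositeKeyWritableRSJ) w1;
        CompositeKeyWritableRSJ key2 = (CompositeKeyWritableRSJ) w2;

        int result = key1.getjoinKey().compareTo(key2.getjoinKey());
        if (result == 0) {
            result = Integer.compare(key1.getsourceIndex(),
                    key2.getsourceIndex());
        }
        return result;
    }
}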
//********************************************************************************
//Class:   GroupingComparatorRSJ
//Purpose: For use as grouping comparator
//Author:  Anagha Khanolkar
//********************************************************************************

package khanolkar.mapreduce.join.samples.reducesidejoin;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class GroupingComparatorRSJ extends WritableComparator {

    protected GroupingComparatorRSJ() {
        super(CompositeKeyWritableRSJ.class, true);
    }

    @Override
    public int compare(WritableComparable w1, WritableComparable w2) {
        // The grouping comparator is the joinKey (Employee ID)
        CompositeKeyWritableRSJ key1 = (CompositeKeyWritableRSJ) w1;
        CompositeKeyWritableRSJ key2 = (CompositeKeyWritableRSJ) w2;
        return key1.getjoinKey().compareTo(key2.getjoinKey());
    }
}
public class ReducerRSJ extends
        Reducer<CompositeKeyWritableRSJ, Text, NullWritable, Text> {

    StringBuilder reduceValueBuilder = new StringBuilder("");
    NullWritable nullWritableKey = NullWritable.get();
    Text reduceOutputValue = new Text("");
    String strSeparator = ",";
    private MapFile.Reader deptMapReader = null;
    Text txtMapFileLookupKey = new Text("");
    Text txtMapFileLookupValue = new Text("");

    @Override
    protected void setup(Context context) throws IOException,
            InterruptedException {
        // {{
        // Get the side data (the departments lookup MapFile) from the
        // distributed cache
        Path[] cacheFilesLocal = DistributedCache.getLocalCacheArchives(context
                .getConfiguration());
        for (Path eachPath : cacheFilesLocal) {
            if (eachPath.getName().toString().trim()
                    .equals("departments_map.tar.gz")) {
                URI uriUncompressedFile = new File(eachPath.toString()
                        + "/departments_map").toURI();
                initializeDepartmentsMap(uriUncompressedFile, context);
            }
        }
        // }}
    }

    @SuppressWarnings("deprecation")
    private void initializeDepartmentsMap(URI uriUncompressedFile,
            Context context) throws IOException {
        // {{
        // Initialize the reader of the map file (side data)
        FileSystem dfs = FileSystem.get(context.getConfiguration());
        try {
            deptMapReader = new MapFile.Reader(dfs,
                    uriUncompressedFile.toString(), context.getConfiguration());
        } catch (Exception e) {
            e.printStackTrace();
        }
        // }}
    }
    private StringBuilder buildOutputValue(CompositeKeyWritableRSJ key,
            StringBuilder reduceValueBuilder, Text value) {

        if (key.getsourceIndex() == 1) {
            // Employee data

            // {{
            // Get the department name from the MapFile in distributedCache
            // Insert the joinKey (empNo) at the beginning of the stringBuilder
            reduceValueBuilder.append(key.getjoinKey()).append(strSeparator);
            String arrEmpAttributes[] = value.toString().split(",");
            txtMapFileLookupKey.set(arrEmpAttributes[3].toString());
            try {
                deptMapReader.get(txtMapFileLookupKey, txtMapFileLookupValue);
            } catch (Exception e) {
                txtMapFileLookupValue.set("");
            } finally {
                // Flag departments missing from the lookup explicitly
                txtMapFileLookupValue
                        .set(txtMapFileLookupValue.toString().isEmpty() ? "NOT-FOUND"
                                : txtMapFileLookupValue.toString());
            }
            // }}

            // {{
            // Append the department name to the map values to form a
            // complete CSV of employee attributes
            reduceValueBuilder.append(value.toString()).append(strSeparator)
                    .append(txtMapFileLookupValue.toString())
                    .append(strSeparator);
            // }}
        } else if (key.getsourceIndex() == 2) {
            // Current salary data (1..1 on join key)
            // Just append the salary, drop the effective-to-date
            String arrSalAttributes[] = value.toString().split(",");
            reduceValueBuilder.append(arrSalAttributes[0].toString()).append(
                    strSeparator);
        } else // key.getsourceIndex() == 3; Historical salary data
        {
            // {{
            // Get the salary data but extract only the current salary
            // (to_date='9999-01-01')
            String arrSalAttributes[] = value.toString().split(",");
            if (arrSalAttributes[1].toString().equals("9999-01-01")) {
                // Salary data; Just append
                reduceValueBuilder.append(arrSalAttributes[0].toString())
                        .append(strSeparator);
            }
            // }}
        }

        // {{
        // Reset
        txtMapFileLookupKey.set("");
        txtMapFileLookupValue.set("");
        // }}

        return reduceValueBuilder;
    }

    @Override
    public void reduce(CompositeKeyWritableRSJ key, Iterable<Text> values,
            Context context) throws IOException, InterruptedException {

        // Iterate through values; the first set is the csv of employee data,
        // the second set is salary data; the data is already ordered
        // by virtue of the secondary sort; append each value
        for (Text value : values) {
            buildOutputValue(key, reduceValueBuilder, value);
        }

        // Drop last comma, set value, and emit output
        if (reduceValueBuilder.length() > 1) {

            reduceValueBuilder.setLength(reduceValueBuilder.length() - 1);

            // Emit output
            reduceOutputValue.set(reduceValueBuilder.toString());
            context.write(nullWritableKey, reduceOutputValue);

        } else {
            System.out.println("Key=" + key.getjoinKey() + "src="
                    + key.getsourceIndex());
        }

        // Reset variables
        reduceValueBuilder.setLength(0);
        reduceOutputValue.set("");
    }

    @Override
    protected void cleanup(Context context) throws IOException,
            InterruptedException {
        deptMapReader.close();
    }
}
        job.setMapperClass(MapperRSJ.class);
        job.setMapOutputKeyClass(CompositeKeyWritableRSJ.class);
        job.setMapOutputValueClass(Text.class);

        job.setPartitionerClass(PartitionerRSJ.class);
        job.setSortComparatorClass(SortingComparatorRSJ.class);
        job.setGroupingComparatorClass(GroupingComparatorRSJ.class);

        job.setNumReduceTasks(4);
        job.setReducerClass(ReducerRSJ.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);
        // }}

        boolean success = job.waitForCompletion(true);
        return success ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new Configuration(), new DriverRSJ(),
                args);
        System.exit(exitCode);
    }
}
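The driver excerpt above omits the earlier part of run(), where the inputs and side data are registered. Because MapperRSJ.setup() reads the source index from a configuration entry keyed by the input file's name, the driver has to set one such entry per input file. A minimal sketch, with hypothetical file names, of what that wiring might look like inside run() before the job setup calls shown above:

        // Sketch only - the file names below are hypothetical; the real driver
        // keys each configuration entry by the actual input file name.
        Configuration conf = getConf();
        conf.set("employees_active.csv", "1"); // employee data
        conf.set("salaries_active.csv", "2");  // current salary data (1..1 case)
        conf.set("salaries_history.csv", "3"); // historical salary data (1..many case)

        // Ship the departments MapFile archive for the reducer-side lookup
        // (archive path is an assumption)
        DistributedCache.addCacheArchive(
                new URI("joinProject/data/departments_map.tar.gz"), conf);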
Pig script (12-PigScript): not reproduced here.
Output:
**********************
Output of pig script
**********************
$ hadoop fs -cat joinProject/output/pig-RSJ/part* | less
10001   Facello   Georgi     M   d005   88958
10002   Simmel    Bezalel    F   d007   72527
10003   Bamford   Parto      M   d004   43311
10004   Koblick   Chirstian  M   d004   74057
.........