You are on page 1 of 11

Hadoop 12

2013-08-16

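Submitting a job involves the following steps:
1. Request a new job ID from the jobtracker.
2. Check the output specification of the MapReduce job.
3. Compute the input splits for the job.
4. Copy the resources needed to run the job into a directory named after the job ID on the jobtracker's filesystem.
5. Tell the jobtracker that the job is ready for execution.
The rest of this note traces how a job reaches the jobtracker, using the wordcount example that ships with Hadoop: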


hadoop jar hadoop-examples-1.0.4.jar wordcount /usr/input /usr/output

The relevant branch of the `$HADOOP_HOME/bin/hadoop` script:
elif [ "$COMMAND" = "jar" ] ; then CLASS=org.apache.hadoop.util.RunJar HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS" # run it exec "$JAVA" -Dproc_$COMMAND $JAVA_HEAP_MAX $HADOOP_OPTS -classpath "$CLASSPATH" $CLASS "$@"

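The script therefore starts the `main` method of `org.apache.hadoop.util.RunJar`; RunJar loads the jar `hadoop-examples-1.0.4.jar`, which ends up running `WordCount`. The source of `WordCount`: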


public class WordCount { public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable>{ private final static IntWritable one = new IntWritable(1); private Text word = new Text();
1

/** * Mapper map * void map(K1 key, V1 value, OutputCollector<K2,V2> output, Reporter reporter) * k/v k/v * 0 * OutputCollector Mapper Reducer <k,v> * OutputCollector collect(k, v):(k,v) output map value key key StringTokenizer write word write (,1) context */ public void map(Object key, Text value, Context context ) throws IOException, InterruptedException { StringTokenizer itr = new StringTokenizer(value.toString()); while (itr.hasMoreTokens()) { word.set(itr.nextToken()); context.write(word, one); } } } public static class IntSumReducer extends Reducer<Text,IntWritable,Text,IntWritable> { private IntWritable result = new IntWritable(); /** * Reducer reduce * void reduce(Text key, Iterable<IntWritable> values, Context context) * k/v map context,(combiner), context */ public void reduce(Text key, Iterable<IntWritable> values, Context context ) throws IOException, InterruptedException { int sum = 0; for (IntWritable val : values) { sum += val.get(); } result.set(sum); context.write(key, result); }
2

} public static void main(String[] args) throws Exception { //Configurationmap/reduce j hadoop map-reduce Configuration conf = new Configuration(); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); if (otherArgs.length != 2) { System.err.println("Usage: wordcount <in> <out>"); System.exit(2); } Job job = new Job(conf, "word count");// job job.setJarByClass(WordCount.class); job.setMapperClass(TokenizerMapper.class); // job Mapper job.setCombinerClass(IntSumReducer.class); // job Combiner job.setReducerClass(IntSumReducer.class); // job Reduce job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); // map-reduce OutputFormat /** * InputFormat map-reduce job * setInputPaths(): map-reduce job * setInputPath() map-reduce job */ FileInputFormat.addInputPath(job, new Path(otherArgs[0])); FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); System.exit(job.waitForCompletion(true) ? 0 : 1); // job job.runJob(conf); } }

`main` calls `waitForCompletion(true)` on `org.apache.hadoop.mapreduce.Job`:


public class Job extends JobContext { // JobContext public static enum JobState {DEFINE, RUNNING}; //job private JobState state = JobState.DEFINE; // DEFINE private JobClient jobClient; // /** * Submit the job to the cluster and wait for it to finish. * @param verbose print the progress to the user * @return true if the job succeeded * @throws IOException thrown if the communication with the * <code>JobTracker</code> is lost */
3

public boolean waitForCompletion(boolean verbose ) throws IOException, InterruptedException, ClassNotFoundException { if (state == JobState.DEFINE) { // job DEFINE submit();// } if (verbose) {// jobClient.monitorAndPrintJob(conf, info); // } else { info.waitForCompletion();// } return isSuccessful();// } }

`waitForCompletion(true)` submits the MapReduce job by calling `submit()`:
/** * Submit the job to the cluster and return immediately. * @throws IOException */ public void submit() throws IOException, InterruptedException, ClassNotFoundException { ensureState(JobState.DEFINE); // DEFINE setUseNewAPI();// // Connect to the JobTracker and submit the job connect();// JobTracker JobClient info = jobClient.submitJobInternal(conf); // super.setJobID(info.getID());// ID state = JobState.RUNNING; // Job RUNNING }

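`connect()` creates the `JobClient` object. `JobClient` holds a member `private JobSubmissionProtocol jobSubmitClient;`. Much as a DataNode talks to the NameNode, the `JobClient` talks to the `JobTracker` over RPC, and `jobSubmitClient` is the RPC proxy it uses for that. The call chain is: `connect()` --> `doAs()` on `ugi` (a `UserGroupInformation` member of `JobContext`) -->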
4

PrivilegedExceptionAction run() -->new JobClient((JobConf) getConfiguration())-->JobClient init(JobConf conf) -->this.jobSubmitClient = createRPCProxy(JobTracker.getAddress(conf), conf);--> (JobSubmissionProtocol) RPC.getProxy();
Job.connect()
  |-->UserGroupInformation.doAs()
        |-->PrivilegedExceptionAction<Object>().run()
              |-->JobClient.JobClient(JobConf conf)
                    |-->JobClient.setConf(Configuration conf)
                    |-->JobClient.init(JobConf conf)
                          |-->JobClient.createRPCProxy(InetSocketAddress addr, Configuration conf)
                                |-->(JobSubmissionProtocol) RPC.getProxy()

jobClient.submitJobInternal(conf);
/** * Internal method for submitting jobs to the system. * @param job the configuration to submit * @return a proxy object for the running job * @throws FileNotFoundException * @throws ClassNotFoundException * @throws InterruptedException * @throws IOException
5

*/ public RunningJob submitJobInternal(final JobConf job ) throws FileNotFoundException, ClassNotFoundException, InterruptedException, IOException { /* * configure the command line options correctly on the submitting dfs */ return ugi.doAs(new PrivilegedExceptionAction<RunningJob>() { public RunningJob run() throws FileNotFoundException, ClassNotFoundException, InterruptedException, IOException{ JobConf jobCopy = job; Path jobStagingArea = JobSubmissionFiles.getStagingDir(JobClient.this, jobCopy); // jobtracker ID JobID jobId = jobSubmitClient.getNewJobId(); Path submitJobDir = new Path(jobStagingArea, jobId.toString()); jobCopy.set("mapreduce.job.dir", submitJobDir.toString()); JobStatus status = null; try { populateTokenCache(jobCopy, jobCopy.getCredentials()); copyAndConfigureFiles(jobCopy, submitJobDir); // get delegation token for the dir TokenCache.obtainTokensForNamenodes(jobCopy.getCredentials(), new Path [] {submitJobDir}, jobCopy); Path submitJobFile = JobSubmissionFiles.getJobConfPath(submitJobDir); int reduces = jobCopy.getNumReduceTasks(); InetAddress ip = InetAddress.getLocalHost(); if (ip != null) { job.setJobSubmitHostAddress(ip.getHostAddress()); job.setJobSubmitHostName(ip.getHostName()); } JobContext context = new JobContext(jobCopy, jobId); // Check the output specification // // MapReduce
6

if (reduces == 0 ? jobCopy.getUseNewMapper() : jobCopy.getUseNewReducer()) { org.apache.hadoop.mapreduce.OutputFormat<?,?> output = ReflectionUtils.newInstance(context.getOutputFormatClass(), jobCopy); output.checkOutputSpecs(context); } else { jobCopy.getOutputFormat().checkOutputSpecs(fs, jobCopy); } jobCopy = (JobConf)context.getConfiguration(); // Create the splits for the job FileSystem fs = submitJobDir.getFileSystem(jobCopy); LOG.debug("Creating splits at " + fs.makeQualified(submitJobDir)); int maps = writeSplits(context, submitJobDir); jobCopy.setNumMapTasks(maps); // write "queue admins of the queue to which job is being submitted" // to job file. String queue = jobCopy.getQueueName(); AccessControlList acl = jobSubmitClient.getQueueAdmins(queue); jobCopy.set(QueueManager.toFullPropertyName(queue, QueueACL.ADMINISTER_JOBS.getAclName()), acl.getACLString()); // Write job file to JobTracker's fs // ID jobtracker FSDataOutputStream out = FileSystem.create(fs, submitJobFile, new FsPermission(JobSubmissionFiles.JOB_FILE_PERMISSION)); try { jobCopy.writeXml(out); } finally { out.close(); } // // Now, actually submit the job (using the submit name) // jobtracker printTokens(jobId, jobCopy.getCredentials()); status = jobSubmitClient.submitJob( jobId, submitJobDir.toString(), jobCopy.getCredentials()); JobProfile prof = jobSubmitClient.getJobProfile(jobId);
7

if (status != null && prof != null) { return new NetworkedJob(status, prof, jobSubmitClient); } else { throw new IOException("Could not launch job"); } } finally { if (status == null) { LOG.info("Cleaning up the staging area " + submitJobDir); if (fs != null && submitJobDir != null) fs.delete(submitJobDir, true); } } } }); }

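The new job ID is requested from the jobtracker through JobTracker's `getNewJobId()`. The output specification is checked through the OutputFormat's `checkOutputSpecs` method, using either `org.apache.hadoop.mapreduce.OutputFormat` (new API) or `org.apache.hadoop.mapred.OutputFormat` (old API).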

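The job file is written into the directory named after the job ID on the jobtracker's FileSystem, and the jobtracker is then told to run the job via `jobSubmitClient.submitJob(jobId, submitJobDir.toString(), jobCopy.getCredentials());`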


/** * JobTracker.submitJob() kicks off a new job. * * Create a 'JobInProgress' object, which contains both JobProfile * and JobStatus. Those two sub-objects are sometimes shipped outside * of the JobTracker. But JobInProgress adds info that's useful for * the JobTracker alone. */ public JobStatus submitJob(JobID jobId, String jobSubmitDir, Credentials ts) throws IOException {
9

JobInfo jobInfo = null; UserGroupInformation ugi = UserGroupInformation.getCurrentUser(); synchronized (this) { if (jobs.containsKey(jobId)) { // job already running, don't start twice return jobs.get(jobId).getStatus(); } jobInfo = new JobInfo(jobId, new Text(ugi.getShortUserName()), new Path(jobSubmitDir)); } // Create the JobInProgress, do not lock the JobTracker since // we are about to copy job.xml from HDFS JobInProgress job = null; try { job = new JobInProgress(this, this.conf, jobInfo, 0, ts); } catch (Exception e) { throw new IOException(e); } synchronized (this) { // check if queue is RUNNING String queue = job.getProfile().getQueueName(); if (!queueManager.isRunning(queue)) { throw new IOException("Queue \"" + queue + "\" is not running"); } try { aclsManager.checkAccess(job, ugi, Operation.SUBMIT_JOB); } catch (IOException ioe) { LOG.warn("Access denied for user " + job.getJobConf().getUser() + ". Ignoring job " + jobId, ioe); job.fail(); throw ioe; } // Check the job if it cannot run in the cluster because of invalid memory // requirements. try { checkMemoryRequirements(job); } catch (IOException ioe) { throw ioe; } boolean recovered = true; // TODO: Once the Job recovery code is there, // (MAPREDUCE-873) we
10

// must pass the "recovered" flag accurately. // This is handled in the trunk/0.22 if (!recovered) { // Store the information in a file so that the job can be recovered // later (if at all) Path jobDir = getSystemDirectoryForJob(jobId); FileSystem.mkdirs(fs, jobDir, new FsPermission(SYSTEM_DIR_PERMISSION)); FSDataOutputStream out = fs.create(getSystemFileForJob(jobId)); jobInfo.write(out); out.close(); } // Submit the job JobStatus status; try { status = addJob(jobId, job); } catch (IOException ioe) { LOG.info("Job " + jobId + " submission failed!", ioe); status = job.getStatus(); status.setFailureInfo(StringUtils.stringifyException(ioe)); failJob(job); throw ioe; } return status; } }

jobtracker

11