You are on page 1 of 33

Hadoop 5 HDFS

2013-07-30

DataNode DataXceiverServer DataXceiver DataNode / RPC RPC DataNode DataXceiverServer DataXceiver DataXceiver BlockSender BlockReceiver

DataXceiverServer DataXceiver socket DataXceiverServer run DataXceiverServer socket DataXceiver socket DataXceiver
1

DataXceiver DataXceiver
OP_WRITE_BLOCK (80) OP_READ_BLOCK (81) OP_READ_METADATA (82) OP_REPLACE_BLOCK (83) OP_COPY_BLOCK (84) OP_BLOCK_CHECKSUM (85)

DataXceiver $HADOOP_HOME/bin/hadoop fs -put <localsrc> <dst>

$HADOOP_HOME/bin/hadoop fs -copyFromLocal <localsrc> <dst> (OP_WRITE_BLOCK (80) ) namenode Hadoop append namenode namenode block hdfs datanode namenode IOUtils.copyBytes() client packet namenode datanodes blocks namenode datanodes blocks client datanode 3 datanode datanode datanode datanode datanode ACK client

DistributedFileSystem create() DistributedFileSystem namenode RPC namenode DistributedFileSystem datanode namenode FSDataOutputStream DFSOutputStream (data queue) DataStreamer datanode namenode datanode (pipeline) DataStreamer 1 datanode DFSOutputStream (ack queue) datanode datanode close()
3

datanode namenode Namenode hadoop client


org.apache.hadoop.fs. FsShell: public int run(String argv[]) throws Exception { if ("-put".equals(cmd) || "-copyFromLocal".equals(cmd)) { Path[] srcs = new Path[argv.length-2]; for (int j=0 ; i < argv.length-1 ;) srcs[j++] = new Path(argv[i++]); copyFromLocal(srcs, argv[i++]); } } org.apache.hadoop.fs. FsShell: void copyFromLocal(Path[] srcs, String dstf) throws IOException { Path dstPath = new Path(dstf); FileSystem dstFs = dstPath.getFileSystem(getConf()); if (srcs.length == 1 && srcs[0].toString().equals("-")) copyFromStdin(dstPath, dstFs); else dstFs.copyFromLocalFile(false, false, srcs, dstPath); } org.apache.hadoop.fs. FileSystem: public void copyFromLocalFile(boolean delSrc, boolean overwrite, Path[] srcs, Path dst) throws IOException { Configuration conf = getConf(); FileUtil.copy(getLocal(conf), srcs, this, dst, delSrc, overwrite, conf); } org.apache.hadoop.fs. FileUtil: public static boolean copy(FileSystem srcFS, Path[] srcs, FileSystem dstFS, Path dst, boolean deleteSource, boolean overwrite, Configuration conf)
4

throws IOException { if (srcs.length == 1) return copy(srcFS, srcs[0], dstFS, dst, deleteSource, overwrite, conf); for (Path src : srcs) { try { if (!copy(srcFS, src, dstFS, dst, deleteSource, overwrite, conf)) returnVal = false; } catch (IOException e) { gotException = true; exceptions.append(e.getMessage()); exceptions.append("\n"); } } return returnVal; }

FsShell hadoop run() hadoop shell shell -put -copyFromLocal copyFromLocal() shell copyFromLocalFile() FileUtil.copy() copy()
org.apache.hadoop.fs. FileUtil: public static boolean copy(FileSystem srcFS, Path src, FileSystem dstFS, Path dst, boolean deleteSource, boolean overwrite, Configuration conf) throws IOException { dst = checkDest(src.getName(), dstFS, dst, overwrite); if (srcFS.getFileStatus(src).isDir()) { checkDependencies(srcFS, src, dstFS, dst); if (!dstFS.mkdirs(dst)) { return false; } FileStatus contents[] = srcFS.listStatus(src); for (int i = 0; i < contents.length; i++) {
5

copy(srcFS, contents[i].getPath(), dstFS, new Path(dst, contents[i].getPath().getName()), deleteSource, overwrite, conf); } } else if (srcFS.isFile(src)) { InputStream in=null; OutputStream out = null; try { in = srcFS.open(src); out = dstFS.create(dst, overwrite); IOUtils.copyBytes(in, out, conf, true); } catch (IOException e) { IOUtils.closeStream(out); IOUtils.closeStream(in); throw e; } } else { throw new IOException(src.toString() + ": No such file or directory"); } if (deleteSource) { return srcFS.delete(src, true); } else { return true; } }

copy() conf in out IOUtils.copyBytes() dstFS.create()

FileSystem

DistributedFileSystem

dstFS.create(dst, overwrite);

return create(f, overwrite, getConf().getInt("io.file.buffer.size", 4096),getDefaultReplication(),getDefaultBlockSize());

return create(f, overwrite, bufferSize, replication, blockSize, null);

return this.create(f, FsPermission.getDefault(),overwrite, bufferSize, replication, blockSize, progress); 1 return new FSDataOutputStream (dfs.create(getPathName(f), permission, overwrite, true, replication, blockSize, progress, bufferSize), statistics);

// public abstract FSDataOutputStream create(Path f, FsPermission permission, boolean overwrite, int bufferSize, short replication, long blockSize, Progressable progress) throws IOException;

create() FileSystem FSDataOutputStream create() FSDataOutputStream 2 FS HDFS DistributedFileSystem create() FSDataOutputStream OutputStream dfs.create() DFSOutputStream

DFSClient

DFSOutputStream

ClientProtocol(NameNode)

DFSOutputStream(String src, FsPermission masked, boolean overwrite, boolean createParent, short replication, long blockSize, Progressable progress, int buffersize, int bytesPerChecksum) throws IOException { this(src, blockSize, progress, bytesPerChecksum, replication); computePacketChunkSize(writePacketSize, bytesPerChecksum); try { namenode.create( src, masked, clientName, overwrite, createParent, replication, blockSize); } catch(RemoteException re) { throw re.unwrapRemoteException(AccessControlException.class, FileAlreadyExistsException.class, FileNotFoundException.class, NSQuotaExceededException.class, DSQuotaExceededException.class); } streamer.start(); } new

return new FSDataOutputStream (dfs.create(getPathName(f), permission, overwrite, true, replication, blockSize, progress, bufferSize), statistics);

dfs.create() DFSClient create() OutputStream DFSOutputStream DFSOutputStream namenode streamer.start() pipeline DataStreamer data queue block 64M 64K packet 1000 packets/block DataStreamer namenode
org.apache.hadoop.hdfs.server.namenode. NameNode: public void create(String src, FsPermission masked, String clientName, boolean overwrite, boolean createParent, short replication, long blockSize ) throws IOException { String clientMachine = getClientMachine();
8

if (stateChangeLog.isDebugEnabled()) { stateChangeLog.debug("*DIR* NameNode.create: file " +src+" for "+clientName+" at "+clientMachine); } if (!checkPathLength(src)) { throw new IOException("create: Pathname too long. Limit " + MAX_PATH_LENGTH + " characters, " + MAX_PATH_DEPTH + " levels."); } namesystem.startFile(src, new PermissionStatus(UserGroupInformation.getCurrentUser().getShortUserName(), null, masked), clientName, clientMachine, overwrite, createParent, replication, blockSize); myMetrics.incrNumFilesCreated(); myMetrics.incrNumCreateFileOps(); } org.apache.hadoop.hdfs.server.namenode. FSNamesystem void startFile(String src, PermissionStatus permissions, String holder, String clientMachine, boolean overwrite, boolean createParent, short replication, long blockSize ) throws IOException { startFileInternal(src, permissions, holder, clientMachine, overwrite, false, createParent, replication, blockSize); getEditLog().logSync(); if (auditLog.isInfoEnabled() && isExternalInvocation()) { final HdfsFileStatus stat = dir.getFileInfo(src); logAuditEvent(UserGroupInformation.getCurrentUser(), Server.getRemoteIp(), "create", src, null, stat); } } org.apache.hadoop.hdfs.server.namenode. FSNamesystem private synchronized void startFileInternal(String src, PermissionStatus permissions, String holder, String clientMachine, boolean overwrite, boolean append, boolean createParent, short replication, long blockSize ) throws IOException {
9

DatanodeDescriptor clientNode = host2DataNodeMap.getDatanodeByHost(clientMachine); if (append) { // // Replace current node with a INodeUnderConstruction. // Recreate in-memory lease record. // INodeFile node = (INodeFile) myFile; INodeFileUnderConstruction cons = new INodeFileUnderConstruction( node.getLocalNameBytes(), node.getReplication(), node.getModificationTime(), node.getPreferredBlockSize(), node.getBlocks(), node.getPermissionStatus(), holder, clientMachine, clientNode); dir.replaceNode(src, node, cons); leaseManager.addLease(cons.clientName, src); } else { // Now we can add the name to the filesystem. This file has no // blocks associated with it. // checkFsObjectLimit(); // increment global generation stamp long genstamp = nextGenerationStamp(); INodeFileUnderConstruction newNode = dir.addFile(src, permissions, replication, blockSize, holder, clientMachine, clientNode, genstamp); if (newNode == null) { throw new IOException("DIR* NameSystem.startFile: " + "Unable to add file to namespace."); } leaseManager.addLease(newNode.clientName, src); if (NameNode.stateChangeLog.isDebugEnabled()) { NameNode.stateChangeLog.debug("DIR* NameSystem.startFile: " +"add "+src+" to namespace for "+holder); } }
10

namenode create() FSNamesystem startFileInternal() hadoop append INode node under construction blocks stamp client IOUtils.copyBytes() client & block

IOUtils

FSOutputSummer

DFSClient.DFSOutputStream

IOUtils.copyBytes(in, out, conf, true);

copyBytes(in, out, conf.getInt("io.file.buffer.size", 4096), close); copyBytes(in, out, buffSize); out.write(buf, 0, bytesRead); for (int n=0;n<len;n+=write1(b, off+n, len-n))

/**
 * Copies everything that can still be read from {@code in} into {@code out},
 * moving at most {@code buffSize} bytes per read/write round trip.
 *
 * <p>Neither stream is closed or flushed by this method.
 *
 * @param in       stream to drain; reading stops at end-of-stream (negative read result)
 * @param out      stream that receives the bytes
 * @param buffSize size in bytes of the intermediate copy buffer
 * @throws IOException if reading or writing fails, or if {@code out} is a
 *                     {@link PrintStream} that reports an error after a write
 */
public static void copyBytes(InputStream in, OutputStream out, int buffSize)
    throws IOException {
  // A PrintStream does not throw from write(); it records the failure
  // internally, so its error flag has to be polled after every write.
  PrintStream printTarget = null;
  if (out instanceof PrintStream) {
    printTarget = (PrintStream) out;
  }

  byte[] chunk = new byte[buffSize];
  for (int count = in.read(chunk); count >= 0; count = in.read(chunk)) {
    out.write(chunk, 0, count);
    if (printTarget != null && printTarget.checkError()) {
      throw new IOException("Unable to write to output stream.");
    }
  }
}

write1(b, off+n, len-n) writeChecksumChunk(b, off, length, false); writeChunk(b, off, len, checksum);

IOUtils copyBytes() FSOutputSummer checksum writeChecksumChunk() DFSClient DFSOutputStream writeChunk()

11

OutputStream

FilterOutputStream FileSystem

FSOutputSummer

DataOutputStream DistributedFileSystem

DFSOutputStream

datanode DataStreamer

create()

FSDataOutputStream(out)

DFSOutputStream FSOutputSummer DFSOutputStream writeChunk() DistributedFileSystem create() DFSOutputStream FSDataOutputStream DFSOutputStream writeChunk() client block packet 3 datanode1 datanode2 datanode3 client datanode1 packet1 datanode1 datanode1 datanode2 packet1 datanode2 client packet2 datanode1 datanode2 datanode3 packet1 datanode3 client packet3 datanode1 datanode1 packet2 datanode2 datanode datanode3 packet1 ack datanode2 datanode2 ack datanode1 client packet

12

org.apache.hadoop.hdfs.DFSClient.DFSOutputStream // @see FSOutputSummer#writeChunk() @Override protected synchronized void writeChunk(byte[] b, int offset, int len, byte[] checksum) throws IOException { checkOpen(); isClosed(); int cklen = checksum.length; int bytesPerChecksum = this.checksum.getBytesPerChecksum(); if (len > bytesPerChecksum) { throw new IOException("writeChunk() buffer size is " + len + " is larger than supported bytesPerChecksum " + bytesPerChecksum); } if (checksum.length != this.checksum.getChecksumSize()) { throw new IOException("writeChunk() checksum size is supposed to be " + this.checksum.getChecksumSize() + " but found to be " + checksum.length); } synchronized (dataQueue) { // If queue is full, then wait till we can create enough space while (!closed && dataQueue.size() + ackQueue.size() > maxPackets) { try { dataQueue.wait(); } catch (InterruptedException e) { } } isClosed(); if (currentPacket == null) { currentPacket = new Packet(packetSize, chunksPerPacket, bytesCurBlock); if (LOG.isDebugEnabled()) { LOG.debug("DFSClient writeChunk allocating new packet seqno=" + currentPacket.seqno + ", src=" + src + ", packetSize=" + packetSize + ", chunksPerPacket=" + chunksPerPacket + ", bytesCurBlock=" + bytesCurBlock); } }
13

currentPacket.writeChecksum(checksum, 0, cklen); currentPacket.writeData(b, offset, len); currentPacket.numChunks++; bytesCurBlock += len; // If packet is full, enqueue it for transmission // if (currentPacket.numChunks == currentPacket.maxChunks || bytesCurBlock == blockSize) { if (LOG.isDebugEnabled()) { LOG.debug("DFSClient writeChunk packet full seqno=" + currentPacket.seqno + ", src=" + src + ", bytesCurBlock=" + bytesCurBlock + ", blockSize=" + blockSize + ", appendChunk=" + appendChunk); } // // if we allocated a new packet because we encountered a block // boundary, reset bytesCurBlock. // if (bytesCurBlock == blockSize) { currentPacket.lastPacketInBlock = true; bytesCurBlock = 0; lastFlushOffset = 0; } enqueueCurrentPacket(); // If this was the first write after reopening a file, then the above // write filled up any partial chunk. Tell the summer to generate full // crc chunks from now on. if (appendChunk) { appendChunk = false; resetChecksumChunk(bytesPerChecksum); } int psize = Math.min((int)(blockSize-bytesCurBlock), writePacketSize); computePacketChunkSize(psize, bytesPerChecksum); } } //LOG.debug("DFSClient writeChunk done length " + len + // " checksum length " + cklen); }

14

org.apache.hadoop.hdfs.DFSClient.DFSOutputStream private synchronized void enqueueCurrentPacket() { synchronized (dataQueue) { if (currentPacket == null) return; dataQueue.addLast(currentPacket); dataQueue.notifyAll(); lastQueuedSeqno = currentPacket.seqno; currentPacket = null; } }

DFSOutputStream
org.apache.hadoop.hdfs.DFSClient.DFSOutputStream private LinkedList<Packet> dataQueue = new LinkedList<Packet>(); private LinkedList<Packet> ackQueue = new LinkedList<Packet>();

writeChunk() data queue packet currentPacket new Packet packet checksum packet data queue data queue DataStreamer
org.apache.hadoop.hdfs.DFSClient.DFSOutputStream.DataStreamer public void run() { long lastPacket = 0; while (!closed && clientRunning) { // if the Responder encountered an error, shutdown Responder if (hasError && response != null) { try { response.close(); response.join(); response = null; } catch (InterruptedException e) { } } Packet one = null; synchronized (dataQueue) { // process IO errors if any
15

boolean doSleep = processDatanodeError(hasError, false); // wait for a packet to be sent. long now = System.currentTimeMillis(); while ((!closed && !hasError && clientRunning && dataQueue.size() == 0 && (blockStream == null || ( blockStream != null && now - lastPacket < timeoutValue/2))) || doSleep) { long timeout = timeoutValue/2 - (now-lastPacket); timeout = timeout <= 0 ? 1000 : timeout; try { dataQueue.wait(timeout); now = System.currentTimeMillis(); } catch (InterruptedException e) { } doSleep = false; } if (closed || hasError || !clientRunning) { continue; } try { // get packet to be sent. if (dataQueue.isEmpty()) { one = new Packet(); // heartbeat packet } else { one = dataQueue.getFirst(); // regular data packet } long offsetInBlock = one.offsetInBlock; // get new block from namenode. if (blockStream == null) { LOG.debug("Allocating new block"); nodes = nextBlockOutputStream(src); this.setName("DataStreamer for file " + src + " block " + block); response = new ResponseProcessor(nodes); response.start(); } if (offsetInBlock >= blockSize) {
16

throw new IOException("BlockSize " + blockSize + " is smaller than data size. " + " Offset of packet in block " + offsetInBlock + " Aborting file " + src); } ByteBuffer buf = one.getBuffer(); // move packet from dataQueue to ackQueue if (!one.isHeartbeatPacket()) { dataQueue.removeFirst(); dataQueue.notifyAll(); synchronized (ackQueue) { ackQueue.addLast(one); ackQueue.notifyAll(); } } // write out data to remote datanode blockStream.write(buf.array(), buf.position(), buf.remaining()); if (one.lastPacketInBlock) { blockStream.writeInt(0); // indicate end-of-block } blockStream.flush(); lastPacket = System.currentTimeMillis(); if (LOG.isDebugEnabled()) { LOG.debug("DataStreamer block " + block + " wrote packet seqno:" + one.seqno + " size:" + buf.remaining() + " offsetInBlock:" + one.offsetInBlock + " lastPacketInBlock:" + one.lastPacketInBlock); } } catch (Throwable e) { LOG.warn("DataStreamer Exception: " + StringUtils.stringifyException(e)); if (e instanceof IOException) { setLastException((IOException)e); } hasError = true; } }
17

if (closed || hasError || !clientRunning) { continue; } // Is this block full? if (one.lastPacketInBlock) { synchronized (ackQueue) { while (!hasError && ackQueue.size() != 0 && clientRunning) { try { ackQueue.wait(); // wait for acks to arrive from datanodes } catch (InterruptedException e) { } } } LOG.debug("Closing old block " + block); this.setName("DataStreamer for file " + src); response.close(); // ignore all errors in Response try { response.join(); response = null; } catch (InterruptedException e) { } if (closed || hasError || !clientRunning) { continue; } synchronized (dataQueue) { IOUtils.cleanup(LOG, blockStream, blockReplyStream); nodes = null; response = null; blockStream = null; blockReplyStream = null; } } if (progress != null) { progress.progress(); } // This is used by unit test to trigger race conditions. if (artificialSlowdown != 0 && clientRunning) { LOG.debug("Sleeping for artificial slowdown of " + artificialSlowdown + "ms"); try {
18

Thread.sleep(artificialSlowdown); } catch (InterruptedException e) {} } } }

DataStreamer run() packet 1s packet nextBlockOutputStream() namenode datanodes blocks


org.apache.hadoop.hdfs.DFSClient.DFSOutputStream private DatanodeInfo[] nextBlockOutputStream(String client) throws IOException { LocatedBlock lb = null; boolean retry = false; DatanodeInfo[] nodes; int count = conf.getInt("dfs.client.block.write.retries", 3); boolean success; do { hasError = false; lastException = null; errorIndex = 0; retry = false; nodes = null; success = false; long startTime = System.currentTimeMillis(); DatanodeInfo[] excluded = excludedNodes.toArray(new DatanodeInfo[0]); lb = locateFollowingBlock(startTime, excluded.length > 0 ? excluded : null); block = lb.getBlock(); accessToken = lb.getBlockToken(); nodes = lb.getLocations(); // // Connect to first DataNode in the list. // success = createBlockOutputStream(nodes, clientName, false); if (!success) { LOG.info("Abandoning block " + block); namenode.abandonBlock(block, src, clientName); if (errorIndex < nodes.length) { LOG.info("Excluding datanode " + nodes[errorIndex]);
19

excludedNodes.add(nodes[errorIndex]); } // Connection failed. Let's wait a little bit and retry retry = true; } } while (retry && --count >= 0); if (!success) { throw new IOException("Unable to create new block."); } return nodes; }

nextBlockOutputStream() 3 datanodes blocks locateFollowingBlock() datanode createBlockOutputStream()


org.apache.hadoop.hdfs.DFSClient.DFSOutputStream private LocatedBlock locateFollowingBlock(long start, DatanodeInfo[] excludedNodes ) throws IOException { int retries = conf.getInt("dfs.client.block.write.locateFollowingBlock.retries", 5); long sleeptime = 400; while (true) { long localstart = System.currentTimeMillis(); while (true) { try { if (serverSupportsHdfs630) { return namenode.addBlock(src, clientName, excludedNodes); } else { return namenode.addBlock(src, clientName); } } catch (RemoteException e) { } } }

locateFollowingBlock() 5 namenode datanodes blocks namenode datanodes blocks


20

namenode client addBlock() FSNamesystem.getAdditionalBlock() DatanodeDescriptor targets[] block datanodes INode[] pathINodes path INode INode pendingFile under construction INode newBlock block LocatedBlock() org.apache.hadoop.hdfs.DFSClient.DFSOutputStream.nextBlockOutputStream() lb client org.apache.hadoop.hdfs.DFSClient.DFSOutputStream createBlockOutputStream() client datanode
org.apache.hadoop.hdfs.DFSClient.DFSOutputStream // connects to the first datanode in the pipeline // Returns true if success, otherwise return failure. // private boolean createBlockOutputStream(DatanodeInfo[] nodes, String client, boolean recoveryFlag) { short pipelineStatus = (short)DataTransferProtocol.OP_STATUS_SUCCESS; String firstBadLink = ""; if (LOG.isDebugEnabled()) { for (int i = 0; i < nodes.length; i++) { LOG.debug("pipeline = " + nodes[i].getName()); } } // persist blocks on namenode on next flush persistBlocks = true; boolean result = false; try { LOG.debug("Connecting to " + nodes[0].getName()); InetSocketAddress target = NetUtils.createSocketAddr(nodes[0].getName()); s = socketFactory.createSocket(); timeoutValue = 3000 * nodes.length + socketTimeout; NetUtils.connect(s, target, timeoutValue); s.setSoTimeout(timeoutValue); s.setSendBufferSize(DEFAULT_DATA_SOCKET_SIZE); LOG.debug("Send buf size " + s.getSendBufferSize()); long writeTimeout = HdfsConstants.WRITE_TIMEOUT_EXTENSION * nodes.length
21

+ datanodeWriteTimeout; // // Xmit header info to datanode // DataOutputStream out = new DataOutputStream( new BufferedOutputStream(NetUtils.getOutputStream(s, writeTimeout), DataNode.SMALL_BUFFER_SIZE)); blockReplyStream = new DataInputStream(NetUtils.getInputStream(s)); out.writeShort( DataTransferProtocol.DATA_TRANSFER_VERSION ); out.write( DataTransferProtocol.OP_WRITE_BLOCK ); out.writeLong( block.getBlockId() ); out.writeLong( block.getGenerationStamp() ); out.writeInt( nodes.length ); out.writeBoolean( recoveryFlag ); // recovery flag Text.writeString( out, client ); out.writeBoolean(false); // Not sending src node information out.writeInt( nodes.length - 1 ); for (int i = 1; i < nodes.length; i++) { nodes[i].write(out); } accessToken.write(out); checksum.writeHeader( out ); out.flush(); // receive ack for connect pipelineStatus = blockReplyStream.readShort(); firstBadLink = Text.readString(blockReplyStream); if (pipelineStatus != DataTransferProtocol.OP_STATUS_SUCCESS) { if (pipelineStatus == DataTransferProtocol.OP_STATUS_ERROR_ACCESS_TOKEN) { throw new InvalidBlockTokenException( "Got access token error for connect ack with firstBadLink as " + firstBadLink); } else { throw new IOException("Bad connect ack with firstBadLink as " + firstBadLink); } } blockStream = out; result = true; // success
22

} catch (IOException ie) { } finally { } return result; }

nodes[0] pipeline datanode stamp datanode datanodes datanode for i 1 datanode pipeline
org.apache.hadoop.hdfs.DFSClient.DFSOutputStream.DataStreamer.run()

datanodes blocks datanode one data queue ack queue ack OK ack queue datanode ack queue data queue blockStream.write() datanode block packet block packet 0 block datanode & datanode DataTransferProtocol.OP_WRITE_BLOCK datanode DataXceiver writeBlock()
org.apache.hadoop.hdfs.server.datanode.DataXceiver private void writeBlock(DataInputStream in) throws IOException { DatanodeInfo srcDataNode = null; LOG.debug("writeBlock receive buf size " + s.getReceiveBufferSize() + " tcp no delay " + s.getTcpNoDelay()); // // Read in the header // Block block = new Block(in.readLong(), dataXceiverServer.estimateBlockSize, in.readLong()); LOG.info("Receiving block " + block + " src: " + remoteAddress +
23

" dest: " + localAddress); int pipelineSize = in.readInt(); // num of datanodes in entire pipeline boolean isRecovery = in.readBoolean(); // is this part of recovery? String client = Text.readString(in); // working on behalf of this client boolean hasSrcDataNode = in.readBoolean(); // is src node info present if (hasSrcDataNode) { srcDataNode = new DatanodeInfo(); srcDataNode.readFields(in); } int numTargets = in.readInt(); if (numTargets < 0) { throw new IOException("Mislabelled incoming datastream."); } DatanodeInfo targets[] = new DatanodeInfo[numTargets]; for (int i = 0; i < targets.length; i++) { DatanodeInfo tmp = new DatanodeInfo(); tmp.readFields(in); targets[i] = tmp; } Token<BlockTokenIdentifier> accessToken = new Token<BlockTokenIdentifier>(); accessToken.readFields(in); DataOutputStream replyOut = null; // stream to prev target replyOut = new DataOutputStream( NetUtils.getOutputStream(s, datanode.socketWriteTimeout)); if (datanode.isBlockTokenEnabled) { try { datanode.blockTokenSecretManager.checkAccess(accessToken, null, block, BlockTokenSecretManager.AccessMode.WRITE); } catch (InvalidToken e) { try { if (client.length() != 0) { replyOut.writeShort((short)DataTransferProtocol.OP_STATUS_ERROR_ACCESS_TOKEN); Text.writeString(replyOut, datanode.dnRegistration.getName()); replyOut.flush(); } throw new IOException("Access token verification failed, for client " + remoteAddress + " for OP_WRITE_BLOCK for block " + block); } finally { IOUtils.closeStream(replyOut); } } }

24

DataOutputStream mirrorOut = null; // stream to next target DataInputStream mirrorIn = null; // reply from next target Socket mirrorSock = null; // socket to next target BlockReceiver blockReceiver = null; // responsible for data handling String mirrorNode = null; // the name:port of next target String firstBadLink = ""; // first datanode that failed in connection setup short mirrorInStatus = (short)DataTransferProtocol.OP_STATUS_SUCCESS; try { // open a block receiver and check if the block does not exist blockReceiver = new BlockReceiver(block, in, s.getRemoteSocketAddress().toString(), s.getLocalSocketAddress().toString(), isRecovery, client, srcDataNode, datanode); // // Open network conn to backup machine, if // appropriate // if (targets.length > 0) { InetSocketAddress mirrorTarget = null; // Connect to backup machine mirrorNode = targets[0].getName(); mirrorTarget = NetUtils.createSocketAddr(mirrorNode); mirrorSock = datanode.newSocket(); try { int timeoutValue = datanode.socketTimeout + (HdfsConstants.READ_TIMEOUT_EXTENSION * numTargets); int writeTimeout = datanode.socketWriteTimeout + (HdfsConstants.WRITE_TIMEOUT_EXTENSION * numTargets); NetUtils.connect(mirrorSock, mirrorTarget, timeoutValue); mirrorSock.setSoTimeout(timeoutValue); mirrorSock.setSendBufferSize(DEFAULT_DATA_SOCKET_SIZE); mirrorOut = new DataOutputStream( new BufferedOutputStream( NetUtils.getOutputStream(mirrorSock, writeTimeout), SMALL_BUFFER_SIZE)); mirrorIn = new DataInputStream(NetUtils.getInputStream(mirrorSock)); // Write header: Copied from DFSClient.java! mirrorOut.writeShort( DataTransferProtocol.DATA_TRANSFER_VERSION ); mirrorOut.write( DataTransferProtocol.OP_WRITE_BLOCK ); mirrorOut.writeLong( block.getBlockId() );
25

mirrorOut.writeLong( block.getGenerationStamp() ); mirrorOut.writeInt( pipelineSize ); mirrorOut.writeBoolean( isRecovery ); Text.writeString( mirrorOut, client ); mirrorOut.writeBoolean(hasSrcDataNode); if (hasSrcDataNode) { // pass src node information srcDataNode.write(mirrorOut); } mirrorOut.writeInt( targets.length - 1 ); for ( int i = 1; i < targets.length; i++ ) { targets[i].write( mirrorOut ); } accessToken.write(mirrorOut); blockReceiver.writeChecksumHeader(mirrorOut); mirrorOut.flush(); // read connect ack (only for clients, not for replication req) if (client.length() != 0) { mirrorInStatus = mirrorIn.readShort(); firstBadLink = Text.readString(mirrorIn); if (LOG.isDebugEnabled() || mirrorInStatus != DataTransferProtocol.OP_STATUS_SUCCESS) { LOG.info("Datanode " + targets.length + " got response for connect ack " + " from downstream datanode with firstbadlink as " + firstBadLink); } } } catch (IOException e) { } } // send connect ack back to source (only for clients) if (client.length() != 0) { if (LOG.isDebugEnabled() || mirrorInStatus != DataTransferProtocol.OP_STATUS_SUCCESS) { LOG.info("Datanode " + targets.length + " forwarding connect ack to upstream firstbadlink is " + firstBadLink); } replyOut.writeShort(mirrorInStatus);

26

Text.writeString(replyOut, firstBadLink); replyOut.flush(); } // receive the block and mirror to the next target String mirrorAddr = (mirrorSock == null) ? null : mirrorNode; blockReceiver.receiveBlock(mirrorOut, mirrorIn, replyOut, mirrorAddr, null, targets.length); // if this write is for a replication request (and not // from a client), then confirm block. For client-writes, // the block is finalized in the PacketResponder. if (client.length() == 0) { datanode.notifyNamenodeReceivedBlock(block, DataNode.EMPTY_DEL_HINT); LOG.info("Received block " + block + " src: " + remoteAddress + " dest: " + localAddress + " of size " + block.getNumBytes()); } if (datanode.blockScanner != null) { datanode.blockScanner.addBlock(block); } } catch (IOException ioe) { } finally { } }

datanode datanodes DatanodeInfo targets[] client datanode replyOut datanode datanode BlockReceiver, DataInputStream in DataNode DataOutputStream mirrorOut DataNode OutputStream out datanode targets.length>0 datanode 1 datanode datanodes
27

datanode receiveBlock() datanode


org.apache.hadoop.hdfs.server.datanode.BlockReceiver void receiveBlock( DataOutputStream mirrOut, // output to next datanode DataInputStream mirrIn, // input from next datanode DataOutputStream replyOut, // output to previous datanode String mirrAddr, BlockTransferThrottler throttlerArg, int numTargets) throws IOException { mirrorOut = mirrOut; mirrorAddr = mirrAddr; throttler = throttlerArg; try { // write data chunk header if (!finalized) { BlockMetadataHeader.writeHeader(checksumOut, checksum); } if (clientName.length() > 0) { responder = new Daemon(datanode.threadGroup, new PacketResponder(this, block, mirrIn, replyOut, numTargets, Thread.currentThread())); responder.start(); // start thread to processes reponses } /* * Receive until packet length is zero. */ while (receivePacket() > 0) {} // flush the mirror out if (mirrorOut != null) { try { mirrorOut.writeInt(0); // mark the end of the block mirrorOut.flush(); } catch (IOException e) { handleMirrorOutError(e); } } // wait for all outstanding packet responses. And then
28

// indicate responder to gracefully shutdown. if (responder != null) { ((PacketResponder)responder.getRunnable()).close(); } // if this write is for a replication request (and not // from a client), then finalize block. For client-writes, // the block is finalized in the PacketResponder. if (clientName.length() == 0) { // close the block/crc files close(); // Finalize the block. Does this fsync()? block.setNumBytes(offsetInBlock); datanode.data.finalizeBlock(block); datanode.myMetrics.incrBlocksWritten(); } } catch (IOException ioe) { } finally { } } org.apache.hadoop.hdfs.server.datanode.BlockReceiver private int receivePacket() throws IOException { int payloadLen = readNextPacket(); if (payloadLen <= 0) { return payloadLen; } buf.mark(); //read the header buf.getInt(); // packet length offsetInBlock = buf.getLong(); // get offset of packet in block long seqno = buf.getLong(); // get seqno boolean lastPacketInBlock = (buf.get() != 0); int endOfHeader = buf.position(); buf.reset();

29

if (LOG.isDebugEnabled()){ LOG.debug("Receiving one packet for block " + block + " of length " + payloadLen + " seqno " + seqno + " offsetInBlock " + offsetInBlock + " lastPacketInBlock " + lastPacketInBlock); } setBlockPosition(offsetInBlock); // First write the packet to the mirror: if (mirrorOut != null && !mirrorError) { try { mirrorOut.write(buf.array(), buf.position(), buf.remaining()); mirrorOut.flush(); } catch (IOException e) { handleMirrorOutError(e); } } buf.position(endOfHeader); int len = buf.getInt(); if (len < 0) { throw new IOException("Got wrong length during writeBlock(" + block + ") from " + inAddr + " at offset " + offsetInBlock + ": " + len); } if (len == 0) { LOG.debug("Receiving empty packet for block " + block); } else { offsetInBlock += len; int checksumLen = ((len + bytesPerChecksum - 1)/bytesPerChecksum)* checksumSize; if ( buf.remaining() != (checksumLen + len)) { throw new IOException("Data remaining in packet does not match " + "sum of checksumLen and dataLen"); } int checksumOff = buf.position(); int dataOff = checksumOff + checksumLen; byte pktBuf[] = buf.array();
30

buf.position(buf.limit()); // move to the end of the data. /* skip verifying checksum iff this is not the last one in the * pipeline and clientName is non-null. i.e. Checksum is verified * on all the datanodes when the data is being written by a * datanode rather than a client. Whe client is writing the data, * protocol includes acks and only the last datanode needs to verify * checksum. */ if (mirrorOut == null || clientName.length() == 0) { verifyChunks(pktBuf, dataOff, len, pktBuf, checksumOff); } try { if (!finalized) { //finally write to the disk : out.write(pktBuf, dataOff, len); // If this is a partial chunk, then verify that this is the only // chunk in the packet. Calculate new crc for this chunk. if (partialCrc != null) { if (len > bytesPerChecksum) { throw new IOException("Got wrong length during writeBlock(" + block + ") from " + inAddr + " " + "A packet can have only one partial chunk."+ " len = " + len + " bytesPerChecksum " + bytesPerChecksum); } partialCrc.update(pktBuf, dataOff, len); byte[] buf = FSOutputSummer.convertToByteStream(partialCrc, checksumSize); checksumOut.write(buf); LOG.debug("Writing out partial crc for data len " + len); partialCrc = null; } else { checksumOut.write(pktBuf, checksumOff, checksumLen); } datanode.myMetrics.incrBytesWritten(len); /// flush entire packet before sending ack flush(); // update length only after flush to disk datanode.data.setVisibleLength(block, offsetInBlock);
31

} } catch (IOException iex) { datanode.checkDiskError(iex); throw iex; } } // put in queue for pending acks if (responder != null) { ((PacketResponder)responder.getRunnable()).enqueue(seqno, lastPacketInBlock); } if (throttler != null) { // throttle I/O throttler.throttle(payloadLen); } return payloadLen; }

receiveBlock() receivePacket() packet 0 client queue datanode ack datanode client receivePacket() packet packet datanode client ack org.apache.hadoop.hdfs.DFSClient.DFSOutputStream.ResponseProcessor.run() packet ack queue OP_READ_BLOCK (81)

32

FileSystem open() DistributedFileSystem RPC namenode namenode datanode DistributedFileSystem FSDataInputStream FSDataInputStream datanode namenode I/O DFSInputStream read() datanode DFSInputStream datanode read() datanode DFSInputStream datanode datanode FSDataInputStream close()

33