
Example 1 with PacketHeader

use of org.apache.hadoop.hdfs.protocol.datatransfer.PacketHeader in project hadoop by apache.

In class BlockSender, the writePacketHeader method:

/**
   * Write packet header into {@code pkt},
   * return the length of the header written.
   */
private int writePacketHeader(ByteBuffer pkt, int dataLen, int packetLen) {
    pkt.clear();
    // both syncBlock and syncPacket are false
    PacketHeader header = new PacketHeader(packetLen, offset, seqno, (dataLen == 0), dataLen, false);
    int size = header.getSerializedSize();
    pkt.position(PacketHeader.PKT_MAX_HEADER_LEN - size);
    header.putInBuffer(pkt);
    return size;
}
Also used : PacketHeader(org.apache.hadoop.hdfs.protocol.datatransfer.PacketHeader)
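
For orientation, here is a minimal, self-contained sketch of the buffer layout the method above implies: the header is right-aligned inside a PKT_MAX_HEADER_LEN prefix so that checksum and data bytes can start at a fixed offset. Only the PacketHeader calls shown above are assumed; the class name and the sizes are illustrative.

import java.nio.ByteBuffer;

import org.apache.hadoop.hdfs.protocol.datatransfer.PacketHeader;

public class PacketHeaderLayoutSketch {
    public static void main(String[] args) {
        int dataLen = 4096;                        // illustrative sizes
        int checksumLen = 32;
        int packetLen = dataLen + checksumLen + 4; // illustrative; the real value comes from the caller

        // Reserve the maximum header size up front (the caller's buffer
        // presumably does the same).
        ByteBuffer pkt = ByteBuffer.allocate(
            PacketHeader.PKT_MAX_HEADER_LEN + checksumLen + dataLen);

        PacketHeader header =
            new PacketHeader(packetLen, 0L, 1, false, dataLen, false);
        int size = header.getSerializedSize();

        // Right-align the header, exactly as writePacketHeader() above does.
        pkt.clear();
        pkt.position(PacketHeader.PKT_MAX_HEADER_LEN - size);
        header.putInBuffer(pkt);

        // Everything from headerStart to the end of the buffer is what goes
        // on the wire: [header (size bytes)][checksums][data].
        int headerStart = PacketHeader.PKT_MAX_HEADER_LEN - size;
        System.out.println("header occupies bytes [" + headerStart + ", "
            + PacketHeader.PKT_MAX_HEADER_LEN + ") of the packet buffer");
    }
}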

Example 2 with PacketHeader

use of org.apache.hadoop.hdfs.protocol.datatransfer.PacketHeader in project hadoop by apache.

In class BlockReceiver, the receivePacket method:

/** 
   * Receives and processes a packet. It can contain many chunks.
   * Returns the number of data bytes that the packet has.
   */
private int receivePacket() throws IOException {
    // read the next packet
    packetReceiver.receiveNextPacket(in);
    PacketHeader header = packetReceiver.getHeader();
    if (LOG.isDebugEnabled()) {
        LOG.debug("Receiving one packet for block " + block + ": " + header);
    }
    // Sanity check the header
    if (header.getOffsetInBlock() > replicaInfo.getNumBytes()) {
        throw new IOException("Received an out-of-sequence packet for " + block + "from " + inAddr + " at offset " + header.getOffsetInBlock() + ". Expecting packet starting at " + replicaInfo.getNumBytes());
    }
    if (header.getDataLen() < 0) {
        throw new IOException("Got wrong length during writeBlock(" + block + ") from " + inAddr + " at offset " + header.getOffsetInBlock() + ": " + header.getDataLen());
    }
    long offsetInBlock = header.getOffsetInBlock();
    long seqno = header.getSeqno();
    boolean lastPacketInBlock = header.isLastPacketInBlock();
    final int len = header.getDataLen();
    boolean syncBlock = header.getSyncBlock();
    // avoid double sync'ing on close
    if (syncBlock && lastPacketInBlock) {
        this.syncOnClose = false;
    }
    // update received bytes
    final long firstByteInBlock = offsetInBlock;
    offsetInBlock += len;
    if (replicaInfo.getNumBytes() < offsetInBlock) {
        replicaInfo.setNumBytes(offsetInBlock);
    }
    // put in queue for pending acks, unless sync was requested
    if (responder != null && !syncBlock && !shouldVerifyChecksum()) {
        ((PacketResponder) responder.getRunnable()).enqueue(seqno, lastPacketInBlock, offsetInBlock, Status.SUCCESS);
    }
    // Drop heartbeat for testing.
    if (seqno < 0 && len == 0 && DataNodeFaultInjector.get().dropHeartbeatPacket()) {
        return 0;
    }
    //First write the packet to the mirror:
    if (mirrorOut != null && !mirrorError) {
        try {
            long begin = Time.monotonicNow();
            // For testing. Normally no-op.
            DataNodeFaultInjector.get().stopSendingPacketDownstream(mirrorAddr);
            packetReceiver.mirrorPacketTo(mirrorOut);
            mirrorOut.flush();
            long now = Time.monotonicNow();
            setLastSentTime(now);
            long duration = now - begin;
            DataNodeFaultInjector.get().logDelaySendingPacketDownstream(mirrorAddr, duration);
            trackSendPacketToLastNodeInPipeline(duration);
            if (duration > datanodeSlowLogThresholdMs) {
                LOG.warn("Slow BlockReceiver write packet to mirror took " + duration + "ms (threshold=" + datanodeSlowLogThresholdMs + "ms)");
            }
        } catch (IOException e) {
            handleMirrorOutError(e);
        }
    }
    ByteBuffer dataBuf = packetReceiver.getDataSlice();
    ByteBuffer checksumBuf = packetReceiver.getChecksumSlice();
    if (lastPacketInBlock || len == 0) {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Receiving an empty packet or the end of the block " + block);
        }
        // sync block if requested
        if (syncBlock) {
            flushOrSync(true);
        }
    } else {
        final int checksumLen = diskChecksum.getChecksumSize(len);
        final int checksumReceivedLen = checksumBuf.capacity();
        if (checksumReceivedLen > 0 && checksumReceivedLen != checksumLen) {
            throw new IOException("Invalid checksum length: received length is " + checksumReceivedLen + " but expected length is " + checksumLen);
        }
        if (checksumReceivedLen > 0 && shouldVerifyChecksum()) {
            try {
                verifyChunks(dataBuf, checksumBuf);
            } catch (IOException ioe) {
                // checksum error detected locally. there is no reason to continue.
                if (responder != null) {
                    try {
                        ((PacketResponder) responder.getRunnable()).enqueue(seqno, lastPacketInBlock, offsetInBlock, Status.ERROR_CHECKSUM);
                        // Wait until the responder sends back the response
                        // and interrupt this thread.
                        Thread.sleep(3000);
                    } catch (InterruptedException e) {
                        // ignored; the responder will interrupt this thread anyway
                    }
                }
                throw new IOException("Terminating due to a checksum error. " + ioe);
            }
            if (needsChecksumTranslation) {
                // overwrite the checksums in the packet buffer with the
                // appropriate polynomial for the disk storage.
                translateChunks(dataBuf, checksumBuf);
            }
        }
        if (checksumReceivedLen == 0 && !streams.isTransientStorage()) {
            // checksum is missing, need to calculate it
            checksumBuf = ByteBuffer.allocate(checksumLen);
            diskChecksum.calculateChunkedSums(dataBuf, checksumBuf);
        }
        // by this point, the data in the buffer uses the disk checksum
        final boolean shouldNotWriteChecksum = checksumReceivedLen == 0 && streams.isTransientStorage();
        try {
            long onDiskLen = replicaInfo.getBytesOnDisk();
            if (onDiskLen < offsetInBlock) {
                // Normally the beginning of an incoming packet is aligned with the
                // existing data on disk. If the beginning packet data offset is not
                // checksum chunk aligned, the end of packet will not go beyond the
                // next chunk boundary.
                // When a failure-recovery is involved, the client state and
                // the datanode state may not exactly agree. I.e. the client may
                // resend part of data that is already on disk. Correct number of
                // bytes should be skipped when writing the data and checksum
                // buffers out to disk.
                long partialChunkSizeOnDisk = onDiskLen % bytesPerChecksum;
                long lastChunkBoundary = onDiskLen - partialChunkSizeOnDisk;
                boolean alignedOnDisk = partialChunkSizeOnDisk == 0;
                boolean alignedInPacket = firstByteInBlock % bytesPerChecksum == 0;
                // If the end of the on-disk data is not chunk-aligned, the last
                // checksum needs to be overwritten.
                boolean overwriteLastCrc = !alignedOnDisk && !shouldNotWriteChecksum;
                // If the starting offset of the packet data is at the last chunk
                // boundary of the data on disk, the partial checksum recalculation
                // can be skipped and the checksum supplied by the client can be used
                // instead. This reduces disk reads and cpu load.
                boolean doCrcRecalc = overwriteLastCrc && (lastChunkBoundary != firstByteInBlock);
                // A packet whose data does not start on a chunk boundary must
                // not carry more than one chunk of data.
                if (!alignedInPacket && len > bytesPerChecksum) {
                    throw new IOException("Unexpected packet data length for " + block + " from " + inAddr + ": a partial chunk must be " + " sent in an individual packet (data length = " + len + " > bytesPerChecksum = " + bytesPerChecksum + ")");
                }
                // If the last portion of the block file is not a full chunk,
                // then read in pre-existing partial data chunk and recalculate
                // the checksum so that the checksum calculation can continue
                // from the right state. If the client provided the checksum for
                // the whole chunk, this is not necessary.
                Checksum partialCrc = null;
                if (doCrcRecalc) {
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("receivePacket for " + block + ": previous write did not end at the chunk boundary." + " onDiskLen=" + onDiskLen);
                    }
                    long offsetInChecksum = BlockMetadataHeader.getHeaderSize() + onDiskLen / bytesPerChecksum * checksumSize;
                    partialCrc = computePartialChunkCrc(onDiskLen, offsetInChecksum);
                }
                // The data buffer position where write will begin. If the packet
                // data and on-disk data have no overlap, this will not be at the
                // beginning of the buffer.
                int startByteToDisk = (int) (onDiskLen - firstByteInBlock) + dataBuf.arrayOffset() + dataBuf.position();
                // Actual number of data bytes to write.
                int numBytesToDisk = (int) (offsetInBlock - onDiskLen);
                // Write data to disk.
                long begin = Time.monotonicNow();
                streams.writeDataToDisk(dataBuf.array(), startByteToDisk, numBytesToDisk);
                long duration = Time.monotonicNow() - begin;
                if (duration > datanodeSlowLogThresholdMs) {
                    LOG.warn("Slow BlockReceiver write data to disk cost:" + duration + "ms (threshold=" + datanodeSlowLogThresholdMs + "ms)");
                }
                if (duration > maxWriteToDiskMs) {
                    maxWriteToDiskMs = duration;
                }
                final byte[] lastCrc;
                if (shouldNotWriteChecksum) {
                    lastCrc = null;
                } else {
                    int skip = 0;
                    byte[] crcBytes = null;
                    // First, prepare to overwrite the partial crc at the end.
                    if (overwriteLastCrc) {
                        // not chunk-aligned on disk
                        // prepare to overwrite last checksum
                        adjustCrcFilePosition();
                    }
                    // If a partial CRC was recalculated above, finish it with the
                    // newly received bytes up to the chunk boundary, then write it out.
                    if (doCrcRecalc) {
                        // Calculate new crc for this chunk.
                        int bytesToReadForRecalc = (int) (bytesPerChecksum - partialChunkSizeOnDisk);
                        if (numBytesToDisk < bytesToReadForRecalc) {
                            bytesToReadForRecalc = numBytesToDisk;
                        }
                        partialCrc.update(dataBuf.array(), startByteToDisk, bytesToReadForRecalc);
                        byte[] buf = FSOutputSummer.convertToByteStream(partialCrc, checksumSize);
                        crcBytes = copyLastChunkChecksum(buf, checksumSize, buf.length);
                        checksumOut.write(buf);
                        if (LOG.isDebugEnabled()) {
                            LOG.debug("Writing out partial crc for data len " + len + ", skip=" + skip);
                        }
                        // Skip the client-supplied checksum for the chunk
                        // whose CRC was just recalculated and written.
                        skip++;
                    }
                    // Determine how many checksums need to be skipped up to the last
                    // boundary. The checksum after the boundary was already counted
                    // above. Only count the number of checksums skipped up to the
                    // boundary here.
                    long skippedDataBytes = lastChunkBoundary - firstByteInBlock;
                    if (skippedDataBytes > 0) {
                        skip += (int) (skippedDataBytes / bytesPerChecksum) + ((skippedDataBytes % bytesPerChecksum == 0) ? 0 : 1);
                    }
                    // Convert to number of bytes
                    skip *= checksumSize;
                    // write the rest of checksum
                    final int offset = checksumBuf.arrayOffset() + checksumBuf.position() + skip;
                    final int end = offset + checksumLen - skip;
                    // If the recalculated partial CRC already covered everything,
                    // there is nothing more to write after it.
                    if (offset >= end && doCrcRecalc) {
                        lastCrc = crcBytes;
                    } else {
                        final int remainingBytes = checksumLen - skip;
                        lastCrc = copyLastChunkChecksum(checksumBuf.array(), checksumSize, end);
                        checksumOut.write(checksumBuf.array(), offset, remainingBytes);
                    }
                }
                // flush entire packet, sync if requested
                flushOrSync(syncBlock);
                replicaInfo.setLastChecksumAndDataLen(offsetInBlock, lastCrc);
                datanode.metrics.incrBytesWritten(len);
                datanode.metrics.incrTotalWriteTime(duration);
                manageWriterOsCache(offsetInBlock);
            }
        } catch (IOException iex) {
            // Volume error check moved to FileIoProvider
            throw iex;
        }
    }
    // if sync was requested, put in queue for pending acks here
    // (after the fsync finished)
    if (responder != null && (syncBlock || shouldVerifyChecksum())) {
        ((PacketResponder) responder.getRunnable()).enqueue(seqno, lastPacketInBlock, offsetInBlock, Status.SUCCESS);
    }
    /*
     * Send in-progress responses for the replaceBlock() calls back to caller to
     * avoid timeouts due to balancer throttling. HDFS-6247
     */
    if (isReplaceBlock && (Time.monotonicNow() - lastResponseTime > responseInterval)) {
        BlockOpResponseProto.Builder response = BlockOpResponseProto.newBuilder().setStatus(Status.IN_PROGRESS);
        response.build().writeDelimitedTo(replyOut);
        replyOut.flush();
        lastResponseTime = Time.monotonicNow();
    }
    if (throttler != null) {
        // throttle I/O
        throttler.throttle(len);
    }
    return lastPacketInBlock ? -1 : len;
}
Also used : Checksum(java.util.zip.Checksum) DataChecksum(org.apache.hadoop.util.DataChecksum) BlockOpResponseProto(org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.BlockOpResponseProto) PacketHeader(org.apache.hadoop.hdfs.protocol.datatransfer.PacketHeader) IOException(java.io.IOException) ByteBuffer(java.nio.ByteBuffer)
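
The partial-chunk bookkeeping in the middle of receivePacket is the hardest part to follow. The sketch below replays just that arithmetic (partialChunkSizeOnDisk, lastChunkBoundary, overwriteLastCrc, doCrcRecalc, skip) with hypothetical numbers and no Hadoop dependencies; it is an illustration of the logic above, not datanode code.

public class ChecksumSkipSketch {
    public static void main(String[] args) {
        final long bytesPerChecksum = 512;   // hypothetical chunk size
        final int checksumSize = 4;          // e.g. a 4-byte CRC

        long onDiskLen = 1000;               // replica bytes already on disk
        long firstByteInBlock = 512;         // packet resends from the last chunk boundary
        long offsetInBlock = firstByteInBlock + 1024;

        long partialChunkSizeOnDisk = onDiskLen % bytesPerChecksum;   // 488
        long lastChunkBoundary = onDiskLen - partialChunkSizeOnDisk;  // 512
        boolean alignedOnDisk = partialChunkSizeOnDisk == 0;          // false

        // The last on-disk CRC covers a partial chunk, so it must be overwritten.
        boolean overwriteLastCrc = !alignedOnDisk;
        // Recalculation is only needed if the packet does not start exactly at
        // the last chunk boundary; here it does, so the client's CRC is reused.
        boolean doCrcRecalc = overwriteLastCrc && lastChunkBoundary != firstByteInBlock;

        int skip = doCrcRecalc ? 1 : 0;      // one slot for the recalculated CRC
        long skippedDataBytes = lastChunkBoundary - firstByteInBlock;  // 0
        if (skippedDataBytes > 0) {
            skip += (int) (skippedDataBytes / bytesPerChecksum)
                + ((skippedDataBytes % bytesPerChecksum == 0) ? 0 : 1);
        }
        skip *= checksumSize;                // checksum bytes to drop from the packet

        System.out.println("doCrcRecalc=" + doCrcRecalc
            + ", checksum bytes skipped=" + skip
            + ", bytes written to disk=" + (offsetInBlock - onDiskLen));
    }
}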

Example 3 with PacketHeader

use of org.apache.hadoop.hdfs.protocol.datatransfer.PacketHeader in project hadoop by apache.

In class TestDataTransferProtocol, the testPacketHeader method:

@Test
public void testPacketHeader() throws IOException {
    PacketHeader hdr = new PacketHeader(
        4,      // size of packet
        1024,   // OffsetInBlock
        100,    // sequencenumber
        false,  // lastPacketInBlock
        4096,   // chunk length
        false);
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    hdr.write(new DataOutputStream(baos));
    // Read back using DataInput
    PacketHeader readBack = new PacketHeader();
    ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray());
    readBack.readFields(new DataInputStream(bais));
    assertEquals(hdr, readBack);
    // Read back using ByteBuffer
    readBack = new PacketHeader();
    readBack.readFields(ByteBuffer.wrap(baos.toByteArray()));
    assertEquals(hdr, readBack);
    assertTrue(hdr.sanityCheck(99));
    assertFalse(hdr.sanityCheck(100));
}
Also used : ByteArrayInputStream(java.io.ByteArrayInputStream) DataOutputStream(java.io.DataOutputStream) PacketHeader(org.apache.hadoop.hdfs.protocol.datatransfer.PacketHeader) ByteArrayOutputStream(java.io.ByteArrayOutputStream) DataInputStream(java.io.DataInputStream) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) Test(org.junit.Test)
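
Based only on the calls exercised by this test (write, readFields, equals, sanityCheck), a tiny round-trip helper might look like the sketch below; the class name is made up.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

import org.apache.hadoop.hdfs.protocol.datatransfer.PacketHeader;

public class PacketHeaderRoundTrip {
    static PacketHeader roundTrip(PacketHeader in) throws IOException {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        in.write(new DataOutputStream(baos));      // encode to the stream form
        PacketHeader out = new PacketHeader();     // empty header to fill
        out.readFields(new DataInputStream(
            new ByteArrayInputStream(baos.toByteArray())));
        return out;
    }

    public static void main(String[] args) throws IOException {
        PacketHeader hdr = new PacketHeader(4, 1024, 100, false, 4096, false);
        System.out.println("round trip equal: " + hdr.equals(roundTrip(hdr)));
        // Per the asserts in the test, sanityCheck(lastSeqNo) accepts the
        // header only while its sequence number is still ahead of lastSeqNo.
        System.out.println("sanityCheck(99)=" + hdr.sanityCheck(99)
            + ", sanityCheck(100)=" + hdr.sanityCheck(100));
    }
}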

Example 4 with PacketHeader

use of org.apache.hadoop.hdfs.protocol.datatransfer.PacketHeader in project hadoop by apache.

In class TestDataTransferProtocol, the testDataTransferProtocol method:

@Test
public void testDataTransferProtocol() throws IOException {
    Random random = new Random();
    int oneMil = 1024 * 1024;
    Path file = new Path("dataprotocol.dat");
    int numDataNodes = 1;
    Configuration conf = new HdfsConfiguration();
    conf.setInt(DFSConfigKeys.DFS_REPLICATION_KEY, numDataNodes);
    MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).numDataNodes(numDataNodes).build();
    try {
        cluster.waitActive();
        datanode = cluster.getFileSystem().getDataNodeStats(DatanodeReportType.LIVE)[0];
        dnAddr = NetUtils.createSocketAddr(datanode.getXferAddr());
        FileSystem fileSys = cluster.getFileSystem();
        int fileLen = Math.min(conf.getInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, 4096), 4096);
        DFSTestUtil.createFile(fileSys, file, fileLen, fileLen, fileSys.getDefaultBlockSize(file), fileSys.getDefaultReplication(file), 0L);
        // get the first blockid for the file
        final ExtendedBlock firstBlock = DFSTestUtil.getFirstBlock(fileSys, file);
        final String poolId = firstBlock.getBlockPoolId();
        long newBlockId = firstBlock.getBlockId() + 1;
        recvBuf.reset();
        sendBuf.reset();
        // bad version
        recvOut.writeShort((short) (DataTransferProtocol.DATA_TRANSFER_VERSION - 1));
        sendOut.writeShort((short) (DataTransferProtocol.DATA_TRANSFER_VERSION - 1));
        sendRecvData("Wrong Version", true);
        // bad ops
        sendBuf.reset();
        sendOut.writeShort((short) DataTransferProtocol.DATA_TRANSFER_VERSION);
        sendOut.writeByte(Op.WRITE_BLOCK.code - 1);
        sendRecvData("Wrong Op Code", true);
        /* Test OP_WRITE_BLOCK */
        sendBuf.reset();
        DataChecksum badChecksum = Mockito.spy(DEFAULT_CHECKSUM);
        Mockito.doReturn(-1).when(badChecksum).getBytesPerChecksum();
        writeBlock(poolId, newBlockId, badChecksum);
        recvBuf.reset();
        sendResponse(Status.ERROR, null, null, recvOut);
        sendRecvData("wrong bytesPerChecksum while writing", true);
        sendBuf.reset();
        recvBuf.reset();
        writeBlock(poolId, ++newBlockId, DEFAULT_CHECKSUM);
        PacketHeader hdr = new PacketHeader(
            4,                            // size of packet
            0,                            // offset in block
            100,                          // seqno
            false,                        // last packet
            -1 - random.nextInt(oneMil),  // bad datalen
            false);
        hdr.write(sendOut);
        sendResponse(Status.SUCCESS, "", null, recvOut);
        new PipelineAck(100, new int[] { PipelineAck.combineHeader(PipelineAck.ECN.DISABLED, Status.ERROR) }).write(recvOut);
        sendRecvData("negative DATA_CHUNK len while writing block " + newBlockId, true);
        // test for writing a valid zero size block
        sendBuf.reset();
        recvBuf.reset();
        writeBlock(poolId, ++newBlockId, DEFAULT_CHECKSUM);
        hdr = new PacketHeader(
            8,     // size of packet
            0,     // OffsetInBlock
            100,   // sequencenumber
            true,  // lastPacketInBlock
            0,     // chunk length
            false);
        hdr.write(sendOut);
        // zero checksum
        sendOut.writeInt(0);
        sendOut.flush();
        //ok finally write a block with 0 len
        sendResponse(Status.SUCCESS, "", null, recvOut);
        new PipelineAck(100, new int[] { PipelineAck.combineHeader(PipelineAck.ECN.DISABLED, Status.SUCCESS) }).write(recvOut);
        sendRecvData("Writing a zero len block blockid " + newBlockId, false);
        /* Test OP_READ_BLOCK */
        String bpid = cluster.getNamesystem().getBlockPoolId();
        ExtendedBlock blk = new ExtendedBlock(bpid, firstBlock.getLocalBlock());
        long blkid = blk.getBlockId();
        // bad block id
        sendBuf.reset();
        recvBuf.reset();
        blk.setBlockId(blkid - 1);
        sender.readBlock(blk, BlockTokenSecretManager.DUMMY_TOKEN, "cl", 0L, fileLen, true, CachingStrategy.newDefaultStrategy());
        sendRecvData("Wrong block ID " + newBlockId + " for read", false);
        // negative block start offset -1L
        sendBuf.reset();
        blk.setBlockId(blkid);
        sender.readBlock(blk, BlockTokenSecretManager.DUMMY_TOKEN, "cl", -1L, fileLen, true, CachingStrategy.newDefaultStrategy());
        sendRecvData("Negative start-offset for read for block " + firstBlock.getBlockId(), false);
        // bad block start offset
        sendBuf.reset();
        sender.readBlock(blk, BlockTokenSecretManager.DUMMY_TOKEN, "cl", fileLen, fileLen, true, CachingStrategy.newDefaultStrategy());
        sendRecvData("Wrong start-offset for reading block " + firstBlock.getBlockId(), false);
        // negative length is ok. Datanode assumes we want to read the whole block.
        recvBuf.reset();
        BlockOpResponseProto.newBuilder().setStatus(Status.SUCCESS).setReadOpChecksumInfo(ReadOpChecksumInfoProto.newBuilder().setChecksum(DataTransferProtoUtil.toProto(DEFAULT_CHECKSUM)).setChunkOffset(0L)).build().writeDelimitedTo(recvOut);
        sendBuf.reset();
        sender.readBlock(blk, BlockTokenSecretManager.DUMMY_TOKEN, "cl", 0L, -1L - random.nextInt(oneMil), true, CachingStrategy.newDefaultStrategy());
        sendRecvData("Negative length for reading block " + firstBlock.getBlockId(), false);
        // length is more than size of block.
        recvBuf.reset();
        sendResponse(Status.ERROR, null, "opReadBlock " + firstBlock + " received exception java.io.IOException:  " + "Offset 0 and length 4097 don't match block " + firstBlock + " ( blockLen 4096 )", recvOut);
        sendBuf.reset();
        sender.readBlock(blk, BlockTokenSecretManager.DUMMY_TOKEN, "cl", 0L, fileLen + 1, true, CachingStrategy.newDefaultStrategy());
        sendRecvData("Wrong length for reading block " + firstBlock.getBlockId(), false);
        //At the end of all this, read the file to make sure that succeeds finally.
        sendBuf.reset();
        sender.readBlock(blk, BlockTokenSecretManager.DUMMY_TOKEN, "cl", 0L, fileLen, true, CachingStrategy.newDefaultStrategy());
        readFile(fileSys, file, fileLen);
    } finally {
        cluster.shutdown();
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) Builder(org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.BlockOpResponseProto.Builder) ExtendedBlock(org.apache.hadoop.hdfs.protocol.ExtendedBlock) PipelineAck(org.apache.hadoop.hdfs.protocol.datatransfer.PipelineAck) DataChecksum(org.apache.hadoop.util.DataChecksum) Random(java.util.Random) FileSystem(org.apache.hadoop.fs.FileSystem) PacketHeader(org.apache.hadoop.hdfs.protocol.datatransfer.PacketHeader) Test(org.junit.Test)
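
The "Wrong Version" case at the start of the test amounts to writing an unsupported version number to the datanode's data transfer port and expecting the stream to be rejected. A stripped-down probe is sketched below; the hard-coded address and the bare handshake are assumptions (the test instead resolves the address from the MiniDFSCluster and drives everything through its sendRecvData helper).

import java.io.DataOutputStream;
import java.net.InetSocketAddress;
import java.net.Socket;

import org.apache.hadoop.hdfs.protocol.datatransfer.DataTransferProtocol;

public class WrongVersionProbe {
    public static void main(String[] args) throws Exception {
        // Assumed datanode data transfer address; the test obtains it from
        // datanode.getXferAddr() instead.
        InetSocketAddress dnAddr = new InetSocketAddress("127.0.0.1", 9866);
        try (Socket s = new Socket()) {
            s.connect(dnAddr, 5000);
            DataOutputStream out = new DataOutputStream(s.getOutputStream());
            // One less than the supported version, as in the test; the
            // datanode is expected to reject the stream (the test treats
            // this as an expected error/EOF).
            out.writeShort((short) (DataTransferProtocol.DATA_TRANSFER_VERSION - 1));
            out.flush();
            System.out.println("probe sent; expecting the datanode to close the connection");
        }
    }
}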

Example 5 with PacketHeader

use of org.apache.hadoop.hdfs.protocol.datatransfer.PacketHeader in project hadoop by apache.

In class TestDataTransferProtocol, the writeZeroLengthPacket method:

private void writeZeroLengthPacket(ExtendedBlock block, String description) throws IOException {
    PacketHeader hdr = new PacketHeader(
        8,                    // size of packet
        block.getNumBytes(),  // OffsetInBlock
        100,                  // sequencenumber
        true,                 // lastPacketInBlock
        0,                    // chunk length
        false);               // sync block
    hdr.write(sendOut);
    // zero checksum
    sendOut.writeInt(0);
    //ok finally write a block with 0 len
    sendResponse(Status.SUCCESS, "", null, recvOut);
    new PipelineAck(100, new int[] { PipelineAck.combineHeader(PipelineAck.ECN.DISABLED, Status.SUCCESS) }).write(recvOut);
    sendRecvData(description, false);
}
Also used : PacketHeader(org.apache.hadoop.hdfs.protocol.datatransfer.PacketHeader) PipelineAck(org.apache.hadoop.hdfs.protocol.datatransfer.PipelineAck)
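
A self-contained variant of this helper, encoding the same header-plus-zero-checksum trailer into a byte array instead of the test's sendOut stream, could look like the following sketch (the class name and the output target are illustrative):

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;

import org.apache.hadoop.hdfs.protocol.datatransfer.PacketHeader;

public class ZeroLengthPacketSketch {
    static byte[] encodeTrailer(long offsetInBlock) throws IOException {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        DataOutputStream out = new DataOutputStream(baos);
        PacketHeader hdr = new PacketHeader(
            8,               // size of packet
            offsetInBlock,   // OffsetInBlock
            100,             // sequencenumber
            true,            // lastPacketInBlock
            0,               // chunk length
            false);          // sync block
        hdr.write(out);      // header bytes
        out.writeInt(0);     // zero checksum
        out.flush();
        return baos.toByteArray();
    }

    public static void main(String[] args) throws IOException {
        System.out.println("trailer length: " + encodeTrailer(0L).length + " bytes");
    }
}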

Aggregations

PacketHeader (org.apache.hadoop.hdfs.protocol.datatransfer.PacketHeader): 10
IOException (java.io.IOException): 4
ByteBuf (io.netty.buffer.ByteBuf): 2
PipelineAck (org.apache.hadoop.hdfs.protocol.datatransfer.PipelineAck): 2
DataChecksum (org.apache.hadoop.util.DataChecksum): 2
Test (org.junit.Test): 2
Channel (io.netty.channel.Channel): 1
ByteArrayInputStream (java.io.ByteArrayInputStream): 1
ByteArrayOutputStream (java.io.ByteArrayOutputStream): 1
DataInputStream (java.io.DataInputStream): 1
DataOutputStream (java.io.DataOutputStream): 1
ByteBuffer (java.nio.ByteBuffer): 1
Random (java.util.Random): 1
Checksum (java.util.zip.Checksum): 1
Configuration (org.apache.hadoop.conf.Configuration): 1
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 1
FileSystem (org.apache.hadoop.fs.FileSystem): 1
Path (org.apache.hadoop.fs.Path): 1
ExtendedBlock (org.apache.hadoop.hdfs.protocol.ExtendedBlock): 1
BlockOpResponseProto (org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.BlockOpResponseProto): 1