Example 86 with CompressionCodec

Use of org.apache.hadoop.io.compress.CompressionCodec in project nifi by apache.

From class PutHDFS, method onTrigger:

@Override
public void onTrigger(ProcessContext context, ProcessSession session) throws ProcessException {
    final FlowFile flowFile = session.get();
    if (flowFile == null) {
        return;
    }
    final FileSystem hdfs = getFileSystem();
    final Configuration configuration = getConfiguration();
    final UserGroupInformation ugi = getUserGroupInformation();
    if (configuration == null || hdfs == null || ugi == null) {
        getLogger().error("HDFS not configured properly");
        session.transfer(flowFile, REL_FAILURE);
        context.yield();
        return;
    }
    ugi.doAs(new PrivilegedAction<Object>() {

        @Override
        public Object run() {
            Path tempDotCopyFile = null;
            FlowFile putFlowFile = flowFile;
            try {
                final String dirValue = context.getProperty(DIRECTORY).evaluateAttributeExpressions(putFlowFile).getValue();
                final Path configuredRootDirPath = new Path(dirValue);
                final String conflictResponse = context.getProperty(CONFLICT_RESOLUTION).getValue();
                final Double blockSizeProp = context.getProperty(BLOCK_SIZE).asDataSize(DataUnit.B);
                final long blockSize = blockSizeProp != null ? blockSizeProp.longValue() : hdfs.getDefaultBlockSize(configuredRootDirPath);
                final Double bufferSizeProp = context.getProperty(BUFFER_SIZE).asDataSize(DataUnit.B);
                final int bufferSize = bufferSizeProp != null ? bufferSizeProp.intValue() : configuration.getInt(BUFFER_SIZE_KEY, BUFFER_SIZE_DEFAULT);
                final Integer replicationProp = context.getProperty(REPLICATION_FACTOR).asInteger();
                final short replication = replicationProp != null ? replicationProp.shortValue() : hdfs.getDefaultReplication(configuredRootDirPath);
                final CompressionCodec codec = getCompressionCodec(context, configuration);
                final String filename = codec != null ? putFlowFile.getAttribute(CoreAttributes.FILENAME.key()) + codec.getDefaultExtension() : putFlowFile.getAttribute(CoreAttributes.FILENAME.key());
                final Path tempCopyFile = new Path(configuredRootDirPath, "." + filename);
                final Path copyFile = new Path(configuredRootDirPath, filename);
                // Create destination directory if it does not exist
                try {
                    if (!hdfs.getFileStatus(configuredRootDirPath).isDirectory()) {
                        throw new IOException(configuredRootDirPath.toString() + " already exists and is not a directory");
                    }
                } catch (FileNotFoundException fe) {
                    if (!hdfs.mkdirs(configuredRootDirPath)) {
                        throw new IOException(configuredRootDirPath.toString() + " could not be created");
                    }
                    changeOwner(context, hdfs, configuredRootDirPath, flowFile);
                }
                final boolean destinationExists = hdfs.exists(copyFile);
                // If destination file already exists, resolve that based on processor configuration
                if (destinationExists) {
                    switch(conflictResponse) {
                        case REPLACE_RESOLUTION:
                            if (hdfs.delete(copyFile, false)) {
                                getLogger().info("deleted {} in order to replace with the contents of {}", new Object[] { copyFile, putFlowFile });
                            }
                            break;
                        case IGNORE_RESOLUTION:
                            session.transfer(putFlowFile, REL_SUCCESS);
                            getLogger().info("transferring {} to success because file with same name already exists", new Object[] { putFlowFile });
                            return null;
                        case FAIL_RESOLUTION:
                            session.transfer(session.penalize(putFlowFile), REL_FAILURE);
                            getLogger().warn("penalizing {} and routing to failure because file with same name already exists", new Object[] { putFlowFile });
                            return null;
                        default:
                            break;
                    }
                }
                // Write FlowFile to temp file on HDFS
                final StopWatch stopWatch = new StopWatch(true);
                session.read(putFlowFile, new InputStreamCallback() {

                    @Override
                    public void process(InputStream in) throws IOException {
                        OutputStream fos = null;
                        Path createdFile = null;
                        try {
                            if (conflictResponse.equals(APPEND_RESOLUTION_AV.getValue()) && destinationExists) {
                                fos = hdfs.append(copyFile, bufferSize);
                            } else {
                                fos = hdfs.create(tempCopyFile, true, bufferSize, replication, blockSize);
                            }
                            if (codec != null) {
                                fos = codec.createOutputStream(fos);
                            }
                            createdFile = tempCopyFile;
                            BufferedInputStream bis = new BufferedInputStream(in);
                            StreamUtils.copy(bis, fos);
                            bis = null;
                            fos.flush();
                        } finally {
                            try {
                                if (fos != null) {
                                    fos.close();
                                }
                            } catch (RemoteException re) {
                                // when talking to remote HDFS clusters, we don't notice problems until fos.close()
                                if (createdFile != null) {
                                    try {
                                        hdfs.delete(createdFile, false);
                                    } catch (Throwable ignore) {
                                    }
                                }
                                throw re;
                            } catch (Throwable ignore) {
                            }
                            fos = null;
                        }
                    }
                });
                stopWatch.stop();
                final String dataRate = stopWatch.calculateDataRate(putFlowFile.getSize());
                final long millis = stopWatch.getDuration(TimeUnit.MILLISECONDS);
                tempDotCopyFile = tempCopyFile;
                if (!conflictResponse.equals(APPEND_RESOLUTION_AV.getValue()) || (conflictResponse.equals(APPEND_RESOLUTION_AV.getValue()) && !destinationExists)) {
                    boolean renamed = false;
                    for (int i = 0; i < 10; i++) {
                        // try to rename multiple times.
                        if (hdfs.rename(tempCopyFile, copyFile)) {
                            renamed = true;
                            // rename was successful
                            break;
                        }
                        // try waiting to let whatever might cause rename failure to resolve
                        Thread.sleep(200L);
                    }
                    if (!renamed) {
                        hdfs.delete(tempCopyFile, false);
                        throw new ProcessException("Copied file to HDFS but could not rename dot file " + tempCopyFile + " to its final filename");
                    }
                    changeOwner(context, hdfs, copyFile, flowFile);
                }
                getLogger().info("copied {} to HDFS at {} in {} milliseconds at a rate of {}", new Object[] { putFlowFile, copyFile, millis, dataRate });
                final String newFilename = copyFile.getName();
                final String hdfsPath = copyFile.getParent().toString();
                putFlowFile = session.putAttribute(putFlowFile, CoreAttributes.FILENAME.key(), newFilename);
                putFlowFile = session.putAttribute(putFlowFile, ABSOLUTE_HDFS_PATH_ATTRIBUTE, hdfsPath);
                final Path qualifiedPath = copyFile.makeQualified(hdfs.getUri(), hdfs.getWorkingDirectory());
                session.getProvenanceReporter().send(putFlowFile, qualifiedPath.toString());
                session.transfer(putFlowFile, REL_SUCCESS);
            } catch (final Throwable t) {
                if (tempDotCopyFile != null) {
                    try {
                        hdfs.delete(tempDotCopyFile, false);
                    } catch (Exception e) {
                        getLogger().error("Unable to remove temporary file {} due to {}", new Object[] { tempDotCopyFile, e });
                    }
                }
                getLogger().error("Failed to write to HDFS due to {}", new Object[] { t });
                session.transfer(session.penalize(putFlowFile), REL_FAILURE);
                context.yield();
            }
            return null;
        }
    });
}
Also used: Path (org.apache.hadoop.fs.Path), FlowFile (org.apache.nifi.flowfile.FlowFile), Configuration (org.apache.hadoop.conf.Configuration), BufferedInputStream (java.io.BufferedInputStream), InputStream (java.io.InputStream), OutputStream (java.io.OutputStream), FileNotFoundException (java.io.FileNotFoundException), IOException (java.io.IOException), ProcessException (org.apache.nifi.processor.exception.ProcessException), RemoteException (org.apache.hadoop.ipc.RemoteException), StopWatch (org.apache.nifi.util.StopWatch), FileSystem (org.apache.hadoop.fs.FileSystem), InputStreamCallback (org.apache.nifi.processor.io.InputStreamCallback), CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec), UserGroupInformation (org.apache.hadoop.security.UserGroupInformation)
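
A minimal standalone sketch of the same pattern, compressing a stream as it is written to HDFS; the target path and the choice of GzipCodec are assumptions for illustration, not part of PutHDFS:

import java.io.InputStream;
import java.io.OutputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.util.ReflectionUtils;

public class CompressedHdfsWrite {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // Instantiate the codec through ReflectionUtils so it picks up the Configuration.
        CompressionCodec codec = ReflectionUtils.newInstance(GzipCodec.class, conf);
        // Append the codec's default extension (".gz") to the target name, as PutHDFS does.
        Path target = new Path("/tmp/example" + codec.getDefaultExtension()); // hypothetical path
        try (InputStream in = System.in;
             OutputStream out = codec.createOutputStream(fs.create(target, true))) {
            // Stream the payload through the compressor into HDFS.
            IOUtils.copyBytes(in, out, 4096);
        }
    }
}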

Example 87 with CompressionCodec

Use of org.apache.hadoop.io.compress.CompressionCodec in project parquet-mr by apache.

From class CodecFactory, method getCodec:

/**
 * @param codecName
 *          the requested codec
 * @return the corresponding hadoop codec. null if UNCOMPRESSED
 */
protected CompressionCodec getCodec(CompressionCodecName codecName) {
    String codecClassName = codecName.getHadoopCompressionCodecClassName();
    if (codecClassName == null) {
        return null;
    }
    CompressionCodec codec = CODEC_BY_NAME.get(codecClassName);
    if (codec != null) {
        return codec;
    }
    try {
        Class<?> codecClass = Class.forName(codecClassName);
        codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, configuration);
        CODEC_BY_NAME.put(codecClassName, codec);
        return codec;
    } catch (ClassNotFoundException e) {
        throw new BadConfigurationException("Class " + codecClassName + " was not found", e);
    }
}
Also used: CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec)
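
A minimal sketch of the same lookup-and-cache pattern, assuming the codec class name arrives as a plain string (the class and field names here are hypothetical):

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.util.ReflectionUtils;

public class CodecCache {
    private final Map<String, CompressionCodec> codecByName = new ConcurrentHashMap<>();
    private final Configuration conf = new Configuration();

    public CompressionCodec getCodec(String codecClassName) {
        // computeIfAbsent gives the same "look up, else instantiate and cache" behavior.
        return codecByName.computeIfAbsent(codecClassName, name -> {
            try {
                Class<?> codecClass = Class.forName(name);
                // ReflectionUtils passes the Configuration to Configurable codecs.
                return (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
            } catch (ClassNotFoundException e) {
                throw new IllegalArgumentException("Codec class " + name + " was not found", e);
            }
        });
    }
}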

Example 88 with CompressionCodec

Use of org.apache.hadoop.io.compress.CompressionCodec in project tez by apache.

From class TestShuffleUtils, method testInternalErrorTranslation:

@Test
public void testInternalErrorTranslation() throws Exception {
    String codecErrorMsg = "codec failure";
    CompressionInputStream mockCodecStream = mock(CompressionInputStream.class);
    when(mockCodecStream.read(any(byte[].class), anyInt(), anyInt())).thenThrow(new InternalError(codecErrorMsg));
    Decompressor mockDecoder = mock(Decompressor.class);
    CompressionCodec mockCodec = mock(CompressionCodec.class);
    when(mockCodec.createDecompressor()).thenReturn(mockDecoder);
    when(mockCodec.createInputStream(any(InputStream.class), any(Decompressor.class))).thenReturn(mockCodecStream);
    byte[] header = new byte[] { (byte) 'T', (byte) 'I', (byte) 'F', (byte) 1 };
    try {
        ShuffleUtils.shuffleToMemory(new byte[1024], new ByteArrayInputStream(header), 1024, 128, mockCodec, false, 0, mock(Logger.class), null);
        Assert.fail("shuffle was supposed to throw!");
    } catch (IOException e) {
        Assert.assertTrue(e.getCause() instanceof InternalError);
        Assert.assertTrue(e.getMessage().contains(codecErrorMsg));
    }
}
Also used: Decompressor (org.apache.hadoop.io.compress.Decompressor), CompressionInputStream (org.apache.hadoop.io.compress.CompressionInputStream), ByteArrayInputStream (java.io.ByteArrayInputStream), InputStream (java.io.InputStream), ByteString (com.google.protobuf.ByteString), Mockito.anyString (org.mockito.Mockito.anyString), CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec), IOException (java.io.IOException), FetchStatsLogger (org.apache.tez.runtime.library.common.shuffle.ShuffleUtils.FetchStatsLogger), Logger (org.slf4j.Logger), Test (org.junit.Test)
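
A minimal sketch of stubbing a CompressionCodec with Mockito, assuming mockito-core and JUnit 4 on the test classpath; it only demonstrates the mock wiring, not Tez's shuffleToMemory error translation:

import static org.junit.Assert.assertSame;
import static org.mockito.Mockito.any;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionInputStream;
import org.apache.hadoop.io.compress.Decompressor;
import org.junit.Test;

public class MockCodecSketchTest {
    @Test
    public void stubbedCodecReturnsMockStream() throws Exception {
        CompressionInputStream codecStream = mock(CompressionInputStream.class);
        Decompressor decompressor = mock(Decompressor.class);
        CompressionCodec codec = mock(CompressionCodec.class);
        when(codec.createDecompressor()).thenReturn(decompressor);
        when(codec.createInputStream(any(InputStream.class), any(Decompressor.class)))
                .thenReturn(codecStream);

        // The stubbed codec hands back our mock stream rather than decompressing anything.
        InputStream wrapped = codec.createInputStream(
                new ByteArrayInputStream(new byte[0]), codec.createDecompressor());
        assertSame(codecStream, wrapped);
        verify(codec).createDecompressor();
    }
}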

Example 89 with CompressionCodec

Use of org.apache.hadoop.io.compress.CompressionCodec in project tez by apache.

From class TestUnorderedPartitionedKVWriter, method baseTestWithPipelinedTransfer:

@SuppressWarnings("unchecked")
private void baseTestWithPipelinedTransfer(int numRecords, int numPartitions, Set<Integer> skippedPartitions, boolean shouldCompress) throws IOException, InterruptedException {
    PartitionerForTest partitioner = new PartitionerForTest();
    ApplicationId appId = ApplicationId.newInstance(10000000, 1);
    TezCounters counters = new TezCounters();
    String uniqueId = UUID.randomUUID().toString();
    int dagId = 1;
    String auxiliaryService = defaultConf.get(TezConfiguration.TEZ_AM_SHUFFLE_AUXILIARY_SERVICE_ID, TezConfiguration.TEZ_AM_SHUFFLE_AUXILIARY_SERVICE_ID_DEFAULT);
    OutputContext outputContext = createMockOutputContext(counters, appId, uniqueId, auxiliaryService);
    Configuration conf = createConfiguration(outputContext, IntWritable.class, LongWritable.class, shouldCompress, -1);
    conf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_ENABLE_FINAL_MERGE_IN_OUTPUT, false);
    conf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_PIPELINED_SHUFFLE_ENABLED, true);
    CompressionCodec codec = null;
    if (shouldCompress) {
        codec = new DefaultCodec();
        ((Configurable) codec).setConf(conf);
    }
    int numOutputs = numPartitions;
    long availableMemory = 2048;
    int numRecordsWritten = 0;
    UnorderedPartitionedKVWriter kvWriter = new UnorderedPartitionedKVWriterForTest(outputContext, conf, numOutputs, availableMemory);
    int sizePerBuffer = kvWriter.sizePerBuffer;
    // IntW + LongW
    int sizePerRecord = 4 + 8;
    // Record + META_OVERHEAD
    int sizePerRecordWithOverhead = sizePerRecord + 12;
    BitSet partitionsWithData = new BitSet(numPartitions);
    IntWritable intWritable = new IntWritable();
    LongWritable longWritable = new LongWritable();
    for (int i = 0; i < numRecords; i++) {
        intWritable.set(i);
        longWritable.set(i);
        int partition = partitioner.getPartition(intWritable, longWritable, numOutputs);
        if (skippedPartitions != null && skippedPartitions.contains(partition)) {
            continue;
        }
        partitionsWithData.set(partition);
        kvWriter.write(intWritable, longWritable);
        numRecordsWritten++;
    }
    int recordsPerBuffer = sizePerBuffer / sizePerRecordWithOverhead;
    int numExpectedSpills = numRecordsWritten / recordsPerBuffer;
    ArgumentCaptor<List> eventCaptor = ArgumentCaptor.forClass(List.class);
    List<Event> lastEvents = kvWriter.close();
    if (numPartitions == 1) {
        assertEquals(false, kvWriter.skipBuffers);
    }
    // no events are sent to kvWriter upon close with pipelining
    assertTrue(lastEvents.size() == 0);
    verify(outputContext, atLeast(numExpectedSpills)).sendEvents(eventCaptor.capture());
    int numOfCapturedEvents = eventCaptor.getAllValues().size();
    lastEvents = eventCaptor.getAllValues().get(numOfCapturedEvents - 1);
    VertexManagerEvent VMEvent = (VertexManagerEvent) lastEvents.get(0);
    for (int i = 0; i < numOfCapturedEvents; i++) {
        List<Event> events = eventCaptor.getAllValues().get(i);
        if (i < numOfCapturedEvents - 1) {
            assertTrue(events.size() == 1);
            assertTrue(events.get(0) instanceof CompositeDataMovementEvent);
        } else {
            assertTrue(events.size() == 2);
            assertTrue(events.get(0) instanceof VertexManagerEvent);
            assertTrue(events.get(1) instanceof CompositeDataMovementEvent);
        }
    }
    verifyPartitionStats(VMEvent, partitionsWithData);
    verify(outputContext, never()).reportFailure(any(TaskFailureType.class), any(Throwable.class), any(String.class));
    assertNull(kvWriter.currentBuffer);
    assertEquals(0, kvWriter.availableBuffers.size());
    // Verify the counters
    TezCounter outputRecordBytesCounter = counters.findCounter(TaskCounter.OUTPUT_BYTES);
    TezCounter outputRecordsCounter = counters.findCounter(TaskCounter.OUTPUT_RECORDS);
    TezCounter outputBytesWithOverheadCounter = counters.findCounter(TaskCounter.OUTPUT_BYTES_WITH_OVERHEAD);
    TezCounter fileOutputBytesCounter = counters.findCounter(TaskCounter.OUTPUT_BYTES_PHYSICAL);
    TezCounter spilledRecordsCounter = counters.findCounter(TaskCounter.SPILLED_RECORDS);
    TezCounter additionalSpillBytesWritternCounter = counters.findCounter(TaskCounter.ADDITIONAL_SPILLS_BYTES_WRITTEN);
    TezCounter additionalSpillBytesReadCounter = counters.findCounter(TaskCounter.ADDITIONAL_SPILLS_BYTES_READ);
    TezCounter numAdditionalSpillsCounter = counters.findCounter(TaskCounter.ADDITIONAL_SPILL_COUNT);
    assertEquals(numRecordsWritten * sizePerRecord, outputRecordBytesCounter.getValue());
    assertEquals(numRecordsWritten, outputRecordsCounter.getValue());
    assertEquals(numRecordsWritten * sizePerRecordWithOverhead, outputBytesWithOverheadCounter.getValue());
    long fileOutputBytes = fileOutputBytesCounter.getValue();
    if (numRecordsWritten > 0) {
        assertTrue(fileOutputBytes > 0);
        if (!shouldCompress) {
            assertTrue(fileOutputBytes > outputRecordBytesCounter.getValue());
        }
    } else {
        assertEquals(0, fileOutputBytes);
    }
    // due to multiple threads, buffers could be merged in chunks in scheduleSpill.
    assertTrue(recordsPerBuffer * numExpectedSpills >= spilledRecordsCounter.getValue());
    long additionalSpillBytesWritten = additionalSpillBytesWritternCounter.getValue();
    long additionalSpillBytesRead = additionalSpillBytesReadCounter.getValue();
    // No additional spill bytes written when final merge is disabled.
    assertEquals(additionalSpillBytesWritten, 0);
    // No additional spills when final merge is disabled.
    assertTrue(additionalSpillBytesWritten == additionalSpillBytesRead);
    // No additional spills when final merge is disabled.
    assertEquals(numAdditionalSpillsCounter.getValue(), 0);
    assertTrue(lastEvents.size() > 0);
    // Get the last event
    int index = lastEvents.size() - 1;
    assertTrue(lastEvents.get(index) instanceof CompositeDataMovementEvent);
    CompositeDataMovementEvent cdme = (CompositeDataMovementEvent) lastEvents.get(index);
    assertEquals(0, cdme.getSourceIndexStart());
    assertEquals(numOutputs, cdme.getCount());
    DataMovementEventPayloadProto eventProto = DataMovementEventPayloadProto.parseFrom(ByteString.copyFrom(cdme.getUserPayload()));
    // Ensure that this is the last event
    assertTrue(eventProto.getLastEvent());
    verifyEmptyPartitions(eventProto, numRecordsWritten, numPartitions, skippedPartitions);
    verify(outputContext, atLeast(1)).notifyProgress();
    // Verify if all spill files are available.
    TezTaskOutput taskOutput = new TezTaskOutputFiles(conf, uniqueId, dagId);
    if (numRecordsWritten > 0) {
        int numSpills = kvWriter.numSpills.get();
        for (int i = 0; i < numSpills; i++) {
            Path outputFile = taskOutput.getSpillFileForWrite(i, 10);
            Path indexFile = taskOutput.getSpillIndexFileForWrite(i, 10);
            assertTrue(localFs.exists(outputFile));
            assertTrue(localFs.exists(indexFile));
            assertEquals("Incorrect output permissions", (short) 0640, localFs.getFileStatus(outputFile).getPermission().toShort());
            assertEquals("Incorrect index permissions", (short) 0640, localFs.getFileStatus(indexFile).getPermission().toShort());
        }
    } else {
        return;
    }
}
Also used: TezTaskOutputFiles (org.apache.tez.runtime.library.common.task.local.output.TezTaskOutputFiles), Configuration (org.apache.hadoop.conf.Configuration), TezConfiguration (org.apache.tez.dag.api.TezConfiguration), TezRuntimeConfiguration (org.apache.tez.runtime.library.api.TezRuntimeConfiguration), DefaultCodec (org.apache.hadoop.io.compress.DefaultCodec), ByteString (com.google.protobuf.ByteString), Configurable (org.apache.hadoop.conf.Configurable), TezCounter (org.apache.tez.common.counters.TezCounter), List (java.util.List), DataMovementEventPayloadProto (org.apache.tez.runtime.library.shuffle.impl.ShuffleUserPayloads.DataMovementEventPayloadProto), CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec), LongWritable (org.apache.hadoop.io.LongWritable), IntWritable (org.apache.hadoop.io.IntWritable), Path (org.apache.hadoop.fs.Path), BitSet (java.util.BitSet), TezCounters (org.apache.tez.common.counters.TezCounters), OutputContext (org.apache.tez.runtime.api.OutputContext), VertexManagerEvent (org.apache.tez.runtime.api.events.VertexManagerEvent), CompositeDataMovementEvent (org.apache.tez.runtime.api.events.CompositeDataMovementEvent), TaskFailureType (org.apache.tez.runtime.api.TaskFailureType), Event (org.apache.tez.runtime.api.Event), ApplicationId (org.apache.hadoop.yarn.api.records.ApplicationId), TezTaskOutput (org.apache.tez.runtime.library.common.task.local.output.TezTaskOutput)
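
A minimal sketch of the codec setup used above when shouldCompress is true; the helper class and method names are hypothetical:

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.DefaultCodec;

public class CodecSetup {
    public static CompressionCodec newConfiguredDefaultCodec(Configuration conf) {
        DefaultCodec codec = new DefaultCodec();
        // DefaultCodec implements Configurable; it needs a Configuration
        // before it can create compressors or compressed streams.
        ((Configurable) codec).setConf(conf);
        return codec;
    }
}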

Example 90 with CompressionCodec

Use of org.apache.hadoop.io.compress.CompressionCodec in project carbondata by apache.

From class AbstractDFSCarbonFile, method getDataInputStream:

@Override
public DataInputStream getDataInputStream(String path, FileFactory.FileType fileType, int bufferSize, String compressor) throws IOException {
    path = path.replace("\\", "/");
    Path pt = new Path(path);
    InputStream inputStream;
    FileSystem fs = pt.getFileSystem(FileFactory.getConfiguration());
    if (bufferSize <= 0) {
        inputStream = fs.open(pt);
    } else {
        inputStream = fs.open(pt, bufferSize);
    }
    String codecName = getCodecNameFromCompressor(compressor);
    if (!codecName.isEmpty()) {
        CompressionCodec codec = new CompressionCodecFactory(hadoopConf).getCodecByName(codecName);
        inputStream = codec.createInputStream(inputStream);
    }
    return new DataInputStream(new BufferedInputStream(inputStream));
}
Also used: Path (org.apache.hadoop.fs.Path), CompressionCodecFactory (org.apache.hadoop.io.compress.CompressionCodecFactory), BufferedInputStream (java.io.BufferedInputStream), DataInputStream (java.io.DataInputStream), FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream), InputStream (java.io.InputStream), FileSystem (org.apache.hadoop.fs.FileSystem), CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec)
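
A minimal sketch of the same idea, assuming the codec can be inferred from the file extension rather than from an explicit compressor name; the class and method names are hypothetical:

import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.IOException;
import java.io.InputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class CompressedHdfsRead {
    public static DataInputStream open(String pathStr, Configuration conf) throws IOException {
        Path path = new Path(pathStr);
        FileSystem fs = path.getFileSystem(conf);
        InputStream in = fs.open(path);
        // getCodec(path) resolves the codec from the extension (.gz, .snappy, ...);
        // null means the file is treated as uncompressed.
        CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);
        if (codec != null) {
            in = codec.createInputStream(in);
        }
        return new DataInputStream(new BufferedInputStream(in));
    }
}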

Aggregations

CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec): 111
Path (org.apache.hadoop.fs.Path): 54
FileSystem (org.apache.hadoop.fs.FileSystem): 41
Configuration (org.apache.hadoop.conf.Configuration): 38
CompressionCodecFactory (org.apache.hadoop.io.compress.CompressionCodecFactory): 37
InputStream (java.io.InputStream): 18
IOException (java.io.IOException): 17
Test (org.junit.Test): 17
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 15
Text (org.apache.hadoop.io.Text): 14
Configurable (org.apache.hadoop.conf.Configurable): 10
GzipCodec (org.apache.hadoop.io.compress.GzipCodec): 10
JobConf (org.apache.hadoop.mapred.JobConf): 10
SequenceFile (org.apache.hadoop.io.SequenceFile): 9
OutputStream (java.io.OutputStream): 8
DefaultCodec (org.apache.hadoop.io.compress.DefaultCodec): 8
FileInputStream (java.io.FileInputStream): 7
FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream): 6
ByteString (com.google.protobuf.ByteString): 5
DataInputStream (java.io.DataInputStream): 5