Use of org.apache.hadoop.io.compress.CompressionCodec in project jena by apache.
The class AbstractNodeOutputFormat, method getRecordWriter:
@Override
public RecordWriter<NodeWritable, TValue> getRecordWriter(TaskAttemptContext context) throws IOException {
    Configuration config = context.getConfiguration();
    boolean isCompressed = getCompressOutput(context);
    CompressionCodec codec = null;
    String extension = this.getFileExtension();
    if (isCompressed) {
        Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(context, GzipCodec.class);
        codec = ReflectionUtils.newInstance(codecClass, config);
        extension += codec.getDefaultExtension();
    }
    Path file = getDefaultWorkFile(context, extension);
    LOG.info("Writing output to file " + file);
    FileSystem fs = file.getFileSystem(config);
    if (!isCompressed) {
        FSDataOutputStream fileOut = fs.create(file, false);
        return this.getRecordWriter(new OutputStreamWriter(fileOut), config);
    } else {
        FSDataOutputStream fileOut = fs.create(file, false);
        return this.getRecordWriter(new OutputStreamWriter(codec.createOutputStream(fileOut)), config);
    }
}
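The isCompressed flag and codec class above are read from the standard Hadoop job configuration. As a hedged driver-side sketch (not Jena code; the concrete output format subclass mentioned in the comment is hypothetical), the settings that getCompressOutput and getOutputCompressorClass pick up would be applied like this:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class CompressedOutputJobSketch {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "compressed-node-output");
        // job.setOutputFormatClass(MyNodeOutputFormat.class); // hypothetical concrete subclass of AbstractNodeOutputFormat
        // These two settings are what getCompressOutput(context) and
        // getOutputCompressorClass(context, GzipCodec.class) read back in the method above.
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
        FileOutputFormat.setOutputPath(job, new Path(args[0]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}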
Use of org.apache.hadoop.io.compress.CompressionCodec in project jena by apache.
The class AbstractNodeTupleOutputFormat, method getRecordWriter:
@Override
public RecordWriter<TKey, T> getRecordWriter(TaskAttemptContext context) throws IOException {
    Configuration config = context.getConfiguration();
    boolean isCompressed = getCompressOutput(context);
    CompressionCodec codec = null;
    // Build the output file path
    String extension = this.getFileExtension();
    if (isCompressed) {
        // Add compression extension if applicable
        Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(context, GzipCodec.class);
        codec = ReflectionUtils.newInstance(codecClass, config);
        extension += codec.getDefaultExtension();
    }
    Path file = getDefaultWorkFile(context, extension);
    LOG.info("Writing output to file " + file);
    // Open the file appropriately and create a record writer for it
    FileSystem fs = file.getFileSystem(config);
    if (!isCompressed) {
        FSDataOutputStream fileOut = fs.create(file, false);
        return this.getRecordWriter(new OutputStreamWriter(fileOut), config, file);
    } else {
        FSDataOutputStream fileOut = fs.create(file, false);
        return this.getRecordWriter(new OutputStreamWriter(codec.createOutputStream(fileOut)), config, file);
    }
}
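Because the codec's default extension is appended to the output file name, the file can later be read back without knowing in advance how it was compressed: CompressionCodecFactory resolves the codec from the extension. A minimal read-back sketch, not taken from Jena (the path argument is a placeholder):

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class ReadCompressedOutputSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path(args[0]); // e.g. a part file ending in .gz
        FileSystem fs = path.getFileSystem(conf);
        // Resolve the codec from the file extension; null means the file is uncompressed.
        CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);
        InputStream in = fs.open(path);
        if (codec != null) {
            in = codec.createInputStream(in);
        }
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(in))) {
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line);
            }
        }
    }
}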
Use of org.apache.hadoop.io.compress.CompressionCodec in project flink by apache.
The class SequenceFileWriter, method open:
@Override
public void open(FileSystem fs, Path path) throws IOException {
    super.open(fs, path);
    if (keyClass == null) {
        throw new IllegalStateException("Key Class has not been initialized.");
    }
    if (valueClass == null) {
        throw new IllegalStateException("Value Class has not been initialized.");
    }
    CompressionCodec codec = null;
    Configuration conf = HadoopFileSystem.getHadoopConfiguration();
    if (!compressionCodecName.equals("None")) {
        CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
        codec = codecFactory.getCodecByName(compressionCodecName);
        if (codec == null) {
            throw new RuntimeException("Codec " + compressionCodecName + " not found.");
        }
    }
    // the non-deprecated constructor syntax is only available in recent hadoop versions...
    writer = SequenceFile.createWriter(conf, getStream(), keyClass, valueClass, compressionType, codec);
}
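Since the codec and compression type are recorded in the SequenceFile header, reading the file back needs no explicit codec handling; SequenceFile.Reader resolves it itself. A small read-back sketch, not Flink code (the path argument is a placeholder):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.ReflectionUtils;

public class ReadSequenceFileSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        try (SequenceFile.Reader reader =
                new SequenceFile.Reader(conf, SequenceFile.Reader.file(new Path(args[0])))) {
            // Key/value classes and the compression codec come from the file header.
            Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
            Writable value = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
            while (reader.next(key, value)) {
                System.out.println(key + "\t" + value);
            }
        }
    }
}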
Use of org.apache.hadoop.io.compress.CompressionCodec in project nifi by apache.
The class GetHDFS, method processBatchOfFiles:
protected void processBatchOfFiles(final List<Path> files, final ProcessContext context, final ProcessSession session) {
    // process the batch of files
    InputStream stream = null;
    CompressionCodec codec = null;
    Configuration conf = getConfiguration();
    FileSystem hdfs = getFileSystem();
    final boolean keepSourceFiles = context.getProperty(KEEP_SOURCE_FILE).asBoolean();
    final Double bufferSizeProp = context.getProperty(BUFFER_SIZE).asDataSize(DataUnit.B);
    int bufferSize = bufferSizeProp != null ? bufferSizeProp.intValue() : conf.getInt(BUFFER_SIZE_KEY, BUFFER_SIZE_DEFAULT);
    final Path rootDir = new Path(context.getProperty(DIRECTORY).evaluateAttributeExpressions().getValue());
    final CompressionType compressionType = CompressionType.valueOf(context.getProperty(COMPRESSION_CODEC).toString());
    final boolean inferCompressionCodec = compressionType == CompressionType.AUTOMATIC;
    if (inferCompressionCodec || compressionType != CompressionType.NONE) {
        codec = getCompressionCodec(context, getConfiguration());
    }
    final CompressionCodecFactory compressionCodecFactory = new CompressionCodecFactory(conf);
    for (final Path file : files) {
        try {
            if (!getUserGroupInformation().doAs((PrivilegedExceptionAction<Boolean>) () -> hdfs.exists(file))) {
                // if file is no longer there then move on
                continue;
            }
            final String originalFilename = file.getName();
            final String relativePath = getPathDifference(rootDir, file);
            stream = getUserGroupInformation().doAs((PrivilegedExceptionAction<FSDataInputStream>) () -> hdfs.open(file, bufferSize));
            final String outputFilename;
            // Check if we should infer compression codec
            if (inferCompressionCodec) {
                codec = compressionCodecFactory.getCodec(file);
            }
            // Check if compression codec is defined (inferred or otherwise)
            if (codec != null) {
                stream = codec.createInputStream(stream);
                outputFilename = StringUtils.removeEnd(originalFilename, codec.getDefaultExtension());
            } else {
                outputFilename = originalFilename;
            }
            FlowFile flowFile = session.create();
            final StopWatch stopWatch = new StopWatch(true);
            flowFile = session.importFrom(stream, flowFile);
            stopWatch.stop();
            final String dataRate = stopWatch.calculateDataRate(flowFile.getSize());
            final long millis = stopWatch.getDuration(TimeUnit.MILLISECONDS);
            flowFile = session.putAttribute(flowFile, CoreAttributes.PATH.key(), relativePath.isEmpty() ? "." : relativePath);
            flowFile = session.putAttribute(flowFile, CoreAttributes.FILENAME.key(), outputFilename);
            if (!keepSourceFiles && !getUserGroupInformation().doAs((PrivilegedExceptionAction<Boolean>) () -> hdfs.delete(file, false))) {
                getLogger().warn("Could not remove {} from HDFS. Not ingesting this file ...", new Object[] { file });
                session.remove(flowFile);
                continue;
            }
            session.getProvenanceReporter().receive(flowFile, file.toString());
            session.transfer(flowFile, REL_SUCCESS);
            getLogger().info("retrieved {} from HDFS {} in {} milliseconds at a rate of {}", new Object[] { flowFile, file, millis, dataRate });
            session.commit();
        } catch (final Throwable t) {
            getLogger().error("Error retrieving file {} from HDFS due to {}", new Object[] { file, t });
            session.rollback();
            context.yield();
        } finally {
            IOUtils.closeQuietly(stream);
            stream = null;
        }
    }
}
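When many files are decompressed in a loop like this, Hadoop's CodecPool can reuse the (possibly native) decompressor objects instead of allocating a new one per file. A hedged sketch of that pattern, not the NiFi implementation (the hard-coded "gzip" codec name and command-line file list are assumptions):

import java.io.FileInputStream;
import java.io.InputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.Decompressor;

public class PooledDecompressSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        CompressionCodec codec = new CompressionCodecFactory(conf).getCodecByName("gzip");
        byte[] buffer = new byte[4096];
        for (String fileName : args) {
            // Borrow a decompressor from the pool instead of creating a new one per file.
            Decompressor decompressor = CodecPool.getDecompressor(codec);
            try (InputStream raw = new FileInputStream(fileName);
                 InputStream in = codec.createInputStream(raw, decompressor)) {
                int n;
                while ((n = in.read(buffer)) != -1) {
                    System.out.write(buffer, 0, n);
                }
            } finally {
                // Hand the decompressor back so later iterations can reuse it.
                CodecPool.returnDecompressor(decompressor);
            }
        }
        System.out.flush();
    }
}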
Use of org.apache.hadoop.io.compress.CompressionCodec in project apex-malhar by apache.
The class AbstractFileOutputOperatorTest, method checkSnappyFile:
private void checkSnappyFile(File file, List<Long> offsets, int startVal, int totalWindows, int totalRecords) throws IOException {
    FileInputStream fis;
    InputStream gss = null;
    Configuration conf = new Configuration();
    CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance(SnappyCodec.class, conf);
    CompressionInputStream snappyIs = null;
    BufferedReader br = null;
    int numWindows = 0;
    try {
        fis = new FileInputStream(file);
        gss = fis;
        long startOffset = 0;
        for (long offset : offsets) {
            // Skip initial case in case file is not yet created
            if (offset == 0) {
                continue;
            }
            long limit = offset - startOffset;
            LimitInputStream lis = new LimitInputStream(gss, limit);
            snappyIs = codec.createInputStream(lis);
            br = new BufferedReader(new InputStreamReader(snappyIs));
            String eline = "" + (startVal + numWindows * 2);
            int count = 0;
            String line;
            while ((line = br.readLine()) != null) {
                Assert.assertEquals("File line", eline, line);
                ++count;
                if ((count % totalRecords) == 0) {
                    ++numWindows;
                    eline = "" + (startVal + numWindows * 2);
                }
            }
            startOffset = offset;
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        if (br != null) {
            br.close();
        } else {
            if (snappyIs != null) {
                snappyIs.close();
            } else if (gss != null) {
                gss.close();
            }
        }
    }
    Assert.assertEquals("Total", totalWindows, numWindows);
}
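The offsets the test iterates over mark the ends of independently compressed segments appended to one physical file, which is why each slice can be decompressed on its own through a LimitInputStream. As an illustrative sketch of how such a file could be produced (this is an assumption about the setup, not the operator's actual rollover code, and it presumes Snappy support is available just as the test does):

import java.io.FileOutputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.apache.hadoop.util.ReflectionUtils;

public class SegmentedSnappyFileSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        CompressionCodec codec = ReflectionUtils.newInstance(SnappyCodec.class, conf);
        List<Long> offsets = new ArrayList<>();
        try (FileOutputStream fos = new FileOutputStream(args[0])) {
            for (int window = 0; window < 3; window++) {
                // Each window becomes an independent compressed segment in the same file.
                CompressionOutputStream cos = codec.createOutputStream(fos);
                cos.write(("record for window " + window + "\n").getBytes(StandardCharsets.UTF_8));
                // finish() flushes the compressed segment without closing the underlying file.
                cos.finish();
                // Record where this segment ends, analogous to the offsets list in the test.
                offsets.add(fos.getChannel().position());
            }
        }
        System.out.println("segment end offsets: " + offsets);
    }
}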