Use of org.apache.parquet.hadoop.metadata.CompressionCodecName in project parquet-mr by Apache: the class ConvertCSVCommand, method run().
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
  Preconditions.checkArgument(targets != null && targets.size() == 1,
      "CSV path is required.");

  if (header != null) {
    // if a header is given on the command line, don't assume one is in the file
    noHeader = true;
  }

  CSVProperties props = new CSVProperties.Builder()
      .delimiter(delimiter)
      .escape(escape)
      .quote(quote)
      .header(header)
      .hasHeader(!noHeader)
      .linesToSkip(linesToSkip)
      .charset(charsetName)
      .build();

  String source = targets.get(0);

  Schema csvSchema;
  if (avroSchemaFile != null) {
    csvSchema = Schemas.fromAvsc(open(avroSchemaFile));
  } else {
    Set<String> required = ImmutableSet.of();
    if (requiredFields != null) {
      required = ImmutableSet.copyOf(requiredFields);
    }

    // derive the record name from the source file name, without its extension
    String filename = new File(source).getName();
    String recordName;
    if (filename.contains(".")) {
      recordName = filename.substring(0, filename.indexOf("."));
    } else {
      recordName = filename;
    }

    csvSchema = AvroCSV.inferNullableSchema(recordName, open(source), props, required);
  }

  long count = 0;
  try (AvroCSVReader<Record> reader = new AvroCSVReader<>(
      open(source), props, csvSchema, Record.class, true)) {
    CompressionCodecName codec = Codecs.parquetCodec(compressionCodecName);
    try (ParquetWriter<Record> writer = AvroParquetWriter.<Record>builder(qualifiedPath(outputPath))
        .withWriterVersion(v2 ? PARQUET_2_0 : PARQUET_1_0)
        .withWriteMode(overwrite ? ParquetFileWriter.Mode.OVERWRITE : ParquetFileWriter.Mode.CREATE)
        .withCompressionCodec(codec)
        .withDictionaryEncoding(true)
        .withDictionaryPageSize(dictionaryPageSize)
        .withPageSize(pageSize)
        .withRowGroupSize(rowGroupSize)
        .withDataModel(GenericData.get())
        .withConf(getConf())
        .withSchema(csvSchema)
        .build()) {
      for (Record record : reader) {
        writer.write(record);
        count += 1; // track progress so the error message below can name the failing record
      }
    } catch (RuntimeException e) {
      throw new RuntimeException("Failed on record " + count, e);
    }
  }
  return 0;
}
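The codec itself is just an enum constant; parquet-cli's Codecs.parquetCodec helper only maps the command-line string onto a CompressionCodecName. A minimal, self-contained sketch of that lookup (the class name and string values here are illustrative, and the exact set of constants depends on the parquet-mr release):

import org.apache.parquet.hadoop.metadata.CompressionCodecName;

public class CodecLookupSketch {
  public static void main(String[] args) {
    // Direct enum lookup; SNAPPY, GZIP, UNCOMPRESSED, and others are valid constants.
    CompressionCodecName byName = CompressionCodecName.valueOf("GZIP");

    // fromConf() is the lenient variant: a null name falls back to UNCOMPRESSED.
    CompressionCodecName fromConf = CompressionCodecName.fromConf("SNAPPY");

    System.out.println(byName + " -> " + byName.getHadoopCompressionCodecClassName());
    System.out.println(fromConf);
  }
}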
Use of org.apache.parquet.hadoop.metadata.CompressionCodecName in project parquet-mr by Apache: the class CodecConfig, method getCodec().
public CompressionCodecName getCodec() {
  CompressionCodecName codec;
  Configuration configuration = getConfiguration();
  if (isParquetCompressionSet(configuration)) {
    // explicit parquet config
    codec = getParquetCompressionCodec(configuration);
  } else if (isHadoopCompressionSet()) {
    // from hadoop config
    codec = getHadoopCompressionCodec();
  } else {
    LOG.info("Compression set to false");
    codec = CompressionCodecName.UNCOMPRESSED;
  }
  LOG.info("Compression: {}", codec.name());
  return codec;
}
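In other words, an explicit Parquet setting wins over the generic Hadoop output-compression settings. A hedged sketch of how a job would normally supply that explicit setting, assuming the standard ParquetOutputFormat property key and helper; the job setup around them is illustrative:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.parquet.hadoop.ParquetOutputFormat;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;

public class CompressionConfigSketch {
  public static void main(String[] args) throws Exception {
    // Option 1: set the Parquet property directly; CodecConfig treats this as the
    // "explicit parquet config" branch above.
    Configuration conf = new Configuration();
    conf.set(ParquetOutputFormat.COMPRESSION, CompressionCodecName.SNAPPY.name());

    // Option 2: the job-level helper, which sets the same Parquet compression property.
    Job job = Job.getInstance(conf);
    ParquetOutputFormat.setCompression(job, CompressionCodecName.GZIP);
  }
}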
Use of org.apache.parquet.hadoop.metadata.CompressionCodecName in project parquet-mr by Apache: the class TestParquetMetadataConverter, method createColumnChunkMetaData().
private ColumnChunkMetaData createColumnChunkMetaData() {
  Set<org.apache.parquet.column.Encoding> e = new HashSet<org.apache.parquet.column.Encoding>();
  PrimitiveTypeName t = PrimitiveTypeName.BINARY;
  ColumnPath p = ColumnPath.get("foo");
  CompressionCodecName c = CompressionCodecName.GZIP;
  BinaryStatistics s = new BinaryStatistics();
  // the numeric arguments (offsets, value count, sizes) are left at zero for this fixture
  ColumnChunkMetaData md = ColumnChunkMetaData.get(p, t, c, e, s, 0, 0, 0, 0, 0);
  return md;
}
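Reading the codec back out of the resulting metadata is a one-liner. A minimal sketch along the lines of the fixture above, assuming the same (deprecated but public) ColumnChunkMetaData.get overload remains available:

import java.util.Collections;

import org.apache.parquet.column.statistics.BinaryStatistics;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ColumnPath;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;

public class CodecFromMetadataSketch {
  public static void main(String[] args) {
    // Same minimal fixture as the test above, then read the codec back.
    ColumnChunkMetaData md = ColumnChunkMetaData.get(
        ColumnPath.get("foo"), PrimitiveTypeName.BINARY, CompressionCodecName.GZIP,
        Collections.emptySet(), new BinaryStatistics(), 0, 0, 0, 0, 0);

    CompressionCodecName codec = md.getCodec();
    System.out.println(codec); // GZIP
  }
}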
Use of org.apache.parquet.hadoop.metadata.CompressionCodecName in project drill by Apache: the class AsyncPageReader, method decompressPageV2().
/**
 * Reads a compressed v2 data page whose repetition and definition level
 * sections were excluded from compression.
 * @return decompressed Parquet page data
 * @throws IOException
 */
protected DrillBuf decompressPageV2(ReadStatus readStatus) throws IOException {
  Stopwatch timer = Stopwatch.createUnstarted();
  PageHeader pageHeader = readStatus.getPageHeader();

  int inputSize = pageHeader.getCompressed_page_size();
  int repLevelSize = pageHeader.data_page_header_v2.getRepetition_levels_byte_length();
  int defLevelSize = pageHeader.data_page_header_v2.getDefinition_levels_byte_length();
  int compDataOffset = repLevelSize + defLevelSize;
  int outputSize = pageHeader.uncompressed_page_size;

  // TODO: does reporting this number have the same meaning in an async context?
  long start = dataReader.getPos();
  long timeToRead;

  DrillBuf inputPageData = readStatus.getPageData();
  DrillBuf outputPageData = this.allocator.buffer(outputSize);
  try {
    timer.start();
    // Write out the uncompressed section.
    // Note that the following setBytes call to read the repetition and definition level sections
    // advances readerIndex in inputPageData but not writerIndex in outputPageData.
    outputPageData.setBytes(0, inputPageData, compDataOffset);

    // decompress from the start of compressed data to the end of the input buffer
    CompressionCodecName codecName = columnChunkMetaData.getCodec();
    CompressionCodecFactory.BytesInputDecompressor decomp = codecFactory.getDecompressor(codecName);
    ByteBuffer input = inputPageData.nioBuffer(compDataOffset, inputSize - compDataOffset);
    ByteBuffer output = outputPageData.nioBuffer(compDataOffset, outputSize - compDataOffset);
    decomp.decompress(input, inputSize - compDataOffset, output, outputSize - compDataOffset);
    outputPageData.writerIndex(outputSize);

    timeToRead = timer.elapsed(TimeUnit.NANOSECONDS);
    if (logger.isTraceEnabled()) {
      logger.trace("Col: {} readPos: {} Uncompressed_size: {} pageData: {}",
          columnChunkMetaData.toString(),
          // TODO: see comment on earlier call to getPos()
          dataReader.getPos(), outputSize, ByteBufUtil.hexDump(outputPageData));
    }
    this.updateStats(pageHeader, "Decompress", start, timeToRead, inputSize, outputSize);
  } finally {
    readStatus.setPageData(null);
    if (inputPageData != null) {
      inputPageData.release();
    }
  }
  return outputPageData;
}
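Outside of Drill, the same getDecompressor/decompress pattern is available through parquet-mr's own CodecFactory, which implements CompressionCodecFactory. A hedged round-trip sketch; the (Configuration, pageSize) constructor is assumed from recent parquet-hadoop releases, and the payload and buffer size are illustrative:

import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.bytes.BytesInput;
import org.apache.parquet.compression.CompressionCodecFactory;
import org.apache.parquet.hadoop.CodecFactory;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;

public class CodecRoundTripSketch {
  public static void main(String[] args) throws Exception {
    // Assumed constructor: (Configuration, pageSize); the 1 MiB page size is arbitrary.
    CompressionCodecFactory codecs = new CodecFactory(new Configuration(), 1024 * 1024);
    try {
      CompressionCodecFactory.BytesInputCompressor compressor =
          codecs.getCompressor(CompressionCodecName.GZIP);
      CompressionCodecFactory.BytesInputDecompressor decompressor =
          codecs.getDecompressor(CompressionCodecName.GZIP);

      byte[] raw = "some page payload".getBytes(StandardCharsets.UTF_8);
      BytesInput compressed = compressor.compress(BytesInput.from(raw));

      // The BytesInput variant of decompress(); the Drill code above uses the
      // ByteBuffer variant because it decompresses into a preallocated DrillBuf.
      BytesInput uncompressed = decompressor.decompress(
          BytesInput.from(compressed.toByteArray()), raw.length);
      System.out.println(new String(uncompressed.toByteArray(), StandardCharsets.UTF_8));
    } finally {
      codecs.release();
    }
  }
}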
Use of org.apache.parquet.hadoop.metadata.CompressionCodecName in project drill by Apache: the class PageReader, method readCompressedPageV2().
/**
 * Reads a compressed v2 data page whose repetition and definition level
 * sections were excluded from compression.
 * @return decompressed Parquet page data
 * @throws IOException
 */
protected DrillBuf readCompressedPageV2() throws IOException {
  Stopwatch timer = Stopwatch.createUnstarted();

  int inputSize = pageHeader.getCompressed_page_size();
  int repLevelSize = pageHeader.data_page_header_v2.getRepetition_levels_byte_length();
  int defLevelSize = pageHeader.data_page_header_v2.getDefinition_levels_byte_length();
  int compDataOffset = repLevelSize + defLevelSize;
  int outputSize = pageHeader.uncompressed_page_size;

  long start = dataReader.getPos();
  long timeToRead;

  DrillBuf inputPageData = null;
  DrillBuf outputPageData = this.allocator.buffer(outputSize);
  try {
    timer.start();
    // Read in both the uncompressed and compressed sections
    inputPageData = dataReader.getNext(inputSize);
    timeToRead = timer.elapsed(TimeUnit.NANOSECONDS);
    this.updateStats(pageHeader, "Page Read", start, timeToRead, inputSize, inputSize);

    timer.reset();
    timer.start();
    start = dataReader.getPos();

    // Write out the uncompressed section.
    // Note that the following setBytes call to read the repetition and definition level sections
    // advances readerIndex in inputPageData but not writerIndex in outputPageData.
    outputPageData.setBytes(0, inputPageData, compDataOffset);

    // decompress from the start of compressed data to the end of the input buffer
    CompressionCodecName codecName = columnChunkMetaData.getCodec();
    BytesInputDecompressor decomp = codecFactory.getDecompressor(codecName);
    ByteBuffer input = inputPageData.nioBuffer(compDataOffset, inputSize - compDataOffset);
    ByteBuffer output = outputPageData.nioBuffer(compDataOffset, outputSize - compDataOffset);
    decomp.decompress(input, inputSize - compDataOffset, output, outputSize - compDataOffset);
    outputPageData.writerIndex(outputSize);

    timeToRead = timer.elapsed(TimeUnit.NANOSECONDS);
    if (logger.isTraceEnabled()) {
      logger.trace("Col: {} readPos: {} Uncompressed_size: {} pageData: {}",
          columnChunkMetaData.toString(), dataReader.getPos(), outputSize,
          ByteBufUtil.hexDump(outputPageData));
    }
    this.updateStats(pageHeader, "Decompress", start, timeToRead, inputSize, outputSize);
  } finally {
    if (inputPageData != null) {
      inputPageData.release();
    }
  }
  return outputPageData;
}
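The offset arithmetic is the heart of both Drill methods: in a v2 data page the repetition and definition levels sit at the front of the page uncompressed, and only the bytes after repLevelSize + defLevelSize are run through the codec. A standalone illustration of that split using plain ByteBuffers; the class and method names here are made up for the sketch:

import java.nio.ByteBuffer;

public class PageV2LayoutSketch {
  /**
   * Splits a v2 data page buffer into its uncompressed level section and its
   * compressed data section, mirroring compDataOffset in the methods above.
   */
  static ByteBuffer[] split(ByteBuffer page, int repLevelSize, int defLevelSize) {
    int compDataOffset = repLevelSize + defLevelSize;

    // Repetition and definition levels: stored uncompressed, copied through as-is.
    ByteBuffer levels = page.duplicate();
    levels.position(0);
    levels.limit(compDataOffset);

    // Everything after the levels is the compressed values section.
    ByteBuffer compressedData = page.duplicate();
    compressedData.position(compDataOffset);

    return new ByteBuffer[] { levels.slice(), compressedData.slice() };
  }
}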