Use of org.apache.parquet.hadoop.metadata.CompressionCodecName in project presto by prestodb.
The class ParquetFileWriterFactory, method createFileWriter.
@Override
public Optional<HiveFileWriter> createFileWriter(
        Path path,
        List<String> inputColumnNames,
        StorageFormat storageFormat,
        Properties schema,
        JobConf conf,
        ConnectorSession session,
        Optional<EncryptionInformation> encryptionInformation)
{
    if (!isParquetOptimizedWriterEnabled(session)) {
        return Optional.empty();
    }
    if (!MapredParquetOutputFormat.class.getName().equals(storageFormat.getOutputFormat())) {
        return Optional.empty();
    }
    ParquetWriterOptions parquetWriterOptions = ParquetWriterOptions.builder()
            .setMaxPageSize(getParquetWriterPageSize(session))
            .setMaxBlockSize(getParquetWriterBlockSize(session))
            .build();
    CompressionCodecName compressionCodecName = getCompression(conf);
    List<String> fileColumnNames = Splitter.on(',')
            .trimResults()
            .omitEmptyStrings()
            .splitToList(schema.getProperty(META_TABLE_COLUMNS, ""));
    List<Type> fileColumnTypes = toHiveTypes(schema.getProperty(META_TABLE_COLUMN_TYPES, "")).stream()
            .map(hiveType -> hiveType.getType(typeManager))
            .collect(toList());
    int[] fileInputColumnIndexes = fileColumnNames.stream()
            .mapToInt(inputColumnNames::indexOf)
            .toArray();
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getUser(), path, conf);
        Callable<Void> rollbackAction = () -> {
            fileSystem.delete(path, false);
            return null;
        };
        return Optional.of(new ParquetFileWriter(
                fileSystem.create(path),
                rollbackAction,
                fileColumnNames,
                fileColumnTypes,
                parquetWriterOptions,
                fileInputColumnIndexes,
                compressionCodecName));
    }
    catch (IOException e) {
        throw new PrestoException(HIVE_WRITER_OPEN_ERROR, "Error creating Parquet file", e);
    }
}
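The codec itself comes from getCompression(conf), which is not part of this snippet. Below is a minimal sketch of what such a helper typically looks like; the method body, the GZIP default, and the Locale handling are assumptions, while ParquetOutputFormat.COMPRESSION and the CompressionCodecName enum are real parquet-mr API:

import java.util.Locale;
import org.apache.hadoop.mapred.JobConf;
import org.apache.parquet.hadoop.ParquetOutputFormat;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;

final class CompressionConfig {
    private CompressionConfig() {
    }

    // Hypothetical stand-in for the getCompression(conf) helper used above.
    static CompressionCodecName getCompression(JobConf conf) {
        // ParquetOutputFormat.COMPRESSION is the "parquet.compression" key;
        // defaulting to GZIP here is purely illustrative.
        String name = conf.get(ParquetOutputFormat.COMPRESSION, CompressionCodecName.GZIP.name());
        return CompressionCodecName.valueOf(name.toUpperCase(Locale.ENGLISH));
    }
}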
Use of org.apache.parquet.hadoop.metadata.CompressionCodecName in project drill by apache.
The class TestFileGenerator, method generateParquetFile.
public static void generateParquetFile(String filename, ParquetTestProperties props) throws Exception {
    int currentBooleanByte = 0;
    WrapAroundCounter booleanBitCounter = new WrapAroundCounter(7);
    Configuration configuration = new Configuration();
    configuration.set(FileSystem.FS_DEFAULT_NAME_KEY, "file:///");
    // "message m { required int32 integer; required int64 integer64; required boolean b; required float f; required double d;}"
    FileSystem fs = FileSystem.get(configuration);
    Path path = new Path(filename);
    if (fs.exists(path)) {
        fs.delete(path, false);
    }
    String messageSchema = "message m {";
    for (FieldInfo fieldInfo : props.fields.values()) {
        messageSchema += " required " + fieldInfo.parquetType + " " + fieldInfo.name + ";";
    }
    // remove the last semicolon, java really needs a join method for strings...
    // TODO - nvm apparently it requires a semicolon after every field decl, might want to file a bug
    // messageSchema = messageSchema.substring(schemaType, messageSchema.length() - 1);
    messageSchema += "}";
    MessageType schema = MessageTypeParser.parseMessageType(messageSchema);
    CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;
    ParquetFileWriter w = new ParquetFileWriter(configuration, schema, path);
    w.start();
    HashMap<String, Integer> columnValuesWritten = new HashMap<>();
    int valsWritten;
    for (int k = 0; k < props.numberRowGroups; k++) {
        w.startBlock(props.recordsPerRowGroup);
        currentBooleanByte = 0;
        booleanBitCounter.reset();
        for (FieldInfo fieldInfo : props.fields.values()) {
            if (!columnValuesWritten.containsKey(fieldInfo.name)) {
                columnValuesWritten.put(fieldInfo.name, 0);
                valsWritten = 0;
            } else {
                valsWritten = columnValuesWritten.get(fieldInfo.name);
            }
            String[] path1 = { fieldInfo.name };
            ColumnDescriptor c1 = schema.getColumnDescription(path1);
            w.startColumn(c1, props.recordsPerRowGroup, codec);
            final int valsPerPage = (int) Math.ceil(props.recordsPerRowGroup / (float) fieldInfo.numberOfPages);
            // 1 MB
            final int PAGE_SIZE = 1024 * 1024;
            byte[] bytes;
            RunLengthBitPackingHybridValuesWriter defLevels = new RunLengthBitPackingHybridValuesWriter(
                MAX_EXPECTED_BIT_WIDTH_FOR_DEFINITION_LEVELS, valsPerPage, PAGE_SIZE, new DirectByteBufferAllocator());
            RunLengthBitPackingHybridValuesWriter repLevels = new RunLengthBitPackingHybridValuesWriter(
                MAX_EXPECTED_BIT_WIDTH_FOR_DEFINITION_LEVELS, valsPerPage, PAGE_SIZE, new DirectByteBufferAllocator());
            // for variable length binary fields
            int bytesNeededToEncodeLength = 4;
            if (fieldInfo.bitLength > 0) {
                bytes = new byte[(int) Math.ceil(valsPerPage * fieldInfo.bitLength / 8.0)];
            } else {
                // the twelve at the end is to account for storing a 4 byte length with each value
                int totalValLength = ((byte[]) fieldInfo.values[0]).length
                    + ((byte[]) fieldInfo.values[1]).length
                    + ((byte[]) fieldInfo.values[2]).length
                    + 3 * bytesNeededToEncodeLength;
                // used for the case where there is a number of values in this row group that is not divisible by 3
                int leftOverBytes = 0;
                if (valsPerPage % 3 > 0) {
                    leftOverBytes += ((byte[]) fieldInfo.values[1]).length + bytesNeededToEncodeLength;
                }
                if (valsPerPage % 3 > 1) {
                    leftOverBytes += ((byte[]) fieldInfo.values[2]).length + bytesNeededToEncodeLength;
                }
                bytes = new byte[valsPerPage / 3 * totalValLength + leftOverBytes];
            }
            int bytesPerPage = (int) (valsPerPage * (fieldInfo.bitLength / 8.0));
            int bytesWritten = 0;
            for (int z = 0; z < fieldInfo.numberOfPages; z++, bytesWritten = 0) {
                for (int i = 0; i < valsPerPage; i++) {
                    repLevels.writeInteger(0);
                    defLevels.writeInteger(1);
                    if (fieldInfo.values[0] instanceof Boolean) {
                        bytes[currentBooleanByte] |= bitFields[booleanBitCounter.val]
                            & ((boolean) fieldInfo.values[valsWritten % 3] ? allBitsTrue : allBitsFalse);
                        booleanBitCounter.increment();
                        if (booleanBitCounter.val == 0) {
                            currentBooleanByte++;
                        }
                        valsWritten++;
                        if (currentBooleanByte > bytesPerPage) {
                            break;
                        }
                    } else {
                        if (fieldInfo.values[valsWritten % 3] instanceof byte[]) {
                            System.arraycopy(ByteArrayUtil.toByta(((byte[]) fieldInfo.values[valsWritten % 3]).length),
                                0, bytes, bytesWritten, bytesNeededToEncodeLength);
                            System.arraycopy(fieldInfo.values[valsWritten % 3], 0, bytes,
                                bytesWritten + bytesNeededToEncodeLength, ((byte[]) fieldInfo.values[valsWritten % 3]).length);
                            bytesWritten += ((byte[]) fieldInfo.values[valsWritten % 3]).length + bytesNeededToEncodeLength;
                        } else {
                            System.arraycopy(ByteArrayUtil.toByta(fieldInfo.values[valsWritten % 3]), 0, bytes,
                                i * (fieldInfo.bitLength / 8), fieldInfo.bitLength / 8);
                        }
                        valsWritten++;
                    }
                }
                byte[] fullPage = new byte[2 * 4 * valsPerPage + bytes.length];
                byte[] repLevelBytes = repLevels.getBytes().toByteArray();
                byte[] defLevelBytes = defLevels.getBytes().toByteArray();
                System.arraycopy(bytes, 0, fullPage, 0, bytes.length);
                System.arraycopy(repLevelBytes, 0, fullPage, bytes.length, repLevelBytes.length);
                System.arraycopy(defLevelBytes, 0, fullPage, bytes.length + repLevelBytes.length, defLevelBytes.length);
                w.writeDataPage((props.recordsPerRowGroup / fieldInfo.numberOfPages), fullPage.length,
                    BytesInput.from(fullPage), RLE, RLE, PLAIN);
                currentBooleanByte = 0;
            }
            w.endColumn();
            columnValuesWritten.remove(fieldInfo.name);
            columnValuesWritten.put(fieldInfo.name, valsWritten);
        }
        w.endBlock();
    }
    w.end(new HashMap<String, String>());
    logger.debug("Finished generating parquet file {}", path.getName());
}
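The schema string above is built by repeated concatenation; the stray comments about joining strings date the code, since Java 8's StringJoiner handles exactly this delimiter bookkeeping. Here is a self-contained sketch of just the schema-construction step, with illustrative field names and types (Map.of requires Java 9+):

import java.util.Map;
import java.util.StringJoiner;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class SchemaStringExample {
    public static void main(String[] args) {
        // Illustrative fields; TestFileGenerator derives these from ParquetTestProperties.
        Map<String, String> fields = Map.of("integer", "int32", "integer64", "int64", "b", "boolean");
        // Every field declaration ends with a semicolon, as the original comment notes.
        StringJoiner schemaText = new StringJoiner(" ", "message m { ", " }");
        fields.forEach((name, type) -> schemaText.add("required " + type + " " + name + ";"));
        MessageType schema = MessageTypeParser.parseMessageType(schemaText.toString());
        System.out.println(schema);
    }
}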
Use of org.apache.parquet.hadoop.metadata.CompressionCodecName in project drill by apache.
The class PageReader, method readCompressedPageV1.
/**
 * Reads a compressed v1 data page or a dictionary page, both of which are
 * compressed in their entirety.
 * @return decompressed Parquet page data
 * @throws IOException if reading or decompressing the page fails
 */
protected DrillBuf readCompressedPageV1() throws IOException {
    Stopwatch timer = Stopwatch.createUnstarted();
    int inputSize = pageHeader.getCompressed_page_size();
    int outputSize = pageHeader.getUncompressed_page_size();
    long start = dataReader.getPos();
    long timeToRead;
    DrillBuf inputPageData = null;
    DrillBuf outputPageData = this.allocator.buffer(outputSize);
    try {
        timer.start();
        inputPageData = dataReader.getNext(inputSize);
        timeToRead = timer.elapsed(TimeUnit.NANOSECONDS);
        this.updateStats(pageHeader, "Page Read", start, timeToRead, inputSize, inputSize);
        timer.reset();
        timer.start();
        start = dataReader.getPos();
        CompressionCodecName codecName = columnChunkMetaData.getCodec();
        BytesInputDecompressor decomp = codecFactory.getDecompressor(codecName);
        ByteBuffer input = inputPageData.nioBuffer(0, inputSize);
        ByteBuffer output = outputPageData.nioBuffer(0, outputSize);
        decomp.decompress(input, inputSize, output, outputSize);
        outputPageData.writerIndex(outputSize);
        timeToRead = timer.elapsed(TimeUnit.NANOSECONDS);
        if (logger.isTraceEnabled()) {
            logger.trace("Col: {} readPos: {} Uncompressed_size: {} pageData: {}",
                columnChunkMetaData.toString(), dataReader.getPos(), outputSize, ByteBufUtil.hexDump(outputPageData));
        }
        this.updateStats(pageHeader, "Decompress", start, timeToRead, inputSize, outputSize);
    } finally {
        if (inputPageData != null) {
            inputPageData.release();
        }
    }
    return outputPageData;
}
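The decompression step is the same pattern in both Drill readers: look the decompressor up by the column chunk's CompressionCodecName, then decompress between the input and output buffers. A sketch of just that pattern against parquet-mr's CompressionCodecFactory interface; how the factory instance is obtained is implementation specific (Drill wires in its own direct-memory factory), so it is taken as a parameter here:

import java.io.IOException;
import java.nio.ByteBuffer;
import org.apache.parquet.compression.CompressionCodecFactory;
import org.apache.parquet.compression.CompressionCodecFactory.BytesInputDecompressor;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;

public class PageDecompressExample {
    // Decompresses one page worth of bytes between the two buffers.
    static void decompressPage(CompressionCodecFactory codecFactory, CompressionCodecName codecName,
            ByteBuffer input, int compressedSize, ByteBuffer output, int uncompressedSize) throws IOException {
        BytesInputDecompressor decomp = codecFactory.getDecompressor(codecName);
        // The four-argument overload decompresses directly between ByteBuffers,
        // avoiding an intermediate on-heap copy of the page.
        decomp.decompress(input, compressedSize, output, uncompressedSize);
    }
}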
Use of org.apache.parquet.hadoop.metadata.CompressionCodecName in project drill by apache.
The class AsyncPageReader, method decompressPageV1.
/**
 * Decompresses a v1 data page or a dictionary page that has already been read
 * asynchronously; both page kinds are compressed in their entirety.
 * @return decompressed Parquet page data
 * @throws IOException if decompressing the page fails
 */
protected DrillBuf decompressPageV1(ReadStatus readStatus) throws IOException {
    Stopwatch timer = Stopwatch.createUnstarted();
    PageHeader pageHeader = readStatus.getPageHeader();
    int inputSize = pageHeader.getCompressed_page_size();
    int outputSize = pageHeader.getUncompressed_page_size();
    // TODO: does reporting this number have the same meaning in an async context?
    long start = dataReader.getPos();
    long timeToRead;
    DrillBuf inputPageData = readStatus.getPageData();
    DrillBuf outputPageData = this.allocator.buffer(outputSize);
    try {
        timer.start();
        CompressionCodecName codecName = columnChunkMetaData.getCodec();
        CompressionCodecFactory.BytesInputDecompressor decomp = codecFactory.getDecompressor(codecName);
        ByteBuffer input = inputPageData.nioBuffer(0, inputSize);
        ByteBuffer output = outputPageData.nioBuffer(0, outputSize);
        decomp.decompress(input, inputSize, output, outputSize);
        outputPageData.writerIndex(outputSize);
        timeToRead = timer.elapsed(TimeUnit.NANOSECONDS);
        if (logger.isTraceEnabled()) {
            // TODO: see comment on earlier call to getPos()
            logger.trace("Col: {} readPos: {} Uncompressed_size: {} pageData: {}",
                columnChunkMetaData.toString(), dataReader.getPos(), outputSize, ByteBufUtil.hexDump(outputPageData));
        }
        this.updateStats(pageHeader, "Decompress", start, timeToRead, inputSize, outputSize);
    } finally {
        readStatus.setPageData(null);
        if (inputPageData != null) {
            inputPageData.release();
        }
    }
    return outputPageData;
}
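Apart from where the compressed bytes come from (the async reader takes them from a previously completed ReadStatus instead of reading inline), both methods key everything off columnChunkMetaData.getCodec(), i.e. off this CompressionCodecName enum. For reference, a standalone sketch of the enum's accessors; the behavior described in the comments is as of parquet-mr 1.x and worth verifying against your version:

import org.apache.parquet.hadoop.metadata.CompressionCodecName;

public class CodecNameAccessors {
    public static void main(String[] args) {
        for (CompressionCodecName codec : CompressionCodecName.values()) {
            // getExtension() is the file-name suffix associated with the codec;
            // getHadoopCompressionCodecClassName() may be null for UNCOMPRESSED,
            // which has no backing Hadoop codec.
            System.out.printf("%-12s ext=%-9s hadoopCodec=%s%n",
                codec, codec.getExtension(), codec.getHadoopCompressionCodecClassName());
        }
    }
}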
Use of org.apache.parquet.hadoop.metadata.CompressionCodecName in project hive by apache.
The class ParquetRecordWriterWrapper, method initializeSerProperties.
private void initializeSerProperties(JobContext job, Properties tableProperties) {
    String blockSize = tableProperties.getProperty(ParquetOutputFormat.BLOCK_SIZE);
    Configuration conf = ContextUtil.getConfiguration(job);
    if (blockSize != null && !blockSize.isEmpty()) {
        LOG.debug("get override parquet.block.size property via tblproperties");
        conf.setInt(ParquetOutputFormat.BLOCK_SIZE, Integer.parseInt(blockSize));
    }
    String enableDictionaryPage = tableProperties.getProperty(ParquetOutputFormat.ENABLE_DICTIONARY);
    if (enableDictionaryPage != null && !enableDictionaryPage.isEmpty()) {
        LOG.debug("get override parquet.enable.dictionary property via tblproperties");
        conf.setBoolean(ParquetOutputFormat.ENABLE_DICTIONARY, Boolean.parseBoolean(enableDictionaryPage));
    }
    String compressionName = tableProperties.getProperty(ParquetOutputFormat.COMPRESSION);
    if (compressionName != null && !compressionName.isEmpty()) {
        // get override compression properties via "tblproperties" clause if it is set
        LOG.debug("get override compression properties via tblproperties");
        CompressionCodecName codecName = CompressionCodecName.fromConf(compressionName);
        conf.set(ParquetOutputFormat.COMPRESSION, codecName.name());
    }
}
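These overrides originate in the table's TBLPROPERTIES. A small sketch of the normalization performed by CompressionCodecName.fromConf, which upper-cases the configured name before resolving the enum constant; the property values below are illustrative, while the ParquetOutputFormat keys are real constants:

import java.util.Properties;
import org.apache.parquet.hadoop.ParquetOutputFormat;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;

public class TablePropertiesExample {
    public static void main(String[] args) {
        // Values as a user might set them in TBLPROPERTIES (illustrative).
        Properties tableProperties = new Properties();
        tableProperties.setProperty(ParquetOutputFormat.BLOCK_SIZE, "134217728");   // parquet.block.size
        tableProperties.setProperty(ParquetOutputFormat.ENABLE_DICTIONARY, "true"); // parquet.enable.dictionary
        tableProperties.setProperty(ParquetOutputFormat.COMPRESSION, "snappy");     // parquet.compression

        // "snappy" and "SNAPPY" resolve to the same constant; codecName.name()
        // is the canonical spelling the wrapper writes back into the configuration.
        CompressionCodecName codecName =
            CompressionCodecName.fromConf(tableProperties.getProperty(ParquetOutputFormat.COMPRESSION));
        System.out.println(codecName.name()); // prints SNAPPY
    }
}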