Use of org.apache.parquet.column.Encoding in project parquet-mr by apache.
The class TestColumnChunkPageWriteStore, method test.
@Test
public void test() throws Exception {
  Path file = new Path("target/test/TestColumnChunkPageWriteStore/test.parquet");
  Path root = file.getParent();
  FileSystem fs = file.getFileSystem(conf);
  if (fs.exists(root)) {
    fs.delete(root, true);
  }
  fs.mkdirs(root);
  MessageType schema = MessageTypeParser.parseMessageType("message test { repeated binary bar; }");
  ColumnDescriptor col = schema.getColumns().get(0);
  Encoding dataEncoding = PLAIN;
  int valueCount = 10;
  int d = 1;
  int r = 2;
  int v = 3;
  BytesInput definitionLevels = BytesInput.fromInt(d);
  BytesInput repetitionLevels = BytesInput.fromInt(r);
  Statistics<?> statistics = Statistics.getBuilderForReading(Types.required(PrimitiveTypeName.BINARY).named("test_binary")).build();
  BytesInput data = BytesInput.fromInt(v);
  int rowCount = 5;
  int nullCount = 1;
  {
    // write a single GZIP-compressed v2 data page through the page write store
    ParquetFileWriter writer = new ParquetFileWriter(conf, schema, file);
    writer.start();
    writer.startBlock(rowCount);
    {
      ColumnChunkPageWriteStore store = new ColumnChunkPageWriteStore(compressor(GZIP), schema, new HeapByteBufferAllocator());
      PageWriter pageWriter = store.getPageWriter(col);
      pageWriter.writePageV2(rowCount, nullCount, valueCount, repetitionLevels, definitionLevels, dataEncoding, data, statistics);
      store.flushToFileWriter(writer);
    }
    writer.endBlock();
    writer.end(new HashMap<String, String>());
  }
  {
    // read the page back and verify that every field round-trips
    ParquetMetadata footer = ParquetFileReader.readFooter(conf, file, NO_FILTER);
    ParquetFileReader reader = new ParquetFileReader(conf, footer.getFileMetaData(), file, footer.getBlocks(), schema.getColumns());
    PageReadStore rowGroup = reader.readNextRowGroup();
    PageReader pageReader = rowGroup.getPageReader(col);
    DataPageV2 page = (DataPageV2) pageReader.readPage();
    assertEquals(rowCount, page.getRowCount());
    assertEquals(nullCount, page.getNullCount());
    assertEquals(valueCount, page.getValueCount());
    assertEquals(d, intValue(page.getDefinitionLevels()));
    assertEquals(r, intValue(page.getRepetitionLevels()));
    assertEquals(dataEncoding, page.getDataEncoding());
    assertEquals(v, intValue(page.getData()));
    assertEquals(statistics.toString(), page.getStatistics().toString());
    reader.close();
  }
}
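The assertions above call an intValue helper that is not part of this snippet. A minimal sketch of what such a helper could look like, assuming each BytesInput holds exactly the four little-endian bytes produced by BytesInput.fromInt; the actual helper in the test class may differ:

  import java.io.ByteArrayInputStream;
  import java.io.IOException;
  import org.apache.parquet.bytes.BytesInput;
  import org.apache.parquet.bytes.BytesUtils;

  // Illustrative helper: decode the single little-endian int written by BytesInput.fromInt.
  private static int intValue(BytesInput in) throws IOException {
    return BytesUtils.readIntLittleEndian(new ByteArrayInputStream(in.toByteArray()));
  }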
Use of org.apache.parquet.column.Encoding in project parquet-mr by apache.
The class TestInputFormat, method newBlock.
private BlockMetaData newBlock(long start, long compressedBlockSize) {
  BlockMetaData blockMetaData = new BlockMetaData();
  // assuming the compression ratio is 2
  long uncompressedSize = compressedBlockSize * 2;
  ColumnChunkMetaData column = ColumnChunkMetaData.get(ColumnPath.get("foo"), PrimitiveTypeName.BINARY, CompressionCodecName.GZIP,
      new HashSet<Encoding>(Arrays.asList(Encoding.PLAIN)), new BinaryStatistics(), start, 0L, 0L, compressedBlockSize, uncompressedSize);
  blockMetaData.addColumn(column);
  blockMetaData.setTotalByteSize(uncompressedSize);
  return blockMetaData;
}
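A hypothetical caller, to show the shape of the metadata this helper produces; the three-block layout and 10 MB size are invented for illustration:

  // Three contiguous row groups of 10 MB compressed each; newBlock reports
  // double that as the uncompressed total, per the assumed 2:1 ratio above.
  BlockMetaData[] blocks = new BlockMetaData[3];
  long blockSize = 10L * 1024 * 1024;
  for (int i = 0; i < blocks.length; i++) {
    blocks[i] = newBlock(i * blockSize, blockSize);
  }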
Use of org.apache.parquet.column.Encoding in project drill by axbaretto.
The class PageReader, method next.
/**
 * Grab the next page.
 *
 * @return true if another page was present
 * @throws IOException
 */
public boolean next() throws IOException {
  Stopwatch timer = Stopwatch.createUnstarted();
  currentPageCount = -1;
  valuesRead = 0;
  valuesReadyToRead = 0;
  // TODO - the metadata for total size appears to be incorrect for Impala-generated files; need to find the cause
  // and submit a bug report
  long totalValueCount = parentColumnReader.columnChunkMetaData.getValueCount();
  if (parentColumnReader.totalValuesRead >= totalValueCount) {
    return false;
  }
  clearBuffers();
  nextInternal();
  if (pageData == null || pageHeader == null) {
    // TODO: Is this an error condition or a normal condition??
    return false;
  }
  timer.start();
  currentPageCount = pageHeader.data_page_header.num_values;
  final Encoding rlEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.repetition_level_encoding);
  final Encoding dlEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.definition_level_encoding);
  final Encoding valueEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.encoding);
  byteLength = pageHeader.uncompressed_page_size;
  final ByteBuffer pageDataBuffer = pageData.nioBuffer(0, pageData.capacity());
  readPosInBytes = 0;
  if (parentColumnReader.getColumnDescriptor().getMaxRepetitionLevel() > 0) {
    repetitionLevels = rlEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.REPETITION_LEVEL);
    repetitionLevels.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
    // We know that the first value will be a 0; at the end of each list of repeated values we will hit another 0
    // indicating a new record, although we don't know the length until we hit it (and this is a one-way stream of
    // integers), so we read the first zero here to simplify the reading process and start reading the first value
    // the same as all of the rest. Effectively we are 'reading' the non-existent value in front of the first,
    // allowing direct access to the first list of repetition levels.
    readPosInBytes = repetitionLevels.getNextOffset();
    repetitionLevels.readInteger();
  }
  if (parentColumnReader.columnDescriptor.getMaxDefinitionLevel() != 0) {
    parentColumnReader.currDefLevel = -1;
    definitionLevels = dlEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.DEFINITION_LEVEL);
    definitionLevels.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
    readPosInBytes = definitionLevels.getNextOffset();
    if (!valueEncoding.usesDictionary()) {
      valueReader = valueEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.VALUES);
      valueReader.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
    }
  }
  if (parentColumnReader.columnDescriptor.getType() == PrimitiveType.PrimitiveTypeName.BOOLEAN) {
    valueReader = valueEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.VALUES);
    valueReader.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
  }
  if (valueEncoding.usesDictionary()) {
    // initialize two dictionary readers: one for determining the lengths of each value, the second for
    // actually copying the values out into the vectors
    dictionaryLengthDeterminingReader = new DictionaryValuesReader(dictionary);
    dictionaryLengthDeterminingReader.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
    dictionaryValueReader = new DictionaryValuesReader(dictionary);
    dictionaryValueReader.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
    parentColumnReader.usingDictionary = true;
  } else {
    parentColumnReader.usingDictionary = false;
  }
  // readPosInBytes is used for actually reading the values after we determine how many will fit in the vector;
  // readyToReadPosInBytes serves a similar purpose for the vector types where we must count up the values that
  // will fit one record at a time, such as for variable-length data. Both operations must start in the same
  // location, after the definition and repetition level data, which is stored alongside the page data itself.
  readyToReadPosInBytes = readPosInBytes;
  long timeDecode = timer.elapsed(TimeUnit.NANOSECONDS);
  stats.numDataPagesDecoded.incrementAndGet();
  stats.timeDataPageDecode.addAndGet(timeDecode);
  return true;
}
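The decode pattern this method repeats (Encoding enum hands back a ValuesReader, which is then positioned at a byte offset inside the uncompressed page buffer) is the standard parquet-column handshake. A minimal standalone sketch of just that handshake, assuming an uncompressed PLAIN-encoded page; the method name and framing are illustrative, not Drill code:

  import java.io.IOException;
  import java.nio.ByteBuffer;
  import org.apache.parquet.column.ColumnDescriptor;
  import org.apache.parquet.column.Encoding;
  import org.apache.parquet.column.ValuesType;
  import org.apache.parquet.column.values.ValuesReader;

  // Illustrative: obtain a reader for the values section of a page and position
  // it at a byte offset within the page buffer, exactly as next() does for
  // repetition levels, definition levels and values in turn.
  static ValuesReader plainValuesReader(ColumnDescriptor desc, ByteBuffer page,
      int valueCount, int offset) throws IOException {
    ValuesReader reader = Encoding.PLAIN.getValuesReader(desc, ValuesType.VALUES);
    reader.initFromPage(valueCount, page, offset);
    return reader;
  }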
Use of org.apache.parquet.column.Encoding in project parquet-mr by apache.
The class ParquetMetadataCommand, method printColumnChunk.
private void printColumnChunk(Logger console, int width, ColumnChunkMetaData column, MessageType schema) {
  String[] path = column.getPath().toArray();
  PrimitiveType type = primitive(schema, path);
  Preconditions.checkNotNull(type);
  ColumnDescriptor desc = schema.getColumnDescription(path);
  long size = column.getTotalSize();
  long count = column.getValueCount();
  float perValue = ((float) size) / count;
  CompressionCodecName codec = column.getCodec();
  Set<Encoding> encodings = column.getEncodings();
  EncodingStats encodingStats = column.getEncodingStats();
  String encodingSummary = encodingStats == null ? encodingsAsString(encodings, desc) : encodingStatsAsString(encodingStats);
  Statistics stats = column.getStatistics();
  String name = column.getPath().toDotString();
  PrimitiveType.PrimitiveTypeName typeName = type.getPrimitiveTypeName();
  if (typeName == PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) {
    console.info(String.format("%-" + width + "s FIXED[%d] %s %-7s %-9d %-8s %-7s %s",
        name, type.getTypeLength(), shortCodec(codec), encodingSummary, count, humanReadable(perValue),
        stats == null || !stats.isNumNullsSet() ? "" : String.valueOf(stats.getNumNulls()),
        minMaxAsString(stats, type.getOriginalType())));
  } else {
    console.info(String.format("%-" + width + "s %-9s %s %-7s %-9d %-10s %-7s %s",
        name, typeName, shortCodec(codec), encodingSummary, count, humanReadable(perValue),
        stats == null || !stats.isNumNullsSet() ? "" : String.valueOf(stats.getNumNulls()),
        minMaxAsString(stats, type.getOriginalType())));
  }
}
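Each call prints one fixed-width row per column chunk: name, physical type, codec abbreviation, encoding summary, value count, average bytes per value, null count (when set), and min/max. A row rendered by the second format string might look roughly like the following; all values here are hypothetical:

  foo.bar          BINARY    G _ R     12345     1.230 B    0       "aaa" / "zzz"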
Use of org.apache.parquet.column.Encoding in project parquet-mr by apache.
The class Util, method encodingStatsAsString.
public static String encodingStatsAsString(EncodingStats encodingStats) {
  StringBuilder sb = new StringBuilder();
  if (encodingStats.hasDictionaryPages()) {
    for (Encoding encoding : encodingStats.getDictionaryEncodings()) {
      sb.append(encodingAsString(encoding, true));
    }
    sb.append(" ");
  } else {
    sb.append(" ");
  }
  Set<Encoding> encodings = encodingStats.getDataEncodings();
  if (encodings.contains(RLE_DICTIONARY) || encodings.contains(PLAIN_DICTIONARY)) {
    sb.append("R");
  }
  if (encodings.contains(PLAIN)) {
    sb.append("_");
  }
  if (encodings.contains(DELTA_BYTE_ARRAY) || encodings.contains(DELTA_BINARY_PACKED) || encodings.contains(DELTA_LENGTH_BYTE_ARRAY)) {
    sb.append("D");
  }
  // Check for fallback and add a flag
  if (encodingStats.hasDictionaryEncodedPages() && encodingStats.hasNonDictionaryEncodedPages()) {
    sb.append(" F");
  }
  return sb.toString();
}
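Putting the flags together: a column whose pages started out dictionary-encoded and then fell back to plain would summarize with "R", "_" and the trailing " F" fallback flag. A sketch of a caller, assuming parquet-mr's EncodingStats.Builder API (addDictEncoding/addDataEncoding) and the same static Encoding imports as the snippet above; the setup is illustrative:

  // Illustrative: a column that wrote a dictionary page, some dictionary-encoded
  // data pages, then fell back to PLAIN once the dictionary grew too large.
  EncodingStats fallbackStats = new EncodingStats.Builder()
      .addDictEncoding(PLAIN)          // the dictionary page itself is stored PLAIN
      .addDataEncoding(RLE_DICTIONARY) // dictionary-encoded data pages
      .addDataEncoding(PLAIN)          // post-fallback data pages
      .build();
  // encodingStatsAsString(fallbackStats) would then include "R", "_" and " F"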