use of org.apache.parquet.column.EncodingStats in project parquet-mr by apache.
the class DictionaryPageReader method hasDictionaryPage.
/**
 * Decides whether the given column chunk has a dictionary page.
 *
 * <p>When encoding stats are present, the chunk must both contain a dictionary
 * page and have data pages encoded with it. When they are absent, the answer
 * is derived from the chunk's encoding set instead.
 *
 * @param column metadata of the column chunk to inspect
 * @return {@code true} if the chunk is considered to have a dictionary page
 */
private boolean hasDictionaryPage(ColumnChunkMetaData column) {
  EncodingStats encodingStats = column.getEncodingStats();
  if (encodingStats == null) {
    // no stats recorded: a dictionary encoding in the encoding set is the only hint
    Set<Encoding> used = column.getEncodings();
    if (used.contains(PLAIN_DICTIONARY)) {
      return true;
    }
    return used.contains(RLE_DICTIONARY);
  }
  // a dictionary page alone is not enough; data pages must be encoded with it
  if (!encodingStats.hasDictionaryPages()) {
    return false;
  }
  return encodingStats.hasDictionaryEncodedPages();
}
use of org.apache.parquet.column.EncodingStats in project presto by prestodb.
the class PredicateUtils method isOnlyDictionaryEncodingPages.
/**
 * Tells whether every data page of the column chunk is dictionary encoded.
 *
 * <p>Files written with newer versions of Parquet libraries (e.g. parquet-mr 1.9.0)
 * carry {@code EncodingStats}; when they are missing, the v1 logic based on the
 * encoding set is applied instead.
 *
 * @param columnMetaData metadata of the column chunk to inspect
 * @return {@code true} if only dictionary-encoded pages are reported
 */
@VisibleForTesting
@SuppressWarnings("deprecation")
public static boolean isOnlyDictionaryEncodingPages(ColumnChunkMetaData columnMetaData) {
  EncodingStats encodingStats = columnMetaData.getEncodingStats();
  if (encodingStats == null) {
    // v1 fallback: inspect the set of encodings used by the chunk
    Set<Encoding> used = columnMetaData.getEncodings();
    if (!used.contains(PLAIN_DICTIONARY)) {
      return false;
    }
    // besides PLAIN_DICTIONARY, only RLE and BIT_PACKED are tolerated, since
    // those are used for repetition or definition levels
    return Sets.difference(used, ImmutableSet.of(PLAIN_DICTIONARY, RLE, BIT_PACKED)).isEmpty();
  }
  if (!encodingStats.hasDictionaryPages()) {
    return false;
  }
  return !encodingStats.hasNonDictionaryEncodedPages();
}
use of org.apache.parquet.column.EncodingStats in project parquet-mr by apache.
the class ParquetMetadataCommand method printColumnChunk.
/**
 * Logs a single summary row describing one column chunk.
 *
 * <p>The row contains the dotted column path, the primitive type (with the type
 * length for fixed-length byte arrays), the codec, an encoding summary, the value
 * count, the average size per value, the null count when it is set, and the
 * min/max summary.
 *
 * @param console logger that receives the formatted row
 * @param width   width used to left-align the column name
 * @param column  metadata of the column chunk to describe
 * @param schema  file schema used to resolve the column's primitive type and descriptor
 */
private void printColumnChunk(Logger console, int width, ColumnChunkMetaData column, MessageType schema) {
  String[] path = column.getPath().toArray();
  PrimitiveType type = primitive(schema, path);
  Preconditions.checkNotNull(type);
  ColumnDescriptor desc = schema.getColumnDescription(path);

  long totalBytes = column.getTotalSize();
  long valueCount = column.getValueCount();
  float bytesPerValue = ((float) totalBytes) / valueCount;
  CompressionCodecName codec = column.getCodec();

  // prefer the encoding stats when the chunk has them, else summarize the encoding set
  Set<Encoding> encodings = column.getEncodings();
  EncodingStats encodingStats = column.getEncodingStats();
  String encodingSummary;
  if (encodingStats == null) {
    encodingSummary = encodingsAsString(encodings, desc);
  } else {
    encodingSummary = encodingStatsAsString(encodingStats);
  }

  Statistics stats = column.getStatistics();
  String name = column.getPath().toDotString();
  PrimitiveType.PrimitiveTypeName typeName = type.getPrimitiveTypeName();

  // cells shared by both row layouts
  String codecCell = shortCodec(codec);
  String sizeCell = humanReadable(bytesPerValue);
  String nullsCell = "";
  if (stats != null && stats.isNumNullsSet()) {
    nullsCell = String.valueOf(stats.getNumNulls());
  }
  String minMaxCell = minMaxAsString(stats, type.getOriginalType());

  String row;
  if (typeName == PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) {
    // fixed-length arrays show their length in place of the type name
    row = String.format("%-" + width + "s FIXED[%d] %s %-7s %-9d %-8s %-7s %s", name, type.getTypeLength(), codecCell, encodingSummary, valueCount, sizeCell, nullsCell, minMaxCell);
  } else {
    row = String.format("%-" + width + "s %-9s %s %-7s %-9d %-10s %-7s %s", name, typeName, codecCell, encodingSummary, valueCount, sizeCell, nullsCell, minMaxCell);
  }
  console.info(row);
}
use of org.apache.parquet.column.EncodingStats in project parquet-mr by apache.
the class DictionaryFilter method hasNonDictionaryPages.
/**
 * Reports whether the column chunk may contain pages that are not dictionary encoded.
 *
 * <p>Encoding stats are used when available. Otherwise the chunk's encoding list
 * is examined: the answer is {@code false} only when PLAIN_DICTIONARY is present
 * and nothing else remains after discarding RLE and BIT_PACKED.
 *
 * @param meta metadata of the column chunk to inspect
 * @return {@code false} only when the chunk is shown to hold dictionary-encoded pages exclusively
 */
@SuppressWarnings("deprecation")
private static boolean hasNonDictionaryPages(ColumnChunkMetaData meta) {
  EncodingStats encodingStats = meta.getEncodingStats();
  if (encodingStats != null) {
    return encodingStats.hasNonDictionaryEncodedPages();
  }

  // without EncodingStats, work on a private copy of the encoding list
  Set<Encoding> remaining = new HashSet<Encoding>(meta.getEncodings());
  if (!remaining.remove(Encoding.PLAIN_DICTIONARY)) {
    // PLAIN_DICTIONARY is absent from the encoding list, so this fallback cannot
    // establish that every page is dictionary encoded; answer true
    return true;
  }

  // PLAIN_DICTIONARY was present, so at least one page was dictionary encoded
  // and 1.0 encodings are used; RLE and BIT_PACKED only serve repetition or
  // definition levels and are therefore ignored
  remaining.remove(Encoding.RLE);
  remaining.remove(Encoding.BIT_PACKED);

  // anything left over is an encoding other than dictionary or rep/def levels
  return !remaining.isEmpty();
}
use of org.apache.parquet.column.EncodingStats in project parquet-mr by apache.
the class TestReadWriteEncodingStats method testReadWrite.
/**
 * Writes a file with the PARQUET_1_0 writer version and dictionary encoding
 * enabled, reads its footer back, and checks the encoding stats reported for the
 * first three columns of the single row group: a dictionary-encoded column, a
 * plain column, and a column with both dictionary and non-dictionary pages.
 *
 * <p>Both the writer and the reader are closed in {@code finally} blocks so that
 * a failure while writing or a failed assertion does not leak the open file.
 */
@Test
public void testReadWrite() throws Exception {
  File file = temp.newFile("encoding-stats.parquet");
  // the writer must create the file itself, so remove the placeholder first
  assertTrue(file.delete());
  Path path = new Path(file.toString());

  ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
      .withWriterVersion(PARQUET_1_0)
      .withPageSize(1024) // ensure multiple pages are written
      .enableDictionaryEncoding()
      .withDictionaryPageSize(2 * 1024)
      .withConf(CONF)
      .withType(SCHEMA)
      .build();
  try {
    writeData(writer);
  } finally {
    // close even when writing fails, releasing the underlying output
    writer.close();
  }

  ParquetFileReader reader = ParquetFileReader.open(CONF, path);
  try {
    assertEquals("Should have one row group", 1, reader.getRowGroups().size());
    BlockMetaData rowGroup = reader.getRowGroups().get(0);

    ColumnChunkMetaData dictColumn = rowGroup.getColumns().get(0);
    EncodingStats dictStats = dictColumn.getEncodingStats();
    assertNotNull("Dict column should have non-null encoding stats", dictStats);
    assertTrue("Dict column should have a dict page", dictStats.hasDictionaryPages());
    assertTrue("Dict column should have dict-encoded pages", dictStats.hasDictionaryEncodedPages());
    assertFalse("Dict column should not have non-dict pages", dictStats.hasNonDictionaryEncodedPages());

    ColumnChunkMetaData plainColumn = rowGroup.getColumns().get(1);
    EncodingStats plainStats = plainColumn.getEncodingStats();
    assertNotNull("Plain column should have non-null encoding stats", plainStats);
    assertFalse("Plain column should not have a dict page", plainStats.hasDictionaryPages());
    assertFalse("Plain column should not have dict-encoded pages", plainStats.hasDictionaryEncodedPages());
    assertTrue("Plain column should have non-dict pages", plainStats.hasNonDictionaryEncodedPages());

    ColumnChunkMetaData fallbackColumn = rowGroup.getColumns().get(2);
    EncodingStats fallbackStats = fallbackColumn.getEncodingStats();
    assertNotNull("Fallback column should have non-null encoding stats", fallbackStats);
    assertTrue("Fallback column should have a dict page", fallbackStats.hasDictionaryPages());
    assertTrue("Fallback column should have dict-encoded pages", fallbackStats.hasDictionaryEncodedPages());
    assertTrue("Fallback column should have non-dict pages", fallbackStats.hasNonDictionaryEncodedPages());
  } finally {
    // the original never closed the reader, leaking the open input stream
    reader.close();
  }
}
Aggregations