Search in sources :

Example 1 with EncodingStats

Use of org.apache.parquet.column.EncodingStats in the project parquet-mr by Apache.

From the class DictionaryPageReader, method hasDictionaryPage:

/**
 * Checks whether the given column chunk has a dictionary page.
 * <p>
 * Prefers the per-page {@link EncodingStats} written by newer Parquet
 * writers; when those are absent, falls back to inspecting the column's
 * declared encoding set.
 */
private boolean hasDictionaryPage(ColumnChunkMetaData column) {
    EncodingStats stats = column.getEncodingStats();
    if (stats == null) {
        // Older files: infer dictionary usage from the encoding list alone.
        Set<Encoding> encodings = column.getEncodings();
        return encodings.contains(PLAIN_DICTIONARY) || encodings.contains(RLE_DICTIONARY);
    }
    // A dictionary page must exist AND actually be used to encode data pages.
    return stats.hasDictionaryPages() && stats.hasDictionaryEncodedPages();
}
Also used : EncodingStats(org.apache.parquet.column.EncodingStats) Encoding(org.apache.parquet.column.Encoding)

Example 2 with EncodingStats

Use of org.apache.parquet.column.EncodingStats in the project presto by prestodb.

From the class PredicateUtils, method isOnlyDictionaryEncodingPages:

/**
 * Returns {@code true} when every data page in the column chunk is
 * dictionary-encoded.
 * <p>
 * Files written with newer Parquet libraries (e.g. parquet-mr 1.9.0) carry
 * {@link EncodingStats}, which answer this directly; otherwise this falls
 * back to reasoning about the v1 encoding set.
 */
@VisibleForTesting
@SuppressWarnings("deprecation")
public static boolean isOnlyDictionaryEncodingPages(ColumnChunkMetaData columnMetaData) {
    EncodingStats stats = columnMetaData.getEncodingStats();
    if (stats != null) {
        // Per-page stats: a dictionary page must exist and no page may use
        // a non-dictionary encoding.
        return stats.hasDictionaryPages() && !stats.hasNonDictionaryEncodedPages();
    }
    // Legacy fallback: without PLAIN_DICTIONARY no page was dictionary encoded.
    Set<Encoding> encodings = columnMetaData.getEncodings();
    if (!encodings.contains(PLAIN_DICTIONARY)) {
        return false;
    }
    // The only other allowed encodings are RLE and BIT_PACKED which are used for repetition or definition levels
    return Sets.difference(encodings, ImmutableSet.of(PLAIN_DICTIONARY, RLE, BIT_PACKED)).isEmpty();
}
Also used : EncodingStats(org.apache.parquet.column.EncodingStats) Encoding(org.apache.parquet.column.Encoding) ParquetEncoding(com.facebook.presto.parquet.ParquetEncoding) ParquetTypeUtils.getParquetEncoding(com.facebook.presto.parquet.ParquetTypeUtils.getParquetEncoding) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 3 with EncodingStats

Use of org.apache.parquet.column.EncodingStats in the project parquet-mr by Apache.

From the class ParquetMetadataCommand, method printColumnChunk:

/**
 * Logs a one-line, column-aligned summary of a single column chunk:
 * name, physical type, codec, encoding summary, value count, average
 * bytes per value, null count (when recorded), and min/max statistics.
 *
 * @param console logger the summary line is written to
 * @param width   pad width for the column-name field
 * @param column  metadata of the column chunk to describe
 * @param schema  file schema used to resolve the column's primitive type
 */
private void printColumnChunk(Logger console, int width, ColumnChunkMetaData column, MessageType schema) {
    String[] path = column.getPath().toArray();
    PrimitiveType type = primitive(schema, path);
    Preconditions.checkNotNull(type);
    ColumnDescriptor desc = schema.getColumnDescription(path);
    long size = column.getTotalSize();
    long count = column.getValueCount();
    float perValue = ((float) size) / count;
    CompressionCodecName codec = column.getCodec();
    Set<Encoding> encodings = column.getEncodings();
    EncodingStats encodingStats = column.getEncodingStats();
    // Prefer the richer per-page stats when present; otherwise summarize the encoding set.
    String encodingSummary = (encodingStats != null) ? encodingStatsAsString(encodingStats) : encodingsAsString(encodings, desc);
    Statistics stats = column.getStatistics();
    String name = column.getPath().toDotString();
    PrimitiveType.PrimitiveTypeName typeName = type.getPrimitiveTypeName();
    // Null count is shown blank when statistics are missing or do not record it.
    String nullCount = (stats == null || !stats.isNumNullsSet()) ? "" : String.valueOf(stats.getNumNulls());
    String minMax = minMaxAsString(stats, type.getOriginalType());
    if (typeName == PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) {
        // Fixed-width binary columns additionally report their declared byte length.
        console.info(String.format("%-" + width + "s  FIXED[%d] %s %-7s %-9d %-8s %-7s %s", name, type.getTypeLength(), shortCodec(codec), encodingSummary, count, humanReadable(perValue), nullCount, minMax));
    } else {
        console.info(String.format("%-" + width + "s  %-9s %s %-7s %-9d %-10s %-7s %s", name, typeName, shortCodec(codec), encodingSummary, count, humanReadable(perValue), nullCount, minMax));
    }
}
Also used : EncodingStats(org.apache.parquet.column.EncodingStats) CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) PrimitiveType(org.apache.parquet.schema.PrimitiveType) Encoding(org.apache.parquet.column.Encoding) Util.minMaxAsString(org.apache.parquet.cli.Util.minMaxAsString) Util.encodingsAsString(org.apache.parquet.cli.Util.encodingsAsString) Util.encodingStatsAsString(org.apache.parquet.cli.Util.encodingStatsAsString) Statistics(org.apache.parquet.column.statistics.Statistics)

Example 4 with EncodingStats

Use of org.apache.parquet.column.EncodingStats in the project parquet-mr by Apache.

From the class DictionaryFilter, method hasNonDictionaryPages:

/**
 * Determines whether any data page of the column chunk uses a
 * non-dictionary encoding.
 * <p>
 * Uses {@link EncodingStats} when the file recorded them; otherwise makes a
 * conservative decision from the declared encoding set.
 */
@SuppressWarnings("deprecation")
private static boolean hasNonDictionaryPages(ColumnChunkMetaData meta) {
    EncodingStats stats = meta.getEncodingStats();
    if (stats != null) {
        // Per-page stats answer the question directly.
        return stats.hasNonDictionaryEncodedPages();
    }
    // without EncodingStats, fall back to testing the encoding list
    Set<Encoding> remaining = new HashSet<Encoding>(meta.getEncodings());
    if (!remaining.remove(Encoding.PLAIN_DICTIONARY)) {
        // PLAIN_DICTIONARY absent: no page was dictionary encoded, so assume
        // at least one non-dictionary page.
        return true;
    }
    // remove() returned true, so PLAIN_DICTIONARY was present, meaning at
    // least one page was dictionary encoded and 1.0 encodings are in use.
    // RLE and BIT_PACKED only encode repetition/definition levels, so drop
    // them; anything left must be a non-dictionary data-page encoding.
    remaining.remove(Encoding.RLE);
    remaining.remove(Encoding.BIT_PACKED);
    return !remaining.isEmpty();
}
Also used : EncodingStats(org.apache.parquet.column.EncodingStats) Encoding(org.apache.parquet.column.Encoding) HashSet(java.util.HashSet)

Example 5 with EncodingStats

Use of org.apache.parquet.column.EncodingStats in the project parquet-mr by Apache.

From the class TestReadWriteEncodingStats, method testReadWrite:

/**
 * Round-trip test: writes a small Parquet 1.0 file containing a
 * dictionary-encoded column, a plain-encoded column, and a column that
 * falls back from dictionary to plain encoding, then asserts that the
 * {@link EncodingStats} read back for each column chunk match.
 */
@Test
public void testReadWrite() throws Exception {
    File file = temp.newFile("encoding-stats.parquet");
    // Delete the placeholder so the writer can create the file itself.
    assertTrue(file.delete());
    Path path = new Path(file.toString());
    ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
        .withWriterVersion(PARQUET_1_0)
        .withPageSize(1024) // ensure multiple pages are written
        .enableDictionaryEncoding()
        .withDictionaryPageSize(2 * 1024)
        .withConf(CONF)
        .withType(SCHEMA)
        .build();
    writeData(writer);
    writer.close();
    ParquetFileReader reader = ParquetFileReader.open(CONF, path);
    assertEquals("Should have one row group", 1, reader.getRowGroups().size());
    BlockMetaData rowGroup = reader.getRowGroups().get(0);
    // Column 0: fully dictionary-encoded.
    EncodingStats dictStats = rowGroup.getColumns().get(0).getEncodingStats();
    assertNotNull("Dict column should have non-null encoding stats", dictStats);
    assertTrue("Dict column should have a dict page", dictStats.hasDictionaryPages());
    assertTrue("Dict column should have dict-encoded pages", dictStats.hasDictionaryEncodedPages());
    assertFalse("Dict column should not have non-dict pages", dictStats.hasNonDictionaryEncodedPages());
    // Column 1: plain-encoded only, no dictionary at all.
    EncodingStats plainStats = rowGroup.getColumns().get(1).getEncodingStats();
    assertNotNull("Plain column should have non-null encoding stats", plainStats);
    assertFalse("Plain column should not have a dict page", plainStats.hasDictionaryPages());
    assertFalse("Plain column should not have dict-encoded pages", plainStats.hasDictionaryEncodedPages());
    assertTrue("Plain column should have non-dict pages", plainStats.hasNonDictionaryEncodedPages());
    // Column 2: starts dictionary-encoded, then falls back to plain pages.
    EncodingStats fallbackStats = rowGroup.getColumns().get(2).getEncodingStats();
    assertNotNull("Fallback column should have non-null encoding stats", fallbackStats);
    assertTrue("Fallback column should have a dict page", fallbackStats.hasDictionaryPages());
    assertTrue("Fallback column should have dict-encoded pages", fallbackStats.hasDictionaryEncodedPages());
    assertTrue("Fallback column should have non-dict pages", fallbackStats.hasNonDictionaryEncodedPages());
}
Also used : Path(org.apache.hadoop.fs.Path) Group(org.apache.parquet.example.data.Group) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) EncodingStats(org.apache.parquet.column.EncodingStats) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) File(java.io.File) Test(org.junit.Test)

Aggregations

EncodingStats (org.apache.parquet.column.EncodingStats)5 Encoding (org.apache.parquet.column.Encoding)4 ParquetEncoding (com.facebook.presto.parquet.ParquetEncoding)1 ParquetTypeUtils.getParquetEncoding (com.facebook.presto.parquet.ParquetTypeUtils.getParquetEncoding)1 VisibleForTesting (com.google.common.annotations.VisibleForTesting)1 File (java.io.File)1 HashSet (java.util.HashSet)1 Path (org.apache.hadoop.fs.Path)1 Util.encodingStatsAsString (org.apache.parquet.cli.Util.encodingStatsAsString)1 Util.encodingsAsString (org.apache.parquet.cli.Util.encodingsAsString)1 Util.minMaxAsString (org.apache.parquet.cli.Util.minMaxAsString)1 ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor)1 Statistics (org.apache.parquet.column.statistics.Statistics)1 Group (org.apache.parquet.example.data.Group)1 BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData)1 ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData)1 CompressionCodecName (org.apache.parquet.hadoop.metadata.CompressionCodecName)1 PrimitiveType (org.apache.parquet.schema.PrimitiveType)1 Test (org.junit.Test)1