Example 86 with PrimitiveType

Use of org.apache.parquet.schema.PrimitiveType in project presto by prestodb.

From the class TestTupleDomainParquetPredicate, method testVarcharMatchesWithStatistics:

@Test
public void testVarcharMatchesWithStatistics() throws ParquetCorruptionException {
    String value = "Test";
    ColumnDescriptor columnDescriptor = new ColumnDescriptor(new String[] { "path" }, BINARY, 0, 0);
    RichColumnDescriptor column = new RichColumnDescriptor(columnDescriptor, new PrimitiveType(OPTIONAL, BINARY, "Test column"));
    TupleDomain<ColumnDescriptor> effectivePredicate = getEffectivePredicate(column, createVarcharType(255), utf8Slice(value));
    TupleDomainParquetPredicate parquetPredicate = new TupleDomainParquetPredicate(effectivePredicate, singletonList(column));
    Statistics<?> stats = getStatsBasedOnType(column.getType());
    stats.setNumNulls(1L);
    stats.setMinMaxFromBytes(value.getBytes(), value.getBytes());
    assertTrue(parquetPredicate.matches(2, singletonMap(column, stats), ID));
}
Also used: ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor), TupleDomainParquetPredicate (com.facebook.presto.parquet.predicate.TupleDomainParquetPredicate), PrimitiveType (org.apache.parquet.schema.PrimitiveType), Test (org.testng.annotations.Test)
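
getEffectivePredicate and getStatsBasedOnType are private helpers of the same test class. A minimal sketch of what getEffectivePredicate plausibly does, assuming it simply wraps the single expected value in a Domain keyed by the column descriptor (illustrative, not the exact Presto source):

private static TupleDomain<ColumnDescriptor> getEffectivePredicate(RichColumnDescriptor column, Type type, Slice value) {
    // Key the single-value domain on a plain ColumnDescriptor so the predicate's column lookup matches
    ColumnDescriptor predicateColumn = new ColumnDescriptor(column.getPath(), column.getType(), 0, 0);
    return TupleDomain.withColumnDomains(ImmutableMap.of(predicateColumn, Domain.singleValue(type, value)));
}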

Example 87 with PrimitiveType

Use of org.apache.parquet.schema.PrimitiveType in project presto by prestodb.

From the class TestTupleDomainParquetPredicate, method stringColumnStats:

private static Statistics stringColumnStats(String minimum, String maximum) {
    Statistics.Builder builder = Statistics.getBuilderForReading(new PrimitiveType(OPTIONAL, BINARY, "testFile", UTF8));
    builder.withMin(minimum.getBytes()).withMax(maximum.getBytes()).withNumNulls(0);
    return builder.build();
}
Also used: PrimitiveType (org.apache.parquet.schema.PrimitiveType), BooleanStatistics (org.apache.parquet.column.statistics.BooleanStatistics), IntStatistics (org.apache.parquet.column.statistics.IntStatistics), FloatStatistics (org.apache.parquet.column.statistics.FloatStatistics), Statistics (org.apache.parquet.column.statistics.Statistics), DoubleStatistics (org.apache.parquet.column.statistics.DoubleStatistics), LongStatistics (org.apache.parquet.column.statistics.LongStatistics)
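
Statistics.getBuilderForReading builds the statistics the way the reader does when deserializing a file footer, rather than constructing a BinaryStatistics directly. A hypothetical call site, assuming a column and predicate set up as in Example 86:

Statistics<?> stats = stringColumnStats("apple", "banana");
assertTrue(parquetPredicate.matches(2, singletonMap(column, stats), ID));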

Example 88 with PrimitiveType

Use of org.apache.parquet.schema.PrimitiveType in project presto by prestodb.

From the class TestTupleDomainParquetPredicate, method testIntegerMatchesWithStatistics:

@Test(dataProvider = "typeForParquetInt32")
public void testIntegerMatchesWithStatistics(Type typeForParquetInt32) throws ParquetCorruptionException {
    RichColumnDescriptor column = new RichColumnDescriptor(new ColumnDescriptor(new String[] { "path" }, INT32, 0, 0), new PrimitiveType(OPTIONAL, INT32, "Test column"));
    TupleDomain<ColumnDescriptor> effectivePredicate = TupleDomain.withColumnDomains(ImmutableMap.of(column, Domain.create(ValueSet.of(typeForParquetInt32, 42L, 43L, 44L, 112L), false)));
    TupleDomainParquetPredicate parquetPredicate = new TupleDomainParquetPredicate(effectivePredicate, singletonList(column));
    assertTrue(parquetPredicate.matches(2, ImmutableMap.of(column, intColumnStats(32, 42)), ID));
    assertFalse(parquetPredicate.matches(2, ImmutableMap.of(column, intColumnStats(30, 40)), ID));
    // min/max exceed the smallint/tinyint value range, so the statistics are invalid for those types and the predicate conservatively matches
    assertEquals(parquetPredicate.matches(2, ImmutableMap.of(column, intColumnStats(1024, 0x10000 + 42)), ID), (typeForParquetInt32 != INTEGER));
}
Also used: ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor), TupleDomainParquetPredicate (com.facebook.presto.parquet.predicate.TupleDomainParquetPredicate), PrimitiveType (org.apache.parquet.schema.PrimitiveType), Test (org.testng.annotations.Test)
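
intColumnStats is another private helper of the test class. A plausible sketch, assuming it sets the bounds directly on an IntStatistics (illustrative):

private static IntStatistics intColumnStats(int minimum, int maximum) {
    IntStatistics statistics = new IntStatistics();
    statistics.setMinMax(minimum, maximum);
    return statistics;
}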

Example 89 with PrimitiveType

Use of org.apache.parquet.schema.PrimitiveType in project presto by prestodb.

From the class TestColumnIndexBuilder, method testBuildFloatZeroNaN:

@Test
public void testBuildFloatZeroNaN() {
    PrimitiveType type = Types.required(FLOAT).named("test_float");
    ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE);
    StatsBuilder sb = new StatsBuilder();
    builder.add(sb.stats(type, -1.0f, -0.0f));
    builder.add(sb.stats(type, 0.0f, 1.0f));
    builder.add(sb.stats(type, 1.0f, 100.0f));
    ColumnIndex columnIndex = builder.build();
    assertCorrectValues(columnIndex.getMinValues(), -1.0f, -0.0f, 1.0f);
    assertCorrectValues(columnIndex.getMaxValues(), 0.0f, 1.0f, 100.0f);
    builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE);
    builder.add(sb.stats(type, -1.0f, -0.0f));
    builder.add(sb.stats(type, 0.0f, Float.NaN));
    builder.add(sb.stats(type, 1.0f, 100.0f));
    assertNull(builder.build());
}
Also used : ColumnIndex(org.apache.parquet.internal.column.columnindex.ColumnIndex) ColumnIndexBuilder(org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder) PrimitiveType(org.apache.parquet.schema.PrimitiveType) Test(org.testng.annotations.Test)
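
The assertions encode two ColumnIndexBuilder behaviors: a page max of -0.0f is reported as 0.0f and a page min of 0.0f as -0.0f, so range checks stay correct for both signed zeros, and any page statistics containing NaN make the whole column index unusable, so build() returns null. StatsBuilder is a private helper of the test class; a simplified sketch, assuming it only feeds values through the typed Statistics implementation (the real helper may also track null counts):

private static class StatsBuilder {
    Statistics<?> stats(PrimitiveType type, float... values) {
        // Statistics.createStats picks the typed implementation for the column (FloatStatistics for FLOAT)
        Statistics<?> statistics = Statistics.createStats(type);
        for (float value : values) {
            statistics.updateStats(value);
        }
        return statistics;
    }
}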

Example 90 with PrimitiveType

Use of org.apache.parquet.schema.PrimitiveType in project presto by prestodb.

From the class MetadataReader, method readFooter:

public static ParquetFileMetadata readFooter(ParquetDataSource parquetDataSource, long fileSize) throws IOException {
    // Parquet File Layout:
    // 
    // MAGIC
    // variable: Data
    // variable: Metadata
    // 4 bytes: MetadataLength
    // MAGIC
    validateParquet(fileSize >= MAGIC.length() + POST_SCRIPT_SIZE, "%s is not a valid Parquet File", parquetDataSource.getId());
    // EXPECTED_FOOTER_SIZE is an int, so this will never fail
    byte[] buffer = new byte[toIntExact(min(fileSize, EXPECTED_FOOTER_SIZE))];
    parquetDataSource.readFully(fileSize - buffer.length, buffer);
    Slice tailSlice = wrappedBuffer(buffer);
    Slice magic = tailSlice.slice(tailSlice.length() - MAGIC.length(), MAGIC.length());
    if (!MAGIC.equals(magic)) {
        throw new ParquetCorruptionException(format("Not valid Parquet file: %s expected magic number: %s got: %s", parquetDataSource.getId(), Arrays.toString(MAGIC.getBytes()), Arrays.toString(magic.getBytes())));
    }
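    // The 4-byte metadata length immediately precedes the trailing magic; Slice.getInt reads it little-endian, matching the Parquet footer encoding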
    int metadataLength = tailSlice.getInt(tailSlice.length() - POST_SCRIPT_SIZE);
    int completeFooterSize = metadataLength + POST_SCRIPT_SIZE;
    long metadataFileOffset = fileSize - completeFooterSize;
    validateParquet(metadataFileOffset >= MAGIC.length() && metadataFileOffset + POST_SCRIPT_SIZE < fileSize, "Corrupted Parquet file: %s metadata index: %s out of range", parquetDataSource.getId(), metadataFileOffset);
    // Ensure the slice covers the entire metadata range
    if (tailSlice.length() < completeFooterSize) {
        byte[] footerBuffer = new byte[completeFooterSize];
        parquetDataSource.readFully(metadataFileOffset, footerBuffer, 0, footerBuffer.length - tailSlice.length());
        // Copy the previous slice contents into the new buffer
        tailSlice.getBytes(0, footerBuffer, footerBuffer.length - tailSlice.length(), tailSlice.length());
        tailSlice = wrappedBuffer(footerBuffer, 0, footerBuffer.length);
    }
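    // Deserialize the Thrift-encoded FileMetaData from the tail of the footer slice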
    FileMetaData fileMetaData = readFileMetaData(tailSlice.slice(tailSlice.length() - completeFooterSize, metadataLength).getInput());
    List<SchemaElement> schema = fileMetaData.getSchema();
    validateParquet(!schema.isEmpty(), "Empty Parquet schema in file: %s", parquetDataSource.getId());
    MessageType messageType = readParquetSchema(schema);
    List<BlockMetaData> blocks = new ArrayList<>();
    List<RowGroup> rowGroups = fileMetaData.getRow_groups();
    if (rowGroups != null) {
        for (RowGroup rowGroup : rowGroups) {
            BlockMetaData blockMetaData = new BlockMetaData();
            blockMetaData.setRowCount(rowGroup.getNum_rows());
            blockMetaData.setTotalByteSize(rowGroup.getTotal_byte_size());
            List<ColumnChunk> columns = rowGroup.getColumns();
            validateParquet(!columns.isEmpty(), "No columns in row group: %s", rowGroup);
            String filePath = columns.get(0).getFile_path();
            for (ColumnChunk columnChunk : columns) {
                validateParquet((filePath == null && columnChunk.getFile_path() == null) || (filePath != null && filePath.equals(columnChunk.getFile_path())), "all column chunks of the same row group must be in the same file");
                ColumnMetaData metaData = columnChunk.meta_data;
                String[] path = metaData.path_in_schema.stream().map(value -> value.toLowerCase(Locale.ENGLISH)).toArray(String[]::new);
                ColumnPath columnPath = ColumnPath.get(path);
                PrimitiveType primitiveType = messageType.getType(columnPath.toArray()).asPrimitiveType();
                PrimitiveTypeName primitiveTypeName = primitiveType.getPrimitiveTypeName();
                ColumnChunkMetaData column = ColumnChunkMetaData.get(columnPath, primitiveType, CompressionCodecName.fromParquet(metaData.codec), PARQUET_METADATA_CONVERTER.convertEncodingStats(metaData.encoding_stats), readEncodings(metaData.encodings), readStats(metaData.statistics, primitiveTypeName), metaData.data_page_offset, metaData.dictionary_page_offset, metaData.num_values, metaData.total_compressed_size, metaData.total_uncompressed_size);
                column.setColumnIndexReference(toColumnIndexReference(columnChunk));
                column.setOffsetIndexReference(toOffsetIndexReference(columnChunk));
                blockMetaData.addColumn(column);
            }
            blockMetaData.setPath(filePath);
            blocks.add(blockMetaData);
        }
    }
    Map<String, String> keyValueMetaData = new HashMap<>();
    List<KeyValue> keyValueList = fileMetaData.getKey_value_metadata();
    if (keyValueList != null) {
        for (KeyValue keyValue : keyValueList) {
            keyValueMetaData.put(keyValue.key, keyValue.value);
        }
    }
    ParquetMetadata parquetMetadata = new ParquetMetadata(new org.apache.parquet.hadoop.metadata.FileMetaData(messageType, keyValueMetaData, fileMetaData.getCreated_by()), blocks);
    return new ParquetFileMetadata(parquetMetadata, toIntExact(metadataLength));
}
Also used: PrimitiveType (org.apache.parquet.schema.PrimitiveType), ColumnPath (org.apache.parquet.hadoop.metadata.ColumnPath), Arrays (java.util.Arrays), Slice (io.airlift.slice.Slice), Util.readFileMetaData (org.apache.parquet.format.Util.readFileMetaData), ConvertedType (org.apache.parquet.format.ConvertedType), Repetition (org.apache.parquet.schema.Type.Repetition), HashMap (java.util.HashMap), FileMetaData (org.apache.parquet.format.FileMetaData), ParquetMetadataConverter (org.apache.parquet.format.converter.ParquetMetadataConverter), ArrayList (java.util.ArrayList), ParquetCorruptionException (com.facebook.presto.parquet.ParquetCorruptionException), HashSet (java.util.HashSet), Slices.wrappedBuffer (io.airlift.slice.Slices.wrappedBuffer), KeyValue (org.apache.parquet.format.KeyValue), Locale (java.util.Locale), SchemaElement (org.apache.parquet.format.SchemaElement), Map (java.util.Map), Type (org.apache.parquet.format.Type), IndexReference (org.apache.parquet.internal.hadoop.metadata.IndexReference), PrimitiveTypeName (org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName), Math.toIntExact (java.lang.Math.toIntExact), OriginalType (org.apache.parquet.schema.OriginalType), ParquetDataSource (com.facebook.presto.parquet.ParquetDataSource), Types (org.apache.parquet.schema.Types), Iterator (java.util.Iterator), Encoding (org.apache.parquet.format.Encoding), Set (java.util.Set), Statistics (org.apache.parquet.format.Statistics), IOException (java.io.IOException), Math.min (java.lang.Math.min), ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData), String.format (java.lang.String.format), ColumnChunk (org.apache.parquet.format.ColumnChunk), ColumnMetaData (org.apache.parquet.format.ColumnMetaData), US_ASCII (java.nio.charset.StandardCharsets.US_ASCII), MessageType (org.apache.parquet.schema.MessageType), List (java.util.List), RowGroup (org.apache.parquet.format.RowGroup), BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData), CompressionCodecName (org.apache.parquet.hadoop.metadata.CompressionCodecName), ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata), Collections (java.util.Collections), ParquetValidationUtils.validateParquet (com.facebook.presto.parquet.ParquetValidationUtils.validateParquet)
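
A hypothetical invocation, assuming the FileParquetDataSource implementation from presto-parquet and a getParquetMetadata() accessor on ParquetFileMetadata (both names are assumptions, not verified against this exact Presto version):

File file = new File("/tmp/example.parquet");
ParquetDataSource dataSource = new FileParquetDataSource(file); // assumed implementation
ParquetFileMetadata fileMetadata = MetadataReader.readFooter(dataSource, file.length());
ParquetMetadata parquetMetadata = fileMetadata.getParquetMetadata(); // assumed accessor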

Aggregations

PrimitiveType (org.apache.parquet.schema.PrimitiveType): 123 usages
Test (org.junit.Test): 66 usages
MessageType (org.apache.parquet.schema.MessageType): 41 usages
HiveDecimalWritable (org.apache.hadoop.hive.serde2.io.HiveDecimalWritable): 28 usages
BooleanWritable (org.apache.hadoop.io.BooleanWritable): 28 usages
BytesWritable (org.apache.hadoop.io.BytesWritable): 28 usages
DoubleWritable (org.apache.hadoop.io.DoubleWritable): 28 usages
FloatWritable (org.apache.hadoop.io.FloatWritable): 28 usages
IntWritable (org.apache.hadoop.io.IntWritable): 28 usages
LongWritable (org.apache.hadoop.io.LongWritable): 28 usages
Writable (org.apache.hadoop.io.Writable): 28 usages
GroupType (org.apache.parquet.schema.GroupType): 25 usages
Test (org.testng.annotations.Test): 20 usages
ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 14 usages
OriginalType (org.apache.parquet.schema.OriginalType): 14 usages
Type (org.apache.parquet.schema.Type): 12 usages
List (java.util.List): 11 usages
ColumnIndex (org.apache.parquet.internal.column.columnindex.ColumnIndex): 11 usages
ColumnIndexBuilder (org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder): 11 usages
ArrayList (java.util.ArrayList): 10 usages