Use of org.apache.parquet.schema.PrimitiveType in project presto by prestodb.
In the class TestTupleDomainParquetPredicate, the method testVarcharMatchesWithStatistics.
@Test
public void testVarcharMatchesWithStatistics() throws ParquetCorruptionException {
    String value = "Test";
    ColumnDescriptor columnDescriptor = new ColumnDescriptor(new String[] { "path" }, BINARY, 0, 0);
    RichColumnDescriptor column = new RichColumnDescriptor(columnDescriptor, new PrimitiveType(OPTIONAL, BINARY, "Test column"));
    TupleDomain<ColumnDescriptor> effectivePredicate = getEffectivePredicate(column, createVarcharType(255), utf8Slice(value));
    TupleDomainParquetPredicate parquetPredicate = new TupleDomainParquetPredicate(effectivePredicate, singletonList(column));
    Statistics<?> stats = getStatsBasedOnType(column.getType());
    stats.setNumNulls(1L);
    stats.setMinMaxFromBytes(value.getBytes(), value.getBytes());
    // min == max == "Test", so the predicate value lies inside the column's statistics range
    assertTrue(parquetPredicate.matches(2, singletonMap(column, stats), ID));
}
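The getEffectivePredicate and getStatsBasedOnType helpers are not part of this snippet. A minimal sketch of what getEffectivePredicate plausibly does, assuming it wraps the value in a single-value varchar Domain (the name and parameters are taken from the call above; the body is an assumption, not the test's actual code):

// Sketch only: assumed body for the getEffectivePredicate helper used above.
// It builds a single-value domain for the column so the predicate can be
// compared against the column chunk's min/max statistics.
private static TupleDomain<ColumnDescriptor> getEffectivePredicate(RichColumnDescriptor column, VarcharType type, Slice value) {
    ColumnDescriptor predicateColumn = new ColumnDescriptor(column.getPath(), column.getType(), 0, 0);
    Domain predicateDomain = Domain.singleValue(type, value);
    return TupleDomain.withColumnDomains(singletonMap(predicateColumn, predicateDomain));
}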
Use of org.apache.parquet.schema.PrimitiveType in project presto by prestodb.
In the class TestTupleDomainParquetPredicate, the method stringColumnStats.
private static Statistics stringColumnStats(String minimum, String maximum) {
    Statistics.Builder builder = Statistics.getBuilderForReading(new PrimitiveType(OPTIONAL, BINARY, "testFile", UTF8));
    builder.withMin(minimum.getBytes()).withMax(maximum.getBytes()).withNumNulls(0);
    return builder.build();
}
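The same UTF8-annotated BINARY type can also be built with parquet's Types builder instead of the PrimitiveType constructor used above. A small equivalent sketch (the field name "testFile" is simply reused from the snippet):

// Equivalent construction of the optional, UTF8-annotated BINARY type via the Types builder
PrimitiveType utf8Binary = Types.optional(BINARY).as(OriginalType.UTF8).named("testFile");
Statistics.Builder builder = Statistics.getBuilderForReading(utf8Binary);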
Use of org.apache.parquet.schema.PrimitiveType in project presto by prestodb.
In the class TestTupleDomainParquetPredicate, the method testIntegerMatchesWithStatistics.
@Test(dataProvider = "typeForParquetInt32")
public void testIntegerMatchesWithStatistics(Type typeForParquetInt32) throws ParquetCorruptionException {
    RichColumnDescriptor column = new RichColumnDescriptor(new ColumnDescriptor(new String[] { "path" }, INT32, 0, 0), new PrimitiveType(OPTIONAL, INT32, "Test column"));
    TupleDomain<ColumnDescriptor> effectivePredicate = TupleDomain.withColumnDomains(ImmutableMap.of(column, Domain.create(ValueSet.of(typeForParquetInt32, 42L, 43L, 44L, 112L), false)));
    TupleDomainParquetPredicate parquetPredicate = new TupleDomainParquetPredicate(effectivePredicate, singletonList(column));
    // the range [32, 42] contains the predicate value 42; [30, 40] contains none of the values
    assertTrue(parquetPredicate.matches(2, ImmutableMap.of(column, intColumnStats(32, 42)), ID));
    assertFalse(parquetPredicate.matches(2, ImmutableMap.of(column, intColumnStats(30, 40)), ID));
    // stats invalid for smallint/tinyint: out-of-range values cannot be trusted, so only INTEGER can exclude the row group
    assertEquals(parquetPredicate.matches(2, ImmutableMap.of(column, intColumnStats(1024, 0x10000 + 42)), ID), (typeForParquetInt32 != INTEGER));
}
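The intColumnStats helper is not shown in this snippet. A plausible sketch, assuming it is built on parquet's IntStatistics (the helper name comes from the calls above; the body is an assumption):

// Sketch only: assumed helper producing INT32 statistics with the given min/max
private static IntStatistics intColumnStats(int minimum, int maximum) {
    IntStatistics statistics = new IntStatistics();
    statistics.setMinMax(minimum, maximum);
    return statistics;
}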
Use of org.apache.parquet.schema.PrimitiveType in project presto by prestodb.
In the class TestColumnIndexBuilder, the method testBuildFloatZeroNaN.
@Test
public void testBuildFloatZeroNaN() {
    PrimitiveType type = Types.required(FLOAT).named("test_float");
    ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE);
    StatsBuilder sb = new StatsBuilder();
    builder.add(sb.stats(type, -1.0f, -0.0f));
    builder.add(sb.stats(type, 0.0f, 1.0f));
    builder.add(sb.stats(type, 1.0f, 100.0f));
    ColumnIndex columnIndex = builder.build();
    // zero bounds are widened: a page min of 0.0f is stored as -0.0f and a page max of -0.0f as 0.0f
    assertCorrectValues(columnIndex.getMinValues(), -1.0f, -0.0f, 1.0f);
    assertCorrectValues(columnIndex.getMaxValues(), 0.0f, 1.0f, 100.0f);
    builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE);
    builder.add(sb.stats(type, -1.0f, -0.0f));
    builder.add(sb.stats(type, 0.0f, Float.NaN));
    builder.add(sb.stats(type, 1.0f, 100.0f));
    // a NaN in any page prevents the column index from being built
    assertNull(builder.build());
}
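The StatsBuilder helper is not included in the snippet. A minimal sketch of how per-page float statistics could be produced for ColumnIndexBuilder.add (an assumption, not the test's actual helper); the null result in the second sequence reflects that a NaN in the min/max values makes the ordering the index relies on undefined, so no index is produced:

// Sketch only (assumed, not the test's StatsBuilder): per-page float statistics for the builder
private static Statistics<?> floatPageStats(PrimitiveType type, float... values) {
    Statistics<?> stats = Statistics.createStats(type);
    for (float value : values) {
        stats.updateStats(value);
    }
    return stats;
}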
Use of org.apache.parquet.schema.PrimitiveType in project presto by prestodb.
In the class MetadataReader, the method readFooter.
public static ParquetFileMetadata readFooter(ParquetDataSource parquetDataSource, long fileSize) throws IOException {
    // Parquet File Layout:
    //
    // MAGIC
    // variable: Data
    // variable: Metadata
    // 4 bytes: MetadataLength
    // MAGIC
    validateParquet(fileSize >= MAGIC.length() + POST_SCRIPT_SIZE, "%s is not a valid Parquet File", parquetDataSource.getId());

    // EXPECTED_FOOTER_SIZE is an int, so this will never fail
    byte[] buffer = new byte[toIntExact(min(fileSize, EXPECTED_FOOTER_SIZE))];
    parquetDataSource.readFully(fileSize - buffer.length, buffer);
    Slice tailSlice = wrappedBuffer(buffer);

    Slice magic = tailSlice.slice(tailSlice.length() - MAGIC.length(), MAGIC.length());
    if (!MAGIC.equals(magic)) {
        throw new ParquetCorruptionException(format("Not valid Parquet file: %s expected magic number: %s got: %s", parquetDataSource.getId(), Arrays.toString(MAGIC.getBytes()), Arrays.toString(magic.getBytes())));
    }

    int metadataLength = tailSlice.getInt(tailSlice.length() - POST_SCRIPT_SIZE);
    int completeFooterSize = metadataLength + POST_SCRIPT_SIZE;
    long metadataFileOffset = fileSize - completeFooterSize;
    validateParquet(
            metadataFileOffset >= MAGIC.length() && metadataFileOffset + POST_SCRIPT_SIZE < fileSize,
            "Corrupted Parquet file: %s metadata index: %s out of range",
            parquetDataSource.getId(),
            metadataFileOffset);

    // Ensure the slice covers the entire metadata range
    if (tailSlice.length() < completeFooterSize) {
        byte[] footerBuffer = new byte[completeFooterSize];
        parquetDataSource.readFully(metadataFileOffset, footerBuffer, 0, footerBuffer.length - tailSlice.length());
        // Copy the previous slice contents into the new buffer
        tailSlice.getBytes(0, footerBuffer, footerBuffer.length - tailSlice.length(), tailSlice.length());
        tailSlice = wrappedBuffer(footerBuffer, 0, footerBuffer.length);
    }

    FileMetaData fileMetaData = readFileMetaData(tailSlice.slice(tailSlice.length() - completeFooterSize, metadataLength).getInput());
    List<SchemaElement> schema = fileMetaData.getSchema();
    validateParquet(!schema.isEmpty(), "Empty Parquet schema in file: %s", parquetDataSource.getId());

    MessageType messageType = readParquetSchema(schema);
    List<BlockMetaData> blocks = new ArrayList<>();
    List<RowGroup> rowGroups = fileMetaData.getRow_groups();
    if (rowGroups != null) {
        for (RowGroup rowGroup : rowGroups) {
            BlockMetaData blockMetaData = new BlockMetaData();
            blockMetaData.setRowCount(rowGroup.getNum_rows());
            blockMetaData.setTotalByteSize(rowGroup.getTotal_byte_size());
            List<ColumnChunk> columns = rowGroup.getColumns();
            validateParquet(!columns.isEmpty(), "No columns in row group: %s", rowGroup);
            String filePath = columns.get(0).getFile_path();
            for (ColumnChunk columnChunk : columns) {
                validateParquet(
                        (filePath == null && columnChunk.getFile_path() == null) || (filePath != null && filePath.equals(columnChunk.getFile_path())),
                        "all column chunks of the same row group must be in the same file");
                ColumnMetaData metaData = columnChunk.meta_data;
                String[] path = metaData.path_in_schema.stream().map(value -> value.toLowerCase(Locale.ENGLISH)).toArray(String[]::new);
                ColumnPath columnPath = ColumnPath.get(path);
                PrimitiveType primitiveType = messageType.getType(columnPath.toArray()).asPrimitiveType();
                PrimitiveTypeName primitiveTypeName = primitiveType.getPrimitiveTypeName();
                ColumnChunkMetaData column = ColumnChunkMetaData.get(
                        columnPath,
                        primitiveType,
                        CompressionCodecName.fromParquet(metaData.codec),
                        PARQUET_METADATA_CONVERTER.convertEncodingStats(metaData.encoding_stats),
                        readEncodings(metaData.encodings),
                        readStats(metaData.statistics, primitiveTypeName),
                        metaData.data_page_offset,
                        metaData.dictionary_page_offset,
                        metaData.num_values,
                        metaData.total_compressed_size,
                        metaData.total_uncompressed_size);
                column.setColumnIndexReference(toColumnIndexReference(columnChunk));
                column.setOffsetIndexReference(toOffsetIndexReference(columnChunk));
                blockMetaData.addColumn(column);
            }
            blockMetaData.setPath(filePath);
            blocks.add(blockMetaData);
        }
    }

    Map<String, String> keyValueMetaData = new HashMap<>();
    List<KeyValue> keyValueList = fileMetaData.getKey_value_metadata();
    if (keyValueList != null) {
        for (KeyValue keyValue : keyValueList) {
            keyValueMetaData.put(keyValue.key, keyValue.value);
        }
    }
    ParquetMetadata parquetMetadata = new ParquetMetadata(new org.apache.parquet.hadoop.metadata.FileMetaData(messageType, keyValueMetaData, fileMetaData.getCreated_by()), blocks);
    return new ParquetFileMetadata(parquetMetadata, toIntExact(metadataLength));
}
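The MAGIC, POST_SCRIPT_SIZE, and EXPECTED_FOOTER_SIZE constants are not included in the snippet. A sketch of what they presumably look like, matching the layout comment at the top of the method (the exact footer read size is an assumption):

// Assumed constants matching the footer layout: 4-byte metadata length followed by the 4-byte "PAR1" magic
private static final Slice MAGIC = wrappedBuffer("PAR1".getBytes(US_ASCII));
private static final int POST_SCRIPT_SIZE = SIZE_OF_INT + MAGIC.length();
// Initial read size, large enough to cover most footers in a single read (value assumed)
private static final int EXPECTED_FOOTER_SIZE = 16 * 1024;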