Use of parquet.column.ColumnDescriptor in project presto by prestodb.
Class TupleDomainParquetPredicate, method getDomain.
@VisibleForTesting
public static Domain getDomain(Type type, ParquetDictionaryDescriptor dictionaryDescriptor)
{
    // A null result means no domain could be derived from the dictionary,
    // so callers cannot use it to prune anything
    if (dictionaryDescriptor == null) {
        return null;
    }
    ColumnDescriptor columnDescriptor = dictionaryDescriptor.getColumnDescriptor();
    Optional<ParquetDictionaryPage> dictionaryPage = dictionaryDescriptor.getDictionaryPage();
    if (!dictionaryPage.isPresent()) {
        return null;
    }
    ParquetDictionary dictionary;
    try {
        dictionary = dictionaryPage.get().getEncoding().initDictionary(columnDescriptor, dictionaryPage.get());
    }
    catch (Exception e) {
        // OK to ignore exception when reading dictionaries
        return null;
    }
    int dictionarySize = dictionaryPage.get().getDictionarySize();
    // For each supported type mapping, build the union of every dictionary value plus null
    // (the dictionary alone cannot rule out null rows)
    if (type.equals(BIGINT) && columnDescriptor.getType() == PrimitiveTypeName.INT64) {
        List<Domain> domains = new ArrayList<>();
        for (int i = 0; i < dictionarySize; i++) {
            domains.add(Domain.singleValue(type, dictionary.decodeToLong(i)));
        }
        domains.add(Domain.onlyNull(type));
        return Domain.union(domains);
    }
    else if (type.equals(BIGINT) && columnDescriptor.getType() == PrimitiveTypeName.INT32) {
        List<Domain> domains = new ArrayList<>();
        for (int i = 0; i < dictionarySize; i++) {
            domains.add(Domain.singleValue(type, (long) dictionary.decodeToInt(i)));
        }
        domains.add(Domain.onlyNull(type));
        return Domain.union(domains);
    }
    else if (type.equals(DOUBLE) && columnDescriptor.getType() == PrimitiveTypeName.DOUBLE) {
        List<Domain> domains = new ArrayList<>();
        for (int i = 0; i < dictionarySize; i++) {
            domains.add(Domain.singleValue(type, dictionary.decodeToDouble(i)));
        }
        domains.add(Domain.onlyNull(type));
        return Domain.union(domains);
    }
    else if (type.equals(DOUBLE) && columnDescriptor.getType() == PrimitiveTypeName.FLOAT) {
        List<Domain> domains = new ArrayList<>();
        for (int i = 0; i < dictionarySize; i++) {
            domains.add(Domain.singleValue(type, (double) dictionary.decodeToFloat(i)));
        }
        domains.add(Domain.onlyNull(type));
        return Domain.union(domains);
    }
    else if (isVarcharType(type) && columnDescriptor.getType() == PrimitiveTypeName.BINARY) {
        List<Domain> domains = new ArrayList<>();
        for (int i = 0; i < dictionarySize; i++) {
            domains.add(Domain.singleValue(type, Slices.wrappedBuffer(dictionary.decodeToBinary(i).getBytes())));
        }
        domains.add(Domain.onlyNull(type));
        return Domain.union(domains);
    }
    // Unsupported combination of Presto type and Parquet physical type
    return null;
}
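The domain built above is what makes dictionary-based pruning possible: if the query's effective predicate does not overlap the set of dictionary values (plus null), no row behind that dictionary can match. The following is a minimal sketch of that check, not Presto's own code; the class and method names are made up, and it assumes the Presto SPI Domain API (singleValue, onlyNull, union, overlaps) under the com.facebook.presto.spi packages of this era, plus Guava's ImmutableList.

import static com.facebook.presto.spi.type.BigintType.BIGINT;

import com.facebook.presto.spi.predicate.Domain;
import com.google.common.collect.ImmutableList;

public final class DictionaryDomainExample
{
    private DictionaryDomainExample() {}

    // Returns true when the predicate domain cannot possibly match any dictionary value
    public static boolean canSkip(Domain predicateDomain, Domain dictionaryDomain)
    {
        // A null dictionary domain means "unknown", so nothing can be skipped
        if (dictionaryDomain == null) {
            return false;
        }
        return !predicateDomain.overlaps(dictionaryDomain);
    }

    public static void main(String[] args)
    {
        // Mirrors what getDomain builds for an INT64 dictionary containing {1, 2, 3}
        Domain dictionaryDomain = Domain.union(ImmutableList.of(
                Domain.singleValue(BIGINT, 1L),
                Domain.singleValue(BIGINT, 2L),
                Domain.singleValue(BIGINT, 3L),
                Domain.onlyNull(BIGINT)));
        Domain predicateDomain = Domain.singleValue(BIGINT, 42L);
        System.out.println(canSkip(predicateDomain, dictionaryDomain)); // prints true: 42 is not in the dictionary
    }
}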
Use of parquet.column.ColumnDescriptor in project presto by prestodb.
Class ParquetReader, method initializeColumnReaders.
private void initializeColumnReaders()
{
    for (PrimitiveColumnIO columnIO : getColumns(fileSchema, requestedSchema)) {
        ColumnDescriptor descriptor = columnIO.getColumnDescriptor();
        // One reader per primitive leaf column of the requested schema, keyed by its descriptor
        RichColumnDescriptor column = new RichColumnDescriptor(descriptor.getPath(), columnIO.getType().asPrimitiveType(), descriptor.getMaxRepetitionLevel(), descriptor.getMaxDefinitionLevel());
        columnReadersMap.put(column, ParquetColumnReader.createReader(column));
    }
}
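The map built here is probed again in nextBatch (next example) with a freshly constructed descriptor, which only works because descriptor equality is structural rather than identity-based. Below is a minimal, self-contained sketch of that property using plain parquet-mr classes instead of Presto's RichColumnDescriptor; the class name and column name are made up, and it assumes (as in parquet-mr) that ColumnDescriptor implements equals/hashCode over the column path.

import java.util.HashMap;
import java.util.Map;

import parquet.column.ColumnDescriptor;
import parquet.schema.PrimitiveType.PrimitiveTypeName;

public final class DescriptorKeyExample
{
    private DescriptorKeyExample() {}

    public static void main(String[] args)
    {
        Map<ColumnDescriptor, String> readers = new HashMap<>();
        ColumnDescriptor first = new ColumnDescriptor(new String[] {"my_column"}, PrimitiveTypeName.INT64, 0, 0);
        readers.put(first, "reader for my_column");

        // A second descriptor built independently for the same path is an equal key,
        // so a later lookup finds the reader created during initialization
        ColumnDescriptor second = new ColumnDescriptor(new String[] {"my_column"}, PrimitiveTypeName.INT64, 0, 0);
        System.out.println(readers.get(second)); // prints "reader for my_column"
    }
}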
Use of parquet.column.ColumnDescriptor in project presto by prestodb.
Class ParquetReader, method nextBatch.
public int nextBatch()
{
    if (nextRowInGroup >= currentGroupRowCount && !advanceToNextRowGroup()) {
        return -1;
    }
    batchSize = toIntExact(min(MAX_VECTOR_LENGTH, currentGroupRowCount - nextRowInGroup));
    nextRowInGroup += batchSize;
    currentPosition += batchSize;
    // Only the readers are prepared here; values are decoded later, column by column,
    // when readPrimitive(...) is called for the columns the caller actually needs
    for (PrimitiveColumnIO columnIO : getColumns(fileSchema, requestedSchema)) {
        ColumnDescriptor descriptor = columnIO.getColumnDescriptor();
        RichColumnDescriptor column = new RichColumnDescriptor(descriptor.getPath(), columnIO.getType().asPrimitiveType(), descriptor.getMaxRepetitionLevel(), descriptor.getMaxDefinitionLevel());
        ParquetColumnReader columnReader = columnReadersMap.get(column);
        columnReader.prepareNextRead(batchSize);
    }
    return batchSize;
}
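Seen from the caller's side, the contract is simple: nextBatch() returns the number of rows prepared, or -1 once every row group is exhausted, and each prepared batch is then materialized per column with readPrimitive(...). The helper below is a minimal sketch of that loop, not part of Presto; the class and method names are made up, and the package names for the Presto-internal ParquetReader, Block, and Type are assumptions based on this era of the codebase. The assertFileContents test further down shows the same pattern end to end.

import java.io.IOException;

import com.facebook.presto.hive.parquet.reader.ParquetReader;
import com.facebook.presto.spi.block.Block;
import com.facebook.presto.spi.type.Type;

import parquet.column.ColumnDescriptor;

public final class BatchLoopExample
{
    private BatchLoopExample() {}

    // Counts the rows of a single column by draining the reader batch by batch
    public static long countRows(ParquetReader reader, ColumnDescriptor column, Type type)
            throws IOException
    {
        long rows = 0;
        for (int batchSize = reader.nextBatch(); batchSize >= 0; batchSize = reader.nextBatch()) {
            Block block = reader.readPrimitive(column, type);
            // Each returned block holds exactly batchSize positions
            rows += block.getPositionCount();
        }
        return rows;
    }
}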
Use of parquet.column.ColumnDescriptor in project presto by prestodb.
Class ParquetTester, method assertFileContents.
private static void assertFileContents(JobConf jobConf, TempFile tempFile, Iterable<?> expectedValues, Type type)
        throws IOException, InterruptedException
{
    Path path = new Path(tempFile.getFile().toURI());
    FileSystem fileSystem = path.getFileSystem(jobConf);
    ParquetMetadata parquetMetadata = ParquetMetadataReader.readFooter(fileSystem, path);
    FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
    MessageType fileSchema = fileMetaData.getSchema();
    long size = fileSystem.getFileStatus(path).getLen();
    FSDataInputStream inputStream = fileSystem.open(path);
    ParquetDataSource dataSource = new HdfsParquetDataSource(path, size, inputStream);
    TypeManager typeManager = new TypeRegistry();
    ParquetReader parquetReader = new ParquetReader(fileSchema, fileSchema, parquetMetadata.getBlocks(), dataSource, typeManager, new AggregatedMemoryContext());
    assertEquals(parquetReader.getPosition(), 0);
    int rowsProcessed = 0;
    Iterator<?> iterator = expectedValues.iterator();
    for (int batchSize = parquetReader.nextBatch(); batchSize >= 0; batchSize = parquetReader.nextBatch()) {
        ColumnDescriptor columnDescriptor = fileSchema.getColumns().get(0);
        Block block = parquetReader.readPrimitive(columnDescriptor, type);
        for (int i = 0; i < batchSize; i++) {
            assertTrue(iterator.hasNext());
            Object expected = iterator.next();
            Object actual = decodeObject(type, block, i);
            assertEquals(actual, expected);
        }
        rowsProcessed += batchSize;
        assertEquals(parquetReader.getPosition(), rowsProcessed);
    }
    assertFalse(iterator.hasNext());
    assertEquals(parquetReader.getPosition(), rowsProcessed);
    parquetReader.close();
}
Use of parquet.column.ColumnDescriptor in project presto by prestodb.
Class ParquetTypeUtils, method getDescriptor.
public static Optional<RichColumnDescriptor> getDescriptor(MessageType fileSchema, MessageType requestedSchema, List<String> path)
{
    checkArgument(path.size() >= 1, "Parquet nested path should have at least one component");
    int level = path.size();
    for (PrimitiveColumnIO columnIO : getColumns(fileSchema, requestedSchema)) {
        // fields[0] is the message root; fields[1..] are the nested path components of this leaf column
        ColumnIO[] fields = columnIO.getPath();
        if (fields.length <= level) {
            continue;
        }
        if (fields[level].getName().equalsIgnoreCase(path.get(level - 1))) {
            // The leaf name matches; now check every enclosing group name, case-insensitively
            boolean match = true;
            for (int i = 0; i < level - 1; i++) {
                if (!fields[i + 1].getName().equalsIgnoreCase(path.get(i))) {
                    match = false;
                }
            }
            if (match) {
                ColumnDescriptor descriptor = columnIO.getColumnDescriptor();
                return Optional.of(new RichColumnDescriptor(descriptor.getPath(), columnIO.getType().asPrimitiveType(), descriptor.getMaxRepetitionLevel(), descriptor.getMaxDefinitionLevel()));
            }
        }
    }
    return empty();
}
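Below is a minimal sketch of how this lookup might be called for a nested column. It is not part of Presto: the schema string and column names are invented, and the package names for ParquetTypeUtils and RichColumnDescriptor (com.facebook.presto.hive.parquet) are an assumption based on this era of the codebase; MessageTypeParser is plain parquet-mr.

import java.util.Optional;

import com.facebook.presto.hive.parquet.RichColumnDescriptor;
import com.google.common.collect.ImmutableList;

import parquet.schema.MessageType;
import parquet.schema.MessageTypeParser;

import static com.facebook.presto.hive.parquet.ParquetTypeUtils.getDescriptor;

public final class GetDescriptorExample
{
    private GetDescriptorExample() {}

    public static void main(String[] args)
    {
        MessageType fileSchema = MessageTypeParser.parseMessageType(
                "message hive_schema {"
                + "  optional group address {"
                + "    optional binary street (UTF8);"
                + "    optional binary city (UTF8);"
                + "  }"
                + "}");
        // Look up the nested column address.city; the requested schema is simply the full file schema here
        Optional<RichColumnDescriptor> descriptor = getDescriptor(fileSchema, fileSchema, ImmutableList.of("address", "city"));
        descriptor.ifPresent(column -> System.out.println(String.join(".", column.getPath())));
    }
}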