Use of com.facebook.presto.hive.parquet.reader.ParquetReader in project presto by prestodb.
In the class ParquetTester, the method assertFileContents:
private static void assertFileContents(JobConf jobConf, TempFile tempFile, Iterable<?> expectedValues, Type type)
        throws IOException, InterruptedException
{
    Path path = new Path(tempFile.getFile().toURI());
    FileSystem fileSystem = path.getFileSystem(jobConf);

    // read the Parquet footer to obtain the file metadata and schema
    ParquetMetadata parquetMetadata = ParquetMetadataReader.readFooter(fileSystem, path);
    FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
    MessageType fileSchema = fileMetaData.getSchema();

    // wrap the file in an HdfsParquetDataSource for the reader
    long size = fileSystem.getFileStatus(path).getLen();
    FSDataInputStream inputStream = fileSystem.open(path);
    ParquetDataSource dataSource = new HdfsParquetDataSource(path, size, inputStream);

    TypeManager typeManager = new TypeRegistry();
    ParquetReader parquetReader = new ParquetReader(fileSchema, fileSchema, parquetMetadata.getBlocks(), dataSource, typeManager, new AggregatedMemoryContext());
    assertEquals(parquetReader.getPosition(), 0);

    // advance batch by batch; nextBatch() returns -1 once the file is exhausted
    int rowsProcessed = 0;
    Iterator<?> iterator = expectedValues.iterator();
    for (int batchSize = parquetReader.nextBatch(); batchSize >= 0; batchSize = parquetReader.nextBatch()) {
        ColumnDescriptor columnDescriptor = fileSchema.getColumns().get(0);
        Block block = parquetReader.readPrimitive(columnDescriptor, type);
        for (int i = 0; i < batchSize; i++) {
            assertTrue(iterator.hasNext());
            Object expected = iterator.next();
            Object actual = decodeObject(type, block, i);
            assertEquals(actual, expected);
        }
        rowsProcessed += batchSize;
        assertEquals(parquetReader.getPosition(), rowsProcessed);
    }
    assertFalse(iterator.hasNext());
    assertEquals(parquetReader.getPosition(), rowsProcessed);
    parquetReader.close();
}
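The inner loop above relies on a decodeObject helper to turn a position in the returned Block back into a plain Java value for comparison. A minimal sketch of such a helper is shown below; it assumes a ConnectorSession constant named SESSION is in scope (for example HiveTestUtils.SESSION in the Presto test code) and is not necessarily identical to ParquetTester's own implementation.

import com.facebook.presto.spi.block.Block;
import com.facebook.presto.spi.type.Type;

// Sketch of a decodeObject helper; SESSION is assumed to be a ConnectorSession constant
// available to the test class. Hypothetical, for illustration only.
private static Object decodeObject(Type type, Block block, int position)
{
    if (block.isNull(position)) {
        // a null position decodes to null regardless of type
        return null;
    }
    // Type.getObjectValue converts the block position into the type's Java representation
    return type.getObjectValue(SESSION, block, position);
}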
Use of com.facebook.presto.hive.parquet.reader.ParquetReader in project presto by prestodb.
In the class ParquetPageSourceFactory, the method createParquetPageSource:
public static ParquetPageSource createParquetPageSource(
        HdfsEnvironment hdfsEnvironment,
        String user,
        Configuration configuration,
        Path path,
        long start,
        long length,
        Properties schema,
        List<HiveColumnHandle> columns,
        boolean useParquetColumnNames,
        TypeManager typeManager,
        boolean predicatePushdownEnabled,
        TupleDomain<HiveColumnHandle> effectivePredicate)
{
    AggregatedMemoryContext systemMemoryContext = new AggregatedMemoryContext();

    ParquetDataSource dataSource = null;
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(user, path, configuration);
        dataSource = buildHdfsParquetDataSource(fileSystem, path, start, length);
        ParquetMetadata parquetMetadata = ParquetMetadataReader.readFooter(fileSystem, path);
        FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
        MessageType fileSchema = fileMetaData.getSchema();

        // project only the regular Hive columns that are present in the Parquet file schema
        List<parquet.schema.Type> fields = columns.stream()
                .filter(column -> column.getColumnType() == REGULAR)
                .map(column -> getParquetType(column, fileSchema, useParquetColumnNames))
                .filter(Objects::nonNull)
                .collect(toList());
        MessageType requestedSchema = new MessageType(fileSchema.getName(), fields);

        // keep only the row groups whose first data page falls inside this split
        List<BlockMetaData> blocks = new ArrayList<>();
        for (BlockMetaData block : parquetMetadata.getBlocks()) {
            long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
            if (firstDataPage >= start && firstDataPage < start + length) {
                blocks.add(block);
            }
        }

        if (predicatePushdownEnabled) {
            // drop row groups whose statistics cannot match the effective predicate
            ParquetPredicate parquetPredicate = buildParquetPredicate(columns, effectivePredicate, fileMetaData.getSchema(), typeManager);
            final ParquetDataSource finalDataSource = dataSource;
            blocks = blocks.stream()
                    .filter(block -> predicateMatches(parquetPredicate, block, finalDataSource, requestedSchema, effectivePredicate))
                    .collect(toList());
        }

        ParquetReader parquetReader = new ParquetReader(fileSchema, requestedSchema, blocks, dataSource, typeManager, systemMemoryContext);
        return new ParquetPageSource(parquetReader, dataSource, fileSchema, requestedSchema, length, schema, columns, effectivePredicate, typeManager, useParquetColumnNames, systemMemoryContext);
    }
    catch (Exception e) {
        // close the data source before translating the failure into a PrestoException
        try {
            if (dataSource != null) {
                dataSource.close();
            }
        }
        catch (IOException ignored) {
        }
        if (e instanceof PrestoException) {
            throw (PrestoException) e;
        }
        String message = format("Error opening Hive split %s (offset=%s, length=%s): %s", path, start, length, e.getMessage());
        if (e.getClass().getSimpleName().equals("BlockMissingException")) {
            throw new PrestoException(HIVE_MISSING_DATA, message, e);
        }
        throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
    }
}
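For context, the ParquetPageSource returned by this factory is consumed batch by batch through the ConnectorPageSource interface. Below is a minimal, hypothetical consumption sketch (not part of ParquetPageSourceFactory), assuming pageSource is the page source created above.

import com.facebook.presto.spi.ConnectorPageSource;
import com.facebook.presto.spi.Page;
import java.io.IOException;

// Hypothetical helper for illustration: drains every page from a ConnectorPageSource,
// such as the ParquetPageSource created above, and returns the total row count.
static long countRows(ConnectorPageSource pageSource)
        throws IOException
{
    long rows = 0;
    while (!pageSource.isFinished()) {
        Page page = pageSource.getNextPage();
        if (page == null) {
            // no page is ready yet for this split; keep polling
            continue;
        }
        rows += page.getPositionCount();
    }
    pageSource.close();
    return rows;
}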