Use of io.prestosql.plugin.hive.FileFormatDataSourceStats in project hetu-core by openlookeng.
The class TestOrcDeleteDeltaPageSource, method testReadingDeletedRows.
@Test
public void testReadingDeletedRows() {
    OrcDeleteDeltaPageSourceFactory pageSourceFactory = new OrcDeleteDeltaPageSourceFactory("test", new JobConf(new Configuration(false)), HiveTestUtils.HDFS_ENVIRONMENT, new DataSize(1, MEGABYTE), new DataSize(8, MEGABYTE), new DataSize(8, MEGABYTE), new DataSize(16, MEGABYTE), new DataSize(8, MEGABYTE), true, false, new FileFormatDataSourceStats());
    OrcDeleteDeltaPageSource pageSource = pageSourceFactory.createPageSource(new Path(DELETE_FILE.toURI()), DELETE_FILE.length(), DELETE_FILE.lastModified());
    MaterializedResult materializedRows = MaterializedResult.materializeSourceDataStream(HiveTestUtils.SESSION, pageSource, ImmutableList.of(BIGINT, INTEGER, BIGINT));
    assertEquals(materializedRows.getRowCount(), 1);
    assertEquals(materializedRows.getMaterializedRows().get(0), new MaterializedRow(5, 2L, 536870912, 0L));
}
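As a hedged follow-up (not part of the original test), the single materialized row can be unpacked field by field. The names originalTransaction, bucket and rowId below are an assumption based on the usual Hive ACID delete-delta layout; they are not taken from this source:

// Sketch only: field names assumed, values copied from the assertion above.
MaterializedRow row = materializedRows.getMaterializedRows().get(0);
long originalTransaction = (long) row.getField(0);   // 2L in this fixture
int bucket = (int) row.getField(1);                   // 536870912 (encoded bucket property)
long rowId = (long) row.getField(2);                  // 0L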
Use of io.prestosql.plugin.hive.FileFormatDataSourceStats in project boostkit-bigdata by kunpengcompute.
The class TestOrcDeletedRows, method createOrcDeletedRows.
private OrcDeletedRows createOrcDeletedRows(Optional<DeleteDeltaLocations> deleteDeltaLocations) {
    JobConf configuration = new JobConf(new Configuration(false));
    OrcDeleteDeltaPageSourceFactory pageSourceFactory = new OrcDeleteDeltaPageSourceFactory("test", configuration, HiveTestUtils.HDFS_ENVIRONMENT, new DataSize(1, MEGABYTE), new DataSize(8, MEGABYTE), new DataSize(8, MEGABYTE), new DataSize(16, MEGABYTE), new DataSize(8, MEGABYTE), true, false, new FileFormatDataSourceStats());
    return new OrcDeletedRows("bucket_00000", deleteDeltaLocations, pageSourceFactory, "test", configuration, HiveTestUtils.HDFS_ENVIRONMENT, Optional.empty());
}
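A minimal, hypothetical call of this helper (not taken from the original test class), covering the case where no delete deltas exist, simply passes an empty Optional:

// Hypothetical usage of the helper above.
OrcDeletedRows deletedRows = createOrcDeletedRows(Optional.empty());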
Use of io.prestosql.plugin.hive.FileFormatDataSourceStats in project boostkit-bigdata by kunpengcompute.
The class ParquetPageSourceFactory, method createParquetPageSource.
public static ParquetPageSource createParquetPageSource(HdfsEnvironment hdfsEnvironment, String user, Configuration configuration, Path path, long start, long length, long fileSize, Properties schema, List<HiveColumnHandle> columns, boolean useParquetColumnNames, boolean failOnCorruptedParquetStatistics, DataSize maxReadBlockSize, TypeManager typeManager, TupleDomain<HiveColumnHandle> effectivePredicate, FileFormatDataSourceStats stats, DateTimeZone timeZone) {
    AggregatedMemoryContext systemMemoryContext = newSimpleAggregatedMemoryContext();
    ParquetDataSource dataSource = null;
    DateTimeZone readerTimeZone = timeZone;
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(user, path, configuration);
        FSDataInputStream inputStream = hdfsEnvironment.doAs(user, () -> fileSystem.open(path));
        ParquetMetadata parquetMetadata = MetadataReader.readFooter(inputStream, path, fileSize);
        FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
        MessageType fileSchema = fileMetaData.getSchema();
        dataSource = buildHdfsParquetDataSource(inputStream, path, fileSize, stats);
        String writerTimeZoneId = fileMetaData.getKeyValueMetaData().get(WRITER_TIME_ZONE_KEY);
        if (writerTimeZoneId != null && !writerTimeZoneId.equalsIgnoreCase(readerTimeZone.getID())) {
            readerTimeZone = DateTimeZone.forID(writerTimeZoneId);
        }
        List<org.apache.parquet.schema.Type> fields = columns.stream().filter(column -> column.getColumnType() == REGULAR).map(column -> getParquetType(column, fileSchema, useParquetColumnNames)).filter(Objects::nonNull).collect(toList());
        MessageType requestedSchema = new MessageType(fileSchema.getName(), fields);
        ImmutableList.Builder<BlockMetaData> footerBlocks = ImmutableList.builder();
        for (BlockMetaData block : parquetMetadata.getBlocks()) {
            long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
            if (firstDataPage >= start && firstDataPage < start + length) {
                footerBlocks.add(block);
            }
        }
        Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, requestedSchema);
        TupleDomain<ColumnDescriptor> parquetTupleDomain = getParquetTupleDomain(descriptorsByPath, effectivePredicate);
        Predicate parquetPredicate = buildPredicate(requestedSchema, parquetTupleDomain, descriptorsByPath);
        final ParquetDataSource finalDataSource = dataSource;
        ImmutableList.Builder<BlockMetaData> blocks = ImmutableList.builder();
        for (BlockMetaData block : footerBlocks.build()) {
            if (predicateMatches(parquetPredicate, block, finalDataSource, descriptorsByPath, parquetTupleDomain, failOnCorruptedParquetStatistics)) {
                blocks.add(block);
            }
        }
        MessageColumnIO messageColumnIO = getColumnIO(fileSchema, requestedSchema);
        ParquetReader parquetReader = new ParquetReader(Optional.ofNullable(fileMetaData.getCreatedBy()), messageColumnIO, blocks.build(), dataSource, readerTimeZone, systemMemoryContext, maxReadBlockSize);
        return new ParquetPageSource(parquetReader, fileSchema, messageColumnIO, typeManager, schema, columns, effectivePredicate, useParquetColumnNames);
    }
    catch (Exception e) {
        try {
            if (dataSource != null) {
                dataSource.close();
            }
        }
        catch (IOException ignored) {
        }
        if (e instanceof PrestoException) {
            throw (PrestoException) e;
        }
        if (e instanceof ParquetCorruptionException) {
            throw new PrestoException(HIVE_BAD_DATA, e);
        }
        if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") || e instanceof FileNotFoundException) {
            throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, e);
        }
        String message = format("Error opening Hive split %s (offset=%s, length=%s): %s", path, start, length, e.getMessage());
        if (e instanceof BlockMissingException) {
            throw new PrestoException(HIVE_MISSING_DATA, message, e);
        }
        throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
    }
}
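For orientation, a hedged call sketch of this static method follows. Every argument (the HDFS environment, configuration, path, file size, table properties, column handles, type manager and predicate) is assumed to be prepared by the calling scope; none of the concrete values are taken from the original source:

// Illustrative sketch only; hdfsEnvironment, configuration, path, fileSize, schema,
// columns, typeManager and effectivePredicate are assumed to exist in the caller.
ParquetPageSource pageSource = ParquetPageSourceFactory.createParquetPageSource(
        hdfsEnvironment,
        "hdfs",                         // user (hypothetical)
        configuration,
        path,
        0,                              // start of the split
        fileSize,                       // length: the whole file as a single split
        fileSize,
        schema,                         // table Properties
        columns,                        // List<HiveColumnHandle>
        true,                           // useParquetColumnNames
        false,                          // failOnCorruptedParquetStatistics
        new DataSize(16, MEGABYTE),     // maxReadBlockSize
        typeManager,
        effectivePredicate,
        new FileFormatDataSourceStats(),
        DateTimeZone.UTC);              // reader time zone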
Use of io.prestosql.plugin.hive.FileFormatDataSourceStats in project boostkit-bigdata by kunpengcompute.
The class TestParquetPageSourceFactory, method setUp.
@BeforeClass
public void setUp() {
    HiveHdfsConfiguration hiveHdfsConfiguration = new HiveHdfsConfiguration(new HdfsConfigurationInitializer(new HiveConfig(), ImmutableSet.of()), ImmutableSet.of());
    HdfsEnvironment hdfsEnvironment = new HdfsEnvironment(hiveHdfsConfiguration, new HiveConfig(), new NoHdfsAuthentication());
    parquetPageSourceFactory = new ParquetPageSourceFactory(new TestingTypeManager(), hdfsEnvironment, new FileFormatDataSourceStats(), new HiveConfig());
}
Use of io.prestosql.plugin.hive.FileFormatDataSourceStats in project hetu-core by openlookeng.
The class TestParquetPageSourceFactory, method setUp.
@BeforeClass
public void setUp() {
    HiveHdfsConfiguration hiveHdfsConfiguration = new HiveHdfsConfiguration(new HdfsConfigurationInitializer(new HiveConfig(), ImmutableSet.of()), ImmutableSet.of());
    HdfsEnvironment hdfsEnvironment = new HdfsEnvironment(hiveHdfsConfiguration, new HiveConfig(), new NoHdfsAuthentication());
    parquetPageSourceFactory = new ParquetPageSourceFactory(new TestingTypeManager(), hdfsEnvironment, new FileFormatDataSourceStats(), new HiveConfig());
}