Use of com.facebook.presto.orc.OrcEncoding.ORC in the project presto by prestodb.
From the class AbstractTestOrcReader, method testCaching.
/**
 * Checks that the caching file-tail and stripe-metadata sources are populated by a first
 * reader and then served from cache for a second reader over the same file.
 *
 * <p>Every cache records stats, so the test asserts exact miss/hit counts after each step:
 * creating a reader touches the file-tail cache, and the first {@code nextBatch()} touches
 * the stripe footer, stripe stream and row-group index caches.
 *
 * <p>Both readers are managed by try-with-resources so they are closed even when an
 * assertion fails part-way through.
 */
@Test
public void testCaching() throws Exception {
    // All caches share the same weight budget and access-expiry.
    long maxCacheWeightBytes = new DataSize(1, MEGABYTE).toBytes();
    long cacheExpiryMillis = new Duration(10, MINUTES).toMillis();

    Cache<OrcDataSourceId, OrcFileTail> orcFileTailCache = CacheBuilder.newBuilder()
            .maximumWeight(maxCacheWeightBytes)
            .weigher((id, tail) -> ((OrcFileTail) tail).getFooterSize() + ((OrcFileTail) tail).getMetadataSize())
            .expireAfterAccess(cacheExpiryMillis, MILLISECONDS)
            .recordStats()
            .build();
    OrcFileTailSource orcFileTailSource = new CachingOrcFileTailSource(new StorageOrcFileTailSource(), orcFileTailCache);

    Cache<StripeId, Slice> stripeFooterCache = CacheBuilder.newBuilder()
            .maximumWeight(maxCacheWeightBytes)
            .weigher((id, footer) -> ((Slice) footer).length())
            .expireAfterAccess(cacheExpiryMillis, MILLISECONDS)
            .recordStats()
            .build();
    Cache<StripeStreamId, Slice> stripeStreamCache = CacheBuilder.newBuilder()
            .maximumWeight(maxCacheWeightBytes)
            .weigher((id, stream) -> ((Slice) stream).length())
            .expireAfterAccess(cacheExpiryMillis, MILLISECONDS)
            .recordStats()
            .build();
    Optional<Cache<StripeStreamId, List<RowGroupIndex>>> rowGroupIndexCache = Optional.of(CacheBuilder.newBuilder()
            .maximumWeight(maxCacheWeightBytes)
            .weigher((id, rowGroupIndices) -> toIntExact(((List<RowGroupIndex>) rowGroupIndices).stream().mapToLong(RowGroupIndex::getRetainedSizeInBytes).sum()))
            .expireAfterAccess(cacheExpiryMillis, MILLISECONDS)
            .recordStats()
            .build());
    StripeMetadataSource stripeMetadataSource = new CachingStripeMetadataSource(new StorageStripeMetadataSource(), stripeFooterCache, stripeStreamCache, rowGroupIndexCache);

    try (TempFile tempFile = createTempFile(10001);
            OrcBatchRecordReader storageReader = createCustomOrcRecordReader(tempFile, ORC, OrcPredicate.TRUE, ImmutableList.of(BIGINT), INITIAL_BATCH_SIZE, orcFileTailSource, stripeMetadataSource, true, ImmutableMap.of(), false)) {
        // First reader: the file tail is not cached yet.
        assertEquals(orcFileTailCache.stats().missCount(), 1);
        assertEquals(orcFileTailCache.stats().hitCount(), 0);

        try (OrcBatchRecordReader cacheReader = createCustomOrcRecordReader(tempFile, ORC, OrcPredicate.TRUE, ImmutableList.of(BIGINT), INITIAL_BATCH_SIZE, orcFileTailSource, stripeMetadataSource, true, ImmutableMap.of(), false)) {
            // Second reader over the same file: the file tail comes from the cache.
            assertEquals(orcFileTailCache.stats().missCount(), 1);
            assertEquals(orcFileTailCache.stats().hitCount(), 1);

            assertEquals(storageReader.getRetainedSizeInBytes(), cacheReader.getRetainedSizeInBytes());
            assertEquals(storageReader.getFileRowCount(), cacheReader.getFileRowCount());
            assertEquals(storageReader.getSplitLength(), cacheReader.getSplitLength());

            // First batch on the first reader populates the stripe-level caches.
            storageReader.nextBatch();
            assertEquals(stripeFooterCache.stats().missCount(), 1);
            assertEquals(stripeFooterCache.stats().hitCount(), 0);
            assertEquals(stripeStreamCache.stats().missCount(), 2);
            assertEquals(stripeStreamCache.stats().hitCount(), 0);
            assertEquals(rowGroupIndexCache.get().stats().missCount(), 1);
            assertEquals(rowGroupIndexCache.get().stats().hitCount(), 0);

            // First batch on the second reader is served entirely from those caches.
            cacheReader.nextBatch();
            assertEquals(stripeFooterCache.stats().missCount(), 1);
            assertEquals(stripeFooterCache.stats().hitCount(), 1);
            assertEquals(stripeStreamCache.stats().missCount(), 2);
            assertEquals(stripeStreamCache.stats().hitCount(), 2);
            assertEquals(rowGroupIndexCache.get().stats().missCount(), 1);
            assertEquals(rowGroupIndexCache.get().stats().hitCount(), 1);

            // Both readers must yield the same first value.
            assertEquals(storageReader.readBlock(0).getInt(0), cacheReader.readBlock(0).getInt(0));
        }
    }
}
Use of com.facebook.presto.orc.OrcEncoding.ORC in the project presto by prestodb.
From the class IcebergFileWriterFactory, method createOrcWriter.
/**
 * Builds an {@link IcebergOrcFileWriter} that writes the given Iceberg schema to
 * {@code outputPath}.
 *
 * <p>File system calls are made through {@code hdfsEnvironment.doAs} as the session user.
 * The rollback action deletes the output file. When write validation is enabled for the
 * session, a supplier that re-opens the written file as an {@link OrcDataSource} is passed
 * to the writer; otherwise the supplier is absent.
 *
 * @param outputPath location of the ORC file to create
 * @param icebergSchema schema whose top-level columns supply the file column names and types
 * @param jobConf configuration used to obtain the file system
 * @param session session supplying the user, query id and writer properties
 * @return the configured writer
 * @throws PrestoException with {@code ICEBERG_WRITER_OPEN_ERROR} if an {@link IOException}
 *         is raised while setting up the writer
 */
private IcebergFileWriter createOrcWriter(Path outputPath, Schema icebergSchema, JobConf jobConf, ConnectorSession session) {
    String user = session.getUser();
    try {
        FileSystem fs = hdfsEnvironment.getFileSystem(user, outputPath, jobConf);
        DataSink sink = hdfsEnvironment.doAs(user, () -> new OutputStreamDataSink(fs.create(outputPath)));
        Callable<Void> rollback = () -> {
            hdfsEnvironment.doAs(user, () -> fs.delete(outputPath, false));
            return null;
        };

        List<Types.NestedField> fields = icebergSchema.columns();
        List<String> columnNames = fields.stream()
                .map(Types.NestedField::name)
                .collect(toImmutableList());
        List<Type> columnTypes = fields.stream()
                .map(field -> toPrestoType(field.type(), typeManager))
                .collect(toImmutableList());
        // Every file column is written, in schema order.
        int[] columnIndexes = IntStream.range(0, columnNames.size()).toArray();

        // Only supply a validation input when the session asks for write validation.
        Optional<Supplier<OrcDataSource>> validationInput = isOrcOptimizedWriterValidate(session)
                ? Optional.of(() -> {
                    try {
                        return new HdfsOrcDataSource(
                                new OrcDataSourceId(outputPath.toString()),
                                hdfsEnvironment.doAs(user, () -> fs.getFileStatus(outputPath).getLen()),
                                getOrcMaxMergeDistance(session),
                                getOrcMaxBufferSize(session),
                                getOrcStreamBufferSize(session),
                                false,
                                hdfsEnvironment.doAs(user, () -> fs.open(outputPath)),
                                readStats);
                    }
                    catch (IOException e) {
                        throw new PrestoException(ICEBERG_WRITE_VALIDATION_FAILED, e);
                    }
                })
                : Optional.empty();

        return new IcebergOrcFileWriter(
                icebergSchema,
                sink,
                rollback,
                ORC,
                columnNames,
                columnTypes,
                toOrcType(icebergSchema),
                getCompressionCodec(session).getOrcCompressionKind(),
                orcFileWriterConfig.toOrcWriterOptionsBuilder()
                        .withFlushPolicy(DefaultOrcWriterFlushPolicy.builder()
                                .withStripeMinSize(HiveSessionProperties.getOrcOptimizedWriterMinStripeSize(session))
                                .withStripeMaxSize(HiveSessionProperties.getOrcOptimizedWriterMaxStripeSize(session))
                                .withStripeMaxRowCount(HiveSessionProperties.getOrcOptimizedWriterMaxStripeRows(session))
                                .build())
                        .withDictionaryMaxMemory(HiveSessionProperties.getOrcOptimizedWriterMaxDictionaryMemory(session))
                        .withMaxStringStatisticsLimit(HiveSessionProperties.getOrcStringStatisticsLimit(session))
                        .build(),
                columnIndexes,
                ImmutableMap.<String, String>builder()
                        .put(PRESTO_VERSION_NAME, nodeVersion.toString())
                        .put(PRESTO_QUERY_ID_NAME, session.getQueryId())
                        .build(),
                UTC,
                validationInput,
                getOrcOptimizedWriterValidateMode(session),
                orcWriterStats,
                dwrfEncryptionProvider,
                Optional.empty());
    }
    catch (IOException e) {
        throw new PrestoException(ICEBERG_WRITER_OPEN_ERROR, "Error creating ORC file", e);
    }
}
Use of com.facebook.presto.orc.OrcEncoding.ORC in the project presto by prestodb.
From the class TestOrcReaderPositions, method testFilterFunctionWithAppendRowNumber.
/**
 * Reads a sequential file through a selective reader with an {@code IsOddPredicate}
 * filter function and row-number appending enabled, then checks the result against the
 * odd values in {@code [0, rowCount)} via {@code verifyAppendNumber}.
 */
@Test
public void testFilterFunctionWithAppendRowNumber() throws Exception {
    try (TempFile tempFile = new TempFile()) {
        int rowCount = 100;
        createSequentialFile(tempFile.getFile(), rowCount);

        // Derive the expectation from rowCount so it cannot drift from the file size.
        // Filtering on the primitive stream avoids boxing values that are discarded.
        List<Long> expectedValues = LongStream.range(0, rowCount)
                .filter(value -> value % 2 != 0)
                .boxed()
                .collect(ArrayList::new, List::add, List::addAll);

        ConnectorSession session = new TestingConnectorSession(ImmutableList.of());
        FilterFunction filter = new FilterFunction(session.getSqlFunctionProperties(), true, new IsOddPredicate());
        OrcSelectiveRecordReader reader = createCustomOrcSelectiveRecordReader(
                tempFile.getFile(),
                ORC,
                OrcPredicate.TRUE,
                ImmutableList.of(BIGINT),
                MAX_BATCH_SIZE,
                ImmutableMap.of(),
                ImmutableList.of(filter),
                ImmutableMap.of(0, 0),
                ImmutableMap.of(),
                ImmutableMap.of(),
                ImmutableMap.of(),
                ImmutableMap.of(0, BIGINT),
                ImmutableList.of(0),
                false,
                new TestingHiveOrcAggregatedMemoryContext(),
                true);
        verifyAppendNumber(expectedValues, reader);
    }
}
Use of com.facebook.presto.orc.OrcEncoding.ORC in the project presto by prestodb.
From the class TestOrcReaderPositions, method testRowGroupSkipping.
/**
 * Writes a sequential file of 142,000 rows and reads it with a predicate that accepts
 * the whole-file statistics plus only the row groups whose minimum is 50,000 or 60,000.
 * Asserts that the values and reported positions run contiguously from 50,000 up to
 * 70,000, and that the reader finishes positioned at the end of the file.
 */
@Test
public void testRowGroupSkipping() throws Exception {
    try (TempFile tempFile = new TempFile()) {
        // single stripe, several row groups
        int rowCount = 142_000;
        createSequentialFile(tempFile.getFile(), rowCount);

        // select two row groups from the middle of the file
        OrcPredicate predicate = (numberOfRows, statisticsByColumnIndex) -> {
            if (numberOfRows != rowCount) {
                IntegerStatistics stats = statisticsByColumnIndex.get(0).getIntegerStatistics();
                return (stats.getMin() == 50_000) || (stats.getMin() == 60_000);
            }
            // statistics spanning the entire file always match
            return true;
        };

        try (OrcBatchRecordReader reader = createCustomOrcRecordReader(tempFile, ORC, predicate, BIGINT, MAX_BATCH_SIZE, false, false)) {
            assertEquals(reader.getFileRowCount(), rowCount);
            assertEquals(reader.getReaderRowCount(), rowCount);
            assertEquals(reader.getFilePosition(), 0);
            assertEquals(reader.getReaderPosition(), 0);

            long position = 50_000;
            // nextBatch() returns -1 once the reader is exhausted
            for (int batchSize = reader.nextBatch(); batchSize != -1; batchSize = reader.nextBatch()) {
                Block block = reader.readBlock(0);
                for (int offset = 0; offset < batchSize; offset++) {
                    assertEquals(BIGINT.getLong(block, offset), position + offset);
                }
                assertEquals(reader.getFilePosition(), position);
                assertEquals(reader.getReaderPosition(), position);
                position += batchSize;
            }

            assertEquals(position, 70_000);
            assertEquals(reader.getFilePosition(), rowCount);
            assertEquals(reader.getReaderPosition(), rowCount);
        }
    }
}
Use of com.facebook.presto.orc.OrcEncoding.ORC in the project presto by prestodb.
From the class TestOrcReaderPositions, method testRowGroupSkippingWithAppendRowNumber.
/**
 * Writes a sequential file of 142,000 rows and reads it through a selective reader with a
 * predicate that accepts the whole-file statistics plus only the row groups whose minimum
 * is 50,000 or 70,000. The expected output, checked by {@code verifyAppendNumber}, is the
 * values 50,000..59,999 followed by 70,000..79,999.
 */
@Test
public void testRowGroupSkippingWithAppendRowNumber() throws Exception {
    try (TempFile tempFile = new TempFile()) {
        // single stripe, several row groups
        int rowCount = 142_000;
        createSequentialFile(tempFile.getFile(), rowCount);

        // select two non-adjacent row groups from the middle of the file
        OrcPredicate predicate = (numberOfRows, statisticsByColumnIndex) -> {
            if (numberOfRows != rowCount) {
                IntegerStatistics stats = statisticsByColumnIndex.get(0).getIntegerStatistics();
                return (stats.getMin() == 50_000) || (stats.getMin() == 70_000);
            }
            // statistics spanning the entire file always match
            return true;
        };

        List<Long> expectedValues = new ArrayList<>();
        for (long value = 50_000; value < 60_000; value++) {
            expectedValues.add(value);
        }
        for (long value = 70_000; value < 80_000; value++) {
            expectedValues.add(value);
        }

        OrcSelectiveRecordReader reader = createCustomOrcSelectiveRecordReader(tempFile, ORC, predicate, BIGINT, MAX_BATCH_SIZE, false, true);
        verifyAppendNumber(expectedValues, reader);
    }
}
Aggregations