Search in sources :

Example 1 with OrcWriterStats

Use of com.facebook.presto.orc.OrcWriterStats in the project presto by prestodb.

The method testFileMetadataRawSize of the class TestWriterBlockRawSize.

/**
 * Checks the raw-size metadata stored in the file footer and in each stripe, for every
 * {@link OrcEncoding}.
 *
 * <p>One INTEGER block (alternating nulls and values) is written repeatedly so that the file
 * is expected to contain four full stripes plus one trailing stripe holding a single block.
 * The footer raw size and every stripe's raw data size are then compared against the
 * expected per-block raw size multiplied by the number of blocks they cover.
 *
 * @throws IOException if writing, validating or reading back the temporary file fails
 */
@Test
public void testFileMetadataRawSize() throws IOException {
    Type columnType = INTEGER;
    List<Type> columnTypes = ImmutableList.of(columnType);

    // Layout: 3 blocks per row group, 5 row groups per stripe, 4 full stripes + 1 extra block.
    int blocksPerRowGroup = 3;
    int blocksPerStripe = 5 * blocksPerRowGroup;
    int fullStripes = 4;
    int blocksPerFile = fullStripes * blocksPerStripe + 1;

    // Every iteration appends one null followed by one value, hence 2 * NUM_ELEMENTS positions.
    BlockBuilder builder = columnType.createBlockBuilder(null, NUM_ELEMENTS * 2);
    int value = 0;
    while (value < NUM_ELEMENTS) {
        builder.appendNull();
        columnType.writeLong(builder, value);
        value++;
    }

    // Fixed width for each written value plus NUM_ELEMENTS more
    // (presumably one unit per null entry -- confirm against the writer's raw-size rules).
    long expectedBlockRawSize = NUM_ELEMENTS + ((FixedWidthType) columnType).getFixedSize() * NUM_ELEMENTS;

    Block block = builder.build();
    Block[] pageBlocks = {block};
    int rowsPerBlock = block.getPositionCount();

    // Row-count limits are expressed in multiples of the block's position count so that
    // row group and stripe boundaries fall on block boundaries.
    OrcWriterOptions writerOptions = OrcWriterOptions.builder()
            .withRowGroupMaxRowCount(rowsPerBlock * blocksPerRowGroup)
            .withFlushPolicy(DefaultOrcWriterFlushPolicy.builder()
                    .withStripeMaxRowCount(rowsPerBlock * blocksPerStripe)
                    .build())
            .build();

    for (OrcEncoding encoding : OrcEncoding.values()) {
        try (TempFile tempFile = new TempFile()) {
            OrcWriter writer = createOrcWriter(
                    tempFile.getFile(),
                    encoding,
                    ZSTD,
                    Optional.empty(),
                    columnTypes,
                    writerOptions,
                    new OrcWriterStats());
            for (int written = 0; written < blocksPerFile; written++) {
                writer.write(new Page(pageBlocks));
            }
            writer.close();

            DataSize oneMegabyte = new DataSize(1, MEGABYTE);
            writer.validate(new FileOrcDataSource(tempFile.getFile(), oneMegabyte, oneMegabyte, oneMegabyte, true));

            Footer footer = OrcTester.getFileMetadata(tempFile.getFile(), encoding).getFooter();
            verifyValue(encoding, footer.getRawSize(), expectedBlockRawSize * blocksPerFile);
            assertEquals(footer.getStripes().size(), fullStripes + 1);

            // Walk the stripes in order; each one is expected to hold a full stripe's worth
            // of blocks until fewer than that remain.
            int blocksLeft = blocksPerFile;
            for (StripeInformation stripe : footer.getStripes()) {
                int blocksInStripe = blocksLeft < blocksPerStripe ? blocksLeft : blocksPerStripe;
                verifyValue(encoding, stripe.getRawDataSize(), expectedBlockRawSize * blocksInStripe);
                blocksLeft -= blocksInStripe;
            }
        }
    }
}
Also used : OrcWriterStats(com.facebook.presto.orc.OrcWriterStats) OrcWriter(com.facebook.presto.orc.OrcWriter) OrcTester.createOrcWriter(com.facebook.presto.orc.OrcTester.createOrcWriter) Page(com.facebook.presto.common.Page) OrcEncoding(com.facebook.presto.orc.OrcEncoding) OrcWriterOptions(com.facebook.presto.orc.OrcWriterOptions) TestOrcMapNullKey.createMapType(com.facebook.presto.orc.TestOrcMapNullKey.createMapType) TimestampType(com.facebook.presto.common.type.TimestampType) ArrayType(com.facebook.presto.common.type.ArrayType) OrcType(com.facebook.presto.orc.metadata.OrcType) Type(com.facebook.presto.common.type.Type) FixedWidthType(com.facebook.presto.common.type.FixedWidthType) RowType(com.facebook.presto.common.type.RowType) TempFile(com.facebook.presto.orc.TempFile) FileOrcDataSource(com.facebook.presto.orc.FileOrcDataSource) DataSize(io.airlift.units.DataSize) Footer(com.facebook.presto.orc.metadata.Footer) RowBlock(com.facebook.presto.common.block.RowBlock) Block(com.facebook.presto.common.block.Block) StripeInformation(com.facebook.presto.orc.metadata.StripeInformation) BlockBuilder(com.facebook.presto.common.block.BlockBuilder) FixedWidthType(com.facebook.presto.common.type.FixedWidthType) Test(org.testng.annotations.Test)

Example 2 with OrcWriterStats

Use of com.facebook.presto.orc.OrcWriterStats in the project presto by prestodb.

The method createOrcWriter of the class IcebergFileWriterFactory.

/**
 * Creates an {@link IcebergOrcFileWriter} that writes ORC data for {@code icebergSchema}
 * to {@code outputPath}.
 *
 * <p>The output stream is opened, and all later file-system calls are made, through
 * {@code hdfsEnvironment.doAs} as the session user. When write validation is enabled for
 * the session, a supplier is passed along that re-opens the written file as an
 * {@link OrcDataSource}.
 *
 * @param outputPath path of the file to create
 * @param icebergSchema schema whose columns define the file column names and types
 * @param jobConf configuration used to resolve the file system
 * @param session session supplying the user, query id and writer properties
 * @return the ORC-backed file writer
 * @throws PrestoException with {@code ICEBERG_WRITER_OPEN_ERROR} if an {@link IOException}
 *         is raised while setting up the writer
 */
private IcebergFileWriter createOrcWriter(Path outputPath, Schema icebergSchema, JobConf jobConf, ConnectorSession session) {
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getUser(), outputPath, jobConf);
        DataSink orcDataSink = hdfsEnvironment.doAs(
                session.getUser(),
                () -> new OutputStreamDataSink(fileSystem.create(outputPath)));

        // Handed to the writer as its rollback action: deletes the output file (non-recursive).
        Callable<Void> rollbackAction = () -> {
            hdfsEnvironment.doAs(session.getUser(), () -> fileSystem.delete(outputPath, false));
            return null;
        };

        List<Types.NestedField> columnFields = icebergSchema.columns();
        List<String> fileColumnNames = columnFields.stream()
                .map(field -> field.name())
                .collect(toImmutableList());
        List<Type> fileColumnTypes = columnFields.stream()
                .map(field -> toPrestoType(field.type(), typeManager))
                .collect(toImmutableList());

        Optional<Supplier<OrcDataSource>> validationInputFactory;
        if (!isOrcOptimizedWriterValidate(session)) {
            validationInputFactory = Optional.empty();
        } else {
            // Lazily re-opens the written file so its contents can be validated.
            Supplier<OrcDataSource> validationSource = () -> {
                try {
                    OrcDataSourceId dataSourceId = new OrcDataSourceId(outputPath.toString());
                    long fileLength = hdfsEnvironment.doAs(session.getUser(), () -> fileSystem.getFileStatus(outputPath).getLen());
                    return new HdfsOrcDataSource(
                            dataSourceId,
                            fileLength,
                            getOrcMaxMergeDistance(session),
                            getOrcMaxBufferSize(session),
                            getOrcStreamBufferSize(session),
                            false,
                            hdfsEnvironment.doAs(session.getUser(), () -> fileSystem.open(outputPath)),
                            readStats);
                } catch (IOException e) {
                    throw new PrestoException(ICEBERG_WRITE_VALIDATION_FAILED, e);
                }
            };
            validationInputFactory = Optional.of(validationSource);
        }

        // Arguments are listed one per line in the constructor's positional order.
        return new IcebergOrcFileWriter(
                icebergSchema,
                orcDataSink,
                rollbackAction,
                ORC,
                fileColumnNames,
                fileColumnTypes,
                toOrcType(icebergSchema),
                getCompressionCodec(session).getOrcCompressionKind(),
                orcFileWriterConfig.toOrcWriterOptionsBuilder()
                        .withFlushPolicy(DefaultOrcWriterFlushPolicy.builder()
                                .withStripeMinSize(HiveSessionProperties.getOrcOptimizedWriterMinStripeSize(session))
                                .withStripeMaxSize(HiveSessionProperties.getOrcOptimizedWriterMaxStripeSize(session))
                                .withStripeMaxRowCount(HiveSessionProperties.getOrcOptimizedWriterMaxStripeRows(session))
                                .build())
                        .withDictionaryMaxMemory(HiveSessionProperties.getOrcOptimizedWriterMaxDictionaryMemory(session))
                        .withMaxStringStatisticsLimit(HiveSessionProperties.getOrcStringStatisticsLimit(session))
                        .build(),
                // Identity mapping: the i-th file column is fed from the i-th input channel.
                IntStream.range(0, fileColumnNames.size()).toArray(),
                ImmutableMap.<String, String>builder()
                        .put(PRESTO_VERSION_NAME, nodeVersion.toString())
                        .put(PRESTO_QUERY_ID_NAME, session.getQueryId())
                        .build(),
                UTC,
                validationInputFactory,
                getOrcOptimizedWriterValidateMode(session),
                orcWriterStats,
                dwrfEncryptionProvider,
                Optional.empty());
    } catch (IOException e) {
        throw new PrestoException(ICEBERG_WRITER_OPEN_ERROR, "Error creating ORC file", e);
    }
}
Also used : HdfsEnvironment(com.facebook.presto.hive.HdfsEnvironment) HdfsOrcDataSource(com.facebook.presto.hive.orc.HdfsOrcDataSource) Types(org.apache.iceberg.types.Types) FileSystem(org.apache.hadoop.fs.FileSystem) DataSink(com.facebook.presto.common.io.DataSink) IcebergSessionProperties.getOrcMaxBufferSize(com.facebook.presto.iceberg.IcebergSessionProperties.getOrcMaxBufferSize) NodeVersion(com.facebook.presto.hive.NodeVersion) PRESTO_VERSION_NAME(com.facebook.presto.hive.HiveMetadata.PRESTO_VERSION_NAME) Path(org.apache.hadoop.fs.Path) HiveSessionProperties.getParquetWriterBlockSize(com.facebook.presto.hive.HiveSessionProperties.getParquetWriterBlockSize) OrcDataSource(com.facebook.presto.orc.OrcDataSource) FileFormatDataSourceStats(com.facebook.presto.hive.FileFormatDataSourceStats) HdfsContext(com.facebook.presto.hive.HdfsContext) TypeConverter.toPrestoType(com.facebook.presto.iceberg.TypeConverter.toPrestoType) ParquetSchemaUtil.convert(org.apache.iceberg.parquet.ParquetSchemaUtil.convert) ImmutableMap(com.google.common.collect.ImmutableMap) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) HiveDwrfEncryptionProvider(com.facebook.presto.hive.HiveDwrfEncryptionProvider) Schema(org.apache.iceberg.Schema) ConnectorSession(com.facebook.presto.spi.ConnectorSession) ORC(com.facebook.presto.orc.OrcEncoding.ORC) List(java.util.List) NOT_SUPPORTED(com.facebook.presto.spi.StandardErrorCode.NOT_SUPPORTED) ICEBERG_WRITER_OPEN_ERROR(com.facebook.presto.iceberg.IcebergErrorCode.ICEBERG_WRITER_OPEN_ERROR) IcebergSessionProperties.isOrcOptimizedWriterValidate(com.facebook.presto.iceberg.IcebergSessionProperties.isOrcOptimizedWriterValidate) Optional(java.util.Optional) OutputStreamDataSink(com.facebook.presto.common.io.OutputStreamDataSink) HiveSessionProperties(com.facebook.presto.hive.HiveSessionProperties) IntStream(java.util.stream.IntStream) 
HiveSessionProperties.getParquetWriterPageSize(com.facebook.presto.hive.HiveSessionProperties.getParquetWriterPageSize) Callable(java.util.concurrent.Callable) PrestoException(com.facebook.presto.spi.PrestoException) Supplier(java.util.function.Supplier) Inject(javax.inject.Inject) IcebergSessionProperties.getCompressionCodec(com.facebook.presto.iceberg.IcebergSessionProperties.getCompressionCodec) TypeManager(com.facebook.presto.common.type.TypeManager) Objects.requireNonNull(java.util.Objects.requireNonNull) PrimitiveTypeMapBuilder.makeTypeMap(com.facebook.presto.iceberg.util.PrimitiveTypeMapBuilder.makeTypeMap) TypeConverter.toOrcType(com.facebook.presto.iceberg.TypeConverter.toOrcType) OrcWriterStats(com.facebook.presto.orc.OrcWriterStats) Type(com.facebook.presto.common.type.Type) IcebergSessionProperties.getOrcMaxMergeDistance(com.facebook.presto.iceberg.IcebergSessionProperties.getOrcMaxMergeDistance) DwrfEncryptionProvider(com.facebook.presto.orc.DwrfEncryptionProvider) OrcDataSourceId(com.facebook.presto.orc.OrcDataSourceId) DefaultOrcWriterFlushPolicy(com.facebook.presto.orc.DefaultOrcWriterFlushPolicy) OrcFileWriterConfig(com.facebook.presto.hive.OrcFileWriterConfig) ParquetWriterOptions(com.facebook.presto.parquet.writer.ParquetWriterOptions) IOException(java.io.IOException) UTC(org.joda.time.DateTimeZone.UTC) FileFormat(org.apache.iceberg.FileFormat) ICEBERG_WRITE_VALIDATION_FAILED(com.facebook.presto.iceberg.IcebergErrorCode.ICEBERG_WRITE_VALIDATION_FAILED) JobConf(org.apache.hadoop.mapred.JobConf) IcebergSessionProperties.getOrcOptimizedWriterValidateMode(com.facebook.presto.iceberg.IcebergSessionProperties.getOrcOptimizedWriterValidateMode) PRESTO_QUERY_ID_NAME(com.facebook.presto.hive.metastore.MetastoreUtil.PRESTO_QUERY_ID_NAME) IcebergSessionProperties.getOrcStreamBufferSize(com.facebook.presto.iceberg.IcebergSessionProperties.getOrcStreamBufferSize) DataSink(com.facebook.presto.common.io.DataSink) 
OutputStreamDataSink(com.facebook.presto.common.io.OutputStreamDataSink) Types(org.apache.iceberg.types.Types) OrcDataSourceId(com.facebook.presto.orc.OrcDataSourceId) HdfsOrcDataSource(com.facebook.presto.hive.orc.HdfsOrcDataSource) PrestoException(com.facebook.presto.spi.PrestoException) IOException(java.io.IOException) TypeConverter.toPrestoType(com.facebook.presto.iceberg.TypeConverter.toPrestoType) TypeConverter.toOrcType(com.facebook.presto.iceberg.TypeConverter.toOrcType) Type(com.facebook.presto.common.type.Type) FileSystem(org.apache.hadoop.fs.FileSystem) Supplier(java.util.function.Supplier) OutputStreamDataSink(com.facebook.presto.common.io.OutputStreamDataSink)

Aggregations

Type (com.facebook.presto.common.type.Type)2 OrcWriterStats (com.facebook.presto.orc.OrcWriterStats)2 Page (com.facebook.presto.common.Page)1 Block (com.facebook.presto.common.block.Block)1 BlockBuilder (com.facebook.presto.common.block.BlockBuilder)1 RowBlock (com.facebook.presto.common.block.RowBlock)1 DataSink (com.facebook.presto.common.io.DataSink)1 OutputStreamDataSink (com.facebook.presto.common.io.OutputStreamDataSink)1 ArrayType (com.facebook.presto.common.type.ArrayType)1 FixedWidthType (com.facebook.presto.common.type.FixedWidthType)1 RowType (com.facebook.presto.common.type.RowType)1 TimestampType (com.facebook.presto.common.type.TimestampType)1 TypeManager (com.facebook.presto.common.type.TypeManager)1 FileFormatDataSourceStats (com.facebook.presto.hive.FileFormatDataSourceStats)1 HdfsContext (com.facebook.presto.hive.HdfsContext)1 HdfsEnvironment (com.facebook.presto.hive.HdfsEnvironment)1 HiveDwrfEncryptionProvider (com.facebook.presto.hive.HiveDwrfEncryptionProvider)1 PRESTO_VERSION_NAME (com.facebook.presto.hive.HiveMetadata.PRESTO_VERSION_NAME)1 HiveSessionProperties (com.facebook.presto.hive.HiveSessionProperties)1 HiveSessionProperties.getParquetWriterBlockSize (com.facebook.presto.hive.HiveSessionProperties.getParquetWriterBlockSize)1