Example 1 with OutputStreamDataSink

Use of com.facebook.presto.common.io.OutputStreamDataSink in project presto by prestodb.

In class OrcTester, method createOrcWriter.

public static OrcWriter createOrcWriter(File outputFile, OrcEncoding encoding, CompressionKind compression, Optional<DwrfWriterEncryption> dwrfWriterEncryption, List<Type> types, OrcWriterOptions writerOptions, WriterStats stats) throws FileNotFoundException {
    List<String> columnNames = makeColumnNames(types.size());
    ImmutableMap.Builder<String, String> metadata = ImmutableMap.builder();
    metadata.put("columns", String.join(", ", columnNames));
    metadata.put("columns.types", createSettableStructObjectInspector(types).getTypeName());
    return new OrcWriter(
            new OutputStreamDataSink(new FileOutputStream(outputFile)),
            columnNames,
            types,
            encoding,
            compression,
            dwrfWriterEncryption,
            new DwrfEncryptionProvider(new UnsupportedEncryptionLibrary(), new TestingEncryptionLibrary()),
            writerOptions,
            ImmutableMap.of(),
            HIVE_STORAGE_TIME_ZONE,
            true,
            BOTH,
            stats);
}
Also used: FileOutputStream(java.io.FileOutputStream) OutputStreamDataSink(com.facebook.presto.common.io.OutputStreamDataSink) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) ImmutableMap(com.google.common.collect.ImmutableMap)
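
For context, a minimal caller sketch. It is not part of the source: the temp file, the single-BIGINT schema, and somePage are hypothetical, exception handling is elided, and the OrcTester statics (ORC, ZLIB, etc.) are assumed to be in scope.

File outputFile = File.createTempFile("orc-writer", ".orc");
List<Type> types = ImmutableList.of(BIGINT);
// createOrcWriter wires the FileOutputStream into an OutputStreamDataSink internally
OrcWriter writer = createOrcWriter(
        outputFile,
        ORC,                                // OrcEncoding
        ZLIB,                               // CompressionKind
        Optional.empty(),                   // no DWRF writer encryption
        types,
        OrcWriterOptions.builder().build(),
        new OrcWriterStats());
writer.write(somePage);                     // somePage: a Page with one BIGINT block
writer.close();                             // flushes and closes the underlying sink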

Example 2 with OutputStreamDataSink

Use of com.facebook.presto.common.io.OutputStreamDataSink in project presto by prestodb.

In class TestOrcWriter, method testStreamOrder.

private void testStreamOrder(OrcEncoding encoding, CompressionKind kind, OptionalInt level, StreamLayoutFactory streamLayoutFactory, Supplier<Consumer<Stream>> streamConsumerFactory) throws IOException {
    OrcWriterOptions orcWriterOptions = OrcWriterOptions.builder()
            .withFlushPolicy(DefaultOrcWriterFlushPolicy.builder()
                    .withStripeMinSize(new DataSize(0, MEGABYTE))
                    .withStripeMaxSize(new DataSize(32, MEGABYTE))
                    .withStripeMaxRowCount(ORC_STRIPE_SIZE)
                    .build())
            .withRowGroupMaxRowCount(ORC_ROW_GROUP_SIZE)
            .withDictionaryMaxMemory(new DataSize(32, MEGABYTE))
            .withCompressionLevel(level)
            .withStreamLayoutFactory(streamLayoutFactory)
            .build();
    for (OrcWriteValidationMode validationMode : OrcWriteValidationMode.values()) {
        TempFile tempFile = new TempFile();
        OrcWriter writer = new OrcWriter(
                new OutputStreamDataSink(new FileOutputStream(tempFile.getFile())),
                ImmutableList.of("test1", "test2", "test3", "test4", "test5"),
                ImmutableList.of(VARCHAR, VARCHAR, VARCHAR, VARCHAR, VARCHAR),
                encoding,
                kind,
                Optional.empty(),
                NO_ENCRYPTION,
                orcWriterOptions,
                ImmutableMap.of(),
                HIVE_STORAGE_TIME_ZONE,
                true,
                validationMode,
                new OrcWriterStats());
        // write some data with unsorted streams
        String[] data = new String[] { "a", "bbbbb", "ccc", "dd", "eeee" };
        Block[] blocks = new Block[data.length];
        int entries = 65536;
        BlockBuilder blockBuilder = VARCHAR.createBlockBuilder(null, entries);
        for (int i = 0; i < data.length; i++) {
            byte[] bytes = data[i].getBytes();
            for (int j = 0; j < entries; j++) {
                // force to write different data
                bytes[0] = (byte) ((bytes[0] + 1) % 128);
                blockBuilder.writeBytes(Slices.wrappedBuffer(bytes, 0, bytes.length), 0, bytes.length);
                blockBuilder.closeEntry();
            }
            blocks[i] = blockBuilder.build();
            blockBuilder = blockBuilder.newBlockBuilderLike(null);
        }
        writer.write(new Page(blocks));
        writer.close();
        for (StripeFooter stripeFooter : OrcTester.getStripes(tempFile.getFile(), encoding)) {
            Consumer<Stream> streamConsumer = streamConsumerFactory.get();
            boolean dataStreamStarted = false;
            for (Stream stream : stripeFooter.getStreams()) {
                if (isIndexStream(stream)) {
                    assertFalse(dataStreamStarted);
                    continue;
                }
                dataStreamStarted = true;
                streamConsumer.accept(stream);
            }
        }
    }
}
Also used: Page(com.facebook.presto.common.Page) StripeFooter(com.facebook.presto.orc.metadata.StripeFooter) DataSize(io.airlift.units.DataSize) FileOutputStream(java.io.FileOutputStream) OrcWriteValidationMode(com.facebook.presto.orc.OrcWriteValidation.OrcWriteValidationMode) Block(com.facebook.presto.common.block.Block) StripeReader.isIndexStream(com.facebook.presto.orc.StripeReader.isIndexStream) Stream(com.facebook.presto.orc.metadata.Stream) OutputStreamDataSink(com.facebook.presto.common.io.OutputStreamDataSink) BlockBuilder(com.facebook.presto.common.block.BlockBuilder)
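
The streamConsumerFactory parameter is what each layout-specific test varies. As a sketch only (not taken from TestOrcWriter), a consumer for a by-stream-size layout might assert that data streams arrive in non-decreasing length order:

Supplier<Consumer<Stream>> bySizeConsumerFactory = () -> new Consumer<Stream>() {
    private long previousLength = -1;

    @Override
    public void accept(Stream stream) {
        // hypothetical invariant: a size-ordered layout emits the smallest streams first
        assertTrue(stream.getLength() >= previousLength);
        previousLength = stream.getLength();
    }
};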

Example 3 with OutputStreamDataSink

Use of com.facebook.presto.common.io.OutputStreamDataSink in project presto by prestodb.

In class TestTempStorageSingleStreamSpiller, method assertSpill.

private void assertSpill(boolean compression, boolean encryption) throws Exception {
    File spillPath = new File(tempDirectory, UUID.randomUUID().toString());
    // The executor won't be closed, because we don't call destroy() on the spiller factory.
    TempStorageSingleStreamSpillerFactory spillerFactory = new TempStorageSingleStreamSpillerFactory(
            new TestingTempStorageManager(spillPath.toString()),
            executor,
            new BlockEncodingManager(),
            new SpillerStats(),
            compression,
            encryption,
            LocalTempStorage.NAME);
    LocalMemoryContext memoryContext = newSimpleAggregatedMemoryContext().newLocalMemoryContext("test");
    SingleStreamSpiller singleStreamSpiller = spillerFactory.create(TYPES, new TestingSpillContext(), memoryContext);
    assertTrue(singleStreamSpiller instanceof TempStorageSingleStreamSpiller);
    TempStorageSingleStreamSpiller spiller = (TempStorageSingleStreamSpiller) singleStreamSpiller;
    Page page = buildPage();
    // The spillers will reserve memory in their constructors
    int retainedSizeForEmptyDataSink = toIntExact(new OutputStreamDataSink(new DynamicSliceOutput(0)).getRetainedSizeInBytes());
    assertEquals(memoryContext.getBytes(), retainedSizeForEmptyDataSink);
    spiller.spill(page).get();
    spiller.spill(Iterators.forArray(page, page, page)).get();
    assertEquals(listFiles(spillPath.toPath()).size(), 1);
    // The spillers release their memory reservations only when they are closed,
    // so at this point the reservation is still non-zero.
    // assertEquals(memoryContext.getBytes(), 0);
    Iterator<Page> spilledPagesIterator = spiller.getSpilledPages();
    assertEquals(memoryContext.getBytes(), retainedSizeForEmptyDataSink);
    ImmutableList<Page> spilledPages = ImmutableList.copyOf(spilledPagesIterator);
    // The spillers release their memory reservations only when they are closed,
    // so at this point the reservation is still non-zero.
    // assertEquals(memoryContext.getBytes(), 0);
    assertEquals(4, spilledPages.size());
    for (int i = 0; i < 4; ++i) {
        PageAssertions.assertPageEquals(TYPES, page, spilledPages.get(i));
    }
    // Assert the spill codec flags match the expected configuration
    try (InputStream is = newInputStream(listFiles(spillPath.toPath()).get(0))) {
        Iterator<SerializedPage> serializedPages = PagesSerdeUtil.readSerializedPages(new InputStreamSliceInput(is));
        assertTrue(serializedPages.hasNext(), "at least one page should be successfully read back");
        byte markers = serializedPages.next().getPageCodecMarkers();
        assertEquals(PageCodecMarker.COMPRESSED.isSet(markers), compression);
        assertEquals(PageCodecMarker.ENCRYPTED.isSet(markers), encryption);
    }
    spiller.close();
    assertEquals(listFiles(spillPath.toPath()).size(), 0);
    assertEquals(memoryContext.getBytes(), 0);
}
Also used: LocalMemoryContext(com.facebook.presto.memory.context.LocalMemoryContext) Files.newInputStream(java.nio.file.Files.newInputStream) InputStream(java.io.InputStream) Page(com.facebook.presto.common.Page) SerializedPage(com.facebook.presto.spi.page.SerializedPage) BlockEncodingManager(com.facebook.presto.common.block.BlockEncodingManager) TestingTempStorageManager(com.facebook.presto.testing.TestingTempStorageManager) DynamicSliceOutput(io.airlift.slice.DynamicSliceOutput) InputStreamSliceInput(io.airlift.slice.InputStreamSliceInput) File(java.io.File) OutputStreamDataSink(com.facebook.presto.common.io.OutputStreamDataSink)
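
The retainedSizeForEmptyDataSink baseline works because an OutputStreamDataSink over an empty buffer has a fixed retained size, which is exactly what the spiller reserves in its constructor. A minimal sketch, using only the constructor and accessor shown above:

DataSink emptySink = new OutputStreamDataSink(new DynamicSliceOutput(0));
// Constant overhead of an empty sink; the spiller's memory context should report
// this value right after creation and again while pages are being read back.
long baseline = emptySink.getRetainedSizeInBytes();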

Example 4 with OutputStreamDataSink

Use of com.facebook.presto.common.io.OutputStreamDataSink in project presto by prestodb.

In class IcebergFileWriterFactory, method createOrcWriter.

private IcebergFileWriter createOrcWriter(Path outputPath, Schema icebergSchema, JobConf jobConf, ConnectorSession session) {
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getUser(), outputPath, jobConf);
        DataSink orcDataSink = hdfsEnvironment.doAs(session.getUser(), () -> new OutputStreamDataSink(fileSystem.create(outputPath)));
        Callable<Void> rollbackAction = () -> {
            hdfsEnvironment.doAs(session.getUser(), () -> fileSystem.delete(outputPath, false));
            return null;
        };
        List<Types.NestedField> columnFields = icebergSchema.columns();
        List<String> fileColumnNames = columnFields.stream().map(Types.NestedField::name).collect(toImmutableList());
        List<Type> fileColumnTypes = columnFields.stream().map(Types.NestedField::type).map(type -> toPrestoType(type, typeManager)).collect(toImmutableList());
        Optional<Supplier<OrcDataSource>> validationInputFactory = Optional.empty();
        if (isOrcOptimizedWriterValidate(session)) {
            validationInputFactory = Optional.of(() -> {
                try {
                    return new HdfsOrcDataSource(
                            new OrcDataSourceId(outputPath.toString()),
                            hdfsEnvironment.doAs(session.getUser(), () -> fileSystem.getFileStatus(outputPath).getLen()),
                            getOrcMaxMergeDistance(session),
                            getOrcMaxBufferSize(session),
                            getOrcStreamBufferSize(session),
                            false,
                            hdfsEnvironment.doAs(session.getUser(), () -> fileSystem.open(outputPath)),
                            readStats);
                } catch (IOException e) {
                    throw new PrestoException(ICEBERG_WRITE_VALIDATION_FAILED, e);
                }
            });
        }
        return new IcebergOrcFileWriter(
                icebergSchema,
                orcDataSink,
                rollbackAction,
                ORC,
                fileColumnNames,
                fileColumnTypes,
                toOrcType(icebergSchema),
                getCompressionCodec(session).getOrcCompressionKind(),
                orcFileWriterConfig.toOrcWriterOptionsBuilder()
                        .withFlushPolicy(DefaultOrcWriterFlushPolicy.builder()
                                .withStripeMinSize(HiveSessionProperties.getOrcOptimizedWriterMinStripeSize(session))
                                .withStripeMaxSize(HiveSessionProperties.getOrcOptimizedWriterMaxStripeSize(session))
                                .withStripeMaxRowCount(HiveSessionProperties.getOrcOptimizedWriterMaxStripeRows(session))
                                .build())
                        .withDictionaryMaxMemory(HiveSessionProperties.getOrcOptimizedWriterMaxDictionaryMemory(session))
                        .withMaxStringStatisticsLimit(HiveSessionProperties.getOrcStringStatisticsLimit(session))
                        .build(),
                IntStream.range(0, fileColumnNames.size()).toArray(),
                ImmutableMap.<String, String>builder()
                        .put(PRESTO_VERSION_NAME, nodeVersion.toString())
                        .put(PRESTO_QUERY_ID_NAME, session.getQueryId())
                        .build(),
                UTC,
                validationInputFactory,
                getOrcOptimizedWriterValidateMode(session),
                orcWriterStats,
                dwrfEncryptionProvider,
                Optional.empty());
    } catch (IOException e) {
        throw new PrestoException(ICEBERG_WRITER_OPEN_ERROR, "Error creating ORC file", e);
    }
}
Also used: HdfsEnvironment(com.facebook.presto.hive.HdfsEnvironment) HdfsOrcDataSource(com.facebook.presto.hive.orc.HdfsOrcDataSource) Types(org.apache.iceberg.types.Types) FileSystem(org.apache.hadoop.fs.FileSystem) DataSink(com.facebook.presto.common.io.DataSink) IcebergSessionProperties.getOrcMaxBufferSize(com.facebook.presto.iceberg.IcebergSessionProperties.getOrcMaxBufferSize) NodeVersion(com.facebook.presto.hive.NodeVersion) PRESTO_VERSION_NAME(com.facebook.presto.hive.HiveMetadata.PRESTO_VERSION_NAME) Path(org.apache.hadoop.fs.Path) HiveSessionProperties.getParquetWriterBlockSize(com.facebook.presto.hive.HiveSessionProperties.getParquetWriterBlockSize) OrcDataSource(com.facebook.presto.orc.OrcDataSource) FileFormatDataSourceStats(com.facebook.presto.hive.FileFormatDataSourceStats) HdfsContext(com.facebook.presto.hive.HdfsContext) TypeConverter.toPrestoType(com.facebook.presto.iceberg.TypeConverter.toPrestoType) ParquetSchemaUtil.convert(org.apache.iceberg.parquet.ParquetSchemaUtil.convert) ImmutableMap(com.google.common.collect.ImmutableMap) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) HiveDwrfEncryptionProvider(com.facebook.presto.hive.HiveDwrfEncryptionProvider) Schema(org.apache.iceberg.Schema) ConnectorSession(com.facebook.presto.spi.ConnectorSession) ORC(com.facebook.presto.orc.OrcEncoding.ORC) List(java.util.List) NOT_SUPPORTED(com.facebook.presto.spi.StandardErrorCode.NOT_SUPPORTED) ICEBERG_WRITER_OPEN_ERROR(com.facebook.presto.iceberg.IcebergErrorCode.ICEBERG_WRITER_OPEN_ERROR) IcebergSessionProperties.isOrcOptimizedWriterValidate(com.facebook.presto.iceberg.IcebergSessionProperties.isOrcOptimizedWriterValidate) Optional(java.util.Optional) OutputStreamDataSink(com.facebook.presto.common.io.OutputStreamDataSink) HiveSessionProperties(com.facebook.presto.hive.HiveSessionProperties) IntStream(java.util.stream.IntStream) HiveSessionProperties.getParquetWriterPageSize(com.facebook.presto.hive.HiveSessionProperties.getParquetWriterPageSize) Callable(java.util.concurrent.Callable) PrestoException(com.facebook.presto.spi.PrestoException) Supplier(java.util.function.Supplier) Inject(javax.inject.Inject) IcebergSessionProperties.getCompressionCodec(com.facebook.presto.iceberg.IcebergSessionProperties.getCompressionCodec) TypeManager(com.facebook.presto.common.type.TypeManager) Objects.requireNonNull(java.util.Objects.requireNonNull) PrimitiveTypeMapBuilder.makeTypeMap(com.facebook.presto.iceberg.util.PrimitiveTypeMapBuilder.makeTypeMap) TypeConverter.toOrcType(com.facebook.presto.iceberg.TypeConverter.toOrcType) OrcWriterStats(com.facebook.presto.orc.OrcWriterStats) Type(com.facebook.presto.common.type.Type) IcebergSessionProperties.getOrcMaxMergeDistance(com.facebook.presto.iceberg.IcebergSessionProperties.getOrcMaxMergeDistance) DwrfEncryptionProvider(com.facebook.presto.orc.DwrfEncryptionProvider) OrcDataSourceId(com.facebook.presto.orc.OrcDataSourceId) DefaultOrcWriterFlushPolicy(com.facebook.presto.orc.DefaultOrcWriterFlushPolicy) OrcFileWriterConfig(com.facebook.presto.hive.OrcFileWriterConfig) ParquetWriterOptions(com.facebook.presto.parquet.writer.ParquetWriterOptions) IOException(java.io.IOException) UTC(org.joda.time.DateTimeZone.UTC) FileFormat(org.apache.iceberg.FileFormat) ICEBERG_WRITE_VALIDATION_FAILED(com.facebook.presto.iceberg.IcebergErrorCode.ICEBERG_WRITE_VALIDATION_FAILED) JobConf(org.apache.hadoop.mapred.JobConf) IcebergSessionProperties.getOrcOptimizedWriterValidateMode(com.facebook.presto.iceberg.IcebergSessionProperties.getOrcOptimizedWriterValidateMode) PRESTO_QUERY_ID_NAME(com.facebook.presto.hive.metastore.MetastoreUtil.PRESTO_QUERY_ID_NAME) IcebergSessionProperties.getOrcStreamBufferSize(com.facebook.presto.iceberg.IcebergSessionProperties.getOrcStreamBufferSize)
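
Reduced to its core, the sink-plus-rollback pattern in this factory looks like the sketch below; fileSystem and outputPath are the variables from the method above, and the hdfsEnvironment.doAs wrapping and error mapping are elided for brevity.

// Wrap the HDFS output stream in the generic DataSink abstraction...
DataSink orcDataSink = new OutputStreamDataSink(fileSystem.create(outputPath));
// ...and pair it with a rollback action that deletes the partial file on failure.
Callable<Void> rollbackAction = () -> {
    fileSystem.delete(outputPath, false);
    return null;
};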

Example 5 with OutputStreamDataSink

Use of com.facebook.presto.common.io.OutputStreamDataSink in project presto by prestodb.

In class AbstractTestDwrfStripeCaching, method writeOrcFile.

/**
 * Creates a file with 3 INT columns and 4 stripes with 100 rows each with the
 * following values:
 * Column 0: row number
 * Column 1: Integer.MAX_VALUE
 * Column 2: row number * 10
 */
private static TempFile writeOrcFile(boolean cacheEnabled, DwrfStripeCacheMode cacheMode, DataSize cacheMaxSize) {
    TempFile outputFile = new TempFile();
    try {
        Type type = INTEGER;
        List<Type> types = ImmutableList.of(type, type, type);
        OrcWriterOptions writerOptions = OrcWriterOptions.builder()
                .withFlushPolicy(DefaultOrcWriterFlushPolicy.builder()
                        .withStripeMaxRowCount(100)
                        .build())
                .withDwrfStripeCacheEnabled(cacheEnabled)
                .withDwrfStripeCacheMode(cacheMode)
                .withDwrfStripeCacheMaxSize(cacheMaxSize)
                .build();
        OrcWriter writer = new OrcWriter(
                new OutputStreamDataSink(new FileOutputStream(outputFile.getFile())),
                ImmutableList.of("Int1", "Int2", "Int3"),
                types,
                DWRF,
                ZLIB,
                Optional.empty(),
                NO_ENCRYPTION,
                writerOptions,
                ImmutableMap.of(),
                HIVE_STORAGE_TIME_ZONE,
                true,
                BOTH,
                new OrcWriterStats());
        // write 4 stripes with 100 values each
        int count = 0;
        for (int stripe = 0; stripe < 4; stripe++) {
            BlockBuilder[] blockBuilders = new BlockBuilder[3];
            for (int i = 0; i < blockBuilders.length; i++) {
                blockBuilders[i] = type.createBlockBuilder(null, 100);
            }
            for (int row = 0; row < 100; row++) {
                blockBuilders[0].writeInt(count);
                blockBuilders[1].writeInt(Integer.MAX_VALUE);
                blockBuilders[2].writeInt(count * 10);
                count++;
            }
            Block[] blocks = new Block[blockBuilders.length];
            for (int i = 0; i < blocks.length; i++) {
                blocks[i] = blockBuilders[i].build();
            }
            writer.write(new Page(blocks));
        }
        writer.close();
    } catch (IOException e) {
        throw new UncheckedIOException(e);
    }
    return outputFile;
}
Also used: Page(com.facebook.presto.common.Page) UncheckedIOException(java.io.UncheckedIOException) IOException(java.io.IOException) Type(com.facebook.presto.common.type.Type) FileOutputStream(java.io.FileOutputStream) Block(com.facebook.presto.common.block.Block) OutputStreamDataSink(com.facebook.presto.common.io.OutputStreamDataSink) BlockBuilder(com.facebook.presto.common.block.BlockBuilder)
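
A hypothetical caller sketch, sweeping the helper over every DwrfStripeCacheMode; the 8 MB cache cap is an arbitrary value chosen for illustration.

for (DwrfStripeCacheMode mode : DwrfStripeCacheMode.values()) {
    // each file: 3 INT columns, 4 stripes of 100 rows, with the given cache settings
    TempFile file = writeOrcFile(true, mode, new DataSize(8, MEGABYTE));
    // ... hand `file` to the stripe-cache read assertions in the test subclasses
}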

Aggregations

OutputStreamDataSink (com.facebook.presto.common.io.OutputStreamDataSink) 6
Page (com.facebook.presto.common.Page) 4
FileOutputStream (java.io.FileOutputStream) 4
Block (com.facebook.presto.common.block.Block) 3
BlockBuilder (com.facebook.presto.common.block.BlockBuilder) 3
Type (com.facebook.presto.common.type.Type) 2
ImmutableMap (com.google.common.collect.ImmutableMap) 2
BlockEncodingManager (com.facebook.presto.common.block.BlockEncodingManager) 1
RowBlock (com.facebook.presto.common.block.RowBlock) 1
DataSink (com.facebook.presto.common.io.DataSink) 1
TypeManager (com.facebook.presto.common.type.TypeManager) 1
FileFormatDataSourceStats (com.facebook.presto.hive.FileFormatDataSourceStats) 1
HdfsContext (com.facebook.presto.hive.HdfsContext) 1
HdfsEnvironment (com.facebook.presto.hive.HdfsEnvironment) 1
HiveDwrfEncryptionProvider (com.facebook.presto.hive.HiveDwrfEncryptionProvider) 1
PRESTO_VERSION_NAME (com.facebook.presto.hive.HiveMetadata.PRESTO_VERSION_NAME) 1
HiveSessionProperties (com.facebook.presto.hive.HiveSessionProperties) 1
HiveSessionProperties.getParquetWriterBlockSize (com.facebook.presto.hive.HiveSessionProperties.getParquetWriterBlockSize) 1
HiveSessionProperties.getParquetWriterPageSize (com.facebook.presto.hive.HiveSessionProperties.getParquetWriterPageSize) 1
NodeVersion (com.facebook.presto.hive.NodeVersion) 1