
Example 6 with BulkFormat

Use of org.apache.flink.connector.file.src.reader.BulkFormat in project flink by apache.

From the class FileSystemTableSource, the method getScanRuntimeProvider:

@Override
public ScanRuntimeProvider getScanRuntimeProvider(ScanContext scanContext) {
    // When this table has no partition, just return an empty source.
    if (!partitionKeys.isEmpty() && getOrFetchPartitions().isEmpty()) {
        return InputFormatProvider.of(new CollectionInputFormat<>(new ArrayList<>(), null));
    }
    // Resolve metadata and make sure to filter out metadata not in the producedDataType
    final List<String> metadataKeys =
            DataType.getFieldNames(producedDataType).stream()
                    .filter(((this.metadataKeys == null) ? Collections.emptyList() : this.metadataKeys)::contains)
                    .collect(Collectors.toList());
    final List<ReadableFileInfo> metadataToExtract =
            metadataKeys.stream().map(ReadableFileInfo::resolve).collect(Collectors.toList());
    // Filter out partition columns not in producedDataType
    final List<String> partitionKeysToExtract =
            DataType.getFieldNames(producedDataType).stream()
                    .filter(this.partitionKeys::contains)
                    .collect(Collectors.toList());
    // Compute the physical projection and the physical data type, that is,
    // the type without partition columns and metadata, in the same order as the schema
    DataType physicalDataType = physicalRowDataType;
    final Projection partitionKeysProjections =
            Projection.fromFieldNames(physicalDataType, partitionKeysToExtract);
    final Projection physicalProjections =
            (projectFields != null ? Projection.of(projectFields) : Projection.all(physicalDataType))
                    .difference(partitionKeysProjections);
    physicalDataType = partitionKeysProjections.complement(physicalDataType).project(physicalDataType);
    if (bulkReaderFormat != null) {
        if (bulkReaderFormat instanceof BulkDecodingFormat && filters != null && filters.size() > 0) {
            ((BulkDecodingFormat<RowData>) bulkReaderFormat).applyFilters(filters);
        }
        BulkFormat<RowData, FileSourceSplit> format;
        if (bulkReaderFormat instanceof ProjectableDecodingFormat) {
            format = ((ProjectableDecodingFormat<BulkFormat<RowData, FileSourceSplit>>) bulkReaderFormat)
                    .createRuntimeDecoder(scanContext, physicalDataType, physicalProjections.toNestedIndexes());
        } else {
            format = new ProjectingBulkFormat(
                    bulkReaderFormat.createRuntimeDecoder(scanContext, physicalDataType),
                    physicalProjections.toTopLevelIndexes(),
                    scanContext.createTypeInformation(physicalProjections.project(physicalDataType)));
        }
        format = wrapBulkFormat(scanContext, format, producedDataType, metadataToExtract, partitionKeysToExtract);
        return createSourceProvider(format);
    } else if (deserializationFormat != null) {
        BulkFormat<RowData, FileSourceSplit> format;
        if (deserializationFormat instanceof ProjectableDecodingFormat) {
            format = new DeserializationSchemaAdapter(
                    ((ProjectableDecodingFormat<DeserializationSchema<RowData>>) deserializationFormat)
                            .createRuntimeDecoder(scanContext, physicalDataType, physicalProjections.toNestedIndexes()));
        } else {
            format = new ProjectingBulkFormat(
                    new DeserializationSchemaAdapter(
                            deserializationFormat.createRuntimeDecoder(scanContext, physicalDataType)),
                    physicalProjections.toTopLevelIndexes(),
                    scanContext.createTypeInformation(physicalProjections.project(physicalDataType)));
        }
        format = wrapBulkFormat(scanContext, format, producedDataType, metadataToExtract, partitionKeysToExtract);
        return createSourceProvider(format);
    } else {
        throw new TableException("Can not find format factory.");
    }
}
Also used: TableException(org.apache.flink.table.api.TableException), ProjectableDecodingFormat(org.apache.flink.table.connector.format.ProjectableDecodingFormat), FileSourceSplit(org.apache.flink.connector.file.src.FileSourceSplit), ArrayList(java.util.ArrayList), Projection(org.apache.flink.table.connector.Projection), BulkDecodingFormat(org.apache.flink.connector.file.table.format.BulkDecodingFormat), DeserializationSchema(org.apache.flink.api.common.serialization.DeserializationSchema), RowData(org.apache.flink.table.data.RowData), DataType(org.apache.flink.table.types.DataType), BulkFormat(org.apache.flink.connector.file.src.reader.BulkFormat)
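
The projection arithmetic in getScanRuntimeProvider is easier to follow on a concrete schema. The following self-contained sketch (the row type, field names, and class name are invented for illustration) exercises the same Projection calls used above to strip a partition column:

import java.util.Arrays;

import org.apache.flink.table.api.DataTypes;
import org.apache.flink.table.connector.Projection;
import org.apache.flink.table.types.DataType;

public class ProjectionSketch {
    public static void main(String[] args) {
        // A physical row type with one partition column ("dt").
        DataType physical = DataTypes.ROW(
                DataTypes.FIELD("id", DataTypes.INT()),
                DataTypes.FIELD("name", DataTypes.STRING()),
                DataTypes.FIELD("dt", DataTypes.STRING()));

        // Projection covering only the partition column.
        Projection partitions = Projection.fromFieldNames(physical, Arrays.asList("dt"));

        // Everything except the partition column, mirroring the
        // Projection.all(...).difference(...) step in the method above.
        Projection physicalProjections = Projection.all(physical).difference(partitions);

        // Top-level indexes of the remaining columns: [0, 1]
        System.out.println(Arrays.toString(physicalProjections.toTopLevelIndexes()));

        // The data type without the partition column, mirroring the
        // complement(...).project(...) step: ROW<id INT, name STRING>
        System.out.println(partitions.complement(physical).project(physical));
    }
}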

Example 7 with BulkFormat

Use of org.apache.flink.connector.file.src.reader.BulkFormat in project flink by apache.

From the class FileSystemTableSink, the method createCompactReaderFactory:

private Optional<CompactReader.Factory<RowData>> createCompactReaderFactory(Context context) {
    // Compute producedDataType (including partition fields) and physicalDataType (excluding partition fields)
    final DataType producedDataType = physicalRowDataType;
    final DataType physicalDataType =
            DataType.getFields(producedDataType).stream()
                    .filter(field -> !partitionKeys.contains(field.getName()))
                    .collect(Collectors.collectingAndThen(Collectors.toList(), DataTypes::ROW));
    if (bulkReaderFormat != null) {
        final BulkFormat<RowData, FileSourceSplit> format = new FileInfoExtractorBulkFormat(
                bulkReaderFormat.createRuntimeDecoder(createSourceContext(context), physicalDataType),
                producedDataType,
                context.createTypeInformation(producedDataType),
                Collections.emptyMap(),
                partitionKeys,
                defaultPartName);
        return Optional.of(CompactBulkReader.factory(format));
    } else if (deserializationFormat != null) {
        final DeserializationSchema<RowData> decoder =
                deserializationFormat.createRuntimeDecoder(createSourceContext(context), physicalDataType);
        final BulkFormat<RowData, FileSourceSplit> format = new FileInfoExtractorBulkFormat(
                new DeserializationSchemaAdapter(decoder),
                producedDataType,
                context.createTypeInformation(producedDataType),
                Collections.emptyMap(),
                partitionKeys,
                defaultPartName);
        return Optional.of(CompactBulkReader.factory(format));
    }
    return Optional.empty();
}
Also used: DataType(org.apache.flink.table.types.DataType), CompactBulkReader(org.apache.flink.connector.file.table.stream.compact.CompactBulkReader), DecodingFormat(org.apache.flink.table.connector.format.DecodingFormat), PartitionCommitInfo(org.apache.flink.connector.file.table.stream.PartitionCommitInfo), SupportsPartitioning(org.apache.flink.table.connector.sink.abilities.SupportsPartitioning), FSDataOutputStream(org.apache.flink.core.fs.FSDataOutputStream), Path(org.apache.flink.core.fs.Path), Map(java.util.Map), SINK_ROLLING_POLICY_CHECK_INTERVAL(org.apache.flink.connector.file.table.FileSystemConnectorOptions.SINK_ROLLING_POLICY_CHECK_INTERVAL), StreamingFileSink(org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink), AUTO_COMPACTION(org.apache.flink.connector.file.table.FileSystemConnectorOptions.AUTO_COMPACTION), PartFileInfo(org.apache.flink.streaming.api.functions.sink.filesystem.PartFileInfo), CheckpointRollingPolicy(org.apache.flink.streaming.api.functions.sink.filesystem.rollingpolicies.CheckpointRollingPolicy), TypeInformation(org.apache.flink.api.common.typeinfo.TypeInformation), PartitionPathUtils(org.apache.flink.table.utils.PartitionPathUtils), OutputFormat(org.apache.flink.api.common.io.OutputFormat), DynamicTableSource(org.apache.flink.table.connector.source.DynamicTableSource), DynamicTableSink(org.apache.flink.table.connector.sink.DynamicTableSink), SINK_ROLLING_POLICY_ROLLOVER_INTERVAL(org.apache.flink.connector.file.table.FileSystemConnectorOptions.SINK_ROLLING_POLICY_ROLLOVER_INTERVAL), CompactOperator.convertToUncompacted(org.apache.flink.connector.file.table.stream.compact.CompactOperator.convertToUncompacted), UUID(java.util.UUID), Preconditions(org.apache.flink.util.Preconditions), Collectors(java.util.stream.Collectors), Objects(java.util.Objects), List(java.util.List), Stream(java.util.stream.Stream), FileSystem(org.apache.flink.core.fs.FileSystem), FactoryUtil(org.apache.flink.table.factories.FactoryUtil), LogicalType(org.apache.flink.table.types.logical.LogicalType), DataStreamSinkProvider(org.apache.flink.table.connector.sink.DataStreamSinkProvider), SimpleVersionedSerializer(org.apache.flink.core.io.SimpleVersionedSerializer), ValidationException(org.apache.flink.table.api.ValidationException), Optional(java.util.Optional), BulkFormat(org.apache.flink.connector.file.src.reader.BulkFormat), SerializationSchema(org.apache.flink.api.common.serialization.SerializationSchema), EncodingFormat(org.apache.flink.table.connector.format.EncodingFormat), ObjectIdentifier(org.apache.flink.table.catalog.ObjectIdentifier), ChangelogMode(org.apache.flink.table.connector.ChangelogMode), BucketAssigner(org.apache.flink.streaming.api.functions.sink.filesystem.BucketAssigner), BucketsBuilder(org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink.BucketsBuilder), LinkedHashMap(java.util.LinkedHashMap), ReadableConfig(org.apache.flink.configuration.ReadableConfig), FileSourceSplit(org.apache.flink.connector.file.src.FileSourceSplit), COMPACTION_FILE_SIZE(org.apache.flink.connector.file.table.FileSystemConnectorOptions.COMPACTION_FILE_SIZE), SINK_ROLLING_POLICY_INACTIVITY_INTERVAL(org.apache.flink.connector.file.table.FileSystemConnectorOptions.SINK_ROLLING_POLICY_INACTIVITY_INTERVAL), SupportsOverwrite(org.apache.flink.table.connector.sink.abilities.SupportsOverwrite), Nullable(javax.annotation.Nullable), StreamingSink(org.apache.flink.connector.file.table.stream.StreamingSink), DataStreamSink(org.apache.flink.streaming.api.datastream.DataStreamSink), OutputStream(java.io.OutputStream), RollingPolicy(org.apache.flink.streaming.api.functions.sink.filesystem.RollingPolicy), RowData(org.apache.flink.table.data.RowData), ProviderContext(org.apache.flink.table.connector.ProviderContext), BulkWriter(org.apache.flink.api.common.serialization.BulkWriter), Configuration(org.apache.flink.configuration.Configuration), TableException(org.apache.flink.table.api.TableException), DataTypes(org.apache.flink.table.api.DataTypes), IOException(java.io.IOException), DeserializationSchema(org.apache.flink.api.common.serialization.DeserializationSchema), DataStream(org.apache.flink.streaming.api.datastream.DataStream), Encoder(org.apache.flink.api.common.serialization.Encoder), SimpleVersionedStringSerializer(org.apache.flink.streaming.api.functions.sink.filesystem.bucketassigners.SimpleVersionedStringSerializer), RowKind(org.apache.flink.types.RowKind), CompactReader(org.apache.flink.connector.file.table.stream.compact.CompactReader), OutputFileConfig(org.apache.flink.streaming.api.functions.sink.filesystem.OutputFileConfig), Internal(org.apache.flink.annotation.Internal), Collections(java.util.Collections), SINK_ROLLING_POLICY_FILE_SIZE(org.apache.flink.connector.file.table.FileSystemConnectorOptions.SINK_ROLLING_POLICY_FILE_SIZE)
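
The first step of createCompactReaderFactory, deriving physicalDataType from producedDataType, can be tried in isolation. A minimal sketch, assuming a made-up schema with one partition column ("dt"); it runs the same stream pipeline over DataType.getFields:

import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

import org.apache.flink.table.api.DataTypes;
import org.apache.flink.table.types.DataType;

public class PhysicalTypeSketch {
    public static void main(String[] args) {
        // Hypothetical produced type: two data columns plus one partition column ("dt").
        DataType producedDataType = DataTypes.ROW(
                DataTypes.FIELD("id", DataTypes.INT()),
                DataTypes.FIELD("name", DataTypes.STRING()),
                DataTypes.FIELD("dt", DataTypes.STRING()));
        List<String> partitionKeys = Arrays.asList("dt");

        // Same pipeline as in createCompactReaderFactory: drop partition fields
        // and rebuild a ROW type from the remaining fields.
        DataType physicalDataType = DataType.getFields(producedDataType).stream()
                .filter(field -> !partitionKeys.contains(field.getName()))
                .collect(Collectors.collectingAndThen(Collectors.toList(), DataTypes::ROW));

        // Prints the row type without the partition column, e.g. ROW<`id` INT, `name` STRING>
        System.out.println(physicalDataType);
    }
}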

Aggregations

FileSourceSplit (org.apache.flink.connector.file.src.FileSourceSplit): 7
BulkFormat (org.apache.flink.connector.file.src.reader.BulkFormat): 7
Configuration (org.apache.flink.configuration.Configuration): 6
Path (org.apache.flink.core.fs.Path): 5
RowData (org.apache.flink.table.data.RowData): 4
ArrayList (java.util.ArrayList): 3
AtomicInteger (java.util.concurrent.atomic.AtomicInteger): 2
DeserializationSchema (org.apache.flink.api.common.serialization.DeserializationSchema): 2
TextLineInputFormat (org.apache.flink.connector.file.src.reader.TextLineInputFormat): 2
TableException (org.apache.flink.table.api.TableException): 2
GenericRowData (org.apache.flink.table.data.GenericRowData): 2
DataType (org.apache.flink.table.types.DataType): 2
IOException (java.io.IOException): 1
OutputStream (java.io.OutputStream): 1
Collections (java.util.Collections): 1
LinkedHashMap (java.util.LinkedHashMap): 1
List (java.util.List): 1
Map (java.util.Map): 1
Objects (java.util.Objects): 1
Optional (java.util.Optional): 1
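
Beyond the table planner shown in the examples above, the same BulkFormat abstraction plugs directly into the DataStream API via FileSource. A minimal sketch, assuming the caller supplies a concrete BulkFormat<RowData, FileSourceSplit> instance and an input path (both placeholders here):

import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.connector.file.src.FileSource;
import org.apache.flink.connector.file.src.FileSourceSplit;
import org.apache.flink.connector.file.src.reader.BulkFormat;
import org.apache.flink.core.fs.Path;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.data.RowData;

public class BulkFormatUsageSketch {
    // Wires an arbitrary BulkFormat into a FileSource-backed DataStream.
    // The format instance and the input path are supplied by the caller.
    public static DataStream<RowData> readWith(
            StreamExecutionEnvironment env,
            BulkFormat<RowData, FileSourceSplit> format,
            Path inputDir) {
        FileSource<RowData> source = FileSource.forBulkFileFormat(format, inputDir).build();
        return env.fromSource(source, WatermarkStrategy.noWatermarks(), "bulk-file-source");
    }
}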