
Example 41 with FileSourceSplit

use of org.apache.flink.connector.file.src.FileSourceSplit in project flink by apache.

the class StaticFileSplitEnumeratorTest method testNoMoreSplits.

@Test
public void testNoMoreSplits() throws Exception {
    final TestingSplitEnumeratorContext<FileSourceSplit> context = new TestingSplitEnumeratorContext<>(4);
    final FileSourceSplit split = createRandomSplit();
    final StaticFileSplitEnumerator enumerator = createEnumerator(context, split);
    // first split assignment
    context.registerReader(1, "somehost");
    enumerator.addReader(1);
    enumerator.handleSplitRequest(1, "somehost");
    // the second request gets no split; the reader receives the no-more-splits signal
    enumerator.handleSplitRequest(1, "somehost");
    assertThat(context.getSplitAssignments().get(1).getAssignedSplits(), contains(split));
    assertTrue(context.getSplitAssignments().get(1).hasReceivedNoMoreSplitsSignal());
}
Also used : FileSourceSplit(org.apache.flink.connector.file.src.FileSourceSplit) TestingSplitEnumeratorContext(org.apache.flink.connector.testutils.source.reader.TestingSplitEnumeratorContext) Test(org.junit.Test)
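
The helpers createRandomSplit() and createEnumerator() are defined elsewhere in StaticFileSplitEnumeratorTest and are not shown on this page. A minimal sketch of what they might look like, assuming the SimpleSplitAssigner from org.apache.flink.connector.file.src.assigners; the split path and sizes below are illustrative, not the real test data:

private static FileSourceSplit createRandomSplit() {
    // FileSourceSplit(id, path, offset, length, fileModificationTime, fileSize)
    return new FileSourceSplit(
            UUID.randomUUID().toString(), new Path("testfs:///dir/file"), 0L, 100L, 0L, 100L);
}

private static StaticFileSplitEnumerator createEnumerator(
        SplitEnumeratorContext<FileSourceSplit> context, FileSourceSplit... splits) {
    // StaticFileSplitEnumerator hands out a fixed split set through a FileSplitAssigner.
    return new StaticFileSplitEnumerator(context, new SimpleSplitAssigner(Arrays.asList(splits)));
}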

Example 42 with FileSourceSplit

use of org.apache.flink.connector.file.src.FileSourceSplit in project flink by apache.

the class BlockSplittingRecursiveEnumeratorTest method testFileWithMultipleBlocks.

@Test
@Override
public void testFileWithMultipleBlocks() throws Exception {
    final Path testPath = new Path("testfs:///dir/file");
    testFs =
            TestingFileSystem.createForFileStatus(
                    "testfs",
                    TestingFileSystem.TestFileStatus.forFileWithBlocks(
                            testPath,
                            1000L,
                            new TestingFileSystem.TestBlockLocation(0L, 100L, "host1", "host2"),
                            new TestingFileSystem.TestBlockLocation(100L, 520L, "host2", "host3"),
                            new TestingFileSystem.TestBlockLocation(620L, 380L, "host3", "host4")));
    testFs.register();
    final BlockSplittingRecursiveEnumerator enumerator = createEnumerator();
    final Collection<FileSourceSplit> splits =
            enumerator.enumerateSplits(new Path[] {new Path("testfs:///dir")}, 0);
    final Collection<FileSourceSplit> expected =
            Arrays.asList(
                    new FileSourceSplit("ignoredId", testPath, 0L, 100L, 0, 1000L, "host1", "host2"),
                    new FileSourceSplit("ignoredId", testPath, 100L, 520L, 0, 1000L, "host1", "host2"),
                    new FileSourceSplit("ignoredId", testPath, 620L, 380L, 0, 1000L, "host1", "host2"));
    assertSplitsEqual(expected, splits);
}
Also used : Path(org.apache.flink.core.fs.Path) FileSourceSplit(org.apache.flink.connector.file.src.FileSourceSplit) TestingFileSystem(org.apache.flink.connector.file.src.testutils.TestingFileSystem) Test(org.junit.Test)
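
assertSplitsEqual is inherited from the enumerator test base class and not shown on this page. The "ignoredId" placeholders, and the repeated "host1"/"host2" pairs in the expected splits, suggest that split ids and hostnames are not part of the comparison. A hypothetical sketch of such a check built on the public FileSourceSplit accessors:

// Hypothetical helper; the real assertSplitsEqual lives in the test's base class.
private static void assertSplitsEqual(
        Collection<FileSourceSplit> expected, Collection<FileSourceSplit> actual) {
    Assert.assertEquals(expected.size(), actual.size());
    List<FileSourceSplit> exp = sortedByOffset(expected);
    List<FileSourceSplit> act = sortedByOffset(actual);
    for (int i = 0; i < exp.size(); i++) {
        // Compare the split geometry only; ids and hostnames are not checked.
        Assert.assertEquals(exp.get(i).path(), act.get(i).path());
        Assert.assertEquals(exp.get(i).offset(), act.get(i).offset());
        Assert.assertEquals(exp.get(i).length(), act.get(i).length());
    }
}

private static List<FileSourceSplit> sortedByOffset(Collection<FileSourceSplit> splits) {
    List<FileSourceSplit> sorted = new ArrayList<>(splits);
    sorted.sort(Comparator.comparingLong(FileSourceSplit::offset));
    return sorted;
}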

Example 43 with FileSourceSplit

use of org.apache.flink.connector.file.src.FileSourceSplit in project flink by apache.

the class LimitableBulkFormatTest method test.

@Test
public void test() throws IOException {
    // read with a limit of 22 records
    BulkFormat<String, FileSourceSplit> format =
            LimitableBulkFormat.create(new StreamFormatAdapter<>(new TextLineInputFormat()), 22L);
    BulkFormat.Reader<String> reader =
            format.createReader(
                    new Configuration(),
                    new FileSourceSplit(
                            "id",
                            new Path(file.toURI()),
                            0,
                            file.length(),
                            file.lastModified(),
                            file.length()));
    AtomicInteger i = new AtomicInteger(0);
    Utils.forEachRemaining(reader, s -> i.incrementAndGet());
    Assert.assertEquals(22, i.get());
}
Also used : Path(org.apache.flink.core.fs.Path) FileSourceSplit(org.apache.flink.connector.file.src.FileSourceSplit) TextLineInputFormat(org.apache.flink.connector.file.src.reader.TextLineInputFormat) Configuration(org.apache.flink.configuration.Configuration) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) BulkFormat(org.apache.flink.connector.file.src.reader.BulkFormat) Test(org.junit.Test)
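
LimitableBulkFormat wraps another BulkFormat and stops producing records once the configured limit (22 here) is reached, which the assertion verifies. A hedged sketch of wiring the same wrapper into an actual FileSource; the input path is made up for illustration:

// Sketch only: a bounded FileSource that reads at most 22 text lines.
FileSource<String> source =
        FileSource.forBulkFileFormat(
                        LimitableBulkFormat.create(
                                new StreamFormatAdapter<>(new TextLineInputFormat()), 22L),
                        new Path("/tmp/input"))
                .build();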

Example 44 with FileSourceSplit

use of org.apache.flink.connector.file.src.FileSourceSplit in project flink by apache.

the class FileSystemTableSource method getScanRuntimeProvider.

@Override
public ScanRuntimeProvider getScanRuntimeProvider(ScanContext scanContext) {
    // A partitioned table whose partition list resolves to empty produces no data,
    // so return an empty source.
    if (!partitionKeys.isEmpty() && getOrFetchPartitions().isEmpty()) {
        return InputFormatProvider.of(new CollectionInputFormat<>(new ArrayList<>(), null));
    }
    // Resolve metadata and make sure to filter out metadata not in the producedDataType
    final List<String> metadataKeys =
            DataType.getFieldNames(producedDataType).stream()
                    .filter(
                            ((this.metadataKeys == null) ? Collections.emptyList() : this.metadataKeys)
                                    ::contains)
                    .collect(Collectors.toList());
    final List<ReadableFileInfo> metadataToExtract =
            metadataKeys.stream().map(ReadableFileInfo::resolve).collect(Collectors.toList());
    // Filter out partition columns not in producedDataType
    final List<String> partitionKeysToExtract =
            DataType.getFieldNames(producedDataType).stream()
                    .filter(this.partitionKeys::contains)
                    .collect(Collectors.toList());
    // Compute the physical projection and the physical data type, i.e. the type without
    // partition columns and metadata, in the same order as the schema
    DataType physicalDataType = physicalRowDataType;
    final Projection partitionKeysProjections =
            Projection.fromFieldNames(physicalDataType, partitionKeysToExtract);
    final Projection physicalProjections =
            (projectFields != null
                            ? Projection.of(projectFields)
                            : Projection.all(physicalDataType))
                    .difference(partitionKeysProjections);
    physicalDataType = partitionKeysProjections.complement(physicalDataType).project(physicalDataType);
    if (bulkReaderFormat != null) {
        if (bulkReaderFormat instanceof BulkDecodingFormat && filters != null && filters.size() > 0) {
            ((BulkDecodingFormat<RowData>) bulkReaderFormat).applyFilters(filters);
        }
        BulkFormat<RowData, FileSourceSplit> format;
        if (bulkReaderFormat instanceof ProjectableDecodingFormat) {
            format =
                    ((ProjectableDecodingFormat<BulkFormat<RowData, FileSourceSplit>>) bulkReaderFormat)
                            .createRuntimeDecoder(
                                    scanContext, physicalDataType, physicalProjections.toNestedIndexes());
        } else {
            format =
                    new ProjectingBulkFormat(
                            bulkReaderFormat.createRuntimeDecoder(scanContext, physicalDataType),
                            physicalProjections.toTopLevelIndexes(),
                            scanContext.createTypeInformation(
                                    physicalProjections.project(physicalDataType)));
        }
        format = wrapBulkFormat(scanContext, format, producedDataType, metadataToExtract, partitionKeysToExtract);
        return createSourceProvider(format);
    } else if (deserializationFormat != null) {
        BulkFormat<RowData, FileSourceSplit> format;
        if (deserializationFormat instanceof ProjectableDecodingFormat) {
            format =
                    new DeserializationSchemaAdapter(
                            ((ProjectableDecodingFormat<DeserializationSchema<RowData>>)
                                            deserializationFormat)
                                    .createRuntimeDecoder(
                                            scanContext,
                                            physicalDataType,
                                            physicalProjections.toNestedIndexes()));
        } else {
            format =
                    new ProjectingBulkFormat(
                            new DeserializationSchemaAdapter(
                                    deserializationFormat.createRuntimeDecoder(
                                            scanContext, physicalDataType)),
                            physicalProjections.toTopLevelIndexes(),
                            scanContext.createTypeInformation(
                                    physicalProjections.project(physicalDataType)));
        }
        format = wrapBulkFormat(scanContext, format, producedDataType, metadataToExtract, partitionKeysToExtract);
        return createSourceProvider(format);
    } else {
        throw new TableException("Can not find format factory.");
    }
}
Also used : TableException(org.apache.flink.table.api.TableException) ProjectableDecodingFormat(org.apache.flink.table.connector.format.ProjectableDecodingFormat) FileSourceSplit(org.apache.flink.connector.file.src.FileSourceSplit) ArrayList(java.util.ArrayList) Projection(org.apache.flink.table.connector.Projection) BulkDecodingFormat(org.apache.flink.connector.file.table.format.BulkDecodingFormat) DeserializationSchema(org.apache.flink.api.common.serialization.DeserializationSchema) RowData(org.apache.flink.table.data.RowData) DataType(org.apache.flink.table.types.DataType) BulkFormat(org.apache.flink.connector.file.src.reader.BulkFormat)
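
The subtle part of the method above is the projection arithmetic: partition columns are carved out of the physical type before the decoder is built, and wrapBulkFormat re-attaches them later from the file paths. A small worked example with a hypothetical three-column schema (field names are illustrative):

// Hypothetical schema: physical columns a, b plus partition column "dt".
DataType physical =
        DataTypes.ROW(
                DataTypes.FIELD("a", DataTypes.INT()),
                DataTypes.FIELD("b", DataTypes.STRING()),
                DataTypes.FIELD("dt", DataTypes.STRING()));
Projection partitionProj = Projection.fromFieldNames(physical, Collections.singletonList("dt"));
// All fields minus the partition column leaves the readable columns [a, b].
Projection physicalProj = Projection.all(physical).difference(partitionProj);
// complement(...).project(...) strips "dt" from the type handed to the decoder.
DataType pruned = partitionProj.complement(physical).project(physical);
// pruned == ROW<a INT, b STRING>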

Example 45 with FileSourceSplit

use of org.apache.flink.connector.file.src.FileSourceSplit in project flink by apache.

the class FileSystemTableSink method createCompactReaderFactory.

private Optional<CompactReader.Factory<RowData>> createCompactReaderFactory(Context context) {
    // Compute producedDataType (including partition fields) and physicalDataType (excluding
    // partition fields)
    final DataType producedDataType = physicalRowDataType;
    final DataType physicalDataType =
            DataType.getFields(producedDataType).stream()
                    .filter(field -> !partitionKeys.contains(field.getName()))
                    .collect(Collectors.collectingAndThen(Collectors.toList(), DataTypes::ROW));
    if (bulkReaderFormat != null) {
        final BulkFormat<RowData, FileSourceSplit> format =
                new FileInfoExtractorBulkFormat(
                        bulkReaderFormat.createRuntimeDecoder(
                                createSourceContext(context), physicalDataType),
                        producedDataType,
                        context.createTypeInformation(producedDataType),
                        Collections.emptyMap(),
                        partitionKeys,
                        defaultPartName);
        return Optional.of(CompactBulkReader.factory(format));
    } else if (deserializationFormat != null) {
        final DeserializationSchema<RowData> decoder =
                deserializationFormat.createRuntimeDecoder(
                        createSourceContext(context), physicalDataType);
        final BulkFormat<RowData, FileSourceSplit> format =
                new FileInfoExtractorBulkFormat(
                        new DeserializationSchemaAdapter(decoder),
                        producedDataType,
                        context.createTypeInformation(producedDataType),
                        Collections.emptyMap(),
                        partitionKeys,
                        defaultPartName);
        return Optional.of(CompactBulkReader.factory(format));
    }
        return Optional.of(CompactBulkReader.factory(format));
    }
    return Optional.empty();
}
Also used : DataType(org.apache.flink.table.types.DataType) CompactBulkReader(org.apache.flink.connector.file.table.stream.compact.CompactBulkReader) DecodingFormat(org.apache.flink.table.connector.format.DecodingFormat) PartitionCommitInfo(org.apache.flink.connector.file.table.stream.PartitionCommitInfo) SupportsPartitioning(org.apache.flink.table.connector.sink.abilities.SupportsPartitioning) FSDataOutputStream(org.apache.flink.core.fs.FSDataOutputStream) Path(org.apache.flink.core.fs.Path) Map(java.util.Map) SINK_ROLLING_POLICY_CHECK_INTERVAL(org.apache.flink.connector.file.table.FileSystemConnectorOptions.SINK_ROLLING_POLICY_CHECK_INTERVAL) StreamingFileSink(org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink) AUTO_COMPACTION(org.apache.flink.connector.file.table.FileSystemConnectorOptions.AUTO_COMPACTION) PartFileInfo(org.apache.flink.streaming.api.functions.sink.filesystem.PartFileInfo) CheckpointRollingPolicy(org.apache.flink.streaming.api.functions.sink.filesystem.rollingpolicies.CheckpointRollingPolicy) TypeInformation(org.apache.flink.api.common.typeinfo.TypeInformation) PartitionPathUtils(org.apache.flink.table.utils.PartitionPathUtils) OutputFormat(org.apache.flink.api.common.io.OutputFormat) DynamicTableSource(org.apache.flink.table.connector.source.DynamicTableSource) DynamicTableSink(org.apache.flink.table.connector.sink.DynamicTableSink) SINK_ROLLING_POLICY_ROLLOVER_INTERVAL(org.apache.flink.connector.file.table.FileSystemConnectorOptions.SINK_ROLLING_POLICY_ROLLOVER_INTERVAL) CompactOperator.convertToUncompacted(org.apache.flink.connector.file.table.stream.compact.CompactOperator.convertToUncompacted) UUID(java.util.UUID) Preconditions(org.apache.flink.util.Preconditions) Collectors(java.util.stream.Collectors) Objects(java.util.Objects) List(java.util.List) Stream(java.util.stream.Stream) FileSystem(org.apache.flink.core.fs.FileSystem) FactoryUtil(org.apache.flink.table.factories.FactoryUtil) LogicalType(org.apache.flink.table.types.logical.LogicalType) DataStreamSinkProvider(org.apache.flink.table.connector.sink.DataStreamSinkProvider) SimpleVersionedSerializer(org.apache.flink.core.io.SimpleVersionedSerializer) ValidationException(org.apache.flink.table.api.ValidationException) Optional(java.util.Optional) BulkFormat(org.apache.flink.connector.file.src.reader.BulkFormat) SerializationSchema(org.apache.flink.api.common.serialization.SerializationSchema) EncodingFormat(org.apache.flink.table.connector.format.EncodingFormat) ObjectIdentifier(org.apache.flink.table.catalog.ObjectIdentifier) ChangelogMode(org.apache.flink.table.connector.ChangelogMode) BucketAssigner(org.apache.flink.streaming.api.functions.sink.filesystem.BucketAssigner) BucketsBuilder(org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink.BucketsBuilder) LinkedHashMap(java.util.LinkedHashMap) ReadableConfig(org.apache.flink.configuration.ReadableConfig) FileSourceSplit(org.apache.flink.connector.file.src.FileSourceSplit) COMPACTION_FILE_SIZE(org.apache.flink.connector.file.table.FileSystemConnectorOptions.COMPACTION_FILE_SIZE) SINK_ROLLING_POLICY_INACTIVITY_INTERVAL(org.apache.flink.connector.file.table.FileSystemConnectorOptions.SINK_ROLLING_POLICY_INACTIVITY_INTERVAL) SupportsOverwrite(org.apache.flink.table.connector.sink.abilities.SupportsOverwrite) Nullable(javax.annotation.Nullable) StreamingSink(org.apache.flink.connector.file.table.stream.StreamingSink) DataStreamSink(org.apache.flink.streaming.api.datastream.DataStreamSink) OutputStream(java.io.OutputStream) RollingPolicy(org.apache.flink.streaming.api.functions.sink.filesystem.RollingPolicy) RowData(org.apache.flink.table.data.RowData) ProviderContext(org.apache.flink.table.connector.ProviderContext) BulkWriter(org.apache.flink.api.common.serialization.BulkWriter) Configuration(org.apache.flink.configuration.Configuration) TableException(org.apache.flink.table.api.TableException) DataTypes(org.apache.flink.table.api.DataTypes) IOException(java.io.IOException) DeserializationSchema(org.apache.flink.api.common.serialization.DeserializationSchema) DataStream(org.apache.flink.streaming.api.datastream.DataStream) Encoder(org.apache.flink.api.common.serialization.Encoder) SimpleVersionedStringSerializer(org.apache.flink.streaming.api.functions.sink.filesystem.bucketassigners.SimpleVersionedStringSerializer) RowKind(org.apache.flink.types.RowKind) CompactReader(org.apache.flink.connector.file.table.stream.compact.CompactReader) OutputFileConfig(org.apache.flink.streaming.api.functions.sink.filesystem.OutputFileConfig) Internal(org.apache.flink.annotation.Internal) Collections(java.util.Collections) SINK_ROLLING_POLICY_FILE_SIZE(org.apache.flink.connector.file.table.FileSystemConnectorOptions.SINK_ROLLING_POLICY_FILE_SIZE)
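
The collectingAndThen(...) call above rebuilds a ROW type from the non-partition fields in a single pass; the compaction reader then uses this narrower type and restores the partition columns from the file paths. A small illustrative example with a hypothetical schema (the column names are made up):

// Hypothetical: the produced type has columns a, b plus partition column "dt".
DataType producedDataType =
        DataTypes.ROW(
                DataTypes.FIELD("a", DataTypes.INT()),
                DataTypes.FIELD("b", DataTypes.STRING()),
                DataTypes.FIELD("dt", DataTypes.STRING()));
List<String> partitionKeys = Collections.singletonList("dt");
DataType physicalDataType =
        DataType.getFields(producedDataType).stream()
                .filter(field -> !partitionKeys.contains(field.getName()))
                .collect(Collectors.collectingAndThen(Collectors.toList(), DataTypes::ROW));
// physicalDataType == ROW<a INT, b STRING>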

Aggregations

FileSourceSplit (org.apache.flink.connector.file.src.FileSourceSplit): 50
Test (org.junit.Test): 32
Path (org.apache.flink.core.fs.Path): 20
AtomicInteger (java.util.concurrent.atomic.AtomicInteger): 11
BulkFormat (org.apache.flink.connector.file.src.reader.BulkFormat): 11
Configuration (org.apache.flink.configuration.Configuration): 10
ArrayList (java.util.ArrayList): 9
TestingSplitEnumeratorContext (org.apache.flink.connector.testutils.source.reader.TestingSplitEnumeratorContext): 7
IOException (java.io.IOException): 6
RowData (org.apache.flink.table.data.RowData): 6
LogicalType (org.apache.flink.table.types.logical.LogicalType): 6
LinkedHashMap (java.util.LinkedHashMap): 5
TestingFileSystem (org.apache.flink.connector.file.src.testutils.TestingFileSystem): 5
FileStatus (org.apache.flink.core.fs.FileStatus): 5
AtomicLong (java.util.concurrent.atomic.AtomicLong): 4
BigIntType (org.apache.flink.table.types.logical.BigIntType): 4
DoubleType (org.apache.flink.table.types.logical.DoubleType): 4
IntType (org.apache.flink.table.types.logical.IntType): 4
SmallIntType (org.apache.flink.table.types.logical.SmallIntType): 4
TinyIntType (org.apache.flink.table.types.logical.TinyIntType): 4