Example 31 with FileSourceSplit

Use of org.apache.flink.connector.file.src.FileSourceSplit in project flink by apache.

Class LimitableBulkFormatTest, method testLimitOverBatches.

@Test
public void testLimitOverBatches() throws IOException {
    // cap the total number of records the format may emit
    Long limit = 2048L;
    // use a small fetch size so reading is spread over many small batches
    Configuration conf = new Configuration();
    conf.set(StreamFormat.FETCH_IO_SIZE, MemorySize.parse("4k"));
    // wrap the text-line format with the limiting decorator and read the whole file as one split
    BulkFormat<String, FileSourceSplit> format = LimitableBulkFormat.create(new StreamFormatAdapter<>(new TextLineInputFormat()), limit);
    BulkFormat.Reader<String> reader = format.createReader(conf, new FileSourceSplit("id", new Path(file.toURI()), 0, file.length(), file.lastModified(), file.length()));
    // count the emitted records; exactly `limit` records must come out
    AtomicInteger i = new AtomicInteger(0);
    Utils.forEachRemaining(reader, s -> i.incrementAndGet());
    Assert.assertEquals(limit.intValue(), i.get());
}
Also used: Path(org.apache.flink.core.fs.Path) Configuration(org.apache.flink.configuration.Configuration) FileSourceSplit(org.apache.flink.connector.file.src.FileSourceSplit) TextLineInputFormat(org.apache.flink.connector.file.src.reader.TextLineInputFormat) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) BulkFormat(org.apache.flink.connector.file.src.reader.BulkFormat) Test(org.junit.Test)
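
The small FETCH_IO_SIZE forces the adapter to produce many small batches, so the 2048-record limit has to be enforced across batch boundaries rather than inside a single readBatch() call. Below is a minimal sketch of that cross-batch counting idea using plain JDK types; LimitedIterator is a hypothetical name for illustration, not Flink's LimitableBulkFormat implementation (which additionally shares one counter across the readers it creates, so the cap is format-wide rather than per reader).

import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;

// Illustration only: cap the records drawn from a batched source, the way
// a limiting wrapper must keep counting across readBatch() boundaries.
public class LimitedIterator<T> implements Iterator<T> {

    private final Iterator<List<T>> batches; // stand-in for reader.readBatch()
    private Iterator<T> current = Collections.emptyIterator();
    private long remaining; // records still allowed under the limit

    public LimitedIterator(Iterator<List<T>> batches, long limit) {
        this.batches = batches;
        this.remaining = limit;
    }

    @Override
    public boolean hasNext() {
        if (remaining <= 0) {
            return false; // limit reached, possibly in the middle of a batch
        }
        while (!current.hasNext() && batches.hasNext()) {
            current = batches.next().iterator(); // move on to the next batch
        }
        return current.hasNext();
    }

    @Override
    public T next() {
        if (!hasNext()) {
            throw new NoSuchElementException();
        }
        remaining--;
        return current.next();
    }
}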

Example 32 with FileSourceSplit

Use of org.apache.flink.connector.file.src.FileSourceSplit in project flink by apache.

Class OrcColumnarRowInputFormatTest, method testReadFileWithPartitionFields.

@Test
public void testReadFileWithPartitionFields() throws IOException {
    LinkedHashMap<String, String> partSpec = new LinkedHashMap<>();
    partSpec.put("f1", "1");
    partSpec.put("f3", "3");
    partSpec.put("f5", "f5");
    partSpec.put("f8", BigDecimal.valueOf(5.333).toString());
    partSpec.put("f13", "f13");
    final Path flatFile = copyFileFromResource("test-data-flat.orc", partSpec);
    RowType tableType =
            RowType.of(
                    /* 0 */ DataTypes.INT().getLogicalType(),
                    // part-1
                    DataTypes.INT().getLogicalType(),
                    /* 2 */ DataTypes.STRING().getLogicalType(),
                    // part-2
                    DataTypes.BIGINT().getLogicalType(),
                    /* 4 */ DataTypes.STRING().getLogicalType(),
                    // part-3
                    DataTypes.STRING().getLogicalType(),
                    /* 6 */ DataTypes.STRING().getLogicalType(),
                    /* 7 */ DataTypes.INT().getLogicalType(),
                    // part-4
                    DataTypes.DECIMAL(10, 5).getLogicalType(),
                    /* 9 */ DataTypes.STRING().getLogicalType(),
                    /* 11 */ DataTypes.INT().getLogicalType(),
                    /* 12 */ DataTypes.INT().getLogicalType(),
                    // part-5
                    DataTypes.STRING().getLogicalType(),
                    /* 14 */ DataTypes.INT().getLogicalType());
    int[] projectedFields = { 8, 1, 3, 0, 5, 2 };
    OrcColumnarRowInputFormat<?, FileSourceSplit> format = createPartitionFormat(tableType, new ArrayList<>(partSpec.keySet()), projectedFields);
    AtomicInteger cnt = new AtomicInteger(0);
    AtomicLong totalF0 = new AtomicLong(0);
    // read all splits
    for (FileSourceSplit split : createSplits(flatFile, 4)) {
        forEach(format, split, row -> {
            // data values
            Assert.assertFalse(row.isNullAt(3));
            Assert.assertFalse(row.isNullAt(5));
            totalF0.addAndGet(row.getInt(3));
            Assert.assertNotNull(row.getString(5).toString());
            // part values
            Assert.assertFalse(row.isNullAt(0));
            Assert.assertFalse(row.isNullAt(1));
            Assert.assertFalse(row.isNullAt(2));
            Assert.assertFalse(row.isNullAt(4));
            Assert.assertEquals(DecimalDataUtils.castFrom(5.333, 10, 5), row.getDecimal(0, 10, 5));
            Assert.assertEquals(1, row.getInt(1));
            Assert.assertEquals(3, row.getLong(2));
            Assert.assertEquals("f5", row.getString(4).toString());
            cnt.incrementAndGet();
        });
    }
    // check that all rows have been read
    assertEquals(1920800, cnt.get());
    assertEquals(1844737280400L, totalF0.get());
}
Also used: Path(org.apache.flink.core.fs.Path) PartitionPathUtils.generatePartitionPath(org.apache.flink.table.utils.PartitionPathUtils.generatePartitionPath) AtomicLong(java.util.concurrent.atomic.AtomicLong) FileSourceSplit(org.apache.flink.connector.file.src.FileSourceSplit) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) RowType(org.apache.flink.table.types.logical.RowType) LinkedHashMap(java.util.LinkedHashMap) Test(org.junit.Test)
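
The partition columns f1, f3, f5, f8 and f13 are not stored in test-data-flat.orc at all: copyFileFromResource places the file under a Hive-style partition directory derived from partSpec, and the format reconstructs those column values from the path, which is why every row can assert the same constants for them. Below is a rough sketch of what such a path looks like; partitionPath is a hypothetical helper written here for illustration, in the spirit of (but not identical to) PartitionPathUtils.generatePartitionPath, which also escapes special characters.

import java.util.LinkedHashMap;
import java.util.Map;

public class PartitionPathDemo {

    // Join the partition spec into Hive-style key=value path segments.
    static String partitionPath(LinkedHashMap<String, String> partSpec) {
        StringBuilder path = new StringBuilder();
        for (Map.Entry<String, String> e : partSpec.entrySet()) {
            path.append(e.getKey()).append('=').append(e.getValue()).append('/');
        }
        return path.toString();
    }

    public static void main(String[] args) {
        LinkedHashMap<String, String> partSpec = new LinkedHashMap<>();
        partSpec.put("f1", "1");
        partSpec.put("f3", "3");
        partSpec.put("f5", "f5");
        // prints f1=1/f3=3/f5=f5/ ; the data file then lives below this directory
        System.out.println(partitionPath(partSpec));
    }
}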

Example 33 with FileSourceSplit

Use of org.apache.flink.connector.file.src.FileSourceSplit in project flink by apache.

Class OrcColumnarRowInputFormatTest, method testReadFileAndRestore.

@Test
public void testReadFileAndRestore() throws IOException {
    OrcColumnarRowInputFormat<?, FileSourceSplit> format = createFormat(FLAT_FILE_TYPE, new int[] { 0, 1 });
    // pick a middle split
    FileSourceSplit split = createSplits(flatFile, 3).get(1);
    int expectedCnt = 660000;
    innerTestRestore(format, split, expectedCnt / 2, expectedCnt, 656700330000L);
}
Also used: FileSourceSplit(org.apache.flink.connector.file.src.FileSourceSplit) Test(org.junit.Test)
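
innerTestRestore (listed in full as Example 35 below) reads the split until breakCnt records have been consumed, remembers the offset and record-skip count reported with the last record, and then finishes the split through restoreReader; the final count and checksum must match an uninterrupted read. Below is a generic sketch of that resume contract, with hypothetical interface and method names rather than Flink's actual reader API.

import java.util.List;
import java.util.function.Consumer;

public class RestoreDemo {

    // Hypothetical stand-in for a batch-oriented reader that can seek.
    interface SeekableReader<T> {
        void seek(long offset); // position the reader at a batch start

        List<T> readBatch(); // next batch, or null at the end of the split
    }

    // Resume after a checkpoint: seek back to the offset of the batch that was
    // being consumed, then drop the records of it that were already emitted.
    static <T> void resume(
            SeekableReader<T> reader, long offset, long recordsToSkip, Consumer<T> consumer) {
        reader.seek(offset);
        long toSkip = recordsToSkip;
        List<T> batch;
        while ((batch = reader.readBatch()) != null) {
            for (T record : batch) {
                if (toSkip > 0) {
                    toSkip--; // consumed before the checkpoint, do not re-emit
                } else {
                    consumer.accept(record);
                }
            }
        }
    }
}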

Example 34 with FileSourceSplit

Use of org.apache.flink.connector.file.src.FileSourceSplit in project flink by apache.

Class OrcColumnarRowInputFormatTest, method testReadFileWithSelectFields.

@Test
public void testReadFileWithSelectFields() throws IOException {
    OrcColumnarRowInputFormat<?, FileSourceSplit> format = createFormat(FLAT_FILE_TYPE, new int[] { 2, 0, 1 });
    AtomicInteger cnt = new AtomicInteger(0);
    AtomicLong totalF0 = new AtomicLong(0);
    // read all splits
    for (FileSourceSplit split : createSplits(flatFile, 4)) {
        forEach(format, split, row -> {
            Assert.assertFalse(row.isNullAt(0));
            Assert.assertFalse(row.isNullAt(1));
            Assert.assertFalse(row.isNullAt(2));
            Assert.assertNotNull(row.getString(0).toString());
            totalF0.addAndGet(row.getInt(1));
            Assert.assertNotNull(row.getString(2).toString());
            cnt.incrementAndGet();
        });
    }
    // check that all rows have been read
    assertEquals(1920800, cnt.get());
    assertEquals(1844737280400L, totalF0.get());
}
Also used: AtomicLong(java.util.concurrent.atomic.AtomicLong) FileSourceSplit(org.apache.flink.connector.file.src.FileSourceSplit) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) Test(org.junit.Test)
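
The projection {2, 0, 1} means that position i of every produced row holds file column projectedFields[i]: _col2 comes out first, then _col0, then _col1. That is why the int column _col0, whose values are summed into totalF0, is read with row.getInt(1). A tiny self-contained illustration of the remapping, using plain arrays rather than Flink's columnar row:

import java.util.Arrays;

public class ProjectionDemo {
    public static void main(String[] args) {
        // File columns: _col0 = 42 (int), _col1 = "a", _col2 = "b"
        Object[] fileRow = {42, "a", "b"};
        int[] projectedFields = {2, 0, 1};

        // Produced position i holds file column projectedFields[i].
        Object[] producedRow = new Object[projectedFields.length];
        for (int i = 0; i < projectedFields.length; i++) {
            producedRow[i] = fileRow[projectedFields[i]];
        }

        // prints [b, 42, a]: _col0 now sits at position 1, matching
        // the row.getInt(1) access in the test above
        System.out.println(Arrays.toString(producedRow));
    }
}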

Example 35 with FileSourceSplit

Use of org.apache.flink.connector.file.src.FileSourceSplit in project flink by apache.

Class OrcColumnarRowInputFormatTest, method innerTestRestore.

private void innerTestRestore(OrcColumnarRowInputFormat<?, FileSourceSplit> format, FileSourceSplit split, int breakCnt, int expectedCnt, long expectedTotalF0) throws IOException {
    AtomicInteger cnt = new AtomicInteger(0);
    AtomicLong totalF0 = new AtomicLong(0);
    Consumer<RowData> consumer = row -> {
        Assert.assertFalse(row.isNullAt(0));
        Assert.assertFalse(row.isNullAt(1));
        totalF0.addAndGet(row.getInt(0));
        assertNotNull(row.getString(1).toString());
        cnt.incrementAndGet();
    };
    // ---------- restore reading ---------------
    long offset = -1;
    long recordSkipCount = -1;
    try (BulkFormat.Reader<RowData> reader = createReader(format, split)) {
        while (cnt.get() < breakCnt) {
            BulkFormat.RecordIterator<RowData> batch = reader.readBatch();
            Assert.assertNotNull(batch);
            RecordAndPosition<RowData> record;
            while ((record = batch.next()) != null && cnt.get() < breakCnt) {
                consumer.accept(record.getRecord());
                offset = record.getOffset();
                recordSkipCount = record.getRecordSkipCount();
            }
            batch.releaseBatch();
        }
    }
    Utils.forEachRemaining(restoreReader(format, split, offset, recordSkipCount), consumer);
    // ---------- end restore reading ---------------
    // the results should be the same as:
    // forEach(format, split, consumer);
    // check that all rows have been read
    assertEquals(expectedCnt, cnt.get());
    assertEquals(expectedTotalF0, totalF0.get());
}
Also used: PartitionFieldExtractor(org.apache.flink.connector.file.table.PartitionFieldExtractor) DecimalDataUtils(org.apache.flink.table.data.DecimalDataUtils) Between(org.apache.flink.orc.OrcFilters.Between) RowType(org.apache.flink.table.types.logical.RowType) CheckpointedPosition(org.apache.flink.connector.file.src.util.CheckpointedPosition) ArrayList(java.util.ArrayList) LinkedHashMap(java.util.LinkedHashMap) Equals(org.apache.flink.orc.OrcFilters.Equals) BigDecimal(java.math.BigDecimal) DecimalType(org.apache.flink.table.types.logical.DecimalType) FileSourceSplit(org.apache.flink.connector.file.src.FileSourceSplit) Path(org.apache.flink.core.fs.Path) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) Predicate(org.apache.flink.orc.OrcFilters.Predicate) Configuration(org.apache.hadoop.conf.Configuration) Or(org.apache.flink.orc.OrcFilters.Or) ClassRule(org.junit.ClassRule) Utils(org.apache.flink.connector.file.src.util.Utils) FileStatus(org.apache.flink.core.fs.FileStatus) RowData(org.apache.flink.table.data.RowData) IOUtils(org.apache.flink.util.IOUtils) Assert.assertNotNull(org.junit.Assert.assertNotNull) FileOutputStream(java.io.FileOutputStream) DataTypes(org.apache.flink.table.api.DataTypes) Test(org.junit.Test) IOException(java.io.IOException) UUID(java.util.UUID) File(java.io.File) PredicateLeaf(org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf) Consumer(java.util.function.Consumer) AtomicLong(java.util.concurrent.atomic.AtomicLong) List(java.util.List) InternalTypeInfo(org.apache.flink.table.runtime.typeutils.InternalTypeInfo) LogicalType(org.apache.flink.table.types.logical.LogicalType) RecordAndPosition(org.apache.flink.connector.file.src.util.RecordAndPosition) OrcShim(org.apache.flink.orc.shim.OrcShim) PartitionPathUtils.generatePartitionPath(org.apache.flink.table.utils.PartitionPathUtils.generatePartitionPath) BulkFormat(org.apache.flink.connector.file.src.reader.BulkFormat) Assert(org.junit.Assert) Collections(java.util.Collections) TemporaryFolder(org.junit.rules.TemporaryFolder) Assert.assertEquals(org.junit.Assert.assertEquals)
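
A quick sanity check on the expected constants that recur in these tests: 1,844,737,280,400 is exactly 1,920,800 * 1,920,801 / 2, the sum 1 + 2 + ... + 1,920,800. This is consistent with a test file whose _col0 enumerates each value from 1 to the row count exactly once; that is an inference from the numbers, not something the tests state. The identity is easy to verify:

public class ExpectedTotalsCheck {
    public static void main(String[] args) {
        long rows = 1_920_800L; // row count asserted by the tests
        // Gauss sum 1..n = n * (n + 1) / 2; the product fits easily in a long
        long expectedSum = rows * (rows + 1) / 2;
        System.out.println(expectedSum == 1_844_737_280_400L); // prints true
    }
}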

Aggregations

FileSourceSplit (org.apache.flink.connector.file.src.FileSourceSplit): 50 usages
Test (org.junit.Test): 32 usages
Path (org.apache.flink.core.fs.Path): 20 usages
AtomicInteger (java.util.concurrent.atomic.AtomicInteger): 11 usages
BulkFormat (org.apache.flink.connector.file.src.reader.BulkFormat): 11 usages
Configuration (org.apache.flink.configuration.Configuration): 10 usages
ArrayList (java.util.ArrayList): 9 usages
TestingSplitEnumeratorContext (org.apache.flink.connector.testutils.source.reader.TestingSplitEnumeratorContext): 7 usages
IOException (java.io.IOException): 6 usages
RowData (org.apache.flink.table.data.RowData): 6 usages
LogicalType (org.apache.flink.table.types.logical.LogicalType): 6 usages
LinkedHashMap (java.util.LinkedHashMap): 5 usages
TestingFileSystem (org.apache.flink.connector.file.src.testutils.TestingFileSystem): 5 usages
FileStatus (org.apache.flink.core.fs.FileStatus): 5 usages
AtomicLong (java.util.concurrent.atomic.AtomicLong): 4 usages
BigIntType (org.apache.flink.table.types.logical.BigIntType): 4 usages
DoubleType (org.apache.flink.table.types.logical.DoubleType): 4 usages
IntType (org.apache.flink.table.types.logical.IntType): 4 usages
SmallIntType (org.apache.flink.table.types.logical.SmallIntType): 4 usages
TinyIntType (org.apache.flink.table.types.logical.TinyIntType): 4 usages