
Example 6 with ContinuousFileMonitoringFunction

use of org.apache.flink.streaming.api.functions.source.ContinuousFileMonitoringFunction in project flink by apache.

From the class ContinuousFileProcessingTest, method testFilePathFiltering.

//// Monitoring Function Tests ////
@Test
public void testFilePathFiltering() throws Exception {
    String testBasePath = hdfsURI + "/" + UUID.randomUUID() + "/";
    Set<org.apache.hadoop.fs.Path> filesCreated = new HashSet<>();
    Set<String> filesKept = new TreeSet<>();
    // create the files to be discarded
    for (int i = 0; i < NO_OF_FILES; i++) {
        Tuple2<org.apache.hadoop.fs.Path, String> file = createFileAndFillWithData(testBasePath, "**file", i, "This is test line.");
        filesCreated.add(file.f0);
    }
    // create the files to be kept
    for (int i = 0; i < NO_OF_FILES; i++) {
        Tuple2<org.apache.hadoop.fs.Path, String> file = createFileAndFillWithData(testBasePath, "file", i, "This is test line.");
        filesCreated.add(file.f0);
        filesKept.add(file.f0.getName());
    }
    TextInputFormat format = new TextInputFormat(new Path(testBasePath));
    format.setFilesFilter(new FilePathFilter() {

        private static final long serialVersionUID = 2611449927338589804L;

        @Override
        public boolean filterPath(Path filePath) {
            // returning true excludes the path, so files prefixed with "**" are discarded
            return filePath.getName().startsWith("**");
        }
    });
    ContinuousFileMonitoringFunction<String> monitoringFunction = new ContinuousFileMonitoringFunction<>(format, FileProcessingMode.PROCESS_ONCE, 1, INTERVAL);
    final FileVerifyingSourceContext context = new FileVerifyingSourceContext(new OneShotLatch(), monitoringFunction);
    monitoringFunction.open(new Configuration());
    monitoringFunction.run(context);
    Assert.assertArrayEquals(filesKept.toArray(), context.getSeenFiles().toArray());
    // finally delete the files created for the test.
    for (org.apache.hadoop.fs.Path file : filesCreated) {
        hdfs.delete(file, false);
    }
}
Also used: Path (org.apache.flink.core.fs.Path), Configuration (org.apache.flink.configuration.Configuration), ContinuousFileMonitoringFunction (org.apache.flink.streaming.api.functions.source.ContinuousFileMonitoringFunction), TextInputFormat (org.apache.flink.api.java.io.TextInputFormat), TreeSet (java.util.TreeSet), FilePathFilter (org.apache.flink.api.common.io.FilePathFilter), OneShotLatch (org.apache.flink.core.testutils.OneShotLatch), HashSet (java.util.HashSet), Test (org.junit.Test)
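For context, the same filtering idea plugs into a regular job through StreamExecutionEnvironment.readFile(), which constructs a ContinuousFileMonitoringFunction internally. The following is a minimal sketch, not taken from the Flink test suite; the directory path and job name are placeholders.

import org.apache.flink.api.common.io.FilePathFilter;
import org.apache.flink.api.java.io.TextInputFormat;
import org.apache.flink.core.fs.Path;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.FileProcessingMode;

public class FilteredFileSourceJob {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // placeholder input directory
        String inputDir = "/tmp/input";
        TextInputFormat format = new TextInputFormat(new Path(inputDir));
        // same rule as the test: paths whose name starts with "**" are excluded
        format.setFilesFilter(new FilePathFilter() {
            @Override
            public boolean filterPath(Path filePath) {
                return filePath.getName().startsWith("**");
            }
        });

        // readFile() builds a ContinuousFileMonitoringFunction under the hood;
        // with PROCESS_ONCE the scan interval (last argument) is ignored
        DataStream<String> lines = env.readFile(format, inputDir, FileProcessingMode.PROCESS_ONCE, 100);
        lines.print();

        env.execute("filtered-file-source-sketch");
    }
}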

Example 7 with ContinuousFileMonitoringFunction

use of org.apache.flink.streaming.api.functions.source.ContinuousFileMonitoringFunction in project flink by apache.

From the class ContinuousFileProcessingTest, method testSortingOnModTime.

@Test
public void testSortingOnModTime() throws Exception {
    String testBasePath = hdfsURI + "/" + UUID.randomUUID() + "/";
    final long[] modTimes = new long[NO_OF_FILES];
    final org.apache.hadoop.fs.Path[] filesCreated = new org.apache.hadoop.fs.Path[NO_OF_FILES];
    for (int i = 0; i < NO_OF_FILES; i++) {
        Tuple2<org.apache.hadoop.fs.Path, String> file = createFileAndFillWithData(testBasePath, "file", i, "This is test line.");
        // sleep so that consecutive files get distinct modification times
        Thread.sleep(400);
        filesCreated[i] = file.f0;
        modTimes[i] = hdfs.getFileStatus(file.f0).getModificationTime();
    }
    TextInputFormat format = new TextInputFormat(new Path(testBasePath));
    format.setFilesFilter(FilePathFilter.createDefaultFilter());
    // this is just to verify that all splits have been forwarded later.
    FileInputSplit[] splits = format.createInputSplits(1);
    ContinuousFileMonitoringFunction<String> monitoringFunction = new ContinuousFileMonitoringFunction<>(format, FileProcessingMode.PROCESS_ONCE, 1, INTERVAL);
    ModTimeVerifyingSourceContext context = new ModTimeVerifyingSourceContext(modTimes);
    monitoringFunction.open(new Configuration());
    monitoringFunction.run(context);
    Assert.assertEquals(splits.length, context.getCounter());
    // delete the created files.
    for (int i = 0; i < NO_OF_FILES; i++) {
        hdfs.delete(filesCreated[i], false);
    }
}
Also used: Path (org.apache.flink.core.fs.Path), Configuration (org.apache.flink.configuration.Configuration), ContinuousFileMonitoringFunction (org.apache.flink.streaming.api.functions.source.ContinuousFileMonitoringFunction), FileInputSplit (org.apache.flink.core.fs.FileInputSplit), TimestampedFileInputSplit (org.apache.flink.streaming.api.functions.source.TimestampedFileInputSplit), TextInputFormat (org.apache.flink.api.java.io.TextInputFormat), Test (org.junit.Test)
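The ModTimeVerifyingSourceContext helper is not shown in this excerpt. Below is a hypothetical minimal version, assuming it only verifies that splits arrive in non-decreasing modification-time order and counts them; the class name and fields are placeholders, not the actual test helper. Note that older Flink versions may not declare markAsTemporarilyIdle() on SourceContext.

import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.streaming.api.functions.source.TimestampedFileInputSplit;
import org.apache.flink.streaming.api.watermark.Watermark;
import org.junit.Assert;

class ModTimeCheckingContext implements SourceFunction.SourceContext<TimestampedFileInputSplit> {

    private final Object lock = new Object();
    // expected modification times, kept for reference (hypothetical field)
    private final long[] modTimes;
    private long lastSeenModTime = Long.MIN_VALUE;
    private int counter = 0;

    ModTimeCheckingContext(long[] modTimes) {
        this.modTimes = modTimes;
    }

    int getCounter() {
        return counter;
    }

    @Override
    public void collect(TimestampedFileInputSplit split) {
        // splits must be emitted in non-decreasing modification-time order
        Assert.assertTrue("splits are out of modification-time order",
                split.getModificationTime() >= lastSeenModTime);
        lastSeenModTime = split.getModificationTime();
        counter++;
    }

    @Override
    public void collectWithTimestamp(TimestampedFileInputSplit split, long timestamp) {
        collect(split);
    }

    @Override
    public void emitWatermark(Watermark mark) {
        // not relevant for this check
    }

    @Override
    public void markAsTemporarilyIdle() {
        // not relevant for this check
    }

    @Override
    public Object getCheckpointLock() {
        return lock;
    }

    @Override
    public void close() {
    }
}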

Example 8 with ContinuousFileMonitoringFunction

use of org.apache.flink.streaming.api.functions.source.ContinuousFileMonitoringFunction in project flink by apache.

From the class ContinuousFileProcessingTest, method testInvalidPathSpecification.

@Test
public void testInvalidPathSpecification() throws Exception {
    String invalidPath = "hdfs://" + hdfsCluster.getURI().getHost() + ":" + hdfsCluster.getNameNodePort() + "/invalid/";
    TextInputFormat format = new TextInputFormat(new Path(invalidPath));
    ContinuousFileMonitoringFunction<String> monitoringFunction = new ContinuousFileMonitoringFunction<>(format, FileProcessingMode.PROCESS_ONCE, 1, INTERVAL);
    try {
        monitoringFunction.run(new DummySourceContext() {

            @Override
            public void collect(TimestampedFileInputSplit element) {
                // we should never arrive here with an invalid path
                Assert.fail("Test passes with an invalid path.");
            }
        });
        // we should never arrive here with an invalid path
        Assert.fail("Test passed with an invalid path.");
    } catch (FileNotFoundException e) {
        Assert.assertEquals("The provided file path " + format.getFilePath() + " does not exist.", e.getMessage());
    }
}
Also used: Path (org.apache.flink.core.fs.Path), TextInputFormat (org.apache.flink.api.java.io.TextInputFormat), TimestampedFileInputSplit (org.apache.flink.streaming.api.functions.source.TimestampedFileInputSplit), FileNotFoundException (java.io.FileNotFoundException), ContinuousFileMonitoringFunction (org.apache.flink.streaming.api.functions.source.ContinuousFileMonitoringFunction), Test (org.junit.Test)
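The FileNotFoundException asserted above comes from an existence check inside the monitoring function. The check is roughly of the following shape; this is a paraphrase for illustration, not the verbatim Flink source.

import java.io.FileNotFoundException;
import java.io.IOException;

import org.apache.flink.core.fs.FileSystem;
import org.apache.flink.core.fs.Path;

class PathCheckSketch {

    // produces the same message the test asserts on
    static void checkPathExists(Path path) throws IOException {
        FileSystem fs = path.getFileSystem();
        if (!fs.exists(path)) {
            throw new FileNotFoundException("The provided file path " + path + " does not exist.");
        }
    }
}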

Example 9 with ContinuousFileMonitoringFunction

use of org.apache.flink.streaming.api.functions.source.ContinuousFileMonitoringFunction in project flink by apache.

From the class ContinuousFileProcessingTest, method testProcessContinuously.

@Test
public void testProcessContinuously() throws Exception {
    String testBasePath = hdfsURI + "/" + UUID.randomUUID() + "/";
    final OneShotLatch latch = new OneShotLatch();
    // create a single file in the directory
    Tuple2<org.apache.hadoop.fs.Path, String> bootstrap = createFileAndFillWithData(testBasePath, "file", NO_OF_FILES + 1, "This is test line.");
    Assert.assertTrue(hdfs.exists(bootstrap.f0));
    final Set<String> filesToBeRead = new TreeSet<>();
    filesToBeRead.add(bootstrap.f0.getName());
    TextInputFormat format = new TextInputFormat(new Path(testBasePath));
    format.setFilesFilter(FilePathFilter.createDefaultFilter());
    final ContinuousFileMonitoringFunction<String> monitoringFunction = new ContinuousFileMonitoringFunction<>(format, FileProcessingMode.PROCESS_CONTINUOUSLY, 1, INTERVAL);
    // 1 for the bootstrap + NO_OF_FILES
    final int totalNoOfFilesToBeRead = NO_OF_FILES + 1;
    final FileVerifyingSourceContext context = new FileVerifyingSourceContext(latch, monitoringFunction, 1, totalNoOfFilesToBeRead);
    final Thread t = new Thread() {

        @Override
        public void run() {
            try {
                monitoringFunction.open(new Configuration());
                monitoringFunction.run(context);
            } catch (Exception e) {
                Assert.fail(e.getMessage());
            }
        }
    };
    t.start();
    if (!latch.isTriggered()) {
        latch.await();
    }
    // create some additional files that will be processed in the case of PROCESS_CONTINUOUSLY
    final org.apache.hadoop.fs.Path[] filesCreated = new org.apache.hadoop.fs.Path[NO_OF_FILES];
    for (int i = 0; i < NO_OF_FILES; i++) {
        Tuple2<org.apache.hadoop.fs.Path, String> file = createFileAndFillWithData(testBasePath, "file", i, "This is test line.");
        filesCreated[i] = file.f0;
        filesToBeRead.add(file.f0.getName());
    }
    // wait until the monitoring thread exits
    t.join();
    Assert.assertArrayEquals(filesToBeRead.toArray(), context.getSeenFiles().toArray());
    // finally delete the files created for the test.
    hdfs.delete(bootstrap.f0, false);
    for (org.apache.hadoop.fs.Path path : filesCreated) {
        hdfs.delete(path, false);
    }
}
Also used: Path (org.apache.flink.core.fs.Path), Configuration (org.apache.flink.configuration.Configuration), ContinuousFileMonitoringFunction (org.apache.flink.streaming.api.functions.source.ContinuousFileMonitoringFunction), IOException (java.io.IOException), FileNotFoundException (java.io.FileNotFoundException), TextInputFormat (org.apache.flink.api.java.io.TextInputFormat), TreeSet (java.util.TreeSet), OneShotLatch (org.apache.flink.core.testutils.OneShotLatch), Test (org.junit.Test)
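The OneShotLatch handshake in this test is worth isolating: the source thread triggers the latch after seeing the bootstrap file, and only then does the main thread create the remaining files, guaranteeing they appear while the function is already running in PROCESS_CONTINUOUSLY mode. A stripped-down sketch of that pattern follows; the class name and thread bodies are placeholders.

import org.apache.flink.core.testutils.OneShotLatch;

public class LatchHandshakeSketch {

    public static void main(String[] args) throws Exception {
        final OneShotLatch latch = new OneShotLatch();

        Thread worker = new Thread(() -> {
            // stand-in for: the monitoring function processed the bootstrap file
            latch.trigger();
            // stand-in for: keep monitoring for files created afterwards
        });
        worker.start();

        // mirrors the test: only block if the worker has not triggered yet
        if (!latch.isTriggered()) {
            latch.await();
        }
        // safe to create the additional files here

        worker.join();
    }
}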

Aggregations

TextInputFormat (org.apache.flink.api.java.io.TextInputFormat): 9
Path (org.apache.flink.core.fs.Path): 9
ContinuousFileMonitoringFunction (org.apache.flink.streaming.api.functions.source.ContinuousFileMonitoringFunction): 9
Test (org.junit.Test): 8
Configuration (org.apache.flink.configuration.Configuration): 5
OneShotLatch (org.apache.flink.core.testutils.OneShotLatch): 5
TimestampedFileInputSplit (org.apache.flink.streaming.api.functions.source.TimestampedFileInputSplit): 5
TreeSet (java.util.TreeSet): 4
FileNotFoundException (java.io.FileNotFoundException): 3
IOException (java.io.IOException): 3
HashSet (java.util.HashSet): 2
StreamSource (org.apache.flink.streaming.api.operators.StreamSource): 2
AbstractStreamOperatorTestHarness (org.apache.flink.streaming.util.AbstractStreamOperatorTestHarness): 2
FilePathFilter (org.apache.flink.api.common.io.FilePathFilter): 1
FileInputSplit (org.apache.flink.core.fs.FileInputSplit): 1
StreamExecutionEnvironment (org.apache.flink.streaming.api.environment.StreamExecutionEnvironment): 1
ContinuousFileReaderOperator (org.apache.flink.streaming.api.functions.source.ContinuousFileReaderOperator): 1
OperatorStateHandles (org.apache.flink.streaming.runtime.tasks.OperatorStateHandles): 1