Search in sources :

Example 1 with ContinuousFileMonitoringFunction

use of org.apache.flink.streaming.api.functions.source.ContinuousFileMonitoringFunction in project flink by apache.

the class ContinuousFileProcessingITCase method testProgram.

//						END OF PREPARATIONS
/**
 * Runs a continuous file-monitoring job (monitor source -> split reader -> testing sink)
 * on a background thread while this thread creates {@code NO_OF_FILES} files for it to read.
 *
 * <p>The job is expected to terminate with an exception whose cause chain contains a
 * {@code SuccessException}; in that case {@code postSubmit()} is invoked. Any other job
 * exception is recorded and re-thrown on the calling thread once the job thread has
 * finished, so that it actually fails the test.
 *
 * @throws Exception if creating the test files fails or the wait for the job is interrupted
 */
@Override
protected void testProgram() throws Exception {
    /*
     * This test checks the interplay between the monitor and the reader
     * and also the failExternally() functionality. The sink, which throws the
     * SuccessException to signal the end of the test, runs with parallelism 1.
     * NOTE(review): the original comment stated that the parallelism is set to 1 to get
     * chaining between the sink and the reader, while the reader is asserted below to run
     * with PARALLELISM - confirm which of the two is intended.
     */
    TextInputFormat format = new TextInputFormat(new Path(hdfsURI));
    format.setFilePath(hdfsURI);
    format.setFilesFilter(FilePathFilter.createDefaultFilter());
    // create the stream execution environment; PARALLELISM becomes the job-wide default
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(PARALLELISM);
    ContinuousFileMonitoringFunction<String> monitoringFunction = new ContinuousFileMonitoringFunction<>(format, FileProcessingMode.PROCESS_CONTINUOUSLY, env.getParallelism(), INTERVAL);
    // the monitor has always DOP 1
    DataStream<TimestampedFileInputSplit> splits = env.addSource(monitoringFunction);
    Assert.assertEquals(1, splits.getParallelism());
    ContinuousFileReaderOperator<String> reader = new ContinuousFileReaderOperator<>(format);
    TypeInformation<String> typeInfo = TypeExtractor.getInputFormatTypes(format);
    // the readers can be multiple
    DataStream<String> content = splits.transform("FileSplitReader", typeInfo, reader);
    Assert.assertEquals(PARALLELISM, content.getParallelism());
    // finally for the sink we set the parallelism to 1 so that we can verify the output
    TestingSinkFunction sink = new TestingSinkFunction();
    content.addSink(sink).setParallelism(1);
    // An assertion failure raised on the job thread would only terminate that thread and
    // would never reach the test runner. The failure is therefore stored in this holder
    // and re-thrown on the test thread after job.join().
    final Throwable[] jobFailure = new Throwable[1];
    Thread job = new Thread() {

        @Override
        public void run() {
            try {
                env.execute("ContinuousFileProcessingITCase Job.");
            } catch (Exception e) {
                // walk the cause chain (bounded to 20 levels) looking for the success marker
                Throwable th = e;
                for (int depth = 0; depth < 20; depth++) {
                    if (th instanceof SuccessException) {
                        try {
                            postSubmit();
                        } catch (Exception e1) {
                            e1.printStackTrace();
                        }
                        return;
                    } else if (th.getCause() != null) {
                        th = th.getCause();
                    } else {
                        break;
                    }
                }
                // no SuccessException found: this is a genuine job failure
                e.printStackTrace();
                jobFailure[0] = e;
            }
        }
    };
    job.start();
    // The modification time of the last created file.
    long lastCreatedModTime = Long.MIN_VALUE;
    // create the files to be read
    for (int i = 0; i < NO_OF_FILES; i++) {
        Tuple2<org.apache.hadoop.fs.Path, String> tmpFile;
        long modTime;
        do {
            // give it some time so that the files have
            // different modification timestamps.
            Thread.sleep(50);
            tmpFile = fillWithData(hdfsURI, "file", i, "This is test line.");
            modTime = hdfs.getFileStatus(tmpFile.f0).getModificationTime();
            if (modTime <= lastCreatedModTime) {
                // delete the last created file to recreate it with a different timestamp
                hdfs.delete(tmpFile.f0, false);
            }
        } while (modTime <= lastCreatedModTime);
        lastCreatedModTime = modTime;
        // put the contents in the expected results list before the reader picks them
        // this is to guarantee that they are in before the reader finishes (avoid race conditions)
        expectedContents.put(i, tmpFile.f1);
        org.apache.hadoop.fs.Path file = new org.apache.hadoop.fs.Path(hdfsURI + "/file" + i);
        hdfs.rename(tmpFile.f0, file);
        Assert.assertTrue(hdfs.exists(file));
    }
    // wait for the job to finish.
    job.join();
    // join() makes the write to jobFailure[0] visible here; surface it on the test thread
    if (jobFailure[0] != null) {
        throw new AssertionError("The job failed without a SuccessException: " + jobFailure[0].getMessage(), jobFailure[0]);
    }
}
Also used : TimestampedFileInputSplit(org.apache.flink.streaming.api.functions.source.TimestampedFileInputSplit) Path(org.apache.flink.core.fs.Path) ContinuousFileMonitoringFunction(org.apache.flink.streaming.api.functions.source.ContinuousFileMonitoringFunction) IOException(java.io.IOException) TextInputFormat(org.apache.flink.api.java.io.TextInputFormat) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) ContinuousFileReaderOperator(org.apache.flink.streaming.api.functions.source.ContinuousFileReaderOperator)

Example 2 with ContinuousFileMonitoringFunction

use of org.apache.flink.streaming.api.functions.source.ContinuousFileMonitoringFunction in project flink by apache.

the class ContinuousFileProcessingMigrationTest method testFunctionRestore.

////				Monitoring Function Tests				//////
/**
 * Restores a {@code ContinuousFileMonitoringFunction} from a checkpoint file stored with the
 * test resources and checks that the restored function reports the expected global
 * modification time.
 *
 * @throws Exception if setting up, restoring or opening the test harness fails
 */
@Test
public void testFunctionRestore() throws Exception {
    /*
		Disabled code kept for reference. It writes a snapshot file named
		"monitoring-function-migration-test-<fileModTime>-flink1.1-snapshot", which matches the
		naming of the resource loaded below (presumably it produced that resource - confirm).

		org.apache.hadoop.fs.Path path = null;
		long fileModTime = Long.MIN_VALUE;
		for (int i = 0; i < 1; i++) {
			Tuple2<org.apache.hadoop.fs.Path, String> file = fillWithData(hdfsURI, "file", i, "This is test line.");
			path = file.f0;
			fileModTime = hdfs.getFileStatus(file.f0).getModificationTime();
		}

		TextInputFormat format = new TextInputFormat(new Path(hdfsURI));

		final ContinuousFileMonitoringFunction<String> monitoringFunction =
			new ContinuousFileMonitoringFunction<>(format, format.getFilePath().toString(), new PathFilter(), FileProcessingMode.PROCESS_CONTINUOUSLY, 1, INTERVAL);

		StreamSource<FileInputSplit, ContinuousFileMonitoringFunction<String>> src =
			new StreamSource<>(monitoringFunction);

		final OneInputStreamOperatorTestHarness<Void, FileInputSplit> testHarness =
			new OneInputStreamOperatorTestHarness<>(src);
		testHarness.open();

		final Throwable[] error = new Throwable[1];

		final OneShotLatch latch = new OneShotLatch();

		// run the source asynchronously
		Thread runner = new Thread() {
			@Override
			public void run() {
				try {
					monitoringFunction.run(new DummySourceContext() {
						@Override
						public void collect(FileInputSplit element) {
							latch.trigger();
						}
					});
				}
				catch (Throwable t) {
					t.printStackTrace();
					error[0] = t;
				}
			}
		};
		runner.start();

		if (!latch.isTriggered()) {
			latch.await();
		}

		StreamTaskState snapshot = testHarness.snapshot(0, 0);
		testHarness.snaphotToFile(snapshot, "src/test/resources/monitoring-function-migration-test-" + fileModTime +"-flink1.1-snapshot");
		monitoringFunction.cancel();
		runner.join();

		testHarness.close();
		*/
    // the expected value is the same number that appears in the snapshot's file name
    final long expectedModTime = 1482144479339L;
    final String legacySnapshot = "monitoring-function-migration-test-1482144479339-flink1.1-snapshot";

    final TextInputFormat inputFormat = new TextInputFormat(new Path(hdfsURI));
    final ContinuousFileMonitoringFunction<String> monitoringFunction =
        new ContinuousFileMonitoringFunction<>(inputFormat, FileProcessingMode.PROCESS_CONTINUOUSLY, 1, INTERVAL);
    final StreamSource<TimestampedFileInputSplit, ContinuousFileMonitoringFunction<String>> sourceOperator =
        new StreamSource<>(monitoringFunction);
    final AbstractStreamOperatorTestHarness<TimestampedFileInputSplit> harness =
        new AbstractStreamOperatorTestHarness<>(sourceOperator, 1, 1, 0);

    // the harness is set up, fed the legacy checkpoint and opened, in that order
    harness.setup();
    harness.initializeStateFromLegacyCheckpoint(getResourceFilename(legacySnapshot));
    harness.open();

    Assert.assertEquals(expectedModTime, monitoringFunction.getGlobalModificationTime());
}
Also used : Path(org.apache.flink.core.fs.Path) TextInputFormat(org.apache.flink.api.java.io.TextInputFormat) TimestampedFileInputSplit(org.apache.flink.streaming.api.functions.source.TimestampedFileInputSplit) StreamSource(org.apache.flink.streaming.api.operators.StreamSource) ContinuousFileMonitoringFunction(org.apache.flink.streaming.api.functions.source.ContinuousFileMonitoringFunction) AbstractStreamOperatorTestHarness(org.apache.flink.streaming.util.AbstractStreamOperatorTestHarness) Test(org.junit.Test)

Example 3 with ContinuousFileMonitoringFunction

use of org.apache.flink.streaming.api.functions.source.ContinuousFileMonitoringFunction in project flink by apache.

the class ContinuousFileProcessingTest method testProcessOnce.

/**
 * Checks that a monitoring function in {@code PROCESS_ONCE} mode reports only the file that
 * existed when it started, and ignores the files created afterwards.
 *
 * <p>The function runs on a separate thread. Anything thrown there is recorded and re-thrown
 * on the test thread, because an assertion failure on the helper thread alone would neither
 * fail the test nor release the latch this thread waits on.
 *
 * @throws Exception if file-system access fails or waiting on the helper thread is interrupted
 */
@Test
public void testProcessOnce() throws Exception {
    String testBasePath = hdfsURI + "/" + UUID.randomUUID() + "/";
    final OneShotLatch latch = new OneShotLatch();
    // create a single file in the directory
    Tuple2<org.apache.hadoop.fs.Path, String> bootstrap = createFileAndFillWithData(testBasePath, "file", NO_OF_FILES + 1, "This is test line.");
    Assert.assertTrue(hdfs.exists(bootstrap.f0));
    // the source is supposed to read only this file.
    final Set<String> filesToBeRead = new TreeSet<>();
    filesToBeRead.add(bootstrap.f0.getName());
    TextInputFormat format = new TextInputFormat(new Path(testBasePath));
    format.setFilesFilter(FilePathFilter.createDefaultFilter());
    final ContinuousFileMonitoringFunction<String> monitoringFunction = new ContinuousFileMonitoringFunction<>(format, FileProcessingMode.PROCESS_ONCE, 1, INTERVAL);
    final FileVerifyingSourceContext context = new FileVerifyingSourceContext(latch, monitoringFunction);
    // holder for whatever the monitoring thread throws; read after t.join()
    final Throwable[] error = new Throwable[1];
    final Thread t = new Thread() {

        @Override
        public void run() {
            try {
                monitoringFunction.open(new Configuration());
                monitoringFunction.run(context);
                // we would never arrive here if we were in
                // PROCESS_CONTINUOUSLY mode.
                // this will trigger the latch
                context.close();
            } catch (Throwable failure) {
                error[0] = failure;
                // context.close() was skipped, so release the waiting test thread explicitly;
                // otherwise latch.await() below could block forever
                latch.trigger();
            }
        }
    };
    t.start();
    if (!latch.isTriggered()) {
        latch.await();
    }
    // create some additional files that should be processed in the case of PROCESS_CONTINUOUSLY
    final org.apache.hadoop.fs.Path[] filesCreated = new org.apache.hadoop.fs.Path[NO_OF_FILES];
    for (int i = 0; i < NO_OF_FILES; i++) {
        Tuple2<org.apache.hadoop.fs.Path, String> ignoredFile = createFileAndFillWithData(testBasePath, "file", i, "This is test line.");
        filesCreated[i] = ignoredFile.f0;
    }
    // wait until the monitoring thread exits
    t.join();
    // report a failure of the monitoring thread on the test thread, keeping the cause
    if (error[0] != null) {
        throw new AssertionError("The monitoring function failed: " + error[0].getMessage(), error[0]);
    }
    Assert.assertArrayEquals(filesToBeRead.toArray(), context.getSeenFiles().toArray());
    // finally delete the files created for the test.
    hdfs.delete(bootstrap.f0, false);
    for (org.apache.hadoop.fs.Path path : filesCreated) {
        hdfs.delete(path, false);
    }
}
Also used : Path(org.apache.flink.core.fs.Path) Configuration(org.apache.flink.configuration.Configuration) ContinuousFileMonitoringFunction(org.apache.flink.streaming.api.functions.source.ContinuousFileMonitoringFunction) IOException(java.io.IOException) FileNotFoundException(java.io.FileNotFoundException) TextInputFormat(org.apache.flink.api.java.io.TextInputFormat) TreeSet(java.util.TreeSet) OneShotLatch(org.apache.flink.core.testutils.OneShotLatch) Test(org.junit.Test)

Example 4 with ContinuousFileMonitoringFunction

use of org.apache.flink.streaming.api.functions.source.ContinuousFileMonitoringFunction in project flink by apache.

the class ContinuousFileProcessingTest method testFunctionRestore.

/**
 * Takes a snapshot of a running monitoring function, restores a second instance from it and
 * checks that the restored instance reports the modification time of the single file that
 * was created for the test.
 *
 * @throws Exception if file-system access, the test harness or thread coordination fails
 */
@Test
public void testFunctionRestore() throws Exception {
    final String testBasePath = hdfsURI + "/" + UUID.randomUUID() + "/";

    // exactly one input file (index 0) is created; its modification time is the expected state
    final Tuple2<org.apache.hadoop.fs.Path, String> createdFile =
        createFileAndFillWithData(testBasePath, "file", 0, "This is test line.");
    final org.apache.hadoop.fs.Path createdPath = createdFile.f0;
    final long createdModTime = hdfs.getFileStatus(createdPath).getModificationTime();

    final TextInputFormat inputFormat = new TextInputFormat(new Path(testBasePath));

    // ---- first instance: run it, wait for a split, snapshot it ----
    final ContinuousFileMonitoringFunction<String> originalFunction =
        new ContinuousFileMonitoringFunction<>(inputFormat, FileProcessingMode.PROCESS_CONTINUOUSLY, 1, INTERVAL);
    final StreamSource<TimestampedFileInputSplit, ContinuousFileMonitoringFunction<String>> originalSource =
        new StreamSource<>(originalFunction);
    final AbstractStreamOperatorTestHarness<TimestampedFileInputSplit> originalHarness =
        new AbstractStreamOperatorTestHarness<>(originalSource, 1, 1, 0);
    originalHarness.open();

    final Throwable[] runnerFailure = new Throwable[1];
    final OneShotLatch splitEmitted = new OneShotLatch();

    final DummySourceContext sourceContext = new DummySourceContext() {

        @Override
        public void collect(TimestampedFileInputSplit element) {
            // any emitted split is enough to release the waiting test thread
            splitEmitted.trigger();
        }
    };

    // the source runs on its own thread; whatever it throws is kept for the final check
    final Thread runner = new Thread() {

        @Override
        public void run() {
            try {
                originalFunction.run(sourceContext);
            } catch (Throwable failure) {
                failure.printStackTrace();
                runnerFailure[0] = failure;
            }
        }
    };
    runner.start();

    // condition 1 for the source state to be up to date: at least one element was emitted
    if (!splitEmitted.isTriggered()) {
        splitEmitted.await();
    }

    // condition 2: being able to take the checkpoint lock. The original comment states this
    // means all splits were processed and the state was updated; the empty block is intentional.
    synchronized (sourceContext.getCheckpointLock()) {
    }

    final OperatorStateHandles stateSnapshot = originalHarness.snapshot(0, 0);
    originalFunction.cancel();
    runner.join();
    originalHarness.close();

    // ---- second instance: restore from the snapshot and verify ----
    final ContinuousFileMonitoringFunction<String> restoredFunction =
        new ContinuousFileMonitoringFunction<>(inputFormat, FileProcessingMode.PROCESS_CONTINUOUSLY, 1, INTERVAL);
    final StreamSource<TimestampedFileInputSplit, ContinuousFileMonitoringFunction<String>> restoredSource =
        new StreamSource<>(restoredFunction);
    final AbstractStreamOperatorTestHarness<TimestampedFileInputSplit> restoredHarness =
        new AbstractStreamOperatorTestHarness<>(restoredSource, 1, 1, 0);
    restoredHarness.initializeState(stateSnapshot);
    restoredHarness.open();

    Assert.assertNull(runnerFailure[0]);
    Assert.assertEquals(createdModTime, restoredFunction.getGlobalModificationTime());

    hdfs.delete(createdPath, false);
}
Also used : Path(org.apache.flink.core.fs.Path) TimestampedFileInputSplit(org.apache.flink.streaming.api.functions.source.TimestampedFileInputSplit) StreamSource(org.apache.flink.streaming.api.operators.StreamSource) ContinuousFileMonitoringFunction(org.apache.flink.streaming.api.functions.source.ContinuousFileMonitoringFunction) AbstractStreamOperatorTestHarness(org.apache.flink.streaming.util.AbstractStreamOperatorTestHarness) OperatorStateHandles(org.apache.flink.streaming.runtime.tasks.OperatorStateHandles) TextInputFormat(org.apache.flink.api.java.io.TextInputFormat) OneShotLatch(org.apache.flink.core.testutils.OneShotLatch) Test(org.junit.Test)

Example 5 with ContinuousFileMonitoringFunction

use of org.apache.flink.streaming.api.functions.source.ContinuousFileMonitoringFunction in project flink by apache.

the class ContinuousFileProcessingTest method testNestedFilesProcessing.

/**
 * Creates files in a base directory and in two nested sub-directories, runs a
 * {@code PROCESS_ONCE} monitoring function with nested file enumeration enabled, and checks
 * that the names of the files seen by the source context match the names of all created files.
 *
 * @throws Exception if file-system access or the monitoring function fails
 */
@Test
public void testNestedFilesProcessing() throws Exception {
    final String testBasePath = hdfsURI + "/" + UUID.randomUUID() + "/";
    final Set<org.apache.hadoop.fs.Path> filesCreated = new HashSet<>();
    final Set<String> filesToBeRead = new TreeSet<>();

    // two nested directories below the base path
    final org.apache.hadoop.fs.Path firstLevelDir =
        new org.apache.hadoop.fs.Path(testBasePath + "/" + "firstLevelDir");
    final org.apache.hadoop.fs.Path secondLevelDir =
        new org.apache.hadoop.fs.Path(testBasePath + "/" + "firstLevelDir" + "/" + "secondLevelDir");
    Assert.assertFalse(hdfs.exists(firstLevelDir));
    hdfs.mkdirs(firstLevelDir);
    hdfs.mkdirs(secondLevelDir);

    // NO_OF_FILES files per level: base dir first, then the first and the second level dir
    final String[] parentDirs = {testBasePath, firstLevelDir.toString(), secondLevelDir.toString()};
    final String[] filePrefixes = {"firstLevelFile", "secondLevelFile", "thirdLevelFile"};
    for (int level = 0; level < parentDirs.length; level++) {
        for (int fileIdx = 0; fileIdx < NO_OF_FILES; fileIdx++) {
            final Tuple2<org.apache.hadoop.fs.Path, String> created =
                createFileAndFillWithData(parentDirs[level], filePrefixes[level], fileIdx, "This is test line.");
            filesCreated.add(created.f0);
            filesToBeRead.add(created.f0.getName());
        }
    }

    final TextInputFormat inputFormat = new TextInputFormat(new Path(testBasePath));
    inputFormat.setFilesFilter(FilePathFilter.createDefaultFilter());
    inputFormat.setNestedFileEnumeration(true);

    final ContinuousFileMonitoringFunction<String> monitoringFunction =
        new ContinuousFileMonitoringFunction<>(inputFormat, FileProcessingMode.PROCESS_ONCE, 1, INTERVAL);
    final FileVerifyingSourceContext context =
        new FileVerifyingSourceContext(new OneShotLatch(), monitoringFunction);

    // the function is run directly on the test thread
    monitoringFunction.open(new Configuration());
    monitoringFunction.run(context);

    final Object[] expectedNames = filesToBeRead.toArray();
    final Object[] seenNames = context.getSeenFiles().toArray();
    Assert.assertArrayEquals(expectedNames, seenNames);

    // clean up: files first, then the directories from the innermost outwards
    for (org.apache.hadoop.fs.Path created : filesCreated) {
        hdfs.delete(created, false);
    }
    hdfs.delete(secondLevelDir, false);
    hdfs.delete(firstLevelDir, false);
}
Also used : Path(org.apache.flink.core.fs.Path) Configuration(org.apache.flink.configuration.Configuration) ContinuousFileMonitoringFunction(org.apache.flink.streaming.api.functions.source.ContinuousFileMonitoringFunction) TextInputFormat(org.apache.flink.api.java.io.TextInputFormat) TreeSet(java.util.TreeSet) OneShotLatch(org.apache.flink.core.testutils.OneShotLatch) HashSet(java.util.HashSet) Test(org.junit.Test)

Aggregations

TextInputFormat (org.apache.flink.api.java.io.TextInputFormat)9 Path (org.apache.flink.core.fs.Path)9 ContinuousFileMonitoringFunction (org.apache.flink.streaming.api.functions.source.ContinuousFileMonitoringFunction)9 Test (org.junit.Test)8 Configuration (org.apache.flink.configuration.Configuration)5 OneShotLatch (org.apache.flink.core.testutils.OneShotLatch)5 TimestampedFileInputSplit (org.apache.flink.streaming.api.functions.source.TimestampedFileInputSplit)5 TreeSet (java.util.TreeSet)4 FileNotFoundException (java.io.FileNotFoundException)3 IOException (java.io.IOException)3 HashSet (java.util.HashSet)2 StreamSource (org.apache.flink.streaming.api.operators.StreamSource)2 AbstractStreamOperatorTestHarness (org.apache.flink.streaming.util.AbstractStreamOperatorTestHarness)2 FilePathFilter (org.apache.flink.api.common.io.FilePathFilter)1 FileInputSplit (org.apache.flink.core.fs.FileInputSplit)1 StreamExecutionEnvironment (org.apache.flink.streaming.api.environment.StreamExecutionEnvironment)1 ContinuousFileReaderOperator (org.apache.flink.streaming.api.functions.source.ContinuousFileReaderOperator)1 OperatorStateHandles (org.apache.flink.streaming.runtime.tasks.OperatorStateHandles)1