Use of org.apache.flink.api.java.io.TextInputFormat in project flink by apache.
From the class ContinuousFileProcessingTest, method testFunctionRestore.
@Test
public void testFunctionRestore() throws Exception {
    String testBasePath = hdfsURI + "/" + UUID.randomUUID() + "/";

    org.apache.hadoop.fs.Path path = null;
    long fileModTime = Long.MIN_VALUE;
    for (int i = 0; i < 1; i++) {
        Tuple2<org.apache.hadoop.fs.Path, String> file =
            createFileAndFillWithData(testBasePath, "file", i, "This is test line.");
        path = file.f0;
        fileModTime = hdfs.getFileStatus(file.f0).getModificationTime();
    }

    TextInputFormat format = new TextInputFormat(new Path(testBasePath));

    final ContinuousFileMonitoringFunction<String> monitoringFunction =
        createTestContinuousFileMonitoringFunction(format, FileProcessingMode.PROCESS_CONTINUOUSLY);

    StreamSource<TimestampedFileInputSplit, ContinuousFileMonitoringFunction<String>> src =
        new StreamSource<>(monitoringFunction);

    final AbstractStreamOperatorTestHarness<TimestampedFileInputSplit> testHarness =
        new AbstractStreamOperatorTestHarness<>(src, 1, 1, 0);
    testHarness.open();

    final Throwable[] error = new Throwable[1];
    final OneShotLatch latch = new OneShotLatch();

    final DummySourceContext sourceContext = new DummySourceContext() {
        @Override
        public void collect(TimestampedFileInputSplit element) {
            latch.trigger();
        }
    };

    // run the source asynchronously
    Thread runner = new Thread() {
        @Override
        public void run() {
            try {
                monitoringFunction.run(sourceContext);
            } catch (Throwable t) {
                t.printStackTrace();
                error[0] = t;
            }
        }
    };
    runner.start();

    // first condition for the source to have updated its state: emit at least one element
    if (!latch.isTriggered()) {
        latch.await();
    }
    // second condition for the source to have updated its state: it no longer holds the
    // checkpoint lock, which means it has processed all the splits and updated its state.
    synchronized (sourceContext.getCheckpointLock()) {}
    OperatorSubtaskState snapshot = testHarness.snapshot(0, 0);

    monitoringFunction.cancel();
    runner.join();

    testHarness.close();

    final ContinuousFileMonitoringFunction<String> monitoringFunctionCopy =
        createTestContinuousFileMonitoringFunction(format, FileProcessingMode.PROCESS_CONTINUOUSLY);

    StreamSource<TimestampedFileInputSplit, ContinuousFileMonitoringFunction<String>> srcCopy =
        new StreamSource<>(monitoringFunctionCopy);

    AbstractStreamOperatorTestHarness<TimestampedFileInputSplit> testHarnessCopy =
        new AbstractStreamOperatorTestHarness<>(srcCopy, 1, 1, 0);
    testHarnessCopy.initializeState(snapshot);
    testHarnessCopy.open();

    Assert.assertNull(error[0]);
    Assert.assertEquals(fileModTime, monitoringFunctionCopy.getGlobalModificationTime());

    hdfs.delete(path, false);
}
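The helper createTestContinuousFileMonitoringFunction is defined elsewhere in the test class and is not shown here. A minimal sketch of what it presumably wraps, based on the ContinuousFileMonitoringFunction constructor that ContinuousFileProcessingMigrationTest calls directly further down; the reader parallelism and monitoring interval are illustrative defaults, not taken from the real helper:

private <OUT> ContinuousFileMonitoringFunction<OUT> createTestContinuousFileMonitoringFunction(
        FileInputFormat<OUT> format, FileProcessingMode watchType) {
    // one reader task, polling every 100 ms; the actual test helper may use different values
    return new ContinuousFileMonitoringFunction<>(format, watchType, 1, 100L);
}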
Use of org.apache.flink.api.java.io.TextInputFormat in project flink by apache.
From the class ContinuousFileProcessingTest, method testProcessOnce.
@Test
public void testProcessOnce() throws Exception {
    String testBasePath = hdfsURI + "/" + UUID.randomUUID() + "/";

    final OneShotLatch latch = new OneShotLatch();

    // create a single file in the directory
    Tuple2<org.apache.hadoop.fs.Path, String> bootstrap =
        createFileAndFillWithData(testBasePath, "file", NO_OF_FILES + 1, "This is test line.");
    Assert.assertTrue(hdfs.exists(bootstrap.f0));

    // the source is supposed to read only this file.
    final Set<String> filesToBeRead = new TreeSet<>();
    filesToBeRead.add(bootstrap.f0.getName());

    TextInputFormat format = new TextInputFormat(new Path(testBasePath));
    format.setFilesFilter(FilePathFilter.createDefaultFilter());

    final ContinuousFileMonitoringFunction<String> monitoringFunction =
        createTestContinuousFileMonitoringFunction(format, FileProcessingMode.PROCESS_ONCE);

    final FileVerifyingSourceContext context = new FileVerifyingSourceContext(latch, monitoringFunction);

    final Thread t = new Thread() {
        @Override
        public void run() {
            try {
                monitoringFunction.open(new Configuration());
                monitoringFunction.run(context);

                // we would never arrive here if we were in PROCESS_CONTINUOUSLY mode.
                // this will trigger the latch
                context.close();
            } catch (Exception e) {
                Assert.fail(e.getMessage());
            }
        }
    };
    t.start();

    if (!latch.isTriggered()) {
        latch.await();
    }

    // create some additional files that should be processed in the case of PROCESS_CONTINUOUSLY
    final org.apache.hadoop.fs.Path[] filesCreated = new org.apache.hadoop.fs.Path[NO_OF_FILES];
    for (int i = 0; i < NO_OF_FILES; i++) {
        Tuple2<org.apache.hadoop.fs.Path, String> ignoredFile =
            createFileAndFillWithData(testBasePath, "file", i, "This is test line.");
        filesCreated[i] = ignoredFile.f0;
    }

    // wait until the monitoring thread exits
    t.join();

    Assert.assertArrayEquals(filesToBeRead.toArray(), context.getSeenFiles().toArray());

    // finally delete the files created for the test.
    hdfs.delete(bootstrap.f0, false);
    for (org.apache.hadoop.fs.Path path : filesCreated) {
        hdfs.delete(path, false);
    }
}
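For comparison, a minimal application-side sketch (not part of the test) of the same PROCESS_ONCE setup, using the public StreamExecutionEnvironment.readFile API instead of driving the monitoring function by hand; the input path is a placeholder:

StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

TextInputFormat format = new TextInputFormat(new Path("hdfs:///tmp/input"));   // placeholder path
format.setFilesFilter(FilePathFilter.createDefaultFilter());

// PROCESS_ONCE scans the directory exactly once and then stops monitoring, which is what the
// test above asserts: files created after the scan are never read. The interval argument is
// effectively ignored in this mode.
DataStream<String> lines =
    env.readFile(format, "hdfs:///tmp/input", FileProcessingMode.PROCESS_ONCE, -1);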
Use of org.apache.flink.api.java.io.TextInputFormat in project flink by apache.
From the class ContinuousFileProcessingTest, method testNestedFilesProcessing.
@Test
public void testNestedFilesProcessing() throws Exception {
    String testBasePath = hdfsURI + "/" + UUID.randomUUID() + "/";

    final Set<org.apache.hadoop.fs.Path> filesCreated = new HashSet<>();
    final Set<String> filesToBeRead = new TreeSet<>();

    // create two nested directories
    org.apache.hadoop.fs.Path firstLevelDir = new org.apache.hadoop.fs.Path(testBasePath + "/" + "firstLevelDir");
    org.apache.hadoop.fs.Path secondLevelDir = new org.apache.hadoop.fs.Path(testBasePath + "/" + "firstLevelDir" + "/" + "secondLevelDir");
    Assert.assertFalse(hdfs.exists(firstLevelDir));
    hdfs.mkdirs(firstLevelDir);
    hdfs.mkdirs(secondLevelDir);

    // create files in the base dir, the first level dir and the second level dir
    for (int i = 0; i < NO_OF_FILES; i++) {
        Tuple2<org.apache.hadoop.fs.Path, String> file =
            createFileAndFillWithData(testBasePath, "firstLevelFile", i, "This is test line.");
        filesCreated.add(file.f0);
        filesToBeRead.add(file.f0.getName());
    }
    for (int i = 0; i < NO_OF_FILES; i++) {
        Tuple2<org.apache.hadoop.fs.Path, String> file =
            createFileAndFillWithData(firstLevelDir.toString(), "secondLevelFile", i, "This is test line.");
        filesCreated.add(file.f0);
        filesToBeRead.add(file.f0.getName());
    }
    for (int i = 0; i < NO_OF_FILES; i++) {
        Tuple2<org.apache.hadoop.fs.Path, String> file =
            createFileAndFillWithData(secondLevelDir.toString(), "thirdLevelFile", i, "This is test line.");
        filesCreated.add(file.f0);
        filesToBeRead.add(file.f0.getName());
    }

    TextInputFormat format = new TextInputFormat(new Path(testBasePath));
    format.setFilesFilter(FilePathFilter.createDefaultFilter());
    format.setNestedFileEnumeration(true);

    ContinuousFileMonitoringFunction<String> monitoringFunction =
        createTestContinuousFileMonitoringFunction(format, FileProcessingMode.PROCESS_ONCE);

    final FileVerifyingSourceContext context = new FileVerifyingSourceContext(new OneShotLatch(), monitoringFunction);

    monitoringFunction.open(new Configuration());
    monitoringFunction.run(context);

    Assert.assertArrayEquals(filesToBeRead.toArray(), context.getSeenFiles().toArray());

    // finally delete the dirs and the files created for the test.
    for (org.apache.hadoop.fs.Path file : filesCreated) {
        hdfs.delete(file, false);
    }
    hdfs.delete(secondLevelDir, false);
    hdfs.delete(firstLevelDir, false);
}
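The key detail in this test is that recursive directory traversal is opt-in via setNestedFileEnumeration(true). A short hedged sketch of the same configuration in application code; the path and scan interval are placeholders and env is a StreamExecutionEnvironment as in the earlier sketch:

StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

TextInputFormat format = new TextInputFormat(new Path("hdfs:///data/logs"));   // placeholder path
format.setFilesFilter(FilePathFilter.createDefaultFilter());  // skip working files (e.g. "." or "_" prefixed names)
format.setNestedFileEnumeration(true);                        // without this, files in sub-directories are skipped

DataStream<String> lines = env.readFile(
    format, "hdfs:///data/logs", FileProcessingMode.PROCESS_CONTINUOUSLY, 1000L);  // re-scan every second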
Use of org.apache.flink.api.java.io.TextInputFormat in project flink by apache.
From the class ContinuousFileProcessingMigrationTest, method writeMonitoringSourceSnapshot.
/**
 * Manually run this to write binary snapshot data. Remove @Ignore to run.
 */
@Ignore
@Test
public void writeMonitoringSourceSnapshot() throws Exception {
    File testFolder = tempFolder.newFolder();

    long fileModTime = Long.MIN_VALUE;
    for (int i = 0; i < 1; i++) {
        Tuple2<File, String> file = createFileAndFillWithData(testFolder, "file", i, "This is test line.");
        fileModTime = file.f0.lastModified();
    }

    TextInputFormat format = new TextInputFormat(new Path(testFolder.getAbsolutePath()));

    final ContinuousFileMonitoringFunction<String> monitoringFunction =
        new ContinuousFileMonitoringFunction<>(format, FileProcessingMode.PROCESS_CONTINUOUSLY, 1, INTERVAL);

    StreamSource<TimestampedFileInputSplit, ContinuousFileMonitoringFunction<String>> src =
        new StreamSource<>(monitoringFunction);

    final AbstractStreamOperatorTestHarness<TimestampedFileInputSplit> testHarness =
        new AbstractStreamOperatorTestHarness<>(src, 1, 1, 0);
    testHarness.open();

    final Throwable[] error = new Throwable[1];
    final OneShotLatch latch = new OneShotLatch();

    // run the source asynchronously
    Thread runner = new Thread() {
        @Override
        public void run() {
            try {
                monitoringFunction.run(new DummySourceContext() {
                    @Override
                    public void collect(TimestampedFileInputSplit element) {
                        latch.trigger();
                    }

                    @Override
                    public void markAsTemporarilyIdle() {
                    }
                });
            } catch (Throwable t) {
                t.printStackTrace();
                error[0] = t;
            }
        }
    };
    runner.start();

    if (!latch.isTriggered()) {
        latch.await();
    }

    final OperatorSubtaskState snapshot;
    synchronized (testHarness.getCheckpointLock()) {
        snapshot = testHarness.snapshot(0L, 0L);
    }

    OperatorSnapshotUtil.writeStateHandle(
        snapshot,
        "src/test/resources/monitoring-function-migration-test-"
            + fileModTime + "-flink" + flinkGenerateSavepointVersion + "-snapshot");

    monitoringFunction.cancel();
    runner.join();

    testHarness.close();
}
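INTERVAL and flinkGenerateSavepointVersion are fields of the test class that do not appear in this snippet. Purely illustrative declarations, assuming typical values rather than the ones in the Flink sources:

// Illustrative only: the real test class defines these constants with its own values.
private static final long INTERVAL = 100L;                           // monitoring interval passed to the function, in ms
private static final String flinkGenerateSavepointVersion = "1.9";   // Flink version the snapshot file is written for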
Use of org.apache.flink.api.java.io.TextInputFormat in project flink by apache.
From the class ContinuousFileProcessingMigrationTest, method testMonitoringSourceRestore.
@Test
public void testMonitoringSourceRestore() throws Exception {
    File testFolder = tempFolder.newFolder();

    TextInputFormat format = new TextInputFormat(new Path(testFolder.getAbsolutePath()));

    final ContinuousFileMonitoringFunction<String> monitoringFunction =
        new ContinuousFileMonitoringFunction<>(format, FileProcessingMode.PROCESS_CONTINUOUSLY, 1, INTERVAL);

    StreamSource<TimestampedFileInputSplit, ContinuousFileMonitoringFunction<String>> src =
        new StreamSource<>(monitoringFunction);

    final AbstractStreamOperatorTestHarness<TimestampedFileInputSplit> testHarness =
        new AbstractStreamOperatorTestHarness<>(src, 1, 1, 0);
    testHarness.setup();
    testHarness.initializeState(
        OperatorSnapshotUtil.getResourceFilename(
            "monitoring-function-migration-test-" + expectedModTime + "-flink" + testMigrateVersion + "-snapshot"));
    testHarness.open();

    Assert.assertEquals((long) expectedModTime, monitoringFunction.getGlobalModificationTime());
}
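expectedModTime and testMigrateVersion are the migration parameters of the test class; in Flink's migration tests such values typically come from a parameterized JUnit 4 runner. A hedged sketch of how the parameters could be declared, with hypothetical timestamps that are not taken from the Flink sources:

// Hypothetical parameterization; the version constants and timestamps are illustrative only.
@Parameterized.Parameters(name = "Migration Savepoint / Mod Time: {0}")
public static Collection<Tuple2<MigrationVersion, Long>> parameters() {
    return Arrays.asList(
        Tuple2.of(MigrationVersion.v1_8, 1555215710L),
        Tuple2.of(MigrationVersion.v1_9, 1555215808L));
}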