Examples with TextInputFormat - org.apache.flink.api.java.io.TextInputFormat

Example 1 with TextInputFormat

use of org.apache.flink.api.java.io.TextInputFormat in project flink by apache.

the class ContinuousFileProcessingMigrationTest method testFunctionRestore.

////				Monitoring Function Tests				//////
/**
 * Restores a {@link ContinuousFileMonitoringFunction} from the checked-in Flink 1.1 legacy
 * snapshot and checks that the restored global modification time equals the one that is part of
 * the snapshot's file name.
 */
@Test
public void testFunctionRestore() throws Exception {
    /*
     * Kept for reference: this disabled code writes a file named
     * "monitoring-function-migration-test-<fileModTime>-flink1.1-snapshot", i.e. it appears to be
     * the (Flink 1.1 API) code the snapshot restored below was generated with.
     *
		org.apache.hadoop.fs.Path path = null;
		long fileModTime = Long.MIN_VALUE;
		for (int i = 0; i < 1; i++) {
			Tuple2<org.apache.hadoop.fs.Path, String> file = fillWithData(hdfsURI, "file", i, "This is test line.");
			path = file.f0;
			fileModTime = hdfs.getFileStatus(file.f0).getModificationTime();
		}

		TextInputFormat format = new TextInputFormat(new Path(hdfsURI));

		final ContinuousFileMonitoringFunction<String> monitoringFunction =
			new ContinuousFileMonitoringFunction<>(format, format.getFilePath().toString(), new PathFilter(), FileProcessingMode.PROCESS_CONTINUOUSLY, 1, INTERVAL);

		StreamSource<FileInputSplit, ContinuousFileMonitoringFunction<String>> src =
			new StreamSource<>(monitoringFunction);

		final OneInputStreamOperatorTestHarness<Void, FileInputSplit> testHarness =
			new OneInputStreamOperatorTestHarness<>(src);
		testHarness.open();

		final Throwable[] error = new Throwable[1];

		final OneShotLatch latch = new OneShotLatch();

		// run the source asynchronously
		Thread runner = new Thread() {
			@Override
			public void run() {
				try {
					monitoringFunction.run(new DummySourceContext() {
						@Override
						public void collect(FileInputSplit element) {
							latch.trigger();
						}
					});
				}
				catch (Throwable t) {
					t.printStackTrace();
					error[0] = t;
				}
			}
		};
		runner.start();

		if (!latch.isTriggered()) {
			latch.await();
		}

		StreamTaskState snapshot = testHarness.snapshot(0, 0);
		testHarness.snaphotToFile(snapshot, "src/test/resources/monitoring-function-migration-test-" + fileModTime +"-flink1.1-snapshot");
		monitoringFunction.cancel();
		runner.join();

		testHarness.close();
		*/
    // Single source of truth for the modification time: it is both the expected restored value
    // and the variable part of the snapshot's resource name.
    final long expectedModTime = 1482144479339L;
    final String snapshotResource = "monitoring-function-migration-test-" + expectedModTime + "-flink1.1-snapshot";

    TextInputFormat format = new TextInputFormat(new Path(hdfsURI));
    final ContinuousFileMonitoringFunction<String> monitoringFunction = new ContinuousFileMonitoringFunction<>(format, FileProcessingMode.PROCESS_CONTINUOUSLY, 1, INTERVAL);
    StreamSource<TimestampedFileInputSplit, ContinuousFileMonitoringFunction<String>> src = new StreamSource<>(monitoringFunction);
    final AbstractStreamOperatorTestHarness<TimestampedFileInputSplit> testHarness = new AbstractStreamOperatorTestHarness<>(src, 1, 1, 0);

    // The legacy state has to be handed to the harness after setup() and before open().
    testHarness.setup();
    testHarness.initializeStateFromLegacyCheckpoint(getResourceFilename(snapshotResource));
    testHarness.open();

    Assert.assertEquals(expectedModTime, monitoringFunction.getGlobalModificationTime());
}

Also used : Path(org.apache.flink.core.fs.Path) TextInputFormat(org.apache.flink.api.java.io.TextInputFormat) TimestampedFileInputSplit(org.apache.flink.streaming.api.functions.source.TimestampedFileInputSplit) StreamSource(org.apache.flink.streaming.api.operators.StreamSource) ContinuousFileMonitoringFunction(org.apache.flink.streaming.api.functions.source.ContinuousFileMonitoringFunction) AbstractStreamOperatorTestHarness(org.apache.flink.streaming.util.AbstractStreamOperatorTestHarness) Test(org.junit.Test)

Example 2 with TextInputFormat

use of org.apache.flink.api.java.io.TextInputFormat in project flink by apache.

the class ContinuousFileProcessingITCase method testProgram.

// END OF PREPARATIONS
/**
 * End-to-end check of the file monitoring source together with the split reader: the job is
 * started in a background thread, the input files are created afterwards, and the test waits
 * until the job either finishes or fails.
 */
@Test
public void testProgram() throws Exception {
    /*
     * Exercises the interplay between the monitor and the reader as well as the
     * failExternally() functionality. For the latter the sink runs with parallelism 1 so that
     * it is chained with the reader; the sink throws the SuccessException to signal the end of
     * the test.
     */
    final TextInputFormat inputFormat = new TextInputFormat(new Path(hdfsURI));
    inputFormat.setFilePath(hdfsURI);
    inputFormat.setFilesFilter(FilePathFilter.createDefaultFilter());

    // execution environment running with the parallelism configured for this test
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(PARALLELISM);

    final ContinuousFileMonitoringFunction<String> monitor = new ContinuousFileMonitoringFunction<>(inputFormat, FileProcessingMode.PROCESS_CONTINUOUSLY, env.getParallelism(), INTERVAL);

    // the monitoring source must run with parallelism 1
    final DataStream<TimestampedFileInputSplit> splitStream = env.addSource(monitor);
    Assert.assertEquals(1, splitStream.getParallelism());

    // the readers, in contrast, run with the configured parallelism
    final TypeInformation<String> lineType = TypeExtractor.getInputFormatTypes(inputFormat);
    final DataStream<String> lines = splitStream.transform("FileSplitReader", lineType, new ContinuousFileReaderOperatorFactory<>(inputFormat));
    Assert.assertEquals(PARALLELISM, lines.getParallelism());

    // a single sink instance, so that the output can be verified in one place
    final TestingSinkFunction verifyingSink = new TestingSinkFunction();
    lines.addSink(verifyingSink).setParallelism(1);

    // Run the job in the background; a failure caused by a SuccessException counts as success.
    final CompletableFuture<Void> jobResult = new CompletableFuture<>();
    final Runnable jobRunner = () -> {
        try {
            env.execute("ContinuousFileProcessingITCase Job.");
            jobResult.complete(null);
        } catch (Exception failure) {
            final boolean endedWithSuccessSignal = ExceptionUtils.findThrowable(failure, SuccessException.class).isPresent();
            if (!endedWithSuccessSignal) {
                jobResult.completeExceptionally(failure);
            } else {
                jobResult.complete(null);
            }
        }
    };
    new Thread(jobRunner).start();

    // modification time of the most recently accepted file
    long previousModTime = Long.MIN_VALUE;

    // create the files the job is supposed to read
    for (int fileIdx = 0; fileIdx < NO_OF_FILES; fileIdx++) {
        Tuple2<org.apache.hadoop.fs.Path, String> created;
        long createdModTime;
        while (true) {
            // wait a little so that consecutive files get different modification timestamps
            Thread.sleep(50);
            created = fillWithData(hdfsURI, "file", fileIdx, "This is test line.");
            createdModTime = hdfs.getFileStatus(created.f0).getModificationTime();
            if (createdModTime > previousModTime) {
                break;
            }
            // timestamp did not advance: drop the file and create it again
            hdfs.delete(created.f0, false);
        }
        previousModTime = createdModTime;

        // Register the expected contents before the file is renamed to its final name, so
        // that they are in place before the reader finishes (avoids a race condition).
        expectedContents.put(fileIdx, created.f1);

        final org.apache.hadoop.fs.Path finalLocation = new org.apache.hadoop.fs.Path(hdfsURI + "/file" + fileIdx);
        hdfs.rename(created.f0, finalLocation);
        Assert.assertTrue(hdfs.exists(finalLocation));
    }

    // blocks until the job thread reports its outcome; rethrows a genuine job failure
    jobResult.get();
}

Also used : TimestampedFileInputSplit(org.apache.flink.streaming.api.functions.source.TimestampedFileInputSplit) CompletableFuture(java.util.concurrent.CompletableFuture) Path(org.apache.flink.core.fs.Path) ContinuousFileMonitoringFunction(org.apache.flink.streaming.api.functions.source.ContinuousFileMonitoringFunction) IOException(java.io.IOException) TextInputFormat(org.apache.flink.api.java.io.TextInputFormat) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) Test(org.junit.Test)

Example 3 with TextInputFormat

use of org.apache.flink.api.java.io.TextInputFormat in project flink by apache.

the class ContinuousFileProcessingTest method testFileReadingOperatorWithEventTime.

/**
 * Feeds one timestamped split per created file into the reader harness under event time and
 * verifies that every line of every file is emitted, followed by a final
 * {@code Long.MAX_VALUE} watermark.
 */
@Test
public void testFileReadingOperatorWithEventTime() throws Exception {
    String testBasePath = hdfsURI + "/" + UUID.randomUUID() + "/";
    Set<org.apache.hadoop.fs.Path> filesCreated = new HashSet<>();
    Map<String, Long> modTimes = new HashMap<>();
    Map<Integer, String> expectedFileContents = new HashMap<>();
    for (int i = 0; i < NO_OF_FILES; i++) {
        Tuple2<org.apache.hadoop.fs.Path, String> file = createFileAndFillWithData(testBasePath, "file", i, "This is test line.");
        modTimes.put(file.f0.getName(), hdfs.getFileStatus(file.f0).getModificationTime());
        filesCreated.add(file.f0);
        expectedFileContents.put(i, file.f1);
    }
    TextInputFormat format = new TextInputFormat(new Path(testBasePath));
    OneInputStreamOperatorTestHarness<TimestampedFileInputSplit, String> tester = createHarness(format);
    tester.setTimeCharacteristic(TimeCharacteristic.EventTime);
    tester.open();
    // create the necessary splits for the test
    FileInputSplit[] splits = format.createInputSplits(tester.getExecutionConfig().getParallelism());
    // and feed them to the operator, stamped with the modification time of their file
    for (FileInputSplit split : splits) {
        tester.processElement(new StreamRecord<>(new TimestampedFileInputSplit(modTimes.get(split.getPath().getName()), split.getSplitNumber(), split.getPath(), split.getStart(), split.getLength(), split.getHostnames())));
    }
    // then close the reader gracefully (and wait to finish reading)
    synchronized (tester.getCheckpointLock()) {
        tester.close();
    }
    // The lines received must be the elements in the files +1 for the longMax watermark:
    // we are in event time, which emits no watermarks, so the last watermark will mark the
    // end of the input stream.
    Assert.assertEquals(NO_OF_FILES * LINES_PER_FILE + 1, tester.getOutput().size());
    Map<Integer, List<String>> actualFileContents = new HashMap<>();
    Object lastElement = null;
    for (Object line : tester.getOutput()) {
        lastElement = line;
        if (line instanceof StreamRecord) {
            @SuppressWarnings("unchecked") StreamRecord<String> element = (StreamRecord<String>) line;
            // the first character of a line is used as the index of the file it belongs to
            int fileIdx = Character.getNumericValue(element.getValue().charAt(0));
            actualFileContents.computeIfAbsent(fileIdx, idx -> new ArrayList<>()).add(element.getValue() + "\n");
        }
    }
    // check if the last element is the LongMax watermark
    Assert.assertTrue(lastElement instanceof Watermark);
    Assert.assertEquals(Long.MAX_VALUE, ((Watermark) lastElement).getTimestamp());
    Assert.assertEquals(expectedFileContents.size(), actualFileContents.size());
    for (Integer fileIdx : expectedFileContents.keySet()) {
        Assert.assertTrue("file" + fileIdx + " not found", actualFileContents.containsKey(fileIdx));
        List<String> cntnt = actualFileContents.get(fileIdx);
        // Order the lines by line number before comparing. Integer.compare avoids the
        // overflow that subtracting the two line numbers could produce.
        Collections.sort(cntnt, (o1, o2) -> Integer.compare(getLineNo(o1), getLineNo(o2)));
        StringBuilder cntntStr = new StringBuilder();
        for (String line : cntnt) {
            cntntStr.append(line);
        }
        Assert.assertEquals(expectedFileContents.get(fileIdx), cntntStr.toString());
    }
    // finally delete the files created for the test
    for (org.apache.hadoop.fs.Path file : filesCreated) {
        hdfs.delete(file, false);
    }
}

Also used : TimestampedFileInputSplit(org.apache.flink.streaming.api.functions.source.TimestampedFileInputSplit) HashMap(java.util.HashMap) List(java.util.List) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Path(org.apache.flink.core.fs.Path) StreamRecord(org.apache.flink.streaming.runtime.streamrecord.StreamRecord) TimestampedFileInputSplit(org.apache.flink.streaming.api.functions.source.TimestampedFileInputSplit) FileInputSplit(org.apache.flink.core.fs.FileInputSplit) TextInputFormat(org.apache.flink.api.java.io.TextInputFormat) Watermark(org.apache.flink.streaming.api.watermark.Watermark) Test(org.junit.Test)

Example 4 with TextInputFormat

use of org.apache.flink.api.java.io.TextInputFormat in project flink by apache.

the class ContinuousFileProcessingTest method testSortingOnModTime.

/**
 * Creates files with spaced-out modification times, runs the monitoring function once against
 * a context that is given those modification times, and checks that as many splits were
 * forwarded as the input format creates.
 */
@Test
public void testSortingOnModTime() throws Exception {
    final String baseDir = hdfsURI + "/" + UUID.randomUUID() + "/";
    final org.apache.hadoop.fs.Path[] createdFiles = new org.apache.hadoop.fs.Path[NO_OF_FILES];
    final long[] modTimes = new long[NO_OF_FILES];

    for (int idx = 0; idx < NO_OF_FILES; idx++) {
        final org.apache.hadoop.fs.Path created = createFileAndFillWithData(baseDir, "file", idx, "This is test line.").f0;
        // pause between files, presumably so that their modification times differ
        Thread.sleep(400);
        createdFiles[idx] = created;
        modTimes[idx] = hdfs.getFileStatus(created).getModificationTime();
    }

    final TextInputFormat inputFormat = new TextInputFormat(new Path(baseDir));
    inputFormat.setFilesFilter(FilePathFilter.createDefaultFilter());

    // reference split count, used below to verify that every split has been forwarded
    final int expectedSplitCount = inputFormat.createInputSplits(1).length;

    final ContinuousFileMonitoringFunction<String> monitor = createTestContinuousFileMonitoringFunction(inputFormat, FileProcessingMode.PROCESS_ONCE);
    final ModTimeVerifyingSourceContext verifyingContext = new ModTimeVerifyingSourceContext(modTimes);
    monitor.open(new Configuration());
    monitor.run(verifyingContext);

    Assert.assertEquals(expectedSplitCount, verifyingContext.getCounter());

    // remove the files created for this test
    for (org.apache.hadoop.fs.Path created : createdFiles) {
        hdfs.delete(created, false);
    }
}

Also used : Path(org.apache.flink.core.fs.Path) Configuration(org.apache.flink.configuration.Configuration) TimestampedFileInputSplit(org.apache.flink.streaming.api.functions.source.TimestampedFileInputSplit) FileInputSplit(org.apache.flink.core.fs.FileInputSplit) TextInputFormat(org.apache.flink.api.java.io.TextInputFormat) Test(org.junit.Test)

Example 5 with TextInputFormat

use of org.apache.flink.api.java.io.TextInputFormat in project flink by apache.

the class ContinuousFileProcessingTest method testFilePathFiltering.

// //				Monitoring Function Tests				//////
/**
 * Creates one group of files whose names start with "**" and one group without that prefix,
 * installs a filter matching the "**" prefix, and checks that the monitoring function only
 * reports the files of the second group.
 */
@Test
public void testFilePathFiltering() throws Exception {
    final String baseDir = hdfsURI + "/" + UUID.randomUUID() + "/";
    final Set<org.apache.hadoop.fs.Path> allCreated = new HashSet<>();
    final Set<String> expectedNames = new TreeSet<>();

    // first the files that are meant to be filtered out
    for (int idx = 0; idx < NO_OF_FILES; idx++) {
        allCreated.add(createFileAndFillWithData(baseDir, "**file", idx, "This is test line.").f0);
    }

    // then the files that are meant to be picked up
    for (int idx = 0; idx < NO_OF_FILES; idx++) {
        final org.apache.hadoop.fs.Path kept = createFileAndFillWithData(baseDir, "file", idx, "This is test line.").f0;
        allCreated.add(kept);
        expectedNames.add(kept.getName());
    }

    // filter that matches every path whose file name starts with "**"
    final FilePathFilter starPrefixFilter = new FilePathFilter() {

        private static final long serialVersionUID = 2611449927338589804L;

        @Override
        public boolean filterPath(Path filePath) {
            final String fileName = filePath.getName();
            return fileName.startsWith("**");
        }
    };

    final TextInputFormat inputFormat = new TextInputFormat(new Path(baseDir));
    inputFormat.setFilesFilter(starPrefixFilter);

    final ContinuousFileMonitoringFunction<String> monitor = createTestContinuousFileMonitoringFunction(inputFormat, FileProcessingMode.PROCESS_ONCE);
    final FileVerifyingSourceContext verifyingContext = new FileVerifyingSourceContext(new OneShotLatch(), monitor);
    monitor.open(new Configuration());
    monitor.run(verifyingContext);

    Assert.assertArrayEquals(expectedNames.toArray(), verifyingContext.getSeenFiles().toArray());

    // clean up everything this test created
    for (org.apache.hadoop.fs.Path created : allCreated) {
        hdfs.delete(created, false);
    }
}

Also used : Path(org.apache.flink.core.fs.Path) Configuration(org.apache.flink.configuration.Configuration) TextInputFormat(org.apache.flink.api.java.io.TextInputFormat) TreeSet(java.util.TreeSet) FilePathFilter(org.apache.flink.api.common.io.FilePathFilter) OneShotLatch(org.apache.flink.core.testutils.OneShotLatch) HashSet(java.util.HashSet) Test(org.junit.Test)

Aggregations

TextInputFormat (org.apache.flink.api.java.io.TextInputFormat): 16; Path (org.apache.flink.core.fs.Path): 15; Test (org.junit.Test): 13; TimestampedFileInputSplit (org.apache.flink.streaming.api.functions.source.TimestampedFileInputSplit): 9; OneShotLatch (org.apache.flink.core.testutils.OneShotLatch): 6; ContinuousFileMonitoringFunction (org.apache.flink.streaming.api.functions.source.ContinuousFileMonitoringFunction): 6; Configuration (org.apache.flink.configuration.Configuration): 5; IOException (java.io.IOException): 4; HashSet (java.util.HashSet): 4; TreeSet (java.util.TreeSet): 4; StreamSource (org.apache.flink.streaming.api.operators.StreamSource): 4; AbstractStreamOperatorTestHarness (org.apache.flink.streaming.util.AbstractStreamOperatorTestHarness): 4; FileNotFoundException (java.io.FileNotFoundException): 3; FileInputSplit (org.apache.flink.core.fs.FileInputSplit): 3; RunnableWithException (org.apache.flink.util.function.RunnableWithException): 3; File (java.io.File): 2; ArrayList (java.util.ArrayList): 2; HashMap (java.util.HashMap): 2; List (java.util.List): 2; OperatorSubtaskState (org.apache.flink.runtime.checkpoint.OperatorSubtaskState): 2