
Example 96 with Path

use of org.apache.flink.core.fs.Path in project flink by apache.

the class ContinuousFileProcessingTest method testProcessOnce.

@Test
public void testProcessOnce() throws Exception {
    String testBasePath = hdfsURI + "/" + UUID.randomUUID() + "/";
    final OneShotLatch latch = new OneShotLatch();
    // create a single file in the directory
    Tuple2<org.apache.hadoop.fs.Path, String> bootstrap = createFileAndFillWithData(testBasePath, "file", NO_OF_FILES + 1, "This is test line.");
    Assert.assertTrue(hdfs.exists(bootstrap.f0));
    // the source is supposed to read only this file.
    final Set<String> filesToBeRead = new TreeSet<>();
    filesToBeRead.add(bootstrap.f0.getName());
    TextInputFormat format = new TextInputFormat(new Path(testBasePath));
    format.setFilesFilter(FilePathFilter.createDefaultFilter());
    final ContinuousFileMonitoringFunction<String> monitoringFunction = new ContinuousFileMonitoringFunction<>(format, FileProcessingMode.PROCESS_ONCE, 1, INTERVAL);
    final FileVerifyingSourceContext context = new FileVerifyingSourceContext(latch, monitoringFunction);
    final Thread t = new Thread() {

        @Override
        public void run() {
            try {
                monitoringFunction.open(new Configuration());
                monitoringFunction.run(context);
                // we would never arrive here if we were in
                // PROCESS_CONTINUOUSLY mode.
                // this will trigger the latch
                context.close();
            } catch (Exception e) {
                Assert.fail(e.getMessage());
            }
        }
    };
    t.start();
    if (!latch.isTriggered()) {
        latch.await();
    }
    // create some additional files; these would only be picked up in PROCESS_CONTINUOUSLY mode
    final org.apache.hadoop.fs.Path[] filesCreated = new org.apache.hadoop.fs.Path[NO_OF_FILES];
    for (int i = 0; i < NO_OF_FILES; i++) {
        Tuple2<org.apache.hadoop.fs.Path, String> ignoredFile = createFileAndFillWithData(testBasePath, "file", i, "This is test line.");
        filesCreated[i] = ignoredFile.f0;
    }
    // wait until the monitoring thread exits
    t.join();
    Assert.assertArrayEquals(filesToBeRead.toArray(), context.getSeenFiles().toArray());
    // finally delete the files created for the test.
    hdfs.delete(bootstrap.f0, false);
    for (org.apache.hadoop.fs.Path path : filesCreated) {
        hdfs.delete(path, false);
    }
}
Also used : Path(org.apache.flink.core.fs.Path) Configuration(org.apache.flink.configuration.Configuration) ContinuousFileMonitoringFunction(org.apache.flink.streaming.api.functions.source.ContinuousFileMonitoringFunction) IOException(java.io.IOException) FileNotFoundException(java.io.FileNotFoundException) TextInputFormat(org.apache.flink.api.java.io.TextInputFormat) TreeSet(java.util.TreeSet) OneShotLatch(org.apache.flink.core.testutils.OneShotLatch) Test(org.junit.Test)

Example 97 with Path

use of org.apache.flink.core.fs.Path in project flink by apache.

the class ContinuousFileProcessingTest method testFunctionRestore.

@Test
public void testFunctionRestore() throws Exception {
    String testBasePath = hdfsURI + "/" + UUID.randomUUID() + "/";
    org.apache.hadoop.fs.Path path = null;
    long fileModTime = Long.MIN_VALUE;
    for (int i = 0; i < 1; i++) {
        Tuple2<org.apache.hadoop.fs.Path, String> file = createFileAndFillWithData(testBasePath, "file", i, "This is test line.");
        path = file.f0;
        fileModTime = hdfs.getFileStatus(file.f0).getModificationTime();
    }
    TextInputFormat format = new TextInputFormat(new Path(testBasePath));
    final ContinuousFileMonitoringFunction<String> monitoringFunction = new ContinuousFileMonitoringFunction<>(format, FileProcessingMode.PROCESS_CONTINUOUSLY, 1, INTERVAL);
    StreamSource<TimestampedFileInputSplit, ContinuousFileMonitoringFunction<String>> src = new StreamSource<>(monitoringFunction);
    final AbstractStreamOperatorTestHarness<TimestampedFileInputSplit> testHarness = new AbstractStreamOperatorTestHarness<>(src, 1, 1, 0);
    testHarness.open();
    final Throwable[] error = new Throwable[1];
    final OneShotLatch latch = new OneShotLatch();
    final DummySourceContext sourceContext = new DummySourceContext() {

        @Override
        public void collect(TimestampedFileInputSplit element) {
            latch.trigger();
        }
    };
    // run the source asynchronously
    Thread runner = new Thread() {

        @Override
        public void run() {
            try {
                monitoringFunction.run(sourceContext);
            } catch (Throwable t) {
                t.printStackTrace();
                error[0] = t;
            }
        }
    };
    runner.start();
    // first condition for the source to have updated its state: emit at least one element
    if (!latch.isTriggered()) {
        latch.await();
    }
    // second condition for the source to have updated its state: it is no longer holding
    // the checkpoint lock, which means it has processed all the splits and updated its state.
    synchronized (sourceContext.getCheckpointLock()) {
    }
    OperatorStateHandles snapshot = testHarness.snapshot(0, 0);
    monitoringFunction.cancel();
    runner.join();
    testHarness.close();
    final ContinuousFileMonitoringFunction<String> monitoringFunctionCopy = new ContinuousFileMonitoringFunction<>(format, FileProcessingMode.PROCESS_CONTINUOUSLY, 1, INTERVAL);
    StreamSource<TimestampedFileInputSplit, ContinuousFileMonitoringFunction<String>> srcCopy = new StreamSource<>(monitoringFunctionCopy);
    AbstractStreamOperatorTestHarness<TimestampedFileInputSplit> testHarnessCopy = new AbstractStreamOperatorTestHarness<>(srcCopy, 1, 1, 0);
    testHarnessCopy.initializeState(snapshot);
    testHarnessCopy.open();
    Assert.assertNull(error[0]);
    Assert.assertEquals(fileModTime, monitoringFunctionCopy.getGlobalModificationTime());
    hdfs.delete(path, false);
}
Also used : Path(org.apache.flink.core.fs.Path) TimestampedFileInputSplit(org.apache.flink.streaming.api.functions.source.TimestampedFileInputSplit) StreamSource(org.apache.flink.streaming.api.operators.StreamSource) ContinuousFileMonitoringFunction(org.apache.flink.streaming.api.functions.source.ContinuousFileMonitoringFunction) AbstractStreamOperatorTestHarness(org.apache.flink.streaming.util.AbstractStreamOperatorTestHarness) OperatorStateHandles(org.apache.flink.streaming.runtime.tasks.OperatorStateHandles) TextInputFormat(org.apache.flink.api.java.io.TextInputFormat) OneShotLatch(org.apache.flink.core.testutils.OneShotLatch) Test(org.junit.Test)

Example 98 with Path

use of org.apache.flink.core.fs.Path in project flink by apache.

the class ContinuousFileProcessingTest method testFileReadingOperatorWithEventTime.

@Test
public void testFileReadingOperatorWithEventTime() throws Exception {
    String testBasePath = hdfsURI + "/" + UUID.randomUUID() + "/";
    Set<org.apache.hadoop.fs.Path> filesCreated = new HashSet<>();
    Map<String, Long> modTimes = new HashMap<>();
    Map<Integer, String> expectedFileContents = new HashMap<>();
    for (int i = 0; i < NO_OF_FILES; i++) {
        Tuple2<org.apache.hadoop.fs.Path, String> file = createFileAndFillWithData(testBasePath, "file", i, "This is test line.");
        modTimes.put(file.f0.getName(), hdfs.getFileStatus(file.f0).getModificationTime());
        filesCreated.add(file.f0);
        expectedFileContents.put(i, file.f1);
    }
    TextInputFormat format = new TextInputFormat(new Path(testBasePath));
    TypeInformation<String> typeInfo = TypeExtractor.getInputFormatTypes(format);
    ContinuousFileReaderOperator<String> reader = new ContinuousFileReaderOperator<>(format);
    reader.setOutputType(typeInfo, new ExecutionConfig());
    OneInputStreamOperatorTestHarness<TimestampedFileInputSplit, String> tester = new OneInputStreamOperatorTestHarness<>(reader);
    tester.setTimeCharacteristic(TimeCharacteristic.EventTime);
    tester.open();
    // create the necessary splits for the test
    FileInputSplit[] splits = format.createInputSplits(reader.getRuntimeContext().getNumberOfParallelSubtasks());
    // and feed them to the operator
    for (FileInputSplit split : splits) {
        tester.processElement(new StreamRecord<>(new TimestampedFileInputSplit(modTimes.get(split.getPath().getName()), split.getSplitNumber(), split.getPath(), split.getStart(), split.getLength(), split.getHostnames())));
    }
    // then close the reader gracefully (and wait to finish reading)
    synchronized (tester.getCheckpointLock()) {
        tester.close();
    }
    // the lines received must be the elements in the files +1 for the Long.MAX_VALUE watermark.
    // we are in event time, which emits no intermediate watermarks here, so the last
    // watermark marks the end of the input stream.
    Assert.assertEquals(NO_OF_FILES * LINES_PER_FILE + 1, tester.getOutput().size());
    Map<Integer, List<String>> actualFileContents = new HashMap<>();
    Object lastElement = null;
    for (Object line : tester.getOutput()) {
        lastElement = line;
        if (line instanceof StreamRecord) {
            @SuppressWarnings("unchecked") StreamRecord<String> element = (StreamRecord<String>) line;
            int fileIdx = Character.getNumericValue(element.getValue().charAt(0));
            List<String> content = actualFileContents.get(fileIdx);
            if (content == null) {
                content = new ArrayList<>();
                actualFileContents.put(fileIdx, content);
            }
            content.add(element.getValue() + "\n");
        }
    }
    // check if the last element is the LongMax watermark
    Assert.assertTrue(lastElement instanceof Watermark);
    Assert.assertEquals(Long.MAX_VALUE, ((Watermark) lastElement).getTimestamp());
    Assert.assertEquals(expectedFileContents.size(), actualFileContents.size());
    for (Integer fileIdx : expectedFileContents.keySet()) {
        Assert.assertTrue("file" + fileIdx + " not found", actualFileContents.keySet().contains(fileIdx));
        List<String> cntnt = actualFileContents.get(fileIdx);
        Collections.sort(cntnt, new Comparator<String>() {

            @Override
            public int compare(String o1, String o2) {
                return getLineNo(o1) - getLineNo(o2);
            }
        });
        StringBuilder cntntStr = new StringBuilder();
        for (String line : cntnt) {
            cntntStr.append(line);
        }
        Assert.assertEquals(expectedFileContents.get(fileIdx), cntntStr.toString());
    }
    for (org.apache.hadoop.fs.Path file : filesCreated) {
        hdfs.delete(file, false);
    }
}
Also used : TimestampedFileInputSplit(org.apache.flink.streaming.api.functions.source.TimestampedFileInputSplit) HashMap(java.util.HashMap) ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) ArrayList(java.util.ArrayList) List(java.util.List) HashSet(java.util.HashSet) Path(org.apache.flink.core.fs.Path) StreamRecord(org.apache.flink.streaming.runtime.streamrecord.StreamRecord) OneInputStreamOperatorTestHarness(org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness) FileInputSplit(org.apache.flink.core.fs.FileInputSplit) TimestampedFileInputSplit(org.apache.flink.streaming.api.functions.source.TimestampedFileInputSplit) TextInputFormat(org.apache.flink.api.java.io.TextInputFormat) ContinuousFileReaderOperator(org.apache.flink.streaming.api.functions.source.ContinuousFileReaderOperator) Watermark(org.apache.flink.streaming.api.watermark.Watermark) Test(org.junit.Test)

Example 99 with Path

use of org.apache.flink.core.fs.Path in project flink by apache.

the class ContinuousFileProcessingTest method testNestedFilesProcessing.

@Test
public void testNestedFilesProcessing() throws Exception {
    String testBasePath = hdfsURI + "/" + UUID.randomUUID() + "/";
    final Set<org.apache.hadoop.fs.Path> filesCreated = new HashSet<>();
    final Set<String> filesToBeRead = new TreeSet<>();
    // create two nested directories
    org.apache.hadoop.fs.Path firstLevelDir = new org.apache.hadoop.fs.Path(testBasePath + "/" + "firstLevelDir");
    org.apache.hadoop.fs.Path secondLevelDir = new org.apache.hadoop.fs.Path(testBasePath + "/" + "firstLevelDir" + "/" + "secondLevelDir");
    Assert.assertFalse(hdfs.exists(firstLevelDir));
    hdfs.mkdirs(firstLevelDir);
    hdfs.mkdirs(secondLevelDir);
    // create files in the base dir, the first level dir and the second level dir
    for (int i = 0; i < NO_OF_FILES; i++) {
        Tuple2<org.apache.hadoop.fs.Path, String> file = createFileAndFillWithData(testBasePath, "firstLevelFile", i, "This is test line.");
        filesCreated.add(file.f0);
        filesToBeRead.add(file.f0.getName());
    }
    for (int i = 0; i < NO_OF_FILES; i++) {
        Tuple2<org.apache.hadoop.fs.Path, String> file = createFileAndFillWithData(firstLevelDir.toString(), "secondLevelFile", i, "This is test line.");
        filesCreated.add(file.f0);
        filesToBeRead.add(file.f0.getName());
    }
    for (int i = 0; i < NO_OF_FILES; i++) {
        Tuple2<org.apache.hadoop.fs.Path, String> file = createFileAndFillWithData(secondLevelDir.toString(), "thirdLevelFile", i, "This is test line.");
        filesCreated.add(file.f0);
        filesToBeRead.add(file.f0.getName());
    }
    TextInputFormat format = new TextInputFormat(new Path(testBasePath));
    format.setFilesFilter(FilePathFilter.createDefaultFilter());
    format.setNestedFileEnumeration(true);
    ContinuousFileMonitoringFunction<String> monitoringFunction = new ContinuousFileMonitoringFunction<>(format, FileProcessingMode.PROCESS_ONCE, 1, INTERVAL);
    final FileVerifyingSourceContext context = new FileVerifyingSourceContext(new OneShotLatch(), monitoringFunction);
    monitoringFunction.open(new Configuration());
    monitoringFunction.run(context);
    Assert.assertArrayEquals(filesToBeRead.toArray(), context.getSeenFiles().toArray());
    // finally delete the dirs and the files created for the test.
    for (org.apache.hadoop.fs.Path file : filesCreated) {
        hdfs.delete(file, false);
    }
    hdfs.delete(secondLevelDir, false);
    hdfs.delete(firstLevelDir, false);
}
Also used : Path(org.apache.flink.core.fs.Path) Configuration(org.apache.flink.configuration.Configuration) ContinuousFileMonitoringFunction(org.apache.flink.streaming.api.functions.source.ContinuousFileMonitoringFunction) TextInputFormat(org.apache.flink.api.java.io.TextInputFormat) TreeSet(java.util.TreeSet) OneShotLatch(org.apache.flink.core.testutils.OneShotLatch) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 100 with Path

use of org.apache.flink.core.fs.Path in project flink by apache.

the class DataSet method write.

/**
	 * Writes a DataSet using a {@link FileOutputFormat} to a specified location.
	 * This method adds a data sink to the program.
	 * 
	 * @param outputFormat The FileOutputFormat to write the DataSet.
	 * @param filePath The path to the location where the DataSet is written.
	 * @param writeMode The mode of writing, indicating whether to overwrite existing files.
	 * @return The DataSink that writes the DataSet.
	 * 
	 * @see FileOutputFormat
	 */
public DataSink<T> write(FileOutputFormat<T> outputFormat, String filePath, WriteMode writeMode) {
    Preconditions.checkNotNull(filePath, "File path must not be null.");
    Preconditions.checkNotNull(writeMode, "Write mode must not be null.");
    Preconditions.checkNotNull(outputFormat, "Output format must not be null.");
    outputFormat.setOutputFilePath(new Path(filePath));
    outputFormat.setWriteMode(writeMode);
    return output(outputFormat);
}
Also used : Path(org.apache.flink.core.fs.Path)
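
As the Javadoc above describes, write() simply wires the supplied FileOutputFormat to the given path and write mode before passing it to output(). Below is a minimal usage sketch; the TextOutputFormat choice, output path, and job name are illustrative assumptions, not part of the example above.

import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.io.TextOutputFormat;
import org.apache.flink.core.fs.FileSystem.WriteMode;
import org.apache.flink.core.fs.Path;

public class WriteExample {

    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        // hypothetical output location; adjust to a path valid in your setup
        String outputPath = "file:///tmp/flink-write-example";
        DataSet<String> lines = env.fromElements("first line", "second line", "third line");
        // write() always calls setOutputFilePath(new Path(filePath)) on the format,
        // so the same string is passed to both the TextOutputFormat constructor and write()
        lines.write(new TextOutputFormat<String>(new Path(outputPath)), outputPath, WriteMode.OVERWRITE)
             .setParallelism(1);
        env.execute("DataSet write example");
    }
}

Note that because write() overrides the format's output path, the path given to the format's constructor is effectively irrelevant; only the filePath argument determines where the data lands.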

Aggregations

Path (org.apache.flink.core.fs.Path)214 Test (org.junit.Test)116 File (java.io.File)71 FileInputSplit (org.apache.flink.core.fs.FileInputSplit)59 IOException (java.io.IOException)48 Configuration (org.apache.flink.configuration.Configuration)46 FileSystem (org.apache.flink.core.fs.FileSystem)34 FileOutputStream (java.io.FileOutputStream)26 ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment)23 ArrayList (java.util.ArrayList)21 ExecutionConfig (org.apache.flink.api.common.ExecutionConfig)21 Tuple2 (org.apache.flink.api.java.tuple.Tuple2)21 OutputStreamWriter (java.io.OutputStreamWriter)18 HashMap (java.util.HashMap)17 TaskInfo (org.apache.flink.api.common.TaskInfo)17 FileStatus (org.apache.flink.core.fs.FileStatus)15 OptimizedPlan (org.apache.flink.optimizer.plan.OptimizedPlan)14 TimestampedFileInputSplit (org.apache.flink.streaming.api.functions.source.TimestampedFileInputSplit)14 AvroInputFormat (org.apache.flink.api.java.io.AvroInputFormat)13 TextInputFormat (org.apache.flink.api.java.io.TextInputFormat)13