Use of org.apache.flink.api.java.io.TextInputFormat in project flink by apache.
The class StreamExecutionEnvironment, method readTextFile.
/**
* Reads the given file line-by-line and creates a data stream that contains a string with the
* contents of each such line. The {@link java.nio.charset.Charset} with the given name will be
* used to read the files.
*
* <p><b>NOTES ON CHECKPOINTING: </b> The source monitors the path, creates the {@link
* org.apache.flink.core.fs.FileInputSplit FileInputSplits} to be processed, forwards them to
* the downstream readers to read the actual data, and exits, without waiting for the readers to
* finish reading. This implies that no more checkpoint barriers are going to be forwarded after
* the source exits, thus having no checkpoints after that point.
*
* @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or
* "hdfs://host:port/file/path")
* @param charsetName The name of the character set used to read the file
* @return The data stream that represents the data read from the given file as text lines
*/
public DataStreamSource<String> readTextFile(String filePath, String charsetName) {
    Preconditions.checkArgument(
            !StringUtils.isNullOrWhitespaceOnly(filePath),
            "The file path must not be null or blank.");

    TextInputFormat format = new TextInputFormat(new Path(filePath));
    format.setFilesFilter(FilePathFilter.createDefaultFilter());
    TypeInformation<String> typeInfo = BasicTypeInfo.STRING_TYPE_INFO;
    format.setCharsetName(charsetName);

    return readFile(format, filePath, FileProcessingMode.PROCESS_ONCE, -1, typeInfo);
}
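For orientation, a minimal sketch of how readTextFile with an explicit charset might be called from a job; the input path, charset, and job name below are placeholders, not part of the Flink sources quoted above.

import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class ReadTextFileExample {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Read the file once (PROCESS_ONCE under the hood) and decode each line with the given charset.
        // "file:///tmp/input.txt" is a placeholder path.
        DataStreamSource<String> lines = env.readTextFile("file:///tmp/input.txt", "UTF-8");

        lines.print();
        env.execute("read-text-file-example");
    }
}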
Use of org.apache.flink.api.java.io.TextInputFormat in project flink by apache.
The class ContinuousFileProcessingCheckpointITCase, method testProgram.
@Override
public void testProgram(StreamExecutionEnvironment env) {
    env.enableCheckpointing(10);

    // create and start the file creating thread.
    fc = new FileCreator();
    fc.start();

    // create the monitoring source along with the necessary readers.
    TextInputFormat format = new TextInputFormat(new org.apache.flink.core.fs.Path(localFsURI));
    format.setFilesFilter(FilePathFilter.createDefaultFilter());

    DataStream<String> inputStream =
            env.readFile(format, localFsURI, FileProcessingMode.PROCESS_CONTINUOUSLY, INTERVAL);

    TestingSinkFunction sink = new TestingSinkFunction();

    inputStream.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public void flatMap(String value, Collector<String> out) throws Exception {
            out.collect(value);
        }
    }).addSink(sink).setParallelism(1);
}
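Outside the test harness, the same TextInputFormat plus PROCESS_CONTINUOUSLY wiring can be sketched roughly as follows; the directory, checkpoint interval, and polling interval are illustrative values, not taken from the test above.

import org.apache.flink.api.common.io.FilePathFilter;
import org.apache.flink.api.java.io.TextInputFormat;
import org.apache.flink.core.fs.Path;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.FileProcessingMode;

public class ContinuousReadExample {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.enableCheckpointing(1000);

        String dir = "file:///tmp/watched-dir"; // placeholder directory
        TextInputFormat format = new TextInputFormat(new Path(dir));
        format.setFilesFilter(FilePathFilter.createDefaultFilter());

        // Re-scan the directory every 100 ms and read any files that appear.
        DataStream<String> lines =
                env.readFile(format, dir, FileProcessingMode.PROCESS_CONTINUOUSLY, 100);

        lines.print();
        env.execute("continuous-read-example");
    }
}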
Use of org.apache.flink.api.java.io.TextInputFormat in project flink by apache.
The class ContinuousFileProcessingTest, method testInvalidPathSpecification.
@Test
public void testInvalidPathSpecification() throws Exception {
    String invalidPath =
            "hdfs://" + hdfsCluster.getURI().getHost() + ":" + hdfsCluster.getNameNodePort() + "/invalid/";
    TextInputFormat format = new TextInputFormat(new Path(invalidPath));

    ContinuousFileMonitoringFunction<String> monitoringFunction =
            new ContinuousFileMonitoringFunction<>(format, FileProcessingMode.PROCESS_ONCE, 1, INTERVAL);

    try {
        monitoringFunction.run(new DummySourceContext() {
            @Override
            public void collect(TimestampedFileInputSplit element) {
                // we should never arrive here with an invalid path
                Assert.fail("Test passed with an invalid path.");
            }
        });

        // we should never arrive here with an invalid path
        Assert.fail("Test passed with an invalid path.");
    } catch (FileNotFoundException e) {
        Assert.assertEquals(
                "The provided file path " + format.getFilePath() + " does not exist.", e.getMessage());
    }
}
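The test relies on ContinuousFileMonitoringFunction failing fast with a FileNotFoundException for a missing path. A caller that prefers to validate the directory before building the source could use Flink's FileSystem abstraction; a small sketch under that assumption (the helper name is ours, not Flink's):

import org.apache.flink.core.fs.FileSystem;
import org.apache.flink.core.fs.Path;

public final class PathCheck {
    private PathCheck() {
    }

    // Returns true if the given URI points at an existing file or directory.
    public static boolean pathExists(String uri) throws java.io.IOException {
        Path path = new Path(uri);
        FileSystem fs = path.getFileSystem();
        return fs.exists(path);
    }
}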
Use of org.apache.flink.api.java.io.TextInputFormat in project flink by apache.
The class ContinuousFileProcessingTest, method testFileReadingOperatorWithIngestionTime.
@Test
public void testFileReadingOperatorWithIngestionTime() throws Exception {
    String testBasePath = hdfsURI + "/" + UUID.randomUUID() + "/";

    Set<org.apache.hadoop.fs.Path> filesCreated = new HashSet<>();
    Map<Integer, String> expectedFileContents = new HashMap<>();
    Map<String, Long> modTimes = new HashMap<>();
    for (int i = 0; i < NO_OF_FILES; i++) {
        Tuple2<org.apache.hadoop.fs.Path, String> file =
                createFileAndFillWithData(testBasePath, "file", i, "This is test line.");
        filesCreated.add(file.f0);
        modTimes.put(file.f0.getName(), hdfs.getFileStatus(file.f0).getModificationTime());
        expectedFileContents.put(i, file.f1);
    }

    TextInputFormat format = new TextInputFormat(new Path(testBasePath));

    final long watermarkInterval = 10;

    final OneInputStreamOperatorTestHarness<TimestampedFileInputSplit, String> tester = createHarness(format);
    SteppingMailboxProcessor localMailbox = createLocalMailbox(tester);

    tester.getExecutionConfig().setAutoWatermarkInterval(watermarkInterval);
    tester.setTimeCharacteristic(TimeCharacteristic.IngestionTime);
    tester.open();

    Assert.assertEquals(TimeCharacteristic.IngestionTime, tester.getTimeCharacteristic());

    tester.setProcessingTime(201);

    // test that watermarks are correctly emitted
    ConcurrentLinkedQueue<Object> output = tester.getOutput();
    while (output.isEmpty()) {
        localMailbox.runMailboxStep();
    }

    Assert.assertTrue(output.toString(), output.peek() instanceof Watermark);
    Assert.assertEquals(200, ((Watermark) output.poll()).getTimestamp());

    tester.setProcessingTime(301);
    Assert.assertTrue(output.peek() instanceof Watermark);
    Assert.assertEquals(300, ((Watermark) output.poll()).getTimestamp());

    tester.setProcessingTime(401);
    Assert.assertTrue(output.peek() instanceof Watermark);
    Assert.assertEquals(400, ((Watermark) output.poll()).getTimestamp());

    tester.setProcessingTime(501);
    Assert.assertTrue(output.peek() instanceof Watermark);
    Assert.assertEquals(500, ((Watermark) output.poll()).getTimestamp());

    Assert.assertTrue(output.isEmpty());

    // create the necessary splits for the test
    FileInputSplit[] splits = format.createInputSplits(tester.getExecutionConfig().getParallelism());

    // and feed them to the operator
    Map<Integer, List<String>> actualFileContents = new HashMap<>();

    long lastSeenWatermark = Long.MIN_VALUE;
    // counter for the lines read from the splits
    int lineCounter = 0;
    int watermarkCounter = 0;

    for (FileInputSplit split : splits) {
        // set the next "current processing time".
        long nextTimestamp = tester.getProcessingTime() + watermarkInterval;
        tester.setProcessingTime(nextTimestamp);

        // send the next split to be read and wait until it is fully read;
        // the +1 accounts for the trailing watermark.
        RunnableWithException runnableWithException =
                () -> tester.processElement(new StreamRecord<>(new TimestampedFileInputSplit(
                        modTimes.get(split.getPath().getName()),
                        split.getSplitNumber(),
                        split.getPath(),
                        split.getStart(),
                        split.getLength(),
                        split.getHostnames())));
        runnableWithException.run();

        // busy-waiting on the mailbox like this is acceptable only in this test
        while (tester.getOutput().isEmpty() || tester.getOutput().size() != (LINES_PER_FILE + 1)) {
            localMailbox.runMailboxStep();
        }

        // verify that the results are as expected
        for (Object line : tester.getOutput()) {
            if (line instanceof StreamRecord) {
                @SuppressWarnings("unchecked")
                StreamRecord<String> element = (StreamRecord<String>) line;
                lineCounter++;

                Assert.assertEquals(nextTimestamp, element.getTimestamp());

                int fileIdx = Character.getNumericValue(element.getValue().charAt(0));
                List<String> content = actualFileContents.get(fileIdx);
                if (content == null) {
                    content = new ArrayList<>();
                    actualFileContents.put(fileIdx, content);
                }
                content.add(element.getValue() + "\n");
            } else if (line instanceof Watermark) {
                long watermark = ((Watermark) line).getTimestamp();

                Assert.assertEquals(nextTimestamp - (nextTimestamp % watermarkInterval), watermark);
                Assert.assertTrue(watermark > lastSeenWatermark);
                watermarkCounter++;

                lastSeenWatermark = watermark;
            } else {
                Assert.fail("Unknown element in the list.");
            }
        }

        // clear the output to be ready for the next split
        tester.getOutput().clear();
    }

    // since we process one split after the other, all the elements must be here by now.
    Assert.assertEquals(NO_OF_FILES * LINES_PER_FILE, lineCounter);

    // we expect one watermark per split.
    Assert.assertEquals(splits.length, watermarkCounter);

    // close the reader gracefully so that the Long.MAX_VALUE watermark is emitted
    synchronized (tester.getCheckpointLock()) {
        tester.close();
    }

    for (org.apache.hadoop.fs.Path file : filesCreated) {
        hdfs.delete(file, false);
    }

    // check that the last element is the Long.MAX_VALUE watermark (by now this must be the only element)
    Assert.assertEquals(1, tester.getOutput().size());
    Assert.assertTrue(tester.getOutput().peek() instanceof Watermark);
    Assert.assertEquals(Long.MAX_VALUE, ((Watermark) tester.getOutput().poll()).getTimestamp());

    // check that the read elements are the expected ones.
    Assert.assertEquals(expectedFileContents.size(), actualFileContents.size());
    for (Integer fileIdx : expectedFileContents.keySet()) {
        Assert.assertTrue("file" + fileIdx + " not found", actualFileContents.keySet().contains(fileIdx));

        List<String> cntnt = actualFileContents.get(fileIdx);
        Collections.sort(cntnt, new Comparator<String>() {
            @Override
            public int compare(String o1, String o2) {
                return getLineNo(o1) - getLineNo(o2);
            }
        });

        StringBuilder cntntStr = new StringBuilder();
        for (String line : cntnt) {
            cntntStr.append(line);
        }
        Assert.assertEquals(expectedFileContents.get(fileIdx), cntntStr.toString());
    }
}
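For comparison with the harness setup above, a plain job that relies on the same ingestion-time watermarking would be configured roughly like this; the watermark interval, input path, and job name are illustrative.

import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class IngestionTimeSetup {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // With ingestion time, timestamps are assigned at the source and
        // watermarks are emitted automatically at the configured interval.
        env.setStreamTimeCharacteristic(TimeCharacteristic.IngestionTime);
        env.getConfig().setAutoWatermarkInterval(10);

        env.readTextFile("file:///tmp/input.txt") // placeholder path
                .print();

        env.execute("ingestion-time-example");
    }
}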
Use of org.apache.flink.api.java.io.TextInputFormat in project flink by apache.
The class ContinuousFileProcessingTest, method testProcessContinuously.
@Test
public void testProcessContinuously() throws Exception {
    String testBasePath = hdfsURI + "/" + UUID.randomUUID() + "/";

    final OneShotLatch latch = new OneShotLatch();

    // create a single file in the directory
    Tuple2<org.apache.hadoop.fs.Path, String> bootstrap =
            createFileAndFillWithData(testBasePath, "file", NO_OF_FILES + 1, "This is test line.");
    Assert.assertTrue(hdfs.exists(bootstrap.f0));

    final Set<String> filesToBeRead = new TreeSet<>();
    filesToBeRead.add(bootstrap.f0.getName());

    TextInputFormat format = new TextInputFormat(new Path(testBasePath));
    format.setFilesFilter(FilePathFilter.createDefaultFilter());

    final ContinuousFileMonitoringFunction<String> monitoringFunction =
            createTestContinuousFileMonitoringFunction(format, FileProcessingMode.PROCESS_CONTINUOUSLY);

    // 1 for the bootstrap file + NO_OF_FILES created below
    final int totalNoOfFilesToBeRead = NO_OF_FILES + 1;

    final FileVerifyingSourceContext context =
            new FileVerifyingSourceContext(latch, monitoringFunction, 1, totalNoOfFilesToBeRead);

    final Thread t = new Thread() {
        @Override
        public void run() {
            try {
                monitoringFunction.open(new Configuration());
                monitoringFunction.run(context);
            } catch (Exception e) {
                Assert.fail(e.getMessage());
            }
        }
    };
    t.start();

    if (!latch.isTriggered()) {
        latch.await();
    }

    // create some additional files that will be processed in the case of PROCESS_CONTINUOUSLY
    final org.apache.hadoop.fs.Path[] filesCreated = new org.apache.hadoop.fs.Path[NO_OF_FILES];
    for (int i = 0; i < NO_OF_FILES; i++) {
        Tuple2<org.apache.hadoop.fs.Path, String> file =
                createFileAndFillWithData(testBasePath, "file", i, "This is test line.");
        filesCreated[i] = file.f0;
        filesToBeRead.add(file.f0.getName());
    }

    // wait until the monitoring thread exits
    t.join();

    Assert.assertArrayEquals(filesToBeRead.toArray(), context.getSeenFiles().toArray());

    // finally delete the files created for the test.
    hdfs.delete(bootstrap.f0, false);
    for (org.apache.hadoop.fs.Path path : filesCreated) {
        hdfs.delete(path, false);
    }
}
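Both tests use FilePathFilter.createDefaultFilter(), which filters out hidden files and files still being copied. If additional files should be ignored, a custom filter can be set on the format; the ".tmp" rule below is only an example, not something the tests above do.

import org.apache.flink.api.common.io.FilePathFilter;
import org.apache.flink.api.java.io.TextInputFormat;
import org.apache.flink.core.fs.Path;

public class TmpFileFilterExample {
    public static void main(String[] args) {
        TextInputFormat format = new TextInputFormat(new Path("file:///tmp/watched-dir"));

        final FilePathFilter defaultFilter = FilePathFilter.createDefaultFilter();

        // Reject everything the default filter rejects, plus any ".tmp" files.
        format.setFilesFilter(new FilePathFilter() {
            @Override
            public boolean filterPath(Path filePath) {
                return defaultFilter.filterPath(filePath) || filePath.getName().endsWith(".tmp");
            }
        });
    }
}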