Use of org.apache.flink.core.fs.FileInputSplit in project flink by apache.
From class CsvInputFormatTest, method readStringFieldsWithTrailingDelimiters.
@Test
public void readStringFieldsWithTrailingDelimiters() {
    try {
        final String fileContent = "abc|-def|-ghijk\nabc|-|-hhg\n|-|-|-\n";
        final FileInputSplit split = createTempFile(fileContent);
        final TupleTypeInfo<Tuple3<String, String, String>> typeInfo =
            TupleTypeInfo.getBasicTupleTypeInfo(String.class, String.class, String.class);
        final CsvInputFormat<Tuple3<String, String, String>> format =
            new TupleCsvInputFormat<>(PATH, typeInfo);
        format.setFieldDelimiter("|-");
        format.configure(new Configuration());
        format.open(split);

        Tuple3<String, String, String> result = new Tuple3<>();

        // first record: all three fields populated
        result = format.nextRecord(result);
        assertNotNull(result);
        assertEquals("abc", result.f0);
        assertEquals("def", result.f1);
        assertEquals("ghijk", result.f2);

        // second record: the middle field is empty
        result = format.nextRecord(result);
        assertNotNull(result);
        assertEquals("abc", result.f0);
        assertEquals("", result.f1);
        assertEquals("hhg", result.f2);

        // third record: all fields empty, with a trailing delimiter on the line
        result = format.nextRecord(result);
        assertNotNull(result);
        assertEquals("", result.f0);
        assertEquals("", result.f1);
        assertEquals("", result.f2);

        // end of input
        result = format.nextRecord(result);
        assertNull(result);
        assertTrue(format.reachedEnd());
    } catch (Exception ex) {
        fail("Test failed due to a " + ex.getClass().getName() + ": " + ex.getMessage());
    }
}
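The createTempFile helper and the PATH constant are defined elsewhere in CsvInputFormatTest and are not part of the snippet above. A minimal sketch of what they might look like, assuming the returned split is meant to span the whole temporary file (the PATH argument is effectively ignored, because the format is opened directly on the split):

// Hypothetical versions of the helpers the test relies on.
private static final Path PATH = new Path("an/ignored/file/");

private FileInputSplit createTempFile(String content) throws IOException {
    File tempFile = File.createTempFile("test_contents", "tmp");
    tempFile.deleteOnExit();
    try (OutputStreamWriter wrt = new OutputStreamWriter(new FileOutputStream(tempFile))) {
        wrt.write(content);
    }
    // a single split covering the entire file
    return new FileInputSplit(0, new Path(tempFile.toURI().toString()), 0,
        tempFile.length(), new String[] { "localhost" });
}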
Use of org.apache.flink.core.fs.FileInputSplit in project flink by apache.
From class CsvInputFormatTest, method testPojoTypeWithPartialFieldInCSV.
@Test
public void testPojoTypeWithPartialFieldInCSV() throws Exception {
    File tempFile = File.createTempFile("CsvReaderPojoType", "tmp");
    tempFile.deleteOnExit();
    tempFile.setWritable(true);

    OutputStreamWriter wrt = new OutputStreamWriter(new FileOutputStream(tempFile));
    wrt.write("123,NODATA,AAA,NODATA,3.123,BBB\n");
    wrt.write("456,NODATA,BBB,NODATA,1.123,AAA\n");
    wrt.close();

    @SuppressWarnings("unchecked")
    PojoTypeInfo<PojoItem> typeInfo = (PojoTypeInfo<PojoItem>) TypeExtractor.createTypeInfo(PojoItem.class);

    // the boolean mask skips the two NODATA columns (indices 1 and 3)
    CsvInputFormat<PojoItem> inputFormat = new PojoCsvInputFormat<>(
        new Path(tempFile.toURI().toString()), typeInfo,
        new boolean[] { true, false, true, false, true, true });
    inputFormat.configure(new Configuration());

    FileInputSplit[] splits = inputFormat.createInputSplits(1);
    inputFormat.open(splits[0]);
    validatePojoItem(inputFormat);
}
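PojoItem and validatePojoItem are defined elsewhere in the test class. A plausible sketch, assuming the four non-skipped columns (0, 2, 4, 5) map to an int, a String, a Double, and a String in field-name order (PojoTypeInfo orders POJO fields alphabetically):

// Hypothetical POJO matching the four selected CSV columns.
public static class PojoItem {
    public int field1;
    public String field2;
    public Double field3;
    public String field4;
}

// Hypothetical validation helper: reads both records and checks the mapped fields.
private void validatePojoItem(CsvInputFormat<PojoItem> inputFormat) throws IOException {
    PojoItem item = inputFormat.nextRecord(new PojoItem());
    assertEquals(123, item.field1);
    assertEquals("AAA", item.field2);
    assertEquals(3.123, item.field3, 0.001);
    assertEquals("BBB", item.field4);

    item = inputFormat.nextRecord(item);
    assertEquals(456, item.field1);
    assertEquals("BBB", item.field2);
    assertEquals(1.123, item.field3, 0.001);
    assertEquals("AAA", item.field4);
}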
Use of org.apache.flink.core.fs.FileInputSplit in project flink by apache.
From class CsvInputFormatTest, method testQuotedStringParsingWithIncludeFields.
@Test
public void testQuotedStringParsingWithIncludeFields() throws Exception {
    final String fileContent =
        "\"20:41:52-1-3-2015\"|\"Re: Taskmanager memory error in Eclipse\"|" +
        "\"Blahblah <blah@blahblah.org>\"|\"blaaa|\"blubb\"";

    final File tempFile = File.createTempFile("CsvReaderQuotedString", "tmp");
    tempFile.deleteOnExit();
    tempFile.setWritable(true);

    OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(tempFile));
    writer.write(fileContent);
    writer.close();

    TupleTypeInfo<Tuple2<String, String>> typeInfo =
        TupleTypeInfo.getBasicTupleTypeInfo(String.class, String.class);

    // only the first and third source columns are mapped into the tuple
    CsvInputFormat<Tuple2<String, String>> inputFormat = new TupleCsvInputFormat<>(
        new Path(tempFile.toURI().toString()), typeInfo,
        new boolean[] { true, false, true });
    inputFormat.enableQuotedStringParsing('"');
    inputFormat.setFieldDelimiter("|");
    inputFormat.setDelimiter('\n');
    inputFormat.configure(new Configuration());

    FileInputSplit[] splits = inputFormat.createInputSplits(1);
    inputFormat.open(splits[0]);

    Tuple2<String, String> record = inputFormat.nextRecord(new Tuple2<String, String>());
    assertEquals("20:41:52-1-3-2015", record.f0);
    assertEquals("Blahblah <blah@blahblah.org>", record.f1);
}
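The include-fields mask is what maps source columns to tuple positions here: with { true, false, true }, column 0 lands in f0 and column 2 in f1, while column 1 is parsed but discarded. A minimal illustration of the same mechanism over well-formed input, reusing the hypothetical createTempFile helper and PATH constant sketched earlier:

// Hypothetical usage sketch of the include-fields mask with quoted parsing.
String wellFormed = "\"a\"|\"skipped\"|\"b\"\n";
FileInputSplit split = createTempFile(wellFormed);
CsvInputFormat<Tuple2<String, String>> fmt = new TupleCsvInputFormat<>(
    PATH, TupleTypeInfo.getBasicTupleTypeInfo(String.class, String.class),
    new boolean[] { true, false, true });
fmt.enableQuotedStringParsing('"');
fmt.setFieldDelimiter("|");
fmt.configure(new Configuration());
fmt.open(split);
Tuple2<String, String> rec = fmt.nextRecord(new Tuple2<>());
// rec.f0 == "a", rec.f1 == "b"; the middle column never reaches the tuple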
Use of org.apache.flink.core.fs.FileInputSplit in project flink by apache.
From class ContinuousFileReaderOperator, method restoreState.
// ------------------------------------------------------------------------
// Restoring / Migrating from an older Flink version.
// ------------------------------------------------------------------------
@Override
public void restoreState(FSDataInputStream in) throws Exception {
    LOG.info("{} (taskIdx={}) restoring state from an older Flink version.",
        getClass().getSimpleName(), getRuntimeContext().getIndexOfThisSubtask());

    // this is just to read the byte indicating if we have udf state or not
    int hasUdfState = in.read();
    Preconditions.checkArgument(hasUdfState == 0);

    final ObjectInputStream ois = new ObjectInputStream(in);
    final DataInputViewStreamWrapper div = new DataInputViewStreamWrapper(in);

    // read the split that was being read
    FileInputSplit currSplit = (FileInputSplit) ois.readObject();

    // read the pending splits list
    List<FileInputSplit> pendingSplits = new LinkedList<>();
    int noOfSplits = div.readInt();
    for (int i = 0; i < noOfSplits; i++) {
        FileInputSplit split = (FileInputSplit) ois.readObject();
        pendingSplits.add(split);
    }

    // read the state of the format
    Serializable formatState = (Serializable) ois.readObject();
    div.close();

    if (restoredReaderState == null) {
        restoredReaderState = new ArrayList<>();
    }

    // we do not know the modification time of the retrieved splits, so we assign them
    // artificial ones, with the only constraint that they respect the relative order of the
    // retrieved splits, because modification time is going to be used to sort the splits within
    // the "pending splits" priority queue.
    long now = getProcessingTimeService().getCurrentProcessingTime();
    long runningModTime = Math.max(now, noOfSplits + 1);

    TimestampedFileInputSplit currentSplit = createTimestampedFileSplit(currSplit, --runningModTime, formatState);
    restoredReaderState.add(currentSplit);

    for (FileInputSplit split : pendingSplits) {
        TimestampedFileInputSplit timestampedSplit = createTimestampedFileSplit(split, --runningModTime);
        restoredReaderState.add(timestampedSplit);
    }

    if (LOG.isDebugEnabled()) {
        LOG.debug("{} (taskIdx={}) restored {} splits from legacy: {}.",
            getClass().getSimpleName(), getRuntimeContext().getIndexOfThisSubtask(),
            restoredReaderState.size(), restoredReaderState);
    }
}
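createTimestampedFileSplit is a private helper of the operator and is not shown above. A sketch of what it plausibly does, assuming TimestampedFileInputSplit wraps the legacy split with the artificial modification time and, when present, the checkpointed reader state of the split that was open:

// Hypothetical reconstruction of the helper used above.
private TimestampedFileInputSplit createTimestampedFileSplit(
        FileInputSplit split, long modificationTime) {
    return createTimestampedFileSplit(split, modificationTime, null);
}

private TimestampedFileInputSplit createTimestampedFileSplit(
        FileInputSplit split, long modificationTime, Serializable state) {
    TimestampedFileInputSplit timestampedSplit = new TimestampedFileInputSplit(
        modificationTime, split.getSplitNumber(), split.getPath(),
        split.getStart(), split.getLength(), split.getHostnames());
    if (state != null) {
        // carry over the reader position of the split that was being read
        timestampedSplit.setSplitState(state);
    }
    return timestampedSplit;
}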
Use of org.apache.flink.core.fs.FileInputSplit in project flink by apache.
From class ContinuousFileMonitoringFunction, method getInputSplitsSortedByModTime.
/**
* Creates the input splits to be forwarded to the downstream tasks of the
* {@link ContinuousFileReaderOperator}. Splits are sorted <b>by modification time</b> before
* being forwarded and only splits belonging to files in the {@code eligibleFiles}
* list will be processed.
* @param eligibleFiles The files to process.
*/
private Map<Long, List<TimestampedFileInputSplit>> getInputSplitsSortedByModTime(
        Map<Path, FileStatus> eligibleFiles) throws IOException {

    Map<Long, List<TimestampedFileInputSplit>> splitsByModTime = new TreeMap<>();
    if (eligibleFiles.isEmpty()) {
        return splitsByModTime;
    }

    for (FileInputSplit split : format.createInputSplits(readerParallelism)) {
        FileStatus fileStatus = eligibleFiles.get(split.getPath());
        if (fileStatus != null) {
            Long modTime = fileStatus.getModificationTime();
            List<TimestampedFileInputSplit> splitsToForward = splitsByModTime.get(modTime);
            if (splitsToForward == null) {
                splitsToForward = new ArrayList<>();
                splitsByModTime.put(modTime, splitsToForward);
            }
            splitsToForward.add(new TimestampedFileInputSplit(
                modTime, split.getSplitNumber(), split.getPath(),
                split.getStart(), split.getLength(), split.getHostnames()));
        }
    }
    return splitsByModTime;
}
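Because splitsByModTime is a TreeMap keyed by modification time, iterating its entries yields splits oldest-first. A sketch of how a caller might consume the result; the ctx collector and the globalModificationTime bound are assumptions here, not part of the snippet:

// Hypothetical consumer: emit splits in ascending modification-time order
// and advance the monitored lower bound as each timestamp is processed.
for (Map.Entry<Long, List<TimestampedFileInputSplit>> entry : splitsByModTime.entrySet()) {
    long modificationTime = entry.getKey();
    for (TimestampedFileInputSplit split : entry.getValue()) {
        ctx.collect(split);  // forward to the downstream ContinuousFileReaderOperator
    }
    globalModificationTime = Math.max(globalModificationTime, modificationTime);
}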