Example use of org.apache.flink.core.fs.FileInputSplit in the Apache Flink project.
From the class FileInputFormatTest, method testReadMultiplePatterns.
@Test
public void testReadMultiplePatterns() throws Exception {
    final String fileContents = "CONTENTS";

    // Create two files in the temp directory; both will be excluded below.
    final File textFile = temporaryFolder.newFile("dataFile1.txt");
    final File binaryFile = temporaryFolder.newFile("another_file.bin");
    createTempFiles(fileContents.getBytes(ConfigConstants.DEFAULT_CHARSET), textFile, binaryFile);

    // Include everything ("**") but explicitly exclude both created files.
    final Configuration configuration = new Configuration();
    final DummyFileInputFormat format = new DummyFileInputFormat();
    format.setFilePath(temporaryFolder.getRoot().toURI().toString());
    format.configure(configuration);
    format.setFilesFilter(
        new GlobFilePathFilter(
            Collections.singletonList("**"),
            Arrays.asList("**/another_file.bin", "**/dataFile1.txt")));

    // Since every file is filtered out, no input splits should be generated.
    final FileInputSplit[] splits = format.createInputSplits(1);
    Assert.assertEquals(0, splits.length);
}
Example use of org.apache.flink.core.fs.FileInputSplit in the Apache Flink project.
From the class GenericCsvInputFormatTest, method testReadWithCharset.
@Test
public void testReadWithCharset() throws IOException {
    // Unicode row fragments that exercise multi-byte encodings.
    final String[] records = new String[] { "Ȏȟ", "Flink", "ȋȏ" };
    // Unicode field delimiter.
    final String delimiter = "׀׀";
    final String fileContent = StringUtils.join(records, delimiter);

    // StringValueParser does not use the charset, so rely on StringParser via String fields.
    final GenericCsvInputFormat<String[]> format = new GenericCsvInputFormat<String[]>() {

        @Override
        public String[] readRecord(String[] target, byte[] bytes, int offset, int numBytes) throws IOException {
            return parseRecord(target, bytes, offset, numBytes) ? target : null;
        }
    };
    format.setFilePath("file:///some/file/that/will/not/be/read");

    for (String charset : new String[] { "UTF-8", "UTF-16BE", "UTF-16LE" }) {
        final File tempFile = File.createTempFile("test_contents", "tmp");
        tempFile.deleteOnExit();

        // Write the test content using the encoding under test.
        try (Writer out = new OutputStreamWriter(new FileOutputStream(tempFile), charset)) {
            out.write(fileContent);
        }

        final FileInputSplit split =
            new FileInputSplit(0, new Path(tempFile.toURI().toString()), 0, tempFile.length(), new String[] { "localhost" });

        format.setFieldDelimiter(delimiter);
        format.setFieldTypesGeneric(String.class, String.class, String.class);
        // Use the same encoding to parse the file as was used to write it;
        // the field delimiter is reinterpreted when the charset is set.
        format.setCharset(charset);
        format.configure(new Configuration());
        format.open(split);
        try {
            String[] values = new String[] { "", "", "" };
            values = format.nextRecord(values);

            // Validate that all fields were decoded back to the original fragments.
            assertNotNull(values);
            for (int i = 0; i < records.length; i++) {
                assertEquals(records[i], values[i]);
            }
            assertNull(format.nextRecord(values));
            assertTrue(format.reachedEnd());
        } finally {
            // BUGFIX: close per iteration. Previously a single close() after the loop
            // released only the split opened in the last iteration, leaking the
            // streams opened for the first two charsets.
            format.close();
        }
    }
}
Example use of org.apache.flink.core.fs.FileInputSplit in the Apache Flink project.
From the class GenericCsvInputFormatTest, method readWithHeaderLine.
@Test
public void readWithHeaderLine() throws Exception {
    // One header line followed by two data rows, '|'-delimited.
    final String fileContent =
        "colname-1|colname-2|some name 3|column four|\n"
            + "123|abc|456|def|\n"
            + "987|xyz|654|pqr|\n";
    final FileInputSplit split = createTempFile(fileContent);

    final Configuration parameters = new Configuration();
    format.setFieldDelimiter("|");
    format.setFieldTypesGeneric(IntValue.class, StringValue.class, IntValue.class, StringValue.class);
    format.setSkipFirstLineAsHeader(true);
    format.configure(parameters);
    format.open(split);

    final Value[] values = new Value[] { new IntValue(), new StringValue(), new IntValue(), new StringValue() };

    // The first line is skipped as the header, so exactly two records remain.
    // first row (= second line)
    assertNotNull(format.nextRecord(values));
    // second row (= third line)
    assertNotNull(format.nextRecord(values));
    // exhausted
    assertNull(format.nextRecord(values));
    assertTrue(format.reachedEnd());
    // BUGFIX: removed the catch(Exception)->fail(getMessage()) wrapper; it discarded
    // the stack trace on failure. Declaring `throws Exception` lets JUnit report
    // the full cause directly.
}
Example use of org.apache.flink.core.fs.FileInputSplit in the Apache Flink project.
From the class DelimitedInputFormatTest, method testOpen.
@Test
public void testOpen() throws IOException {
    final String fileContent = "my mocked line 1\nmy mocked line 2\n";
    final FileInputSplit split = createTempFile(fileContent);

    final int bufferSize = 5;
    format.setBufferSize(bufferSize);
    format.open(split);

    // After open(), the split starts at offset 0 and the remaining split length is
    // the content length minus one buffer (presumably open() pre-fills one buffer —
    // NOTE(review): confirm against DelimitedInputFormat.open()).
    assertEquals(0, format.splitStart);
    assertEquals(fileContent.length() - bufferSize, format.splitLength);
    assertEquals(bufferSize, format.getBufferSize());
}
Example use of org.apache.flink.core.fs.FileInputSplit in the Apache Flink project.
From the class DelimitedInputFormatTest, method testReadRecordsLargerThanBuffer.
@Test
public void testReadRecordsLargerThanBuffer() throws IOException {
    final String fileContent =
        "aaaaaaaaaaaaaaaaaaaaa\n"
            + "bbbbbbbbbbbbbbbbbbbbbbbbb\n"
            + "ccccccccccccccccccc\n"
            + "ddddddddddddddddddddddddddddddddddd\n";
    final FileInputSplit wholeFile = createTempFile(fileContent);

    // Cover the file with two splits: the first half, then one starting at the midpoint.
    // NOTE(review): the second split's length is the FULL file length, so it extends
    // past EOF — presumably intentional, since the delimited format reads to end of
    // file anyway; confirm it should not be (length - start).
    final FileInputSplit firstHalf =
        new FileInputSplit(0, wholeFile.getPath(), 0, wholeFile.getLength() / 2, wholeFile.getHostnames());
    final FileInputSplit secondHalf =
        new FileInputSplit(1, wholeFile.getPath(), firstHalf.getLength(), wholeFile.getLength(), wholeFile.getHostnames());

    // A buffer far smaller than any record forces multi-buffer record assembly.
    format.setBufferSize(8);
    format.configure(new Configuration());

    final List<String> collected = new ArrayList<String>();
    for (FileInputSplit split : new FileInputSplit[] { firstHalf, secondHalf }) {
        format.open(split);
        String record;
        while ((record = format.nextRecord(null)) != null) {
            collected.add(record);
        }
        assertNull(format.nextRecord(null));
        assertTrue(format.reachedEnd());
        format.close();
    }

    // All four lines must be recovered exactly once across the two splits.
    assertEquals(4, collected.size());
    assertEquals(Arrays.asList(fileContent.split("\n")), collected);
}
Aggregations