Use of org.apache.flink.core.fs.FileInputSplit in the Apache Flink project.
Class CsvInputFormatTest, method ignoreInvalidLines.
/**
 * Reads a '|'-delimited, newline-separated file with lenient parsing enabled and checks that
 * only the three lines matching the (String, Integer, Double) layout are returned as records.
 *
 * <p>After each record, and again once the input is exhausted, the value reported by
 * {@code getCurrentState()} is compared with the number of characters of the file content up
 * to and including the line that was just consumed.
 *
 * @param bufferSize buffer size passed to the input format via {@code setBufferSize}
 */
private void ignoreInvalidLines(int bufferSize) {
    try {
        final String fileContent = "#description of the data\n"
                + "header1|header2|header3|\n"
                + "this is|1|2.0|\n"
                + "//a comment\n"
                + "a test|3|4.0|\n"
                + "#next|5|6.0|\n"
                + "asdasdas";
        final FileInputSplit split = createTempFile(fileContent);
        final TupleTypeInfo<Tuple3<String, Integer, Double>> typeInfo = TupleTypeInfo.getBasicTupleTypeInfo(String.class, Integer.class, Double.class);
        final CsvInputFormat<Tuple3<String, Integer, Double>> format = new TupleCsvInputFormat<Tuple3<String, Integer, Double>>(PATH, "\n", "|", typeInfo);
        // Lenient mode: lines that do not parse are expected to be skipped, not to fail the read.
        format.setLenient(true);
        format.setBufferSize(bufferSize);
        final Configuration parameters = new Configuration();
        format.configure(parameters);
        format.open(split);

        Tuple3<String, Integer, Double> result = new Tuple3<String, Integer, Double>();

        // First record; 65 = 25 + 25 + 15 characters (two skipped lines plus this record's line).
        result = format.nextRecord(result);
        assertNotNull(result);
        assertEquals("this is", result.f0);
        assertEquals(Integer.valueOf(1), result.f1);
        assertEquals(Double.valueOf(2.0), result.f2);
        assertEquals(65L, (long) format.getCurrentState());

        // Second record; 91 = 65 + 12 ("//a comment\n") + 14 (this record's line).
        result = format.nextRecord(result);
        assertNotNull(result);
        assertEquals("a test", result.f0);
        assertEquals(Integer.valueOf(3), result.f1);
        assertEquals(Double.valueOf(4.0), result.f2);
        assertEquals(91L, (long) format.getCurrentState());

        // Third record; no comment prefix is configured here, so "#next" is read as a value.
        // 104 = 91 + 13 (this record's line).
        result = format.nextRecord(result);
        assertNotNull(result);
        assertEquals("#next", result.f0);
        assertEquals(Integer.valueOf(5), result.f1);
        assertEquals(Double.valueOf(6.0), result.f2);
        assertEquals(104L, (long) format.getCurrentState());

        // The trailing "asdasdas" yields no record; the state then equals the full content length.
        result = format.nextRecord(result);
        assertNull(result);
        assertEquals(fileContent.length(), (long) format.getCurrentState());
    } catch (Exception ex) {
        ex.printStackTrace();
        fail("Test failed due to a " + ex.getClass().getName() + ": " + ex.getMessage());
    }
}
Use of org.apache.flink.core.fs.FileInputSplit in the Apache Flink project.
Class CsvInputFormatTest, method ignoreMultiCharPrefixComments.
/**
 * Verifies that lines starting with the configured two-character comment prefix {@code "//"}
 * are skipped: of the five input lines, only the two that do not start with the prefix are
 * returned as (String, Integer, Double) records, after which {@code nextRecord} returns null.
 */
@Test
public void ignoreMultiCharPrefixComments() {
    try {
        final String fileContent = "//description of the data\n"
                + "//successive commented line\n"
                + "this is|1|2.0|\n"
                + "a test|3|4.0|\n"
                + "//next|5|6.0|\n";
        final FileInputSplit split = createTempFile(fileContent);
        final TupleTypeInfo<Tuple3<String, Integer, Double>> typeInfo = TupleTypeInfo.getBasicTupleTypeInfo(String.class, Integer.class, Double.class);
        final CsvInputFormat<Tuple3<String, Integer, Double>> format = new TupleCsvInputFormat<Tuple3<String, Integer, Double>>(PATH, "\n", "|", typeInfo);
        format.setCommentPrefix("//");
        final Configuration parameters = new Configuration();
        format.configure(parameters);
        format.open(split);

        Tuple3<String, Integer, Double> result = new Tuple3<String, Integer, Double>();

        // First non-comment line.
        result = format.nextRecord(result);
        assertNotNull(result);
        assertEquals("this is", result.f0);
        assertEquals(Integer.valueOf(1), result.f1);
        assertEquals(Double.valueOf(2.0), result.f2);

        // Second non-comment line.
        result = format.nextRecord(result);
        assertNotNull(result);
        assertEquals("a test", result.f0);
        assertEquals(Integer.valueOf(3), result.f1);
        assertEquals(Double.valueOf(4.0), result.f2);

        // "//next|5|6.0|" carries the comment prefix, so no further record is expected.
        result = format.nextRecord(result);
        assertNull(result);
    } catch (Exception ex) {
        ex.printStackTrace();
        fail("Test failed due to a " + ex.getClass().getName() + ": " + ex.getMessage());
    }
}
Use of org.apache.flink.core.fs.FileInputSplit in the Apache Flink project.
Class CsvInputFormatTest, method readStringFieldsWithTrailingDelimiters.
/**
 * Reads three String fields per line using the two-character field delimiter {@code "|-"}
 * and checks that empty fields come back as empty strings, including a line that consists
 * of delimiters only.
 */
@Test
public void readStringFieldsWithTrailingDelimiters() {
    try {
        final String fileContent = "abc|-def|-ghijk\nabc|-|-hhg\n|-|-|-\n";
        final FileInputSplit split = createTempFile(fileContent);
        final TupleTypeInfo<Tuple3<String, String, String>> typeInfo = TupleTypeInfo.getBasicTupleTypeInfo(String.class, String.class, String.class);
        final CsvInputFormat<Tuple3<String, String, String>> format = new TupleCsvInputFormat<Tuple3<String, String, String>>(PATH, typeInfo);
        format.setFieldDelimiter("|-");
        format.configure(new Configuration());
        format.open(split);

        Tuple3<String, String, String> result = new Tuple3<String, String, String>();

        // All three fields populated.
        result = format.nextRecord(result);
        assertNotNull(result);
        assertEquals("abc", result.f0);
        assertEquals("def", result.f1);
        assertEquals("ghijk", result.f2);

        // Middle field empty.
        result = format.nextRecord(result);
        assertNotNull(result);
        assertEquals("abc", result.f0);
        assertEquals("", result.f1);
        assertEquals("hhg", result.f2);

        // Line made up of delimiters only: every field is the empty string.
        result = format.nextRecord(result);
        assertNotNull(result);
        assertEquals("", result.f0);
        assertEquals("", result.f1);
        assertEquals("", result.f2);

        // Input exhausted.
        result = format.nextRecord(result);
        assertNull(result);
        assertTrue(format.reachedEnd());
    } catch (Exception ex) {
        // Print the stack trace before failing, as the sibling tests do, so the cause is not lost.
        ex.printStackTrace();
        fail("Test failed due to a " + ex.getClass().getName() + ": " + ex.getMessage());
    }
}
Use of org.apache.flink.core.fs.FileInputSplit in the Apache Flink project.
Class CsvInputFormatTest, method testPojoTypeWithPartialFieldInCSV.
/**
 * Writes two six-column CSV rows to a temporary file and reads them into {@code PojoItem}
 * using an include mask that selects columns 0, 2, 4 and 5 (the "NODATA" columns are
 * excluded). The records are checked by {@code validatePojoItem}.
 *
 * @throws Exception if creating, writing or reading the temporary file fails
 */
@Test
public void testPojoTypeWithPartialFieldInCSV() throws Exception {
    File tempFile = File.createTempFile("CsvReaderPojoType", "tmp");
    tempFile.deleteOnExit();
    tempFile.setWritable(true);

    // try-with-resources closes the writer even if a write fails; the charset is stated
    // explicitly so the bytes on disk do not depend on the platform default.
    try (OutputStreamWriter wrt = new OutputStreamWriter(new FileOutputStream(tempFile), java.nio.charset.StandardCharsets.UTF_8)) {
        wrt.write("123,NODATA,AAA,NODATA,3.123,BBB\n");
        wrt.write("456,NODATA,BBB,NODATA,1.123,AAA\n");
    }

    @SuppressWarnings("unchecked") PojoTypeInfo<PojoItem> typeInfo = (PojoTypeInfo<PojoItem>) TypeExtractor.createTypeInfo(PojoItem.class);
    // Include mask: one flag per CSV column, true = column is read into the POJO.
    CsvInputFormat<PojoItem> inputFormat = new PojoCsvInputFormat<PojoItem>(new Path(tempFile.toURI().toString()), typeInfo, new boolean[] { true, false, true, false, true, true });
    inputFormat.configure(new Configuration());
    FileInputSplit[] splits = inputFormat.createInputSplits(1);
    inputFormat.open(splits[0]);
    validatePojoItem(inputFormat);
}
Use of org.apache.flink.core.fs.FileInputSplit in the Apache Flink project.
Class CsvInputFormatTest, method testQuotedStringParsingWithIncludeFields.
/**
 * Writes a single '|'-delimited line of double-quoted fields to a temporary file and reads
 * it with quoted-string parsing enabled and an include mask of {true, false, true}. The
 * resulting tuple is expected to hold the first and third field without their quotes.
 *
 * @throws Exception if creating, writing or reading the temporary file fails
 */
@Test
public void testQuotedStringParsingWithIncludeFields() throws Exception {
    final String fileContent = "\"20:41:52-1-3-2015\"|\"Re: Taskmanager memory error in Eclipse\"|" + "\"Blahblah <blah@blahblah.org>\"|\"blaaa|\"blubb\"";
    final File tempFile = File.createTempFile("CsvReaderQuotedString", "tmp");
    tempFile.deleteOnExit();
    tempFile.setWritable(true);

    // try-with-resources closes the writer even if the write fails; the charset is stated
    // explicitly so the bytes on disk do not depend on the platform default.
    try (OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(tempFile), java.nio.charset.StandardCharsets.UTF_8)) {
        writer.write(fileContent);
    }

    TupleTypeInfo<Tuple2<String, String>> typeInfo = TupleTypeInfo.getBasicTupleTypeInfo(String.class, String.class);
    // Include mask: read the first and third column, skip the second.
    CsvInputFormat<Tuple2<String, String>> inputFormat = new TupleCsvInputFormat<Tuple2<String, String>>(new Path(tempFile.toURI().toString()), typeInfo, new boolean[] { true, false, true });
    inputFormat.enableQuotedStringParsing('"');
    inputFormat.setFieldDelimiter("|");
    inputFormat.setDelimiter('\n');
    inputFormat.configure(new Configuration());

    FileInputSplit[] splits = inputFormat.createInputSplits(1);
    inputFormat.open(splits[0]);

    Tuple2<String, String> record = inputFormat.nextRecord(new Tuple2<String, String>());
    assertEquals("20:41:52-1-3-2015", record.f0);
    assertEquals("Blahblah <blah@blahblah.org>", record.f1);
}
Aggregations