use of org.apache.flink.api.common.typeinfo.TypeInformation in project flink by apache.
the class RowCsvInputFormatTest method ignoreSingleCharPrefixComments.
/**
 * Verifies that lines starting with the single-character comment prefix "#" are not returned
 * as records: only the two data lines of the input are read before the format yields null.
 */
@Test
public void ignoreSingleCharPrefixComments() throws Exception {
    // Two leading comment lines, two data lines, and one trailing comment line.
    String fileContent =
            "#description of the data\n"
                    + "#successive commented line\n"
                    + "this is|1|2.0|\n"
                    + "a test|3|4.0|\n"
                    + "#next|5|6.0|\n";
    FileInputSplit split = createTempFile(fileContent);

    TypeInformation[] fieldTypes = {
        BasicTypeInfo.STRING_TYPE_INFO,
        BasicTypeInfo.INT_TYPE_INFO,
        BasicTypeInfo.DOUBLE_TYPE_INFO
    };
    RowCsvInputFormat format = new RowCsvInputFormat(PATH, fieldTypes, "\n", "|");
    format.setCommentPrefix("#");
    format.configure(new Configuration());
    format.open(split);

    // Field values expected for each non-comment line, in file order.
    Object[][] expectedRows = {
        {"this is", 1, 2.0},
        {"a test", 3, 4.0}
    };

    Row row = new Row(3);
    for (Object[] expectedFields : expectedRows) {
        row = format.nextRecord(row);
        assertNotNull(row);
        for (int pos = 0; pos < expectedFields.length; pos++) {
            assertEquals(expectedFields[pos], row.getField(pos));
        }
    }

    // The trailing comment line must not produce a third record.
    assertNull(format.nextRecord(row));
}
use of org.apache.flink.api.common.typeinfo.TypeInformation in project flink by apache.
the class RowCsvInputFormatTest method testReadSparseWithMask.
/**
 * Reads two lines of ten "&&"-delimited integers while selecting only the columns at
 * positions 0, 3 and 7, and checks the projected values of both records.
 */
@Test
public void testReadSparseWithMask() throws Exception {
    String fileContent =
            "111&&222&&333&&444&&555&&666&&777&&888&&999&&000&&\n"
                    + "000&&999&&888&&777&&666&&555&&444&&333&&222&&111&&";
    FileInputSplit split = RowCsvInputFormatTest.createTempFile(fileContent);

    TypeInformation[] fieldTypes = {
        BasicTypeInfo.INT_TYPE_INFO,
        BasicTypeInfo.INT_TYPE_INFO,
        BasicTypeInfo.INT_TYPE_INFO
    };
    int[] selectedColumns = {0, 3, 7};
    RowCsvInputFormat format = new RowCsvInputFormat(PATH, fieldTypes, selectedColumns);
    format.setFieldDelimiter("&&");
    format.configure(new Configuration());
    format.open(split);

    // Values expected at columns 0, 3 and 7 of each input line.
    Object[][] expectedRows = {
        {111, 444, 888},
        {0, 777, 333}
    };

    Row row = new Row(3);
    for (Object[] expectedFields : expectedRows) {
        row = format.nextRecord(row);
        assertNotNull(row);
        for (int pos = 0; pos < expectedFields.length; pos++) {
            assertEquals(expectedFields[pos], row.getField(pos));
        }
    }

    // After both lines are consumed the format returns null and reports the end.
    assertNull(format.nextRecord(row));
    assertTrue(format.reachedEnd());
}
use of org.apache.flink.api.common.typeinfo.TypeInformation in project flink by apache.
the class RowCsvInputFormatTest method testParserCorrectness.
// Disabled: quotes escaped by doubling them ("") are not supported right now.
@Test
@Ignore
public void testParserCorrectness() throws Exception {
    // RFC 4180 compliance test content,
    // taken from http://en.wikipedia.org/wiki/Comma-separated_values#Example
    String fileContent =
            "Year,Make,Model,Description,Price\n"
                    + "1997,Ford,E350,\"ac, abs, moon\",3000.00\n"
                    + "1999,Chevy,\"Venture \"\"Extended Edition\"\"\",\"\",4900.00\n"
                    + "1996,Jeep,Grand Cherokee,\"MUST SELL! air, moon roof, loaded\",4799.00\n"
                    + "1999,Chevy,\"Venture \"\"Extended Edition, Very Large\"\"\",,5000.00\n"
                    + ",,\"Venture \"\"Extended Edition\"\"\",\"\",4900.00";
    FileInputSplit split = createTempFile(fileContent);

    TypeInformation[] fieldTypes = {
        BasicTypeInfo.INT_TYPE_INFO,
        BasicTypeInfo.STRING_TYPE_INFO,
        BasicTypeInfo.STRING_TYPE_INFO,
        BasicTypeInfo.STRING_TYPE_INFO,
        BasicTypeInfo.DOUBLE_TYPE_INFO
    };
    RowCsvInputFormat format = new RowCsvInputFormat(PATH, fieldTypes);
    format.setSkipFirstLineAsHeader(true);
    format.setFieldDelimiter(",");
    format.configure(new Configuration());
    format.open(split);

    // Expected field values for the five data lines that follow the skipped header line.
    Object[][] expectedValues = {
        {1997, "Ford", "E350", "ac, abs, moon", 3000.0},
        {1999, "Chevy", "Venture \"Extended Edition\"", "", 4900.0},
        {1996, "Jeep", "Grand Cherokee", "MUST SELL! air, moon roof, loaded", 4799.0},
        {1999, "Chevy", "Venture \"Extended Edition, Very Large\"", "", 5000.0},
        {0, "", "Venture \"Extended Edition\"", "", 4900.0}
    };

    // Materialize the table as Row instances so whole records can be compared.
    Row[] expectedLines = new Row[expectedValues.length];
    for (int line = 0; line < expectedValues.length; line++) {
        Row expectedRow = new Row(5);
        for (int pos = 0; pos < expectedValues[line].length; pos++) {
            expectedRow.setField(pos, expectedValues[line][pos]);
        }
        expectedLines[line] = expectedRow;
    }

    Row current = new Row(5);
    for (Row expected : expectedLines) {
        current = format.nextRecord(current);
        assertEquals(expected, current);
    }

    assertNull(format.nextRecord(current));
    assertTrue(format.reachedEnd());
}
use of org.apache.flink.api.common.typeinfo.TypeInformation in project flink by apache.
the class RowCsvInputFormatTest method testQuotedStringParsingWithIncludeFields.
/**
 * Reads one line of quoted, "|"-delimited strings while selecting only the fields at positions
 * 0 and 2, and checks that both values are returned without their surrounding quotes.
 */
@Test
public void testQuotedStringParsingWithIncludeFields() throws Exception {
    String fileContent = "\"20:41:52-1-3-2015\"|\"Re: Taskmanager memory error in Eclipse\"|" + "\"Blahblah <blah@blahblah.org>\"|\"blaaa|\"blubb\"";

    File tempFile = File.createTempFile("CsvReaderQuotedString", "tmp");
    tempFile.deleteOnExit();
    tempFile.setWritable(true);

    // try-with-resources closes the writer (and the underlying stream) even if the write fails.
    try (OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(tempFile))) {
        writer.write(fileContent);
    }

    TypeInformation[] fieldTypes = new TypeInformation[] { BasicTypeInfo.STRING_TYPE_INFO, BasicTypeInfo.STRING_TYPE_INFO };
    RowCsvInputFormat inputFormat = new RowCsvInputFormat(new Path(tempFile.toURI().toString()), fieldTypes, new int[] { 0, 2 });
    inputFormat.enableQuotedStringParsing('"');
    inputFormat.setFieldDelimiter("|");
    inputFormat.setDelimiter('\n');
    inputFormat.configure(new Configuration());

    FileInputSplit[] splits = inputFormat.createInputSplits(1);
    inputFormat.open(splits[0]);

    Row record = inputFormat.nextRecord(new Row(2));
    // Fail with a clear assertion instead of a NullPointerException if nothing was read.
    assertNotNull(record);
    assertEquals("20:41:52-1-3-2015", record.getField(0));
    assertEquals("Blahblah <blah@blahblah.org>", record.getField(1));
}
use of org.apache.flink.api.common.typeinfo.TypeInformation in project flink by apache.
the class RowCsvInputFormatTest method testQuotedStringParsingWithEscapedQuotes.
/**
 * Reads one line with two quoted, "|"-delimited strings that contain backslash-escaped quote
 * characters, and checks that the escaped quotes do not end the field and that the returned
 * values still contain the backslashes.
 */
@Test
public void testQuotedStringParsingWithEscapedQuotes() throws Exception {
    String fileContent = "\"\\\"Hello\\\" World\"|\"We are\\\" young\"";

    File tempFile = File.createTempFile("CsvReaderQuotedString", "tmp");
    tempFile.deleteOnExit();
    tempFile.setWritable(true);

    // try-with-resources closes the writer (and the underlying stream) even if the write fails.
    try (OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(tempFile))) {
        writer.write(fileContent);
    }

    TypeInformation[] fieldTypes = new TypeInformation[] { BasicTypeInfo.STRING_TYPE_INFO, BasicTypeInfo.STRING_TYPE_INFO };
    RowCsvInputFormat inputFormat = new RowCsvInputFormat(new Path(tempFile.toURI().toString()), fieldTypes);
    inputFormat.enableQuotedStringParsing('"');
    inputFormat.setFieldDelimiter("|");
    inputFormat.setDelimiter('\n');
    inputFormat.configure(new Configuration());

    FileInputSplit[] splits = inputFormat.createInputSplits(1);
    inputFormat.open(splits[0]);

    Row record = inputFormat.nextRecord(new Row(2));
    // Fail with a clear assertion instead of a NullPointerException if nothing was read.
    assertNotNull(record);
    assertEquals("\\\"Hello\\\" World", record.getField(0));
    assertEquals("We are\\\" young", record.getField(1));
}
Aggregations