use of org.apache.flink.api.common.io.ParseException in project flink by apache.
the class RowCsvInputFormat method parseRecord.
/**
 * Parses one delimited record from {@code bytes} into {@code holders}.
 *
 * <p>Fields are visited in order. Included fields are parsed with the field parser registered
 * at the mapped output position; excluded fields are skipped. A holder is set to {@code null}
 * when its parser reports a negative position, or when {@code emptyColumnAsNull} is enabled
 * and the parser reports {@code EMPTY_COLUMN}.
 *
 * <p>All error messages decode the offending row with {@link #getCharset()} so that the
 * reported text does not depend on the platform default charset.
 *
 * @param holders  target slots for the parsed values, indexed through {@code fieldPosMap}
 * @param bytes    buffer containing the record
 * @param offset   index of the first byte of the record in {@code bytes}
 * @param numBytes length of the record in bytes
 * @return {@code true} if every field was processed; {@code false} if the row is too short
 *         and the format is lenient
 * @throws ParseException if the row is too short or a parser reports an error (other than
 *         {@code EMPTY_COLUMN}) while the format is not lenient, or if the position is still
 *         negative after a field has been processed (regardless of leniency)
 */
@Override
protected boolean parseRecord(Object[] holders, byte[] bytes, int offset, int numBytes) throws ParseException {
    byte[] fieldDelimiter = this.getFieldDelimiter();
    boolean[] fieldIncluded = this.fieldIncluded;
    int startPos = offset;
    int limit = offset + numBytes;
    int field = 0;
    int output = 0;
    while (field < fieldIncluded.length) {
        // check valid start position: only the last field may start exactly at the limit
        if (startPos > limit || (startPos == limit && field != fieldIncluded.length - 1)) {
            if (isLenient()) {
                return false;
            } else {
                throw new ParseException("Row too short: " + new String(bytes, offset, numBytes, getCharset()));
            }
        }
        if (fieldIncluded[field]) {
            // parse field
            FieldParser<Object> parser = (FieldParser<Object>) this.getFieldParsers()[fieldPosMap[output]];
            // remember where this field started so it can be skipped if parsing fails
            int latestValidPos = startPos;
            startPos = parser.resetErrorStateAndParse(bytes, startPos, limit, fieldDelimiter, holders[fieldPosMap[output]]);
            if (!isLenient() && (parser.getErrorState() != FieldParser.ParseErrorState.NONE)) {
                // the error state EMPTY_COLUMN is ignored
                if (parser.getErrorState() != FieldParser.ParseErrorState.EMPTY_COLUMN) {
                    throw new ParseException(String.format("Parsing error for column %1$s of row '%2$s' originated by %3$s: %4$s.", field + 1, new String(bytes, offset, numBytes, getCharset()), parser.getClass().getSimpleName(), parser.getErrorState()));
                }
            }
            holders[fieldPosMap[output]] = parser.getLastResult();
            // check parse result: the value is null if the parser position is invalid
            // or empty with emptyColumnAsNull enabled
            if (startPos < 0 || (emptyColumnAsNull && (parser.getErrorState().equals(FieldParser.ParseErrorState.EMPTY_COLUMN)))) {
                holders[fieldPosMap[output]] = null;
                startPos = skipFields(bytes, latestValidPos, limit, fieldDelimiter);
            }
            output++;
        } else {
            // skip field
            startPos = skipFields(bytes, startPos, limit, fieldDelimiter);
        }
        // check if something went wrong
        if (startPos < 0) {
            throw new ParseException(String.format("Unexpected parser position for column %1$s of row '%2$s'", field + 1, new String(bytes, offset, numBytes, getCharset())));
        } else if (startPos == limit && field != fieldIncluded.length - 1 && !FieldParser.endsWithDelimiter(bytes, startPos - 1, fieldDelimiter)) {
            // We are at the end of the record, but not all fields have been read
            // and the end is not a field delimiter indicating an empty last field.
            if (isLenient()) {
                return false;
            } else {
                throw new ParseException("Row too short: " + new String(bytes, offset, numBytes, getCharset()));
            }
        }
        field++;
    }
    return true;
}
use of org.apache.flink.api.common.io.ParseException in project flink by apache.
the class RowCsvInputFormatTest method testTailingEmptyFields.
/**
 * Reads rows separated by the two-character field delimiter {@code "|-"} and checks that
 * trailing empty fields come back as empty strings, while the final row with only two
 * fields raises a {@link ParseException}.
 */
@Test
public void testTailingEmptyFields() throws Exception {
    final String content =
        "abc|-def|-ghijk\n"
            + "abc|-def|-\n"
            + "abc|-|-\n"
            + "|-|-|-\n"
            + "|-|-\n"
            + "abc|-def\n";
    final FileInputSplit inputSplit = createTempFile(content);

    final TypeInformation[] columnTypes = new TypeInformation[] {
        BasicTypeInfo.STRING_TYPE_INFO,
        BasicTypeInfo.STRING_TYPE_INFO,
        BasicTypeInfo.STRING_TYPE_INFO
    };
    final RowCsvInputFormat format = new RowCsvInputFormat(PATH, columnTypes, "\n", "|");
    format.setFieldDelimiter("|-");
    format.configure(new Configuration());
    format.open(inputSplit);

    // One entry per row that must parse successfully, in file order.
    final String[][] expectedRows = {
        {"abc", "def", "ghijk"},
        {"abc", "def", ""},
        {"abc", "", ""},
        {"", "", ""},
        {"", "", ""}
    };

    Row row = new Row(3);
    for (String[] expected : expectedRows) {
        row = format.nextRecord(row);
        assertNotNull(row);
        for (int column = 0; column < expected.length; column++) {
            assertEquals(expected[column], row.getField(column));
        }
    }

    // The last line has only two fields and must be rejected.
    try {
        format.nextRecord(row);
        fail("Parse Exception was not thrown! (Row too short)");
    } catch (ParseException ignored) {
        // expected: the row is too short
    }
}
use of org.apache.flink.api.common.io.ParseException in project flink by apache.
the class RowCsvInputFormatTest method ignoreInvalidLines.
/**
 * Reads the same split twice: first with lenient parsing disabled, where malformed lines
 * must raise a {@link ParseException}, then with lenient parsing enabled, where those lines
 * are either dropped or returned with {@code null} fields.
 */
@Test
public void ignoreInvalidLines() throws Exception {
    final String content =
        "#description of the data\n"
            + "header1|header2|header3|\n"
            + "this is|1|2.0|\n"
            + "//a comment\n"
            + "a test|3|4.0|\n"
            + "#next|5|6.0|\n";
    final FileInputSplit inputSplit = createTempFile(content);

    final TypeInformation[] columnTypes = new TypeInformation[] {
        BasicTypeInfo.STRING_TYPE_INFO,
        BasicTypeInfo.INT_TYPE_INFO,
        BasicTypeInfo.DOUBLE_TYPE_INFO
    };
    final RowCsvInputFormat format = new RowCsvInputFormat(PATH, columnTypes, "\n", "|");

    // ---- pass 1: lenient = false ----
    format.setLenient(false);
    format.configure(new Configuration());
    format.open(inputSplit);

    Row row = new Row(3);

    // The first two lines must both be rejected.
    final String[] leadingRejections = {
        "Parse Exception was not thrown! (Row too short)",
        "Parse Exception was not thrown! (Invalid int value)"
    };
    for (String notThrownMessage : leadingRejections) {
        try {
            row = format.nextRecord(row);
            fail(notThrownMessage);
        } catch (ParseException ignored) {
            // expected: invalid line in strict mode
        }
    }

    // => ok
    row = format.nextRecord(row);
    assertNotNull(row);
    assertEquals("this is", row.getField(0));
    assertEquals(1, row.getField(1));
    assertEquals(2.0, row.getField(2));

    // The comment line in the middle is rejected as well.
    try {
        row = format.nextRecord(row);
        fail("Parse Exception was not thrown! (Row too short)");
    } catch (ParseException ignored) {
        // expected: invalid line in strict mode
    }

    // => ok
    final Object[][] strictTail = {
        {"a test", 3, 4.0},
        {"#next", 5, 6.0}
    };
    for (Object[] expected : strictTail) {
        row = format.nextRecord(row);
        assertNotNull(row);
        for (int column = 0; column < expected.length; column++) {
            assertEquals(expected[column], row.getField(column));
        }
    }

    row = format.nextRecord(row);
    assertNull(row);

    // ---- pass 2: re-open with lenient = true ----
    final Configuration parameters = new Configuration();
    format.setLenient(true);
    format.configure(parameters);
    format.open(inputSplit);

    // A null cell means the field is expected to be null.
    final Object[][] lenientRows = {
        {"header1", null, null},
        {"this is", 1, 2.0},
        {"a test", 3, 4.0},
        {"#next", 5, 6.0}
    };
    row = new Row(3);
    for (Object[] expected : lenientRows) {
        row = format.nextRecord(row);
        assertNotNull(row);
        for (int column = 0; column < expected.length; column++) {
            if (expected[column] == null) {
                assertNull(row.getField(column));
            } else {
                assertEquals(expected[column], row.getField(column));
            }
        }
    }

    row = format.nextRecord(row);
    assertNull(row);
}
use of org.apache.flink.api.common.io.ParseException in project flink by apache.
the class CsvInputFormatTest method testTailingEmptyFields.
/**
 * Checks that trailing empty fields of a comma-separated row are read as empty strings
 * and that a final row with only two of three fields raises a {@link ParseException}.
 */
@Test
public void testTailingEmptyFields() throws Exception {
    final String content =
        "aa,bb,cc\n"    // ok
            + "aa,bb,\n"    // the last field is empty
            + "aa,,\n"      // the last two fields are empty
            + ",,\n"        // all fields are empty
            + "aa,bb";      // row too short
    final FileInputSplit inputSplit = createTempFile(content);

    final TupleTypeInfo<Tuple3<String, String, String>> tupleType =
        TupleTypeInfo.getBasicTupleTypeInfo(String.class, String.class, String.class);
    final CsvInputFormat<Tuple3<String, String, String>> format =
        new TupleCsvInputFormat<Tuple3<String, String, String>>(PATH, tupleType);
    format.setFieldDelimiter(",");
    format.configure(new Configuration());
    format.open(inputSplit);

    // One entry per row that must parse successfully, in file order.
    final String[][] expectedRows = {
        {"aa", "bb", "cc"},
        {"aa", "bb", ""},
        {"aa", "", ""},
        {"", "", ""}
    };

    Tuple3<String, String, String> tuple = new Tuple3<String, String, String>();
    for (String[] expected : expectedRows) {
        tuple = format.nextRecord(tuple);
        assertNotNull(tuple);
        assertEquals(expected[0], tuple.f0);
        assertEquals(expected[1], tuple.f1);
        assertEquals(expected[2], tuple.f2);
    }

    try {
        format.nextRecord(tuple);
        fail("Parse Exception was not thrown! (Row too short)");
    } catch (ParseException ignored) {
        // expected: the row is too short
    }
}
use of org.apache.flink.api.common.io.ParseException in project flink by apache.
the class CsvInputFormatTest method testEmptyFields.
/**
 * Feeds six rows, each with one empty (or blank) column in a different position, into a
 * tuple format with numeric field types and expects a {@link ParseException} for every row.
 * Afterwards the format must return {@code null} and report that the end was reached.
 */
@Test
public void testEmptyFields() throws IOException {
    try {
        final String content =
            "|0|0|0|0|0|\n"
                + "1||1|1|1|1|\n"
                + "2|2||2|2|2|\n"
                + "3|3|3| |3|3|\n"
                + "4|4|4|4||4|\n"
                + "5|5|5|5|5||\n";
        final FileInputSplit inputSplit = createTempFile(content);

        final TupleTypeInfo<Tuple6<Short, Integer, Long, Float, Double, Byte>> tupleType =
            TupleTypeInfo.getBasicTupleTypeInfo(
                Short.class, Integer.class, Long.class, Float.class, Double.class, Byte.class);
        final CsvInputFormat<Tuple6<Short, Integer, Long, Float, Double, Byte>> format =
            new TupleCsvInputFormat<Tuple6<Short, Integer, Long, Float, Double, Byte>>(PATH, tupleType);
        format.setFieldDelimiter("|");
        format.configure(new Configuration());
        format.open(inputSplit);

        // Failure message for each row, used if the expected exception is not raised.
        final String[] notThrownMessages = {
            "Empty String Parse Exception was not thrown! (ShortParser)",
            "Empty String Parse Exception was not thrown! (IntegerParser)",
            "Empty String Parse Exception was not thrown! (LongParser)",
            "Empty String Parse Exception was not thrown! (FloatParser)",
            "Empty String Parse Exception was not thrown! (DoubleParser)",
            "Empty String Parse Exception was not thrown! (ByteParser)"
        };

        Tuple6<Short, Integer, Long, Float, Double, Byte> tuple =
            new Tuple6<Short, Integer, Long, Float, Double, Byte>();
        for (String notThrownMessage : notThrownMessages) {
            try {
                tuple = format.nextRecord(tuple);
                fail(notThrownMessage);
            } catch (ParseException ignored) {
                // expected: the row contains an empty numeric column
            }
        }

        tuple = format.nextRecord(tuple);
        assertNull(tuple);
        assertTrue(format.reachedEnd());
    } catch (Exception ex) {
        fail("Test failed due to a " + ex.getClass().getName() + ": " + ex.getMessage());
    }
}
Aggregations