Example 1 with ParseException

Use of org.apache.flink.api.common.io.ParseException in the Apache Flink project.

The class RowCsvInputFormat, method parseRecord.

@Override
protected boolean parseRecord(Object[] holders, byte[] bytes, int offset, int numBytes) throws ParseException {
    byte[] fieldDelimiter = this.getFieldDelimiter();
    boolean[] fieldIncluded = this.fieldIncluded;
    int startPos = offset;
    int limit = offset + numBytes;
    int field = 0;
    int output = 0;
    while (field < fieldIncluded.length) {
        // check valid start position
        if (startPos > limit || (startPos == limit && field != fieldIncluded.length - 1)) {
            if (isLenient()) {
                return false;
            } else {
                throw new ParseException("Row too short: " + new String(bytes, offset, numBytes, getCharset()));
            }
        }
        if (fieldIncluded[field]) {
            // parse field
            FieldParser<Object> parser = (FieldParser<Object>) this.getFieldParsers()[fieldPosMap[output]];
            int latestValidPos = startPos;
            startPos = parser.resetErrorStateAndParse(bytes, startPos, limit, fieldDelimiter, holders[fieldPosMap[output]]);
            if (!isLenient() && (parser.getErrorState() != FieldParser.ParseErrorState.NONE)) {
                // the error state EMPTY_COLUMN is ignored
                if (parser.getErrorState() != FieldParser.ParseErrorState.EMPTY_COLUMN) {
                    throw new ParseException(String.format("Parsing error for column %1$s of row '%2$s' originated by %3$s: %4$s.", field + 1, new String(bytes, offset, numBytes), parser.getClass().getSimpleName(), parser.getErrorState()));
                }
            }
            holders[fieldPosMap[output]] = parser.getLastResult();
            // the parse result is null if the parser failed or, with emptyColumnAsNull enabled, if the column was empty
            if (startPos < 0 || (emptyColumnAsNull && (parser.getErrorState().equals(FieldParser.ParseErrorState.EMPTY_COLUMN)))) {
                holders[fieldPosMap[output]] = null;
                startPos = skipFields(bytes, latestValidPos, limit, fieldDelimiter);
            }
            output++;
        } else {
            // skip field
            startPos = skipFields(bytes, startPos, limit, fieldDelimiter);
        }
        // check if something went wrong
        if (startPos < 0) {
            throw new ParseException(String.format("Unexpected parser position for column %1$s of row '%2$s'", field + 1, new String(bytes, offset, numBytes)));
        } else if (startPos == limit && field != fieldIncluded.length - 1 && !FieldParser.endsWithDelimiter(bytes, startPos - 1, fieldDelimiter)) {
            // the row ended early: not all fields have been read and the end is not a field delimiter indicating an empty last field
            if (isLenient()) {
                return false;
            } else {
                throw new ParseException("Row too short: " + new String(bytes, offset, numBytes));
            }
        }
        field++;
    }
    return true;
}
Also used : ParseException(org.apache.flink.api.common.io.ParseException) FieldParser(org.apache.flink.types.parser.FieldParser)
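
In strict mode (the default: lenient is false), every malformed row surfaces as a ParseException thrown out of nextRecord; with setLenient(true), parseRecord returns false instead and the row is silently dropped. The snippet below is a minimal caller-side sketch, not taken from the Flink sources: it assumes a RowCsvInputFormat with three fields that has already been configured and opened on a split, and it simply skips rows that fail to parse.

// minimal sketch (not from the Flink sources): read a split with a strict
// format and skip rows that cannot be parsed
static void readSkippingBadRows(RowCsvInputFormat format) throws IOException {
    Row reuse = new Row(3);
    while (!format.reachedEnd()) {
        try {
            Row row = format.nextRecord(reuse);
            if (row == null) {
                // the split is exhausted
                break;
            }
            // process the parsed row here
        } catch (ParseException e) {
            // with lenient parsing disabled, a malformed row ends up here; the
            // offending line has already been consumed, so just keep reading
        }
    }
    format.close();
}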

Example 2 with ParseException

Use of org.apache.flink.api.common.io.ParseException in the Apache Flink project.

The class RowCsvInputFormatTest, method testTailingEmptyFields.

@Test
public void testTailingEmptyFields() throws Exception {
    String fileContent = "abc|-def|-ghijk\n" + "abc|-def|-\n" + "abc|-|-\n" + "|-|-|-\n" + "|-|-\n" + "abc|-def\n";
    FileInputSplit split = createTempFile(fileContent);
    TypeInformation[] fieldTypes = new TypeInformation[] { BasicTypeInfo.STRING_TYPE_INFO, BasicTypeInfo.STRING_TYPE_INFO, BasicTypeInfo.STRING_TYPE_INFO };
    RowCsvInputFormat format = new RowCsvInputFormat(PATH, fieldTypes, "\n", "|");
    format.setFieldDelimiter("|-");
    format.configure(new Configuration());
    format.open(split);
    Row result = new Row(3);
    result = format.nextRecord(result);
    assertNotNull(result);
    assertEquals("abc", result.getField(0));
    assertEquals("def", result.getField(1));
    assertEquals("ghijk", result.getField(2));
    result = format.nextRecord(result);
    assertNotNull(result);
    assertEquals("abc", result.getField(0));
    assertEquals("def", result.getField(1));
    assertEquals("", result.getField(2));
    result = format.nextRecord(result);
    assertNotNull(result);
    assertEquals("abc", result.getField(0));
    assertEquals("", result.getField(1));
    assertEquals("", result.getField(2));
    result = format.nextRecord(result);
    assertNotNull(result);
    assertEquals("", result.getField(0));
    assertEquals("", result.getField(1));
    assertEquals("", result.getField(2));
    result = format.nextRecord(result);
    assertNotNull(result);
    assertEquals("", result.getField(0));
    assertEquals("", result.getField(1));
    assertEquals("", result.getField(2));
    try {
        format.nextRecord(result);
        fail("Parse Exception was not thrown! (Row too short)");
    } catch (ParseException e) {
        // expected: the last row ("abc|-def") is too short
    }
}
Also used : FileInputSplit(org.apache.flink.core.fs.FileInputSplit) Configuration(org.apache.flink.configuration.Configuration) Row(org.apache.flink.types.Row) ParseException(org.apache.flink.api.common.io.ParseException) TypeInformation(org.apache.flink.api.common.typeinfo.TypeInformation) Test(org.junit.Test)
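
For contrast, here is a hedged sketch, not an existing Flink test, of the lenient counterpart of this test; it reuses PATH, fieldTypes and split from above. Based on the parseRecord implementation in Example 1, setLenient(true) makes the too-short last row return false instead of throwing, so the record is dropped and the format simply runs out of input.

// hypothetical lenient variant: malformed rows are skipped instead of raising a ParseException
RowCsvInputFormat lenientFormat = new RowCsvInputFormat(PATH, fieldTypes, "\n", "|");
lenientFormat.setFieldDelimiter("|-");
lenientFormat.setLenient(true);
lenientFormat.configure(new Configuration());
lenientFormat.open(split);
Row row = new Row(3);
for (int i = 0; i < 5; i++) {
    // the five well-formed rows parse exactly as in the strict assertions above
    row = lenientFormat.nextRecord(row);
    assertNotNull(row);
}
// "abc|-def" is too short: parseRecord returns false in lenient mode,
// the row is dropped, and the end of the split is reached
assertNull(lenientFormat.nextRecord(row));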

Example 3 with ParseException

Use of org.apache.flink.api.common.io.ParseException in the Apache Flink project.

The class RowCsvInputFormatTest, method ignoreInvalidLines.

@Test
public void ignoreInvalidLines() throws Exception {
    String fileContent = "#description of the data\n" + "header1|header2|header3|\n" + "this is|1|2.0|\n" + "//a comment\n" + "a test|3|4.0|\n" + "#next|5|6.0|\n";
    FileInputSplit split = createTempFile(fileContent);
    TypeInformation[] fieldTypes = new TypeInformation[] { BasicTypeInfo.STRING_TYPE_INFO, BasicTypeInfo.INT_TYPE_INFO, BasicTypeInfo.DOUBLE_TYPE_INFO };
    RowCsvInputFormat format = new RowCsvInputFormat(PATH, fieldTypes, "\n", "|");
    format.setLenient(false);
    Configuration parameters = new Configuration();
    format.configure(new Configuration());
    format.open(split);
    Row result = new Row(3);
    try {
        result = format.nextRecord(result);
        fail("Parse Exception was not thrown! (Row too short)");
    } catch (ParseException ignored) {
    }
    try {
        result = format.nextRecord(result);
        fail("Parse Exception was not thrown! (Invalid int value)");
    } catch (ParseException ignored) {
    }
    // the line "this is|1|2.0|" is valid => ok
    result = format.nextRecord(result);
    assertNotNull(result);
    assertEquals("this is", result.getField(0));
    assertEquals(1, result.getField(1));
    assertEquals(2.0, result.getField(2));
    try {
        result = format.nextRecord(result);
        fail("Parse Exception was not thrown! (Row too short)");
    } catch (ParseException ignored) {
    }
    // the line "a test|3|4.0|" is valid => ok
    result = format.nextRecord(result);
    assertNotNull(result);
    assertEquals("a test", result.getField(0));
    assertEquals(3, result.getField(1));
    assertEquals(4.0, result.getField(2));
    result = format.nextRecord(result);
    assertNotNull(result);
    assertEquals("#next", result.getField(0));
    assertEquals(5, result.getField(1));
    assertEquals(6.0, result.getField(2));
    result = format.nextRecord(result);
    assertNull(result);
    // re-open with lenient = true
    format.setLenient(true);
    format.configure(parameters);
    format.open(split);
    result = new Row(3);
    result = format.nextRecord(result);
    assertNotNull(result);
    assertEquals("header1", result.getField(0));
    assertNull(result.getField(1));
    assertNull(result.getField(2));
    result = format.nextRecord(result);
    assertNotNull(result);
    assertEquals("this is", result.getField(0));
    assertEquals(1, result.getField(1));
    assertEquals(2.0, result.getField(2));
    result = format.nextRecord(result);
    assertNotNull(result);
    assertEquals("a test", result.getField(0));
    assertEquals(3, result.getField(1));
    assertEquals(4.0, result.getField(2));
    result = format.nextRecord(result);
    assertNotNull(result);
    assertEquals("#next", result.getField(0));
    assertEquals(5, result.getField(1));
    assertEquals(6.0, result.getField(2));
    result = format.nextRecord(result);
    assertNull(result);
}
Also used : FileInputSplit(org.apache.flink.core.fs.FileInputSplit) Configuration(org.apache.flink.configuration.Configuration) Row(org.apache.flink.types.Row) ParseException(org.apache.flink.api.common.io.ParseException) TypeInformation(org.apache.flink.api.common.typeinfo.TypeInformation) Test(org.junit.Test)
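
Lenient mode is used here as a coarse way to get past the description line, the header and the comment line. A narrower alternative is the comment-prefix support that RowCsvInputFormat inherits from GenericCsvInputFormat; the sketch below assumes that setCommentPrefix(String) setter and is only an illustration, not a replacement for this test: a "#" prefix would also drop the perfectly valid data row "#next|5|6.0|", and the header line and "//a comment" would still fail to parse in strict mode.

// sketch only: skip "#"-prefixed lines before they are parsed at all
// (assumes the setCommentPrefix(String) setter inherited from GenericCsvInputFormat)
RowCsvInputFormat commentAwareFormat = new RowCsvInputFormat(PATH, fieldTypes, "\n", "|");
commentAwareFormat.setCommentPrefix("#");
commentAwareFormat.configure(new Configuration());
commentAwareFormat.open(split);
// "#description of the data" is now dropped, but so is the valid row "#next|5|6.0|";
// "header1|header2|header3|" and "//a comment" would still raise a ParseException
// unless lenient mode is enabled as well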

Example 4 with ParseException

Use of org.apache.flink.api.common.io.ParseException in the Apache Flink project.

The class CsvInputFormatTest, method testTailingEmptyFields.

@Test
public void testTailingEmptyFields() throws Exception {
    final String fileContent = "aa,bb,cc\n" + // ok
    "aa,bb,\n" + // the last field is empty
    "aa,,\n" + // the last two fields are empty
    ",,\n" + // all fields are empty
    "aa,bb";
    // row too short
    final FileInputSplit split = createTempFile(fileContent);
    final TupleTypeInfo<Tuple3<String, String, String>> typeInfo = TupleTypeInfo.getBasicTupleTypeInfo(String.class, String.class, String.class);
    final CsvInputFormat<Tuple3<String, String, String>> format = new TupleCsvInputFormat<Tuple3<String, String, String>>(PATH, typeInfo);
    format.setFieldDelimiter(",");
    format.configure(new Configuration());
    format.open(split);
    Tuple3<String, String, String> result = new Tuple3<String, String, String>();
    result = format.nextRecord(result);
    assertNotNull(result);
    assertEquals("aa", result.f0);
    assertEquals("bb", result.f1);
    assertEquals("cc", result.f2);
    result = format.nextRecord(result);
    assertNotNull(result);
    assertEquals("aa", result.f0);
    assertEquals("bb", result.f1);
    assertEquals("", result.f2);
    result = format.nextRecord(result);
    assertNotNull(result);
    assertEquals("aa", result.f0);
    assertEquals("", result.f1);
    assertEquals("", result.f2);
    result = format.nextRecord(result);
    assertNotNull(result);
    assertEquals("", result.f0);
    assertEquals("", result.f1);
    assertEquals("", result.f2);
    try {
        format.nextRecord(result);
        fail("Parse Exception was not thrown! (Row too short)");
    } catch (ParseException e) {
        // expected: the last row ("aa,bb") is too short
    }
}
Also used : FileInputSplit(org.apache.flink.core.fs.FileInputSplit) Configuration(org.apache.flink.configuration.Configuration) Tuple3(org.apache.flink.api.java.tuple.Tuple3) ParseException(org.apache.flink.api.common.io.ParseException) Test(org.junit.Test)

Example 5 with ParseException

Use of org.apache.flink.api.common.io.ParseException in the Apache Flink project.

The class CsvInputFormatTest, method testEmptyFields.

@Test
public void testEmptyFields() throws IOException {
    try {
        final String fileContent = "|0|0|0|0|0|\n" + "1||1|1|1|1|\n" + "2|2||2|2|2|\n" + "3|3|3| |3|3|\n" + "4|4|4|4||4|\n" + "5|5|5|5|5||\n";
        final FileInputSplit split = createTempFile(fileContent);
        final TupleTypeInfo<Tuple6<Short, Integer, Long, Float, Double, Byte>> typeInfo = TupleTypeInfo.getBasicTupleTypeInfo(Short.class, Integer.class, Long.class, Float.class, Double.class, Byte.class);
        final CsvInputFormat<Tuple6<Short, Integer, Long, Float, Double, Byte>> format = new TupleCsvInputFormat<Tuple6<Short, Integer, Long, Float, Double, Byte>>(PATH, typeInfo);
        format.setFieldDelimiter("|");
        format.configure(new Configuration());
        format.open(split);
        Tuple6<Short, Integer, Long, Float, Double, Byte> result = new Tuple6<Short, Integer, Long, Float, Double, Byte>();
        try {
            result = format.nextRecord(result);
            fail("Empty String Parse Exception was not thrown! (ShortParser)");
        } catch (ParseException e) {
        }
        try {
            result = format.nextRecord(result);
            fail("Empty String Parse Exception was not thrown! (IntegerParser)");
        } catch (ParseException e) {
        }
        try {
            result = format.nextRecord(result);
            fail("Empty String Parse Exception was not thrown! (LongParser)");
        } catch (ParseException e) {
        }
        try {
            result = format.nextRecord(result);
            fail("Empty String Parse Exception was not thrown! (FloatParser)");
        } catch (ParseException e) {
        }
        try {
            result = format.nextRecord(result);
            fail("Empty String Parse Exception was not thrown! (DoubleParser)");
        } catch (ParseException e) {
        }
        try {
            result = format.nextRecord(result);
            fail("Empty String Parse Exception was not thrown! (ByteParser)");
        } catch (ParseException e) {
        }
        result = format.nextRecord(result);
        assertNull(result);
        assertTrue(format.reachedEnd());
    } catch (Exception ex) {
        fail("Test failed due to a " + ex.getClass().getName() + ": " + ex.getMessage());
    }
}
Also used : Configuration(org.apache.flink.configuration.Configuration) IOException(java.io.IOException) ParseException(org.apache.flink.api.common.io.ParseException) FileInputSplit(org.apache.flink.core.fs.FileInputSplit) Tuple6(org.apache.flink.api.java.tuple.Tuple6) Test(org.junit.Test)
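
The numeric parsers are what make the empty columns fatal here: as the trailing-empty-fields test above shows, an empty column is legal for a String field and simply parses to "". The sketch below, not an existing Flink test, reads the same pipe-delimited split with all six columns declared as String, so the first record parses without a ParseException.

// sketch (not an existing Flink test): the same data read as strings, where empty columns become ""
TupleTypeInfo<Tuple6<String, String, String, String, String, String>> stringTypeInfo =
    TupleTypeInfo.getBasicTupleTypeInfo(String.class, String.class, String.class, String.class, String.class, String.class);
CsvInputFormat<Tuple6<String, String, String, String, String, String>> stringFormat =
    new TupleCsvInputFormat<Tuple6<String, String, String, String, String, String>>(PATH, stringTypeInfo);
stringFormat.setFieldDelimiter("|");
stringFormat.configure(new Configuration());
stringFormat.open(split);
Tuple6<String, String, String, String, String, String> strings =
    new Tuple6<String, String, String, String, String, String>();
strings = stringFormat.nextRecord(strings);
assertNotNull(strings);
// the first line is "|0|0|0|0|0|": the leading empty column becomes "" instead of failing in the ShortParser
assertEquals("", strings.f0);
assertEquals("0", strings.f1);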

Aggregations

ParseException (org.apache.flink.api.common.io.ParseException): 5
Configuration (org.apache.flink.configuration.Configuration): 4
FileInputSplit (org.apache.flink.core.fs.FileInputSplit): 4
Test (org.junit.Test): 4
TypeInformation (org.apache.flink.api.common.typeinfo.TypeInformation): 2
Row (org.apache.flink.types.Row): 2
IOException (java.io.IOException): 1
Tuple3 (org.apache.flink.api.java.tuple.Tuple3): 1
Tuple6 (org.apache.flink.api.java.tuple.Tuple6): 1
FieldParser (org.apache.flink.types.parser.FieldParser): 1