Search in sources :

Example 11 with RowTypeInfo

use of org.apache.flink.api.java.typeutils.RowTypeInfo in project flink by apache.

the class RowCsvInputFormatTest method ignoreInvalidLines.

/**
 * Reads a pipe-delimited file that mixes well-formed rows with description, header and
 * comment lines, once with {@code ignoreParseErrors = false} and once with
 * {@code ignoreParseErrors = true}.
 *
 * <p>Strict pass: every line that does not fit the (String, Integer, Double) schema must make
 * {@code nextRecord} fail with an {@link IOException}; the well-formed lines are returned in
 * file order and the input ends with {@code null}.
 *
 * <p>Lenient pass: no call is expected to throw. Every line is returned as a row, and the
 * header line yields {@code null} for the fields that could not be parsed.
 *
 * @throws Exception if creating the temp file or reading from the format fails unexpectedly
 */
@Test
public void ignoreInvalidLines() throws Exception {
    String fileContent = "#description of the data\n" + "header1|header2|header3|\n" + "this is|1|2.0|\n" + "//a comment\n" + "a test|3|4.0|\n" + "#next|5|6.0|\n";
    FileInputSplit split = createTempFile(fileContent);
    TypeInformation<?>[] fieldTypes = new TypeInformation<?>[] { BasicTypeInfo.STRING_TYPE_INFO, BasicTypeInfo.INT_TYPE_INFO, BasicTypeInfo.DOUBLE_TYPE_INFO };
    RowCsvInputFormat.Builder builder = RowCsvInputFormat.builder(new RowTypeInfo(fieldTypes), PATH).setFieldDelimiter('|').setIgnoreParseErrors(false);
    Configuration parameters = new Configuration();

    // ---- strict pass: parse errors must surface as IOException ----
    RowCsvInputFormat format = builder.build();
    format.configure(parameters);
    format.open(split);
    try {
        Row result = new Row(3);
        // "#description of the data" has a single field instead of three
        try {
            result = format.nextRecord(result);
            fail("IOException was not thrown! (Row length mismatch. 3 fields expected but was 1)");
        } catch (IOException ignored) {
            // => ok, the malformed line is rejected in strict mode
        }
        // the header line carries non-numeric text in the numeric columns
        try {
            result = format.nextRecord(result);
            fail("IOException was not thrown! (NumberFormatException for input string: \"header2\")");
        } catch (IOException ignored) {
            // => ok, the malformed line is rejected in strict mode
        }
        result = format.nextRecord(result);
        assertNotNull(result);
        assertEquals("this is", result.getField(0));
        assertEquals(1, result.getField(1));
        assertEquals(2.0, result.getField(2));
        // "//a comment" has a single field instead of three
        try {
            result = format.nextRecord(result);
            fail("IOException was not thrown! (Row length mismatch. 3 fields expected but was 1)");
        } catch (IOException ignored) {
            // => ok, the malformed line is rejected in strict mode
        }
        result = format.nextRecord(result);
        assertNotNull(result);
        assertEquals("a test", result.getField(0));
        assertEquals(3, result.getField(1));
        assertEquals(4.0, result.getField(2));
        // a leading '#' is plain data here: the line has three parseable fields
        result = format.nextRecord(result);
        assertNotNull(result);
        assertEquals("#next", result.getField(0));
        assertEquals(5, result.getField(1));
        assertEquals(6.0, result.getField(2));
        result = format.nextRecord(result);
        assertNull(result);
    } finally {
        // release the input stream before the split is opened a second time
        format.close();
    }

    // ---- lenient pass: re-open with ignoreParseErrors = true ----
    builder.setIgnoreParseErrors(true);
    RowCsvInputFormat lenientFormat = builder.build();
    lenientFormat.configure(parameters);
    lenientFormat.open(split);
    try {
        Row result = new Row(3);
        result = lenientFormat.nextRecord(result);
        assertNotNull(result);
        assertEquals("#description of the data", result.getField(0));
        // unparseable numeric columns of the header line come back as null
        result = lenientFormat.nextRecord(result);
        assertNotNull(result);
        assertEquals("header1", result.getField(0));
        assertNull(result.getField(1));
        assertNull(result.getField(2));
        result = lenientFormat.nextRecord(result);
        assertNotNull(result);
        assertEquals("this is", result.getField(0));
        assertEquals(1, result.getField(1));
        assertEquals(2.0, result.getField(2));
        result = lenientFormat.nextRecord(result);
        assertNotNull(result);
        assertEquals("//a comment", result.getField(0));
        result = lenientFormat.nextRecord(result);
        assertNotNull(result);
        assertEquals("a test", result.getField(0));
        assertEquals(3, result.getField(1));
        assertEquals(4.0, result.getField(2));
        result = lenientFormat.nextRecord(result);
        assertNotNull(result);
        assertEquals("#next", result.getField(0));
        assertEquals(5, result.getField(1));
        assertEquals(6.0, result.getField(2));
        result = lenientFormat.nextRecord(result);
        assertNull(result);
    } finally {
        lenientFormat.close();
    }
}
Also used : FileInputSplit(org.apache.flink.core.fs.FileInputSplit) Configuration(org.apache.flink.configuration.Configuration) Row(org.apache.flink.types.Row) IOException(java.io.IOException) RowTypeInfo(org.apache.flink.api.java.typeutils.RowTypeInfo) TypeInformation(org.apache.flink.api.common.typeinfo.TypeInformation) Test(org.junit.Test)

Example 12 with RowTypeInfo

use of org.apache.flink.api.java.typeutils.RowTypeInfo in project flink by apache.

the class RowCsvInputFormatTest method readStringFields.

/**
 * Reads four pipe-delimited lines into rows of three String fields.
 *
 * <p>Covers a fully populated line, a line with an empty middle field, a line made only of
 * delimiters ({@code "|||"}) and a final line ({@code "||"}) that has no trailing newline.
 * Empty fields are expected as empty strings, not {@code null}.
 *
 * @throws Exception if creating the temp file or reading from the format fails
 */
@Test
public void readStringFields() throws Exception {
    String fileContent = "abc|def|ghijk\nabc||hhg\n|||\n||";
    FileInputSplit split = createTempFile(fileContent);
    TypeInformation<?>[] fieldTypes = new TypeInformation<?>[] { BasicTypeInfo.STRING_TYPE_INFO, BasicTypeInfo.STRING_TYPE_INFO, BasicTypeInfo.STRING_TYPE_INFO };
    RowCsvInputFormat.Builder builder = RowCsvInputFormat.builder(new RowTypeInfo(fieldTypes), PATH).setFieldDelimiter('|');
    RowCsvInputFormat format = builder.build();
    format.configure(new Configuration());
    format.open(split);
    try {
        Row result = new Row(3);
        result = format.nextRecord(result);
        assertNotNull(result);
        assertEquals("abc", result.getField(0));
        assertEquals("def", result.getField(1));
        assertEquals("ghijk", result.getField(2));
        // empty middle field
        result = format.nextRecord(result);
        assertNotNull(result);
        assertEquals("abc", result.getField(0));
        assertEquals("", result.getField(1));
        assertEquals("hhg", result.getField(2));
        // line consisting only of delimiters
        result = format.nextRecord(result);
        assertNotNull(result);
        assertEquals("", result.getField(0));
        assertEquals("", result.getField(1));
        assertEquals("", result.getField(2));
        // last line, not terminated by a newline
        result = format.nextRecord(result);
        assertNotNull(result);
        assertEquals("", result.getField(0));
        assertEquals("", result.getField(1));
        assertEquals("", result.getField(2));
        result = format.nextRecord(result);
        assertNull(result);
        assertTrue(format.reachedEnd());
    } finally {
        // always release the underlying input stream, even when an assertion fails
        format.close();
    }
}
Also used : FileInputSplit(org.apache.flink.core.fs.FileInputSplit) Configuration(org.apache.flink.configuration.Configuration) Row(org.apache.flink.types.Row) RowTypeInfo(org.apache.flink.api.java.typeutils.RowTypeInfo) TypeInformation(org.apache.flink.api.common.typeinfo.TypeInformation) Test(org.junit.Test)

Example 13 with RowTypeInfo

use of org.apache.flink.api.java.typeutils.RowTypeInfo in project flink by apache.

the class RowCsvInputFormatTest method readMixedQuotedStringFields.

/**
 * Reads String fields where some values are wrapped in the quote character {@code '@'} and
 * others are not.
 *
 * <p>A field delimiter inside a quoted value must be kept as part of the value
 * ({@code "@a|b|c@"} is expected as {@code "a|b|c"}), and unquoted empty fields are expected
 * as empty strings.
 *
 * @throws Exception if creating the temp file or reading from the format fails
 */
@Test
public void readMixedQuotedStringFields() throws Exception {
    String fileContent = "@a|b|c@|def|@ghijk@\nabc||@|hhg@\n|||\n";
    FileInputSplit split = createTempFile(fileContent);
    TypeInformation<?>[] fieldTypes = new TypeInformation<?>[] { BasicTypeInfo.STRING_TYPE_INFO, BasicTypeInfo.STRING_TYPE_INFO, BasicTypeInfo.STRING_TYPE_INFO };
    RowCsvInputFormat.Builder builder = RowCsvInputFormat.builder(new RowTypeInfo(fieldTypes), PATH).setFieldDelimiter('|').setQuoteCharacter('@');
    RowCsvInputFormat format = builder.build();
    format.configure(new Configuration());
    format.open(split);
    try {
        Row result = new Row(3);
        // quoted first and last field, unquoted middle field
        result = format.nextRecord(result);
        assertNotNull(result);
        assertEquals("a|b|c", result.getField(0));
        assertEquals("def", result.getField(1));
        assertEquals("ghijk", result.getField(2));
        // quoted value that starts with the delimiter
        result = format.nextRecord(result);
        assertNotNull(result);
        assertEquals("abc", result.getField(0));
        assertEquals("", result.getField(1));
        assertEquals("|hhg", result.getField(2));
        // line consisting only of delimiters
        result = format.nextRecord(result);
        assertNotNull(result);
        assertEquals("", result.getField(0));
        assertEquals("", result.getField(1));
        assertEquals("", result.getField(2));
        result = format.nextRecord(result);
        assertNull(result);
        assertTrue(format.reachedEnd());
    } finally {
        // always release the underlying input stream, even when an assertion fails
        format.close();
    }
}
Also used : FileInputSplit(org.apache.flink.core.fs.FileInputSplit) Configuration(org.apache.flink.configuration.Configuration) Row(org.apache.flink.types.Row) RowTypeInfo(org.apache.flink.api.java.typeutils.RowTypeInfo) TypeInformation(org.apache.flink.api.common.typeinfo.TypeInformation) Test(org.junit.Test)

Example 14 with RowTypeInfo

use of org.apache.flink.api.java.typeutils.RowTypeInfo in project flink by apache.

the class RowCsvInputFormatTest method testDoubleFields.

/**
 * Reads two pipe-delimited lines of five Double fields each.
 *
 * <p>The first line has no trailing delimiter and the second one does; both must produce five
 * values. The text {@code "00.0"} is expected to parse to {@code 0.0}.
 *
 * @throws Exception if creating the temp file or reading from the format fails
 */
@Test
public void testDoubleFields() throws Exception {
    String fileContent = "11.1|22.2|33.3|44.4|55.5\n66.6|77.7|88.8|99.9|00.0|\n";
    FileInputSplit split = createTempFile(fileContent);
    TypeInformation<?>[] fieldTypes = new TypeInformation<?>[] { BasicTypeInfo.DOUBLE_TYPE_INFO, BasicTypeInfo.DOUBLE_TYPE_INFO, BasicTypeInfo.DOUBLE_TYPE_INFO, BasicTypeInfo.DOUBLE_TYPE_INFO, BasicTypeInfo.DOUBLE_TYPE_INFO };
    RowCsvInputFormat.Builder builder = RowCsvInputFormat.builder(new RowTypeInfo(fieldTypes), PATH).setFieldDelimiter('|');
    RowCsvInputFormat format = builder.build();
    format.configure(new Configuration());
    format.open(split);
    try {
        Row result = new Row(5);
        // first line: no trailing delimiter
        result = format.nextRecord(result);
        assertNotNull(result);
        assertEquals(11.1, result.getField(0));
        assertEquals(22.2, result.getField(1));
        assertEquals(33.3, result.getField(2));
        assertEquals(44.4, result.getField(3));
        assertEquals(55.5, result.getField(4));
        // second line: trailing delimiter present
        result = format.nextRecord(result);
        assertNotNull(result);
        assertEquals(66.6, result.getField(0));
        assertEquals(77.7, result.getField(1));
        assertEquals(88.8, result.getField(2));
        assertEquals(99.9, result.getField(3));
        assertEquals(0.0, result.getField(4));
        result = format.nextRecord(result);
        assertNull(result);
        assertTrue(format.reachedEnd());
    } finally {
        // always release the underlying input stream, even when an assertion fails
        format.close();
    }
}
Also used : FileInputSplit(org.apache.flink.core.fs.FileInputSplit) Configuration(org.apache.flink.configuration.Configuration) Row(org.apache.flink.types.Row) RowTypeInfo(org.apache.flink.api.java.typeutils.RowTypeInfo) TypeInformation(org.apache.flink.api.common.typeinfo.TypeInformation) Test(org.junit.Test)

Example 15 with RowTypeInfo

use of org.apache.flink.api.java.typeutils.RowTypeInfo in project flink by apache.

the class RowCsvInputFormatTest method testScanOrder.

/**
 * Reads two lines of ten Integer columns while selecting only columns 7, 3 and 0, in that
 * order.
 *
 * <p>The resulting rows must contain the selected columns in the order given to
 * {@code setSelectedFields}, not in file order. The text {@code "000"} is expected to parse
 * to {@code 0}.
 *
 * @throws Exception if creating the temp file or reading from the format fails
 */
@Test
public void testScanOrder() throws Exception {
    // first row
    String fileContent = "111|222|333|444|555|666|777|888|999|000|\n"
            // second row
            + "000|999|888|777|666|555|444|333|222|111|";
    FileInputSplit split = createTempFile(fileContent);
    TypeInformation<?>[] fieldTypes = new TypeInformation<?>[] { BasicTypeInfo.INT_TYPE_INFO, BasicTypeInfo.INT_TYPE_INFO, BasicTypeInfo.INT_TYPE_INFO, BasicTypeInfo.INT_TYPE_INFO, BasicTypeInfo.INT_TYPE_INFO, BasicTypeInfo.INT_TYPE_INFO, BasicTypeInfo.INT_TYPE_INFO, BasicTypeInfo.INT_TYPE_INFO, BasicTypeInfo.INT_TYPE_INFO, BasicTypeInfo.INT_TYPE_INFO };
    RowCsvInputFormat.Builder builder = RowCsvInputFormat.builder(new RowTypeInfo(fieldTypes), PATH).setFieldDelimiter('|').setSelectedFields(new int[] { 7, 3, 0 });
    RowCsvInputFormat format = builder.build();
    format.configure(new Configuration());
    format.open(split);
    try {
        Row result = new Row(3);
        // check first row: columns 7, 3, 0 -> 888, 444, 111
        result = format.nextRecord(result);
        assertNotNull(result);
        assertEquals(888, result.getField(0));
        assertEquals(444, result.getField(1));
        assertEquals(111, result.getField(2));
        // check second row: columns 7, 3, 0 -> 333, 777, 000
        result = format.nextRecord(result);
        assertNotNull(result);
        assertEquals(333, result.getField(0));
        assertEquals(777, result.getField(1));
        assertEquals(0, result.getField(2));
    } finally {
        // always release the underlying input stream, even when an assertion fails
        format.close();
    }
}
Also used : FileInputSplit(org.apache.flink.core.fs.FileInputSplit) Configuration(org.apache.flink.configuration.Configuration) Row(org.apache.flink.types.Row) RowTypeInfo(org.apache.flink.api.java.typeutils.RowTypeInfo) TypeInformation(org.apache.flink.api.common.typeinfo.TypeInformation) Test(org.junit.Test)

Aggregations

RowTypeInfo (org.apache.flink.api.java.typeutils.RowTypeInfo): 50, Test (org.junit.Test): 34, Row (org.apache.flink.types.Row): 32, TypeInformation (org.apache.flink.api.common.typeinfo.TypeInformation): 26, Configuration (org.apache.flink.configuration.Configuration): 16, FileInputSplit (org.apache.flink.core.fs.FileInputSplit): 15, ArrayList (java.util.ArrayList): 10, Transformation (org.apache.flink.api.dag.Transformation): 8, OneInputTransformation (org.apache.flink.streaming.api.transformations.OneInputTransformation): 8, SourceTransformation (org.apache.flink.streaming.api.transformations.SourceTransformation): 8, TwoInputTransformation (org.apache.flink.streaming.api.transformations.TwoInputTransformation): 8, ExecutionConfig (org.apache.flink.api.common.ExecutionConfig): 6, PythonKeyedProcessOperator (org.apache.flink.streaming.api.operators.python.PythonKeyedProcessOperator): 6, IOException (java.io.IOException): 4, MapTypeInfo (org.apache.flink.api.java.typeutils.MapTypeInfo): 4, File (java.io.File): 3, FileOutputStream (java.io.FileOutputStream): 3, OutputStreamWriter (java.io.OutputStreamWriter): 3, LocalDateTime (java.time.LocalDateTime): 3, PrimitiveArrayTypeInfo (org.apache.flink.api.common.typeinfo.PrimitiveArrayTypeInfo): 3