
Example 16 with FileInputSplit

use of org.apache.flink.core.fs.FileInputSplit in project flink by apache.

the class RowCsvInputFormatTest method ignoreInvalidLines.

@Test
public void ignoreInvalidLines() throws Exception {
    String fileContent = "#description of the data\n" + "header1|header2|header3|\n" + "this is|1|2.0|\n" + "//a comment\n" + "a test|3|4.0|\n" + "#next|5|6.0|\n";
    FileInputSplit split = createTempFile(fileContent);
    TypeInformation[] fieldTypes = new TypeInformation[] { BasicTypeInfo.STRING_TYPE_INFO, BasicTypeInfo.INT_TYPE_INFO, BasicTypeInfo.DOUBLE_TYPE_INFO };
    RowCsvInputFormat format = new RowCsvInputFormat(PATH, fieldTypes, "\n", "|");
    format.setLenient(false);
    Configuration parameters = new Configuration();
    format.configure(new Configuration());
    format.open(split);
    Row result = new Row(3);
    try {
        result = format.nextRecord(result);
        fail("Parse Exception was not thrown! (Row too short)");
    } catch (ParseException ignored) {
    }
    try {
        result = format.nextRecord(result);
        fail("Parse Exception was not thrown! (Invalid int value)");
    } catch (ParseException ignored) {
    }
    // => ok
    result = format.nextRecord(result);
    assertNotNull(result);
    assertEquals("this is", result.getField(0));
    assertEquals(1, result.getField(1));
    assertEquals(2.0, result.getField(2));
    try {
        result = format.nextRecord(result);
        fail("Parse Exception was not thrown! (Row too short)");
    } catch (ParseException ignored) {
    }
    // => ok
    result = format.nextRecord(result);
    assertNotNull(result);
    assertEquals("a test", result.getField(0));
    assertEquals(3, result.getField(1));
    assertEquals(4.0, result.getField(2));
    result = format.nextRecord(result);
    assertNotNull(result);
    assertEquals("#next", result.getField(0));
    assertEquals(5, result.getField(1));
    assertEquals(6.0, result.getField(2));
    result = format.nextRecord(result);
    assertNull(result);
    // re-open with lenient = true
    format.setLenient(true);
    format.configure(parameters);
    format.open(split);
    result = new Row(3);
    result = format.nextRecord(result);
    assertNotNull(result);
    assertEquals("header1", result.getField(0));
    assertNull(result.getField(1));
    assertNull(result.getField(2));
    result = format.nextRecord(result);
    assertNotNull(result);
    assertEquals("this is", result.getField(0));
    assertEquals(1, result.getField(1));
    assertEquals(2.0, result.getField(2));
    result = format.nextRecord(result);
    assertNotNull(result);
    assertEquals("a test", result.getField(0));
    assertEquals(3, result.getField(1));
    assertEquals(4.0, result.getField(2));
    result = format.nextRecord(result);
    assertNotNull(result);
    assertEquals("#next", result.getField(0));
    assertEquals(5, result.getField(1));
    assertEquals(6.0, result.getField(2));
    result = format.nextRecord(result);
    assertNull(result);
}
Also used : FileInputSplit(org.apache.flink.core.fs.FileInputSplit) Configuration(org.apache.flink.configuration.Configuration) Row(org.apache.flink.types.Row) ParseException(org.apache.flink.api.common.io.ParseException) TypeInformation(org.apache.flink.api.common.typeinfo.TypeInformation) Test(org.junit.Test)
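
All of the tests on this page call a createTempFile helper and reference a PATH constant that are defined elsewhere in RowCsvInputFormatTest and not shown here. The sketch below is a plausible reconstruction, not the test class's actual code: the constant value, the temp-file naming, and the single-host hint are assumptions; only the FileInputSplit constructor arguments (split number, path, start offset, length, host hints) follow the real org.apache.flink.core.fs.FileInputSplit API.

// Assumed helpers (a minimal sketch; needs java.io.*, java.nio.charset.StandardCharsets,
// org.apache.flink.core.fs.FileInputSplit and org.apache.flink.core.fs.Path):
private static final Path PATH = new Path("an/ignored/file/path");

private static FileInputSplit createTempFile(String content) throws IOException {
    // write the content to a fresh temporary file
    File tempFile = File.createTempFile("test_contents", "tmp");
    tempFile.deleteOnExit();
    try (OutputStreamWriter writer =
            new OutputStreamWriter(new FileOutputStream(tempFile), StandardCharsets.UTF_8)) {
        writer.write(content);
    }
    // wrap the whole file in a single split so the format reads it from offset 0
    return new FileInputSplit(0, new Path(tempFile.toURI().toString()), 0, tempFile.length(), new String[] { "localhost" });
}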

Example 17 with FileInputSplit

use of org.apache.flink.core.fs.FileInputSplit in project flink by apache.

the class RowCsvInputFormatTest method testScanOrder.

@Test
public void testScanOrder() throws Exception {
    String fileContent =
        // first row
        "111|222|333|444|555|666|777|888|999|000|\n" +
        // second row
        "000|999|888|777|666|555|444|333|222|111|";
    FileInputSplit split = createTempFile(fileContent);
    TypeInformation[] fieldTypes = new TypeInformation[] { BasicTypeInfo.INT_TYPE_INFO, BasicTypeInfo.INT_TYPE_INFO, BasicTypeInfo.INT_TYPE_INFO };
    int[] order = new int[] { 7, 3, 0 };
    RowCsvInputFormat format = new RowCsvInputFormat(PATH, fieldTypes, order);
    format.setFieldDelimiter("|");
    format.configure(new Configuration());
    format.open(split);
    Row result = new Row(3);
    // check first row
    result = format.nextRecord(result);
    assertNotNull(result);
    assertEquals(888, result.getField(0));
    assertEquals(444, result.getField(1));
    assertEquals(111, result.getField(2));
    // check second row
    result = format.nextRecord(result);
    assertNotNull(result);
    assertEquals(333, result.getField(0));
    assertEquals(777, result.getField(1));
    assertEquals(0, result.getField(2));
}
Also used : FileInputSplit(org.apache.flink.core.fs.FileInputSplit) Configuration(org.apache.flink.configuration.Configuration) Row(org.apache.flink.types.Row) TypeInformation(org.apache.flink.api.common.typeinfo.TypeInformation) Test(org.junit.Test)
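
For reference, the order array in testScanOrder is a source-column projection: entry i names the 0-based column of the CSV line that populates field i of the emitted Row. A short walk-through of the first row (comments only, using the same constructor as the test above):

int[] order = new int[] { 7, 3, 0 };
// For the line "111|222|333|444|555|666|777|888|999|000|":
//   row field 0 <- column 7 -> 888
//   row field 1 <- column 3 -> 444
//   row field 2 <- column 0 -> 111
// which is exactly what the first three assertions check.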

Example 18 with FileInputSplit

use of org.apache.flink.core.fs.FileInputSplit in project flink by apache.

the class RowCsvInputFormatTest method testTailingEmptyFields.

@Test
public void testTailingEmptyFields() throws Exception {
    String fileContent = "abc|-def|-ghijk\n" + "abc|-def|-\n" + "abc|-|-\n" + "|-|-|-\n" + "|-|-\n" + "abc|-def\n";
    FileInputSplit split = createTempFile(fileContent);
    TypeInformation[] fieldTypes = new TypeInformation[] { BasicTypeInfo.STRING_TYPE_INFO, BasicTypeInfo.STRING_TYPE_INFO, BasicTypeInfo.STRING_TYPE_INFO };
    RowCsvInputFormat format = new RowCsvInputFormat(PATH, fieldTypes, "\n", "|");
    format.setFieldDelimiter("|-");
    format.configure(new Configuration());
    format.open(split);
    Row result = new Row(3);
    result = format.nextRecord(result);
    assertNotNull(result);
    assertEquals("abc", result.getField(0));
    assertEquals("def", result.getField(1));
    assertEquals("ghijk", result.getField(2));
    result = format.nextRecord(result);
    assertNotNull(result);
    assertEquals("abc", result.getField(0));
    assertEquals("def", result.getField(1));
    assertEquals("", result.getField(2));
    result = format.nextRecord(result);
    assertNotNull(result);
    assertEquals("abc", result.getField(0));
    assertEquals("", result.getField(1));
    assertEquals("", result.getField(2));
    result = format.nextRecord(result);
    assertNotNull(result);
    assertEquals("", result.getField(0));
    assertEquals("", result.getField(1));
    assertEquals("", result.getField(2));
    result = format.nextRecord(result);
    assertNotNull(result);
    assertEquals("", result.getField(0));
    assertEquals("", result.getField(1));
    assertEquals("", result.getField(2));
    try {
        format.nextRecord(result);
        fail("Parse Exception was not thrown! (Row too short)");
    } catch (ParseException e) {
    }
}
Also used : FileInputSplit(org.apache.flink.core.fs.FileInputSplit) Configuration(org.apache.flink.configuration.Configuration) Row(org.apache.flink.types.Row) ParseException(org.apache.flink.api.common.io.ParseException) TypeInformation(org.apache.flink.api.common.typeinfo.TypeInformation) Test(org.junit.Test)
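
The field delimiter here is the two-character sequence "|-", so a line such as "abc|-def|-" yields a trailing empty third field rather than a missing one, while "abc|-def" has only two fields and triggers the ParseException at the end of the test. A quick standalone illustration of that splitting behaviour (plain String.split, not the format's internal parser):

String line = "abc|-def|-";
// limit -1 keeps trailing empty strings, mirroring how the format reports empty last fields
String[] parts = line.split("\\|-", -1);
// parts -> ["abc", "def", ""]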

Example 19 with FileInputSplit

use of org.apache.flink.core.fs.FileInputSplit in project flink by apache.

the class RowCsvInputFormatTest method testIntegerFields.

@Test
public void testIntegerFields() throws Exception {
    String fileContent = "111|222|333|444|555\n666|777|888|999|000|\n";
    FileInputSplit split = createTempFile(fileContent);
    TypeInformation[] fieldTypes = new TypeInformation[] { BasicTypeInfo.INT_TYPE_INFO, BasicTypeInfo.INT_TYPE_INFO, BasicTypeInfo.INT_TYPE_INFO, BasicTypeInfo.INT_TYPE_INFO, BasicTypeInfo.INT_TYPE_INFO };
    RowCsvInputFormat format = new RowCsvInputFormat(PATH, fieldTypes, "\n", "|");
    format.setFieldDelimiter("|");
    format.configure(new Configuration());
    format.open(split);
    Row result = new Row(5);
    result = format.nextRecord(result);
    assertNotNull(result);
    assertEquals(111, result.getField(0));
    assertEquals(222, result.getField(1));
    assertEquals(333, result.getField(2));
    assertEquals(444, result.getField(3));
    assertEquals(555, result.getField(4));
    result = format.nextRecord(result);
    assertNotNull(result);
    assertEquals(666, result.getField(0));
    assertEquals(777, result.getField(1));
    assertEquals(888, result.getField(2));
    assertEquals(999, result.getField(3));
    assertEquals(0, result.getField(4));
    result = format.nextRecord(result);
    assertNull(result);
    assertTrue(format.reachedEnd());
}
Also used : FileInputSplit(org.apache.flink.core.fs.FileInputSplit) Configuration(org.apache.flink.configuration.Configuration) Row(org.apache.flink.types.Row) TypeInformation(org.apache.flink.api.common.typeinfo.TypeInformation) Test(org.junit.Test)
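
The last column of the second row is the text "000"; plain base-10 parsing of that token gives 0, which is what the final assertEquals on that row expects (a one-line sanity check, not Flink's internal parser):

int parsed = Integer.parseInt("000");   // -> 0, leading zeros carry no special meaning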

Example 20 with FileInputSplit

use of org.apache.flink.core.fs.FileInputSplit in project flink by apache.

the class RowCsvInputFormatTest method readStringFieldsWithTrailingDelimiters.

@Test
public void readStringFieldsWithTrailingDelimiters() throws Exception {
    String fileContent = "abc|-def|-ghijk\nabc|-|-hhg\n|-|-|-\n";
    FileInputSplit split = createTempFile(fileContent);
    TypeInformation[] fieldTypes = new TypeInformation[] { BasicTypeInfo.STRING_TYPE_INFO, BasicTypeInfo.STRING_TYPE_INFO, BasicTypeInfo.STRING_TYPE_INFO };
    RowCsvInputFormat format = new RowCsvInputFormat(PATH, fieldTypes, "\n", "|");
    format.setFieldDelimiter("|-");
    format.configure(new Configuration());
    format.open(split);
    Row result = new Row(3);
    result = format.nextRecord(result);
    assertNotNull(result);
    assertEquals("abc", result.getField(0));
    assertEquals("def", result.getField(1));
    assertEquals("ghijk", result.getField(2));
    result = format.nextRecord(result);
    assertNotNull(result);
    assertEquals("abc", result.getField(0));
    assertEquals("", result.getField(1));
    assertEquals("hhg", result.getField(2));
    result = format.nextRecord(result);
    assertNotNull(result);
    assertEquals("", result.getField(0));
    assertEquals("", result.getField(1));
    assertEquals("", result.getField(2));
    result = format.nextRecord(result);
    assertNull(result);
    assertTrue(format.reachedEnd());
}
Also used : FileInputSplit(org.apache.flink.core.fs.FileInputSplit) Configuration(org.apache.flink.configuration.Configuration) Row(org.apache.flink.types.Row) TypeInformation(org.apache.flink.api.common.typeinfo.TypeInformation) Test(org.junit.Test)

Aggregations

FileInputSplit (org.apache.flink.core.fs.FileInputSplit): 140 usages
Test (org.junit.Test): 119 usages
Configuration (org.apache.flink.configuration.Configuration): 93 usages
Path (org.apache.flink.core.fs.Path): 59 usages
IOException (java.io.IOException): 45 usages
File (java.io.File): 36 usages
FileOutputStream (java.io.FileOutputStream): 23 usages
TypeInformation (org.apache.flink.api.common.typeinfo.TypeInformation): 20 usages
Row (org.apache.flink.types.Row): 20 usages
OutputStreamWriter (java.io.OutputStreamWriter): 18 usages
ParseException (org.apache.flink.api.common.io.ParseException): 17 usages
ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment): 17 usages
DoubleValue (org.apache.flink.types.DoubleValue): 17 usages
IntValue (org.apache.flink.types.IntValue): 17 usages
LongValue (org.apache.flink.types.LongValue): 17 usages
StringValue (org.apache.flink.types.StringValue): 17 usages
Value (org.apache.flink.types.Value): 17 usages
Plan (org.apache.flink.api.common.Plan): 12 usages
ReplicatingInputFormat (org.apache.flink.api.common.io.ReplicatingInputFormat): 12 usages
Tuple1 (org.apache.flink.api.java.tuple.Tuple1): 12 usages
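
The aggregation counts show that FileInputSplit usages are dominated by test code, but ExecutionEnvironment also appears, since these input formats are normally consumed through the DataSet API. As a hedged sketch of how a RowCsvInputFormat like the ones above could be wired into a batch job (the input path is a placeholder, and createInput/RowTypeInfo are used only as the generic InputFormat entry points, not something taken from this test class):

// assumed imports: org.apache.flink.api.java.ExecutionEnvironment, org.apache.flink.api.java.DataSet,
// org.apache.flink.api.java.typeutils.RowTypeInfo, plus the types already listed above
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

TypeInformation[] fieldTypes = new TypeInformation[] {
        BasicTypeInfo.STRING_TYPE_INFO, BasicTypeInfo.INT_TYPE_INFO, BasicTypeInfo.DOUBLE_TYPE_INFO };

// "/tmp/input.csv" is a placeholder path, standing in for the PATH constant used by the tests
RowCsvInputFormat format = new RowCsvInputFormat(new Path("/tmp/input.csv"), fieldTypes, "\n", "|");

DataSet<Row> rows = env.createInput(format, new RowTypeInfo(fieldTypes));
// print() triggers execution of the batch job
rows.print();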