Search in sources :

Example 46 with TypeInformation

use of org.apache.flink.api.common.typeinfo.TypeInformation in project flink by apache.

the class RowCsvInputFormatTest method ignoreSingleCharPrefixComments.

/**
 * Reads a pipe-delimited file with "#" set as the comment prefix and checks that exactly the
 * two lines not starting with "#" are returned as rows, followed by {@code null}.
 */
@Test
public void ignoreSingleCharPrefixComments() throws Exception {
    // Three of the five lines start with the comment prefix; the other two carry data.
    String fileContent = "#description of the data\n"
        + "#successive commented line\n"
        + "this is|1|2.0|\n"
        + "a test|3|4.0|\n"
        + "#next|5|6.0|\n";
    FileInputSplit split = createTempFile(fileContent);

    TypeInformation[] fieldTypes = new TypeInformation[] {
        BasicTypeInfo.STRING_TYPE_INFO,
        BasicTypeInfo.INT_TYPE_INFO,
        BasicTypeInfo.DOUBLE_TYPE_INFO
    };
    RowCsvInputFormat format = new RowCsvInputFormat(PATH, fieldTypes, "\n", "|");
    format.setCommentPrefix("#");
    format.configure(new Configuration());
    format.open(split);

    // Field values expected for each record, in read order.
    Object[][] expectedRows = {
        { "this is", 1, 2.0 },
        { "a test", 3, 4.0 }
    };

    Row reuse = new Row(3);
    for (Object[] expectedRow : expectedRows) {
        reuse = format.nextRecord(reuse);
        assertNotNull(reuse);
        for (int pos = 0; pos < expectedRow.length; pos++) {
            assertEquals(expectedRow[pos], reuse.getField(pos));
        }
    }

    // The trailing "#next|5|6.0|" line must not produce a third record.
    reuse = format.nextRecord(reuse);
    assertNull(reuse);
}
Also used : FileInputSplit(org.apache.flink.core.fs.FileInputSplit) Configuration(org.apache.flink.configuration.Configuration) Row(org.apache.flink.types.Row) TypeInformation(org.apache.flink.api.common.typeinfo.TypeInformation) Test(org.junit.Test)

Example 47 with TypeInformation

use of org.apache.flink.api.common.typeinfo.TypeInformation in project flink by apache.

the class RowCsvInputFormatTest method testReadSparseWithMask.

/**
 * Reads two ten-column lines delimited by "&&" while selecting only the columns at
 * positions 0, 3 and 7, and checks the three selected integer values of each record.
 */
@Test
public void testReadSparseWithMask() throws Exception {
    String fileContent = "111&&222&&333&&444&&555&&666&&777&&888&&999&&000&&\n"
        + "000&&999&&888&&777&&666&&555&&444&&333&&222&&111&&";
    FileInputSplit split = RowCsvInputFormatTest.createTempFile(fileContent);

    TypeInformation[] fieldTypes = new TypeInformation[] {
        BasicTypeInfo.INT_TYPE_INFO,
        BasicTypeInfo.INT_TYPE_INFO,
        BasicTypeInfo.INT_TYPE_INFO
    };
    int[] selectedColumns = new int[] { 0, 3, 7 };
    RowCsvInputFormat format = new RowCsvInputFormat(PATH, fieldTypes, selectedColumns);
    format.setFieldDelimiter("&&");
    format.configure(new Configuration());
    format.open(split);

    // Values expected at the selected positions of each input line, in read order.
    Object[][] expectedRows = {
        { 111, 444, 888 },
        { 0, 777, 333 }
    };

    Row reuse = new Row(3);
    for (Object[] expectedRow : expectedRows) {
        reuse = format.nextRecord(reuse);
        assertNotNull(reuse);
        for (int pos = 0; pos < expectedRow.length; pos++) {
            assertEquals(expectedRow[pos], reuse.getField(pos));
        }
    }

    // After both lines are consumed the format returns null and reports end of input.
    reuse = format.nextRecord(reuse);
    assertNull(reuse);
    assertTrue(format.reachedEnd());
}
Also used : FileInputSplit(org.apache.flink.core.fs.FileInputSplit) Configuration(org.apache.flink.configuration.Configuration) Row(org.apache.flink.types.Row) TypeInformation(org.apache.flink.api.common.typeinfo.TypeInformation) Test(org.junit.Test)

Example 48 with TypeInformation

use of org.apache.flink.api.common.typeinfo.TypeInformation in project flink by apache.

the class RowCsvInputFormatTest method testParserCorrectness.

// Test disabled because we do not support double-quote escaped quotes right now.
/**
 * Feeds the RFC 4180 sample content (header line skipped, "," as field delimiter) through the
 * format and compares every returned row against a hand-built expected row.
 */
@Test
@Ignore
public void testParserCorrectness() throws Exception {
    // RFC 4180 Compliance Test content
    // Taken from http://en.wikipedia.org/wiki/Comma-separated_values#Example
    String fileContent = "Year,Make,Model,Description,Price\n"
        + "1997,Ford,E350,\"ac, abs, moon\",3000.00\n"
        + "1999,Chevy,\"Venture \"\"Extended Edition\"\"\",\"\",4900.00\n"
        + "1996,Jeep,Grand Cherokee,\"MUST SELL! air, moon roof, loaded\",4799.00\n"
        + "1999,Chevy,\"Venture \"\"Extended Edition, Very Large\"\"\",,5000.00\n"
        + ",,\"Venture \"\"Extended Edition\"\"\",\"\",4900.00";
    FileInputSplit split = createTempFile(fileContent);

    TypeInformation[] fieldTypes = new TypeInformation[] {
        BasicTypeInfo.INT_TYPE_INFO,
        BasicTypeInfo.STRING_TYPE_INFO,
        BasicTypeInfo.STRING_TYPE_INFO,
        BasicTypeInfo.STRING_TYPE_INFO,
        BasicTypeInfo.DOUBLE_TYPE_INFO
    };
    RowCsvInputFormat format = new RowCsvInputFormat(PATH, fieldTypes);
    format.setSkipFirstLineAsHeader(true);
    format.setFieldDelimiter(",");
    format.configure(new Configuration());
    format.open(split);

    Row result = new Row(5);

    // One entry per data line: year, make, model, description, price.
    Object[][] expectedValues = {
        { 1997, "Ford", "E350", "ac, abs, moon", 3000.0 },
        { 1999, "Chevy", "Venture \"Extended Edition\"", "", 4900.0 },
        { 1996, "Jeep", "Grand Cherokee", "MUST SELL! air, moon roof, loaded", 4799.0 },
        { 1999, "Chevy", "Venture \"Extended Edition, Very Large\"", "", 5000.0 },
        { 0, "", "Venture \"Extended Edition\"", "", 4900.0 }
    };

    // Materialize the value table into the rows the format is expected to emit.
    Row[] expectedLines = new Row[expectedValues.length];
    for (int line = 0; line < expectedValues.length; line++) {
        Row row = new Row(5);
        for (int pos = 0; pos < expectedValues[line].length; pos++) {
            row.setField(pos, expectedValues[line][pos]);
        }
        expectedLines[line] = row;
    }

    for (Row expected : expectedLines) {
        result = format.nextRecord(result);
        assertEquals(expected, result);
    }

    assertNull(format.nextRecord(result));
    assertTrue(format.reachedEnd());
}
Also used : FileInputSplit(org.apache.flink.core.fs.FileInputSplit) Configuration(org.apache.flink.configuration.Configuration) Row(org.apache.flink.types.Row) TypeInformation(org.apache.flink.api.common.typeinfo.TypeInformation) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 49 with TypeInformation

use of org.apache.flink.api.common.typeinfo.TypeInformation in project flink by apache.

the class RowCsvInputFormatTest method testQuotedStringParsingWithIncludeFields.

/**
 * Writes one pipe-delimited line of double-quoted fields to a temporary file, reads it with
 * quoted-string parsing enabled while selecting only the columns at positions 0 and 2, and
 * checks that both selected values come back without their surrounding quotes.
 */
@Test
public void testQuotedStringParsingWithIncludeFields() throws Exception {
    String fileContent = "\"20:41:52-1-3-2015\"|\"Re: Taskmanager memory error in Eclipse\"|" + "\"Blahblah <blah@blahblah.org>\"|\"blaaa|\"blubb\"";
    File tempFile = File.createTempFile("CsvReaderQuotedString", "tmp");
    tempFile.deleteOnExit();
    tempFile.setWritable(true);
    // try-with-resources closes the writer (and the underlying stream) even if write() throws,
    // so a failed write no longer leaks the file handle.
    try (OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(tempFile))) {
        writer.write(fileContent);
    }
    TypeInformation[] fieldTypes = new TypeInformation[] { BasicTypeInfo.STRING_TYPE_INFO, BasicTypeInfo.STRING_TYPE_INFO };
    RowCsvInputFormat inputFormat = new RowCsvInputFormat(new Path(tempFile.toURI().toString()), fieldTypes, new int[] { 0, 2 });
    inputFormat.enableQuotedStringParsing('"');
    inputFormat.setFieldDelimiter("|");
    inputFormat.setDelimiter('\n');
    inputFormat.configure(new Configuration());
    FileInputSplit[] splits = inputFormat.createInputSplits(1);
    inputFormat.open(splits[0]);
    Row record = inputFormat.nextRecord(new Row(2));
    assertEquals("20:41:52-1-3-2015", record.getField(0));
    assertEquals("Blahblah <blah@blahblah.org>", record.getField(1));
}
Also used : Path(org.apache.flink.core.fs.Path) FileInputSplit(org.apache.flink.core.fs.FileInputSplit) Configuration(org.apache.flink.configuration.Configuration) FileOutputStream(java.io.FileOutputStream) OutputStreamWriter(java.io.OutputStreamWriter) Row(org.apache.flink.types.Row) File(java.io.File) TypeInformation(org.apache.flink.api.common.typeinfo.TypeInformation) Test(org.junit.Test)

Example 50 with TypeInformation

use of org.apache.flink.api.common.typeinfo.TypeInformation in project flink by apache.

the class RowCsvInputFormatTest method testQuotedStringParsingWithEscapedQuotes.

/**
 * Writes one pipe-delimited line whose quoted fields contain backslash-escaped quotes to a
 * temporary file, reads it with quoted-string parsing enabled, and checks that each field is
 * returned without its surrounding quotes but with the backslash-quote sequences intact.
 */
@Test
public void testQuotedStringParsingWithEscapedQuotes() throws Exception {
    String fileContent = "\"\\\"Hello\\\" World\"|\"We are\\\" young\"";
    File tempFile = File.createTempFile("CsvReaderQuotedString", "tmp");
    tempFile.deleteOnExit();
    tempFile.setWritable(true);
    // try-with-resources closes the writer (and the underlying stream) even if write() throws,
    // so a failed write no longer leaks the file handle.
    try (OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(tempFile))) {
        writer.write(fileContent);
    }
    TypeInformation[] fieldTypes = new TypeInformation[] { BasicTypeInfo.STRING_TYPE_INFO, BasicTypeInfo.STRING_TYPE_INFO };
    RowCsvInputFormat inputFormat = new RowCsvInputFormat(new Path(tempFile.toURI().toString()), fieldTypes);
    inputFormat.enableQuotedStringParsing('"');
    inputFormat.setFieldDelimiter("|");
    inputFormat.setDelimiter('\n');
    inputFormat.configure(new Configuration());
    FileInputSplit[] splits = inputFormat.createInputSplits(1);
    inputFormat.open(splits[0]);
    Row record = inputFormat.nextRecord(new Row(2));
    assertEquals("\\\"Hello\\\" World", record.getField(0));
    assertEquals("We are\\\" young", record.getField(1));
}
Also used : Path(org.apache.flink.core.fs.Path) FileInputSplit(org.apache.flink.core.fs.FileInputSplit) Configuration(org.apache.flink.configuration.Configuration) FileOutputStream(java.io.FileOutputStream) OutputStreamWriter(java.io.OutputStreamWriter) Row(org.apache.flink.types.Row) File(java.io.File) TypeInformation(org.apache.flink.api.common.typeinfo.TypeInformation) Test(org.junit.Test)

Aggregations

TypeInformation (org.apache.flink.api.common.typeinfo.TypeInformation) 51; Test (org.junit.Test) 28; Row (org.apache.flink.types.Row) 21; Configuration (org.apache.flink.configuration.Configuration) 20; FileInputSplit (org.apache.flink.core.fs.FileInputSplit) 20; TupleTypeInfo (org.apache.flink.api.java.typeutils.TupleTypeInfo) 10; ArrayList (java.util.ArrayList) 9; ExecutionConfig (org.apache.flink.api.common.ExecutionConfig) 8; CompositeType (org.apache.flink.api.common.typeutils.CompositeType) 8; IOException (java.io.IOException) 7; Type (java.lang.reflect.Type) 7; GenericArrayType (java.lang.reflect.GenericArrayType) 6; ParameterizedType (java.lang.reflect.ParameterizedType) 6; Random (java.util.Random) 6; InvalidTypesException (org.apache.flink.api.common.functions.InvalidTypesException) 6; TypeExtractionUtils.isClassType (org.apache.flink.api.java.typeutils.TypeExtractionUtils.isClassType) 6; ValueTypeInfo (org.apache.flink.api.java.typeutils.ValueTypeInfo) 6; Tuple2 (org.apache.flink.api.java.tuple.Tuple2) 5; TypeVariable (java.lang.reflect.TypeVariable) 4; MutableObjectIterator (org.apache.flink.util.MutableObjectIterator) 4