Search in sources :

Example 86 with FileInputSplit

use of org.apache.flink.core.fs.FileInputSplit in project flink by apache.

the class FileInputFormatTest method testReadMultiplePatterns.

/**
 * Checks that no input splits are produced when every file in the input directory is named by
 * one of the patterns in the second list handed to the {@code GlobFilePathFilter}.
 */
@Test
public void testReadMultiplePatterns() throws Exception {
    // two files on disk; each one is matched by a pattern in the filter's second list below
    final File textFile = temporaryFolder.newFile("dataFile1.txt");
    final File binaryFile = temporaryFolder.newFile("another_file.bin");
    final byte[] payload = "CONTENTS".getBytes(ConfigConstants.DEFAULT_CHARSET);
    createTempFiles(payload, textFile, binaryFile);

    // point the format at the directory that holds both files
    final String rootUri = temporaryFolder.getRoot().toURI().toString();
    final DummyFileInputFormat format = new DummyFileInputFormat();
    format.setFilePath(rootUri);
    format.configure(new Configuration());

    // NOTE(review): presumably the first list holds include patterns and the second one
    // exclude patterns -- confirm against GlobFilePathFilter
    final GlobFilePathFilter filter = new GlobFilePathFilter(
            Collections.singletonList("**"),
            Arrays.asList("**/another_file.bin", "**/dataFile1.txt"));
    format.setFilesFilter(filter);

    // neither file may survive the filter
    final FileInputSplit[] splits = format.createInputSplits(1);
    Assert.assertEquals(0, splits.length);
}
Also used : FileInputSplit(org.apache.flink.core.fs.FileInputSplit) Configuration(org.apache.flink.configuration.Configuration) File(java.io.File) Test(org.junit.Test)

Example 87 with FileInputSplit

use of org.apache.flink.core.fs.FileInputSplit in project flink by apache.

the class GenericCsvInputFormatTest method testReadWithCharset.

/**
 * Writes the same delimited record in several encodings and checks that the format, once it is
 * told the matching charset, parses the three fields back unchanged.
 *
 * <p>The format is closed at the end of every iteration so that the stream opened for one
 * charset is released before the format is reopened for the next one.
 *
 * @throws IOException if the temporary file cannot be written or the format fails to read it
 */
@Test
public void testReadWithCharset() throws IOException {
    // Unicode row fragments
    String[] records = new String[] { "Ȏȟ", "Flink", "ȋȏ" };
    // Unicode delimiter
    String delimiter = "׀׀";
    String fileContent = StringUtils.join(records, delimiter);
    // StringValueParser does not use charset so rely on StringParser
    GenericCsvInputFormat<String[]> format = new GenericCsvInputFormat<String[]>() {

        @Override
        public String[] readRecord(String[] target, byte[] bytes, int offset, int numBytes) throws IOException {
            return parseRecord(target, bytes, offset, numBytes) ? target : null;
        }
    };
    format.setFilePath("file:///some/file/that/will/not/be/read");
    for (String charset : new String[] { "UTF-8", "UTF-16BE", "UTF-16LE" }) {
        File tempFile = File.createTempFile("test_contents", "tmp");
        tempFile.deleteOnExit();
        // write string with proper encoding
        try (Writer out = new OutputStreamWriter(new FileOutputStream(tempFile), charset)) {
            out.write(fileContent);
        }
        FileInputSplit split = new FileInputSplit(0, new Path(tempFile.toURI().toString()), 0, tempFile.length(), new String[] { "localhost" });
        format.setFieldDelimiter(delimiter);
        format.setFieldTypesGeneric(String.class, String.class, String.class);
        // use the same encoding to parse the file as used to read the file;
        // the field delimiter is reinterpreted when the charset is set
        format.setCharset(charset);
        format.configure(new Configuration());
        format.open(split);
        try {
            String[] values = new String[] { "", "", "" };
            values = format.nextRecord(values);
            // validate results
            assertNotNull(values);
            for (int i = 0; i < records.length; i++) {
                assertEquals(records[i], values[i]);
            }
            // the file holds exactly one record
            assertNull(format.nextRecord(values));
            assertTrue(format.reachedEnd());
        } finally {
            // pair every open() with a close(), even when an assertion above fails
            format.close();
        }
    }
}
Also used : Path(org.apache.flink.core.fs.Path) FileInputSplit(org.apache.flink.core.fs.FileInputSplit) Configuration(org.apache.flink.configuration.Configuration) FileOutputStream(java.io.FileOutputStream) OutputStreamWriter(java.io.OutputStreamWriter) DelimitedInputFormatTest.createTempFile(org.apache.flink.api.common.io.DelimitedInputFormatTest.createTempFile) File(java.io.File) OutputStreamWriter(java.io.OutputStreamWriter) Writer(java.io.Writer) Test(org.junit.Test)

Example 88 with FileInputSplit

use of org.apache.flink.core.fs.FileInputSplit in project flink by apache.

the class GenericCsvInputFormatTest method readWithHeaderLine.

/**
 * Reads a four-column file whose first line is a header and checks that, with header skipping
 * enabled, exactly the two data rows are returned before the format reports its end.
 *
 * <p>Any exception is reported as an {@link AssertionError} carrying the same message as
 * before, with the original exception attached as the cause so its stack trace is not lost.
 */
@Test
public void readWithHeaderLine() {
    try {
        final String fileContent = "colname-1|colname-2|some name 3|column four|\n" + "123|abc|456|def|\n" + "987|xyz|654|pqr|\n";
        final FileInputSplit split = createTempFile(fileContent);
        final Configuration parameters = new Configuration();
        format.setFieldDelimiter("|");
        format.setFieldTypesGeneric(IntValue.class, StringValue.class, IntValue.class, StringValue.class);
        format.setSkipFirstLineAsHeader(true);
        format.configure(parameters);
        format.open(split);
        Value[] values = new Value[] { new IntValue(), new StringValue(), new IntValue(), new StringValue() };
        // first line is skipped as header
        // first row (= second line)
        assertNotNull(format.nextRecord(values));
        // second row (= third line)
        assertNotNull(format.nextRecord(values));
        // exhausted
        assertNull(format.nextRecord(values));
        // exhausted
        assertTrue(format.reachedEnd());
    } catch (Exception ex) {
        // same failure type and message as fail(...), but the cause is kept for diagnosis
        throw new AssertionError("Test failed due to a " + ex.getClass().getSimpleName() + ": " + ex.getMessage(), ex);
    }
}
Also used : FileInputSplit(org.apache.flink.core.fs.FileInputSplit) Configuration(org.apache.flink.configuration.Configuration) IntValue(org.apache.flink.types.IntValue) DoubleValue(org.apache.flink.types.DoubleValue) LongValue(org.apache.flink.types.LongValue) Value(org.apache.flink.types.Value) StringValue(org.apache.flink.types.StringValue) StringValue(org.apache.flink.types.StringValue) IntValue(org.apache.flink.types.IntValue) IOException(java.io.IOException) Test(org.junit.Test)

Example 89 with FileInputSplit

use of org.apache.flink.core.fs.FileInputSplit in project flink by apache.

the class DelimitedInputFormatTest method testOpen.

/**
 * Opens the format on a small two-line file with a five byte buffer and checks the split
 * start, the split length and the buffer size that the format reports afterwards.
 *
 * @throws IOException if the temporary file cannot be created or opened
 */
@Test
public void testOpen() throws IOException {
    final String content = "my mocked line 1\nmy mocked line 2\n";
    final int bufferSize = 5;
    // NOTE(review): the expected length is one buffer short of the content length --
    // presumably open() already consumes the first buffer; confirm against the format
    final int expectedSplitLength = content.length() - bufferSize;

    final FileInputSplit split = createTempFile(content);
    format.setBufferSize(bufferSize);
    format.open(split);

    assertEquals(0, format.splitStart);
    assertEquals(expectedSplitLength, format.splitLength);
    assertEquals(bufferSize, format.getBufferSize());
}
Also used : FileInputSplit(org.apache.flink.core.fs.FileInputSplit) Test(org.junit.Test)

Example 90 with FileInputSplit

use of org.apache.flink.core.fs.FileInputSplit in project flink by apache.

the class DelimitedInputFormatTest method testReadRecordsLargerThanBuffer.

/**
 * Reads a file whose records are all longer than the eight byte read buffer, using two
 * consecutive splits, and checks that the four records come back exactly once and in order.
 *
 * <p>The second split covers only the remainder of the file (total length minus the first
 * split's length), so the two splits partition the file without running past its end.
 *
 * @throws IOException if the temporary file cannot be created or read
 */
@Test
public void testReadRecordsLargerThanBuffer() throws IOException {
    final String myString = "aaaaaaaaaaaaaaaaaaaaa\n" + "bbbbbbbbbbbbbbbbbbbbbbbbb\n" + "ccccccccccccccccccc\n" + "ddddddddddddddddddddddddddddddddddd\n";
    final FileInputSplit split = createTempFile(myString);
    // the fourth constructor argument is a length, not an end offset
    final long firstLength = split.getLength() / 2;
    final long secondLength = split.getLength() - firstLength;
    FileInputSplit split1 = new FileInputSplit(0, split.getPath(), 0, firstLength, split.getHostnames());
    FileInputSplit split2 = new FileInputSplit(1, split.getPath(), firstLength, secondLength, split.getHostnames());
    final Configuration parameters = new Configuration();
    // buffer is deliberately smaller than every record in the file
    format.setBufferSize(8);
    format.configure(parameters);
    String next;
    List<String> result = new ArrayList<String>();
    // drain the first split
    format.open(split1);
    while ((next = format.nextRecord(null)) != null) {
        result.add(next);
    }
    assertNull(format.nextRecord(null));
    assertTrue(format.reachedEnd());
    format.close();
    // drain the second split into the same result list
    format.open(split2);
    while ((next = format.nextRecord(null)) != null) {
        result.add(next);
    }
    assertNull(format.nextRecord(null));
    assertTrue(format.reachedEnd());
    format.close();
    // every line of the file must appear exactly once, in file order
    assertEquals(4, result.size());
    assertEquals(Arrays.asList(myString.split("\n")), result);
}
Also used : FileInputSplit(org.apache.flink.core.fs.FileInputSplit) Configuration(org.apache.flink.configuration.Configuration) ArrayList(java.util.ArrayList) Test(org.junit.Test)

Aggregations

FileInputSplit (org.apache.flink.core.fs.FileInputSplit)140 Test (org.junit.Test)119 Configuration (org.apache.flink.configuration.Configuration)93 Path (org.apache.flink.core.fs.Path)59 IOException (java.io.IOException)45 File (java.io.File)36 FileOutputStream (java.io.FileOutputStream)23 TypeInformation (org.apache.flink.api.common.typeinfo.TypeInformation)20 Row (org.apache.flink.types.Row)20 OutputStreamWriter (java.io.OutputStreamWriter)18 ParseException (org.apache.flink.api.common.io.ParseException)17 ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment)17 DoubleValue (org.apache.flink.types.DoubleValue)17 IntValue (org.apache.flink.types.IntValue)17 LongValue (org.apache.flink.types.LongValue)17 StringValue (org.apache.flink.types.StringValue)17 Value (org.apache.flink.types.Value)17 Plan (org.apache.flink.api.common.Plan)12 ReplicatingInputFormat (org.apache.flink.api.common.io.ReplicatingInputFormat)12 Tuple1 (org.apache.flink.api.java.tuple.Tuple1)12