Search in sources :

Example 81 with FileInputSplit

use of org.apache.flink.core.fs.FileInputSplit in project flink by apache.

the class AvroRecordInputFormatTest method doTestDeserializationGenericRecord.

/**
	 * Helper method to test GenericRecord deserialization: reads records from the
	 * test file and verifies scalar, array, enum, and map fields against the
	 * expected TEST_* constants.
	 * 
	 * @param format
	 *            the format to test
	 * @param parameters
	 *            the configuration to use
	 * @throws IOException
	 *             thrown if there is an issue reading the input
	 */
@SuppressWarnings("unchecked")
private void doTestDeserializationGenericRecord(final AvroInputFormat<GenericRecord> format, final Configuration parameters) throws IOException {
    try {
        format.configure(parameters);
        FileInputSplit[] splits = format.createInputSplits(1);
        // JUnit convention: expected value first, actual second.
        assertEquals(1, splits.length);
        format.open(splits[0]);
        GenericRecord u = format.nextRecord(null);
        assertNotNull(u);
        assertEquals("The schemas should be equal", userSchema, u.getSchema());
        String name = u.get("name").toString();
        assertNotNull("empty record", name);
        assertEquals("name not equal", TEST_NAME, name);
        // check arrays
        List<CharSequence> sl = (List<CharSequence>) u.get("type_array_string");
        assertEquals("element 0 not equal", TEST_ARRAY_STRING_1, sl.get(0).toString());
        assertEquals("element 1 not equal", TEST_ARRAY_STRING_2, sl.get(1).toString());
        List<Boolean> bl = (List<Boolean>) u.get("type_array_boolean");
        assertEquals("element 0 not equal", TEST_ARRAY_BOOLEAN_1, bl.get(0));
        assertEquals("element 1 not equal", TEST_ARRAY_BOOLEAN_2, bl.get(1));
        // check enums
        GenericData.EnumSymbol enumValue = (GenericData.EnumSymbol) u.get("type_enum");
        assertEquals("enum not equal", TEST_ENUM_COLOR.toString(), enumValue.toString());
        // check maps; Avro map keys are Utf8, so look up with Utf8-wrapped keys
        Map<CharSequence, Long> lm = (Map<CharSequence, Long>) u.get("type_map");
        assertEquals("map value of key 1 not equal", TEST_MAP_VALUE1, lm.get(new Utf8(TEST_MAP_KEY1)).longValue());
        assertEquals("map value of key 2 not equal", TEST_MAP_VALUE2, lm.get(new Utf8(TEST_MAP_KEY2)).longValue());
        assertFalse("expecting second element", format.reachedEnd());
        assertNotNull("expecting second element", format.nextRecord(u));
        // exactly two records in the test file
        assertNull(format.nextRecord(u));
        assertTrue(format.reachedEnd());
    } finally {
        // always release the input, even when an assertion fails
        format.close();
    }
}
Also used : GenericData(org.apache.avro.generic.GenericData) FileInputSplit(org.apache.flink.core.fs.FileInputSplit) Utf8(org.apache.avro.util.Utf8) GenericRecord(org.apache.avro.generic.GenericRecord)

Example 82 with FileInputSplit

use of org.apache.flink.core.fs.FileInputSplit in project flink by apache.

the class AvroRecordInputFormatTest method testDeserialisationReuseAvroRecordFalse.

/**
	 * Test if the AvroInputFormat is able to properly read data from an Avro file
	 * when record-object reuse is disabled (each call to nextRecord returns a new
	 * instance instead of mutating the passed-in one).
	 * @throws IOException thrown if there is an issue reading the test file
	 */
@Test
public void testDeserialisationReuseAvroRecordFalse() throws IOException {
    Configuration parameters = new Configuration();
    AvroInputFormat<User> format = new AvroInputFormat<User>(new Path(testFile.getAbsolutePath()), User.class);
    try {
        // disable object reuse: the format must allocate a fresh record per call
        format.setReuseAvroValue(false);
        format.configure(parameters);
        FileInputSplit[] splits = format.createInputSplits(1);
        // JUnit convention: expected value first, actual second.
        assertEquals(1, splits.length);
        format.open(splits[0]);
        User u = format.nextRecord(null);
        assertNotNull(u);
        String name = u.getName().toString();
        assertNotNull("empty record", name);
        assertEquals("name not equal", TEST_NAME, name);
        // check arrays
        List<CharSequence> sl = u.getTypeArrayString();
        assertEquals("element 0 not equal", TEST_ARRAY_STRING_1, sl.get(0).toString());
        assertEquals("element 1 not equal", TEST_ARRAY_STRING_2, sl.get(1).toString());
        List<Boolean> bl = u.getTypeArrayBoolean();
        assertEquals("element 0 not equal", TEST_ARRAY_BOOLEAN_1, bl.get(0));
        assertEquals("element 1 not equal", TEST_ARRAY_BOOLEAN_2, bl.get(1));
        // check enums
        Colors enumValue = u.getTypeEnum();
        assertEquals("enum not equal", TEST_ENUM_COLOR, enumValue);
        // check maps; Avro map keys are Utf8, so look up with Utf8-wrapped keys
        Map<CharSequence, Long> lm = u.getTypeMap();
        assertEquals("map value of key 1 not equal", TEST_MAP_VALUE1, lm.get(new Utf8(TEST_MAP_KEY1)).longValue());
        assertEquals("map value of key 2 not equal", TEST_MAP_VALUE2, lm.get(new Utf8(TEST_MAP_KEY2)).longValue());
        assertFalse("expecting second element", format.reachedEnd());
        assertNotNull("expecting second element", format.nextRecord(u));
        // exactly two records in the test file
        assertNull(format.nextRecord(u));
        assertTrue(format.reachedEnd());
    } finally {
        // close in finally so the format is released even if an assertion fails
        format.close();
    }
}
Also used : Path(org.apache.flink.core.fs.Path) User(org.apache.flink.api.io.avro.generated.User) Configuration(org.apache.flink.configuration.Configuration) AvroInputFormat(org.apache.flink.api.java.io.AvroInputFormat) FileInputSplit(org.apache.flink.core.fs.FileInputSplit) Colors(org.apache.flink.api.io.avro.generated.Colors) Utf8(org.apache.avro.util.Utf8) Test(org.junit.Test)

Example 83 with FileInputSplit

use of org.apache.flink.core.fs.FileInputSplit in project flink by apache.

the class FileInputFormat method createInputSplits.

/**
	 * Computes the input splits for the file. By default, one file block is one split. If more splits
	 * are requested than blocks are available, then a split may be a fraction of a block and splits may cross
	 * block boundaries.
	 * 
	 * @param minNumSplits The minimum desired number of file splits.
	 * @return The computed file splits.
	 * @throws IOException thrown if the file system cannot be accessed while enumerating
	 *             files or block locations
	 * 
	 * @see org.apache.flink.api.common.io.InputFormat#createInputSplits(int)
	 */
@Override
public FileInputSplit[] createInputSplits(int minNumSplits) throws IOException {
    if (minNumSplits < 1) {
        throw new IllegalArgumentException("Number of input splits has to be at least 1.");
    }
    // take the desired number of splits into account
    minNumSplits = Math.max(minNumSplits, this.numSplits);
    final Path path = this.filePath;
    final List<FileInputSplit> inputSplits = new ArrayList<FileInputSplit>(minNumSplits);
    // get all the files that are involved in the splits
    List<FileStatus> files = new ArrayList<FileStatus>();
    long totalLength = 0;
    final FileSystem fs = path.getFileSystem();
    final FileStatus pathFile = fs.getFileStatus(path);
    if (pathFile.isDir()) {
        // directory input: recursively collect contained files and sum their lengths
        totalLength += addFilesInDir(path, files, true);
    } else {
        // single-file input; testForUnsplittable may set the 'unsplittable' flag
        // as a side effect (e.g. for compressed files)
        testForUnsplittable(pathFile);
        files.add(pathFile);
        totalLength += pathFile.getLen();
    }
    // returns if unsplittable
    if (unsplittable) {
        // unsplittable files (e.g. compressed): emit exactly one split per file,
        // covering the whole file and located on the union of all block hosts
        int splitNum = 0;
        for (final FileStatus file : files) {
            final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, file.getLen());
            Set<String> hosts = new HashSet<String>();
            for (BlockLocation block : blocks) {
                hosts.addAll(Arrays.asList(block.getHosts()));
            }
            long len = file.getLen();
            if (testForUnsplittable(file)) {
                // sentinel length telling the reader to consume the entire file
                len = READ_WHOLE_SPLIT_FLAG;
            }
            FileInputSplit fis = new FileInputSplit(splitNum++, file.getPath(), 0, len, hosts.toArray(new String[hosts.size()]));
            inputSplits.add(fis);
        }
        return inputSplits.toArray(new FileInputSplit[inputSplits.size()]);
    }
    // target split size = ceil(totalLength / minNumSplits); the < 1 guard here is
    // defensive (minNumSplits was already validated above)
    final long maxSplitSize = (minNumSplits < 1) ? Long.MAX_VALUE : (totalLength / minNumSplits + (totalLength % minNumSplits == 0 ? 0 : 1));
    // now that we have the files, generate the splits
    int splitNum = 0;
    for (final FileStatus file : files) {
        final long len = file.getLen();
        final long blockSize = file.getBlockSize();
        final long minSplitSize;
        // clamp the configured minimal split size to the file's block size
        if (this.minSplitSize <= blockSize) {
            minSplitSize = this.minSplitSize;
        } else {
            if (LOG.isWarnEnabled()) {
                LOG.warn("Minimal split size of " + this.minSplitSize + " is larger than the block size of " + blockSize + ". Decreasing minimal split size to block size.");
            }
            minSplitSize = blockSize;
        }
        // effective split size: bounded below by minSplitSize, above by maxSplitSize,
        // preferring one block per split
        final long splitSize = Math.max(minSplitSize, Math.min(maxSplitSize, blockSize));
        // offset used to pick the block holding the majority of a split's data
        final long halfSplit = splitSize >>> 1;
        // the last split may grow up to this size to avoid a tiny trailing split
        final long maxBytesForLastSplit = (long) (splitSize * MAX_SPLIT_SIZE_DISCREPANCY);
        if (len > 0) {
            // get the block locations and make sure they are in order with respect to their offset
            final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, len);
            Arrays.sort(blocks);
            long bytesUnassigned = len;
            long position = 0;
            int blockIndex = 0;
            // carve fixed-size splits until only the (possibly enlarged) last split remains
            while (bytesUnassigned > maxBytesForLastSplit) {
                // get the block containing the majority of the data
                blockIndex = getBlockIndexForPosition(blocks, position, halfSplit, blockIndex);
                // create a new split
                FileInputSplit fis = new FileInputSplit(splitNum++, file.getPath(), position, splitSize, blocks[blockIndex].getHosts());
                inputSplits.add(fis);
                // adjust the positions
                position += splitSize;
                bytesUnassigned -= splitSize;
            }
            // assign the last split
            if (bytesUnassigned > 0) {
                blockIndex = getBlockIndexForPosition(blocks, position, halfSplit, blockIndex);
                final FileInputSplit fis = new FileInputSplit(splitNum++, file.getPath(), position, bytesUnassigned, blocks[blockIndex].getHosts());
                inputSplits.add(fis);
            }
        } else {
            // special case with a file of zero bytes size
            final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, 0);
            String[] hosts;
            if (blocks.length > 0) {
                hosts = blocks[0].getHosts();
            } else {
                hosts = new String[0];
            }
            final FileInputSplit fis = new FileInputSplit(splitNum++, file.getPath(), 0, 0, hosts);
            inputSplits.add(fis);
        }
    }
    return inputSplits.toArray(new FileInputSplit[inputSplits.size()]);
}
Also used : Path(org.apache.flink.core.fs.Path) FileStatus(org.apache.flink.core.fs.FileStatus) ArrayList(java.util.ArrayList) BlockLocation(org.apache.flink.core.fs.BlockLocation) FileInputSplit(org.apache.flink.core.fs.FileInputSplit) FileSystem(org.apache.flink.core.fs.FileSystem) HashSet(java.util.HashSet)

Example 84 with FileInputSplit

use of org.apache.flink.core.fs.FileInputSplit in project flink by apache.

the class FileInputFormatTest method testFileInputSplit.

// ------------------------------------------------------------------------
//  Unsplittable input files
// ------------------------------------------------------------------------
// ---- Tests for .deflate ---------
/**
	 * Create directory with files with .deflate extension and see if it creates a split
	 * for each file. Each split has to start from the beginning.
	 */
@Test
public void testFileInputSplit() {
    try {
        String tempFile = TestFileUtils.createTempFileDirExtension(".deflate", "some", "stupid", "meaningless", "files");
        final DummyFileInputFormat format = new DummyFileInputFormat();
        format.setFilePath(tempFile);
        format.configure(new Configuration());
        // ask for 2 splits, but unsplittable files force one split per file
        FileInputSplit[] splits = format.createInputSplits(2);
        Assert.assertEquals(4, splits.length);
        for (FileInputSplit split : splits) {
            // unsplittable deflate files have this size as a flag for "read whole file"
            Assert.assertEquals(-1L, split.getLength());
            // always read from the beginning.
            Assert.assertEquals(0L, split.getStart());
        }
        // test if this also works for "mixed" directories
        TestFileUtils.createTempFileInDirectory(tempFile.replace("file:", ""), "this creates a test file with a random extension (at least not .deflate)");
        final DummyFileInputFormat formatMixed = new DummyFileInputFormat();
        formatMixed.setFilePath(tempFile);
        formatMixed.configure(new Configuration());
        FileInputSplit[] splitsMixed = formatMixed.createInputSplits(2);
        Assert.assertEquals(5, splitsMixed.length);
        for (FileInputSplit split : splitsMixed) {
            if (split.getPath().getName().endsWith(".deflate")) {
                // unsplittable deflate files have this size as a flag for "read whole file"
                Assert.assertEquals(-1L, split.getLength());
                // always read from the beginning.
                Assert.assertEquals(0L, split.getStart());
            } else {
                Assert.assertEquals(0L, split.getStart());
                Assert.assertTrue("split size not correct", split.getLength() > 0);
            }
        }
    } catch (Exception ex) {
        // Rethrow with the original exception as the cause, so the full stack
        // trace appears in the test report instead of being discarded via
        // fail(ex.getMessage()) (whose message may even be null).
        throw new AssertionError("Test failed with exception: " + ex.getMessage(), ex);
    }
}
Also used : FileInputSplit(org.apache.flink.core.fs.FileInputSplit) Configuration(org.apache.flink.configuration.Configuration) IOException(java.io.IOException) Test(org.junit.Test)

Example 85 with FileInputSplit

use of org.apache.flink.core.fs.FileInputSplit in project flink by apache.

the class FileInputFormatTest method testDecorateInputStream.

// ------------------------------------------------------------------------
//  Stream Decoration
// ------------------------------------------------------------------------
/**
	 * Verifies that a decorated input stream is applied to every split: the
	 * decorating format inverts each byte, so reading back must yield the
	 * negated sequence of the bytes written.
	 */
@Test
public void testDecorateInputStream() throws IOException {
    // create temporary file with 3 blocks
    final File tempFile = File.createTempFile("input-stream-decoration-test", "tmp");
    tempFile.deleteOnExit();
    final int blockSize = 8;
    final int numBlocks = 3;
    // build the content 0, 1, 2, ... and write it in one call; try-with-resources
    // guarantees the stream is closed even if the write fails
    final byte[] data = new byte[blockSize * numBlocks];
    for (int i = 0; i < data.length; i++) {
        data[i] = (byte) i;
    }
    try (FileOutputStream fileOutputStream = new FileOutputStream(tempFile)) {
        fileOutputStream.write(data);
    }
    final Configuration config = new Configuration();
    final FileInputFormat<byte[]> inputFormat = new MyDecoratedInputFormat();
    inputFormat.setFilePath(tempFile.toURI().toString());
    inputFormat.configure(config);
    inputFormat.openInputFormat();
    FileInputSplit[] inputSplits = inputFormat.createInputSplits(3);
    byte[] bytes = null;
    // the decorated stream negates each byte, so expected values are -1, -2, ...
    byte prev = 0;
    for (FileInputSplit inputSplit : inputSplits) {
        inputFormat.open(inputSplit);
        while (!inputFormat.reachedEnd()) {
            if ((bytes = inputFormat.nextRecord(bytes)) != null) {
                Assert.assertArrayEquals(new byte[] { --prev }, bytes);
            }
        }
    }
    inputFormat.closeInputFormat();
}
Also used : FileInputSplit(org.apache.flink.core.fs.FileInputSplit) Configuration(org.apache.flink.configuration.Configuration) FileOutputStream(java.io.FileOutputStream) File(java.io.File) Test(org.junit.Test)

Aggregations

FileInputSplit (org.apache.flink.core.fs.FileInputSplit)140 Test (org.junit.Test)119 Configuration (org.apache.flink.configuration.Configuration)93 Path (org.apache.flink.core.fs.Path)59 IOException (java.io.IOException)45 File (java.io.File)36 FileOutputStream (java.io.FileOutputStream)23 TypeInformation (org.apache.flink.api.common.typeinfo.TypeInformation)20 Row (org.apache.flink.types.Row)20 OutputStreamWriter (java.io.OutputStreamWriter)18 ParseException (org.apache.flink.api.common.io.ParseException)17 ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment)17 DoubleValue (org.apache.flink.types.DoubleValue)17 IntValue (org.apache.flink.types.IntValue)17 LongValue (org.apache.flink.types.LongValue)17 StringValue (org.apache.flink.types.StringValue)17 Value (org.apache.flink.types.Value)17 Plan (org.apache.flink.api.common.Plan)12 ReplicatingInputFormat (org.apache.flink.api.common.io.ReplicatingInputFormat)12 Tuple1 (org.apache.flink.api.java.tuple.Tuple1)12