use of org.apache.flink.core.fs.FileInputSplit in project flink by apache.
the class AvroRecordInputFormatTest method doTestDeserializationGenericRecord.
/**
 * Helper method to test GenericRecord deserialisation.
 *
 * @param format the format to test
 * @param parameters the configuration to use
 * @throws IOException thrown if there is an issue
 */
@SuppressWarnings("unchecked")
private void doTestDeserializationGenericRecord(final AvroInputFormat<GenericRecord> format, final Configuration parameters) throws IOException {
    try {
        format.configure(parameters);
        FileInputSplit[] splits = format.createInputSplits(1);
        assertEquals(1, splits.length);
        format.open(splits[0]);
        GenericRecord u = format.nextRecord(null);
        assertNotNull(u);
        assertEquals("The schemas should be equal", userSchema, u.getSchema());
        String name = u.get("name").toString();
        assertNotNull("empty record", name);
        assertEquals("name not equal", TEST_NAME, name);
        // check arrays
        List<CharSequence> sl = (List<CharSequence>) u.get("type_array_string");
        assertEquals("element 0 not equal", TEST_ARRAY_STRING_1, sl.get(0).toString());
        assertEquals("element 1 not equal", TEST_ARRAY_STRING_2, sl.get(1).toString());
        List<Boolean> bl = (List<Boolean>) u.get("type_array_boolean");
        assertEquals("element 0 not equal", TEST_ARRAY_BOOLEAN_1, bl.get(0));
        assertEquals("element 1 not equal", TEST_ARRAY_BOOLEAN_2, bl.get(1));
        // check enums
        GenericData.EnumSymbol enumValue = (GenericData.EnumSymbol) u.get("type_enum");
        assertEquals("enum not equal", TEST_ENUM_COLOR.toString(), enumValue.toString());
        // check maps
        Map<CharSequence, Long> lm = (Map<CharSequence, Long>) u.get("type_map");
        assertEquals("map value of key 1 not equal", TEST_MAP_VALUE1, lm.get(new Utf8(TEST_MAP_KEY1)).longValue());
        assertEquals("map value of key 2 not equal", TEST_MAP_VALUE2, lm.get(new Utf8(TEST_MAP_KEY2)).longValue());
        // a second record should follow, then the end of the input
        assertFalse("expecting second element", format.reachedEnd());
        assertNotNull("expecting second element", format.nextRecord(u));
        assertNull(format.nextRecord(u));
        assertTrue(format.reachedEnd());
    } finally {
        format.close();
    }
}
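
For context, a minimal sketch of how this helper might be invoked. The setup below is an assumption for illustration, not the test class's verbatim code; testFile mirrors the field referenced in the sibling tests:

// Hypothetical caller: builds a GenericRecord-typed format over the same test file.
Configuration parameters = new Configuration();
AvroInputFormat<GenericRecord> format =
        new AvroInputFormat<GenericRecord>(new Path(testFile.getAbsolutePath()), GenericRecord.class);
doTestDeserializationGenericRecord(format, parameters);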
use of org.apache.flink.core.fs.FileInputSplit in project flink by apache.
the class AvroRecordInputFormatTest method testDeserialisationReuseAvroRecordFalse.
/**
 * Test if the AvroInputFormat is able to properly read data from an Avro file
 * when record reuse is disabled.
 *
 * @throws IOException
 */
@Test
public void testDeserialisationReuseAvroRecordFalse() throws IOException {
    Configuration parameters = new Configuration();
    AvroInputFormat<User> format = new AvroInputFormat<User>(new Path(testFile.getAbsolutePath()), User.class);
    format.setReuseAvroValue(false);
    format.configure(parameters);
    FileInputSplit[] splits = format.createInputSplits(1);
    assertEquals(1, splits.length);
    format.open(splits[0]);
    User u = format.nextRecord(null);
    assertNotNull(u);
    String name = u.getName().toString();
    assertNotNull("empty record", name);
    assertEquals("name not equal", TEST_NAME, name);
    // check arrays
    List<CharSequence> sl = u.getTypeArrayString();
    assertEquals("element 0 not equal", TEST_ARRAY_STRING_1, sl.get(0).toString());
    assertEquals("element 1 not equal", TEST_ARRAY_STRING_2, sl.get(1).toString());
    List<Boolean> bl = u.getTypeArrayBoolean();
    assertEquals("element 0 not equal", TEST_ARRAY_BOOLEAN_1, bl.get(0));
    assertEquals("element 1 not equal", TEST_ARRAY_BOOLEAN_2, bl.get(1));
    // check enums
    Colors enumValue = u.getTypeEnum();
    assertEquals("enum not equal", TEST_ENUM_COLOR, enumValue);
    // check maps
    Map<CharSequence, Long> lm = u.getTypeMap();
    assertEquals("map value of key 1 not equal", TEST_MAP_VALUE1, lm.get(new Utf8(TEST_MAP_KEY1)).longValue());
    assertEquals("map value of key 2 not equal", TEST_MAP_VALUE2, lm.get(new Utf8(TEST_MAP_KEY2)).longValue());
    assertFalse("expecting second element", format.reachedEnd());
    assertNotNull("expecting second element", format.nextRecord(u));
    assertNull(format.nextRecord(u));
    assertTrue(format.reachedEnd());
    format.close();
}
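
Since setReuseAvroValue(false) tells the format to ignore the reuse object handed to nextRecord, every call returns a freshly deserialised User. A hedged sketch of a check that depends on this property, assuming a format configured as above:

// Hypothetical check: with reuse disabled, nextRecord(first) must not recycle 'first'.
User first = format.nextRecord(null);
User second = format.nextRecord(first);
assertNotSame("reuse disabled, so instances must differ", first, second);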
use of org.apache.flink.core.fs.FileInputSplit in project flink by apache.
the class FileInputFormat method createInputSplits.
/**
 * Computes the input splits for the file. By default, one file block is one split. If more splits
 * are requested than blocks are available, then a split may be a fraction of a block and splits may cross
 * block boundaries.
 *
 * @param minNumSplits The minimum desired number of file splits.
 * @return The computed file splits.
 *
 * @see org.apache.flink.api.common.io.InputFormat#createInputSplits(int)
 */
@Override
public FileInputSplit[] createInputSplits(int minNumSplits) throws IOException {
    if (minNumSplits < 1) {
        throw new IllegalArgumentException("Number of input splits has to be at least 1.");
    }
    // take the desired number of splits into account
    minNumSplits = Math.max(minNumSplits, this.numSplits);
    final Path path = this.filePath;
    final List<FileInputSplit> inputSplits = new ArrayList<FileInputSplit>(minNumSplits);
    // get all the files that are involved in the splits
    List<FileStatus> files = new ArrayList<FileStatus>();
    long totalLength = 0;
    final FileSystem fs = path.getFileSystem();
    final FileStatus pathFile = fs.getFileStatus(path);
    if (pathFile.isDir()) {
        totalLength += addFilesInDir(path, files, true);
    } else {
        testForUnsplittable(pathFile);
        files.add(pathFile);
        totalLength += pathFile.getLen();
    }
    // if the input is unsplittable, create one split per file that covers the whole file, and return
    if (unsplittable) {
        int splitNum = 0;
        for (final FileStatus file : files) {
            final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, file.getLen());
            Set<String> hosts = new HashSet<String>();
            for (BlockLocation block : blocks) {
                hosts.addAll(Arrays.asList(block.getHosts()));
            }
            long len = file.getLen();
            if (testForUnsplittable(file)) {
                len = READ_WHOLE_SPLIT_FLAG;
            }
            FileInputSplit fis = new FileInputSplit(splitNum++, file.getPath(), 0, len, hosts.toArray(new String[hosts.size()]));
            inputSplits.add(fis);
        }
        return inputSplits.toArray(new FileInputSplit[inputSplits.size()]);
    }
    final long maxSplitSize = (minNumSplits < 1) ? Long.MAX_VALUE : (totalLength / minNumSplits + (totalLength % minNumSplits == 0 ? 0 : 1));
    // now that we have the files, generate the splits
    int splitNum = 0;
    for (final FileStatus file : files) {
        final long len = file.getLen();
        final long blockSize = file.getBlockSize();
        final long minSplitSize;
        if (this.minSplitSize <= blockSize) {
            minSplitSize = this.minSplitSize;
        } else {
            if (LOG.isWarnEnabled()) {
                LOG.warn("Minimal split size of " + this.minSplitSize + " is larger than the block size of " + blockSize + ". Decreasing minimal split size to block size.");
            }
            minSplitSize = blockSize;
        }
        final long splitSize = Math.max(minSplitSize, Math.min(maxSplitSize, blockSize));
        final long halfSplit = splitSize >>> 1;
        final long maxBytesForLastSplit = (long) (splitSize * MAX_SPLIT_SIZE_DISCREPANCY);
        if (len > 0) {
            // get the block locations and make sure they are in order with respect to their offset
            final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, len);
            Arrays.sort(blocks);
            long bytesUnassigned = len;
            long position = 0;
            int blockIndex = 0;
            while (bytesUnassigned > maxBytesForLastSplit) {
                // get the block containing the majority of the data
                blockIndex = getBlockIndexForPosition(blocks, position, halfSplit, blockIndex);
                // create a new split
                FileInputSplit fis = new FileInputSplit(splitNum++, file.getPath(), position, splitSize, blocks[blockIndex].getHosts());
                inputSplits.add(fis);
                // adjust the positions
                position += splitSize;
                bytesUnassigned -= splitSize;
            }
            // assign the last split
            if (bytesUnassigned > 0) {
                blockIndex = getBlockIndexForPosition(blocks, position, halfSplit, blockIndex);
                final FileInputSplit fis = new FileInputSplit(splitNum++, file.getPath(), position, bytesUnassigned, blocks[blockIndex].getHosts());
                inputSplits.add(fis);
            }
        } else {
            // special case with a file of zero bytes size
            final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, 0);
            String[] hosts;
            if (blocks.length > 0) {
                hosts = blocks[0].getHosts();
            } else {
                hosts = new String[0];
            }
            final FileInputSplit fis = new FileInputSplit(splitNum++, file.getPath(), 0, 0, hosts);
            inputSplits.add(fis);
        }
    }
    return inputSplits.toArray(new FileInputSplit[inputSplits.size()]);
}
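
The sizing logic above reduces to two bounds: maxSplitSize = ceil(totalLength / minNumSplits) keeps the split count at or above the request, and the block size caps each split from above. A standalone sketch of just that arithmetic, with illustrative names rather than Flink API:

// Sketch of the sizing arithmetic used in createInputSplits (illustrative names).
static long splitSizeFor(long totalLength, int minNumSplits, long blockSize, long configuredMinSplitSize) {
    // ceil(totalLength / minNumSplits): small enough that at least minNumSplits splits result
    long maxSplitSize = totalLength / minNumSplits + (totalLength % minNumSplits == 0 ? 0 : 1);
    // the configured minimum is capped at the block size, matching the warning branch above
    long minSplitSize = Math.min(configuredMinSplitSize, blockSize);
    return Math.max(minSplitSize, Math.min(maxSplitSize, blockSize));
}

For example, totalLength = 100, minNumSplits = 3, blockSize = 64 and a small configured minimum give min(34, 64) = 34 bytes per split; per the loop above, the final split may absorb up to MAX_SPLIT_SIZE_DISCREPANCY times that before a separate last split is emitted.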
use of org.apache.flink.core.fs.FileInputSplit in project flink by apache.
the class FileInputFormatTest method testFileInputSplit.
// ------------------------------------------------------------------------
// Unsplittable input files
// ------------------------------------------------------------------------
// ---- Tests for .deflate ---------
/**
 * Creates a directory of files with the .deflate extension and checks that one split is
 * created per file. Each split has to start at the beginning of its file.
 */
@Test
public void testFileInputSplit() {
    try {
        String tempFile = TestFileUtils.createTempFileDirExtension(".deflate", "some", "stupid", "meaningless", "files");
        final DummyFileInputFormat format = new DummyFileInputFormat();
        format.setFilePath(tempFile);
        format.configure(new Configuration());
        FileInputSplit[] splits = format.createInputSplits(2);
        Assert.assertEquals(4, splits.length);
        for (FileInputSplit split : splits) {
            // unsplittable deflate files have this size as a flag for "read whole file"
            Assert.assertEquals(-1L, split.getLength());
            // always read from the beginning.
            Assert.assertEquals(0L, split.getStart());
        }
        // test if this also works for "mixed" directories
        TestFileUtils.createTempFileInDirectory(tempFile.replace("file:", ""), "this creates a test file with a random extension (at least not .deflate)");
        final DummyFileInputFormat formatMixed = new DummyFileInputFormat();
        formatMixed.setFilePath(tempFile);
        formatMixed.configure(new Configuration());
        FileInputSplit[] splitsMixed = formatMixed.createInputSplits(2);
        Assert.assertEquals(5, splitsMixed.length);
        for (FileInputSplit split : splitsMixed) {
            if (split.getPath().getName().endsWith(".deflate")) {
                // unsplittable deflate files have this size as a flag for "read whole file"
                Assert.assertEquals(-1L, split.getLength());
                // always read from the beginning.
                Assert.assertEquals(0L, split.getStart());
            } else {
                Assert.assertEquals(0L, split.getStart());
                Assert.assertTrue("split size not correct", split.getLength() > 0);
            }
        }
    } catch (Exception ex) {
        ex.printStackTrace();
        Assert.fail(ex.getMessage());
    }
}
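
The -1L asserted above is FileInputFormat's READ_WHOLE_SPLIT_FLAG sentinel. A sketch of how a consumer of these splits might interpret it; this is illustrative, not the verbatim open() logic:

// Sketch: interpreting the length sentinel when consuming a split.
long length = split.getLength();
if (length == -1L) {
    // READ_WHOLE_SPLIT_FLAG: unsplittable (e.g. .deflate) file; open at offset 0 and read to EOF
} else {
    // splittable file: seek to split.getStart() and read 'length' bytes
}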
use of org.apache.flink.core.fs.FileInputSplit in project flink by apache.
the class FileInputFormatTest method testDecorateInputStream.
// ------------------------------------------------------------------------
// Stream Decoration
// ------------------------------------------------------------------------
@Test
public void testDecorateInputStream() throws IOException {
    // create a temporary file with 3 blocks
    final File tempFile = File.createTempFile("input-stream-decoration-test", "tmp");
    tempFile.deleteOnExit();
    final int blockSize = 8;
    final int numBlocks = 3;
    FileOutputStream fileOutputStream = new FileOutputStream(tempFile);
    for (int i = 0; i < blockSize * numBlocks; i++) {
        fileOutputStream.write(new byte[] { (byte) i });
    }
    fileOutputStream.close();
    final Configuration config = new Configuration();
    final FileInputFormat<byte[]> inputFormat = new MyDecoratedInputFormat();
    inputFormat.setFilePath(tempFile.toURI().toString());
    inputFormat.configure(config);
    inputFormat.openInputFormat();
    FileInputSplit[] inputSplits = inputFormat.createInputSplits(3);
    byte[] bytes = null;
    byte prev = 0;
    for (FileInputSplit inputSplit : inputSplits) {
        inputFormat.open(inputSplit);
        while (!inputFormat.reachedEnd()) {
            if ((bytes = inputFormat.nextRecord(bytes)) != null) {
                // the decorated stream inverts each byte, so byte i is read back as ~i == -(i + 1)
                Assert.assertArrayEquals(new byte[] { --prev }, bytes);
            }
        }
    }
    inputFormat.closeInputFormat();
}
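
MyDecoratedInputFormat is not shown in this excerpt, but the assertion (--prev against bytes written as 0, 1, 2, ...) implies its decoration returns each byte bitwise-inverted, since ~i == -(i + 1). A minimal sketch of such a decorating stream, offered as an assumption about the missing class:

import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;

// Hypothetical decoration consistent with the assertions above: invert every payload byte.
final class InvertingInputStream extends FilterInputStream {
    InvertingInputStream(InputStream in) {
        super(in);
    }

    @Override
    public int read() throws IOException {
        int b = super.read();
        // preserve the EOF marker (-1); invert real bytes and keep them in the 0..255 range
        return b < 0 ? b : (~b & 0xFF);
    }
}

A full implementation would also override read(byte[], int, int) so bulk reads are inverted consistently.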