use of edu.iu.dsc.tws.api.data.FileStatus in project twister2 by DSC-SPIDAL.
the class DataLocalityBatchTaskScheduler method getInputFilesList.
/**
 * Builds the list of input file names for the configured input directory.
 *
 * <p>The directory is read from {@code DataObjectConstants.DINPUT_DIRECTORY}. When the
 * configured file system is {@code DataContext.TWISTER2_HDFS_FILESYSTEM} the status of the
 * path itself is added as the single entry; when it is
 * {@code DataContext.TWISTER2_LOCAL_FILESYSTEM} every entry returned by listing the path is
 * added. Any other file system value yields an empty list.
 *
 * @return the input file names, possibly empty
 * @throws RuntimeException if the file system cannot be accessed; the original
 *     {@link IOException} is attached as the cause
 */
private List<String> getInputFilesList() {
  final List<String> inputDataList = new ArrayList<>();
  String directory = null;
  if (config.get(DataObjectConstants.DINPUT_DIRECTORY) != null) {
    directory = String.valueOf(config.get(DataObjectConstants.DINPUT_DIRECTORY));
  }
  // NOTE(review): directory stays null when the input directory is not configured; how
  // Path reacts to a null string is not visible from here - confirm.
  final Path path = new Path(directory);
  try {
    final FileSystem fileSystem = FileSystemUtils.get(path);
    final Object fileSystemType = config.get(DataObjectConstants.FILE_SYSTEM);
    if (fileSystemType.equals(DataContext.TWISTER2_HDFS_FILESYSTEM)) {
      final FileStatus pathFile = fileSystem.getFileStatus(path);
      inputDataList.add(String.valueOf(pathFile.getPath()));
    } else if (fileSystemType.equals(DataContext.TWISTER2_LOCAL_FILESYSTEM)) {
      for (FileStatus file : fileSystem.listFiles(path)) {
        // String.valueOf never returns null, so no null filter is needed here.
        inputDataList.add(String.valueOf(file.getPath()));
      }
    }
  } catch (IOException e) {
    // Keep the original exception as the cause so the failure can be diagnosed.
    throw new RuntimeException("Not able to get the input files", e);
  }
  return inputDataList;
}
use of edu.iu.dsc.tws.api.data.FileStatus in project twister2 by DSC-SPIDAL.
the class DataLocalityStreamingTaskScheduler method getInputFilesList.
/**
 * Collects the names of the input files found under the configured input directory.
 *
 * <p>For {@code DataContext.TWISTER2_HDFS_FILESYSTEM} only the path itself is reported; for
 * {@code DataContext.TWISTER2_LOCAL_FILESYSTEM} every listed entry of the path is reported.
 * For any other file system value the result is empty.
 *
 * @return the input file names, possibly empty
 * @throws TaskSchedulerException if accessing the file system raises an {@link IOException}
 */
private List<String> getInputFilesList() {
  final Object configuredDirectory = config.get(DataObjectConstants.DINPUT_DIRECTORY);
  final String directory =
      configuredDirectory == null ? null : String.valueOf(configuredDirectory);
  final Path inputPath = new Path(directory);
  final List<String> inputFiles = new ArrayList<>();
  try {
    final FileSystem fs = FileSystemUtils.get(inputPath);
    final Object fsType = config.get(DataObjectConstants.FILE_SYSTEM);
    if (fsType.equals(DataContext.TWISTER2_HDFS_FILESYSTEM)) {
      inputFiles.add(String.valueOf(fs.getFileStatus(inputPath).getPath()));
    } else if (fsType.equals(DataContext.TWISTER2_LOCAL_FILESYSTEM)) {
      for (FileStatus status : fs.listFiles(inputPath)) {
        // String.valueOf always yields a non-null string, so every entry is added.
        inputFiles.add(String.valueOf(status.getPath()));
      }
    }
  } catch (IOException e) {
    throw new TaskSchedulerException("Not able to get the input files", e);
  }
  return inputFiles;
}
use of edu.iu.dsc.tws.api.data.FileStatus in project twister2 by DSC-SPIDAL.
the class BinaryInputPartitioner method createInputSplits.
/**
 * Computes record-aligned input splits for the binary file(s) under the configured path.
 *
 * <p>The total byte length must be a multiple of the record length. The records are divided
 * by {@code minNumSplits}; every split holds the quotient number of records and the first
 * {@code numberOfRecords % minNumSplits} splits hold one record more, so that a single input
 * file is covered completely. Splits never span files.
 *
 * <p>NOTE(review): when the path is a directory with several files, trailing bytes of a file
 * that do not fill a whole split are still left unassigned - confirm whether multi-file
 * input is expected here.
 *
 * @param minNumSplits The minimum desired number of file splits.
 * @return The computed file splits.
 * @throws IllegalArgumentException if {@code minNumSplits} is smaller than 1
 * @throws IllegalStateException if the total length is not a multiple of the record length,
 *     or if one of the files is empty
 * @throws IOException if the file system cannot be accessed
 */
@Override
public FileInputSplit[] createInputSplits(int minNumSplits) throws IOException {
  if (minNumSplits < 1) {
    throw new IllegalArgumentException("Number of input splits has to be at least 1.");
  }
  // NOTE(review): curminNumSplits only pre-sizes the result list; the record arithmetic
  // below uses minNumSplits, as the original code did - confirm this is intended.
  final int curminNumSplits = Math.max(minNumSplits, this.numSplits);
  final Path path = this.filePath;
  final List<FileInputSplit> inputSplits = new ArrayList<FileInputSplit>(curminNumSplits);
  final List<FileStatus> files = new ArrayList<FileStatus>();
  long totalLength = 0;
  final FileSystem fs = FileSystemUtils.get(path);
  final FileStatus pathFile = fs.getFileStatus(path);
  if (pathFile.isDir()) {
    totalLength += sumFilesInDir(path, files, true);
  } else {
    files.add(pathFile);
    totalLength += pathFile.getLen();
  }
  if (totalLength % this.recordLength != 0) {
    throw new IllegalStateException("The Binary file has a incomplete record");
  }
  final long numberOfRecords = totalLength / this.recordLength;
  final long minRecordsForSplit = Math.floorDiv(numberOfRecords, minNumSplits);
  // Number of splits that still have to take one extra record.
  long oddRecords = numberOfRecords % minNumSplits;
  final long minSplitSize = minRecordsForSplit * this.recordLength;
  final long halfSplit = minSplitSize >>> 1;
  // Generate the splits
  int splitNum = 0;
  for (final FileStatus file : files) {
    final long len = file.getLen();
    if (len <= 0) {
      throw new IllegalStateException("The binary file " + file.getPath() + " is Empty");
    }
    // get the block locations and make sure they are in order with respect to their offset
    final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, len);
    Arrays.sort(blocks);
    long bytesUnassigned = len;
    long position = 0;
    int blockIndex = 0;
    while (bytesUnassigned > 0) {
      // The extra record is decided per split, not per file; otherwise every split of the
      // file would be enlarged and the records at the end of the file would be dropped.
      final long currentSplitSize =
          oddRecords > 0 ? minSplitSize + this.recordLength : minSplitSize;
      // A zero-sized split would never advance the position, so stop instead of looping.
      if (currentSplitSize <= 0 || bytesUnassigned < currentSplitSize) {
        break;
      }
      if (oddRecords > 0) {
        oddRecords--;
      }
      // get the block containing the majority of the data
      blockIndex = getBlockIndexForPosition(blocks, position, halfSplit, blockIndex);
      // create a new split
      final FileInputSplit fis = new BinaryInputSplit(splitNum++, file.getPath(), position,
          currentSplitSize, blocks[blockIndex].getHosts());
      inputSplits.add(fis);
      // adjust the positions
      position += currentSplitSize;
      bytesUnassigned -= currentSplitSize;
    }
  }
  return inputSplits.toArray(new FileInputSplit[inputSplits.size()]);
}
use of edu.iu.dsc.tws.api.data.FileStatus in project twister2 by DSC-SPIDAL.
the class CompleteFileInputPartitioner method sumFilesInDir.
/**
 * Enumerates the entries of the given directory, adds every accepted file to {@code files}
 * and sums up their lengths. Accepted sub-directories are visited recursively when
 * {@code enumerateNestedFiles} is true.
 *
 * @param path the directory to enumerate
 * @param files the list that receives the status of every accepted file
 * @param logExcludedFiles whether skipped files and directories are logged at INFO level
 * @return the total length of accepted files.
 * @throws IOException if the file system cannot be accessed
 */
long sumFilesInDir(Path path, List<FileStatus> files, boolean logExcludedFiles)
    throws IOException {
  final FileSystem fs = FileSystemUtils.get(path);
  long length = 0;
  for (FileStatus file : fs.listFiles(path)) {
    if (file.isDir()) {
      if (acceptFile(file) && enumerateNestedFiles) {
        length += sumFilesInDir(file.getPath(), files, logExcludedFiles);
      } else if (logExcludedFiles) {
        // This branch is also taken for accepted directories when nested enumeration is
        // switched off; the message text is kept as it was.
        LOG.log(Level.INFO, "Directory " + file.getPath().toString()
            + " did not pass the file-filter and is excluded.");
      }
    } else {
      if (acceptFile(file)) {
        files.add(file);
        length += file.getLen();
      } else if (logExcludedFiles) {
        // The entry is a regular file here, so it must not be reported as a directory.
        LOG.log(Level.INFO, "File " + file.getPath().toString()
            + " did not pass the file-filter and is excluded.");
      }
    }
  }
  return length;
}
use of edu.iu.dsc.tws.api.data.FileStatus in project twister2 by DSC-SPIDAL.
the class FixedInputPartitioner method createInputSplits.
/**
 * Creates the input splits for the single input file. The split sizes are obtained from
 * {@code getSplitSizes}, which is given the number of splits and {@code dataSize} divided by
 * the number of splits (presumably the number of lines per split - confirm against
 * {@code getSplitSizes}). The splits are laid out back to back starting at offset 0.
 *
 * <p>An empty file yields one zero-length split whose hosts are taken from the first block
 * location, if any.
 *
 * @param minNumSplits Number of minimal input splits, as a hint.
 * @return the computed file splits
 * @throws IllegalArgumentException if the effective number of splits is smaller than 1
 * @throws IllegalStateException if the path resolves to more than one file
 * @throws IOException if the file system cannot be accessed
 */
@Override
public FileInputSplit<OT>[] createInputSplits(int minNumSplits) throws IOException {
  // take the desired number of splits into account
  final int curminNumSplits = Math.max(minNumSplits, this.numSplits);
  if (curminNumSplits < 1) {
    // Fail with a clear message instead of a division by zero or an illegal list capacity.
    throw new IllegalArgumentException("Number of input splits has to be at least 1.");
  }
  final Path path = this.filePath;
  final List<FileInputSplit> inputSplits = new ArrayList<>(curminNumSplits);
  // get all the files that are involved in the splits
  final List<FileStatus> files = new ArrayList<>();
  final FileSystem fs = FileSystemUtils.get(path);
  final FileStatus pathFile = fs.getFileStatus(path);
  if (pathFile.isDir()) {
    sumFilesInDir(path, files, true);
  } else {
    files.add(pathFile);
  }
  if (files.size() > 1) {
    throw new IllegalStateException(
        "FixedInputPartitioner does not support multiple files currently");
  }
  // Generate the splits
  for (final FileStatus file : files) {
    final long lineCount = dataSize;
    final int splSize = (int) (lineCount / curminNumSplits);
    final long[] splitSizes = getSplitSizes(fs, file.getPath(), curminNumSplits, splSize);
    if (file.getLen() > 0) {
      // Offsets are accumulated as long; an int would silently overflow past 2 GiB.
      long position = 0;
      for (int i = 0; i < splitSizes.length; i++) {
        final String[] hosts = new String[0];
        final FileInputSplit fis =
            createSplit(i, file.getPath(), position, splitSizes[i], hosts);
        position += splitSizes[i];
        inputSplits.add(fis);
      }
    } else {
      // TODO need to check this section of the code for correctness
      final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, 0);
      final String[] hosts;
      if (blocks.length > 0) {
        hosts = blocks[0].getHosts();
      } else {
        hosts = new String[0];
      }
      final FileInputSplit fis = createSplit(0, file.getPath(), 0, 0, hosts);
      inputSplits.add(fis);
    }
  }
  return inputSplits.toArray(new FileInputSplit[inputSplits.size()]);
}
Aggregations