use of edu.iu.dsc.tws.api.data.FileStatus in project twister2 by DSC-SPIDAL.
the class CSVInputPartitioner method createInputSplits.
/**
 * Creates the input splits for the single file referenced by {@code filePath}.
 *
 * <p>The number of splits requested from {@code getSplitSizes} is the larger of
 * {@code minNumSplits} and the configured {@code numSplits}. The splits are laid out back to
 * back: each one starts where the previous one ended. An empty file yields exactly one
 * zero-length split.
 *
 * @param minNumSplits the minimum number of splits requested by the caller
 * @return the created splits
 * @throws IOException if the file system or the file status cannot be accessed
 * @throws IllegalStateException if the path resolves to more than one file
 */
@Override
public FileInputSplit<OT>[] createInputSplits(int minNumSplits) throws IOException {
  // take the desired number of splits into account
  int curminNumSplits = Math.max(minNumSplits, this.numSplits);
  final Path path = this.filePath;
  final List<FileInputSplit> inputSplits = new ArrayList<>(curminNumSplits);

  // get all the files that are involved in the splits
  List<FileStatus> files = new ArrayList<>();
  final FileSystem fs = FileSystemUtils.get(path, config);
  final FileStatus pathFile = fs.getFileStatus(path);
  if (pathFile.isDir()) {
    // Only the side effect (filling 'files') is needed; the summed length is not used here.
    sumFilesInDir(path, files, true);
  } else {
    files.add(pathFile);
  }

  if (files.size() > 1) {
    throw new IllegalStateException(
        "CSVInputPartitioner does not support multiple files currently");
  }

  for (final FileStatus file : files) {
    final long lineCount = dataSize;
    int splSize = (int) (lineCount / curminNumSplits);
    final long len = file.getLen();
    long[] splitSizes = getSplitSizes(fs, file.getPath(), curminNumSplits, splSize);

    if (len > 0) {
      // Kept as long: the split sizes are longs and an int accumulator would silently
      // overflow once the running offset exceeds Integer.MAX_VALUE.
      long position = 0;
      for (int i = 0; i < splitSizes.length; i++) {
        String[] hosts = new String[0];
        final FileInputSplit fis
            = new CSVInputSplit(i, file.getPath(), position, splitSizes[i], hosts);
        position += splitSizes[i];
        inputSplits.add(fis);
      }
    } else {
      // Empty file: emit a single zero-length split, using the hosts of the first
      // reported block location when there is one.
      final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, 0);
      String[] hosts;
      if (blocks.length > 0) {
        hosts = blocks[0].getHosts();
      } else {
        hosts = new String[0];
      }
      final FileInputSplit fis = new CSVInputSplit(0, file.getPath(), 0, 0, hosts);
      inputSplits.add(fis);
    }
  }
  return inputSplits.toArray(new FileInputSplit[inputSplits.size()]);
}
use of edu.iu.dsc.tws.api.data.FileStatus in project twister2 by DSC-SPIDAL.
the class CompleteCSVInputPartitioner method sumFilesInDir.
/**
 * Enumerates the entries of the given directory, adding every accepted file to
 * {@code files}, and recurses into accepted sub-directories if
 * {@code enumerateNestedFiles} is true.
 *
 * @param path the directory to list
 * @param files output list that receives every accepted file
 * @param logExcludedFiles whether an INFO message is logged for each excluded entry
 * @return the total length of accepted files.
 * @throws IOException if the file system cannot be obtained or the directory cannot be listed
 */
long sumFilesInDir(Path path, List<FileStatus> files, boolean logExcludedFiles) throws IOException {
  final FileSystem fs = FileSystemUtils.get(path);
  long length = 0;
  for (FileStatus file : fs.listFiles(path)) {
    if (file.isDir()) {
      if (acceptFile(file) && enumerateNestedFiles) {
        length += sumFilesInDir(file.getPath(), files, logExcludedFiles);
      } else if (logExcludedFiles) {
        LOG.log(Level.INFO, "Directory " + file.getPath().toString()
            + " did not pass the file-filter and is excluded.");
      }
    } else if (acceptFile(file)) {
      files.add(file);
      length += file.getLen();
    } else if (logExcludedFiles) {
      // This entry is a regular file, so it is reported as a file rather than a directory.
      LOG.log(Level.INFO, "File " + file.getPath().toString()
          + " did not pass the file-filter and is excluded.");
    }
  }
  return length;
}
use of edu.iu.dsc.tws.api.data.FileStatus in project twister2 by DSC-SPIDAL.
the class FileInputPartitioner method sumFilesInDir.
/**
 * Enumerates the entries of the given directory, adding every accepted file to
 * {@code files}, and recurses into accepted sub-directories if
 * {@code enumerateNestedFiles} is true.
 *
 * @param path the directory to list
 * @param files output list that receives every accepted file
 * @param logExcludedFiles whether an INFO message is logged for each excluded entry
 * @return the total length of accepted files.
 * @throws IOException if the file system cannot be obtained or the directory cannot be listed
 */
long sumFilesInDir(Path path, List<FileStatus> files, boolean logExcludedFiles) throws IOException {
  final FileSystem fs = FileSystemUtils.get(path);
  long length = 0;
  for (FileStatus file : fs.listFiles(path)) {
    if (file.isDir()) {
      if (acceptFile(file) && enumerateNestedFiles) {
        length += sumFilesInDir(file.getPath(), files, logExcludedFiles);
      } else if (logExcludedFiles) {
        LOG.log(Level.INFO, "Directory " + file.getPath().toString()
            + " did not pass the file-filter and is excluded.");
      }
    } else if (acceptFile(file)) {
      files.add(file);
      length += file.getLen();
      // TODO: implement test for unsplittable
      // testForUnsplittable(file);
    } else if (logExcludedFiles) {
      // This entry is a regular file, so it is reported as a file rather than a directory.
      LOG.log(Level.INFO, "File " + file.getPath().toString()
          + " did not pass the file-filter and is excluded.");
    }
  }
  return length;
}
use of edu.iu.dsc.tws.api.data.FileStatus in project twister2 by DSC-SPIDAL.
the class DataNodeLocatorUtils method findDataNodesLocation.
/**
 * Receives the input data list for each vertex and returns the list of data nodes that
 * hold it.
 *
 * <p>For every input file the configured file system type is checked. For the HDFS type the
 * result is replaced by the value of {@code getDataNodes()}; for the local type the local
 * host name is appended. Any other (or missing) file system setting contributes nothing.
 *
 * @param inputFileList the input file names to locate
 * @return the data node names collected for the given inputs
 * @throws RuntimeException if accessing the file system fails; the original
 *     {@link IOException} is attached as the cause
 */
public List<String> findDataNodesLocation(List<String> inputFileList) {
  List<String> dataNodes = new ArrayList<>();
  try {
    for (String inputFile : inputFileList) {
      Path path = new Path(inputFile);
      FileSystem fileSystem = FileSystemUtils.get(path.toUri(), config);
      // getDataNodes() takes no argument, so it presumably reads this field -- keep it updated.
      this.datasetName = inputFile;

      // Constant-first comparison: a missing FILE_SYSTEM setting must not cause an NPE.
      Object fileSystemType = config.get(DataObjectConstants.FILE_SYSTEM);
      if (DataContext.TWISTER2_HDFS_FILESYSTEM.equals(fileSystemType)) {
        FileStatus fileStatus = fileSystem.getFileStatus(path);
        if (!fileStatus.getPath().isNullOrEmpty()) {
          // dataNodes = getDataNodes(new String[]{this.datasetName});
          dataNodes = getDataNodes();
        }
      } else if (DataContext.TWISTER2_LOCAL_FILESYSTEM.equals(fileSystemType)) {
        FileStatus fileStatus = fileSystem.getFileStatus(path);
        if (!fileStatus.getPath().isNullOrEmpty()) {
          String datanodeName = InetAddress.getLocalHost().getHostName();
          dataNodes.add(datanodeName);
        }
      }
    }
  } catch (IOException ioe) {
    // Keep the cause so the underlying failure is not lost.
    throw new RuntimeException("IOException Occured", ioe);
  }
  return dataNodes;
}
use of edu.iu.dsc.tws.api.data.FileStatus in project twister2 by DSC-SPIDAL.
the class DataFileReader method readData.
/**
 * Reads comma-separated data points from the given path into a two-dimensional array for
 * later processing.
 *
 * <p>When the configured file system is HDFS, {@code path} itself is opened. Otherwise
 * {@code path} is listed and the last listed entry is opened. Each line read fills one row of
 * the result, in order.
 *
 * @param path the file (HDFS) or directory (otherwise) to read from
 * @param dimension the number of columns allocated per row
 * @param datasize the number of rows allocated
 * @return a {@code datasize x dimension} array holding the parsed values; rows and cells that
 *     were not filled stay {@code 0.0}
 * @throws RuntimeException if reading fails or no file is found under {@code path}; the
 *     original {@link IOException} is attached as the cause
 */
public double[][] readData(Path path, int dimension, int datasize) {
  double[][] datapoints = new double[datasize][dimension];
  try {
    final FileSystem fs = FileSystemUtils.get(path, config);
    if (DataContext.TWISTER2_HDFS_FILESYSTEM.equals(fileSystem)) {
      final FileStatus pathFile = fs.getFileStatus(path);
      this.fdis = fs.open(pathFile.getPath());
    } else {
      // Only the last listed entry is read. Open just that one so that no streams for the
      // earlier entries are opened and then abandoned.
      FileStatus lastFile = null;
      for (FileStatus file : fs.listFiles(path)) {
        lastFile = file;
      }
      if (lastFile == null) {
        throw new IOException("No input files found under " + path);
      }
      this.fdis = fs.open(lastFile.getPath());
    }

    // try-with-resources closes the reader (and the stream it wraps) on failure as well.
    try (BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(this.fdis))) {
      String line;
      int row = 0;
      while ((line = bufferedReader.readLine()) != null) {
        String[] data = line.split(",");
        // NOTE(review): lines with a single token (including blank lines) are not parsed
        // and leave their row at 0.0 -- behavior kept as-is; confirm this is intended.
        if (data.length > 1) {
          for (int i = 0; i < data.length; i++) {
            datapoints[row][i] = Double.parseDouble(data[i].trim());
          }
        }
        row++;
      }
    }
  } catch (IOException ioe) {
    // Keep the cause so the underlying failure is not lost.
    throw new RuntimeException("IO Exception Occured", ioe);
  }
  return datapoints;
}
Aggregations