Search in sources :

Example 6 with FileStatus

use of edu.iu.dsc.tws.api.data.FileStatus in project twister2 by DSC-SPIDAL.

the class CSVInputPartitioner method createInputSplits.

/**
 * Creates the input splits for the configured file path.
 *
 * <p>The number of splits is the larger of {@code minNumSplits} and the configured
 * {@code numSplits}. If the path is a directory, its files are collected through
 * {@code sumFilesInDir}; only a single input file is supported.
 *
 * <p>For a non-empty file, one {@link CSVInputSplit} is created per entry returned by
 * {@code getSplitSizes}, laid out back to back starting at offset 0 and with no host
 * information. For an empty file, a single zero-length split is created that carries the hosts
 * of the first block location, if any.
 *
 * @param minNumSplits the minimum number of splits requested by the caller
 * @return the created input splits
 * @throws IOException if the file system or the file status cannot be accessed
 * @throws IllegalStateException if the path resolves to more than one file
 */
@Override
public FileInputSplit<OT>[] createInputSplits(int minNumSplits) throws IOException {
    // take the desired number of splits into account
    final int curminNumSplits = Math.max(minNumSplits, this.numSplits);
    final Path path = this.filePath;
    final List<FileInputSplit> inputSplits = new ArrayList<>(curminNumSplits);
    // get all the files that are involved in the splits
    final List<FileStatus> files = new ArrayList<>();
    final FileSystem fs = FileSystemUtils.get(path, config);
    final FileStatus pathFile = fs.getFileStatus(path);
    if (pathFile.isDir()) {
        // only the collected file list is needed here; the summed length is not used
        sumFilesInDir(path, files, true);
    } else {
        files.add(pathFile);
    }
    if (files.size() > 1) {
        throw new IllegalStateException("FixedInputPartitioner does not support multiple files currently");
    }
    for (final FileStatus file : files) {
        final long lineCount = dataSize;
        final int splSize = (int) (lineCount / curminNumSplits);
        final long len = file.getLen();
        final long[] splitSizes = getSplitSizes(fs, file.getPath(), curminNumSplits, splSize);
        if (len > 0) {
            // the running offset is kept as a long: split sizes are longs and an int
            // accumulator would silently wrap around for files larger than 2 GiB
            long position = 0;
            for (int i = 0; i < splitSizes.length; i++) {
                final String[] hosts = new String[0];
                final FileInputSplit fis = new CSVInputSplit(i, file.getPath(), position, splitSizes[i], hosts);
                position += splitSizes[i];
                inputSplits.add(fis);
            }
        } else {
            // empty file: emit one zero-length split located on the first reported block host
            final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, 0);
            final String[] hosts;
            if (blocks.length > 0) {
                hosts = blocks[0].getHosts();
            } else {
                hosts = new String[0];
            }
            final FileInputSplit fis = new CSVInputSplit(0, file.getPath(), 0, 0, hosts);
            inputSplits.add(fis);
        }
    }
    return inputSplits.toArray(new FileInputSplit[inputSplits.size()]);
}
Also used : Path(edu.iu.dsc.tws.api.data.Path) FileStatus(edu.iu.dsc.tws.api.data.FileStatus) ArrayList(java.util.ArrayList) BlockLocation(edu.iu.dsc.tws.api.data.BlockLocation) FileInputSplit(edu.iu.dsc.tws.data.api.splits.FileInputSplit) CSVInputSplit(edu.iu.dsc.tws.data.api.splits.CSVInputSplit) FileSystem(edu.iu.dsc.tws.api.data.FileSystem)

Example 7 with FileStatus

use of edu.iu.dsc.tws.api.data.FileStatus in project twister2 by DSC-SPIDAL.

the class CompleteCSVInputPartitioner method sumFilesInDir.

/**
 * Enumerates all files in the directory, recursing into nested directories if
 * {@code enumerateNestedFiles} is true, and adds every accepted file to {@code files}.
 *
 * @param path the directory to enumerate
 * @param files the list that receives the accepted files
 * @param logExcludedFiles whether to log entries rejected by {@code acceptFile} (and
 *     directories skipped because nested enumeration is disabled) at INFO level
 * @return the total length of accepted files.
 * @throws IOException if the file system cannot be obtained or the directory cannot be listed
 */
long sumFilesInDir(Path path, List<FileStatus> files, boolean logExcludedFiles) throws IOException {
    final FileSystem fs = FileSystemUtils.get(path);
    long length = 0;
    for (FileStatus file : fs.listFiles(path)) {
        if (file.isDir()) {
            if (acceptFile(file) && enumerateNestedFiles) {
                length += sumFilesInDir(file.getPath(), files, logExcludedFiles);
            } else if (logExcludedFiles) {
                LOG.log(Level.INFO, "Directory " + file.getPath().toString() + " did not pass the file-filter and is excluded.");
            }
        } else if (acceptFile(file)) {
            files.add(file);
            length += file.getLen();
        } else if (logExcludedFiles) {
            // this entry is a regular file, so it is reported as a file rather than a directory
            LOG.log(Level.INFO, "File " + file.getPath().toString() + " did not pass the file-filter and is excluded.");
        }
    }
    return length;
}
Also used : FileStatus(edu.iu.dsc.tws.api.data.FileStatus) FileSystem(edu.iu.dsc.tws.api.data.FileSystem)

Example 8 with FileStatus

use of edu.iu.dsc.tws.api.data.FileStatus in project twister2 by DSC-SPIDAL.

the class FileInputPartitioner method sumFilesInDir.

/**
 * Enumerates all files in the directory, recursing into nested directories if
 * {@code enumerateNestedFiles} is true, and adds every accepted file to {@code files}.
 *
 * @param path the directory to enumerate
 * @param files the list that receives the accepted files
 * @param logExcludedFiles whether to log entries rejected by {@code acceptFile} (and
 *     directories skipped because nested enumeration is disabled) at INFO level
 * @return the total length of accepted files.
 * @throws IOException if the file system cannot be obtained or the directory cannot be listed
 */
long sumFilesInDir(Path path, List<FileStatus> files, boolean logExcludedFiles) throws IOException {
    final FileSystem fs = FileSystemUtils.get(path);
    long length = 0;
    for (FileStatus file : fs.listFiles(path)) {
        if (file.isDir()) {
            if (acceptFile(file) && enumerateNestedFiles) {
                length += sumFilesInDir(file.getPath(), files, logExcludedFiles);
            } else if (logExcludedFiles) {
                LOG.log(Level.INFO, "Directory " + file.getPath().toString() + " did not pass the file-filter and is excluded.");
            }
        } else if (acceptFile(file)) {
            files.add(file);
            length += file.getLen();
            // TODO: implement test for unsplittable
            // testForUnsplittable(file);
        } else if (logExcludedFiles) {
            // this entry is a regular file, so it is reported as a file rather than a directory
            LOG.log(Level.INFO, "File " + file.getPath().toString() + " did not pass the file-filter and is excluded.");
        }
    }
    return length;
}
Also used : FileStatus(edu.iu.dsc.tws.api.data.FileStatus) FileSystem(edu.iu.dsc.tws.api.data.FileSystem)

Example 9 with FileStatus

use of edu.iu.dsc.tws.api.data.FileStatus in project twister2 by DSC-SPIDAL.

the class DataNodeLocatorUtils method findDataNodesLocation.

/**
 * Receives the input data list for each vertex, finds the location of the datanodes and
 * returns the data node list.
 *
 * <p>For every input file the configured file system type is inspected. For the HDFS file
 * system the result of {@code getDataNodes()} replaces the list collected so far; for the local
 * file system the local host name is appended. Any other (or missing) file system setting
 * contributes nothing. As a side effect, {@code datasetName} is set to each input file in turn.
 *
 * @param inputFileList the input file names to locate
 * @return the list of data node names
 * @throws RuntimeException if an {@link IOException} occurs; the original exception is attached
 *     as the cause
 */
public List<String> findDataNodesLocation(List<String> inputFileList) {
    List<String> dataNodes = new ArrayList<>();
    try {
        for (String anInputFileList : inputFileList) {
            final Path path = new Path(anInputFileList);
            final FileSystem fileSystem = FileSystemUtils.get(path.toUri(), config);
            this.datasetName = anInputFileList;
            // constant-first equals: a missing file system setting is skipped instead of
            // failing with a NullPointerException
            final Object fileSystemType = config.get(DataObjectConstants.FILE_SYSTEM);
            if (DataContext.TWISTER2_HDFS_FILESYSTEM.equals(fileSystemType)) {
                final FileStatus fileStatus = fileSystem.getFileStatus(path);
                if (!fileStatus.getPath().isNullOrEmpty()) {
                    dataNodes = getDataNodes();
                }
            } else if (DataContext.TWISTER2_LOCAL_FILESYSTEM.equals(fileSystemType)) {
                final FileStatus fileStatus = fileSystem.getFileStatus(path);
                if (!fileStatus.getPath().isNullOrEmpty()) {
                    dataNodes.add(InetAddress.getLocalHost().getHostName());
                }
            }
        }
    } catch (IOException ioe) {
        // keep the original exception as the cause so the failure remains diagnosable
        throw new RuntimeException("IOException Occured", ioe);
    }
    return dataNodes;
}
Also used : Path(edu.iu.dsc.tws.api.data.Path) FileStatus(edu.iu.dsc.tws.api.data.FileStatus) FileSystem(edu.iu.dsc.tws.api.data.FileSystem) ArrayList(java.util.ArrayList) IOException(java.io.IOException)

Example 10 with FileStatus

use of edu.iu.dsc.tws.api.data.FileStatus in project twister2 by DSC-SPIDAL.

the class DataFileReader method readData.

/**
 * Reads the comma-separated data points from the given path into a two-dimensional array for
 * later processing.
 *
 * <p>For the HDFS file system the path itself is opened. Otherwise the path is listed and the
 * last listed file is read. Each line fills one row of the result; lines with fewer than two
 * comma-separated values leave their row at zero.
 *
 * @param path the file (HDFS) or directory (other file systems) to read from
 * @param dimension the number of columns of the returned array
 * @param datasize the number of rows of the returned array
 * @return a {@code datasize x dimension} array holding the parsed values
 * @throws RuntimeException if an {@link IOException} occurs; the original exception is attached
 *     as the cause
 */
public double[][] readData(Path path, int dimension, int datasize) {
    final double[][] datapoints = new double[datasize][dimension];
    try {
        final FileSystem fs = FileSystemUtils.get(path, config);
        if (DataContext.TWISTER2_HDFS_FILESYSTEM.equals(fileSystem)) {
            final FileStatus pathFile = fs.getFileStatus(path);
            this.fdis = fs.open(pathFile.getPath());
        } else {
            // Only the last listed file is read. Opening a stream for every listed file and
            // overwriting the previous one would leak all but the last stream.
            FileStatus lastFile = null;
            for (FileStatus file : fs.listFiles(path)) {
                lastFile = file;
            }
            if (lastFile != null) {
                this.fdis = fs.open(lastFile.getPath());
            }
        }
        // try-with-resources closes the reader (and the wrapped stream) even if reading or
        // parsing fails
        try (BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(this.fdis))) {
            String line;
            int value = 0;
            while ((line = bufferedReader.readLine()) != null) {
                final String[] data = line.split(",");
                // rows with fewer than two values are left untouched
                if (data.length > 1) {
                    for (int i = 0; i < data.length; i++) {
                        datapoints[value][i] = Double.parseDouble(data[i].trim());
                    }
                }
                value++;
            }
        }
    } catch (IOException ioe) {
        // keep the original exception as the cause so the failure remains diagnosable
        throw new RuntimeException("IO Exception Occured", ioe);
    }
    return datapoints;
}
Also used : FileStatus(edu.iu.dsc.tws.api.data.FileStatus) InputStreamReader(java.io.InputStreamReader) FileSystem(edu.iu.dsc.tws.api.data.FileSystem) BufferedReader(java.io.BufferedReader) IOException(java.io.IOException)

Aggregations

FileStatus (edu.iu.dsc.tws.api.data.FileStatus)22 FileSystem (edu.iu.dsc.tws.api.data.FileSystem)20 Path (edu.iu.dsc.tws.api.data.Path)14 ArrayList (java.util.ArrayList)13 BlockLocation (edu.iu.dsc.tws.api.data.BlockLocation)7 FileInputSplit (edu.iu.dsc.tws.data.api.splits.FileInputSplit)7 IOException (java.io.IOException)6 CSVInputSplit (edu.iu.dsc.tws.data.api.splits.CSVInputSplit)3 Config (edu.iu.dsc.tws.api.config.Config)2 MessageType (edu.iu.dsc.tws.api.comms.messaging.types.MessageType)1 MessageTypes (edu.iu.dsc.tws.api.comms.messaging.types.MessageTypes)1 TaskSchedulerException (edu.iu.dsc.tws.api.compute.exceptions.TaskSchedulerException)1 DataPartitionConsumer (edu.iu.dsc.tws.api.dataset.DataPartitionConsumer)1 Twister2RuntimeException (edu.iu.dsc.tws.api.exceptions.Twister2RuntimeException)1 BinaryInputSplit (edu.iu.dsc.tws.data.api.splits.BinaryInputSplit)1 DataFileReader (edu.iu.dsc.tws.data.utils.DataFileReader)1 DataFileReplicatedReadSource (edu.iu.dsc.tws.task.dataobjects.DataFileReplicatedReadSource)1 ComputeGraphBuilder (edu.iu.dsc.tws.task.impl.ComputeGraphBuilder)1 BufferedReader (java.io.BufferedReader)1 Closeable (java.io.Closeable)1