Search in sources :

Example 6 with FileSystem

use of edu.iu.dsc.tws.api.data.FileSystem in project twister2 by DSC-SPIDAL.

the class CSVInputPartitioner method createInputSplits.

/**
 * Creates the input splits for the single file addressed by {@code filePath}.
 *
 * <p>The effective split count is the larger of {@code minNumSplits} and the configured
 * {@code numSplits}. If the path is a directory, its accepted files are collected through
 * {@code sumFilesInDir}; more than one resulting file is rejected. For a non-empty file one
 * {@link CSVInputSplit} is created per entry returned by {@code getSplitSizes}, laid out
 * back-to-back starting at offset 0. For an empty file a single zero-length split is created,
 * carrying the hosts of the first block location when one is reported.
 *
 * @param minNumSplits the minimum number of splits requested by the caller
 * @return the created splits
 * @throws IOException if the file system lookup or the split-size computation fails
 * @throws IllegalStateException if the path resolves to more than one file
 */
@Override
public FileInputSplit<OT>[] createInputSplits(int minNumSplits) throws IOException {
    // take the desired number of splits into account
    final int curminNumSplits = Math.max(minNumSplits, this.numSplits);
    final Path path = this.filePath;
    final List<FileInputSplit> inputSplits = new ArrayList<>(curminNumSplits);
    // get all the files that are involved in the splits
    final List<FileStatus> files = new ArrayList<>();
    final FileSystem fs = FileSystemUtils.get(path, config);
    final FileStatus pathFile = fs.getFileStatus(path);
    if (pathFile.isDir()) {
        // Called for its side effect of filling 'files'; the summed length is not needed here.
        sumFilesInDir(path, files, true);
    } else {
        files.add(pathFile);
    }
    if (files.size() > 1) {
        throw new IllegalStateException("FixedInputPartitioner does not support multiple files currently");
    }
    for (final FileStatus file : files) {
        final long lineCount = dataSize;
        final int splSize = (int) (lineCount / curminNumSplits);
        final long len = file.getLen();
        final long[] splitSizes = getSplitSizes(fs, file.getPath(), curminNumSplits, splSize);
        if (len > 0) {
            // Offsets are accumulated as long: split sizes are long and a file may exceed 2 GiB,
            // which would silently overflow an int offset.
            long position = 0L;
            for (int i = 0; i < splitSizes.length; i++) {
                final String[] hosts = new String[0];
                final FileInputSplit fis = new CSVInputSplit(i, file.getPath(), position, splitSizes[i], hosts);
                position += splitSizes[i];
                inputSplits.add(fis);
            }
        } else {
            // Empty file: emit one zero-length split so that the file is still represented.
            final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, 0);
            final String[] hosts = blocks.length > 0 ? blocks[0].getHosts() : new String[0];
            final FileInputSplit fis = new CSVInputSplit(0, file.getPath(), 0, 0, hosts);
            inputSplits.add(fis);
        }
    }
    return inputSplits.toArray(new FileInputSplit[inputSplits.size()]);
}
Also used : Path(edu.iu.dsc.tws.api.data.Path) FileStatus(edu.iu.dsc.tws.api.data.FileStatus) ArrayList(java.util.ArrayList) BlockLocation(edu.iu.dsc.tws.api.data.BlockLocation) FileInputSplit(edu.iu.dsc.tws.data.api.splits.FileInputSplit) CSVInputSplit(edu.iu.dsc.tws.data.api.splits.CSVInputSplit) FileSystem(edu.iu.dsc.tws.api.data.FileSystem)

Example 7 with FileSystem

use of edu.iu.dsc.tws.api.data.FileSystem in project twister2 by DSC-SPIDAL.

the class CompleteCSVInputPartitioner method sumFilesInDir.

/**
 * Enumerates the entries of a directory, descending into sub-directories when
 * {@code enumerateNestedFiles} is true, and collects every file accepted by
 * {@code acceptFile}.
 *
 * @param path the directory whose entries are listed
 * @param files output list; each accepted file is appended to it
 * @param logExcludedFiles whether an INFO message is logged for each skipped entry
 * @return the total length of accepted files.
 * @throws IOException presumably when resolving or listing the file system fails
 */
long sumFilesInDir(Path path, List<FileStatus> files, boolean logExcludedFiles) throws IOException {
    final FileSystem fs = FileSystemUtils.get(path);
    long length = 0;
    for (FileStatus file : fs.listFiles(path)) {
        if (file.isDir()) {
            if (acceptFile(file) && enumerateNestedFiles) {
                length += sumFilesInDir(file.getPath(), files, logExcludedFiles);
            } else if (logExcludedFiles) {
                // Also reached when the directory is accepted but nested enumeration is disabled.
                LOG.log(Level.INFO, "Directory " + file.getPath().toString() + " did not pass the " + "file-filter and is excluded.");
            }
        } else if (acceptFile(file)) {
            files.add(file);
            length += file.getLen();
        } else if (logExcludedFiles) {
            // This entry is a regular file, so it must not be reported as a directory.
            LOG.log(Level.INFO, "File " + file.getPath().toString() + " did not pass the file-filter and is excluded.");
        }
    }
    return length;
}
Also used : FileStatus(edu.iu.dsc.tws.api.data.FileStatus) FileSystem(edu.iu.dsc.tws.api.data.FileSystem)

Example 8 with FileSystem

use of edu.iu.dsc.tws.api.data.FileSystem in project twister2 by DSC-SPIDAL.

the class FileInputPartitioner method sumFilesInDir.

/**
 * Enumerates the entries of a directory, descending into sub-directories when
 * {@code enumerateNestedFiles} is true, and collects every file accepted by
 * {@code acceptFile}.
 *
 * @param path the directory whose entries are listed
 * @param files output list; each accepted file is appended to it
 * @param logExcludedFiles whether an INFO message is logged for each skipped entry
 * @return the total length of accepted files.
 * @throws IOException presumably when resolving or listing the file system fails
 */
long sumFilesInDir(Path path, List<FileStatus> files, boolean logExcludedFiles) throws IOException {
    final FileSystem fs = FileSystemUtils.get(path);
    long length = 0;
    for (FileStatus file : fs.listFiles(path)) {
        if (file.isDir()) {
            if (acceptFile(file) && enumerateNestedFiles) {
                length += sumFilesInDir(file.getPath(), files, logExcludedFiles);
            } else if (logExcludedFiles) {
                // Also reached when the directory is accepted but nested enumeration is disabled.
                LOG.log(Level.INFO, "Directory " + file.getPath().toString() + " did not pass the " + "file-filter and is excluded.");
            }
        } else if (acceptFile(file)) {
            files.add(file);
            length += file.getLen();
            // TODO: implement test for unsplittable
            // testForUnsplittable(file);
        } else if (logExcludedFiles) {
            // This entry is a regular file, so it must not be reported as a directory.
            LOG.log(Level.INFO, "File " + file.getPath().toString() + " did not pass the file-filter and is excluded.");
        }
    }
    return length;
}
Also used : FileStatus(edu.iu.dsc.tws.api.data.FileStatus) FileSystem(edu.iu.dsc.tws.api.data.FileSystem)

Example 9 with FileSystem

use of edu.iu.dsc.tws.api.data.FileSystem in project twister2 by DSC-SPIDAL.

the class DataNodeLocatorUtils method findDataNodesLocation.

/**
 * Receives the input data list for each vertex, looks up each entry on the configured file
 * system and returns the resulting data node list.
 *
 * <p>For the HDFS file system type the list is replaced by the result of
 * {@code getDataNodes()}; for the local file system type the local host name is appended.
 * Entries are skipped when the configured type is neither of the two (or is unset), or when
 * the resolved file status has a null-or-empty path. As a side effect {@code datasetName} is
 * set to each processed entry in turn.
 *
 * @param inputFileList the input file names to locate
 * @return the collected data node names; empty when nothing was located
 * @throws RuntimeException wrapping the underlying {@link IOException} when a lookup fails
 */
public List<String> findDataNodesLocation(List<String> inputFileList) {
    List<String> dataNodes = new ArrayList<>();
    try {
        for (String anInputFileList : inputFileList) {
            final Path path = new Path(anInputFileList);
            final FileSystem fileSystem = FileSystemUtils.get(path.toUri(), config);
            this.datasetName = anInputFileList;

            // Constant-first comparison: an unset file system type is skipped instead of
            // failing with a NullPointerException.
            final Object fsType = config.get(DataObjectConstants.FILE_SYSTEM);
            final boolean isHdfs = DataContext.TWISTER2_HDFS_FILESYSTEM.equals(fsType);
            final boolean isLocal = !isHdfs && DataContext.TWISTER2_LOCAL_FILESYSTEM.equals(fsType);
            if (!isHdfs && !isLocal) {
                continue;
            }

            final FileStatus fileStatus = fileSystem.getFileStatus(path);
            if (fileStatus.getPath().isNullOrEmpty()) {
                continue;
            }
            if (isHdfs) {
                // dataNodes = getDataNodes(new String[]{this.datasetName});
                dataNodes = getDataNodes();
            } else {
                dataNodes.add(InetAddress.getLocalHost().getHostName());
            }
        }
    } catch (IOException ioe) {
        // Keep the original failure attached so the root cause is not lost.
        throw new RuntimeException("IOException Occured", ioe);
    }
    return dataNodes;
}
Also used : Path(edu.iu.dsc.tws.api.data.Path) FileStatus(edu.iu.dsc.tws.api.data.FileStatus) FileSystem(edu.iu.dsc.tws.api.data.FileSystem) ArrayList(java.util.ArrayList) IOException(java.io.IOException)

Example 10 with FileSystem

use of edu.iu.dsc.tws.api.data.FileSystem in project twister2 by DSC-SPIDAL.

the class DataFileReader method readData.

/**
 * Reads comma-separated data points from the given path into a two-dimensional array for
 * later processing. The array is allocated as {@code datasize} rows by {@code dimension}
 * columns.
 *
 * <p>For the HDFS file system type the path itself is opened. Otherwise the path is listed
 * and the last listed file is the one that is read; streams opened for earlier entries are
 * closed immediately. Each line fills one row, in file order. A line with fewer than two
 * comma-separated tokens leaves its row at zero but still advances to the next row.
 *
 * @param path the file (HDFS) or directory (otherwise) to read from
 * @param dimension the number of columns per data point
 * @param datasize the number of rows to allocate
 * @return the parsed data points
 * @throws RuntimeException wrapping the underlying {@link IOException} when opening or
 *     reading fails, including when the listing of {@code path} yields no file
 */
public double[][] readData(Path path, int dimension, int datasize) {
    double[][] datapoints = new double[datasize][dimension];
    try {
        final FileSystem fs = FileSystemUtils.get(path, config);
        if (DataContext.TWISTER2_HDFS_FILESYSTEM.equals(fileSystem)) {
            final FileStatus pathFile = fs.getFileStatus(path);
            this.fdis = fs.open(pathFile.getPath());
        } else {
            boolean opened = false;
            for (FileStatus file : fs.listFiles(path)) {
                if (opened) {
                    // Only the last listed file is read; release the stream it replaces.
                    this.fdis.close();
                }
                this.fdis = fs.open(file.getPath());
                opened = true;
            }
            if (!opened) {
                // Without this check a stale (or null) stream from an earlier call would be read.
                throw new IOException("No input file found under " + path);
            }
        }
        // try-with-resources closes the reader (and the wrapped stream) on parse/read failures too.
        try (BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(this.fdis))) {
            String line;
            int value = 0;
            while ((line = bufferedReader.readLine()) != null) {
                String[] data = line.split(",");
                // Rows with a single token are intentionally left untouched (all zeros).
                if (data.length > 1) {
                    for (int i = 0; i < data.length; i++) {
                        datapoints[value][i] = Double.parseDouble(data[i].trim());
                    }
                }
                value++;
            }
        }
    } catch (IOException ioe) {
        // Keep the original failure attached so the root cause is not lost.
        throw new RuntimeException("IO Exception Occured", ioe);
    }
    return datapoints;
}
Also used : FileStatus(edu.iu.dsc.tws.api.data.FileStatus) InputStreamReader(java.io.InputStreamReader) FileSystem(edu.iu.dsc.tws.api.data.FileSystem) BufferedReader(java.io.BufferedReader) IOException(java.io.IOException)

Aggregations

FileSystem (edu.iu.dsc.tws.api.data.FileSystem)26 FileStatus (edu.iu.dsc.tws.api.data.FileStatus)19 Path (edu.iu.dsc.tws.api.data.Path)18 ArrayList (java.util.ArrayList)11 IOException (java.io.IOException)10 BlockLocation (edu.iu.dsc.tws.api.data.BlockLocation)7 FileInputSplit (edu.iu.dsc.tws.data.api.splits.FileInputSplit)7 FSDataOutputStream (edu.iu.dsc.tws.api.data.FSDataOutputStream)4 PrintWriter (java.io.PrintWriter)4 CSVInputSplit (edu.iu.dsc.tws.data.api.splits.CSVInputSplit)3 LocalFileSystem (edu.iu.dsc.tws.data.fs.local.LocalFileSystem)3 HadoopFileSystem (edu.iu.dsc.tws.data.hdfs.HadoopFileSystem)3 File (java.io.File)2 URI (java.net.URI)2 URISyntaxException (java.net.URISyntaxException)2 Random (java.util.Random)2 TaskSchedulerException (edu.iu.dsc.tws.api.compute.exceptions.TaskSchedulerException)1 Config (edu.iu.dsc.tws.api.config.Config)1 BinaryInputSplit (edu.iu.dsc.tws.data.api.splits.BinaryInputSplit)1 DataFileReader (edu.iu.dsc.tws.data.utils.DataFileReader)1