Search in sources :

Example 11 with FileStatus

use of edu.iu.dsc.tws.api.data.FileStatus in project twister2 by DSC-SPIDAL.

In the class HadoopFileSystem, the method listFiles:

/**
 * Lists the statuses of all files reachable under the given path
 * (the underlying Hadoop listing is invoked recursively).
 *
 * @param f given path
 * @return the statuses of the files/directories under the given path
 * @throws IOException if the underlying file system cannot be listed
 */
@Override
public FileStatus[] listFiles(Path f) throws IOException {
    List<FileStatus> collected = new ArrayList<>();
    RemoteIterator<LocatedFileStatus> iterator
            = this.hadoopFileSystem.listFiles(toHadoopPath(f), true);
    while (iterator.hasNext()) {
        // Wrap each Hadoop status in the twister2 FileStatus abstraction.
        collected.add(new HadoopFileStatus(iterator.next()));
    }
    return collected.toArray(new FileStatus[0]);
}
Also used : FileStatus(edu.iu.dsc.tws.api.data.FileStatus) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) ArrayList(java.util.ArrayList) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus)

Example 12 with FileStatus

use of edu.iu.dsc.tws.api.data.FileStatus in project twister2 by DSC-SPIDAL.

the class KMeansDataGeneratorTest method testUniqueSchedules3.

/**
 * The HDFS data generation test below is commented out for the Travis build.
 */
/* @Test
  public void testUniqueSchedules2() throws IOException {
    Config config = getConfig();

    String hostname = String.valueOf(config.get("twister2.data.hdfs.namenode"));
    String dinputDirectory = "hdfs://" + hostname + ":9000/tmp/testdinput";

    int numFiles = 1;
    int dsize = 20;
    int dimension = 2;
    int parallelismValue = 2;

    KMeansDataGenerator.generateData("txt", new Path(dinputDirectory),
        numFiles, dsize, 100, dimension, config);
    ComputeGraphBuilder computeGraphBuilder = ComputeGraphBuilder.newBuilder(config);
    computeGraphBuilder.setTaskGraphName("kmeans");
    DataObjectSource sourceTask = new DataObjectSource("direct", dinputDirectory);
    DataObjectSink sinkTask = new DataObjectSink();
    computeGraphBuilder.addSource("source", sourceTask, parallelismValue);
    ComputeConnection computeConnection1 = computeGraphBuilder.addSink("sink", sinkTask,
        parallelismValue);
    computeConnection1.direct("source").viaEdge("direct").withDataType(MessageTypes.OBJECT);
    computeGraphBuilder.setMode(OperationMode.BATCH);

    LocalCompleteTextInputPartitioner localCompleteTextInputPartitioner
        = new LocalCompleteTextInputPartitioner(
        new Path(dinputDirectory), parallelismValue, config);

    DataSource<String, ?> source
        = new DataSource<>(config, localCompleteTextInputPartitioner, parallelismValue);
    InputSplit<String> inputSplit;
    for (int i = 0; i < parallelismValue; i++) {
      inputSplit = source.getNextSplit(i);
      Assert.assertNotNull(inputSplit);
    }
  }*/
@Test
public void testUniqueSchedules3() throws IOException {
    Config cfg = getConfig();
    String centroidDir = "/tmp/testcinput";
    int fileCount = 1;
    int centroidSize = 4;
    int dims = 2;
    int parallelism = 2;

    // Generate the centroid input file on the local file system.
    KMeansDataGenerator.generateData("txt", new Path(centroidDir), fileCount,
        centroidSize, 100, dims, cfg);

    // Build a minimal batch task graph that reads the generated file.
    ComputeGraphBuilder graphBuilder = ComputeGraphBuilder.newBuilder(cfg);
    graphBuilder.setTaskGraphName("kmeans");
    DataFileReplicatedReadSource readSource
        = new DataFileReplicatedReadSource(Context.TWISTER2_DIRECT_EDGE, centroidDir);
    graphBuilder.addSource("map", readSource, parallelism);
    graphBuilder.setMode(OperationMode.BATCH);

    // The generated path must exist on the file system.
    Path centroidPath = new Path(centroidDir);
    FileSystem fileSystem = FileSystemUtils.get(centroidPath);
    FileStatus status = fileSystem.getFileStatus(centroidPath);
    Assert.assertNotNull(status);

    // The generated centroids must be readable back as a matrix.
    DataFileReader reader = new DataFileReader(cfg, "local");
    double[][] centroidValues = reader.readData(centroidPath, dims, centroidSize);
    Assert.assertNotNull(centroidValues);
}
Also used : Path(edu.iu.dsc.tws.api.data.Path) FileStatus(edu.iu.dsc.tws.api.data.FileStatus) DataFileReader(edu.iu.dsc.tws.data.utils.DataFileReader) DataFileReplicatedReadSource(edu.iu.dsc.tws.task.dataobjects.DataFileReplicatedReadSource) Config(edu.iu.dsc.tws.api.config.Config) FileSystem(edu.iu.dsc.tws.api.data.FileSystem) ComputeGraphBuilder(edu.iu.dsc.tws.task.impl.ComputeGraphBuilder) Test(org.junit.Test)

Example 13 with FileStatus

use of edu.iu.dsc.tws.api.data.FileStatus in project twister2 by DSC-SPIDAL.

In the class DataNodeLocatorUtils, the method findDataNodesLocation:

/**
 * This method receives the input file name of a vertex, finds where the file is
 * hosted (the HDFS datanodes, or the local host for the local file system), and
 * returns the data node list.
 *
 * @param inputFileName name of the input file to locate
 * @return datanodes list (empty when the configured file system is not recognized
 *         or the file path is absent)
 */
public List<String> findDataNodesLocation(String inputFileName) {
    List<String> dataNodes = new ArrayList<>();
    try {
        Path path = new Path(inputFileName);
        FileSystem fileSystem = FileSystemUtils.get(path.toUri(), config);
        // Constant-first equals() is null-safe if the config key is missing.
        Object fileSystemType = config.get(DataObjectConstants.FILE_SYSTEM);
        if (DataContext.TWISTER2_HDFS_FILESYSTEM.equals(fileSystemType)) {
            FileStatus fileStatus = fileSystem.getFileStatus(path);
            if (!fileStatus.getPath().isNullOrEmpty()) {
                dataNodes = getDataNodes();
            }
        } else if (DataContext.TWISTER2_LOCAL_FILESYSTEM.equals(fileSystemType)) {
            FileStatus fileStatus = fileSystem.getFileStatus(path);
            if (!fileStatus.getPath().isNullOrEmpty()) {
                // For the local file system the only "data node" is this host.
                dataNodes.add(InetAddress.getLocalHost().getHostName());
            }
        }
    } catch (IOException ioe) {
        // Preserve the cause and the offending file name; the original
        // "IOException Occured" message discarded both.
        throw new RuntimeException(
            "IOException occurred while locating data nodes for " + inputFileName, ioe);
    }
    return dataNodes;
}
Also used : Path(edu.iu.dsc.tws.api.data.Path) FileStatus(edu.iu.dsc.tws.api.data.FileStatus) FileSystem(edu.iu.dsc.tws.api.data.FileSystem) ArrayList(java.util.ArrayList) IOException(java.io.IOException)

Example 14 with FileStatus

use of edu.iu.dsc.tws.api.data.FileStatus in project twister2 by DSC-SPIDAL.

the class CSVInputPartitioner method sumFilesInDir.

/**
 * Recursively sums the lengths of all accepted files under {@code path}, appending
 * each accepted regular file's status to {@code files}.
 *
 * @param path directory (or file) to scan
 * @param files output list that accepted file statuses are appended to
 * @param logExcludedFiles whether to log entries rejected by the file filter
 * @return total length in bytes of all accepted files
 * @throws IOException if the file system cannot be listed
 */
long sumFilesInDir(Path path, List<FileStatus> files, boolean logExcludedFiles) throws IOException {
    final FileSystem fs = FileSystemUtils.get(path);
    long length = 0;
    for (FileStatus file : fs.listFiles(path)) {
        if (file.isDir()) {
            if (acceptFile(file) && enumerateNestedFiles) {
                // Descend into accepted directories and accumulate their sizes.
                length += sumFilesInDir(file.getPath(), files, logExcludedFiles);
            } else {
                if (logExcludedFiles) {
                    LOG.log(Level.INFO, "Directory " + file.getPath().toString() + " did not pass the " + "file-filter and is excluded.");
                }
            }
        } else {
            if (acceptFile(file)) {
                files.add(file);
                length += file.getLen();
            } else {
                if (logExcludedFiles) {
                    // Bug fix: this branch handles regular files, not directories —
                    // the previous message incorrectly said "Directory".
                    LOG.log(Level.INFO, "File " + file.getPath().toString() + " did not pass the file-filter and is excluded.");
                }
            }
        }
    }
    return length;
}
Also used : FileStatus(edu.iu.dsc.tws.api.data.FileStatus) FileSystem(edu.iu.dsc.tws.api.data.FileSystem)

Example 15 with FileStatus

use of edu.iu.dsc.tws.api.data.FileStatus in project twister2 by DSC-SPIDAL.

the class CompleteCSVInputPartitioner method createInputSplits.

/**
 * It creates the split for the complete file.
 * <p>
 * Unlike a byte-range partitioner, every split generated here starts at offset 0
 * and spans the entire input length, and one split is produced per requested
 * parallelism for each file found under the input path.
 *
 * @param minNumSplits Number of minimal input splits, as a hint.
 * @return the created input splits
 * @throws IOException if the file system or the block locations cannot be read
 */
@Override
public FileInputSplit<OT>[] createInputSplits(int minNumSplits) throws IOException {
    if (minNumSplits < 1) {
        throw new IllegalArgumentException("Number of input splits has to be at least 1.");
    }
    // Honor the configured number of splits when it exceeds the caller's hint.
    int curminNumSplits = Math.max(minNumSplits, this.numSplits);
    final Path path = this.filePath;
    final List<FileInputSplit> inputSplits = new ArrayList<>(curminNumSplits);
    List<FileStatus> files = new ArrayList<>();
    long totalLength = 0;
    final FileSystem fs = FileSystemUtils.get(path, config);
    final FileStatus pathFile = fs.getFileStatus(path);
    if (pathFile.isDir()) {
        // Directories are walked recursively; accepted files accumulate in 'files'.
        totalLength += sumFilesInDir(path, files, true);
    } else {
        files.add(pathFile);
        totalLength += pathFile.getLen();
    }
    // Each split covers the complete input, hence the total length is the split cap.
    final long maxSplitSize = totalLength;
    // Generate the splits
    int splitNum = 0;
    for (final FileStatus file : files) {
        final long len = file.getLen();
        final long blockSize = file.getBlockSize();
        final long localminSplitSize;
        if (this.minSplitSize <= blockSize) {
            localminSplitSize = this.minSplitSize;
        } else {
            // A minimum split larger than a block cannot be honored; clamp it down.
            LOG.log(Level.WARNING, "Minimal split size of " + this.minSplitSize + " is larger than the block size of " + blockSize + ". Decreasing minimal split size to block size.");
            localminSplitSize = blockSize;
        }
        final long splitSize = Math.max(localminSplitSize, Math.min(maxSplitSize, blockSize));
        if (len > 0) {
            final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, len);
            Arrays.sort(blocks);
            long position = 0;
            int blockIndex = 0;
            // One identical split per requested parallelism; 'position' stays 0
            // because every consumer reads the complete file.
            for (int i = 0; i < curminNumSplits; i++) {
                blockIndex = getBlockIndexForPosition(blocks, position, splitSize, blockIndex);
                final FileInputSplit fis = new CSVInputSplit(splitNum++, file.getPath(), position, splitSize, blocks[blockIndex].getHosts());
                inputSplits.add(fis);
            }
        } else {
            // special case with a file of zero bytes size
            final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, 0);
            String[] hosts;
            if (blocks.length > 0) {
                hosts = blocks[0].getHosts();
            } else {
                // NOTE(review): an empty host list presumably means "no locality
                // preference" to the scheduler — confirm against the consumer.
                hosts = new String[0];
            }
            for (int i = 0; i < curminNumSplits; i++) {
                final FileInputSplit fis = new CSVInputSplit(splitNum++, file.getPath(), 0, 0, hosts);
                inputSplits.add(fis);
            }
        }
    }
    return inputSplits.toArray(new FileInputSplit[inputSplits.size()]);
}
Also used : Path(edu.iu.dsc.tws.api.data.Path) FileStatus(edu.iu.dsc.tws.api.data.FileStatus) ArrayList(java.util.ArrayList) BlockLocation(edu.iu.dsc.tws.api.data.BlockLocation) FileInputSplit(edu.iu.dsc.tws.data.api.splits.FileInputSplit) CSVInputSplit(edu.iu.dsc.tws.data.api.splits.CSVInputSplit) FileSystem(edu.iu.dsc.tws.api.data.FileSystem)

Aggregations

FileStatus (edu.iu.dsc.tws.api.data.FileStatus)22 FileSystem (edu.iu.dsc.tws.api.data.FileSystem)20 Path (edu.iu.dsc.tws.api.data.Path)14 ArrayList (java.util.ArrayList)13 BlockLocation (edu.iu.dsc.tws.api.data.BlockLocation)7 FileInputSplit (edu.iu.dsc.tws.data.api.splits.FileInputSplit)7 IOException (java.io.IOException)6 CSVInputSplit (edu.iu.dsc.tws.data.api.splits.CSVInputSplit)3 Config (edu.iu.dsc.tws.api.config.Config)2 MessageType (edu.iu.dsc.tws.api.comms.messaging.types.MessageType)1 MessageTypes (edu.iu.dsc.tws.api.comms.messaging.types.MessageTypes)1 TaskSchedulerException (edu.iu.dsc.tws.api.compute.exceptions.TaskSchedulerException)1 DataPartitionConsumer (edu.iu.dsc.tws.api.dataset.DataPartitionConsumer)1 Twister2RuntimeException (edu.iu.dsc.tws.api.exceptions.Twister2RuntimeException)1 BinaryInputSplit (edu.iu.dsc.tws.data.api.splits.BinaryInputSplit)1 DataFileReader (edu.iu.dsc.tws.data.utils.DataFileReader)1 DataFileReplicatedReadSource (edu.iu.dsc.tws.task.dataobjects.DataFileReplicatedReadSource)1 ComputeGraphBuilder (edu.iu.dsc.tws.task.impl.ComputeGraphBuilder)1 BufferedReader (java.io.BufferedReader)1 Closeable (java.io.Closeable)1