Example 81 with LocalFileSystem

Use of org.apache.hadoop.fs.LocalFileSystem in project incubator-systemml by Apache.

From the class WriterMatrixMarketParallel, the method writeMatrixMarketMatrixToHDFS:

@Override
protected void writeMatrixMarketMatrixToHDFS(Path path, JobConf job, FileSystem fs, MatrixBlock src) throws IOException {
    int rlen = src.getNumRows();
    // estimate output size and number of output blocks (min 1)
    int numPartFiles = (int) (OptimizerUtils.estimateSizeTextOutput(src.getNumRows(), src.getNumColumns(), src.getNonZeros(), OutputInfo.MatrixMarketOutputInfo) / InfrastructureAnalyzer.getHDFSBlockSize());
    numPartFiles = Math.max(numPartFiles, 1);
    // determine degree of parallelism
    int numThreads = OptimizerUtils.getParallelTextWriteParallelism();
    numThreads = Math.min(numThreads, numPartFiles);
    // fall back to sequential write if dop is 1 (e.g., <128MB) in order to create a single file
    if (numThreads <= 1) {
        super.writeMatrixMarketMatrixToHDFS(path, job, fs, src);
        return;
    }
    // create directory for concurrent tasks
    MapReduceTool.createDirIfNotExistOnHDFS(path, DMLConfig.DEFAULT_SHARED_DIR_PERMISSION);
    // create and execute tasks
    try {
        ExecutorService pool = CommonThreadPool.get(numThreads);
        ArrayList<WriteMMTask> tasks = new ArrayList<>();
        int blklen = (int) Math.ceil((double) rlen / numThreads);
        for (int i = 0; i < numThreads && i * blklen < rlen; i++) {
            Path newPath = new Path(path, IOUtilFunctions.getPartFileName(i));
            tasks.add(new WriteMMTask(newPath, job, fs, src, i * blklen, (int) Math.min((i + 1) * blklen, rlen)));
        }
        // wait until all tasks have been executed
        List<Future<Object>> rt = pool.invokeAll(tasks);
        pool.shutdown();
        // check for exceptions
        for (Future<Object> task : rt) task.get();
        // delete crc files if written to local file system
        if (fs instanceof LocalFileSystem) {
            for (int i = 0; i < numThreads && i * blklen < rlen; i++) IOUtilFunctions.deleteCrcFilesFromLocalFileSystem(fs, new Path(path, IOUtilFunctions.getPartFileName(i)));
        }
    } catch (Exception e) {
        throw new IOException("Failed parallel write of text output.", e);
    }
}
Also used: Path (org.apache.hadoop.fs.Path), ArrayList (java.util.ArrayList), IOException (java.io.IOException), LocalFileSystem (org.apache.hadoop.fs.LocalFileSystem), ExecutorService (java.util.concurrent.ExecutorService), Future (java.util.concurrent.Future)
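The WriteMMTask class itself is not shown on this page. As a rough sketch of the per-partition task pattern — the field names and the writeMatrixMarketBlock helper are assumptions for illustration, not SystemML's actual code — each task is a Callable that writes its assigned row range to its own part file:

// Hypothetical sketch only; SystemML's real WriteMMTask may differ.
// Requires java.util.concurrent.Callable in addition to the imports above.
private class WriteMMTask implements Callable<Object> {
    private final Path _path;
    private final JobConf _job;
    private final FileSystem _fs;
    private final MatrixBlock _src;
    private final int _rl, _ru; // row range [_rl, _ru)

    WriteMMTask(Path path, JobConf job, FileSystem fs, MatrixBlock src, int rl, int ru) {
        _path = path; _job = job; _fs = fs; _src = src; _rl = rl; _ru = ru;
    }

    @Override
    public Object call() throws Exception {
        // write rows [_rl, _ru) of _src in Matrix Market format to this part file
        writeMatrixMarketBlock(_path, _job, _fs, _src, _rl, _ru); // assumed helper
        return null;
    }
}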

Example 82 with LocalFileSystem

Use of org.apache.hadoop.fs.LocalFileSystem in project incubator-systemml by Apache.

From the class WriterTextCSVParallel, the method writeCSVMatrixToHDFS:

@Override
protected void writeCSVMatrixToHDFS(Path path, JobConf job, FileSystem fs, MatrixBlock src, CSVFileFormatProperties csvprops) throws IOException {
    // estimate output size and number of output blocks (min 1)
    int numPartFiles = (int) (OptimizerUtils.estimateSizeTextOutput(src.getNumRows(), src.getNumColumns(), src.getNonZeros(), OutputInfo.CSVOutputInfo) / InfrastructureAnalyzer.getHDFSBlockSize());
    numPartFiles = Math.max(numPartFiles, 1);
    // determine degree of parallelism
    int numThreads = OptimizerUtils.getParallelTextWriteParallelism();
    numThreads = Math.min(numThreads, numPartFiles);
    // fall back to sequential write if dop is 1 (e.g., <128MB) in order to create a single file
    if (numThreads <= 1) {
        super.writeCSVMatrixToHDFS(path, job, fs, src, csvprops);
        return;
    }
    // create directory for concurrent tasks
    MapReduceTool.createDirIfNotExistOnHDFS(path, DMLConfig.DEFAULT_SHARED_DIR_PERMISSION);
    // create and execute tasks
    try {
        ExecutorService pool = CommonThreadPool.get(numThreads);
        ArrayList<WriteCSVTask> tasks = new ArrayList<>();
        int rlen = src.getNumRows();
        int blklen = (int) Math.ceil((double) rlen / numThreads);
        for (int i = 0; i < numThreads && i * blklen < rlen; i++) {
            Path newPath = new Path(path, IOUtilFunctions.getPartFileName(i));
            tasks.add(new WriteCSVTask(newPath, job, fs, src, i * blklen, (int) Math.min((i + 1) * blklen, rlen), csvprops));
        }
        // wait until all tasks have been executed
        List<Future<Object>> rt = pool.invokeAll(tasks);
        pool.shutdown();
        // check for exceptions
        for (Future<Object> task : rt) task.get();
        // delete crc files if written to local file system
        if (fs instanceof LocalFileSystem) {
            for (int i = 0; i < numThreads && i * blklen < rlen; i++) IOUtilFunctions.deleteCrcFilesFromLocalFileSystem(fs, new Path(path, IOUtilFunctions.getPartFileName(i)));
        }
    } catch (Exception e) {
        throw new IOException("Failed parallel write of csv output.", e);
    }
}
Also used: Path (org.apache.hadoop.fs.Path), ArrayList (java.util.ArrayList), IOException (java.io.IOException), LocalFileSystem (org.apache.hadoop.fs.LocalFileSystem), ExecutorService (java.util.concurrent.ExecutorService), Future (java.util.concurrent.Future)
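The row partitioning deserves a quick trace. blklen is the ceiling of rlen / numThreads, and the loop guard i * blklen < rlen stops task creation early when the ceiling rounds up enough that fewer tasks than threads suffice, so no empty part files are created. A small standalone illustration (plain Java, no SystemML dependencies):

public class PartitionDemo {
    public static void main(String[] args) {
        int rlen = 10, numThreads = 4;
        int blklen = (int) Math.ceil((double) rlen / numThreads); // = 3
        for (int i = 0; i < numThreads && i * blklen < rlen; i++) {
            int lo = i * blklen;
            int hi = Math.min((i + 1) * blklen, rlen);
            System.out.println("task " + i + ": rows [" + lo + ", " + hi + ")");
        }
        // prints [0, 3), [3, 6), [6, 9), [9, 10): every row covered exactly once.
        // With rlen = 4 and numThreads = 3, blklen = 2 and the guard stops the
        // loop after two tasks instead of creating a third, empty one.
    }
}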

Example 83 with LocalFileSystem

Use of org.apache.hadoop.fs.LocalFileSystem in project incubator-systemml by Apache.

From the class WriterTextCellParallel, the method writeTextCellMatrixToHDFS:

@Override
protected void writeTextCellMatrixToHDFS(Path path, JobConf job, FileSystem fs, MatrixBlock src, long rlen, long clen) throws IOException {
    // estimate output size and number of output blocks (min 1)
    int numPartFiles = (int) (OptimizerUtils.estimateSizeTextOutput(src.getNumRows(), src.getNumColumns(), src.getNonZeros(), OutputInfo.TextCellOutputInfo) / InfrastructureAnalyzer.getHDFSBlockSize());
    numPartFiles = Math.max(numPartFiles, 1);
    // determine degree of parallelism
    int numThreads = OptimizerUtils.getParallelTextWriteParallelism();
    numThreads = Math.min(numThreads, numPartFiles);
    // fall back to sequential write if dop is 1 (e.g., <128MB), or if the matrix is empty and there are no cells to write, in order to create a single file
    if (numThreads <= 1 || src.getNonZeros() == 0) {
        super.writeTextCellMatrixToHDFS(path, job, fs, src, rlen, clen);
        return;
    }
    // create directory for concurrent tasks
    MapReduceTool.createDirIfNotExistOnHDFS(path, DMLConfig.DEFAULT_SHARED_DIR_PERMISSION);
    // create and execute tasks
    try {
        ExecutorService pool = CommonThreadPool.get(numThreads);
        ArrayList<WriteTextTask> tasks = new ArrayList<>();
        int blklen = (int) Math.ceil((double) rlen / numThreads);
        for (int i = 0; i < numThreads && i * blklen < rlen; i++) {
            Path newPath = new Path(path, IOUtilFunctions.getPartFileName(i));
            tasks.add(new WriteTextTask(newPath, job, fs, src, i * blklen, (int) Math.min((i + 1) * blklen, rlen)));
        }
        // wait until all tasks have been executed
        List<Future<Object>> rt = pool.invokeAll(tasks);
        pool.shutdown();
        // check for exceptions
        for (Future<Object> task : rt) task.get();
        // delete crc files if written to local file system
        if (fs instanceof LocalFileSystem) {
            for (int i = 0; i < numThreads && i * blklen < rlen; i++) IOUtilFunctions.deleteCrcFilesFromLocalFileSystem(fs, new Path(path, IOUtilFunctions.getPartFileName(i)));
        }
    } catch (Exception e) {
        throw new IOException("Failed parallel write of text output.", e);
    }
}
Also used: Path (org.apache.hadoop.fs.Path), ArrayList (java.util.ArrayList), IOException (java.io.IOException), LocalFileSystem (org.apache.hadoop.fs.LocalFileSystem), ExecutorService (java.util.concurrent.ExecutorService), Future (java.util.concurrent.Future)
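The invokeAll / shutdown / Future.get sequence shared by all of these writers is what surfaces a worker's exception on the calling thread: invokeAll blocks until every task has finished, and get rethrows a task failure wrapped in an ExecutionException, which the surrounding catch converts into the IOException. A minimal self-contained demonstration of the pattern (plain java.util.concurrent, no SystemML dependencies):

import java.util.Arrays;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

public class InvokeAllDemo {
    public static void main(String[] args) throws Exception {
        ExecutorService pool = Executors.newFixedThreadPool(2);
        List<Callable<Object>> tasks = Arrays.asList(
            () -> null, // succeeds
            () -> { throw new IllegalStateException("worker failed"); });
        // invokeAll blocks until all tasks have completed (or failed)
        List<Future<Object>> rt = pool.invokeAll(tasks);
        pool.shutdown();
        for (Future<Object> task : rt) {
            try {
                // get() rethrows a worker failure as ExecutionException
                task.get();
            } catch (Exception e) {
                System.out.println("caught: " + e.getCause());
            }
        }
    }
}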

Example 84 with LocalFileSystem

Use of org.apache.hadoop.fs.LocalFileSystem in project incubator-systemml by Apache.

From the class FrameWriterTextCSVParallel, the method writeCSVFrameToHDFS:

@Override
protected void writeCSVFrameToHDFS(Path path, JobConf job, FrameBlock src, long rlen, long clen, CSVFileFormatProperties csvprops) throws IOException {
    // estimate output size and number of output blocks (min 1)
    int numPartFiles = Math.max((int) (OptimizerUtils.estimateSizeTextOutput(rlen, clen, rlen * clen, OutputInfo.CSVOutputInfo) / InfrastructureAnalyzer.getHDFSBlockSize()), 1);
    // determine degree of parallelism
    int numThreads = OptimizerUtils.getParallelTextWriteParallelism();
    numThreads = Math.min(numThreads, numPartFiles);
    // fall back to sequential write if dop is 1 (e.g., <128MB) in order to create a single file
    if (numThreads <= 1) {
        super.writeCSVFrameToHDFS(path, job, src, rlen, clen, csvprops);
        return;
    }
    // create directory for concurrent tasks
    MapReduceTool.createDirIfNotExistOnHDFS(path, DMLConfig.DEFAULT_SHARED_DIR_PERMISSION);
    FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
    // create and execute tasks
    try {
        ExecutorService pool = CommonThreadPool.get(numThreads);
        ArrayList<WriteFileTask> tasks = new ArrayList<>();
        int blklen = (int) Math.ceil((double) rlen / numThreads);
        for (int i = 0; i < numThreads && i * blklen < rlen; i++) {
            Path newPath = new Path(path, IOUtilFunctions.getPartFileName(i));
            tasks.add(new WriteFileTask(newPath, job, fs, src, i * blklen, (int) Math.min((i + 1) * blklen, rlen), csvprops));
        }
        // wait until all tasks have been executed
        List<Future<Object>> rt = pool.invokeAll(tasks);
        pool.shutdown();
        // check for exceptions
        for (Future<Object> task : rt) task.get();
        // delete crc files if written to local file system
        if (fs instanceof LocalFileSystem) {
            for (int i = 0; i < numThreads && i * blklen < rlen; i++) IOUtilFunctions.deleteCrcFilesFromLocalFileSystem(fs, new Path(path, IOUtilFunctions.getPartFileName(i)));
        }
    } catch (Exception e) {
        throw new IOException("Failed parallel write of csv output.", e);
    }
}
Also used: Path (org.apache.hadoop.fs.Path), ArrayList (java.util.ArrayList), IOException (java.io.IOException), FileSystem (org.apache.hadoop.fs.FileSystem), LocalFileSystem (org.apache.hadoop.fs.LocalFileSystem), ExecutorService (java.util.concurrent.ExecutorService), Future (java.util.concurrent.Future)
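Unlike the three matrix writers above, which receive the FileSystem as a parameter, the frame writer resolves it from the output path via IOUtilFunctions.getFileSystem, which is not shown on this page. In plain Hadoop the standard lookup is Path.getFileSystem; the sketch below is a presumed equivalent, not SystemML's verified implementation:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class FsLookup {
    // Presumed equivalent of IOUtilFunctions.getFileSystem(path, job):
    // resolve the FileSystem implementation from the path's scheme
    // (hdfs://... yields an HDFS client, file:... yields LocalFileSystem).
    public static FileSystem getFileSystem(Path path, Configuration conf) throws IOException {
        return path.getFileSystem(conf);
    }
}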

Example 85 with LocalFileSystem

Use of org.apache.hadoop.fs.LocalFileSystem in project incubator-systemml by Apache.

From the class IOUtilFunctions, the method deleteCrcFilesFromLocalFileSystem:

/**
 * Delete the CRC files from the local file system associated with a
 * particular file and its metadata file.
 *
 * @param fs
 *            the file system
 * @param path
 *            the path to a file
 * @throws IOException
 *             thrown if error occurred attempting to delete crc files
 */
public static void deleteCrcFilesFromLocalFileSystem(FileSystem fs, Path path) throws IOException {
    if (fs instanceof LocalFileSystem) {
        Path fnameCrc = new Path(path.getParent(), "." + path.getName() + ".crc");
        fs.delete(fnameCrc, false);
        Path fnameMtdCrc = new Path(path.getParent(), "." + path.getName() + ".mtd.crc");
        fs.delete(fnameMtdCrc, false);
    }
}
Also used: Path (org.apache.hadoop.fs.Path), LocalFileSystem (org.apache.hadoop.fs.LocalFileSystem)
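A hedged usage sketch: on a LocalFileSystem, Hadoop's checksum layer writes a hidden .<name>.crc sibling next to each file it creates, and this helper deletes that sibling plus the one for the .mtd metadata file. The output path below is illustrative, and the org.apache.sysml.runtime.io package for IOUtilFunctions is an assumption:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.sysml.runtime.io.IOUtilFunctions; // assumed package

public class CrcCleanupDemo {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        LocalFileSystem fs = FileSystem.getLocal(conf);
        Path out = new Path("/tmp/example/out.csv"); // illustrative path
        fs.create(out).close(); // also creates /tmp/example/.out.csv.crc
        // removes .out.csv.crc and .out.csv.mtd.crc (delete is a no-op if absent)
        IOUtilFunctions.deleteCrcFilesFromLocalFileSystem(fs, out);
    }
}

When the checksum files are never wanted, an alternative is to call fs.setWriteChecksum(false) before writing, which keeps the .crc files from being created at all.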
