
Example 16 with LocalFileSystem

Use of org.apache.hadoop.fs.LocalFileSystem in project incubator-systemml by apache.

The class FrameWriterBinaryBlockParallel, method writeBinaryBlockFrameToHDFS:

@Override
protected void writeBinaryBlockFrameToHDFS(Path path, JobConf job, FrameBlock src, long rlen, long clen) throws IOException, DMLRuntimeException {
    // estimate output size and number of output blocks (min 1)
    int blen = ConfigurationManager.getBlocksize();
    int numPartFiles = Math.max((int) (OptimizerUtils.estimatePartitionedSizeExactSparsity(rlen, clen, blen, blen, rlen * clen) / InfrastructureAnalyzer.getHDFSBlockSize()), 1);
    // determine degree of parallelism
    int numThreads = OptimizerUtils.getParallelBinaryWriteParallelism();
    numThreads = Math.min(numThreads, numPartFiles);
    // fall back to a sequential write if the degree of parallelism is 1 (e.g., <128MB) in order to create a single file
    if (numThreads <= 1) {
        super.writeBinaryBlockFrameToHDFS(path, job, src, rlen, clen);
        return;
    }
    // create directory for concurrent tasks
    MapReduceTool.createDirIfNotExistOnHDFS(path, DMLConfig.DEFAULT_SHARED_DIR_PERMISSION);
    FileSystem fs = IOUtilFunctions.getFileSystem(path);
    // create and execute write tasks
    try {
        ExecutorService pool = CommonThreadPool.get(numThreads);
        ArrayList<WriteFileTask> tasks = new ArrayList<>();
        int blklen = (int) Math.ceil((double) rlen / blen / numThreads) * blen;
        for (int i = 0; i < numThreads && i * blklen < rlen; i++) {
            Path newPath = new Path(path, IOUtilFunctions.getPartFileName(i));
            tasks.add(new WriteFileTask(newPath, job, fs, src, i * blklen, Math.min((i + 1) * blklen, (int) rlen), blen));
        }
        // wait until all tasks have been executed
        List<Future<Object>> rt = pool.invokeAll(tasks);
        pool.shutdown();
        // check for exceptions
        for (Future<Object> task : rt) task.get();
        // delete crc files if written to local file system
        if (fs instanceof LocalFileSystem) {
            for (int i = 0; i < numThreads && i * blklen < rlen; i++)
                IOUtilFunctions.deleteCrcFilesFromLocalFileSystem(fs, new Path(path, IOUtilFunctions.getPartFileName(i)));
        }
    } catch (Exception e) {
        throw new IOException("Failed parallel write of binary block input.", e);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) ArrayList(java.util.ArrayList) IOException(java.io.IOException) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) LocalFileSystem(org.apache.hadoop.fs.LocalFileSystem) FileSystem(org.apache.hadoop.fs.FileSystem) ExecutorService(java.util.concurrent.ExecutorService) Future(java.util.concurrent.Future)
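
The chunking arithmetic above rounds each thread's row range up to a whole number of blocks (blklen = ceil(rlen / blen / numThreads) * blen), so no binary block ever straddles two part files. A minimal standalone sketch of that computation, using made-up values (rlen = 2500, blen = 1000, numThreads = 4):

public class BlockAlignedChunkingDemo {
    public static void main(String[] args) {
        long rlen = 2500;   // hypothetical number of rows
        int blen = 1000;    // hypothetical block size
        int numThreads = 4; // hypothetical degree of parallelism
        // round the per-thread row range up to a multiple of the block size
        int blklen = (int) Math.ceil((double) rlen / blen / numThreads) * blen;
        // here: ceil(2500 / 1000 / 4) * 1000 = 1000
        for (int i = 0; i < numThreads && (long) i * blklen < rlen; i++)
            System.out.printf("task %d writes rows [%d, %d)%n",
                i, (long) i * blklen, Math.min((long) (i + 1) * blklen, rlen));
    }
}

Note that the rounding can leave fewer tasks than threads (three here, not four); the i * blklen < rlen loop condition drops the empty trailing ranges.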

Example 17 with LocalFileSystem

Use of org.apache.hadoop.fs.LocalFileSystem in project incubator-systemml by apache.

The class FrameWriterTextCellParallel, method writeTextCellFrameToHDFS:

@Override
protected void writeTextCellFrameToHDFS(Path path, JobConf job, FrameBlock src, long rlen, long clen) throws IOException {
    // estimate output size and number of output blocks (min 1)
    int numPartFiles = Math.max((int) (OptimizerUtils.estimateSizeTextOutput(rlen, clen, rlen * clen, OutputInfo.TextCellOutputInfo) / InfrastructureAnalyzer.getHDFSBlockSize()), 1);
    // determine degree of parallelism
    int numThreads = OptimizerUtils.getParallelTextWriteParallelism();
    numThreads = Math.min(numThreads, numPartFiles);
    // fall back to a sequential write if the degree of parallelism is 1 (e.g., <128MB) in order to create a single file
    if (numThreads <= 1) {
        super.writeTextCellFrameToHDFS(path, job, src, rlen, clen);
        return;
    }
    // create directory for concurrent tasks
    MapReduceTool.createDirIfNotExistOnHDFS(path, DMLConfig.DEFAULT_SHARED_DIR_PERMISSION);
    FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
    // create and execute tasks
    try {
        ExecutorService pool = CommonThreadPool.get(numThreads);
        ArrayList<WriteFileTask> tasks = new ArrayList<>();
        int blklen = (int) Math.ceil((double) rlen / numThreads);
        for (int i = 0; i < numThreads && i * blklen < rlen; i++) {
            Path newPath = new Path(path, IOUtilFunctions.getPartFileName(i));
            tasks.add(new WriteFileTask(newPath, job, fs, src, i * blklen, (int) Math.min((i + 1) * blklen, rlen)));
        }
        // wait until all tasks have been executed
        List<Future<Object>> rt = pool.invokeAll(tasks);
        pool.shutdown();
        // check for exceptions
        for (Future<Object> task : rt) task.get();
        // delete crc files if written to local file system
        if (fs instanceof LocalFileSystem) {
            for (int i = 0; i < numThreads && i * blklen < rlen; i++)
                IOUtilFunctions.deleteCrcFilesFromLocalFileSystem(fs, new Path(path, IOUtilFunctions.getPartFileName(i)));
        }
    } catch (Exception e) {
        throw new IOException("Failed parallel write of text output.", e);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) ArrayList(java.util.ArrayList) IOException(java.io.IOException) LocalFileSystem(org.apache.hadoop.fs.LocalFileSystem) FileSystem(org.apache.hadoop.fs.FileSystem) ExecutorService(java.util.concurrent.ExecutorService) Future(java.util.concurrent.Future)
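
Unlike the binary-block writer in Example 16, the text-cell writer has no block boundaries to respect, so it splits plainly by row count: blklen = ceil(rlen / numThreads). With the same made-up values (rlen = 2500, numThreads = 4), that yields blklen = ceil(2500 / 4) = 625 and exactly four part files of 625 rows each, rather than three block-aligned ones.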

Example 18 with LocalFileSystem

Use of org.apache.hadoop.fs.LocalFileSystem in project incubator-systemml by apache.

The class WriterBinaryBlockParallel, method writeBinaryBlockMatrixToHDFS:

@Override
protected void writeBinaryBlockMatrixToHDFS(Path path, JobConf job, FileSystem fs, MatrixBlock src, long rlen, long clen, int brlen, int bclen) throws IOException, DMLRuntimeException {
    // estimate output size and number of output blocks (min 1)
    int numPartFiles = (int) (OptimizerUtils.estimatePartitionedSizeExactSparsity(rlen, clen, brlen, bclen, src.getNonZeros()) / InfrastructureAnalyzer.getHDFSBlockSize());
    numPartFiles = Math.max(numPartFiles, 1);
    // determine degree of parallelism
    int numThreads = OptimizerUtils.getParallelBinaryWriteParallelism();
    numThreads = Math.min(numThreads, numPartFiles);
    // fall back to a sequential write if the degree of parallelism is 1 (e.g., <128MB) in order to create a single file
    if (numThreads <= 1) {
        super.writeBinaryBlockMatrixToHDFS(path, job, fs, src, rlen, clen, brlen, bclen);
        return;
    }
    // create directory for concurrent tasks
    MapReduceTool.createDirIfNotExistOnHDFS(path, DMLConfig.DEFAULT_SHARED_DIR_PERMISSION);
    // create and execute write tasks
    try {
        ExecutorService pool = CommonThreadPool.get(numThreads);
        ArrayList<WriteFileTask> tasks = new ArrayList<>();
        int blklen = (int) Math.ceil((double) rlen / brlen / numThreads) * brlen;
        for (int i = 0; i < numThreads && i * blklen < rlen; i++) {
            Path newPath = new Path(path, IOUtilFunctions.getPartFileName(i));
            tasks.add(new WriteFileTask(newPath, job, fs, src, i * blklen, Math.min((i + 1) * blklen, rlen), brlen, bclen));
        }
        // wait until all tasks have been executed
        List<Future<Object>> rt = pool.invokeAll(tasks);
        pool.shutdown();
        // check for exceptions
        for (Future<Object> task : rt) task.get();
        // delete crc files if written to local file system
        if (fs instanceof LocalFileSystem) {
            for (int i = 0; i < numThreads && i * blklen < rlen; i++)
                IOUtilFunctions.deleteCrcFilesFromLocalFileSystem(fs, new Path(path, IOUtilFunctions.getPartFileName(i)));
        }
    } catch (Exception e) {
        throw new IOException("Failed parallel write of binary block input.", e);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) ArrayList(java.util.ArrayList) IOException(java.io.IOException) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) LocalFileSystem(org.apache.hadoop.fs.LocalFileSystem) ExecutorService(java.util.concurrent.ExecutorService) Future(java.util.concurrent.Future)
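
All three SystemML writers end by deleting .crc files because Hadoop's LocalFileSystem is a ChecksumFileSystem, which writes a sidecar checksum file next to every file it creates. A minimal sketch of an alternative, not what the project does: on Hadoop 2.x and later, checksum writing can be switched off up front with FileSystem.setWriteChecksum, which avoids the cleanup pass entirely.

FileSystem fs = IOUtilFunctions.getFileSystem(path);
if (fs instanceof LocalFileSystem) {
    // assumption: no sidecar .crc files are then created, so there is
    // nothing to delete after the parallel write completes
    fs.setWriteChecksum(false);
}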

Example 19 with LocalFileSystem

Use of org.apache.hadoop.fs.LocalFileSystem in project accumulo by apache.

The class RFileTest, method testSampling:

@Test
public void testSampling() throws Exception {
    SortedMap<Key, Value> testData1 = createTestData(1000, 2, 1);
    LocalFileSystem localFs = FileSystem.getLocal(new Configuration());
    String testFile = createTmpTestFile();
    SamplerConfiguration sc = new SamplerConfiguration(RowSampler.class).setOptions(ImmutableMap.of("hasher", "murmur3_32", "modulus", "19"));
    RFileWriter writer = RFile.newWriter().to(testFile).withFileSystem(localFs).withSampler(sc).build();
    writer.append(testData1.entrySet());
    writer.close();
    Scanner scanner = RFile.newScanner().from(testFile).withFileSystem(localFs).build();
    scanner.setSamplerConfiguration(sc);
    RowSampler rowSampler = new RowSampler();
    rowSampler.init(sc);
    SortedMap<Key, Value> sampleData = new TreeMap<>();
    for (Entry<Key, Value> e : testData1.entrySet()) {
        if (rowSampler.accept(e.getKey())) {
            sampleData.put(e.getKey(), e.getValue());
        }
    }
    Assert.assertTrue(sampleData.size() < testData1.size());
    Assert.assertEquals(sampleData, toMap(scanner));
    scanner.clearSamplerConfiguration();
    Assert.assertEquals(testData1, toMap(scanner));
}
Also used : RowSampler(org.apache.accumulo.core.client.sample.RowSampler) Scanner(org.apache.accumulo.core.client.Scanner) SummarizerConfiguration(org.apache.accumulo.core.client.summary.SummarizerConfiguration) NewTableConfiguration(org.apache.accumulo.core.client.admin.NewTableConfiguration) Configuration(org.apache.hadoop.conf.Configuration) SamplerConfiguration(org.apache.accumulo.core.client.sample.SamplerConfiguration) DefaultConfiguration(org.apache.accumulo.core.conf.DefaultConfiguration) LocalFileSystem(org.apache.hadoop.fs.LocalFileSystem) Value(org.apache.accumulo.core.data.Value) TreeMap(java.util.TreeMap) Key(org.apache.accumulo.core.data.Key) Test(org.junit.Test)
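
For context on why the two scans return different maps: RowSampler hashes each row with the configured hasher and keeps keys whose hash is divisible by the modulus, so roughly 1 in 19 rows lands in the sample above. A hypothetical illustration of that predicate using Guava's murmur3_32; the real logic lives in RowSampler and AbstractHashSampler, so treat this as a sketch rather than the exact implementation:

import com.google.common.hash.Hashing;
import java.nio.charset.StandardCharsets;

// sketch of the acceptance rule, assuming hasher=murmur3_32 and modulus=19
static boolean roughlyInSample(String row) {
    int hash = Hashing.murmur3_32()
        .hashBytes(row.getBytes(StandardCharsets.UTF_8)).asInt();
    return (hash & 0x7fffffff) % 19 == 0;
}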

Example 20 with LocalFileSystem

Use of org.apache.hadoop.fs.LocalFileSystem in project accumulo by apache.

The class RFileTest, method testBounds:

@Test
public void testBounds() throws Exception {
    LocalFileSystem localFs = FileSystem.getLocal(new Configuration());
    SortedMap<Key, Value> testData = createTestData(10, 10, 10);
    String testFile = createRFile(testData);
    // set a lower bound row
    Range bounds = new Range(rowStr(3), false, null, true);
    Scanner scanner = RFile.newScanner().from(testFile).withFileSystem(localFs).withBounds(bounds).build();
    Assert.assertEquals(createTestData(4, 6, 0, 10, 10), toMap(scanner));
    scanner.close();
    // set an upper bound row
    bounds = new Range(null, false, rowStr(7), true);
    scanner = RFile.newScanner().from(testFile).withFileSystem(localFs).withBounds(bounds).build();
    Assert.assertEquals(createTestData(8, 10, 10), toMap(scanner));
    scanner.close();
    // set row bounds
    bounds = new Range(rowStr(3), false, rowStr(7), true);
    scanner = RFile.newScanner().from(testFile).withFileSystem(localFs).withBounds(bounds).build();
    Assert.assertEquals(createTestData(4, 4, 0, 10, 10), toMap(scanner));
    scanner.close();
    // restrict to a single row and column family
    bounds = Range.exact(rowStr(3), colStr(5));
    scanner = RFile.newScanner().from(testFile).withFileSystem(localFs).withBounds(bounds).build();
    Assert.assertEquals(createTestData(3, 1, 5, 1, 10), toMap(scanner));
    scanner.close();
}
Also used : Scanner(org.apache.accumulo.core.client.Scanner) SummarizerConfiguration(org.apache.accumulo.core.client.summary.SummarizerConfiguration) NewTableConfiguration(org.apache.accumulo.core.client.admin.NewTableConfiguration) Configuration(org.apache.hadoop.conf.Configuration) SamplerConfiguration(org.apache.accumulo.core.client.sample.SamplerConfiguration) DefaultConfiguration(org.apache.accumulo.core.conf.DefaultConfiguration) LocalFileSystem(org.apache.hadoop.fs.LocalFileSystem) Value(org.apache.accumulo.core.data.Value) Range(org.apache.accumulo.core.data.Range) Key(org.apache.accumulo.core.data.Key) Test(org.junit.Test)
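
The Range constructors above follow the Accumulo convention new Range(startRow, startInclusive, endRow, endInclusive), where null leaves that side unbounded. A few illustrative constructions with hypothetical row and column-family values:

// rows strictly after "row3", up to and including "row7"
Range halfOpen = new Range("row3", false, "row7", true);
// everything up to and including "row7"
Range upperBoundOnly = new Range(null, false, "row7", true);
// exactly one row and column family
Range exact = Range.exact("row3", "cf5");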

Aggregations

LocalFileSystem (org.apache.hadoop.fs.LocalFileSystem): 121
Path (org.apache.hadoop.fs.Path): 77
Test (org.junit.Test): 64
Configuration (org.apache.hadoop.conf.Configuration): 57
FileSystem (org.apache.hadoop.fs.FileSystem): 35
IOException (java.io.IOException): 33
File (java.io.File): 23
NewTableConfiguration (org.apache.accumulo.core.client.admin.NewTableConfiguration): 23
SamplerConfiguration (org.apache.accumulo.core.client.sample.SamplerConfiguration): 23
SummarizerConfiguration (org.apache.accumulo.core.client.summary.SummarizerConfiguration): 23
DefaultConfiguration (org.apache.accumulo.core.conf.DefaultConfiguration): 23
Key (org.apache.accumulo.core.data.Key): 22
Value (org.apache.accumulo.core.data.Value): 22
ArrayList (java.util.ArrayList): 19
ExecutorService (java.util.concurrent.ExecutorService): 15
Future (java.util.concurrent.Future): 15
Scanner (org.apache.accumulo.core.client.Scanner): 14
DataSegment (org.apache.druid.timeline.DataSegment): 13
DataSegmentPusher (org.apache.druid.segment.loading.DataSegmentPusher): 8
HdfsDataSegmentPusher (org.apache.druid.storage.hdfs.HdfsDataSegmentPusher): 8