Use of org.apache.hadoop.fs.LocalFileSystem in project incubator-systemml by apache.
Class FrameWriterBinaryBlockParallel, method writeBinaryBlockFrameToHDFS.
@Override
protected void writeBinaryBlockFrameToHDFS(Path path, JobConf job, FrameBlock src, long rlen, long clen)
    throws IOException, DMLRuntimeException
{
    // estimate output size and number of output blocks (min 1)
    int blen = ConfigurationManager.getBlocksize();
    int numPartFiles = Math.max((int) (OptimizerUtils.estimatePartitionedSizeExactSparsity(
        rlen, clen, blen, blen, rlen * clen) / InfrastructureAnalyzer.getHDFSBlockSize()), 1);

    // determine degree of parallelism
    int numThreads = OptimizerUtils.getParallelBinaryWriteParallelism();
    numThreads = Math.min(numThreads, numPartFiles);

    // fall back to sequential write if dop is 1 (e.g., <128MB) in order to create a single file
    if (numThreads <= 1) {
        super.writeBinaryBlockFrameToHDFS(path, job, src, rlen, clen);
        return;
    }

    // create directory for concurrent tasks
    MapReduceTool.createDirIfNotExistOnHDFS(path, DMLConfig.DEFAULT_SHARED_DIR_PERMISSION);
    FileSystem fs = IOUtilFunctions.getFileSystem(path);

    // create and execute write tasks
    try {
        ExecutorService pool = CommonThreadPool.get(numThreads);
        ArrayList<WriteFileTask> tasks = new ArrayList<>();
        int blklen = (int) Math.ceil((double) rlen / blen / numThreads) * blen;
        for (int i = 0; i < numThreads && i * blklen < rlen; i++) {
            Path newPath = new Path(path, IOUtilFunctions.getPartFileName(i));
            tasks.add(new WriteFileTask(newPath, job, fs, src,
                i * blklen, Math.min((i + 1) * blklen, (int) rlen), blen));
        }

        // wait until all tasks have been executed
        List<Future<Object>> rt = pool.invokeAll(tasks);
        pool.shutdown();

        // check for exceptions
        for (Future<Object> task : rt)
            task.get();

        // delete crc files if written to local file system
        if (fs instanceof LocalFileSystem) {
            for (int i = 0; i < numThreads && i * blklen < rlen; i++)
                IOUtilFunctions.deleteCrcFilesFromLocalFileSystem(fs,
                    new Path(path, IOUtilFunctions.getPartFileName(i)));
        }
    }
    catch (Exception e) {
        throw new IOException("Failed parallel write of binary block input.", e);
    }
}
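The chunking arithmetic above is the subtle part: blklen is rounded up to a multiple of the block size blen so that no binary block ever straddles two part files, and the loop guard i * blklen < rlen stops spawning tasks once the rows are exhausted, which can leave fewer part files than threads. A minimal standalone sketch of that arithmetic (all concrete values here are illustrative, not from the source):

// Illustrative sketch of the block-aligned row partitioning used above.
public class PartitionSketch {
    public static void main(String[] args) {
        long rlen = 2500;   // rows to write (assumed example value)
        int blen = 1000;    // binary block size (assumed example value)
        int numThreads = 4; // degree of parallelism (assumed example value)

        // round the per-thread row range up to a multiple of blen
        int blklen = (int) Math.ceil((double) rlen / blen / numThreads) * blen;
        for (int i = 0; i < numThreads && i * blklen < rlen; i++) {
            long lo = i * blklen;
            long hi = Math.min((i + 1) * blklen, rlen);
            // part file i covers rows [lo, hi), aligned to block boundaries
            System.out.println("part-" + i + ": rows [" + lo + ", " + hi + ")");
        }
    }
}

With these numbers only three part files are created even though four threads are available, which is exactly the case the loop guard handles.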
Use of org.apache.hadoop.fs.LocalFileSystem in project incubator-systemml by apache.
Class FrameWriterTextCellParallel, method writeTextCellFrameToHDFS.
@Override
protected void writeTextCellFrameToHDFS(Path path, JobConf job, FrameBlock src, long rlen, long clen)
    throws IOException
{
    // estimate output size and number of output blocks (min 1)
    int numPartFiles = Math.max((int) (OptimizerUtils.estimateSizeTextOutput(rlen, clen, rlen * clen,
        OutputInfo.TextCellOutputInfo) / InfrastructureAnalyzer.getHDFSBlockSize()), 1);

    // determine degree of parallelism
    int numThreads = OptimizerUtils.getParallelTextWriteParallelism();
    numThreads = Math.min(numThreads, numPartFiles);

    // fall back to sequential write if dop is 1 (e.g., <128MB) in order to create a single file
    if (numThreads <= 1) {
        super.writeTextCellFrameToHDFS(path, job, src, rlen, clen);
        return;
    }

    // create directory for concurrent tasks
    MapReduceTool.createDirIfNotExistOnHDFS(path, DMLConfig.DEFAULT_SHARED_DIR_PERMISSION);
    FileSystem fs = IOUtilFunctions.getFileSystem(path, job);

    // create and execute tasks
    try {
        ExecutorService pool = CommonThreadPool.get(numThreads);
        ArrayList<WriteFileTask> tasks = new ArrayList<>();
        // text cells are independent, so no block alignment is needed here
        int blklen = (int) Math.ceil((double) rlen / numThreads);
        for (int i = 0; i < numThreads && i * blklen < rlen; i++) {
            Path newPath = new Path(path, IOUtilFunctions.getPartFileName(i));
            tasks.add(new WriteFileTask(newPath, job, fs, src,
                i * blklen, (int) Math.min((i + 1) * blklen, rlen)));
        }

        // wait until all tasks have been executed
        List<Future<Object>> rt = pool.invokeAll(tasks);
        pool.shutdown();

        // check for exceptions
        for (Future<Object> task : rt)
            task.get();

        // delete crc files if written to local file system
        if (fs instanceof LocalFileSystem) {
            for (int i = 0; i < numThreads && i * blklen < rlen; i++)
                IOUtilFunctions.deleteCrcFilesFromLocalFileSystem(fs,
                    new Path(path, IOUtilFunctions.getPartFileName(i)));
        }
    }
    catch (Exception e) {
        throw new IOException("Failed parallel write of text output.", e);
    }
}
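The LocalFileSystem check at the end of these writers exists because Hadoop's local file system is a ChecksumFileSystem: every file f it writes gets a hidden .f.crc sidecar, which would pollute the part-file directory. The helper's implementation is not shown in these snippets; below is a hypothetical sketch of what deleting such a sidecar involves, using only standard FileSystem calls (class and method names are assumptions, not the actual IOUtilFunctions code):

import java.io.IOException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Hypothetical sketch of a CRC cleanup helper; the real
// IOUtilFunctions.deleteCrcFilesFromLocalFileSystem may differ.
public class CrcCleanupSketch {
    static void deleteCrcFile(FileSystem fs, Path file) throws IOException {
        // Hadoop's checksummed file systems store the CRC of "f" in ".f.crc"
        Path crc = new Path(file.getParent(), "." + file.getName() + ".crc");
        if (fs.exists(crc))
            fs.delete(crc, false); // non-recursive: the sidecar is a plain file
    }
}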
Use of org.apache.hadoop.fs.LocalFileSystem in project incubator-systemml by apache.
Class WriterBinaryBlockParallel, method writeBinaryBlockMatrixToHDFS.
@Override
protected void writeBinaryBlockMatrixToHDFS(Path path, JobConf job, FileSystem fs, MatrixBlock src,
    long rlen, long clen, int brlen, int bclen)
    throws IOException, DMLRuntimeException
{
    // estimate output size and number of output blocks (min 1)
    int numPartFiles = (int) (OptimizerUtils.estimatePartitionedSizeExactSparsity(rlen, clen, brlen, bclen,
        src.getNonZeros()) / InfrastructureAnalyzer.getHDFSBlockSize());
    numPartFiles = Math.max(numPartFiles, 1);

    // determine degree of parallelism
    int numThreads = OptimizerUtils.getParallelBinaryWriteParallelism();
    numThreads = Math.min(numThreads, numPartFiles);

    // fall back to sequential write if dop is 1 (e.g., <128MB) in order to create a single file
    if (numThreads <= 1) {
        super.writeBinaryBlockMatrixToHDFS(path, job, fs, src, rlen, clen, brlen, bclen);
        return;
    }

    // create directory for concurrent tasks
    MapReduceTool.createDirIfNotExistOnHDFS(path, DMLConfig.DEFAULT_SHARED_DIR_PERMISSION);

    // create and execute write tasks
    try {
        ExecutorService pool = CommonThreadPool.get(numThreads);
        ArrayList<WriteFileTask> tasks = new ArrayList<>();
        int blklen = (int) Math.ceil((double) rlen / brlen / numThreads) * brlen;
        for (int i = 0; i < numThreads && i * blklen < rlen; i++) {
            Path newPath = new Path(path, IOUtilFunctions.getPartFileName(i));
            tasks.add(new WriteFileTask(newPath, job, fs, src,
                i * blklen, Math.min((i + 1) * blklen, rlen), brlen, bclen));
        }

        // wait until all tasks have been executed
        List<Future<Object>> rt = pool.invokeAll(tasks);
        pool.shutdown();

        // check for exceptions
        for (Future<Object> task : rt)
            task.get();

        // delete crc files if written to local file system
        if (fs instanceof LocalFileSystem) {
            for (int i = 0; i < numThreads && i * blklen < rlen; i++)
                IOUtilFunctions.deleteCrcFilesFromLocalFileSystem(fs,
                    new Path(path, IOUtilFunctions.getPartFileName(i)));
        }
    }
    catch (Exception e) {
        throw new IOException("Failed parallel write of binary block input.", e);
    }
}
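All three writers share the same invokeAll/get error-handling pattern: invokeAll blocks until every task has finished, but an exception thrown inside a task only resurfaces when its Future.get() is called, wrapped in an ExecutionException. A self-contained sketch of that behavior using plain java.util.concurrent (the simulated failure is illustrative):

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

public class InvokeAllSketch {
    public static void main(String[] args) throws InterruptedException {
        ExecutorService pool = Executors.newFixedThreadPool(4);
        List<Callable<Object>> tasks = new ArrayList<>();
        for (int i = 0; i < 4; i++) {
            final int id = i;
            tasks.add(() -> {
                if (id == 2)
                    throw new java.io.IOException("simulated write failure in task " + id);
                return null; // a successful write task returns nothing of interest
            });
        }
        // invokeAll blocks until all tasks complete, successfully or not
        List<Future<Object>> rt = pool.invokeAll(tasks);
        pool.shutdown();
        // a task's exception only surfaces on Future.get(), as ExecutionException;
        // the writers above rewrap it (via their catch-all) into an IOException
        for (Future<Object> task : rt) {
            try {
                task.get();
            } catch (ExecutionException e) {
                System.out.println("task failed: " + e.getCause());
            }
        }
    }
}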
Use of org.apache.hadoop.fs.LocalFileSystem in project accumulo by apache.
Class RFileTest, method testSampling.
@Test
public void testSampling() throws Exception {
    SortedMap<Key, Value> testData1 = createTestData(1000, 2, 1);
    LocalFileSystem localFs = FileSystem.getLocal(new Configuration());
    String testFile = createTmpTestFile();

    // write the data with a row sampler configured, so the RFile also
    // stores a sample derived from murmur3_32 row hashes with modulus 19
    SamplerConfiguration sc = new SamplerConfiguration(RowSampler.class)
        .setOptions(ImmutableMap.of("hasher", "murmur3_32", "modulus", "19"));
    RFileWriter writer = RFile.newWriter().to(testFile).withFileSystem(localFs)
        .withSampler(sc).build();
    writer.append(testData1.entrySet());
    writer.close();

    Scanner scanner = RFile.newScanner().from(testFile).withFileSystem(localFs).build();
    scanner.setSamplerConfiguration(sc);

    // recompute the expected sample client-side with the same sampler
    RowSampler rowSampler = new RowSampler();
    rowSampler.init(sc);
    SortedMap<Key, Value> sampleData = new TreeMap<>();
    for (Entry<Key, Value> e : testData1.entrySet()) {
        if (rowSampler.accept(e.getKey())) {
            sampleData.put(e.getKey(), e.getValue());
        }
    }
    Assert.assertTrue(sampleData.size() < testData1.size());
    Assert.assertEquals(sampleData, toMap(scanner));

    // with the sampler configuration cleared, the scanner returns all data
    scanner.clearSamplerConfiguration();
    Assert.assertEquals(testData1, toMap(scanner));
}
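The test works because writer and scanner deterministically agree on the subset: conceptually, a row-hashing sampler of this kind keeps a key when the murmur3_32 hash of its row falls into the configured residue class of the modulus. A rough sketch of that idea using Guava hashing (the class, helper names, and row format below are illustrative assumptions, not Accumulo's internals):

import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import java.nio.charset.StandardCharsets;

// Conceptual sketch of row-based sampling; not Accumulo's exact code.
public class RowSampleSketch {
    static final HashFunction HASHER = Hashing.murmur3_32();
    static final int MODULUS = 19;

    static boolean inSample(String row) {
        int hash = HASHER.hashBytes(row.getBytes(StandardCharsets.UTF_8)).asInt();
        return Math.floorMod(hash, MODULUS) == 0; // roughly 1/19 of all rows
    }

    public static void main(String[] args) {
        int accepted = 0;
        for (int i = 0; i < 1000; i++)
            if (inSample(String.format("row_%06d", i)))
                accepted++;
        System.out.println(accepted + " of 1000 rows in the sample");
    }
}

Because the decision depends only on the row, all columns of a sampled row stay together in the sample, which is what makes row samples useful for scans.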
Use of org.apache.hadoop.fs.LocalFileSystem in project accumulo by apache.
Class RFileTest, method testBounds.
@Test
public void testBounds() throws Exception {
    LocalFileSystem localFs = FileSystem.getLocal(new Configuration());
    SortedMap<Key, Value> testData = createTestData(10, 10, 10);
    String testFile = createRFile(testData);

    // set a lower bound row
    Range bounds = new Range(rowStr(3), false, null, true);
    Scanner scanner = RFile.newScanner().from(testFile).withFileSystem(localFs).withBounds(bounds).build();
    Assert.assertEquals(createTestData(4, 6, 0, 10, 10), toMap(scanner));
    scanner.close();

    // set an upper bound row
    bounds = new Range(null, false, rowStr(7), true);
    scanner = RFile.newScanner().from(testFile).withFileSystem(localFs).withBounds(bounds).build();
    Assert.assertEquals(createTestData(8, 10, 10), toMap(scanner));
    scanner.close();

    // set row bounds
    bounds = new Range(rowStr(3), false, rowStr(7), true);
    scanner = RFile.newScanner().from(testFile).withFileSystem(localFs).withBounds(bounds).build();
    Assert.assertEquals(createTestData(4, 4, 0, 10, 10), toMap(scanner));
    scanner.close();

    // set row and column family bounds
    bounds = Range.exact(rowStr(3), colStr(5));
    scanner = RFile.newScanner().from(testFile).withFileSystem(localFs).withBounds(bounds).build();
    Assert.assertEquals(createTestData(3, 1, 5, 1, 10), toMap(scanner));
    scanner.close();
}
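The assertions above hinge on Accumulo's Range endpoint semantics: a null start or end row means unbounded on that side, and the booleans select inclusive versus exclusive endpoints. A small illustrative sketch (the literal row and family strings are assumptions, since rowStr/colStr are not shown in this excerpt):

import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Range;

public class RangeBoundsSketch {
    public static void main(String[] args) {
        // (row_3, exclusive] .. +inf -> matches rows strictly after row_3
        Range lower = new Range("row_3", false, null, true);
        System.out.println(lower.contains(new Key("row_3"))); // false (exclusive)
        System.out.println(lower.contains(new Key("row_4"))); // true

        // exact row and column family, as in the test's last assertion
        Range exact = Range.exact("row_3", "cf_5");
        System.out.println(exact.contains(new Key("row_3", "cf_5"))); // true
        System.out.println(exact.contains(new Key("row_3", "cf_6"))); // false
    }
}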