Use of org.apache.sysml.runtime.matrix.data.OutputInfo in project incubator-systemml by Apache.
Class DataPartitioner, method createPartitionedMatrixObject.
/**
 * Creates a partitioned matrix object based on the given input matrix object,
 * according to the specified split format. The input matrix can be in-memory
 * or still on HDFS; the partitioned output matrix is written to HDFS. The
 * created matrix object can be used transparently for obtaining the full matrix
 * or for reading one or more partitions based on given index ranges.
 *
 * @param in input matrix object
 * @param out output matrix object
 * @param force if false, try to optimize the partitioning format
 * @return partitioned matrix object
 * @throws DMLRuntimeException if DMLRuntimeException occurs
 */
public MatrixObject createPartitionedMatrixObject(MatrixObject in, MatrixObject out, boolean force) throws DMLRuntimeException {
  //check for naive partitioning
  if (_format == PDataPartitionFormat.NONE)
    return in;

  //analyze input matrix object
  MatrixFormatMetaData meta = (MatrixFormatMetaData) in.getMetaData();
  MatrixCharacteristics mc = meta.getMatrixCharacteristics();
  InputInfo ii = meta.getInputInfo();
  OutputInfo oi = meta.getOutputInfo();
  long rows = mc.getRows();
  long cols = mc.getCols();
  int brlen = mc.getRowsPerBlock();
  int bclen = mc.getColsPerBlock();
  long nonZeros = mc.getNonZeros();
  double sparsity = (nonZeros >= 0 && rows > 0 && cols > 0) ? ((double) nonZeros) / (rows * cols) : 1.0;

  //try to optimize, if format not forced
  if (!force) {
    //check lower bound of useful data partitioning
    //(or matrix already fits in memory)
    if (rows < Hop.CPThreshold && cols < Hop.CPThreshold) {
      return in;
    }
    //check for changing to blockwise representations
    if (_format == PDataPartitionFormat.ROW_WISE && cols < Hop.CPThreshold) {
      LOG.debug("Changing format from " + PDataPartitionFormat.ROW_WISE + " to " + PDataPartitionFormat.ROW_BLOCK_WISE + ".");
      _format = PDataPartitionFormat.ROW_BLOCK_WISE;
    }
    if (_format == PDataPartitionFormat.COLUMN_WISE && rows < Hop.CPThreshold) {
      LOG.debug("Changing format from " + PDataPartitionFormat.COLUMN_WISE + " to " + PDataPartitionFormat.COLUMN_BLOCK_WISE + ".");
      _format = PDataPartitionFormat.COLUMN_BLOCK_WISE;
    }
  }

  //check changing to binarycell in case of sparse cols (robustness)
  boolean convertBlock2Cell = false;
  if (ii == InputInfo.BinaryBlockInputInfo && _allowBinarycell && _format == PDataPartitionFormat.COLUMN_WISE && sparsity < SPARSITY_CELL_THRESHOLD) {
    LOG.debug("Changing partition outputinfo from binaryblock to binarycell due to sparsity=" + sparsity);
    oi = OutputInfo.BinaryCellOutputInfo;
    convertBlock2Cell = true;
  }

  //prepare filenames and cleanup if required
  String fnameNew = out.getFileName();
  try {
    MapReduceTool.deleteFileIfExistOnHDFS(fnameNew);
  } catch (Exception ex) {
    throw new DMLRuntimeException(ex);
  }

  //core partitioning (depending on subclass)
  partitionMatrix(in, fnameNew, ii, oi, rows, cols, brlen, bclen);

  //create output matrix object
  out.setPartitioned(_format, _n);
  MatrixCharacteristics mcNew = new MatrixCharacteristics(rows, cols, brlen, bclen);
  mcNew.setNonZeros(nonZeros);
  if (convertBlock2Cell)
    ii = InputInfo.BinaryCellInputInfo;
  MatrixFormatMetaData metaNew = new MatrixFormatMetaData(mcNew, oi, ii);
  out.setMetaData(metaNew);
  return out;
}
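For intuition, the non-forced optimization above can be read as a pure decision rule over the matrix dimensions. The following standalone sketch mirrors that rule; PartitionFormat and CP_THRESHOLD are illustrative stand-ins for PDataPartitionFormat and Hop.CPThreshold (the threshold value here is made up), not the SystemML API:

public class PartitionFormatChooser {

  //stand-in for PDataPartitionFormat (illustration only)
  enum PartitionFormat { NONE, ROW_WISE, ROW_BLOCK_WISE, COLUMN_WISE, COLUMN_BLOCK_WISE }

  //stand-in for Hop.CPThreshold (illustrative value): dimensions below this fit in CP memory
  static final long CP_THRESHOLD = 100000;

  //mirrors the non-forced optimization path of createPartitionedMatrixObject
  static PartitionFormat choose(PartitionFormat requested, long rows, long cols) {
    //matrix small enough for CP in both dimensions: partitioning brings no benefit
    if (rows < CP_THRESHOLD && cols < CP_THRESHOLD)
      return PartitionFormat.NONE;
    //single rows are small (few columns): group rows into blocks instead
    if (requested == PartitionFormat.ROW_WISE && cols < CP_THRESHOLD)
      return PartitionFormat.ROW_BLOCK_WISE;
    //single columns are small (few rows): group columns into blocks instead
    if (requested == PartitionFormat.COLUMN_WISE && rows < CP_THRESHOLD)
      return PartitionFormat.COLUMN_BLOCK_WISE;
    return requested;
  }

  public static void main(String[] args) {
    //10M x 10: rows dominate, but each row is tiny -> ROW_BLOCK_WISE
    System.out.println(choose(PartitionFormat.ROW_WISE, 10_000_000L, 10L));
    //10 x 10M: each column is tiny -> COLUMN_BLOCK_WISE
    System.out.println(choose(PartitionFormat.COLUMN_WISE, 10L, 10_000_000L));
  }
}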
Use of org.apache.sysml.runtime.matrix.data.OutputInfo in project incubator-systemml by Apache.
Class DataPartitionerRemoteMapper, method configure.
@Override
public void configure(JobConf job) {
  long rlen = MRJobConfiguration.getPartitioningNumRows(job);
  long clen = MRJobConfiguration.getPartitioningNumCols(job);
  int brlen = MRJobConfiguration.getPartitioningBlockNumRows(job);
  int bclen = MRJobConfiguration.getPartitioningBlockNumCols(job);
  InputInfo ii = MRJobConfiguration.getPartitioningInputInfo(job);
  OutputInfo oi = MRJobConfiguration.getPartitioningOutputInfo(job);
  PDataPartitionFormat pdf = MRJobConfiguration.getPartitioningFormat(job);
  int n = MRJobConfiguration.getPartitioningSizeN(job);
  boolean keepIndexes = MRJobConfiguration.getPartitioningIndexFlag(job);

  //instantiate the format-specific mapper implementation
  if (ii == InputInfo.TextCellInputInfo)
    _mapper = new DataPartitionerMapperTextcell(rlen, clen, brlen, bclen, pdf, n);
  else if (ii == InputInfo.BinaryCellInputInfo)
    _mapper = new DataPartitionerMapperBinarycell(rlen, clen, brlen, bclen, pdf, n);
  else if (ii == InputInfo.BinaryBlockInputInfo) {
    if (oi == OutputInfo.BinaryBlockOutputInfo)
      _mapper = new DataPartitionerMapperBinaryblock(rlen, clen, brlen, bclen, pdf, n, keepIndexes);
    else if (oi == OutputInfo.BinaryCellOutputInfo) {
      //fused parfor
      boolean outputEmpty = MRJobConfiguration.getProgramBlocks(job) != null;
      _mapper = new DataPartitionerMapperBinaryblock2Binarycell(job, rlen, clen, brlen, bclen, pdf, n, keepIndexes, outputEmpty);
    }
    else
      throw new RuntimeException("Partitioning from '" + ii + "' to '" + oi + "' not supported");
  }
  else
    throw new RuntimeException("Unable to configure mapper with unknown input info: " + ii.toString());
}
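The MRJobConfiguration getters used in configure simply read back typed values that the job driver stored in the Hadoop JobConf before submission. A minimal sketch of that round trip, with hypothetical configuration keys (the real key constants are internal to MRJobConfiguration):

import org.apache.hadoop.mapred.JobConf;

public class PartitioningConfigSketch {

  //hypothetical key names; the real constants live inside MRJobConfiguration
  static final String KEY_NUM_ROWS = "sysml.partitioning.num.rows";
  static final String KEY_NUM_COLS = "sysml.partitioning.num.cols";
  static final String KEY_KEEP_IDX = "sysml.partitioning.keep.indexes";

  //driver side: serialize partitioning parameters into the job configuration
  static void setPartitioningInfo(JobConf job, long rlen, long clen, boolean keepIndexes) {
    job.setLong(KEY_NUM_ROWS, rlen);
    job.setLong(KEY_NUM_COLS, clen);
    job.setBoolean(KEY_KEEP_IDX, keepIndexes);
  }

  //mapper side (configure): read the same parameters back
  public static void main(String[] args) {
    JobConf job = new JobConf();
    setPartitioningInfo(job, 1_000_000L, 1000L, true);
    long rlen = job.getLong(KEY_NUM_ROWS, -1);
    long clen = job.getLong(KEY_NUM_COLS, -1);
    boolean keepIndexes = job.getBoolean(KEY_KEEP_IDX, false);
    System.out.println(rlen + " x " + clen + ", keepIndexes=" + keepIndexes);
  }
}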
Use of org.apache.sysml.runtime.matrix.data.OutputInfo in project incubator-systemml by Apache.
Class SortMR, method runStitchupJob.
private static boolean runStitchupJob(String input, long rlen, long clen, int brlen, int bclen, long[] counts, int numReducers, int replication, String output) throws Exception {
  JobConf job = new JobConf(SortMR.class);
  job.setJobName("SortIndexesMR");

  //setup input/output paths
  Path inpath = new Path(input);
  Path outpath = new Path(output);
  FileInputFormat.setInputPaths(job, inpath);
  FileOutputFormat.setOutputPath(job, outpath);
  MapReduceTool.deleteFileIfExistOnHDFS(outpath, job);

  //set number of reducers (1 if local mode)
  if (InfrastructureAnalyzer.isLocalMode(job))
    job.setNumReduceTasks(1);
  else
    MRJobConfiguration.setNumReducers(job, numReducers, numReducers);

  //setup input/output format
  InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
  OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
  job.setInputFormat(iinfo.inputFormatClass);
  job.setOutputFormat(oinfo.outputFormatClass);
  CompactInputFormat.setKeyValueClasses(job, MatrixIndexes.class, MatrixBlock.class);

  //setup mapper/reducer/output classes
  MRJobConfiguration.setInputInfo(job, (byte) 0, InputInfo.BinaryBlockInputInfo, brlen, bclen, ConvertTarget.BLOCK);
  job.setMapperClass(IndexSortStitchupMapper.class);
  job.setReducerClass(IndexSortStitchupReducer.class);
  job.setOutputKeyClass(oinfo.outputKeyClass);
  job.setOutputValueClass(oinfo.outputValueClass);
  MRJobConfiguration.setBlockSize(job, (byte) 0, brlen, bclen);
  MRJobConfiguration.setMatricesDimensions(job, new byte[] { 0 }, new long[] { rlen }, new long[] { clen });

  //compute shifted (exclusive) prefix sum of offsets and put into configuration
  long[] cumsumCounts = new long[counts.length];
  long sum = 0;
  for (int i = 0; i < counts.length; i++) {
    cumsumCounts[i] = sum;
    sum += counts[i];
  }
  job.set(SORT_INDEXES_OFFSETS, Arrays.toString(cumsumCounts));

  //setup replication factor
  job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);

  //set unique working dir
  MRJobConfiguration.setUniqueWorkingDir(job);

  //run mr job
  RunningJob runJob = JobClient.runJob(job);
  return runJob.isSuccessful();
}
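The offsets are serialized into the job configuration via Arrays.toString, so the task side must parse that bracketed, comma-separated string back into a long array. A minimal sketch of the shifted prefix sum and the serialization round trip; the parsing code here is illustrative, not SortMR's actual implementation:

import java.util.Arrays;

public class OffsetRoundTrip {

  //exclusive (shifted) prefix sum: cumsum[i] = counts[0] + ... + counts[i-1]
  static long[] shiftedPrefixSum(long[] counts) {
    long[] cumsum = new long[counts.length];
    long sum = 0;
    for (int i = 0; i < counts.length; i++) {
      cumsum[i] = sum;
      sum += counts[i];
    }
    return cumsum;
  }

  public static void main(String[] args) {
    long[] counts = { 4, 2, 5 };
    long[] offsets = shiftedPrefixSum(counts);
    //serialize exactly as runStitchupJob does: "[0, 4, 6]"
    String serialized = Arrays.toString(offsets);
    System.out.println(serialized);
    //parse back (illustrative): strip brackets, split on ", ", convert to long[]
    String[] parts = serialized.substring(1, serialized.length() - 1).split(", ");
    long[] parsed = Arrays.stream(parts).mapToLong(Long::parseLong).toArray();
    System.out.println(parsed[2] == 6);
  }
}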
Use of org.apache.sysml.runtime.matrix.data.OutputInfo in project incubator-systemml by Apache.
Class VariableCPInstruction, method writeMMFile.
/**
 * Helper function to write MatrixMarket (MM) files to HDFS.
 *
 * @param ec execution context
 * @param fname file name
 * @throws DMLRuntimeException if DMLRuntimeException occurs
 */
private void writeMMFile(ExecutionContext ec, String fname) throws DMLRuntimeException {
  MatrixObject mo = ec.getMatrixObject(input1.getName());
  String outFmt = "matrixmarket";
  if (mo.isDirty()) {
    //there exists data computed in CP that is not backed up on HDFS,
    //i.e., it is either in-memory or in evicted space
    mo.exportData(fname, outFmt);
  }
  else {
    OutputInfo oi = ((MatrixFormatMetaData) mo.getMetaData()).getOutputInfo();
    MatrixCharacteristics mc = mo.getMatrixCharacteristics();
    if (oi == OutputInfo.TextCellOutputInfo) {
      try {
        WriterMatrixMarket writer = new WriterMatrixMarket();
        writer.mergeTextcellToMatrixMarket(mo.getFileName(), fname, mc.getRows(), mc.getCols(), mc.getNonZeros());
      } catch (IOException e) {
        throw new DMLRuntimeException(e);
      }
    }
    else if (oi == OutputInfo.BinaryBlockOutputInfo) {
      mo.exportData(fname, outFmt);
    }
    else {
      throw new DMLRuntimeException("Unexpected data format (" + OutputInfo.outputInfoToString(oi) + "): can not export into MatrixMarket format.");
    }
  }
}
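For reference, the target layout that both export paths produce: MatrixMarket coordinate format consists of a header line, a "rows cols nnz" size line, and one 1-based "i j v" triple per nonzero. A minimal standalone sketch, independent of SystemML's WriterMatrixMarket (integer values used for brevity):

import java.io.PrintWriter;
import java.io.StringWriter;

public class MatrixMarketSketch {

  //writes a sparse matrix in MatrixMarket coordinate format: header,
  //"rows cols nnz" size line, then one "i j v" triple per nonzero
  static String toMatrixMarket(long rows, long cols, long[][] cells) {
    StringWriter sw = new StringWriter();
    PrintWriter out = new PrintWriter(sw);
    out.println("%%MatrixMarket matrix coordinate real general");
    out.println(rows + " " + cols + " " + cells.length);
    for (long[] cell : cells)
      //MatrixMarket uses 1-based row/column indices
      out.println(cell[0] + " " + cell[1] + " " + cell[2]);
    out.flush();
    return sw.toString();
  }

  public static void main(String[] args) {
    //3x3 matrix with two nonzeros
    System.out.print(toMatrixMarket(3, 3, new long[][] { { 1, 1, 7 }, { 3, 2, 5 } }));
  }
}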
Use of org.apache.sysml.runtime.matrix.data.OutputInfo in project incubator-systemml by Apache.
Class VariableCPInstruction, method writeCSVFile.
/**
* Helper function to write CSV files to HDFS.
*
* @param ec execution context
* @param fname file name
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
private void writeCSVFile(ExecutionContext ec, String fname) throws DMLRuntimeException {
  MatrixObject mo = ec.getMatrixObject(input1.getName());
  String outFmt = "csv";
  if (mo.isDirty()) {
    //there exists data computed in CP that is not backed up on HDFS,
    //i.e., it is either in-memory or in evicted space
    mo.exportData(fname, outFmt, _formatProperties);
  }
  else {
    try {
      OutputInfo oi = ((MatrixFormatMetaData) mo.getMetaData()).getOutputInfo();
      MatrixCharacteristics mc = ((MatrixFormatMetaData) mo.getMetaData()).getMatrixCharacteristics();
      if (oi == OutputInfo.CSVOutputInfo) {
        WriterTextCSV writer = new WriterTextCSV((CSVFileFormatProperties) _formatProperties);
        writer.addHeaderToCSV(mo.getFileName(), fname, mc.getRows(), mc.getCols());
      }
      else if (oi == OutputInfo.BinaryBlockOutputInfo || oi == OutputInfo.TextCellOutputInfo) {
        mo.exportData(fname, outFmt, _formatProperties);
      }
      else {
        throw new DMLRuntimeException("Unexpected data format (" + OutputInfo.outputInfoToString(oi) + "): can not export into CSV format.");
      }
      //write metadata file
      MapReduceTool.writeMetaDataFile(fname + ".mtd", mo.getValueType(), mc, OutputInfo.CSVOutputInfo, _formatProperties);
    } catch (IOException e) {
      throw new DMLRuntimeException(e);
    }
  }
}
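Conceptually, addHeaderToCSV prepends a generated header row to an otherwise headerless CSV file. A minimal local-file sketch of that idea (hypothetical: it operates on java.nio paths rather than HDFS and is not the SystemML implementation):

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class CsvHeaderSketch {

  //prepends a generated header row ("C1,C2,...") to a headerless CSV file;
  //local-file stand-in for the HDFS-based WriterTextCSV.addHeaderToCSV
  static void addHeader(Path src, Path dst, long cols) throws IOException {
    StringBuilder header = new StringBuilder();
    for (long c = 1; c <= cols; c++) {
      if (c > 1)
        header.append(',');
      header.append("C").append(c);
    }
    List<String> lines = new ArrayList<>();
    lines.add(header.toString());
    lines.addAll(Files.readAllLines(src));
    Files.write(dst, lines);
  }

  public static void main(String[] args) throws IOException {
    Path src = Files.createTempFile("mat", ".csv");
    Files.write(src, Arrays.asList("1,2", "3,4"));
    Path dst = Files.createTempFile("mat-hdr", ".csv");
    addHeader(src, dst, 2);
    //prints [C1,C2, 1,2, 3,4]
    System.out.println(Files.readAllLines(dst));
  }
}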