Use of org.apache.sysml.runtime.matrix.data.InputInfo in project systemml by apache.
Class MatrixObject, method readBlobFromRDD.
@Override
protected MatrixBlock readBlobFromRDD(RDDObject rdd, MutableBoolean writeStatus) throws IOException {
// note: the read of a matrix block from an RDD might trigger
// lazy evaluation of pending transformations.
RDDObject lrdd = rdd;
// prepare return status (by default only collect)
writeStatus.setValue(false);
MetaDataFormat iimd = (MetaDataFormat) _metaData;
MatrixCharacteristics mc = iimd.getMatrixCharacteristics();
InputInfo ii = iimd.getInputInfo();
MatrixBlock mb = null;
try {
// prevent unnecessary collect through rdd checkpoint
if (rdd.allowsShortCircuitCollect()) {
lrdd = (RDDObject) rdd.getLineageChilds().get(0);
}
// obtain matrix block from RDD
int rlen = (int) mc.getRows();
int clen = (int) mc.getCols();
int brlen = (int) mc.getRowsPerBlock();
int bclen = (int) mc.getColsPerBlock();
long nnz = mc.getNonZerosBound();
// guarded rdd collect
// guarded collect not applied for binary cell
if (ii == InputInfo.BinaryBlockInputInfo && !OptimizerUtils.checkSparkCollectMemoryBudget(mc, getPinnedSize() + getBroadcastSize(), true)) {
// note: lazy, partition-at-a-time collect (toLocalIterator) was significantly slower
if (!MapReduceTool.existsFileOnHDFS(_hdfsFileName)) {
// prevent overwrite existing file
long newnnz = SparkExecutionContext.writeRDDtoHDFS(lrdd, _hdfsFileName, iimd.getOutputInfo());
_metaData.getMatrixCharacteristics().setNonZeros(newnnz);
// mark rdd as non-pending (for export)
((RDDObject) rdd).setPending(false);
// mark rdd as hdfs file (for restore)
((RDDObject) rdd).setHDFSFile(true);
// mark for no cache-write on read
writeStatus.setValue(true);
// note: the flag hdfsFile is actually not entirely correct because we still hold an rdd
// reference to the input, not to an rdd of the hdfs file, but the resulting behavior is correct
}
mb = readBlobFromHDFS(_hdfsFileName);
} else if (ii == InputInfo.BinaryCellInputInfo) {
// collect matrix block from binary cell RDD
mb = SparkExecutionContext.toMatrixBlock(lrdd, rlen, clen, nnz);
} else {
// collect matrix block from binary block RDD
mb = SparkExecutionContext.toMatrixBlock(lrdd, rlen, clen, brlen, bclen, nnz);
}
} catch (DMLRuntimeException ex) {
throw new IOException(ex);
}
// sanity check correct output
if (mb == null)
throw new IOException("Unable to load matrix from rdd.");
return mb;
}
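The branch on InputInfo above is driven entirely by the MetaDataFormat attached to the MatrixObject. Below is a minimal, hedged sketch of how such metadata is typically assembled so that a later readBlobFromRDD sees BinaryBlockInputInfo; the HDFS path and block dimensions are placeholders, not values from the source.
// Minimal sketch (not part of the snippet above): attach binary-block metadata to a MatrixObject.
// Path and dimensions are hypothetical placeholders.
MatrixCharacteristics mc = new MatrixCharacteristics(10000, 1000, 1000, 1000);
MetaDataFormat meta = new MetaDataFormat(mc, OutputInfo.BinaryBlockOutputInfo, InputInfo.BinaryBlockInputInfo);
MatrixObject mo = new MatrixObject(Expression.ValueType.DOUBLE, "hdfs:/tmp/example/X");
mo.setMetaData(meta);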
Use of org.apache.sysml.runtime.matrix.data.InputInfo in project systemml by apache.
Class SparkExecutionContext, method getRDDHandleForFrameObject.
/**
* FIXME: currently this implementation assumes matrix representations but frame signature
* in order to support the old transform implementation.
*
* @param fo frame object
* @param inputInfo input info
* @return JavaPairRDD handle for a frame object
*/
@SuppressWarnings("unchecked")
public JavaPairRDD<?, ?> getRDDHandleForFrameObject(FrameObject fo, InputInfo inputInfo) {
// NOTE: MB this logic should be integrated into FrameObject
// However, for now we cannot assume that spark libraries are
// always available and hence only store generic references in
// matrix object while all the logic is in the SparkExecContext
InputInfo inputInfo2 = (inputInfo == InputInfo.BinaryBlockInputInfo) ? InputInfo.BinaryBlockFrameInputInfo : inputInfo;
JavaSparkContext sc = getSparkContext();
JavaPairRDD<?, ?> rdd = null;
// CASE 1: rdd already existing (reuse if checkpoint, or trigger computation if not yet cached but pending rdd operations)
if (fo.getRDDHandle() != null && (fo.getRDDHandle().isCheckpointRDD() || !fo.isCached(false))) {
// return existing rdd handling (w/o input format change)
rdd = fo.getRDDHandle().getRDD();
} else if (fo.isDirty() || fo.isCached(false)) {
// CASE 2: dirty in-memory data or cached result of rdd operations
// get in-memory matrix block and parallelize it
// w/ guarded parallelize (fallback to export, rdd from file if too large)
MatrixCharacteristics mc = fo.getMatrixCharacteristics();
boolean fromFile = false;
if (!OptimizerUtils.checkSparkCollectMemoryBudget(mc, 0) || !_parRDDs.reserve(OptimizerUtils.estimatePartitionedSizeExactSparsity(mc))) {
if (fo.isDirty()) {
// write only if necessary
fo.exportData();
}
rdd = sc.hadoopFile(fo.getFileName(), inputInfo2.inputFormatClass, inputInfo2.inputKeyClass, inputInfo2.inputValueClass);
// cp is workaround for read bug
rdd = ((JavaPairRDD<LongWritable, FrameBlock>) rdd).mapToPair(new CopyFrameBlockPairFunction());
fromFile = true;
} else {
// default case
// pin frame in memory
FrameBlock fb = fo.acquireRead();
rdd = toFrameJavaPairRDD(sc, fb);
// unpin frame
fo.release();
_parRDDs.registerRDD(rdd.id(), OptimizerUtils.estimatePartitionedSizeExactSparsity(mc), true);
}
// keep rdd handle for future operations on it
RDDObject rddhandle = new RDDObject(rdd);
rddhandle.setHDFSFile(fromFile);
fo.setRDDHandle(rddhandle);
} else {
// CASE 3: non-dirty (file exists on HDFS)
// For binary block, these are: SequenceFileInputFormat.class, MatrixIndexes.class, MatrixBlock.class
if (inputInfo2 == InputInfo.BinaryBlockFrameInputInfo) {
rdd = sc.hadoopFile(fo.getFileName(), inputInfo2.inputFormatClass, inputInfo2.inputKeyClass, inputInfo2.inputValueClass);
// note: this copy is still required in Spark 1.4 because spark hands out whatever the inputformat
// recordreader returns; the javadoc explicitly recommends copying all key/value pairs
// cp is workaround for read bug
rdd = ((JavaPairRDD<LongWritable, FrameBlock>) rdd).mapToPair(new CopyFrameBlockPairFunction());
} else if (inputInfo2 == InputInfo.TextCellInputInfo || inputInfo2 == InputInfo.CSVInputInfo || inputInfo2 == InputInfo.MatrixMarketInputInfo) {
rdd = sc.hadoopFile(fo.getFileName(), inputInfo2.inputFormatClass, inputInfo2.inputKeyClass, inputInfo2.inputValueClass);
// cp is workaround for read bug
rdd = ((JavaPairRDD<LongWritable, Text>) rdd).mapToPair(new CopyTextInputFunction());
} else if (inputInfo2 == InputInfo.BinaryCellInputInfo) {
throw new DMLRuntimeException("Binarycell not supported for frames.");
} else {
throw new DMLRuntimeException("Incorrect input format in getRDDHandleForVariable");
}
// keep rdd handle for future operations on it
RDDObject rddhandle = new RDDObject(rdd);
rddhandle.setHDFSFile(true);
fo.setRDDHandle(rddhandle);
}
return rdd;
}
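A hedged sketch of how a Spark instruction might request this handle; the ExecutionContext cast, the variable name "F", and the getFrameObject call are assumptions about the calling context, not part of the snippet above.
// Hypothetical calling context: ec is the ExecutionContext handed to a Spark instruction.
SparkExecutionContext sec = (SparkExecutionContext) ec;
FrameObject fo = sec.getFrameObject("F"); // "F" is a placeholder variable name
// request the frame as binary blocks; per the method above, BinaryBlockInputInfo
// is internally rewritten to BinaryBlockFrameInputInfo
JavaPairRDD<?, ?> in = sec.getRDDHandleForFrameObject(fo, InputInfo.BinaryBlockInputInfo);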
Use of org.apache.sysml.runtime.matrix.data.InputInfo in project systemml by apache.
Class ResultMergeLocalFile, method createNewMatrixObject.
private MatrixObject createNewMatrixObject(MatrixObject output, ArrayList<MatrixObject> inMO) {
MetaDataFormat metadata = (MetaDataFormat) _output.getMetaData();
MatrixObject moNew = new MatrixObject(_output.getValueType(), _outputFName);
// create deep copy of metadata obj
MatrixCharacteristics mcOld = metadata.getMatrixCharacteristics();
OutputInfo oiOld = metadata.getOutputInfo();
InputInfo iiOld = metadata.getInputInfo();
MatrixCharacteristics mc = new MatrixCharacteristics(mcOld);
mc.setNonZeros(_isAccum ? -1 : computeNonZeros(output, inMO));
MetaDataFormat meta = new MetaDataFormat(mc, oiOld, iiOld);
moNew.setMetaData(meta);
return moNew;
}
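From inside ResultMergeLocalFile, the returned object behaves like any other cached matrix once the merged data has been written to _outputFName; a small sketch under that assumption follows.
// Assumed usage inside ResultMergeLocalFile after the merged data exists at _outputFName.
MatrixObject moNew = createNewMatrixObject(output, inMO);
MatrixBlock mb = moNew.acquireRead(); // pins the merged matrix in memory (reads from the output file)
System.out.println("merged nnz: " + mb.getNonZeros());
moNew.release(); // unpin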
Use of org.apache.sysml.runtime.matrix.data.InputInfo in project systemml by apache.
Class MRJobConfiguration, method setUpMultipleInputs.
public static void setUpMultipleInputs(JobConf job, byte[] inputIndexes, String[] inputs, InputInfo[] inputInfos, int[] brlens, int[] bclens, boolean[] distCacheOnly, boolean setConverter, ConvertTarget target) throws Exception {
if (inputs.length != inputInfos.length)
throw new Exception("number of inputs and inputInfos does not match");
// set up names of the input matrices and their inputformat information
job.setStrings(INPUT_MATRICIES_DIRS_CONFIG, inputs);
MRJobConfiguration.setMapFunctionInputMatrixIndexes(job, inputIndexes);
// set up converter infos (converter determined implicitly)
if (setConverter) {
for (int i = 0; i < inputs.length; i++) setInputInfo(job, inputIndexes[i], inputInfos[i], brlens[i], bclens[i], target);
}
// remove redundant inputs and pure broadcast variables
ArrayList<Path> lpaths = new ArrayList<>();
ArrayList<InputInfo> liinfos = new ArrayList<>();
for (int i = 0; i < inputs.length; i++) {
Path p = new Path(inputs[i]);
// check and skip redundant inputs
if (lpaths.contains(p) /* path already included */ || distCacheOnly[i] /* input only required in dist cache */) {
continue;
}
lpaths.add(p);
liinfos.add(inputInfos[i]);
}
boolean combineInputFormat = false;
if (OptimizerUtils.ALLOW_COMBINE_FILE_INPUT_FORMAT) {
// determine total input sizes
double totalInputSize = 0;
for (int i = 0; i < inputs.length; i++) totalInputSize += MapReduceTool.getFilesizeOnHDFS(new Path(inputs[i]));
// set max split size (default blocksize) to 2x blocksize if (1) sort buffer large enough,
// (2) degree of parallelism not hurt, and only a single input (except broadcasts)
// (the sort buffer size is relevant for pass-through of, potentially modified, inputs to the reducers)
// (the single input constraint stems from internal runtime assumptions used to relate meta data to inputs)
long sizeSortBuff = InfrastructureAnalyzer.getRemoteMaxMemorySortBuffer();
long sizeHDFSBlk = InfrastructureAnalyzer.getHDFSBlockSize();
// use generic config api for backwards compatibility
long newSplitSize = sizeHDFSBlk * 2;
double spillPercent = Double.parseDouble(job.get(MRConfigurationNames.MR_MAP_SORT_SPILL_PERCENT, "1.0"));
int numPMap = OptimizerUtils.getNumMappers();
if (numPMap < totalInputSize / newSplitSize && sizeSortBuff * spillPercent >= newSplitSize && lpaths.size() == 1) {
job.setLong(MRConfigurationNames.MR_INPUT_FILEINPUTFORMAT_SPLIT_MAXSIZE, newSplitSize);
combineInputFormat = true;
}
}
// add inputs to jobs input (incl input format configuration)
for (int i = 0; i < lpaths.size(); i++) {
// add input to job inputs (for binaryblock we use CombineSequenceFileInputFormat to reduce task latency)
if (combineInputFormat && liinfos.get(i) == InputInfo.BinaryBlockInputInfo)
MultipleInputs.addInputPath(job, lpaths.get(i), CombineSequenceFileInputFormat.class);
else
MultipleInputs.addInputPath(job, lpaths.get(i), liinfos.get(i).inputFormatClass);
}
}
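A hypothetical configuration sketch for a single binary-block matrix input; the path, block sizes, and the ConvertTarget value are assumptions for illustration, not taken from the source.
// Hypothetical single-input setup (path, block sizes, and ConvertTarget.BLOCK are assumptions).
JobConf job = new JobConf();
byte[] inputIndexes = new byte[] { 0 };
String[] inputs = new String[] { "hdfs:/tmp/systemml/A" };
InputInfo[] inputInfos = new InputInfo[] { InputInfo.BinaryBlockInputInfo };
int[] brlens = new int[] { 1000 };
int[] bclens = new int[] { 1000 };
boolean[] distCacheOnly = new boolean[] { false };
MRJobConfiguration.setUpMultipleInputs(job, inputIndexes, inputs, inputInfos, brlens, bclens, distCacheOnly, true, ConvertTarget.BLOCK);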
Use of org.apache.sysml.runtime.matrix.data.InputInfo in project systemml by apache.
Class Connection, method readStringFrame.
// //////////////////////////////////////////
// Read frames
// //////////////////////////////////////////
/**
* Reads an input frame in arbitrary format from HDFS into a dense string array.
* NOTE: this call currently only supports default configurations for CSV.
*
* @param fname the filename of the input frame
* @return frame as a two-dimensional string array
* @throws IOException if IOException occurs
*/
public String[][] readStringFrame(String fname) throws IOException {
try {
// read json meta data
String fnamemtd = DataExpression.getMTDFileName(fname);
JSONObject jmtd = new DataExpression().readMetadataFile(fnamemtd, false);
// parse json meta data
long rows = jmtd.getLong(DataExpression.READROWPARAM);
long cols = jmtd.getLong(DataExpression.READCOLPARAM);
String format = jmtd.getString(DataExpression.FORMAT_TYPE);
InputInfo iinfo = InputInfo.stringExternalToInputInfo(format);
// read frame file
return readStringFrame(fname, iinfo, rows, cols);
} catch (Exception ex) {
throw new IOException(ex);
}
}
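A brief usage sketch of the JMLC Connection API shown above; the HDFS path is a placeholder, and a companion .mtd metadata file is expected next to the data because the method reads it first.
// Usage sketch (path is a placeholder); readStringFrame may throw IOException.
Connection conn = new Connection();
String[][] frame = conn.readStringFrame("hdfs:/tmp/systemml/frame.csv");
System.out.println("rows: " + frame.length + ", cols: " + frame[0].length);
conn.close();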