
Example 61 with InputInfo

use of org.apache.sysml.runtime.matrix.data.InputInfo in project systemml by apache.

the class MatrixObject method readBlobFromRDD.

@Override
protected MatrixBlock readBlobFromRDD(RDDObject rdd, MutableBoolean writeStatus) throws IOException {
    // note: the read of a matrix block from an RDD might trigger
    // lazy evaluation of pending transformations.
    RDDObject lrdd = rdd;
    // prepare return status (by default only collect)
    writeStatus.setValue(false);
    MetaDataFormat iimd = (MetaDataFormat) _metaData;
    MatrixCharacteristics mc = iimd.getMatrixCharacteristics();
    InputInfo ii = iimd.getInputInfo();
    MatrixBlock mb = null;
    try {
        // prevent unnecessary collect through rdd checkpoint
        if (rdd.allowsShortCircuitCollect()) {
            lrdd = (RDDObject) rdd.getLineageChilds().get(0);
        }
        // obtain matrix block from RDD
        int rlen = (int) mc.getRows();
        int clen = (int) mc.getCols();
        int brlen = (int) mc.getRowsPerBlock();
        int bclen = (int) mc.getColsPerBlock();
        long nnz = mc.getNonZerosBound();
        // guarded rdd collect
        if (ii == InputInfo.BinaryBlockInputInfo // guarded collect not for binary cell
                && !OptimizerUtils.checkSparkCollectMemoryBudget(mc, getPinnedSize() + getBroadcastSize(), true)) {
            // note: lazy, partition-at-a-time collect (toLocalIterator) was significantly slower
            // prevent overwriting an existing file
            if (!MapReduceTool.existsFileOnHDFS(_hdfsFileName)) {
                long newnnz = SparkExecutionContext.writeRDDtoHDFS(lrdd, _hdfsFileName, iimd.getOutputInfo());
                _metaData.getMatrixCharacteristics().setNonZeros(newnnz);
                // mark rdd as non-pending (for export)
                ((RDDObject) rdd).setPending(false);
                // mark rdd as hdfs file (for restore)
                ((RDDObject) rdd).setHDFSFile(true);
                // mark for no cache-write on read
                writeStatus.setValue(true);
                // note: the hdfsFile flag is not entirely accurate because we still hold an rdd
                // reference to the input, not to an rdd of the hdfs file, but the resulting behavior is correct
            }
            mb = readBlobFromHDFS(_hdfsFileName);
        } else if (ii == InputInfo.BinaryCellInputInfo) {
            // collect matrix block from binary cell RDD
            mb = SparkExecutionContext.toMatrixBlock(lrdd, rlen, clen, nnz);
        } else {
            // collect matrix block from binary block RDD
            mb = SparkExecutionContext.toMatrixBlock(lrdd, rlen, clen, brlen, bclen, nnz);
        }
    } catch (DMLRuntimeException ex) {
        throw new IOException(ex);
    }
    // sanity check correct output
    if (mb == null)
        throw new IOException("Unable to load matrix from rdd.");
    return mb;
}
Also used : MetaDataFormat(org.apache.sysml.runtime.matrix.MetaDataFormat) MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject) IOException(java.io.IOException) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)
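
The key idea above is the guarded collect: before materializing an RDD into a single local MatrixBlock, the runtime estimates whether the collected result (plus already pinned and broadcast data) fits into the driver's memory budget, and otherwise writes the RDD to HDFS and reads the file back. A minimal, self-contained sketch of that size check follows; the class, constant, and method names are illustrative assumptions, not SystemML API.

// Sketch of a guarded collect decision: gather distributed blocks locally only if
// the estimated result size fits the driver budget. All names here are assumptions.
public final class GuardedCollectSketch {

    static final long DRIVER_BUDGET_BYTES = 512L << 20; // assumed 512 MB budget

    /** Worst-case dense estimate: 8 bytes per double cell plus a small header. */
    static long estimateDenseSize(long rows, long cols) {
        return 8L * rows * cols + 64L;
    }

    /** True if collecting a (rows x cols) result plus pinned data fits the budget. */
    static boolean fitsCollectBudget(long rows, long cols, long pinnedBytes) {
        return estimateDenseSize(rows, cols) + pinnedBytes <= DRIVER_BUDGET_BYTES;
    }

    public static void main(String[] args) {
        // 10,000 x 10,000 dense doubles is ~800 MB and exceeds the assumed budget,
        // so a runtime following this pattern would spill to HDFS and re-read.
        System.out.println(fitsCollectBudget(10_000, 10_000, 0)); // false
        System.out.println(fitsCollectBudget(1_000, 1_000, 0)); // true
    }
}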

Example 62 with InputInfo

use of org.apache.sysml.runtime.matrix.data.InputInfo in project systemml by apache.

the class SparkExecutionContext method getRDDHandleForFrameObject.

/**
 * FIXME: currently this implementation assumes matrix representations but frame signature
 * in order to support the old transform implementation.
 *
 * @param fo frame object
 * @param inputInfo input info
 * @return JavaPairRDD handle for a frame object
 */
@SuppressWarnings("unchecked")
public JavaPairRDD<?, ?> getRDDHandleForFrameObject(FrameObject fo, InputInfo inputInfo) {
    // NOTE: MB this logic should be integrated into FrameObject
    // However, for now we cannot assume that spark libraries are
    // always available and hence only store generic references in
    // matrix object while all the logic is in the SparkExecContext
    InputInfo inputInfo2 = (inputInfo == InputInfo.BinaryBlockInputInfo) ? InputInfo.BinaryBlockFrameInputInfo : inputInfo;
    JavaSparkContext sc = getSparkContext();
    JavaPairRDD<?, ?> rdd = null;
    // CASE 1: rdd already existing (reuse if checkpoint, or trigger pending rdd
    // operations if not yet cached, but prevent re-evaluation of rdd operations
    // if already executed and cached)
    if (fo.getRDDHandle() != null && (fo.getRDDHandle().isCheckpointRDD() || !fo.isCached(false))) {
        // return existing rdd handle (w/o input format change)
        rdd = fo.getRDDHandle().getRDD();
    }
    // CASE 2: dirty in-memory data or cached result of rdd operations
    else if (fo.isDirty() || fo.isCached(false)) {
        // get in-memory matrix block and parallelize it
        // w/ guarded parallelize (fallback to export, rdd from file if too large)
        MatrixCharacteristics mc = fo.getMatrixCharacteristics();
        boolean fromFile = false;
        if (!OptimizerUtils.checkSparkCollectMemoryBudget(mc, 0) || !_parRDDs.reserve(OptimizerUtils.estimatePartitionedSizeExactSparsity(mc))) {
            if (fo.isDirty()) {
                // write only if necessary
                fo.exportData();
            }
            rdd = sc.hadoopFile(fo.getFileName(), inputInfo2.inputFormatClass, inputInfo2.inputKeyClass, inputInfo2.inputValueClass);
            // cp is workaround for read bug
            rdd = ((JavaPairRDD<LongWritable, FrameBlock>) rdd).mapToPair(new CopyFrameBlockPairFunction());
            fromFile = true;
        } else {
            // default case
            // pin frame in memory
            FrameBlock fb = fo.acquireRead();
            rdd = toFrameJavaPairRDD(sc, fb);
            // unpin frame
            fo.release();
            _parRDDs.registerRDD(rdd.id(), OptimizerUtils.estimatePartitionedSizeExactSparsity(mc), true);
        }
        // keep rdd handle for future operations on it
        RDDObject rddhandle = new RDDObject(rdd);
        rddhandle.setHDFSFile(fromFile);
        fo.setRDDHandle(rddhandle);
    }
    // CASE 3: non-dirty (file exists on HDFS)
    else {
        // For binary block, these are: SequenceFileInputFormat.class, MatrixIndexes.class, MatrixBlock.class
        if (inputInfo2 == InputInfo.BinaryBlockFrameInputInfo) {
            rdd = sc.hadoopFile(fo.getFileName(), inputInfo2.inputFormatClass, inputInfo2.inputKeyClass, inputInfo2.inputValueClass);
            // note: this copy is still required in Spark 1.4 because spark hands out whatever the inputformat
            // recordreader returns; the javadoc explicitly recommends copying all key/value pairs
            // cp is workaround for read bug
            rdd = ((JavaPairRDD<LongWritable, FrameBlock>) rdd).mapToPair(new CopyFrameBlockPairFunction());
        } else if (inputInfo2 == InputInfo.TextCellInputInfo || inputInfo2 == InputInfo.CSVInputInfo || inputInfo2 == InputInfo.MatrixMarketInputInfo) {
            rdd = sc.hadoopFile(fo.getFileName(), inputInfo2.inputFormatClass, inputInfo2.inputKeyClass, inputInfo2.inputValueClass);
            // cp is workaround for read bug
            rdd = ((JavaPairRDD<LongWritable, Text>) rdd).mapToPair(new CopyTextInputFunction());
        } else if (inputInfo2 == InputInfo.BinaryCellInputInfo) {
            throw new DMLRuntimeException("Binarycell not supported for frames.");
        } else {
            throw new DMLRuntimeException("Incorrect input format in getRDDHandleForVariable");
        }
        // keep rdd handle for future operations on it
        RDDObject rddhandle = new RDDObject(rdd);
        rddhandle.setHDFSFile(true);
        fo.setRDDHandle(rddhandle);
    }
    return rdd;
}
Also used : CopyTextInputFunction(org.apache.sysml.runtime.instructions.spark.functions.CopyTextInputFunction) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) FrameBlock(org.apache.sysml.runtime.matrix.data.FrameBlock) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject) Text(org.apache.hadoop.io.Text) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) LongWritable(org.apache.hadoop.io.LongWritable) CopyFrameBlockPairFunction(org.apache.sysml.runtime.instructions.spark.functions.CopyFrameBlockPairFunction) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)
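
The three cases above form a general caching pattern: reuse an existing distributed handle, parallelize dirty in-memory data (with a guarded fallback to exporting and re-reading from file when it is too large), or read directly from the file on HDFS. The following stripped-down sketch shows the same decision flow; DataHandle and DistHandle are hypothetical stand-ins for FrameObject and RDDObject, and the budget constant is an assumption.

// Stripped-down sketch of the three-case RDD-handle resolution above.
final class HandleResolutionSketch {

    interface DistHandle { /* opaque reference to a distributed dataset */ }

    interface DataHandle {
        DistHandle distHandle(); // existing distributed handle, or null
        boolean isCheckpoint();
        boolean isCachedLocally();
        boolean isDirty();
        long estimatedSize();
    }

    static final long PARALLELIZE_BUDGET = 256L << 20; // assumed 256 MB

    static DistHandle resolve(DataHandle d) {
        // CASE 1: reuse an existing handle (checkpointed, or not yet cached locally)
        if (d.distHandle() != null && (d.isCheckpoint() || !d.isCachedLocally()))
            return d.distHandle();
        // CASE 2: dirty or locally cached data: parallelize if small enough,
        // otherwise export to a file and read it back as a distributed dataset
        if (d.isDirty() || d.isCachedLocally())
            return d.estimatedSize() <= PARALLELIZE_BUDGET
                ? parallelizeLocal(d) : exportAndRead(d);
        // CASE 3: clean data that already exists on stable storage
        return readFromFile(d);
    }

    // stubs standing in for parallelize, exportData + hadoopFile, and hadoopFile
    static DistHandle parallelizeLocal(DataHandle d) { return new DistHandle() {}; }
    static DistHandle exportAndRead(DataHandle d) { return new DistHandle() {}; }
    static DistHandle readFromFile(DataHandle d) { return new DistHandle() {}; }
}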

Example 63 with InputInfo

use of org.apache.sysml.runtime.matrix.data.InputInfo in project systemml by apache.

the class ResultMergeLocalFile method createNewMatrixObject.

private MatrixObject createNewMatrixObject(MatrixObject output, ArrayList<MatrixObject> inMO) {
    MetaDataFormat metadata = (MetaDataFormat) _output.getMetaData();
    MatrixObject moNew = new MatrixObject(_output.getValueType(), _outputFName);
    // create deep copy of metadata obj
    MatrixCharacteristics mcOld = metadata.getMatrixCharacteristics();
    OutputInfo oiOld = metadata.getOutputInfo();
    InputInfo iiOld = metadata.getInputInfo();
    MatrixCharacteristics mc = new MatrixCharacteristics(mcOld);
    mc.setNonZeros(_isAccum ? -1 : computeNonZeros(output, inMO));
    MetaDataFormat meta = new MetaDataFormat(mc, oiOld, iiOld);
    moNew.setMetaData(meta);
    return moNew;
}
Also used : OutputInfo(org.apache.sysml.runtime.matrix.data.OutputInfo) MetaDataFormat(org.apache.sysml.runtime.matrix.MetaDataFormat) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics)
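
The deep copy of the metadata matters because MatrixCharacteristics is shared between objects: mutating the original in place would silently change the characteristics of the input. Below is a tiny sketch of the copy-then-modify pattern; SimpleCharacteristics is a hypothetical stand-in for MatrixCharacteristics.

// Copy-then-modify: never mutate shared metadata in place.
final class SimpleCharacteristics {
    final long rows, cols;
    long nonZeros;

    SimpleCharacteristics(long rows, long cols, long nonZeros) {
        this.rows = rows;
        this.cols = cols;
        this.nonZeros = nonZeros;
    }

    /** Copy constructor, mirroring new MatrixCharacteristics(mcOld) above. */
    SimpleCharacteristics(SimpleCharacteristics other) {
        this(other.rows, other.cols, other.nonZeros);
    }

    public static void main(String[] args) {
        SimpleCharacteristics shared = new SimpleCharacteristics(100, 100, 42);
        SimpleCharacteristics forOutput = new SimpleCharacteristics(shared);
        forOutput.nonZeros = -1; // -1 marks "unknown", as in the accumulation case above
        System.out.println(shared.nonZeros); // 42: the shared input metadata is untouched
    }
}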

Example 64 with InputInfo

use of org.apache.sysml.runtime.matrix.data.InputInfo in project systemml by apache.

the class MRJobConfiguration method setUpMultipleInputs.

public static void setUpMultipleInputs(JobConf job, byte[] inputIndexes, String[] inputs, InputInfo[] inputInfos, int[] brlens, int[] bclens, boolean[] distCacheOnly, boolean setConverter, ConvertTarget target) throws Exception {
    if (inputs.length != inputInfos.length)
        throw new Exception("number of inputs and inputInfos does not match");
    // set up names of the input matrices and their inputformat information
    job.setStrings(INPUT_MATRICIES_DIRS_CONFIG, inputs);
    MRJobConfiguration.setMapFunctionInputMatrixIndexes(job, inputIndexes);
    // set up converter infos (converter determined implicitly)
    if (setConverter) {
        for (int i = 0; i < inputs.length; i++) setInputInfo(job, inputIndexes[i], inputInfos[i], brlens[i], bclens[i], target);
    }
    // remove redundant inputs and pure broadcast variables
    ArrayList<Path> lpaths = new ArrayList<>();
    ArrayList<InputInfo> liinfos = new ArrayList<>();
    for (int i = 0; i < inputs.length; i++) {
        Path p = new Path(inputs[i]);
        // check and skip redundant inputs
        if (lpaths.contains(p) // path already included
                || distCacheOnly[i]) { // input only required in dist cache
            continue;
        }
        lpaths.add(p);
        liinfos.add(inputInfos[i]);
    }
    boolean combineInputFormat = false;
    if (OptimizerUtils.ALLOW_COMBINE_FILE_INPUT_FORMAT) {
        // determine total input sizes
        double totalInputSize = 0;
        for (int i = 0; i < inputs.length; i++) totalInputSize += MapReduceTool.getFilesizeOnHDFS(new Path(inputs[i]));
        // set max split size (default blocksize) to 2x blocksize if (1) the sort buffer is large enough,
        // (2) the degree of parallelism is not hurt, and (3) there is only a single input (except broadcasts)
        // (the sort buffer size is relevant for pass-through of, potentially modified, inputs to the reducers)
        // (the single input constraint stems from internal runtime assumptions used to relate meta data to inputs)
        long sizeSortBuff = InfrastructureAnalyzer.getRemoteMaxMemorySortBuffer();
        long sizeHDFSBlk = InfrastructureAnalyzer.getHDFSBlockSize();
        // use generic config api for backwards compatibility
        long newSplitSize = sizeHDFSBlk * 2;
        double spillPercent = Double.parseDouble(job.get(MRConfigurationNames.MR_MAP_SORT_SPILL_PERCENT, "1.0"));
        int numPMap = OptimizerUtils.getNumMappers();
        if (numPMap < totalInputSize / newSplitSize && sizeSortBuff * spillPercent >= newSplitSize && lpaths.size() == 1) {
            job.setLong(MRConfigurationNames.MR_INPUT_FILEINPUTFORMAT_SPLIT_MAXSIZE, newSplitSize);
            combineInputFormat = true;
        }
    }
    // add inputs to jobs input (incl input format configuration)
    for (int i = 0; i < lpaths.size(); i++) {
        // add input to job inputs (for binaryblock we use CombineSequenceFileInputFormat to reduce task latency)
        if (combineInputFormat && liinfos.get(i) == InputInfo.BinaryBlockInputInfo)
            MultipleInputs.addInputPath(job, lpaths.get(i), CombineSequenceFileInputFormat.class);
        else
            MultipleInputs.addInputPath(job, lpaths.get(i), liinfos.get(i).inputFormatClass);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) CombineSequenceFileInputFormat(org.apache.hadoop.mapred.lib.CombineSequenceFileInputFormat) ArrayList(java.util.ArrayList) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) IOException(java.io.IOException)
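
The combine-input-format heuristic boils down to three numeric checks before doubling the split size. A self-contained sketch of that arithmetic follows; the constants and the method name are illustrative assumptions.

// Sketch of the split-size heuristic above: double the split size only if parallelism
// is preserved, the sort buffer can hold a full split, and there is a single input.
final class SplitSizeHeuristicSketch {

    static boolean useCombinedSplits(double totalInputSize, long hdfsBlockSize,
            long sortBufferSize, double spillPercent, int numMapSlots, int numInputs) {
        long newSplitSize = hdfsBlockSize * 2;
        return numMapSlots < totalInputSize / newSplitSize // degree of parallelism not hurt
            && sortBufferSize * spillPercent >= newSplitSize // sort buffer large enough
            && numInputs == 1; // single input only
    }

    public static void main(String[] args) {
        long blk = 128L << 20; // assumed 128 MB HDFS blocksize
        long sort = 512L << 20; // assumed 512 MB sort buffer
        // 100 GB input, 100 map slots, one input: 100 < 100GB/256MB (= 400), so combine.
        System.out.println(useCombinedSplits(100L << 30, blk, sort, 1.0, 100, 1)); // true
    }
}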

Example 65 with InputInfo

use of org.apache.sysml.runtime.matrix.data.InputInfo in project systemml by apache.

the class Connection method readStringFrame.

// //////////////////////////////////////////
// Read frames
// //////////////////////////////////////////
/**
 * Reads an input frame in arbitrary format from HDFS into a dense string array.
 * NOTE: this call currently only supports default configurations for CSV.
 *
 * @param fname the filename of the input frame
 * @return frame as a two-dimensional string array
 * @throws IOException if IOException occurs
 */
public String[][] readStringFrame(String fname) throws IOException {
    try {
        // read json meta data
        String fnamemtd = DataExpression.getMTDFileName(fname);
        JSONObject jmtd = new DataExpression().readMetadataFile(fnamemtd, false);
        // parse json meta data
        long rows = jmtd.getLong(DataExpression.READROWPARAM);
        long cols = jmtd.getLong(DataExpression.READCOLPARAM);
        String format = jmtd.getString(DataExpression.FORMAT_TYPE);
        InputInfo iinfo = InputInfo.stringExternalToInputInfo(format);
        // read frame file
        return readStringFrame(fname, iinfo, rows, cols);
    } catch (Exception ex) {
        throw new IOException(ex);
    }
}
Also used : DataExpression(org.apache.sysml.parser.DataExpression) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) JSONObject(org.apache.wink.json4j.JSONObject) IOException(java.io.IOException) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) DMLException(org.apache.sysml.api.DMLException) LanguageException(org.apache.sysml.parser.LanguageException) IOException(java.io.IOException) ParseException(org.apache.sysml.parser.ParseException)
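
Assuming this is the JMLC Connection (org.apache.sysml.api.jmlc.Connection), reading a frame this way only requires that a matching JSON metadata file exists next to the data. A minimal usage sketch; the file path is hypothetical.

import java.io.IOException;
import org.apache.sysml.api.jmlc.Connection;

// Reads a frame whose dimensions and format come from /data/myframe.csv.mtd
// (hypothetical path); the metadata file must exist for this call to succeed.
public class ReadFrameExample {
    public static void main(String[] args) throws IOException {
        Connection conn = new Connection();
        try {
            String[][] frame = conn.readStringFrame("/data/myframe.csv");
            System.out.println("rows=" + frame.length + ", cols=" + frame[0].length);
        } finally {
            conn.close();
        }
    }
}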

Aggregations

InputInfo (org.apache.sysml.runtime.matrix.data.InputInfo): 74 usages
DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException): 38 usages
OutputInfo (org.apache.sysml.runtime.matrix.data.OutputInfo): 30 usages
MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics): 26 usages
MetaDataFormat (org.apache.sysml.runtime.matrix.MetaDataFormat): 20 usages
MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock): 20 usages
IOException (java.io.IOException): 17 usages
JobConf (org.apache.hadoop.mapred.JobConf): 13 usages
RDDObject (org.apache.sysml.runtime.instructions.spark.data.RDDObject): 13 usages
MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject): 12 usages
JavaPairRDD (org.apache.spark.api.java.JavaPairRDD): 11 usages
RunningJob (org.apache.hadoop.mapred.RunningJob): 10 usages
Path (org.apache.hadoop.fs.Path): 9 usages
FrameBlock (org.apache.sysml.runtime.matrix.data.FrameBlock): 9 usages
MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes): 9 usages
DMLConfig (org.apache.sysml.conf.DMLConfig): 8 usages
ValueType (org.apache.sysml.parser.Expression.ValueType): 8 usages
SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext): 7 usages
CSVFileFormatProperties (org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties): 7 usages
Group (org.apache.hadoop.mapred.Counters.Group): 6 usages