Search in sources :

Example 71 with FrameBlock

use of org.apache.sysml.runtime.matrix.data.FrameBlock in project incubator-systemml by apache.

the class VariableCPInstruction method processInstruction.

@Override
public void processInstruction(ExecutionContext ec) {
    switch(opcode) {
        case CreateVariable:
            if (getInput1().getDataType() == DataType.MATRIX) {
                // create new variable for symbol table and cache
                // (existing objects get cleared through rmvar instructions)
                String fname = getInput2().getName();
                // check if unique filename needs to be generated
                if (Boolean.parseBoolean(getInput3().getName())) {
                    fname = new StringBuilder(fname.length() + 16).append(fname).append('_').append(_uniqueVarID.getNextID()).toString();
                }
                MatrixObject mobj = new MatrixObject(getInput1().getValueType(), fname);
                // clone metadata because it is updated on copy-on-write; otherwise there is
                // potential for hidden side effects between variables.
                mobj.setMetaData((MetaData) metadata.clone());
                mobj.setFileFormatProperties(_formatProperties);
                mobj.setUpdateType(_updateType);
                ec.setVariable(getInput1().getName(), mobj);
                if (DMLScript.STATISTICS && _updateType.isInPlace())
                    Statistics.incrementTotalUIPVar();
            } else if (getInput1().getDataType() == DataType.FRAME) {
                String fname = getInput2().getName();
                FrameObject fobj = new FrameObject(fname);
                fobj.setMetaData((MetaData) metadata.clone());
                fobj.setFileFormatProperties(_formatProperties);
                if (_schema != null)
                    fobj.setSchema(_schema); // set schema after metadata
                ec.setVariable(getInput1().getName(), fobj);
            } else if (getInput1().getDataType() == DataType.SCALAR) {
                // createvar does not materialize scalars; just register the variable name
                ec.setScalarOutput(getInput1().getName(), null);
            } else {
                throw new DMLRuntimeException("Unexpected data type: " + getInput1().getDataType());
            }
            break;
        case AssignVariable:
            // assign the value of one scalar variable to the other
            ec.setScalarOutput(getInput2().getName(), ec.getScalarInput(getInput1()));
            break;
        case CopyVariable:
            processCopyInstruction(ec);
            break;
        case MoveVariable:
            processMoveInstruction(ec);
            break;
        case RemoveVariable:
            for (CPOperand input : inputs) processRemoveVariableInstruction(ec, input.getName());
            break;
        case RemoveVariableAndFile:
            // Remove the variable from HashMap _variables, and possibly delete the data on disk.
            boolean del = ((BooleanObject) ec.getScalarInput(getInput2().getName(), getInput2().getValueType(), true)).getBooleanValue();
            MatrixObject m = (MatrixObject) ec.removeVariable(getInput1().getName());
            if (!del) {
                // the HDFS file is retained, therefore data must be exported if the dirty flag is set
                if (m.isDirty())
                    m.exportData();
            } else {
                // throw new DMLRuntimeException("rmfilevar w/ true is not expected! " + instString);
                // cleanDataOnHDFS(pb, input1.getName());
                cleanDataOnHDFS(m);
            }
            // check if in-memory object can be cleaned up
            if (!ec.getVariables().hasReferences(m)) {
                // no other variable in the symbol table points to the same Data object as that of input1.getName()
                // remove matrix object from cache
                m.clearData();
            }
            break;
        case CastAsScalarVariable:
            if (getInput1().getDataType() == DataType.FRAME) {
                FrameBlock fBlock = ec.getFrameInput(getInput1().getName());
                if (fBlock.getNumRows() != 1 || fBlock.getNumColumns() != 1)
                    throw new DMLRuntimeException("Dimension mismatch - unable to cast frame '" + getInput1().getName() + "' of dimension (" + fBlock.getNumRows() + " x " + fBlock.getNumColumns() + ") to scalar.");
                Object value = fBlock.get(0, 0);
                ec.releaseFrameInput(getInput1().getName());
                ec.setScalarOutput(output.getName(), ScalarObjectFactory.createScalarObject(fBlock.getSchema()[0], value));
            } else {
                // assume DataType.MATRIX otherwise
                MatrixBlock mBlock = ec.getMatrixInput(getInput1().getName(), getExtendedOpcode());
                if (mBlock.getNumRows() != 1 || mBlock.getNumColumns() != 1)
                    throw new DMLRuntimeException("Dimension mismatch - unable to cast matrix '" + getInput1().getName() + "' of dimension (" + mBlock.getNumRows() + " x " + mBlock.getNumColumns() + ") to scalar.");
                double value = mBlock.getValue(0, 0);
                ec.releaseMatrixInput(getInput1().getName(), getExtendedOpcode());
                ec.setScalarOutput(output.getName(), new DoubleObject(value));
            }
            break;
        case CastAsMatrixVariable:
            {
                MatrixBlock out = null;
                if (getInput1().getDataType() == DataType.FRAME) {
                    FrameBlock fin = ec.getFrameInput(getInput1().getName());
                    out = DataConverter.convertToMatrixBlock(fin);
                    ec.releaseFrameInput(getInput1().getName());
                } else {
                    // assume DataType.SCALAR otherwise
                    ScalarObject scalarInput = ec.getScalarInput(getInput1().getName(), getInput1().getValueType(), getInput1().isLiteral());
                    out = new MatrixBlock(1, 1, false);
                    out.quickSetValue(0, 0, scalarInput.getDoubleValue());
                }
                ec.setMatrixOutput(output.getName(), out, getExtendedOpcode());
                break;
            }
        case CastAsFrameVariable:
            {
                FrameBlock out = null;
                if (getInput1().getDataType() == DataType.SCALAR) {
                    ScalarObject scalarInput = ec.getScalarInput(getInput1());
                    out = new FrameBlock(1, getInput1().getValueType());
                    out.ensureAllocatedColumns(1);
                    out.set(0, 0, scalarInput.getStringValue());
                } else {
                    // assume DataType.MATRIX otherwise
                    MatrixBlock min = ec.getMatrixInput(getInput1().getName(), getExtendedOpcode());
                    out = DataConverter.convertToFrameBlock(min);
                    ec.releaseMatrixInput(getInput1().getName(), getExtendedOpcode());
                }
                ec.setFrameOutput(output.getName(), out);
                break;
            }
        case CastAsDoubleVariable:
            {
                ScalarObject scalarInput = ec.getScalarInput(getInput1());
                ec.setScalarOutput(output.getName(), new DoubleObject(scalarInput.getDoubleValue()));
                break;
            }
        case CastAsIntegerVariable:
            {
                ScalarObject scalarInput = ec.getScalarInput(getInput1());
                ec.setScalarOutput(output.getName(), new IntObject(scalarInput.getLongValue()));
                break;
            }
        case CastAsBooleanVariable:
            {
                ScalarObject scalarInput = ec.getScalarInput(getInput1());
                ec.setScalarOutput(output.getName(), new BooleanObject(scalarInput.getBooleanValue()));
                break;
            }
        case Read:
            ScalarObject res = null;
            try {
                switch(getInput1().getValueType()) {
                    case DOUBLE:
                        double d = MapReduceTool.readDoubleFromHDFSFile(getInput2().getName());
                        res = new DoubleObject(d);
                        break;
                    case INT:
                        long i = MapReduceTool.readIntegerFromHDFSFile(getInput2().getName());
                        res = new IntObject(i);
                        break;
                    case BOOLEAN:
                        boolean b = MapReduceTool.readBooleanFromHDFSFile(getInput2().getName());
                        res = new BooleanObject(b);
                        break;
                    case STRING:
                        String s = MapReduceTool.readStringFromHDFSFile(getInput2().getName());
                        res = new StringObject(s);
                        break;
                    default:
                        throw new DMLRuntimeException("Invalid value type (" + getInput1().getValueType() + ") while processing readScalar instruction.");
                }
            } catch (IOException e) {
                throw new DMLRuntimeException(e);
            }
            ec.setScalarOutput(getInput1().getName(), res);
            break;
        case Write:
            processWriteInstruction(ec);
            break;
        case SetFileName:
            Data data = ec.getVariable(getInput1().getName());
            if (data.getDataType() == DataType.MATRIX) {
                if (getInput3().getName().equalsIgnoreCase("remote")) {
                    ((MatrixObject) data).setFileName(getInput2().getName());
                } else {
                    throw new DMLRuntimeException("Invalid location (" + getInput3().getName() + ") in SetFileName instruction: " + instString);
                }
            } else {
                throw new DMLRuntimeException("Invalid data type (" + getInput1().getDataType() + ") in SetFileName instruction: " + instString);
            }
            break;
        default:
            throw new DMLRuntimeException("Unknown opcode: " + opcode);
    }
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) FrameObject(org.apache.sysml.runtime.controlprogram.caching.FrameObject) CacheableData(org.apache.sysml.runtime.controlprogram.caching.CacheableData) MetaData(org.apache.sysml.runtime.matrix.MetaData) IOException(java.io.IOException) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) FrameBlock(org.apache.sysml.runtime.matrix.data.FrameBlock) MetaData(org.apache.sysml.runtime.matrix.MetaData) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) FrameObject(org.apache.sysml.runtime.controlprogram.caching.FrameObject)
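
Below is a minimal, self-contained sketch of the cast paths in this example (not part of the instruction code; the class name FrameScalarCastSketch is invented for illustration). It builds a 1x1 FrameBlock the way CastAsFrameVariable does for a scalar input, checks the 1x1 precondition that CastAsScalarVariable enforces, and converts the frame to a matrix via the same DataConverter call used by CastAsMatrixVariable.

import org.apache.sysml.parser.Expression.ValueType;
import org.apache.sysml.runtime.matrix.data.FrameBlock;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.util.DataConverter;

public class FrameScalarCastSketch {
    public static void main(String[] args) {
        // build a 1x1 double frame, as CastAsFrameVariable does for a scalar input
        FrameBlock fb = new FrameBlock(1, ValueType.DOUBLE);
        fb.ensureAllocatedColumns(1);
        fb.set(0, 0, 7.0);

        // CastAsScalarVariable insists on exactly 1x1 dimensions before reading the cell
        if (fb.getNumRows() != 1 || fb.getNumColumns() != 1)
            throw new IllegalStateException("not a 1x1 frame");
        Object value = fb.get(0, 0);

        // CastAsMatrixVariable delegates frame-to-matrix conversion to DataConverter
        MatrixBlock mb = DataConverter.convertToMatrixBlock(fb);
        System.out.println(value + " -> " + mb.getValue(0, 0)); // 7.0 -> 7.0
    }
}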

Example 72 with FrameBlock

use of org.apache.sysml.runtime.matrix.data.FrameBlock in project incubator-systemml by apache.

the class CheckpointSPInstruction method processInstruction.

@Override
@SuppressWarnings("unchecked")
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    // early abort on non-existing inputs (e.g., variables overwritten in conditional branches;
    // this is valid if the relevant branches are never entered)
    if (sec.getVariable(input1.getName()) == null || sec.getVariable(input1.getName()) instanceof BooleanObject) {
        // add a dummy entry to the input, which will be immediately overwritten by the null output.
        sec.setVariable(input1.getName(), new BooleanObject(false));
        sec.setVariable(output.getName(), new BooleanObject(false));
        return;
    }
    // Step 1: early abort on cached (in-memory) inputs
    // -------
    // (for csv input files with unknown dimensions, we might have generated a checkpoint after
    // csvreblock although not necessary because the csvreblock was subject to in-memory reblock)
    CacheableData<?> obj = sec.getCacheableData(input1.getName());
    if (obj.isCached(true)) {
        // available in memory
        sec.setVariable(output.getName(), obj);
        return;
    }
    // get input rdd handle (for matrix or frame)
    JavaPairRDD<?, ?> in = sec.getRDDHandleForVariable(input1.getName(), InputInfo.BinaryBlockInputInfo);
    MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(input1.getName());
    // Step 2: Checkpoint given rdd (only if currently in different storage level to prevent redundancy)
    // -------
    // Note that persist is a transformation, triggered on demand by subsequent RDD operations;
    // this prevents unnecessary overhead if the dataset is only consumed by CP operations.
    JavaPairRDD<?, ?> out = null;
    if (!in.getStorageLevel().equals(_level)) {
        // (trigger coalesce if the intended number of partitions is exceeded by more than 20%
        // and the input is not hash partitioned, to avoid losing the existing partitioner)
        int numPartitions = SparkUtils.getNumPreferredPartitions(mcIn, in);
        boolean coalesce = (1.2 * numPartitions < in.getNumPartitions() && !SparkUtils.isHashPartitioned(in) && in.getNumPartitions() > SparkExecutionContext.getDefaultParallelism(true));
        // checkpoint pre-processing rdd operations
        if (coalesce) {
            // merge partitions without shuffle if too many partitions
            out = in.coalesce(numPartitions);
        } else {
            // apply a narrow shallow copy to allow for short-circuit collects
            if (input1.getDataType() == DataType.MATRIX)
                out = SparkUtils.copyBinaryBlockMatrix((JavaPairRDD<MatrixIndexes, MatrixBlock>) in, false);
            else if (input1.getDataType() == DataType.FRAME)
                out = ((JavaPairRDD<Long, FrameBlock>) in).mapValues(new CopyFrameBlockFunction(false));
        }
        // convert mcsr into memory-efficient csr if potentially sparse
        if (input1.getDataType() == DataType.MATRIX && OptimizerUtils.checkSparseBlockCSRConversion(mcIn) && !_level.equals(Checkpoint.SER_STORAGE_LEVEL)) {
            out = ((JavaPairRDD<MatrixIndexes, MatrixBlock>) out).mapValues(new CreateSparseBlockFunction(SparseBlock.Type.CSR));
        }
        // actual checkpoint into given storage level
        out = out.persist(_level);
        // trigger nnz computation if unknown; otherwise their nnz would never be evaluated
        // due to lazy evaluation in Spark
        if (input1.isMatrix() && mcIn.dimsKnown() && !mcIn.dimsKnown(true) && !OptimizerUtils.isValidCPDimensions(mcIn)) {
            mcIn.setNonZeros(SparkUtils.getNonZeros((JavaPairRDD<MatrixIndexes, MatrixBlock>) out));
        }
    } else {
        // pass-through
        out = in;
    }
    // Step 3: In-place update of input matrix/frame rdd handle and set as output
    // -------
    // We use this in-place approach for two reasons. First, it is correct because our checkpoint
    // injection rewrites guarantee that after checkpoint instructions there are no consumers on the
    // given input. Second, it is beneficial because otherwise we need to pass in-memory objects and
    // filenames to the new matrix object in order to prevent repeated reads from hdfs and unnecessary
    // caching and subsequent collects. Note that in-place update requires us to explicitly handle
    // lineage information in order to prevent cycles on cleanup.
    CacheableData<?> cd = sec.getCacheableData(input1.getName());
    if (out != in) { // prevent unnecessary lineage info
        RDDObject inro = cd.getRDDHandle(); // guaranteed to exist (see above)
        RDDObject outro = new RDDObject(out); // create new RDD object
        outro.setCheckpointRDD(true); // mark as checkpointed
        outro.addLineageChild(inro); // keep lineage to prevent cycles on cleanup
        cd.setRDDHandle(outro);
    }
    sec.setVariable(output.getName(), cd);
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) Checkpoint(org.apache.sysml.lops.Checkpoint) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) CreateSparseBlockFunction(org.apache.sysml.runtime.instructions.spark.functions.CreateSparseBlockFunction) FrameBlock(org.apache.sysml.runtime.matrix.data.FrameBlock) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext) CopyFrameBlockFunction(org.apache.sysml.runtime.instructions.spark.functions.CopyFrameBlockFunction) BooleanObject(org.apache.sysml.runtime.instructions.cp.BooleanObject)
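
The coalesce decision above packs three conditions into one expression. The following sketch (a hypothetical helper, not SystemML API) restates it as a standalone predicate so the 20% threshold is easy to see.

public class CoalesceHeuristicSketch {
    // coalesce only if the actual partition count exceeds the preferred count by more
    // than 20%, the RDD is not hash partitioned, and we are above default parallelism
    static boolean shouldCoalesce(int preferredPartitions, int actualPartitions,
                                  boolean hashPartitioned, int defaultParallelism) {
        return 1.2 * preferredPartitions < actualPartitions
            && !hashPartitioned
            && actualPartitions > defaultParallelism;
    }

    public static void main(String[] args) {
        // 150 actual vs 100 preferred partitions: 150 > 120, so coalesce
        System.out.println(shouldCoalesce(100, 150, false, 32)); // true
        // hash-partitioned input keeps its partitioner, so no coalesce
        System.out.println(shouldCoalesce(100, 150, true, 32));  // false
    }
}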

Example 73 with FrameBlock

use of org.apache.sysml.runtime.matrix.data.FrameBlock in project incubator-systemml by apache.

the class FrameAppendRSPInstruction method processInstruction.

@Override
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    JavaPairRDD<Long, FrameBlock> in1 = sec.getFrameBinaryBlockRDDHandleForVariable(input1.getName());
    JavaPairRDD<Long, FrameBlock> in2 = sec.getFrameBinaryBlockRDDHandleForVariable(input2.getName());
    JavaPairRDD<Long, FrameBlock> out = null;
    long leftRows = sec.getMatrixCharacteristics(input1.getName()).getRows();
    if (_cbind) {
        JavaPairRDD<Long, FrameBlock> in1Aligned = in1.mapToPair(new ReduceSideAppendAlignFunction(leftRows));
        in1Aligned = FrameRDDAggregateUtils.mergeByKey(in1Aligned);
        JavaPairRDD<Long, FrameBlock> in2Aligned = in2.mapToPair(new ReduceSideAppendAlignFunction(leftRows));
        in2Aligned = FrameRDDAggregateUtils.mergeByKey(in2Aligned);
        out = in1Aligned.join(in2Aligned).mapValues(new ReduceSideColumnsFunction(_cbind));
    } else {
        // rbind
        JavaPairRDD<Long, FrameBlock> right = in2.mapToPair(new ReduceSideAppendRowsFunction(leftRows));
        out = in1.union(right);
    }
    // put output RDD handle into symbol table
    updateBinaryAppendOutputMatrixCharacteristics(sec, _cbind);
    sec.setRDDHandleForVariable(output.getName(), out);
    sec.addLineageRDD(output.getName(), input1.getName());
    sec.addLineageRDD(output.getName(), input2.getName());
    // update schema of output with merged input schemas
    sec.getFrameObject(output.getName()).setSchema(sec.getFrameObject(input1.getName()).mergeSchemas(sec.getFrameObject(input2.getName())));
}
Also used : FrameBlock(org.apache.sysml.runtime.matrix.data.FrameBlock) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext)
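
In the rbind branch, ReduceSideAppendRowsFunction (internals not shown here) shifts the row keys of the right input past the left input before the union. A plain-Java sketch of that key arithmetic, with invented block names and assuming blocks are keyed by their 1-based starting row index:

import java.util.Map;
import java.util.TreeMap;

public class RbindKeyShiftSketch {
    public static void main(String[] args) {
        long leftRows = 1000; // row count of the left frame, as queried above

        // frame blocks keyed by their 1-based starting row index
        Map<Long, String> right = new TreeMap<>();
        right.put(1L, "right-block-1");
        right.put(501L, "right-block-2");

        // shifting every right key by leftRows makes the subsequent union
        // produce one consistent global row indexing
        Map<Long, String> shifted = new TreeMap<>();
        right.forEach((k, v) -> shifted.put(k + leftRows, v));
        System.out.println(shifted); // {1001=right-block-1, 1501=right-block-2}
    }
}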

Example 74 with FrameBlock

use of org.apache.sysml.runtime.matrix.data.FrameBlock in project incubator-systemml by apache.

the class ReblockSPInstruction method processFrameReblockInstruction.

@SuppressWarnings("unchecked")
protected void processFrameReblockInstruction(SparkExecutionContext sec, InputInfo iinfo) {
    FrameObject fo = sec.getFrameObject(input1.getName());
    MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
    if (iinfo == InputInfo.TextCellInputInfo) {
        // get the input textcell rdd
        JavaPairRDD<LongWritable, Text> lines = (JavaPairRDD<LongWritable, Text>) sec.getRDDHandleForVariable(input1.getName(), iinfo);
        // convert textcell to binary block
        JavaPairRDD<Long, FrameBlock> out = FrameRDDConverterUtils.textCellToBinaryBlock(sec.getSparkContext(), lines, mcOut, fo.getSchema());
        // put output RDD handle into symbol table
        sec.setRDDHandleForVariable(output.getName(), out);
        sec.addLineageRDD(output.getName(), input1.getName());
    } else if (iinfo == InputInfo.CSVInputInfo) {
        // HACK ALERT: until we introduce the rewrite to insert csvreblock for non-persistent reads
        // throw new DMLRuntimeException("CSVInputInfo is not supported for ReblockSPInstruction");
        CSVReblockSPInstruction csvInstruction = null;
        boolean hasHeader = false;
        String delim = ",";
        boolean fill = false;
        double fillValue = 0;
        if (fo.getFileFormatProperties() instanceof CSVFileFormatProperties) { // instanceof implies non-null
            CSVFileFormatProperties props = (CSVFileFormatProperties) fo.getFileFormatProperties();
            hasHeader = props.hasHeader();
            delim = props.getDelim();
            fill = props.isFill();
            fillValue = props.getFillValue();
        }
        csvInstruction = new CSVReblockSPInstruction(null, input1, output, mcOut.getRowsPerBlock(), mcOut.getColsPerBlock(), hasHeader, delim, fill, fillValue, "csvrblk", instString);
        csvInstruction.processInstruction(sec);
    } else {
        throw new DMLRuntimeException("The given InputInfo is not implemented " + "for ReblockSPInstruction: " + InputInfo.inputInfoToString(iinfo));
    }
}
Also used : CSVFileFormatProperties(org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties) FrameObject(org.apache.sysml.runtime.controlprogram.caching.FrameObject) Text(org.apache.hadoop.io.Text) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) FrameBlock(org.apache.sysml.runtime.matrix.data.FrameBlock) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) LongWritable(org.apache.hadoop.io.LongWritable)
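
The CSV branch above falls back to default format settings whenever no CSVFileFormatProperties are attached to the frame. A minimal sketch of that resolution logic (CsvSettings and resolve are hypothetical names; the property getters are the ones used above):

import org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties;

public class CsvDefaultsSketch {
    static final class CsvSettings {
        boolean hasHeader = false; // defaults, as in the instruction above
        String delim = ",";
        boolean fill = false;
        double fillValue = 0;
    }

    static CsvSettings resolve(Object ffp) {
        CsvSettings s = new CsvSettings();
        if (ffp instanceof CSVFileFormatProperties) { // instanceof already implies non-null
            CSVFileFormatProperties props = (CSVFileFormatProperties) ffp;
            s.hasHeader = props.hasHeader();
            s.delim = props.getDelim();
            s.fill = props.isFill();
            s.fillValue = props.getFillValue();
        }
        return s;
    }

    public static void main(String[] args) {
        CsvSettings s = resolve(null); // no properties attached -> defaults
        System.out.println(s.hasHeader + " " + s.delim + " " + s.fill + " " + s.fillValue);
    }
}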

Example 75 with FrameBlock

use of org.apache.sysml.runtime.matrix.data.FrameBlock in project incubator-systemml by apache.

the class WriteSPInstruction method processFrameWriteInstruction.

@SuppressWarnings("unchecked")
protected void processFrameWriteInstruction(SparkExecutionContext sec, String fname, OutputInfo oi, ValueType[] schema) throws IOException {
    // get input rdd
    JavaPairRDD<Long, FrameBlock> in1 = (JavaPairRDD<Long, FrameBlock>) sec.getRDDHandleForVariable(input1.getName(), InputInfo.BinaryBlockInputInfo);
    MatrixCharacteristics mc = sec.getMatrixCharacteristics(input1.getName());
    if (oi == OutputInfo.TextCellOutputInfo) {
        JavaRDD<String> out = FrameRDDConverterUtils.binaryBlockToTextCell(in1, mc);
        customSaveTextFile(out, fname, false);
    } else if (oi == OutputInfo.CSVOutputInfo) {
        CSVFileFormatProperties props = (formatProperties != null) ? (CSVFileFormatProperties) formatProperties : null;
        JavaRDD<String> out = FrameRDDConverterUtils.binaryBlockToCsv(in1, mc, props, true);
        customSaveTextFile(out, fname, false);
    } else if (oi == OutputInfo.BinaryBlockOutputInfo) {
        JavaPairRDD<LongWritable, FrameBlock> out = in1.mapToPair(new LongFrameToLongWritableFrameFunction());
        out.saveAsHadoopFile(fname, LongWritable.class, FrameBlock.class, SequenceFileOutputFormat.class);
    } else {
        // unsupported formats: binarycell (not externalized)
        throw new DMLRuntimeException("Unexpected data format: " + OutputInfo.outputInfoToString(oi));
    }
    // write meta data file
    MapReduceTool.writeMetaDataFile(fname + ".mtd", input1.getValueType(), schema, DataType.FRAME, mc, oi, formatProperties);
}
Also used : CSVFileFormatProperties(org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) JavaRDD(org.apache.spark.api.java.JavaRDD) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) FrameBlock(org.apache.sysml.runtime.matrix.data.FrameBlock) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) LongWritable(org.apache.hadoop.io.LongWritable) LongFrameToLongWritableFrameFunction(org.apache.sysml.runtime.instructions.spark.utils.FrameRDDConverterUtils.LongFrameToLongWritableFrameFunction)
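
In the binary-block branch, saveAsHadoopFile requires Hadoop Writable key and value classes, so the java.lang.Long keys are rewrapped first; that is what LongFrameToLongWritableFrameFunction does for the key. A sketch of the rewrapping in plain Java (class name invented, no Spark):

import org.apache.hadoop.io.LongWritable;

public class LongKeyRewrapSketch {
    public static void main(String[] args) {
        Long key = 1L; // 1-based row index of a frame block
        LongWritable writableKey = new LongWritable(key); // Writable wrapper for Hadoop I/O
        System.out.println(writableKey.get()); // 1
    }
}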

Aggregations

FrameBlock (org.apache.sysml.runtime.matrix.data.FrameBlock): 90 usages
DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException): 28 usages
MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics): 26 usages
ValueType (org.apache.sysml.parser.Expression.ValueType): 23 usages
MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock): 23 usages
FrameReader (org.apache.sysml.runtime.io.FrameReader): 18 usages
IOException (java.io.IOException): 16 usages
RUNTIME_PLATFORM (org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM): 16 usages
FrameObject (org.apache.sysml.runtime.controlprogram.caching.FrameObject): 15 usages
LongWritable (org.apache.hadoop.io.LongWritable): 12 usages
JavaPairRDD (org.apache.spark.api.java.JavaPairRDD): 11 usages
CSVFileFormatProperties (org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties): 11 usages
FrameWriter (org.apache.sysml.runtime.io.FrameWriter): 9 usages
TestConfiguration (org.apache.sysml.test.integration.TestConfiguration): 8 usages
Text (org.apache.hadoop.io.Text): 7 usages
SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext): 7 usages
RDDObject (org.apache.sysml.runtime.instructions.spark.data.RDDObject): 7 usages
ConvertStringToLongTextPair (org.apache.sysml.runtime.instructions.spark.functions.ConvertStringToLongTextPair): 6 usages
CopyTextInputFunction (org.apache.sysml.runtime.instructions.spark.functions.CopyTextInputFunction): 5 usages
MetaDataFormat (org.apache.sysml.runtime.matrix.MetaDataFormat): 5 usages