Search in sources :

Example 41 with InputInfo

use of org.apache.sysml.runtime.matrix.data.InputInfo in project incubator-systemml by apache.

the class Dag method generateMapReduceInstructions.

/**
 * Method to generate MapReduce job instructions from a given set of nodes.
 * <p>
 * Builds a single {@code MRJobInstruction} of job type {@code jt} from the root
 * (output-producing) nodes found in {@code execNodes} and appends it, together
 * with the supporting variable/post instructions, to {@code inst}. Instructions
 * returned by {@code setupNodeOutputs} as "last instructions" go to
 * {@code writeinst} (root transient writes whose input is also a root node) or
 * to {@code deleteinst} (all other root nodes).
 * <p>
 * For GMR and GMRCELL jobs the method returns without adding anything when no
 * root node remains after dropping transient writes that merely copy a
 * transient read with the same label.
 *
 * @param execNodes list of exec nodes
 * @param inst list of instructions
 * @param writeinst list of write instructions
 * @param deleteinst list of delete instructions
 * @param rmvarinst list of rmvar instructions
 * @param jt job type
 * @throws LopsException if more than one recordreader instruction is collected,
 *         or if {@code getAggAndOtherInstructions} returns -1 for a root node
 */
private void generateMapReduceInstructions(ArrayList<Lop> execNodes, ArrayList<Instruction> inst, ArrayList<Instruction> writeinst, ArrayList<Instruction> deleteinst, ArrayList<Instruction> rmvarinst, JobType jt) {
    ArrayList<Byte> resultIndices = new ArrayList<>();
    ArrayList<String> inputs = new ArrayList<>();
    // note: 'outputs' is filled below but only read by the trace logging in this method
    ArrayList<String> outputs = new ArrayList<>();
    ArrayList<InputInfo> inputInfos = new ArrayList<>();
    ArrayList<OutputInfo> outputInfos = new ArrayList<>();
    // dimension lists are handed to getInputPathsAndParameters; they are not read again in this method
    ArrayList<Long> numRows = new ArrayList<>();
    ArrayList<Long> numCols = new ArrayList<>();
    ArrayList<Long> numRowsPerBlock = new ArrayList<>();
    ArrayList<Long> numColsPerBlock = new ArrayList<>();
    ArrayList<String> mapperInstructions = new ArrayList<>();
    ArrayList<String> randInstructions = new ArrayList<>();
    ArrayList<String> recordReaderInstructions = new ArrayList<>();
    // stays 0 (map-only job) unless shuffle/reducer instructions are found below
    int numReducers = 0;
    // never modified in this method
    int replication = 1;
    ArrayList<String> inputLabels = new ArrayList<>();
    ArrayList<String> outputLabels = new ArrayList<>();
    ArrayList<Instruction> renameInstructions = new ArrayList<>();
    ArrayList<Instruction> variableInstructions = new ArrayList<>();
    ArrayList<Instruction> postInstructions = new ArrayList<>();
    // line numbers are only tracked in debug mode; otherwise null is passed to the helpers
    ArrayList<Integer> MRJobLineNumbers = null;
    if (DMLScript.ENABLE_DEBUG_MODE) {
        MRJobLineNumbers = new ArrayList<>();
    }
    ArrayList<Lop> inputLops = new ArrayList<>();
    boolean cellModeOverride = false;
    /* Find the nodes that produce an output */
    ArrayList<Lop> rootNodes = new ArrayList<>();
    getOutputNodes(execNodes, rootNodes, jt);
    if (LOG.isTraceEnabled())
        LOG.trace("# of root nodes = " + rootNodes.size());
    /* Remove transient writes that are simple copy of transient reads */
    if (jt == JobType.GMR || jt == JobType.GMRCELL) {
        ArrayList<Lop> markedNodes = new ArrayList<>();
        // only keep data nodes that are results of some computation.
        for (Lop rnode : rootNodes) {
            if (rnode.getExecLocation() == ExecLocation.Data && ((Data) rnode).isTransient() && ((Data) rnode).getOperationType() == OperationTypes.WRITE && ((Data) rnode).getDataType() == DataType.MATRIX) {
                // no computation, just a copy
                // (the first input is itself a transient data node carrying the same label)
                if (rnode.getInputs().get(0).getExecLocation() == ExecLocation.Data && ((Data) rnode.getInputs().get(0)).isTransient() && rnode.getOutputParameters().getLabel().equals(rnode.getInputs().get(0).getOutputParameters().getLabel())) {
                    markedNodes.add(rnode);
                }
            }
        }
        // delete marked nodes
        rootNodes.removeAll(markedNodes);
        markedNodes.clear();
        // nothing left to compute: no job is generated
        if (rootNodes.isEmpty())
            return;
    }
    // structure that maps node to their indices that will be used in the instructions
    HashMap<Lop, Integer> nodeIndexMapping = new HashMap<>();
    for (Lop rnode : rootNodes) {
        getInputPathsAndParameters(rnode, execNodes, inputs, inputInfos, numRows, numCols, numRowsPerBlock, numColsPerBlock, nodeIndexMapping, inputLabels, inputLops, MRJobLineNumbers);
    }
    // In case of RAND job, instructions are defined in the input file
    // (randInstructions becomes an alias of the 'inputs' list, not a copy)
    if (jt == JobType.DATAGEN)
        randInstructions = inputs;
    // single-element array so that the helpers below can update the running index in place
    // (presumably the next free index after the inputs; verify against the helpers)
    int[] start_index = new int[1];
    start_index[0] = inputs.size();
    // currently, recordreader instructions are allowed only in GMR jobs
    if (jt == JobType.GMR || jt == JobType.GMRCELL) {
        for (Lop rnode : rootNodes) {
            getRecordReaderInstructions(rnode, execNodes, inputs, recordReaderInstructions, nodeIndexMapping, start_index, inputLabels, inputLops, MRJobLineNumbers);
            if (recordReaderInstructions.size() > 1)
                throw new LopsException("MapReduce job can only have a single recordreader instruction: " + recordReaderInstructions.toString());
        }
    }
    // set cellModeOverride if any input is in binary-cell or text-cell format
    // (skipped for REBLOCK, CSV_REBLOCK and DATAGEN jobs)
    if (jt != JobType.REBLOCK && jt != JobType.CSV_REBLOCK && jt != JobType.DATAGEN) {
        for (int i = 0; i < inputInfos.size(); i++) if (inputInfos.get(i) == InputInfo.BinaryCellInputInfo || inputInfos.get(i) == InputInfo.TextCellInputInfo)
            cellModeOverride = true;
    }
    // recordreader instructions and grouped-aggregate jobs also set cellModeOverride
    if (!recordReaderInstructions.isEmpty() || jt == JobType.GROUPED_AGG)
        cellModeOverride = true;
    // collect the mapper instructions for every root node
    for (int i = 0; i < rootNodes.size(); i++) {
        getMapperInstructions(rootNodes.get(i), execNodes, inputs, mapperInstructions, nodeIndexMapping, start_index, inputLabels, inputLops, MRJobLineNumbers);
    }
    if (LOG.isTraceEnabled()) {
        LOG.trace("    Input strings: " + inputs.toString());
        if (jt == JobType.DATAGEN)
            LOG.trace("    Rand instructions: " + getCSVString(randInstructions));
        // note: traced for GMR only, although recordreader instructions are also collected for GMRCELL
        if (jt == JobType.GMR)
            LOG.trace("    RecordReader instructions: " + getCSVString(recordReaderInstructions));
        LOG.trace("    Mapper instructions: " + getCSVString(mapperInstructions));
    }
    /* Get Shuffle and Reducer Instructions */
    ArrayList<String> shuffleInstructions = new ArrayList<>();
    ArrayList<String> aggInstructionsReducer = new ArrayList<>();
    ArrayList<String> otherInstructionsReducer = new ArrayList<>();
    for (Lop rn : rootNodes) {
        int resultIndex = getAggAndOtherInstructions(rn, execNodes, shuffleInstructions, aggInstructionsReducer, otherInstructionsReducer, nodeIndexMapping, start_index, inputLabels, inputLops, MRJobLineNumbers);
        // -1 is treated as a failure of the piggybacking logic
        if (resultIndex == -1)
            throw new LopsException("Unexpected error in piggybacking!");
        if (rn.getExecLocation() == ExecLocation.Data && ((Data) rn).getOperationType() == Data.OperationTypes.WRITE && ((Data) rn).isTransient() && rootNodes.contains(rn.getInputs().get(0))) {
            // Both rn (a transient write) and its input are root nodes.
            // Instead of creating two copies of the data, simply generate a cpvar instruction
            // (no result index / output entry is recorded for rn in this case)
            NodeOutput out = setupNodeOutputs(rn, ExecType.MR, cellModeOverride, true);
            writeinst.addAll(out.getLastInstructions());
        } else {
            // note: the int result index is narrowed to a byte here
            resultIndices.add(Byte.valueOf((byte) resultIndex));
            // setup output filenames and outputInfos and generate related instructions
            NodeOutput out = setupNodeOutputs(rn, ExecType.MR, cellModeOverride, false);
            outputLabels.add(out.getVarName());
            outputs.add(out.getFileName());
            outputInfos.add(out.getOutInfo());
            if (LOG.isTraceEnabled()) {
                LOG.trace("    Output Info: " + out.getFileName() + ";" + OutputInfo.outputInfoToString(out.getOutInfo()) + ";" + out.getVarName());
            }
            renameInstructions.addAll(out.getLastInstructions());
            variableInstructions.addAll(out.getPreInstructions());
            postInstructions.addAll(out.getPostInstructions());
        }
    }
    /* Unbox the collected result indices into a primitive byte array for the job instruction */
    byte[] resultIndicesByte = new byte[resultIndices.size()];
    for (int i = 0; i < resultIndicesByte.length; i++) {
        resultIndicesByte[i] = resultIndices.get(i).byteValue();
    }
    if (LOG.isTraceEnabled()) {
        LOG.trace("    Shuffle Instructions: " + getCSVString(shuffleInstructions));
        LOG.trace("    Aggregate Instructions: " + getCSVString(aggInstructionsReducer));
        LOG.trace("    Other instructions =" + getCSVString(otherInstructionsReducer));
        LOG.trace("    Output strings: " + outputs.toString());
        LOG.trace("    ResultIndices = " + resultIndices.toString());
    }
    /* Prepare the MapReduce job instruction */
    MRJobInstruction mr = new MRJobInstruction(jt);
    // check if this is a map-only job. If not, set the number of reducers
    if (!shuffleInstructions.isEmpty() || !aggInstructionsReducer.isEmpty() || !otherInstructionsReducer.isEmpty())
        numReducers = total_reducers;
    // set inputs, outputs, and other other properties for the job
    mr.setInputOutputLabels(inputLabels.toArray(new String[0]), outputLabels.toArray(new String[0]));
    mr.setOutputs(resultIndicesByte);
    mr.setDimsUnknownFilePrefix(getFilePath());
    mr.setNumberOfReducers(numReducers);
    mr.setReplication(replication);
    // set instructions for recordReader and mapper
    mr.setRecordReaderInstructions(getCSVString(recordReaderInstructions));
    mr.setMapperInstructions(getCSVString(mapperInstructions));
    // compute and set mapper memory requirements (for consistency of runtime piggybacking)
    // (sum of computeFootprintInMapper over all exec nodes; done for GMR jobs only)
    if (jt == JobType.GMR) {
        double mem = 0;
        for (Lop n : execNodes) mem += computeFootprintInMapper(n);
        mr.setMemoryRequirements(mem);
    }
    if (jt == JobType.DATAGEN)
        mr.setRandInstructions(getCSVString(randInstructions));
    // set shuffle instructions
    mr.setShuffleInstructions(getCSVString(shuffleInstructions));
    // set reducer instruction
    mr.setAggregateInstructionsInReducer(getCSVString(aggInstructionsReducer));
    mr.setOtherInstructionsInReducer(getCSVString(otherInstructionsReducer));
    if (DMLScript.ENABLE_DEBUG_MODE) {
        // set line number information for each MR instruction
        mr.setMRJobInstructionsLineNumbers(MRJobLineNumbers);
    }
    /* Add the prepared instructions to output set */
    // order matters: variable (pre) instructions, then the MR job, then post instructions
    inst.addAll(variableInstructions);
    inst.add(mr);
    inst.addAll(postInstructions);
    deleteinst.addAll(renameInstructions);
    // process the consumers of every input lop; in debug mode the lop itself is
    // passed as the last argument, otherwise null
    for (Lop l : inputLops) {
        if (DMLScript.ENABLE_DEBUG_MODE) {
            processConsumers(l, rmvarinst, deleteinst, l);
        } else {
            processConsumers(l, rmvarinst, deleteinst, null);
        }
    }
}
Also used : HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) MRJobInstruction(org.apache.sysml.runtime.instructions.MRJobInstruction) CPInstruction(org.apache.sysml.runtime.instructions.cp.CPInstruction) Instruction(org.apache.sysml.runtime.instructions.Instruction) VariableCPInstruction(org.apache.sysml.runtime.instructions.cp.VariableCPInstruction) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) MRJobInstruction(org.apache.sysml.runtime.instructions.MRJobInstruction) Data(org.apache.sysml.lops.Data) Lop(org.apache.sysml.lops.Lop) OutputInfo(org.apache.sysml.runtime.matrix.data.OutputInfo) LopsException(org.apache.sysml.lops.LopsException)

Example 42 with InputInfo

use of org.apache.sysml.runtime.matrix.data.InputInfo in project incubator-systemml by apache.

the class Connection method convertToFrame.

/**
 * Reads a frame from an input stream and materializes it as a frame block.
 * <p>
 * Accepted values for {@code format} are the csv, text, and mm format
 * constants of {@code DataExpression}. A csv stream is read with the csv
 * input info; the other two accepted formats are read with the text-cell
 * input info.
 *
 * @param input InputStream to a string frame in csv or textcell format
 * @param rows number of rows in the frame
 * @param cols number of columns in the frame
 * @param format input format of the given stream
 * @return frame as a frame block
 * @throws IOException if the format is not accepted, if reading fails, or
 *         if a DMLRuntimeException is raised while reading (wrapped as cause)
 */
public FrameBlock convertToFrame(InputStream input, int rows, int cols, String format) throws IOException {
    // validate the requested format before touching any configuration
    final boolean isText = DataExpression.FORMAT_TYPE_VALUE_TEXT.equals(format);
    final boolean isMatrixMarket = DataExpression.FORMAT_TYPE_VALUE_MATRIXMARKET.equals(format);
    final boolean isCsv = DataExpression.FORMAT_TYPE_VALUE_CSV.equals(format);
    if (!isText && !isMatrixMarket && !isCsv) {
        throw new IOException("Invalid input format (expected: csv, text or mm): " + format);
    }

    setLocalConfigs();

    try {
        // csv gets its own reader; everything else is parsed as text cells
        InputInfo readerInfo = isCsv ? InputInfo.CSVInputInfo : InputInfo.TextCellInputInfo;
        FrameReader frameReader = FrameReaderFactory.createFrameReader(readerInfo);
        return frameReader.readFrameFromInputStream(input, rows, cols);
    } catch (DMLRuntimeException rex) {
        // surface runtime failures through the declared checked exception
        throw new IOException(rex);
    }
}
Also used : InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) FrameBlock(org.apache.sysml.runtime.matrix.data.FrameBlock) IOException(java.io.IOException) FrameReader(org.apache.sysml.runtime.io.FrameReader) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)

Example 43 with InputInfo

use of org.apache.sysml.runtime.matrix.data.InputInfo in project incubator-systemml by apache.

the class FrameConverterTest method runConverter.

/**
 * Runs a single frame conversion of the given type on Spark, reading from
 * {@code fnameIn} and writing the converted result to {@code fnameOut}.
 * <p>
 * Any existing file at {@code fnameOut} is deleted first. The execution
 * context created for the conversion is closed in a {@code finally} block so
 * that it is released even when the conversion fails or the converter type is
 * not supported.
 *
 * @param type conversion to perform
 * @param mc characteristics passed to the frame converters
 * @param mcMatrix characteristics passed to the matrix&lt;-&gt;frame converters (MAT2BIN, BIN2MAT)
 * @param schema frame schema (used by TXTCELL2BIN, DFRM2BIN and BIN2DFRM)
 * @param fnameIn input file name
 * @param fnameOut output file name
 * @throws IOException if an I/O error occurs
 * @throws RuntimeException if {@code type} is not one of the handled converter types
 */
@SuppressWarnings("unchecked")
private static void runConverter(ConvType type, MatrixCharacteristics mc, MatrixCharacteristics mcMatrix, List<ValueType> schema, String fnameIn, String fnameOut) throws IOException {
    SparkExecutionContext sec = (SparkExecutionContext) ExecutionContextFactory.createContext();
    try {
        JavaSparkContext sc = sec.getSparkContext();
        ValueType[] lschema = schema.toArray(new ValueType[0]);
        MapReduceTool.deleteFileIfExistOnHDFS(fnameOut);
        switch(type) {
            case CSV2BIN:
                {
                    InputInfo iinfo = InputInfo.CSVInputInfo;
                    OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
                    JavaPairRDD<LongWritable, Text> rddIn = (JavaPairRDD<LongWritable, Text>) sc.hadoopFile(fnameIn, iinfo.inputFormatClass, iinfo.inputKeyClass, iinfo.inputValueClass);
                    JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.csvToBinaryBlock(sc, rddIn, mc, null, false, separator, false, 0).mapToPair(new LongFrameToLongWritableFrameFunction());
                    rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
                    break;
                }
            case BIN2CSV:
                {
                    InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
                    JavaPairRDD<LongWritable, FrameBlock> rddIn = sc.hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class);
                    JavaPairRDD<Long, FrameBlock> rddIn2 = rddIn.mapToPair(new CopyFrameBlockPairFunction(false));
                    CSVFileFormatProperties fprop = new CSVFileFormatProperties();
                    JavaRDD<String> rddOut = FrameRDDConverterUtils.binaryBlockToCsv(rddIn2, mc, fprop, true);
                    rddOut.saveAsTextFile(fnameOut);
                    break;
                }
            case TXTCELL2BIN:
                {
                    InputInfo iinfo = InputInfo.TextCellInputInfo;
                    OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
                    JavaPairRDD<LongWritable, Text> rddIn = (JavaPairRDD<LongWritable, Text>) sc.hadoopFile(fnameIn, iinfo.inputFormatClass, iinfo.inputKeyClass, iinfo.inputValueClass);
                    JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.textCellToBinaryBlock(sc, rddIn, mc, lschema).mapToPair(new LongFrameToLongWritableFrameFunction());
                    rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
                    break;
                }
            case BIN2TXTCELL:
                {
                    InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
                    JavaPairRDD<LongWritable, FrameBlock> rddIn = sc.hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class);
                    JavaPairRDD<Long, FrameBlock> rddIn2 = rddIn.mapToPair(new CopyFrameBlockPairFunction(false));
                    JavaRDD<String> rddOut = FrameRDDConverterUtils.binaryBlockToTextCell(rddIn2, mc);
                    rddOut.saveAsTextFile(fnameOut);
                    break;
                }
            case MAT2BIN:
                {
                    InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
                    OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
                    JavaPairRDD<MatrixIndexes, MatrixBlock> rddIn = (JavaPairRDD<MatrixIndexes, MatrixBlock>) sc.hadoopFile(fnameIn, iinfo.inputFormatClass, iinfo.inputKeyClass, iinfo.inputValueClass);
                    JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.matrixBlockToBinaryBlock(sc, rddIn, mcMatrix);
                    rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
                    break;
                }
            case BIN2MAT:
                {
                    InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
                    OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
                    JavaPairRDD<Long, FrameBlock> rddIn = sc.hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class).mapToPair(new LongWritableFrameToLongFrameFunction());
                    JavaPairRDD<MatrixIndexes, MatrixBlock> rddOut = FrameRDDConverterUtils.binaryBlockToMatrixBlock(rddIn, mc, mcMatrix);
                    rddOut.saveAsHadoopFile(fnameOut, MatrixIndexes.class, MatrixBlock.class, oinfo.outputFormatClass);
                    break;
                }
            case DFRM2BIN:
                {
                    OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
                    // Create DataFrame
                    SparkSession sparkSession = SparkSession.builder().sparkContext(sc.sc()).getOrCreate();
                    StructType dfSchema = FrameRDDConverterUtils.convertFrameSchemaToDFSchema(lschema, false);
                    JavaRDD<Row> rowRDD = FrameRDDConverterUtils.csvToRowRDD(sc, fnameIn, separator, lschema);
                    Dataset<Row> df = sparkSession.createDataFrame(rowRDD, dfSchema);
                    JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.dataFrameToBinaryBlock(sc, df, mc, false).mapToPair(new LongFrameToLongWritableFrameFunction());
                    rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
                    break;
                }
            case BIN2DFRM:
                {
                    InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
                    OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
                    JavaPairRDD<Long, FrameBlock> rddIn = sc.hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class).mapToPair(new LongWritableFrameToLongFrameFunction());
                    SparkSession sparkSession = SparkSession.builder().sparkContext(sc.sc()).getOrCreate();
                    Dataset<Row> df = FrameRDDConverterUtils.binaryBlockToDataFrame(sparkSession, rddIn, mc, lschema);
                    // Convert back DataFrame to binary block for comparison using original binary to converted DF and back to binary
                    JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.dataFrameToBinaryBlock(sc, df, mc, true).mapToPair(new LongFrameToLongWritableFrameFunction());
                    rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
                    break;
                }
            default:
                throw new RuntimeException("Unsuported converter type: " + type.toString());
        }
    } finally {
        // always release the execution context, including on failure or
        // for an unsupported converter type
        sec.close();
    }
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) CSVFileFormatProperties(org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties) SparkSession(org.apache.spark.sql.SparkSession) StructType(org.apache.spark.sql.types.StructType) ValueType(org.apache.sysml.parser.Expression.ValueType) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) Dataset(org.apache.spark.sql.Dataset) Text(org.apache.hadoop.io.Text) JavaRDD(org.apache.spark.api.java.JavaRDD) OutputInfo(org.apache.sysml.runtime.matrix.data.OutputInfo) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) FrameBlock(org.apache.sysml.runtime.matrix.data.FrameBlock) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) LongWritableFrameToLongFrameFunction(org.apache.sysml.runtime.instructions.spark.utils.FrameRDDConverterUtils.LongWritableFrameToLongFrameFunction) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) LongWritable(org.apache.hadoop.io.LongWritable) LongFrameToLongWritableFrameFunction(org.apache.sysml.runtime.instructions.spark.utils.FrameRDDConverterUtils.LongFrameToLongWritableFrameFunction) CopyFrameBlockPairFunction(org.apache.sysml.runtime.instructions.spark.functions.CopyFrameBlockPairFunction)

Example 44 with InputInfo

use of org.apache.sysml.runtime.matrix.data.InputInfo in project incubator-systemml by apache.

the class MatrixObject method readBlobFromRDD.

/**
 * Obtains the matrix block behind the given RDD object.
 * <p>
 * For binary-block data that fails the Spark collect memory budget check, the
 * RDD is written to the HDFS file of this object (unless that file already
 * exists) and the block is then read from HDFS; {@code writeStatus} is set to
 * true only when such a write happened. In all other cases the block is
 * collected directly from the RDD.
 *
 * @param rdd rdd object to read from
 * @param writeStatus output flag, reset to false on entry
 * @return the matrix block, never null
 * @throws IOException if no block could be obtained or a DMLRuntimeException
 *         occurred (wrapped as cause)
 */
@Override
protected MatrixBlock readBlobFromRDD(RDDObject rdd, MutableBoolean writeStatus) throws IOException {
    // note: reading a matrix block from an RDD might trigger
    // lazy evaluation of pending transformations.

    // default return status: collect only, nothing written
    writeStatus.setValue(false);

    MetaDataFormat meta = (MetaDataFormat) _metaData;
    MatrixCharacteristics mc = meta.getMatrixCharacteristics();
    InputInfo format = meta.getInputInfo();

    RDDObject source = rdd;
    MatrixBlock result = null;
    try {
        // skip an unnecessary collect through an rdd checkpoint by reading its lineage child
        if (rdd.allowsShortCircuitCollect()) {
            source = (RDDObject) rdd.getLineageChilds().get(0);
        }

        // dimensions needed to assemble the matrix block
        int rlen = (int) mc.getRows();
        int clen = (int) mc.getCols();
        int brlen = (int) mc.getRowsPerBlock();
        int bclen = (int) mc.getColsPerBlock();
        long nnz = mc.getNonZerosBound();

        // the guarded collect applies to binary block only (not to binary cell)
        boolean exceedsCollectBudget = format == InputInfo.BinaryBlockInputInfo
            && !OptimizerUtils.checkSparkCollectMemoryBudget(mc, getPinnedSize() + getBroadcastSize(), true);

        if (exceedsCollectBudget) {
            // note: lazy, partition-at-a-time collect (toLocalIterator) was significantly slower
            // never overwrite an existing file
            if (!MapReduceTool.existsFileOnHDFS(_hdfsFileName)) {
                long newnnz = SparkExecutionContext.writeRDDtoHDFS(source, _hdfsFileName, meta.getOutputInfo());
                _metaData.getMatrixCharacteristics().setNonZeros(newnnz);
                // rdd is no longer pending (for export)
                rdd.setPending(false);
                // rdd is flagged as hdfs file (for restore); note that this flag is not
                // entirely accurate because the rdd reference still points to the input
                // rather than to an rdd of the hdfs file, but the resulting behavior is correct
                rdd.setHDFSFile(true);
                // signal: no cache-write on read
                writeStatus.setValue(true);
            }
            result = readBlobFromHDFS(_hdfsFileName);
        } else if (format == InputInfo.BinaryCellInputInfo) {
            // binary cell: collect without block dimensions
            result = SparkExecutionContext.toMatrixBlock(source, rlen, clen, nnz);
        } else {
            // all remaining formats: collect using the block dimensions
            result = SparkExecutionContext.toMatrixBlock(source, rlen, clen, brlen, bclen, nnz);
        }
    } catch (DMLRuntimeException ex) {
        throw new IOException(ex);
    }

    // sanity check: a block must have been produced
    if (result == null) {
        throw new IOException("Unable to load matrix from rdd.");
    }
    return result;
}
Also used : MetaDataFormat(org.apache.sysml.runtime.matrix.MetaDataFormat) MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject) IOException(java.io.IOException) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)

Example 45 with InputInfo

use of org.apache.sysml.runtime.matrix.data.InputInfo in project incubator-systemml by apache.

the class SparkExecutionContext method getRDDHandleForFrameObject.

/**
 * Returns an RDD for the given frame object, creating and registering a new
 * RDD handle on the frame object when no reusable one exists.
 * <p>
 * FIXME: currently this implementation assumes matrix representations but frame signature
 * in order to support the old transform implementation.
 * <p>
 * A requested binary-block input info is replaced by the binary-block frame
 * input info; any other input info is used as given.
 *
 * @param fo frame object
 * @param inputInfo input info
 * @return JavaPairRDD handle for a frame object
 * @throws DMLRuntimeException if the frame has to be read from file and the
 *         input info is binary cell or otherwise not handled
 */
@SuppressWarnings("unchecked")
public JavaPairRDD<?, ?> getRDDHandleForFrameObject(FrameObject fo, InputInfo inputInfo) {
    // NOTE: MB this logic should be integrated into FrameObject
    // However, for now we cannot assume that spark libraries are
    // always available and hence only store generic references in
    // matrix object while all the logic is in the SparkExecContext
    InputInfo frameInfo = (inputInfo != InputInfo.BinaryBlockInputInfo) ? inputInfo : InputInfo.BinaryBlockFrameInputInfo;
    JavaSparkContext sc = getSparkContext();

    // CASE 1: rdd operations already executed and cached
    // -> hand out the existing rdd (w/o input format change)
    if (fo.getRDDHandle() != null && (fo.getRDDHandle().isCheckpointRDD() || !fo.isCached(false))) {
        return fo.getRDDHandle().getRDD();
    }

    JavaPairRDD<?, ?> rdd;
    boolean fromFile;
    if (fo.isDirty() || fo.isCached(false)) {
        // CASE 2: dirty in-memory data or cached result of rdd operations
        // -> guarded parallelize (fallback to export and rdd from file if too large)
        MatrixCharacteristics mc = fo.getMatrixCharacteristics();
        boolean parallelizeInMemory = OptimizerUtils.checkSparkCollectMemoryBudget(mc, 0)
            && _parRDDs.reserve(OptimizerUtils.estimatePartitionedSizeExactSparsity(mc));
        if (parallelizeInMemory) {
            // default case: pin the frame, parallelize it, and unpin again
            FrameBlock fb = fo.acquireRead();
            rdd = toFrameJavaPairRDD(sc, fb);
            fo.release();
            _parRDDs.registerRDD(rdd.id(), OptimizerUtils.estimatePartitionedSizeExactSparsity(mc), true);
            fromFile = false;
        } else {
            // export only if necessary, then read back from file
            if (fo.isDirty()) {
                fo.exportData();
            }
            rdd = sc.hadoopFile(fo.getFileName(), frameInfo.inputFormatClass, frameInfo.inputKeyClass, frameInfo.inputValueClass);
            // copy is a workaround for a read bug
            rdd = ((JavaPairRDD<LongWritable, FrameBlock>) rdd).mapToPair(new CopyFrameBlockPairFunction());
            fromFile = true;
        }
    } else {
        // CASE 3: non-dirty (file exists on HDFS) -> read with the classes of the input info
        if (frameInfo == InputInfo.BinaryBlockFrameInputInfo) {
            rdd = sc.hadoopFile(fo.getFileName(), frameInfo.inputFormatClass, frameInfo.inputKeyClass, frameInfo.inputValueClass);
            // note: this copy is still required in Spark 1.4 because spark hands out whatever the inputformat
            // recordreader returns; the javadoc explicitly recommend to copy all key/value pairs
            rdd = ((JavaPairRDD<LongWritable, FrameBlock>) rdd).mapToPair(new CopyFrameBlockPairFunction());
        } else if (frameInfo == InputInfo.TextCellInputInfo || frameInfo == InputInfo.CSVInputInfo || frameInfo == InputInfo.MatrixMarketInputInfo) {
            rdd = sc.hadoopFile(fo.getFileName(), frameInfo.inputFormatClass, frameInfo.inputKeyClass, frameInfo.inputValueClass);
            // copy is a workaround for a read bug
            rdd = ((JavaPairRDD<LongWritable, Text>) rdd).mapToPair(new CopyTextInputFunction());
        } else if (frameInfo == InputInfo.BinaryCellInputInfo) {
            throw new DMLRuntimeException("Binarycell not supported for frames.");
        } else {
            throw new DMLRuntimeException("Incorrect input format in getRDDHandleForVariable");
        }
        fromFile = true;
    }

    // remember the rdd handle for future operations on this frame object
    RDDObject rddhandle = new RDDObject(rdd);
    rddhandle.setHDFSFile(fromFile);
    fo.setRDDHandle(rddhandle);
    return rdd;
}
Also used : CopyTextInputFunction(org.apache.sysml.runtime.instructions.spark.functions.CopyTextInputFunction) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) FrameBlock(org.apache.sysml.runtime.matrix.data.FrameBlock) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject) Text(org.apache.hadoop.io.Text) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) LongWritable(org.apache.hadoop.io.LongWritable) CopyFrameBlockPairFunction(org.apache.sysml.runtime.instructions.spark.functions.CopyFrameBlockPairFunction) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)

Aggregations

InputInfo (org.apache.sysml.runtime.matrix.data.InputInfo)74 DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)38 OutputInfo (org.apache.sysml.runtime.matrix.data.OutputInfo)30 MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics)26 MetaDataFormat (org.apache.sysml.runtime.matrix.MetaDataFormat)20 MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)20 IOException (java.io.IOException)17 JobConf (org.apache.hadoop.mapred.JobConf)13 RDDObject (org.apache.sysml.runtime.instructions.spark.data.RDDObject)13 MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject)12 JavaPairRDD (org.apache.spark.api.java.JavaPairRDD)11 RunningJob (org.apache.hadoop.mapred.RunningJob)10 Path (org.apache.hadoop.fs.Path)9 FrameBlock (org.apache.sysml.runtime.matrix.data.FrameBlock)9 MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes)9 DMLConfig (org.apache.sysml.conf.DMLConfig)8 ValueType (org.apache.sysml.parser.Expression.ValueType)8 SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext)7 CSVFileFormatProperties (org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties)7 Group (org.apache.hadoop.mapred.Counters.Group)6