Search in sources :

Example 1 with InputInfo

use of org.apache.sysml.runtime.matrix.data.InputInfo in project incubator-systemml by apache.

the class Connection method readDoubleMatrix.

// //////////////////////////////////////////
// Read matrices
// //////////////////////////////////////////
/**
 * Reads an input matrix in arbitrary format from HDFS into a dense double array.
 * The matrix dimensions, format and (if present) block sizes and number of
 * non-zeros are taken from the JSON metadata file whose name is derived from
 * the given filename.
 * NOTE: this call currently only supports default configurations for CSV.
 *
 * @param fname the filename of the input matrix
 * @return matrix as a two-dimensional double array
 * @throws IOException if reading the metadata or the matrix fails; an
 *         IOException raised while reading is propagated unchanged, any other
 *         exception is wrapped as the cause of a new IOException
 */
public double[][] readDoubleMatrix(String fname) throws IOException {
    try {
        // read json meta data
        String fnamemtd = DataExpression.getMTDFileName(fname);
        JSONObject jmtd = new DataExpression().readMetadataFile(fnamemtd, false);
        // parse json meta data (block sizes and nnz are optional, -1 if absent)
        long rows = jmtd.getLong(DataExpression.READROWPARAM);
        long cols = jmtd.getLong(DataExpression.READCOLPARAM);
        int brlen = jmtd.containsKey(DataExpression.ROWBLOCKCOUNTPARAM) ? jmtd.getInt(DataExpression.ROWBLOCKCOUNTPARAM) : -1;
        int bclen = jmtd.containsKey(DataExpression.COLUMNBLOCKCOUNTPARAM) ? jmtd.getInt(DataExpression.COLUMNBLOCKCOUNTPARAM) : -1;
        long nnz = jmtd.containsKey(DataExpression.READNUMNONZEROPARAM) ? jmtd.getLong(DataExpression.READNUMNONZEROPARAM) : -1;
        String format = jmtd.getString(DataExpression.FORMAT_TYPE);
        InputInfo iinfo = InputInfo.stringExternalToInputInfo(format);
        // read matrix file
        return readDoubleMatrix(fname, iinfo, rows, cols, brlen, bclen, nnz);
    } catch (Exception ex) {
        // avoid wrapping an IOException into another IOException, which would
        // only obscure the original message for the caller
        if (ex instanceof IOException) {
            throw (IOException) ex;
        }
        throw new IOException(ex);
    }
}
Also used : DataExpression(org.apache.sysml.parser.DataExpression) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) JSONObject(org.apache.wink.json4j.JSONObject) IOException(java.io.IOException) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) DMLException(org.apache.sysml.api.DMLException) LanguageException(org.apache.sysml.parser.LanguageException) IOException(java.io.IOException) ParseException(org.apache.sysml.parser.ParseException)

Example 2 with InputInfo

use of org.apache.sysml.runtime.matrix.data.InputInfo in project incubator-systemml by apache.

the class Connection method convertToMatrix.

/**
 * Converts an input stream of a string matrix into a matrix block.
 * The format must equal one of the csv, text, or matrix market format
 * constants of DataExpression; csv input is read with the CSV input info,
 * everything else with the text cell input info.
 *
 * @param input InputStream to a string matrix in csv, text or mm format
 * @param rows number of rows in the matrix
 * @param cols number of columns in the matrix
 * @param format input format of the given stream
 * @return matrix as a matrix block
 * @throws IOException if the format is not accepted or if reading fails
 *         with a DMLRuntimeException
 */
public MatrixBlock convertToMatrix(InputStream input, int rows, int cols, String format) throws IOException {
    // sanity check input format
    final boolean csvFormat = DataExpression.FORMAT_TYPE_VALUE_CSV.equals(format);
    final boolean cellFormat = DataExpression.FORMAT_TYPE_VALUE_TEXT.equals(format)
        || DataExpression.FORMAT_TYPE_VALUE_MATRIXMARKET.equals(format);
    if (!csvFormat && !cellFormat) {
        throw new IOException("Invalid input format (expected: csv, text or mm): " + format);
    }

    setLocalConfigs();

    try {
        // pick the reader matching the input format
        InputInfo readerInfo;
        if (csvFormat) {
            readerInfo = InputInfo.CSVInputInfo;
        } else {
            readerInfo = InputInfo.TextCellInputInfo;
        }
        MatrixReader matrixReader = MatrixReaderFactory.createMatrixReader(readerInfo);
        int blocksize = ConfigurationManager.getBlocksize();
        // rows*cols is computed in long arithmetic to avoid int overflow;
        // presumably serves as the (dense) estimate of non-zeros - TODO confirm
        long numCells = (long) rows * cols;
        return matrixReader.readMatrixFromInputStream(input, rows, cols, blocksize, blocksize, numCells);
    } catch (DMLRuntimeException rex) {
        throw new IOException(rex);
    }
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) IOException(java.io.IOException) MatrixReader(org.apache.sysml.runtime.io.MatrixReader) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)

Example 3 with InputInfo

use of org.apache.sysml.runtime.matrix.data.InputInfo in project incubator-systemml by apache.

the class Connection method readStringFrame.

// //////////////////////////////////////////
// Read frames
// //////////////////////////////////////////
/**
 * Reads an input frame in arbitrary format from HDFS into a dense string array.
 * The frame dimensions and format are taken from the JSON metadata file whose
 * name is derived from the given filename.
 * NOTE: this call currently only supports default configurations for CSV.
 *
 * @param fname the filename of the input frame
 * @return frame as a two-dimensional string array
 * @throws IOException if reading the metadata or the frame fails; an
 *         IOException raised while reading is propagated unchanged, any other
 *         exception is wrapped as the cause of a new IOException
 */
public String[][] readStringFrame(String fname) throws IOException {
    try {
        // read json meta data
        String fnamemtd = DataExpression.getMTDFileName(fname);
        JSONObject jmtd = new DataExpression().readMetadataFile(fnamemtd, false);
        // parse json meta data
        long rows = jmtd.getLong(DataExpression.READROWPARAM);
        long cols = jmtd.getLong(DataExpression.READCOLPARAM);
        String format = jmtd.getString(DataExpression.FORMAT_TYPE);
        InputInfo iinfo = InputInfo.stringExternalToInputInfo(format);
        // read frame file
        return readStringFrame(fname, iinfo, rows, cols);
    } catch (Exception ex) {
        // avoid wrapping an IOException into another IOException, which would
        // only obscure the original message for the caller
        if (ex instanceof IOException) {
            throw (IOException) ex;
        }
        throw new IOException(ex);
    }
}
Also used : DataExpression(org.apache.sysml.parser.DataExpression) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) JSONObject(org.apache.wink.json4j.JSONObject) IOException(java.io.IOException) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) DMLException(org.apache.sysml.api.DMLException) LanguageException(org.apache.sysml.parser.LanguageException) IOException(java.io.IOException) ParseException(org.apache.sysml.parser.ParseException)

Example 4 with InputInfo

use of org.apache.sysml.runtime.matrix.data.InputInfo in project incubator-systemml by apache.

the class ReblockSPInstruction method processInstruction.

/**
 * Executes the reblock instruction for the first input: updates the output
 * matrix characteristics, looks up the source format from the input's meta
 * data, and then either performs an in-memory reblock (if the recompiler
 * check passes) or delegates to the matrix/frame reblock of this class.
 *
 * @param ec execution context, cast to a SparkExecutionContext
 */
@Override
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    final String inName = input1.getName();

    // set the output characteristics
    CacheableData<?> inData = sec.getCacheableData(inName);
    MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(inName);
    final String outName = output.getName();
    MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(outName);
    mcOut.set(mcIn.getRows(), mcIn.getCols(), brlen, bclen, mcIn.getNonZeros());

    // get the source format from the meta data
    MetaDataFormat formatMeta = (MetaDataFormat) inData.getMetaData();
    if (formatMeta == null) {
        throw new DMLRuntimeException("Error: Metadata not found");
    }
    InputInfo iinfo = formatMeta.getInputInfo();

    final boolean matrixInput = input1.getDataType() == DataType.MATRIX;
    final boolean frameInput = input1.getDataType() == DataType.FRAME;

    // check for in-memory reblock (w/ lazy spark context, potential for latency reduction)
    if (Recompiler.checkCPReblock(sec, inName)) {
        if (matrixInput) {
            Recompiler.executeInMemoryMatrixReblock(sec, inName, outName);
        } else if (frameInput) {
            Recompiler.executeInMemoryFrameReblock(sec, inName, outName);
        }
        return;
    }

    // execute matrix/frame reblock
    if (matrixInput) {
        processMatrixReblockInstruction(sec, iinfo);
    } else if (frameInput) {
        processFrameReblockInstruction(sec, iinfo);
    }
}
Also used : MetaDataFormat(org.apache.sysml.runtime.matrix.MetaDataFormat) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)

Example 5 with InputInfo

use of org.apache.sysml.runtime.matrix.data.InputInfo in project incubator-systemml by apache.

the class RemoteDPParForSpark method getPartitionedInput.

/**
 * Creates the partitioned input RDD for the given matrix variable.
 * Three cases are distinguished: (1) a dataset input, whose rows are paired
 * with row ids and converted via DataFrameToRowBinaryBlockFunction, (2) a
 * binary block input rdd without grouping, and (3) a binary block input rdd
 * with grouping; cases 2 and 3 both apply a DataPartitionerRemoteSparkMapper.
 *
 * @param sec spark execution context used to look up the matrix object and rdd
 * @param matrixvar name of the matrix variable
 * @param oi output info passed to the data partitioner
 * @param dpf partition format passed to the data partitioner
 * @return pair rdd of partitioned input
 */
@SuppressWarnings("unchecked")
private static JavaPairRDD<Long, Writable> getPartitionedInput(SparkExecutionContext sec, String matrixvar, OutputInfo oi, PartitionFormat dpf) {
    MatrixObject mo = sec.getMatrixObject(matrixvar);
    MatrixCharacteristics mc = mo.getMatrixCharacteristics();

    // NOTE: there will always be a checkpoint rdd on top of the input rdd and the dataset
    if (hasInputDataSet(dpf, mo)) {
        DatasetObject dsObj = (DatasetObject) mo.getRDDHandle().getLineageChilds().get(0).getLineageChilds().get(0);
        Dataset<Row> df = dsObj.getDataset();
        // construct or reuse row ids
        JavaPairRDD<Row, Long> rowsWithIds;
        if (dsObj.containsID()) {
            rowsWithIds = df.javaRDD().mapToPair(new DataFrameExtractIDFunction(df.schema().fieldIndex(RDDConverterUtils.DF_ID_COLUMN)));
        } else {
            // zip row index
            rowsWithIds = df.javaRDD().zipWithIndex();
        }
        // convert row to row in matrix block format
        return rowsWithIds.mapToPair(new DataFrameToRowBinaryBlockFunction(mc.getCols(), dsObj.isVectorBased(), dsObj.containsID()));
    }

    // binary block input rdd, either without grouping or (default) with grouping
    final boolean grouping = requiresGrouping(dpf, mo);
    JavaPairRDD<MatrixIndexes, MatrixBlock> blocks = sec.getBinaryBlockRDDHandleForVariable(matrixvar);
    // with grouping: avoid unnecessary caching if input is checkpoint and not cached yet
    // to reduce memory pressure for shuffle and subsequent
    if (grouping && mo.getRDDHandle().isCheckpointRDD() && !sec.isRDDCached(blocks.id())) {
        blocks = (JavaPairRDD<MatrixIndexes, MatrixBlock>) ((RDDObject) mo.getRDDHandle().getLineageChilds().get(0)).getRDD();
    }

    // data partitioning of input rdd
    DataPartitionerRemoteSparkMapper partitioner = new DataPartitionerRemoteSparkMapper(mc, InputInfo.BinaryBlockInputInfo, oi, dpf._dpf, dpf._N);
    return blocks.flatMapToPair(partitioner);
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) DatasetObject(org.apache.sysml.runtime.instructions.spark.data.DatasetObject) DataFrameExtractIDFunction(org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtils.DataFrameExtractIDFunction) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject) Row(org.apache.spark.sql.Row)

Aggregations

InputInfo (org.apache.sysml.runtime.matrix.data.InputInfo)74 DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)38 OutputInfo (org.apache.sysml.runtime.matrix.data.OutputInfo)30 MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics)26 MetaDataFormat (org.apache.sysml.runtime.matrix.MetaDataFormat)20 MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)20 IOException (java.io.IOException)17 JobConf (org.apache.hadoop.mapred.JobConf)13 RDDObject (org.apache.sysml.runtime.instructions.spark.data.RDDObject)13 MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject)12 JavaPairRDD (org.apache.spark.api.java.JavaPairRDD)11 RunningJob (org.apache.hadoop.mapred.RunningJob)10 Path (org.apache.hadoop.fs.Path)9 FrameBlock (org.apache.sysml.runtime.matrix.data.FrameBlock)9 MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes)9 DMLConfig (org.apache.sysml.conf.DMLConfig)8 ValueType (org.apache.sysml.parser.Expression.ValueType)8 SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext)7 CSVFileFormatProperties (org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties)7 Group (org.apache.hadoop.mapred.Counters.Group)6