Search in sources :

Example 86 with FrameBlock

use of org.apache.sysml.runtime.matrix.data.FrameBlock in project incubator-systemml by apache.

the class TfMetaUtils method convertToTransformMetaDataFrame.

/**
 * Converts transform meta data into an in-memory FrameBlock object.
 * <p>
 * The frame has a pure string schema with one column per entry of
 * {@code colnames}. Recode map entries and materialized bin boundaries are
 * written top-down into the rows of their column; the number of distinct
 * entries and the missing-value replacement are stored in the column metadata.
 *
 * @param rows number of rows to allocate in the frame
 * @param colnames column names
 * @param rcIDs recode IDs (1-based column IDs)
 * @param binIDs binning IDs (1-based column IDs)
 * @param meta serialized recode maps / binning specifications, keyed by column name
 * @param mvmeta serialized impute meta data, keyed by column name
 * @return frame block
 * @throws IOException if a recode or binning map is missing for a requested column,
 *         if impute meta data refers to a column name not contained in
 *         {@code colnames}, or if reading a recode map fails
 */
private static FrameBlock convertToTransformMetaDataFrame(int rows, String[] colnames, List<Integer> rcIDs, List<Integer> binIDs, HashMap<String, String> meta, HashMap<String, String> mvmeta) throws IOException {
    // create frame block w/ pure string schema
    ValueType[] schema = UtilFunctions.nCopies(colnames.length, ValueType.STRING);
    FrameBlock ret = new FrameBlock(schema, colnames);
    ret.ensureAllocatedColumns(rows);
    // encode recode maps (recoding/dummycoding) into frame
    for (Integer colID : rcIDs) {
        String name = colnames[colID - 1];
        String map = meta.get(name);
        if (map == null)
            throw new IOException("Recode map for column '" + name + "' (id=" + colID + ") not existing.");
        int rpos = 0;
        InputStream is = new ByteArrayInputStream(map.getBytes("UTF-8"));
        // decode with the same charset used for encoding; the platform default
        // charset would corrupt non-ASCII tokens on non-UTF-8 systems
        try (BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"))) {
            Pair<String, String> pair = new Pair<>();
            String line;
            while ((line = br.readLine()) != null) {
                DecoderRecode.parseRecodeMapEntry(line, pair);
                String tmp = pair.getKey() + Lop.DATATYPE_PREFIX + pair.getValue();
                ret.set(rpos++, colID - 1, tmp);
            }
        }
        ret.getColumnMetadata(colID - 1).setNumDistinct((long) rpos);
    }
    // encode bin maps (binning) into frame
    for (Integer colID : binIDs) {
        String name = colnames[colID - 1];
        String map = meta.get(name);
        if (map == null)
            throw new IOException("Binning map for column '" + name + "' (id=" + colID + ") not existing.");
        String[] fields = map.split(TfUtils.TXMTD_SEP);
        double min = UtilFunctions.parseToDouble(fields[1]);
        double binwidth = UtilFunctions.parseToDouble(fields[3]);
        int nbins = UtilFunctions.parseToInt(fields[4]);
        // materialize bins to support equi-width/equi-height
        for (int i = 0; i < nbins; i++) {
            String lbound = String.valueOf(min + i * binwidth);
            String ubound = String.valueOf(min + (i + 1) * binwidth);
            ret.set(i, colID - 1, lbound + Lop.DATATYPE_PREFIX + ubound);
        }
        ret.getColumnMetadata(colID - 1).setNumDistinct((long) nbins);
    }
    // encode impute meta data into frame
    for (Entry<String, String> e : mvmeta.entrySet()) {
        int pos = ArrayUtils.indexOf(colnames, e.getKey());
        // an unknown column name would otherwise surface as an index error on position -1
        if (pos < 0)
            throw new IOException("Impute meta data for column '" + e.getKey() + "' refers to a non-existing column.");
        String mvVal = e.getValue().split(TfUtils.TXMTD_SEP)[1];
        ret.getColumnMetadata(pos).setMvValue(mvVal);
    }
    return ret;
}
Also used : InputStreamReader(java.io.InputStreamReader) ValueType(org.apache.sysml.parser.Expression.ValueType) ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) IOException(java.io.IOException) FrameBlock(org.apache.sysml.runtime.matrix.data.FrameBlock) ByteArrayInputStream(java.io.ByteArrayInputStream) BufferedReader(java.io.BufferedReader) Pair(org.apache.sysml.runtime.matrix.data.Pair)

Example 87 with FrameBlock

use of org.apache.sysml.runtime.matrix.data.FrameBlock in project incubator-systemml by apache.

the class DataConverter method convertToFrameBlock.

/**
 * Converts a matrix block into a frame block with the given schema.
 * <p>
 * Sparse inputs are appended row by row; cells without a stored value in a
 * sparse row are appended as {@code null}. Dense inputs with an all-double
 * schema are appended column-wise, all other dense inputs are appended row
 * by row with per-cell conversion.
 *
 * @param mb matrix block to convert
 * @param schema value types of the frame columns
 * @return frame block holding the converted values
 */
public static FrameBlock convertToFrameBlock(MatrixBlock mb, ValueType[] schema) {
    final FrameBlock out = new FrameBlock(schema);
    final Object[] rowBuffer = new Object[mb.getNumColumns()];

    if (!mb.isInSparseFormat()) {
        // DENSE
        final int numDoubleCols = UtilFunctions.frequency(schema, ValueType.DOUBLE);
        if (schema.length == 1 && numDoubleCols == 1 && mb.isAllocated()) {
            // single double column: the dense value array is handed over as
            // the one and only frame column (row-major matrix and
            // column-major frame layouts coincide in this case)
            out.reset();
            out.appendColumns(new double[][] { mb.getDenseBlockValues() });
        } else if (numDoubleCols == schema.length) {
            // all-double schema: transpose the row-major values into
            // per-column arrays without creating cell objects
            final int numRows = mb.getNumRows();
            final int numCols = mb.getNumColumns();
            final double[] src = mb.getDenseBlockValues();
            final double[][] cols = new double[numCols][numRows];
            // tile size chosen so that tiles of src/cols fit the L1 cache
            final int tile = 16;
            if (!mb.isEmptyBlock(false)) {
                for (int rowStart = 0; rowStart < numRows; rowStart += tile) {
                    for (int colStart = 0; colStart < numCols; colStart += tile) {
                        final int rowEnd = Math.min(rowStart + tile, numRows);
                        final int colEnd = Math.min(colStart + tile, numCols);
                        for (int r = rowStart; r < rowEnd; r++) {
                            final int rowOffset = r * numCols;
                            for (int c = colStart; c < colEnd; c++) {
                                cols[c][r] = src[rowOffset + c];
                            }
                        }
                    }
                }
            }
            out.reset();
            out.appendColumns(cols);
        } else {
            // general case: convert cell by cell and append full rows
            final int numRows = mb.getNumRows();
            final int numCols = mb.getNumColumns();
            for (int r = 0; r < numRows; r++) {
                for (int c = 0; c < numCols; c++) {
                    rowBuffer[c] = UtilFunctions.doubleToObject(schema[c], mb.quickGetValue(r, c));
                }
                out.appendRow(rowBuffer);
            }
        }
        return out;
    }

    // SPARSE
    final SparseBlock sparse = mb.getSparseBlock();
    final int numRows = mb.getNumRows();
    for (int r = 0; r < numRows; r++) {
        // clear values left over from the previous row
        Arrays.fill(rowBuffer, null);
        if (sparse == null || sparse.isEmpty(r)) {
            out.appendRow(rowBuffer);
            continue;
        }
        final int start = sparse.pos(r);
        final int end = start + sparse.size(r);
        final int[] colIdx = sparse.indexes(r);
        final double[] vals = sparse.values(r);
        for (int k = start; k < end; k++) {
            final int c = colIdx[k];
            rowBuffer[c] = UtilFunctions.doubleToObject(schema[c], vals[k]);
        }
        out.appendRow(rowBuffer);
    }
    return out;
}
Also used : FrameBlock(org.apache.sysml.runtime.matrix.data.FrameBlock) BooleanObject(org.apache.sysml.runtime.instructions.cp.BooleanObject) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) SparseBlock(org.apache.sysml.runtime.matrix.data.SparseBlock)

Example 88 with FrameBlock

use of org.apache.sysml.runtime.matrix.data.FrameBlock in project incubator-systemml by apache.

the class MLContextConversionUtil method dataFrameToFrameObject.

/**
 * Convert a Spark {@code DataFrame} to a {@code FrameObject}.
 * <p>
 * When no metadata is supplied, a fresh {@code FrameMetadata} is created.
 * The schema obtained during the conversion and the matrix characteristics
 * are written back into the metadata before the frame object is built.
 *
 * @param variableName
 *            name of the variable associated with the frame
 * @param dataFrame
 *            the Spark {@code DataFrame}
 * @param frameMetadata
 *            the frame metadata, may be {@code null}
 * @return the {@code DataFrame} converted to a {@code FrameObject}
 * @throws MLContextException
 *             if the conversion raises a {@code DMLRuntimeException}
 */
public static FrameObject dataFrameToFrameObject(String variableName, Dataset<Row> dataFrame, FrameMetadata frameMetadata) {
    try {
        // fall back to empty metadata so the steps below always have an object to fill in
        FrameMetadata meta = (frameMetadata != null) ? frameMetadata : new FrameMetadata();
        determineFrameFormatIfNeeded(dataFrame, meta);
        final boolean hasIDColumn = isDataFrameWithIDColumn(meta);

        MatrixCharacteristics dims = meta.asMatrixCharacteristics();
        if (dims == null) {
            dims = new MatrixCharacteristics();
        }

        // convert the data frame; column names and schema are returned via the pair
        // TODO extend frame schema by column names (right now dropped)
        Pair<String[], ValueType[]> namesAndSchema = new Pair<>();
        JavaPairRDD<Long, FrameBlock> blocks = FrameRDDConverterUtils.dataFrameToBinaryBlock(jsc(), dataFrame, dims, hasIDColumn, namesAndSchema);

        FrameSchema schema = new FrameSchema(Arrays.asList(namesAndSchema.getValue()));
        meta.setFrameSchema(schema);
        // required due to meta data copy
        meta.setMatrixCharacteristics(dims);

        return MLContextConversionUtil.binaryBlocksToFrameObject(variableName, blocks, meta);
    } catch (DMLRuntimeException e) {
        throw new MLContextException("Exception converting DataFrame to FrameObject", e);
    }
}
Also used : ValueType(org.apache.sysml.parser.Expression.ValueType) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) FrameBlock(org.apache.sysml.runtime.matrix.data.FrameBlock) ConvertStringToLongTextPair(org.apache.sysml.runtime.instructions.spark.functions.ConvertStringToLongTextPair) Pair(org.apache.sysml.runtime.matrix.data.Pair)

Example 89 with FrameBlock

use of org.apache.sysml.runtime.matrix.data.FrameBlock in project incubator-systemml by apache.

the class MLContextConversionUtil method javaRDDStringIJVToFrameObject.

/**
 * Convert a {@code JavaRDD<String>} in IJV format to a {@code FrameObject}.
 * Note that metadata is required for IJV format: the frame schema of the
 * resulting {@code FrameObject} is taken from it.
 *
 * @param variableName
 *            name of the variable associated with the frame
 * @param javaRDD
 *            the Java RDD of strings
 * @param frameMetadata
 *            frame metadata, must not be {@code null}
 * @return the {@code JavaRDD<String>} converted to a {@code FrameObject}
 * @throws MLContextException
 *             if {@code frameMetadata} is {@code null} or if the conversion
 *             to binary blocks raises a {@code DMLRuntimeException}
 */
public static FrameObject javaRDDStringIJVToFrameObject(String variableName, JavaRDD<String> javaRDD, FrameMetadata frameMetadata) {
    // fail fast with a clear message; the schema lookup below would
    // otherwise end in a bare NullPointerException
    if (frameMetadata == null)
        throw new MLContextException("Frame metadata is required to convert a JavaRDD<String> in IJV format to a FrameObject");
    JavaPairRDD<LongWritable, Text> javaPairRDD = javaRDD.mapToPair(new ConvertStringToLongTextPair());
    MatrixCharacteristics mc = frameMetadata.asMatrixCharacteristics();
    JavaPairRDD<LongWritable, Text> javaPairRDDText = javaPairRDD.mapToPair(new CopyTextInputFunction());
    FrameObject frameObject = new FrameObject(OptimizerUtils.getUniqueTempFileName(), new MatrixFormatMetaData(mc, OutputInfo.BinaryBlockOutputInfo, InputInfo.BinaryBlockInputInfo), frameMetadata.getFrameSchema().getSchema().toArray(new ValueType[0]));
    JavaPairRDD<Long, FrameBlock> rdd;
    try {
        // text cells are converted with an all-string schema
        // NOTE(review): the schema from frameMetadata is not used for this conversion - confirm this is intended
        ValueType[] lschema = UtilFunctions.nCopies((int) mc.getCols(), ValueType.STRING);
        rdd = FrameRDDConverterUtils.textCellToBinaryBlock(jsc(), javaPairRDDText, mc, lschema);
    } catch (DMLRuntimeException e) {
        // propagate with cause instead of printing the stack trace and returning null
        throw new MLContextException("Exception converting JavaRDD<String> in IJV format to FrameObject", e);
    }
    frameObject.setRDDHandle(new RDDObject(rdd, variableName));
    return frameObject;
}
Also used : ValueType(org.apache.sysml.parser.Expression.ValueType) Text(org.apache.hadoop.io.Text) FrameObject(org.apache.sysml.runtime.controlprogram.caching.FrameObject) MatrixFormatMetaData(org.apache.sysml.runtime.matrix.MatrixFormatMetaData) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) CopyTextInputFunction(org.apache.sysml.runtime.instructions.spark.functions.CopyTextInputFunction) ConvertStringToLongTextPair(org.apache.sysml.runtime.instructions.spark.functions.ConvertStringToLongTextPair) FrameBlock(org.apache.sysml.runtime.matrix.data.FrameBlock) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject) LongWritable(org.apache.hadoop.io.LongWritable)

Example 90 with FrameBlock

use of org.apache.sysml.runtime.matrix.data.FrameBlock in project incubator-systemml by apache.

the class TransformReadMetaTest method runTransformReadMetaTest.

/**
 * Runs one transform read-meta test: generates a two-column input matrix,
 * writes it as CSV, executes the DML script for the selected test and
 * compares, cell by cell, the meta data frame read via {@code TfMetaUtils}
 * from output M1 with the frame read from output M.
 * <p>
 * The runtime platform and the local Spark configuration flag are restored
 * when the test finishes, whether it passes or fails.
 *
 * @param rt runtime platform to execute the test on
 * @param ofmt output format name, used to pick the frame reader for output M
 * @param delim delimiter for the CSV input; {@code ","} selects the first
 *        test name, any other value the second
 * @throws IOException wraps any exception raised while running the test
 * @throws DMLRuntimeException declared by the signature; exceptions raised
 *         inside the test body are rethrown wrapped in an {@code IOException}
 */
private void runTransformReadMetaTest(RUNTIME_PLATFORM rt, String ofmt, String delim) throws IOException, DMLRuntimeException {
    // remember global settings so they can be restored afterwards
    final RUNTIME_PLATFORM savedPlatform = rtplatform;
    final boolean savedSparkConfig = DMLScript.USE_LOCAL_SPARK_CONFIG;
    rtplatform = rt;
    final boolean onSpark = (rtplatform == RUNTIME_PLATFORM.SPARK) || (rtplatform == RUNTIME_PLATFORM.HYBRID_SPARK);
    if (onSpark) {
        DMLScript.USE_LOCAL_SPARK_CONFIG = true;
    }

    try {
        final String testname;
        if (delim.equals(",")) {
            testname = TEST_NAME1;
        } else {
            testname = TEST_NAME2;
        }
        getAndLoadTestConfiguration(testname);

        // generate input data
        double[][] X = DataConverter.convertToDoubleMatrix(
            MatrixBlock.seqOperations(0.5, rows / 2, 0.5)
                .appendOperations(MatrixBlock.seqOperations(0.5, rows / 2, 0.5), new MatrixBlock()));
        MatrixBlock inputBlock = DataConverter.convertToMatrixBlock(X);
        CSVFileFormatProperties csvProps = new CSVFileFormatProperties(false, delim, false);
        MatrixWriter csvWriter = MatrixWriterFactory.createMatrixWriter(OutputInfo.CSVOutputInfo, 1, csvProps);
        csvWriter.writeMatrixToHDFS(inputBlock, input("X"), rows, 2, -1, -1, -1);

        // read specs transform X and Y
        String specX = MapReduceTool.readStringFromHDFSFile(SCRIPT_DIR + TEST_DIR + SPEC_X);
        fullDMLScriptName = SCRIPT_DIR + TEST_DIR + testname + ".dml";
        programArgs = new String[] { "-args", input("X"), specX, output("M1"), output("M"), ofmt, delim };

        // run test
        runTest(true, false, null, -1);

        // compare meta data frames
        FrameReader frameReader = FrameReaderFactory.createFrameReader(InputInfo.stringExternalToInputInfo(ofmt));
        FrameBlock expectedMeta = TfMetaUtils.readTransformMetaDataFromFile(specX, output("M1"), delim);
        FrameBlock actualMeta = frameReader.readFrameFromHDFS(output("M"), rows, 2);
        for (int r = 0; r < rows; r++) {
            for (int c = 0; c < 2; c++) {
                Object actual = actualMeta.get(r, c);
                Object expected = expectedMeta.get(r, c);
                String message = "Wrong result: " + actual + ".";
                Assert.assertTrue(message, UtilFunctions.compareTo(ValueType.STRING, expected, actual) == 0);
            }
        }
    } catch (Exception ex) {
        throw new IOException(ex);
    } finally {
        rtplatform = savedPlatform;
        DMLScript.USE_LOCAL_SPARK_CONFIG = savedSparkConfig;
    }
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) CSVFileFormatProperties(org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties) IOException(java.io.IOException) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) IOException(java.io.IOException) RUNTIME_PLATFORM(org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) FrameBlock(org.apache.sysml.runtime.matrix.data.FrameBlock) FrameReader(org.apache.sysml.runtime.io.FrameReader) MatrixWriter(org.apache.sysml.runtime.io.MatrixWriter)

Aggregations

FrameBlock (org.apache.sysml.runtime.matrix.data.FrameBlock)90 DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)28 MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics)26 ValueType (org.apache.sysml.parser.Expression.ValueType)23 MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)23 FrameReader (org.apache.sysml.runtime.io.FrameReader)18 IOException (java.io.IOException)16 RUNTIME_PLATFORM (org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM)16 FrameObject (org.apache.sysml.runtime.controlprogram.caching.FrameObject)15 LongWritable (org.apache.hadoop.io.LongWritable)12 JavaPairRDD (org.apache.spark.api.java.JavaPairRDD)11 CSVFileFormatProperties (org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties)11 FrameWriter (org.apache.sysml.runtime.io.FrameWriter)9 TestConfiguration (org.apache.sysml.test.integration.TestConfiguration)8 Text (org.apache.hadoop.io.Text)7 SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext)7 RDDObject (org.apache.sysml.runtime.instructions.spark.data.RDDObject)7 ConvertStringToLongTextPair (org.apache.sysml.runtime.instructions.spark.functions.ConvertStringToLongTextPair)6 CopyTextInputFunction (org.apache.sysml.runtime.instructions.spark.functions.CopyTextInputFunction)5 MetaDataFormat (org.apache.sysml.runtime.matrix.MetaDataFormat)5