Search in sources :

Example 1 with EncoderMVImpute

use of org.apache.sysml.runtime.transform.encode.EncoderMVImpute in project incubator-systemml by apache.

the class MultiReturnParameterizedBuiltinSPInstruction method processInstruction.

@Override
@SuppressWarnings("unchecked")
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    try {
        // get input RDD and meta data
        FrameObject fo = sec.getFrameObject(input1.getName());
        FrameObject fometa = sec.getFrameObject(_outputs.get(1).getName());
        JavaPairRDD<Long, FrameBlock> in = (JavaPairRDD<Long, FrameBlock>) sec.getRDDHandleForFrameObject(fo, InputInfo.BinaryBlockInputInfo);
        String spec = ec.getScalarInput(input2.getName(), input2.getValueType(), input2.isLiteral()).getStringValue();
        MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(input1.getName());
        MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
        String[] colnames = !TfMetaUtils.isIDSpec(spec) ? in.lookup(1L).get(0).getColumnNames() : null;
        // step 1: build transform meta data
        Encoder encoderBuild = EncoderFactory.createEncoder(spec, colnames, fo.getSchema(), (int) fo.getNumColumns(), null);
        MaxLongAccumulator accMax = registerMaxLongAccumulator(sec.getSparkContext());
        JavaRDD<String> rcMaps = in.mapPartitionsToPair(new TransformEncodeBuildFunction(encoderBuild)).distinct().groupByKey().flatMap(new TransformEncodeGroupFunction(accMax));
        if (containsMVImputeEncoder(encoderBuild)) {
            EncoderMVImpute mva = getMVImputeEncoder(encoderBuild);
            rcMaps = rcMaps.union(in.mapPartitionsToPair(new TransformEncodeBuild2Function(mva)).groupByKey().flatMap(new TransformEncodeGroup2Function(mva)));
        }
        // trigger eval
        rcMaps.saveAsTextFile(fometa.getFileName());
        // consolidate meta data frame (reuse multi-threaded reader, special handling missing values)
        FrameReader reader = FrameReaderFactory.createFrameReader(InputInfo.TextCellInputInfo);
        FrameBlock meta = reader.readFrameFromHDFS(fometa.getFileName(), accMax.value(), fo.getNumColumns());
        // recompute num distinct items per column
        meta.recomputeColumnCardinality();
        meta.setColumnNames((colnames != null) ? colnames : meta.getColumnNames());
        // step 2: transform apply (similar to spark transformapply)
        // compute omit offset map for block shifts
        TfOffsetMap omap = null;
        if (TfMetaUtils.containsOmitSpec(spec, colnames)) {
            omap = new TfOffsetMap(SparkUtils.toIndexedLong(in.mapToPair(new RDDTransformApplyOffsetFunction(spec, colnames)).collect()));
        }
        // create encoder broadcast (avoiding replication per task)
        Encoder encoder = EncoderFactory.createEncoder(spec, colnames, fo.getSchema(), (int) fo.getNumColumns(), meta);
        mcOut.setDimension(mcIn.getRows() - ((omap != null) ? omap.getNumRmRows() : 0), encoder.getNumCols());
        Broadcast<Encoder> bmeta = sec.getSparkContext().broadcast(encoder);
        Broadcast<TfOffsetMap> bomap = (omap != null) ? sec.getSparkContext().broadcast(omap) : null;
        // execute transform apply
        JavaPairRDD<Long, FrameBlock> tmp = in.mapToPair(new RDDTransformApplyFunction(bmeta, bomap));
        JavaPairRDD<MatrixIndexes, MatrixBlock> out = FrameRDDConverterUtils.binaryBlockToMatrixBlock(tmp, mcOut, mcOut);
        // set output and maintain lineage/output characteristics
        sec.setRDDHandleForVariable(_outputs.get(0).getName(), out);
        sec.addLineageRDD(_outputs.get(0).getName(), input1.getName());
        sec.setFrameOutput(_outputs.get(1).getName(), meta);
    } catch (IOException ex) {
        throw new RuntimeException(ex);
    }
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) FrameBlock(org.apache.sysml.runtime.matrix.data.FrameBlock) Encoder(org.apache.sysml.runtime.transform.encode.Encoder) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext) EncoderMVImpute(org.apache.sysml.runtime.transform.encode.EncoderMVImpute) RDDTransformApplyOffsetFunction(org.apache.sysml.runtime.instructions.spark.ParameterizedBuiltinSPInstruction.RDDTransformApplyOffsetFunction) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) FrameObject(org.apache.sysml.runtime.controlprogram.caching.FrameObject) IOException(java.io.IOException) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) RDDTransformApplyFunction(org.apache.sysml.runtime.instructions.spark.ParameterizedBuiltinSPInstruction.RDDTransformApplyFunction) TfOffsetMap(org.apache.sysml.runtime.transform.meta.TfOffsetMap) FrameReader(org.apache.sysml.runtime.io.FrameReader)

Example 2 with EncoderMVImpute

use of org.apache.sysml.runtime.transform.encode.EncoderMVImpute in project systemml by apache.

the class MultiReturnParameterizedBuiltinSPInstruction method processInstruction.

@Override
@SuppressWarnings("unchecked")
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    try {
        // get input RDD and meta data
        FrameObject fo = sec.getFrameObject(input1.getName());
        FrameObject fometa = sec.getFrameObject(_outputs.get(1).getName());
        JavaPairRDD<Long, FrameBlock> in = (JavaPairRDD<Long, FrameBlock>) sec.getRDDHandleForFrameObject(fo, InputInfo.BinaryBlockInputInfo);
        String spec = ec.getScalarInput(input2.getName(), input2.getValueType(), input2.isLiteral()).getStringValue();
        MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(input1.getName());
        MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
        String[] colnames = !TfMetaUtils.isIDSpec(spec) ? in.lookup(1L).get(0).getColumnNames() : null;
        // step 1: build transform meta data
        Encoder encoderBuild = EncoderFactory.createEncoder(spec, colnames, fo.getSchema(), (int) fo.getNumColumns(), null);
        MaxLongAccumulator accMax = registerMaxLongAccumulator(sec.getSparkContext());
        JavaRDD<String> rcMaps = in.mapPartitionsToPair(new TransformEncodeBuildFunction(encoderBuild)).distinct().groupByKey().flatMap(new TransformEncodeGroupFunction(accMax));
        if (containsMVImputeEncoder(encoderBuild)) {
            EncoderMVImpute mva = getMVImputeEncoder(encoderBuild);
            rcMaps = rcMaps.union(in.mapPartitionsToPair(new TransformEncodeBuild2Function(mva)).groupByKey().flatMap(new TransformEncodeGroup2Function(mva)));
        }
        // trigger eval
        rcMaps.saveAsTextFile(fometa.getFileName());
        // consolidate meta data frame (reuse multi-threaded reader, special handling missing values)
        FrameReader reader = FrameReaderFactory.createFrameReader(InputInfo.TextCellInputInfo);
        FrameBlock meta = reader.readFrameFromHDFS(fometa.getFileName(), accMax.value(), fo.getNumColumns());
        // recompute num distinct items per column
        meta.recomputeColumnCardinality();
        meta.setColumnNames((colnames != null) ? colnames : meta.getColumnNames());
        // step 2: transform apply (similar to spark transformapply)
        // compute omit offset map for block shifts
        TfOffsetMap omap = null;
        if (TfMetaUtils.containsOmitSpec(spec, colnames)) {
            omap = new TfOffsetMap(SparkUtils.toIndexedLong(in.mapToPair(new RDDTransformApplyOffsetFunction(spec, colnames)).collect()));
        }
        // create encoder broadcast (avoiding replication per task)
        Encoder encoder = EncoderFactory.createEncoder(spec, colnames, fo.getSchema(), (int) fo.getNumColumns(), meta);
        mcOut.setDimension(mcIn.getRows() - ((omap != null) ? omap.getNumRmRows() : 0), encoder.getNumCols());
        Broadcast<Encoder> bmeta = sec.getSparkContext().broadcast(encoder);
        Broadcast<TfOffsetMap> bomap = (omap != null) ? sec.getSparkContext().broadcast(omap) : null;
        // execute transform apply
        JavaPairRDD<Long, FrameBlock> tmp = in.mapToPair(new RDDTransformApplyFunction(bmeta, bomap));
        JavaPairRDD<MatrixIndexes, MatrixBlock> out = FrameRDDConverterUtils.binaryBlockToMatrixBlock(tmp, mcOut, mcOut);
        // set output and maintain lineage/output characteristics
        sec.setRDDHandleForVariable(_outputs.get(0).getName(), out);
        sec.addLineageRDD(_outputs.get(0).getName(), input1.getName());
        sec.setFrameOutput(_outputs.get(1).getName(), meta);
    } catch (IOException ex) {
        throw new RuntimeException(ex);
    }
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) FrameBlock(org.apache.sysml.runtime.matrix.data.FrameBlock) Encoder(org.apache.sysml.runtime.transform.encode.Encoder) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext) EncoderMVImpute(org.apache.sysml.runtime.transform.encode.EncoderMVImpute) RDDTransformApplyOffsetFunction(org.apache.sysml.runtime.instructions.spark.ParameterizedBuiltinSPInstruction.RDDTransformApplyOffsetFunction) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) FrameObject(org.apache.sysml.runtime.controlprogram.caching.FrameObject) IOException(java.io.IOException) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) RDDTransformApplyFunction(org.apache.sysml.runtime.instructions.spark.ParameterizedBuiltinSPInstruction.RDDTransformApplyFunction) TfOffsetMap(org.apache.sysml.runtime.transform.meta.TfOffsetMap) FrameReader(org.apache.sysml.runtime.io.FrameReader)

Aggregations

IOException (java.io.IOException)2 JavaPairRDD (org.apache.spark.api.java.JavaPairRDD)2 DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)2 FrameObject (org.apache.sysml.runtime.controlprogram.caching.FrameObject)2 SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext)2 RDDTransformApplyFunction (org.apache.sysml.runtime.instructions.spark.ParameterizedBuiltinSPInstruction.RDDTransformApplyFunction)2 RDDTransformApplyOffsetFunction (org.apache.sysml.runtime.instructions.spark.ParameterizedBuiltinSPInstruction.RDDTransformApplyOffsetFunction)2 FrameReader (org.apache.sysml.runtime.io.FrameReader)2 MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics)2 FrameBlock (org.apache.sysml.runtime.matrix.data.FrameBlock)2 MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)2 MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes)2 Encoder (org.apache.sysml.runtime.transform.encode.Encoder)2 EncoderMVImpute (org.apache.sysml.runtime.transform.encode.EncoderMVImpute)2 TfOffsetMap (org.apache.sysml.runtime.transform.meta.TfOffsetMap)2