Search in sources :

Example 1 with PickByCountInstruction

use of org.apache.sysml.runtime.instructions.mr.PickByCountInstruction in project incubator-systemml by apache.

From the class CostEstimatorStaticRuntime, method extractMRInstStatistics.

/**
 * Extracts the variable statistics (inputs/output) and opcode-specific
 * attributes of a single MR instruction for static cost estimation.
 *
 * @param inst  serialized MR instruction string
 * @param stats variable statistics, indexed by instruction byte positions
 * @return Object[2]: [0] = VarStats[3] with {input1, input2, output} statistics,
 *         [1] = String[] of opcode-specific attributes (null if none apply)
 */
private Object[] extractMRInstStatistics(String inst, VarStats[] stats) {
    // stats, attrs
    Object[] ret = new Object[2];
    // vs[0], vs[1] = input statistics; vs[2] = output statistics
    VarStats[] vs = new VarStats[3];
    String[] attr = null;
    String[] parts = InstructionUtils.getInstructionParts(inst);
    String opcode = parts[0];
    if (opcode.equals(DataGen.RAND_OPCODE)) {
        vs[0] = _unknownStats;
        vs[1] = _unknownStats;
        vs[2] = stats[Integer.parseInt(parts[2])];
        // rand type: 0 = empty (min=max=0), 1 = constant (dense, min=max), 2 = general
        int type = 2;
        // awareness of instruction patching min/max
        if (!parts[7].contains(Lop.VARIABLE_NAME_PLACEHOLDER) && !parts[8].contains(Lop.VARIABLE_NAME_PLACEHOLDER)) {
            double minValue = Double.parseDouble(parts[7]);
            double maxValue = Double.parseDouble(parts[8]);
            double sparsity = Double.parseDouble(parts[9]);
            if (minValue == 0.0 && maxValue == 0.0)
                type = 0;
            else if (sparsity == 1.0 && minValue == maxValue)
                type = 1;
        }
        attr = new String[] { String.valueOf(type) };
    } else if (opcode.equals(DataGen.SEQ_OPCODE)) {
        // FIX: this was an independent 'if', so a rand instruction also fell
        // through into the general case below, re-parsed the instruction, and
        // clobbered the statistics and attributes computed above
        vs[0] = _unknownStats;
        vs[1] = _unknownStats;
        vs[2] = stats[Integer.parseInt(parts[2])];
    } else // general case
    {
        String inst2 = replaceInstructionPatch(inst);
        MRInstruction mrinst = MRInstructionParser.parseSingleInstruction(inst2);
        if (mrinst instanceof UnaryMRInstructionBase) {
            UnaryMRInstructionBase uinst = (UnaryMRInstructionBase) mrinst;
            vs[0] = uinst.input >= 0 ? stats[uinst.input] : _unknownStats;
            vs[1] = _unknownStats;
            vs[2] = stats[uinst.output];
            if (vs[0] == null) // scalar input, e.g., print
                vs[0] = _scalarStats;
            if (vs[2] == null) // scalar output
                vs[2] = _scalarStats;
            if (mrinst instanceof MMTSJMRInstruction) {
                String type = ((MMTSJMRInstruction) mrinst).getMMTSJType().toString();
                attr = new String[] { type };
            } else if (mrinst instanceof CM_N_COVInstruction) {
                if (opcode.equals("cm"))
                    attr = new String[] { parts[parts.length - 2] };
            } else if (mrinst instanceof GroupedAggregateInstruction) {
                if (opcode.equals("groupedagg")) {
                    AggregateOperationTypes type = CMOperator.getAggOpType(parts[2], parts[3]);
                    attr = new String[] { String.valueOf(type.ordinal()) };
                }
            }
        } else if (mrinst instanceof BinaryMRInstructionBase) {
            BinaryMRInstructionBase binst = (BinaryMRInstructionBase) mrinst;
            vs[0] = stats[binst.input1];
            vs[1] = stats[binst.input2];
            vs[2] = stats[binst.output];
            if (vs[0] == null) // scalar input
                vs[0] = _scalarStats;
            if (vs[1] == null) // scalar input
                vs[1] = _scalarStats;
            if (vs[2] == null) // scalar output
                vs[2] = _scalarStats;
            if (opcode.equals("rmempty")) {
                RemoveEmptyMRInstruction rbinst = (RemoveEmptyMRInstruction) mrinst;
                attr = new String[] { rbinst.isRemoveRows() ? "0" : "1" };
            }
        } else if (mrinst instanceof TernaryInstruction) {
            TernaryInstruction tinst = (TernaryInstruction) mrinst;
            byte[] ix = tinst.getAllIndexes();
            // NOTE(review): only vs[0] is (re-)assigned in this loop, so vs[1]
            // never receives input statistics and vs[0] ends up with the last
            // input's stats — looks suspicious; confirm intent before changing
            for (int i = 0; i < ix.length - 1; i++) vs[0] = stats[ix[i]];
            vs[2] = stats[ix[ix.length - 1]];
            if (vs[0] == null) // scalar input
                vs[0] = _scalarStats;
            if (vs[1] == null) // scalar input
                vs[1] = _scalarStats;
            if (vs[2] == null) // scalar output
                vs[2] = _scalarStats;
        } else if (mrinst instanceof CtableInstruction) {
            CtableInstruction tinst = (CtableInstruction) mrinst;
            vs[0] = stats[tinst.input1];
            vs[1] = stats[tinst.input2];
            vs[2] = stats[tinst.input3];
            if (vs[0] == null) // scalar input
                vs[0] = _scalarStats;
            if (vs[1] == null) // scalar input
                vs[1] = _scalarStats;
            if (vs[2] == null) // scalar input
                vs[2] = _scalarStats;
        } else if (mrinst instanceof PickByCountInstruction) {
            PickByCountInstruction pinst = (PickByCountInstruction) mrinst;
            vs[0] = stats[pinst.input1];
            vs[2] = stats[pinst.output];
            if (vs[0] == null) // scalar input
                vs[0] = _scalarStats;
            if (vs[1] == null) // second input unused for pick: default to scalar
                vs[1] = _scalarStats;
            if (vs[2] == null) // scalar output
                vs[2] = _scalarStats;
        } else if (mrinst instanceof MapMultChainInstruction) {
            MapMultChainInstruction minst = (MapMultChainInstruction) mrinst;
            vs[0] = stats[minst.getInput1()];
            vs[1] = stats[minst.getInput2()];
            if (minst.getInput3() >= 0)
                vs[2] = stats[minst.getInput3()];
            if (vs[0] == null) // scalar input
                vs[0] = _scalarStats;
            if (vs[1] == null) // scalar input
                vs[1] = _scalarStats;
            if (vs[2] == null) // scalar input (or no third input)
                vs[2] = _scalarStats;
        }
    }
    // defensive fallback for instruction types not matched above (previously
    // a NullPointerException on vs[2]._inmem below)
    for (int i = 0; i < vs.length; i++)
        if (vs[i] == null)
            vs[i] = _unknownStats;
    // maintain var status (CP output always inmem)
    vs[2]._inmem = true;
    ret[0] = vs;
    ret[1] = attr;
    return ret;
}
Also used: CM_N_COVInstruction(org.apache.sysml.runtime.instructions.mr.CM_N_COVInstruction) BinaryMRInstructionBase(org.apache.sysml.runtime.instructions.mr.BinaryMRInstructionBase) PickByCountInstruction(org.apache.sysml.runtime.instructions.mr.PickByCountInstruction) TernaryInstruction(org.apache.sysml.runtime.instructions.mr.TernaryInstruction) AggregateOperationTypes(org.apache.sysml.runtime.matrix.operators.CMOperator.AggregateOperationTypes) RemoveEmptyMRInstruction(org.apache.sysml.runtime.instructions.mr.RemoveEmptyMRInstruction) CtableInstruction(org.apache.sysml.runtime.instructions.mr.CtableInstruction) UnaryMRInstructionBase(org.apache.sysml.runtime.instructions.mr.UnaryMRInstructionBase) MapMultChainInstruction(org.apache.sysml.runtime.instructions.mr.MapMultChainInstruction) DataGenMRInstruction(org.apache.sysml.runtime.instructions.mr.DataGenMRInstruction) MMTSJMRInstruction(org.apache.sysml.runtime.instructions.mr.MMTSJMRInstruction) MRInstruction(org.apache.sysml.runtime.instructions.mr.MRInstruction) GroupedAggregateInstruction(org.apache.sysml.runtime.instructions.mr.GroupedAggregateInstruction)

Example 2 with PickByCountInstruction

use of org.apache.sysml.runtime.instructions.mr.PickByCountInstruction in project incubator-systemml by apache.

From the class GMR, method runJob.

/**
 * Execute job.
 *
 * @param inst MR job instruction
 * @param inputs input matrices, the inputs are indexed by 0, 1, 2, .. based on the position in this string
 * @param inputInfos the input format information for the input matrices
 * @param rlens array of number of rows
 * @param clens array of number of columns
 * @param brlens array of number of rows in block
 * @param bclens array of number of columns in block
 * @param partitioned boolean array of partitioned status
 * @param pformats array of data partition formats
 * @param psizes does nothing
 * @param recordReaderInstruction record reader instruction
 * @param instructionsInMapper in Mapper, the set of unary operations that need to be performed on each input matrix
 * @param aggInstructionsInReducer in Reducer, right after sorting, the set of aggregate operations
 * that need to be performed on each input matrix
 * @param otherInstructionsInReducer the mixed operations that need to be performed on matrices after the aggregate operations
 * @param numReducers the number of reducers
 * @param replication the replication factor for the output
 * @param jvmReuse if true, reuse JVM
 * @param resultIndexes the indexes of the result matrices that need to be output
 * @param dimsUnknownFilePrefix file path prefix when dimensions unknown
 * @param outputs the names for the output directories, one for each result index
 * @param outputInfos output format information for the output matrices
 * @return job return object
 * @throws Exception if Exception occurs
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens, long[] clens, int[] brlens, int[] bclens, boolean[] partitioned, PDataPartitionFormat[] pformats, int[] psizes, String recordReaderInstruction, String instructionsInMapper, String aggInstructionsInReducer, String otherInstructionsInReducer, int numReducers, int replication, boolean jvmReuse, byte[] resultIndexes, String dimsUnknownFilePrefix, String[] outputs, OutputInfo[] outputInfos) throws Exception {
    JobConf job = new JobConf(GMR.class);
    job.setJobName("G-MR");
    boolean inBlockRepresentation = MRJobConfiguration.deriveRepresentation(inputInfos);
    // whether use block representation or cell representation
    MRJobConfiguration.setMatrixValueClass(job, inBlockRepresentation);
    // added for handling recordreader instruction
    String[] realinputs = inputs;
    InputInfo[] realinputInfos = inputInfos;
    long[] realrlens = rlens;
    long[] realclens = clens;
    int[] realbrlens = brlens;
    int[] realbclens = bclens;
    byte[] realIndexes = new byte[inputs.length];
    for (byte b = 0; b < realIndexes.length; b++) realIndexes[b] = b;
    // a record-reader (quantile pick) instruction replaces the default input handling:
    // the picked input is read via PickFromCompactInputFormat and the job inputs
    // (paths, infos, dims, block sizes, indexes) are rewired accordingly
    if (recordReaderInstruction != null && !recordReaderInstruction.isEmpty()) {
        // note: only checked when the JVM runs with assertions enabled (-ea)
        assert (inputs.length <= 2);
        PickByCountInstruction ins = (PickByCountInstruction) PickByCountInstruction.parseInstruction(recordReaderInstruction);
        PickFromCompactInputFormat.setKeyValueClasses(job, (Class<? extends WritableComparable>) inputInfos[ins.input1].inputKeyClass, inputInfos[ins.input1].inputValueClass);
        job.setInputFormat(PickFromCompactInputFormat.class);
        PickFromCompactInputFormat.setZeroValues(job, (MetaDataNumItemsByEachReducer) inputInfos[ins.input1].metadata);
        if (ins.isValuePick) {
            // value pick: the probabilities vector (ins.input2) is read up front
            // from HDFS and removed from the set of job inputs
            double[] probs = MapReduceTool.readColumnVectorFromHDFS(inputs[ins.input2], inputInfos[ins.input2], rlens[ins.input2], clens[ins.input2], brlens[ins.input2], bclens[ins.input2]);
            PickFromCompactInputFormat.setPickRecordsInEachPartFile(job, (MetaDataNumItemsByEachReducer) inputInfos[ins.input1].metadata, probs);
            realinputs = new String[inputs.length - 1];
            realinputInfos = new InputInfo[inputs.length - 1];
            realrlens = new long[inputs.length - 1];
            realclens = new long[inputs.length - 1];
            realbrlens = new int[inputs.length - 1];
            realbclens = new int[inputs.length - 1];
            realIndexes = new byte[inputs.length - 1];
            byte realIndex = 0;
            // copy all inputs except the probabilities vector; the picked input
            // takes the pick instruction's output index and 1x1 "block" sizes
            for (byte i = 0; i < inputs.length; i++) {
                if (i == ins.input2)
                    continue;
                realinputs[realIndex] = inputs[i];
                realinputInfos[realIndex] = inputInfos[i];
                if (i == ins.input1) {
                    realrlens[realIndex] = rlens[ins.input2];
                    realclens[realIndex] = clens[ins.input2];
                    realbrlens[realIndex] = 1;
                    realbclens[realIndex] = 1;
                    realIndexes[realIndex] = ins.output;
                } else {
                    realrlens[realIndex] = rlens[i];
                    realclens[realIndex] = clens[i];
                    realbrlens[realIndex] = brlens[i];
                    realbclens[realIndex] = bclens[i];
                    realIndexes[realIndex] = i;
                }
                realIndex++;
            }
        } else {
            // range pick: inputs stay in place, but the picked input's row count
            // shrinks to the inter-quantile length and it takes the output index
            // PickFromCompactInputFormat.setPickRecordsInEachPartFile(job, (NumItemsByEachReducerMetaData) inputInfos[ins.input1].metadata, ins.cst, 1-ins.cst);
            PickFromCompactInputFormat.setRangePickPartFiles(job, (MetaDataNumItemsByEachReducer) inputInfos[ins.input1].metadata, ins.cst, 1 - ins.cst);
            realrlens[ins.input1] = UtilFunctions.getLengthForInterQuantile((MetaDataNumItemsByEachReducer) inputInfos[ins.input1].metadata, ins.cst);
            realclens[ins.input1] = clens[ins.input1];
            realbrlens[ins.input1] = 1;
            realbclens[ins.input1] = 1;
            realIndexes[ins.input1] = ins.output;
        }
    }
    boolean resetDistCache = setupDistributedCache(job, instructionsInMapper, otherInstructionsInReducer, realinputs, realrlens, realclens);
    // set up the input files and their format information
    boolean[] distCacheOnly = getDistCacheOnlyInputs(realIndexes, recordReaderInstruction, instructionsInMapper, aggInstructionsInReducer, otherInstructionsInReducer);
    MRJobConfiguration.setUpMultipleInputs(job, realIndexes, realinputs, realinputInfos, realbrlens, realbclens, distCacheOnly, true, inBlockRepresentation ? ConvertTarget.BLOCK : ConvertTarget.CELL);
    MRJobConfiguration.setInputPartitioningInfo(job, pformats);
    // set up the dimensions of input matrices
    MRJobConfiguration.setMatricesDimensions(job, realIndexes, realrlens, realclens);
    MRJobConfiguration.setDimsUnknownFilePrefix(job, dimsUnknownFilePrefix);
    // set up the block size
    MRJobConfiguration.setBlocksSizes(job, realIndexes, realbrlens, realbclens);
    // set up unary instructions that will perform in the mapper
    MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper);
    // set up the aggregate instructions that will happen in the combiner and reducer
    MRJobConfiguration.setAggregateInstructions(job, aggInstructionsInReducer);
    // set up the instructions that will happen in the reducer, after the aggregation instructions
    MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);
    // set up the replication factor for the results
    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
    // set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);
    // set up map/reduce memory configurations (if in AM context)
    DMLConfig config = ConfigurationManager.getDMLConfig();
    DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);
    // set up custom map/reduce configurations
    MRJobConfiguration.setupCustomMRConfigurations(job, config);
    // set up jvm reuse (incl. reuse of loaded dist cache matrices)
    if (jvmReuse)
        job.setNumTasksToExecutePerJvm(-1);
    // set up what matrices are needed to pass from the mapper to reducer
    HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, instructionsInMapper, aggInstructionsInReducer, otherInstructionsInReducer, resultIndexes);
    MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, instructionsInMapper, aggInstructionsInReducer, null, otherInstructionsInReducer, resultIndexes, mapoutputIndexes, false);
    MatrixCharacteristics[] stats = ret.stats;
    // set up the number of reducers
    MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers);
    // Print the complete instruction
    if (LOG.isTraceEnabled())
        inst.printCompleteMRJobInstruction(stats);
    // Update resultDimsUnknown based on computed "stats"
    byte[] dimsUnknown = new byte[resultIndexes.length];
    for (int i = 0; i < resultIndexes.length; i++) {
        if (stats[i].getRows() == -1 || stats[i].getCols() == -1) {
            dimsUnknown[i] = (byte) 1;
        } else {
            dimsUnknown[i] = (byte) 0;
        }
    }
    // MRJobConfiguration.updateResultDimsUnknown(job,resultDimsUnknown);
    // set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, dimsUnknown, outputs, outputInfos, inBlockRepresentation, true);
    // configure mapper and the mapper output key value pairs
    job.setMapperClass(GMRMapper.class);
    if (numReducers == 0) {
        job.setMapOutputKeyClass(Writable.class);
        job.setMapOutputValueClass(Writable.class);
    } else {
        job.setMapOutputKeyClass(MatrixIndexes.class);
        if (inBlockRepresentation)
            job.setMapOutputValueClass(TaggedMatrixBlock.class);
        else
            job.setMapOutputValueClass(TaggedMatrixPackedCell.class);
    }
    // set up combiner
    if (numReducers != 0 && aggInstructionsInReducer != null && !aggInstructionsInReducer.isEmpty()) {
        job.setCombinerClass(GMRCombiner.class);
    }
    // configure reducer
    job.setReducerClass(GMRReducer.class);
    // job.setReducerClass(PassThroughReducer.class);
    // By default, the job executes in "cluster" mode.
    // Determine if we can optimize and run it in "local" mode.
    // NOTE(review): inputStats is computed but never used below — appears to be
    // a leftover from a removed local-mode optimization; confirm before removing
    MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
    for (int i = 0; i < inputs.length; i++) {
        inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
    }
    // set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);
    RunningJob runjob = JobClient.runJob(job);
    // collect the per-result non-zero counts accumulated by the job's counters
    Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
    for (int i = 0; i < resultIndexes.length; i++) stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
    // cleanups
    String dir = dimsUnknownFilePrefix + "/" + runjob.getID().toString() + "_dimsFile";
    stats = MapReduceTool.processDimsFiles(dir, stats);
    MapReduceTool.deleteFileIfExistOnHDFS(dir);
    if (resetDistCache)
        MRBaseForCommonInstructions.resetDistCache();
    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}
Also used : Group(org.apache.hadoop.mapred.Counters.Group) PickByCountInstruction(org.apache.sysml.runtime.instructions.mr.PickByCountInstruction) TaggedMatrixPackedCell(org.apache.sysml.runtime.matrix.data.TaggedMatrixPackedCell) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) JobConf(org.apache.hadoop.mapred.JobConf) DMLConfig(org.apache.sysml.conf.DMLConfig) TaggedMatrixBlock(org.apache.sysml.runtime.matrix.data.TaggedMatrixBlock) MatrixChar_N_ReducerGroups(org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration.MatrixChar_N_ReducerGroups) RunningJob(org.apache.hadoop.mapred.RunningJob)

Example 3 with PickByCountInstruction

use of org.apache.sysml.runtime.instructions.mr.PickByCountInstruction in project systemml by apache.

From the class GMR, method runJob.

/**
 * Execute job.
 *
 * @param inst MR job instruction
 * @param inputs input matrices, the inputs are indexed by 0, 1, 2, .. based on the position in this string
 * @param inputInfos the input format information for the input matrices
 * @param rlens array of number of rows
 * @param clens array of number of columns
 * @param brlens array of number of rows in block
 * @param bclens array of number of columns in block
 * @param partitioned boolean array of partitioned status
 * @param pformats array of data partition formats
 * @param psizes does nothing
 * @param recordReaderInstruction record reader instruction
 * @param instructionsInMapper in Mapper, the set of unary operations that need to be performed on each input matrix
 * @param aggInstructionsInReducer in Reducer, right after sorting, the set of aggregate operations
 * that need to be performed on each input matrix
 * @param otherInstructionsInReducer the mixed operations that need to be performed on matrices after the aggregate operations
 * @param numReducers the number of reducers
 * @param replication the replication factor for the output
 * @param jvmReuse if true, reuse JVM
 * @param resultIndexes the indexes of the result matrices that need to be output
 * @param dimsUnknownFilePrefix file path prefix when dimensions unknown
 * @param outputs the names for the output directories, one for each result index
 * @param outputInfos output format information for the output matrices
 * @return job return object
 * @throws Exception if Exception occurs
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens, long[] clens, int[] brlens, int[] bclens, boolean[] partitioned, PDataPartitionFormat[] pformats, int[] psizes, String recordReaderInstruction, String instructionsInMapper, String aggInstructionsInReducer, String otherInstructionsInReducer, int numReducers, int replication, boolean jvmReuse, byte[] resultIndexes, String dimsUnknownFilePrefix, String[] outputs, OutputInfo[] outputInfos) throws Exception {
    JobConf job = new JobConf(GMR.class);
    job.setJobName("G-MR");
    boolean inBlockRepresentation = MRJobConfiguration.deriveRepresentation(inputInfos);
    // whether use block representation or cell representation
    MRJobConfiguration.setMatrixValueClass(job, inBlockRepresentation);
    // added for handling recordreader instruction
    String[] realinputs = inputs;
    InputInfo[] realinputInfos = inputInfos;
    long[] realrlens = rlens;
    long[] realclens = clens;
    int[] realbrlens = brlens;
    int[] realbclens = bclens;
    byte[] realIndexes = new byte[inputs.length];
    for (byte b = 0; b < realIndexes.length; b++) realIndexes[b] = b;
    // a record-reader (quantile pick) instruction replaces the default input handling:
    // the picked input is read via PickFromCompactInputFormat and the job inputs
    // (paths, infos, dims, block sizes, indexes) are rewired accordingly
    if (recordReaderInstruction != null && !recordReaderInstruction.isEmpty()) {
        // note: only checked when the JVM runs with assertions enabled (-ea)
        assert (inputs.length <= 2);
        PickByCountInstruction ins = (PickByCountInstruction) PickByCountInstruction.parseInstruction(recordReaderInstruction);
        PickFromCompactInputFormat.setKeyValueClasses(job, (Class<? extends WritableComparable>) inputInfos[ins.input1].inputKeyClass, inputInfos[ins.input1].inputValueClass);
        job.setInputFormat(PickFromCompactInputFormat.class);
        PickFromCompactInputFormat.setZeroValues(job, (MetaDataNumItemsByEachReducer) inputInfos[ins.input1].metadata);
        if (ins.isValuePick) {
            // value pick: the probabilities vector (ins.input2) is read up front
            // from HDFS and removed from the set of job inputs
            double[] probs = MapReduceTool.readColumnVectorFromHDFS(inputs[ins.input2], inputInfos[ins.input2], rlens[ins.input2], clens[ins.input2], brlens[ins.input2], bclens[ins.input2]);
            PickFromCompactInputFormat.setPickRecordsInEachPartFile(job, (MetaDataNumItemsByEachReducer) inputInfos[ins.input1].metadata, probs);
            realinputs = new String[inputs.length - 1];
            realinputInfos = new InputInfo[inputs.length - 1];
            realrlens = new long[inputs.length - 1];
            realclens = new long[inputs.length - 1];
            realbrlens = new int[inputs.length - 1];
            realbclens = new int[inputs.length - 1];
            realIndexes = new byte[inputs.length - 1];
            byte realIndex = 0;
            // copy all inputs except the probabilities vector; the picked input
            // takes the pick instruction's output index and 1x1 "block" sizes
            for (byte i = 0; i < inputs.length; i++) {
                if (i == ins.input2)
                    continue;
                realinputs[realIndex] = inputs[i];
                realinputInfos[realIndex] = inputInfos[i];
                if (i == ins.input1) {
                    realrlens[realIndex] = rlens[ins.input2];
                    realclens[realIndex] = clens[ins.input2];
                    realbrlens[realIndex] = 1;
                    realbclens[realIndex] = 1;
                    realIndexes[realIndex] = ins.output;
                } else {
                    realrlens[realIndex] = rlens[i];
                    realclens[realIndex] = clens[i];
                    realbrlens[realIndex] = brlens[i];
                    realbclens[realIndex] = bclens[i];
                    realIndexes[realIndex] = i;
                }
                realIndex++;
            }
        } else {
            // range pick: inputs stay in place, but the picked input's row count
            // shrinks to the inter-quantile length and it takes the output index
            // PickFromCompactInputFormat.setPickRecordsInEachPartFile(job, (NumItemsByEachReducerMetaData) inputInfos[ins.input1].metadata, ins.cst, 1-ins.cst);
            PickFromCompactInputFormat.setRangePickPartFiles(job, (MetaDataNumItemsByEachReducer) inputInfos[ins.input1].metadata, ins.cst, 1 - ins.cst);
            realrlens[ins.input1] = UtilFunctions.getLengthForInterQuantile((MetaDataNumItemsByEachReducer) inputInfos[ins.input1].metadata, ins.cst);
            realclens[ins.input1] = clens[ins.input1];
            realbrlens[ins.input1] = 1;
            realbclens[ins.input1] = 1;
            realIndexes[ins.input1] = ins.output;
        }
    }
    boolean resetDistCache = setupDistributedCache(job, instructionsInMapper, otherInstructionsInReducer, realinputs, realrlens, realclens);
    // set up the input files and their format information
    boolean[] distCacheOnly = getDistCacheOnlyInputs(realIndexes, recordReaderInstruction, instructionsInMapper, aggInstructionsInReducer, otherInstructionsInReducer);
    MRJobConfiguration.setUpMultipleInputs(job, realIndexes, realinputs, realinputInfos, realbrlens, realbclens, distCacheOnly, true, inBlockRepresentation ? ConvertTarget.BLOCK : ConvertTarget.CELL);
    MRJobConfiguration.setInputPartitioningInfo(job, pformats);
    // set up the dimensions of input matrices
    MRJobConfiguration.setMatricesDimensions(job, realIndexes, realrlens, realclens);
    MRJobConfiguration.setDimsUnknownFilePrefix(job, dimsUnknownFilePrefix);
    // set up the block size
    MRJobConfiguration.setBlocksSizes(job, realIndexes, realbrlens, realbclens);
    // set up unary instructions that will perform in the mapper
    MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper);
    // set up the aggregate instructions that will happen in the combiner and reducer
    MRJobConfiguration.setAggregateInstructions(job, aggInstructionsInReducer);
    // set up the instructions that will happen in the reducer, after the aggregation instructions
    MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);
    // set up the replication factor for the results
    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
    // set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);
    // set up map/reduce memory configurations (if in AM context)
    DMLConfig config = ConfigurationManager.getDMLConfig();
    DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);
    // set up custom map/reduce configurations
    MRJobConfiguration.setupCustomMRConfigurations(job, config);
    // set up jvm reuse (incl. reuse of loaded dist cache matrices)
    if (jvmReuse)
        job.setNumTasksToExecutePerJvm(-1);
    // set up what matrices are needed to pass from the mapper to reducer
    HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, instructionsInMapper, aggInstructionsInReducer, otherInstructionsInReducer, resultIndexes);
    MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, instructionsInMapper, aggInstructionsInReducer, null, otherInstructionsInReducer, resultIndexes, mapoutputIndexes, false);
    MatrixCharacteristics[] stats = ret.stats;
    // set up the number of reducers
    MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers);
    // Print the complete instruction
    if (LOG.isTraceEnabled())
        inst.printCompleteMRJobInstruction(stats);
    // Update resultDimsUnknown based on computed "stats"
    byte[] dimsUnknown = new byte[resultIndexes.length];
    for (int i = 0; i < resultIndexes.length; i++) {
        if (stats[i].getRows() == -1 || stats[i].getCols() == -1) {
            dimsUnknown[i] = (byte) 1;
        } else {
            dimsUnknown[i] = (byte) 0;
        }
    }
    // MRJobConfiguration.updateResultDimsUnknown(job,resultDimsUnknown);
    // set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, dimsUnknown, outputs, outputInfos, inBlockRepresentation, true);
    // configure mapper and the mapper output key value pairs
    job.setMapperClass(GMRMapper.class);
    if (numReducers == 0) {
        job.setMapOutputKeyClass(Writable.class);
        job.setMapOutputValueClass(Writable.class);
    } else {
        job.setMapOutputKeyClass(MatrixIndexes.class);
        if (inBlockRepresentation)
            job.setMapOutputValueClass(TaggedMatrixBlock.class);
        else
            job.setMapOutputValueClass(TaggedMatrixPackedCell.class);
    }
    // set up combiner
    if (numReducers != 0 && aggInstructionsInReducer != null && !aggInstructionsInReducer.isEmpty()) {
        job.setCombinerClass(GMRCombiner.class);
    }
    // configure reducer
    job.setReducerClass(GMRReducer.class);
    // job.setReducerClass(PassThroughReducer.class);
    // By default, the job executes in "cluster" mode.
    // Determine if we can optimize and run it in "local" mode.
    // NOTE(review): inputStats is computed but never used below — appears to be
    // a leftover from a removed local-mode optimization; confirm before removing
    MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
    for (int i = 0; i < inputs.length; i++) {
        inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
    }
    // set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);
    RunningJob runjob = JobClient.runJob(job);
    // collect the per-result non-zero counts accumulated by the job's counters
    Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
    for (int i = 0; i < resultIndexes.length; i++) stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
    // cleanups
    String dir = dimsUnknownFilePrefix + "/" + runjob.getID().toString() + "_dimsFile";
    stats = MapReduceTool.processDimsFiles(dir, stats);
    MapReduceTool.deleteFileIfExistOnHDFS(dir);
    if (resetDistCache)
        MRBaseForCommonInstructions.resetDistCache();
    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}
Also used : Group(org.apache.hadoop.mapred.Counters.Group) PickByCountInstruction(org.apache.sysml.runtime.instructions.mr.PickByCountInstruction) TaggedMatrixPackedCell(org.apache.sysml.runtime.matrix.data.TaggedMatrixPackedCell) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) JobConf(org.apache.hadoop.mapred.JobConf) DMLConfig(org.apache.sysml.conf.DMLConfig) TaggedMatrixBlock(org.apache.sysml.runtime.matrix.data.TaggedMatrixBlock) MatrixChar_N_ReducerGroups(org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration.MatrixChar_N_ReducerGroups) RunningJob(org.apache.hadoop.mapred.RunningJob)

Example 4 with PickByCountInstruction

use of org.apache.sysml.runtime.instructions.mr.PickByCountInstruction in project systemml by apache.

From the class CostEstimatorStaticRuntime, method extractMRInstStatistics.

/**
 * Extracts input/output variable statistics and opcode-specific attributes for a
 * single serialized MR instruction, for use in static cost estimation.
 * <p>
 * Result slot convention: {@code vs[0]}=first input, {@code vs[1]}=second input,
 * {@code vs[2]}=output. Slots that cannot be resolved are filled with
 * {@code _unknownStats}; slots whose stats entry is {@code null} (scalar operands,
 * e.g. print) are filled with {@code _scalarStats}.
 *
 * @param inst  serialized MR instruction (may contain variable-name patch placeholders)
 * @param stats variable statistics indexed by instruction byte index
 * @return two-element array: {@code ret[0]} = {@code VarStats[3]} {in1, in2, out},
 *         {@code ret[1]} = {@code String[]} of opcode-specific attributes (may be null)
 */
private Object[] extractMRInstStatistics(String inst, VarStats[] stats) {
    // stats, attrs
    Object[] ret = new Object[2];
    VarStats[] vs = new VarStats[3];
    String[] attr = null;
    String[] parts = InstructionUtils.getInstructionParts(inst);
    String opcode = parts[0];
    if (opcode.equals(DataGen.RAND_OPCODE)) {
        vs[0] = _unknownStats;
        vs[1] = _unknownStats;
        vs[2] = stats[Integer.parseInt(parts[2])];
        // type encodes the rand pattern: 0 = all-zero, 1 = constant dense, 2 = general
        int type = 2;
        // awareness of instruction patching min/max: only inspect the numeric fields
        // if they are not unresolved variable-name placeholders
        if (!parts[7].contains(Lop.VARIABLE_NAME_PLACEHOLDER) && !parts[8].contains(Lop.VARIABLE_NAME_PLACEHOLDER)) {
            double minValue = Double.parseDouble(parts[7]);
            double maxValue = Double.parseDouble(parts[8]);
            double sparsity = Double.parseDouble(parts[9]);
            if (minValue == 0.0 && maxValue == 0.0)
                type = 0;
            else if (sparsity == 1.0 && minValue == maxValue)
                type = 1;
        }
        attr = new String[] { String.valueOf(type) };
    } else if (opcode.equals(DataGen.SEQ_OPCODE)) {
        // FIX: this was an independent 'if', so rand instructions handled above
        // also fell into the general-case else branch, were re-parsed, and could
        // have their vs/attr results overwritten.
        vs[0] = _unknownStats;
        vs[1] = _unknownStats;
        vs[2] = stats[Integer.parseInt(parts[2])];
    } else // general case
    {
        // resolve remaining patch placeholders, then parse into a typed instruction
        String inst2 = replaceInstructionPatch(inst);
        MRInstruction mrinst = MRInstructionParser.parseSingleInstruction(inst2);
        if (mrinst instanceof UnaryMRInstructionBase) {
            UnaryMRInstructionBase uinst = (UnaryMRInstructionBase) mrinst;
            vs[0] = uinst.input >= 0 ? stats[uinst.input] : _unknownStats;
            vs[1] = _unknownStats;
            vs[2] = stats[uinst.output];
            if (vs[0] == null) // scalar input, e.g., print
                vs[0] = _scalarStats;
            if (vs[2] == null) // scalar output
                vs[2] = _scalarStats;
            // opcode-specific attributes for selected unary instructions
            if (mrinst instanceof MMTSJMRInstruction) {
                String type = ((MMTSJMRInstruction) mrinst).getMMTSJType().toString();
                attr = new String[] { type };
            } else if (mrinst instanceof CM_N_COVInstruction) {
                if (opcode.equals("cm"))
                    attr = new String[] { parts[parts.length - 2] };
            } else if (mrinst instanceof GroupedAggregateInstruction) {
                if (opcode.equals("groupedagg")) {
                    AggregateOperationTypes type = CMOperator.getAggOpType(parts[2], parts[3]);
                    attr = new String[] { String.valueOf(type.ordinal()) };
                }
            }
        } else if (mrinst instanceof BinaryMRInstructionBase) {
            BinaryMRInstructionBase binst = (BinaryMRInstructionBase) mrinst;
            vs[0] = stats[binst.input1];
            vs[1] = stats[binst.input2];
            vs[2] = stats[binst.output];
            if (vs[0] == null) // scalar input
                vs[0] = _scalarStats;
            if (vs[1] == null) // scalar input
                vs[1] = _scalarStats;
            if (vs[2] == null) // scalar output
                vs[2] = _scalarStats;
            if (opcode.equals("rmempty")) {
                RemoveEmptyMRInstruction rbinst = (RemoveEmptyMRInstruction) mrinst;
                attr = new String[] { rbinst.isRemoveRows() ? "0" : "1" };
            }
        } else if (mrinst instanceof TernaryInstruction) {
            TernaryInstruction tinst = (TernaryInstruction) mrinst;
            byte[] ix = tinst.getAllIndexes();
            // NOTE(review): this loop overwrites vs[0] on every iteration, so only
            // the last non-output index is kept and vs[1] is never set from the
            // instruction; preserved as-is -- TODO confirm this is intentional.
            for (int i = 0; i < ix.length - 1; i++) vs[0] = stats[ix[i]];
            vs[2] = stats[ix[ix.length - 1]];
            if (vs[0] == null) // scalar input
                vs[0] = _scalarStats;
            if (vs[1] == null) // scalar input
                vs[1] = _scalarStats;
            if (vs[2] == null) // scalar output
                vs[2] = _scalarStats;
        } else if (mrinst instanceof CtableInstruction) {
            CtableInstruction tinst = (CtableInstruction) mrinst;
            vs[0] = stats[tinst.input1];
            vs[1] = stats[tinst.input2];
            vs[2] = stats[tinst.input3];
            if (vs[0] == null) // scalar input
                vs[0] = _scalarStats;
            if (vs[1] == null) // scalar input
                vs[1] = _scalarStats;
            if (vs[2] == null) // scalar input
                vs[2] = _scalarStats;
        } else if (mrinst instanceof PickByCountInstruction) {
            PickByCountInstruction pinst = (PickByCountInstruction) mrinst;
            vs[0] = stats[pinst.input1];
            vs[2] = stats[pinst.output];
            if (vs[0] == null) // scalar input
                vs[0] = _scalarStats;
            if (vs[1] == null) // second input unused; defaults to scalar
                vs[1] = _scalarStats;
            if (vs[2] == null) // scalar output
                vs[2] = _scalarStats;
        } else if (mrinst instanceof MapMultChainInstruction) {
            MapMultChainInstruction minst = (MapMultChainInstruction) mrinst;
            vs[0] = stats[minst.getInput1()];
            vs[1] = stats[minst.getInput2()];
            if (minst.getInput3() >= 0) // optional third input
                vs[2] = stats[minst.getInput3()];
            if (vs[0] == null) // scalar input
                vs[0] = _scalarStats;
            if (vs[1] == null) // scalar input
                vs[1] = _scalarStats;
            if (vs[2] == null) // scalar input
                vs[2] = _scalarStats;
        }
    }
    // maintain var status (CP output always inmem)
    // NOTE(review): assumes vs[2] was resolved by one of the branches above;
    // an unrecognized instruction type would NPE here -- TODO confirm coverage.
    vs[2]._inmem = true;
    ret[0] = vs;
    ret[1] = attr;
    return ret;
}
Also used : CM_N_COVInstruction(org.apache.sysml.runtime.instructions.mr.CM_N_COVInstruction) BinaryMRInstructionBase(org.apache.sysml.runtime.instructions.mr.BinaryMRInstructionBase) PickByCountInstruction(org.apache.sysml.runtime.instructions.mr.PickByCountInstruction) TernaryInstruction(org.apache.sysml.runtime.instructions.mr.TernaryInstruction) AggregateOperationTypes(org.apache.sysml.runtime.matrix.operators.CMOperator.AggregateOperationTypes) RemoveEmptyMRInstruction(org.apache.sysml.runtime.instructions.mr.RemoveEmptyMRInstruction) CtableInstruction(org.apache.sysml.runtime.instructions.mr.CtableInstruction) UnaryMRInstructionBase(org.apache.sysml.runtime.instructions.mr.UnaryMRInstructionBase) MapMultChainInstruction(org.apache.sysml.runtime.instructions.mr.MapMultChainInstruction) MMTSJMRInstruction(org.apache.sysml.runtime.instructions.mr.MMTSJMRInstruction) DataGenMRInstruction(org.apache.sysml.runtime.instructions.mr.DataGenMRInstruction) MMTSJMRInstruction(org.apache.sysml.runtime.instructions.mr.MMTSJMRInstruction) MRInstruction(org.apache.sysml.runtime.instructions.mr.MRInstruction) RemoveEmptyMRInstruction(org.apache.sysml.runtime.instructions.mr.RemoveEmptyMRInstruction) GroupedAggregateInstruction(org.apache.sysml.runtime.instructions.mr.GroupedAggregateInstruction)

Aggregations

PickByCountInstruction (org.apache.sysml.runtime.instructions.mr.PickByCountInstruction)4 Group (org.apache.hadoop.mapred.Counters.Group)2 JobConf (org.apache.hadoop.mapred.JobConf)2 RunningJob (org.apache.hadoop.mapred.RunningJob)2 DMLConfig (org.apache.sysml.conf.DMLConfig)2 BinaryMRInstructionBase (org.apache.sysml.runtime.instructions.mr.BinaryMRInstructionBase)2 CM_N_COVInstruction (org.apache.sysml.runtime.instructions.mr.CM_N_COVInstruction)2 CtableInstruction (org.apache.sysml.runtime.instructions.mr.CtableInstruction)2 DataGenMRInstruction (org.apache.sysml.runtime.instructions.mr.DataGenMRInstruction)2 GroupedAggregateInstruction (org.apache.sysml.runtime.instructions.mr.GroupedAggregateInstruction)2 MMTSJMRInstruction (org.apache.sysml.runtime.instructions.mr.MMTSJMRInstruction)2 MRInstruction (org.apache.sysml.runtime.instructions.mr.MRInstruction)2 MapMultChainInstruction (org.apache.sysml.runtime.instructions.mr.MapMultChainInstruction)2 RemoveEmptyMRInstruction (org.apache.sysml.runtime.instructions.mr.RemoveEmptyMRInstruction)2 TernaryInstruction (org.apache.sysml.runtime.instructions.mr.TernaryInstruction)2 UnaryMRInstructionBase (org.apache.sysml.runtime.instructions.mr.UnaryMRInstructionBase)2 InputInfo (org.apache.sysml.runtime.matrix.data.InputInfo)2 TaggedMatrixBlock (org.apache.sysml.runtime.matrix.data.TaggedMatrixBlock)2 TaggedMatrixPackedCell (org.apache.sysml.runtime.matrix.data.TaggedMatrixPackedCell)2 MatrixChar_N_ReducerGroups (org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration.MatrixChar_N_ReducerGroups)2