Search in sources :

Example 1 with NumItemsByEachReducerMetaData

use of org.apache.sysml.runtime.matrix.data.NumItemsByEachReducerMetaData in project incubator-systemml by apache.

The class GMR, method runJob.

/**
	 * Execute a generic MR job (G-MR): sets up inputs (including optional
	 * record-reader based pick/range inputs), mapper/combiner/reducer
	 * instructions, output formats, then runs the job and collects result
	 * statistics.
	 * 
	 * @param inst MR job instruction
	 * @param inputs input matrices, the inputs are indexed by 0, 1, 2, .. based on the position in this string
	 * @param inputInfos the input format information for the input matrices
	 * @param rlens array of number of rows
	 * @param clens array of number of columns
	 * @param brlens array of number of rows in block
	 * @param bclens array of number of columns in block
	 * @param partitioned boolean array of partitioned status (currently unused by this method)
	 * @param pformats array of data partition formats
	 * @param psizes array of data partition sizes (currently unused by this method)
	 * @param recordReaderInstruction record reader instruction
	 * @param instructionsInMapper in Mapper, the set of unary operations that need to be performed on each input matrix
	 * @param aggInstructionsInReducer in Reducer, right after sorting, the set of aggregate operations
	 * that need to be performed on each input matrix
	 * @param otherInstructionsInReducer the mixed operations that need to be performed on matrices after the aggregate operations
	 * @param numReducers the number of reducers
	 * @param replication the replication factor for the output
	 * @param jvmReuse if true, reuse JVM
	 * @param resultIndexes the indexes of the result matrices that need to be outputted
	 * @param dimsUnknownFilePrefix file path prefix when dimensions unknown
	 * @param outputs the names for the output directories, one for each result index
	 * @param outputInfos output format information for the output matrices
	 * @return job return object
	 * @throws Exception if Exception occurs
	 */
@SuppressWarnings({ "unchecked", "rawtypes" })
public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens, long[] clens, int[] brlens, int[] bclens, boolean[] partitioned, PDataPartitionFormat[] pformats, int[] psizes, String recordReaderInstruction, String instructionsInMapper, String aggInstructionsInReducer, String otherInstructionsInReducer, int numReducers, int replication, boolean jvmReuse, byte[] resultIndexes, String dimsUnknownFilePrefix, String[] outputs, OutputInfo[] outputInfos) throws Exception {
    JobConf job = new JobConf(GMR.class);
    job.setJobName("G-MR");
    boolean inBlockRepresentation = MRJobConfiguration.deriveRepresentation(inputInfos);
    //whether use block representation or cell representation
    MRJobConfiguration.setMatrixValueClass(job, inBlockRepresentation);
    //added for handling recordreader instruction: by default the "real" inputs
    //are the given inputs with identity index mapping 0..n-1
    String[] realinputs = inputs;
    InputInfo[] realinputInfos = inputInfos;
    long[] realrlens = rlens;
    long[] realclens = clens;
    int[] realbrlens = brlens;
    int[] realbclens = bclens;
    byte[] realIndexes = new byte[inputs.length];
    for (byte b = 0; b < realIndexes.length; b++) realIndexes[b] = b;
    if (recordReaderInstruction != null && !recordReaderInstruction.isEmpty()) {
        //note: assertions are disabled by default at runtime (enable via -ea)
        assert (inputs.length <= 2);
        PickByCountInstruction ins = (PickByCountInstruction) PickByCountInstruction.parseInstruction(recordReaderInstruction);
        PickFromCompactInputFormat.setKeyValueClasses(job, (Class<? extends WritableComparable>) inputInfos[ins.input1].inputKeyClass, inputInfos[ins.input1].inputValueClass);
        job.setInputFormat(PickFromCompactInputFormat.class);
        PickFromCompactInputFormat.setZeroValues(job, (NumItemsByEachReducerMetaData) inputInfos[ins.input1].metadata);
        if (ins.isValuePick) {
            //value pick: input2 holds the probabilities to pick; it is consumed
            //here and removed from the set of job inputs below
            double[] probs = MapReduceTool.readColumnVectorFromHDFS(inputs[ins.input2], inputInfos[ins.input2], rlens[ins.input2], clens[ins.input2], brlens[ins.input2], bclens[ins.input2]);
            PickFromCompactInputFormat.setPickRecordsInEachPartFile(job, (NumItemsByEachReducerMetaData) inputInfos[ins.input1].metadata, probs);
            realinputs = new String[inputs.length - 1];
            realinputInfos = new InputInfo[inputs.length - 1];
            realrlens = new long[inputs.length - 1];
            realclens = new long[inputs.length - 1];
            realbrlens = new int[inputs.length - 1];
            realbclens = new int[inputs.length - 1];
            realIndexes = new byte[inputs.length - 1];
            byte realIndex = 0;
            for (byte i = 0; i < inputs.length; i++) {
                //skip the consumed probability input
                if (i == ins.input2)
                    continue;
                realinputs[realIndex] = inputs[i];
                realinputInfos[realIndex] = inputInfos[i];
                if (i == ins.input1) {
                    //picked input: dimensions follow the probability vector,
                    //values are read as 1x1 cells and tagged with the output index
                    realrlens[realIndex] = rlens[ins.input2];
                    realclens[realIndex] = clens[ins.input2];
                    realbrlens[realIndex] = 1;
                    realbclens[realIndex] = 1;
                    realIndexes[realIndex] = ins.output;
                } else {
                    realrlens[realIndex] = rlens[i];
                    realclens[realIndex] = clens[i];
                    realbrlens[realIndex] = brlens[i];
                    realbclens[realIndex] = bclens[i];
                    realIndexes[realIndex] = i;
                }
                realIndex++;
            }
        } else {
            //range pick (e.g., inter-quantile): restrict input1 to the picked range
            PickFromCompactInputFormat.setRangePickPartFiles(job, (NumItemsByEachReducerMetaData) inputInfos[ins.input1].metadata, ins.cst, 1 - ins.cst);
            realrlens[ins.input1] = UtilFunctions.getLengthForInterQuantile((NumItemsByEachReducerMetaData) inputInfos[ins.input1].metadata, ins.cst);
            realclens[ins.input1] = clens[ins.input1];
            realbrlens[ins.input1] = 1;
            realbclens[ins.input1] = 1;
            realIndexes[ins.input1] = ins.output;
        }
    }
    boolean resetDistCache = setupDistributedCache(job, instructionsInMapper, otherInstructionsInReducer, realinputs, realrlens, realclens);
    //set up the input files and their format information
    boolean[] distCacheOnly = getDistCacheOnlyInputs(realIndexes, recordReaderInstruction, instructionsInMapper, aggInstructionsInReducer, otherInstructionsInReducer);
    MRJobConfiguration.setUpMultipleInputs(job, realIndexes, realinputs, realinputInfos, realbrlens, realbclens, distCacheOnly, true, inBlockRepresentation ? ConvertTarget.BLOCK : ConvertTarget.CELL);
    MRJobConfiguration.setInputPartitioningInfo(job, pformats);
    //set up the dimensions of input matrices
    MRJobConfiguration.setMatricesDimensions(job, realIndexes, realrlens, realclens);
    MRJobConfiguration.setDimsUnknownFilePrefix(job, dimsUnknownFilePrefix);
    //set up the block size
    MRJobConfiguration.setBlocksSizes(job, realIndexes, realbrlens, realbclens);
    //set up unary instructions that will perform in the mapper
    MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper);
    //set up the aggregate instructions that will happen in the combiner and reducer
    MRJobConfiguration.setAggregateInstructions(job, aggInstructionsInReducer);
    //set up the instructions that will happen in the reducer, after the aggregation instructions
    MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);
    //set up the replication factor for the results
    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
    //set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);
    //set up map/reduce memory configurations (if in AM context)
    DMLConfig config = ConfigurationManager.getDMLConfig();
    DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);
    //set up custom map/reduce configurations 
    MRJobConfiguration.setupCustomMRConfigurations(job, config);
    //set up jvm reuse (incl. reuse of loaded dist cache matrices)
    if (jvmReuse)
        job.setNumTasksToExecutePerJvm(-1);
    //set up what matrices are needed to pass from the mapper to reducer
    HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, instructionsInMapper, aggInstructionsInReducer, otherInstructionsInReducer, resultIndexes);
    MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, instructionsInMapper, aggInstructionsInReducer, null, otherInstructionsInReducer, resultIndexes, mapoutputIndexes, false);
    MatrixCharacteristics[] stats = ret.stats;
    //set up the number of reducers
    MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers);
    // Print the complete instruction
    if (LOG.isTraceEnabled())
        inst.printCompleteMRJobInstruction(stats);
    // Update resultDimsUnknown based on computed "stats":
    // flag each result whose rows or columns are still unknown (-1)
    byte[] dimsUnknown = new byte[resultIndexes.length];
    for (int i = 0; i < resultIndexes.length; i++) {
        if (stats[i].getRows() == -1 || stats[i].getCols() == -1) {
            dimsUnknown[i] = (byte) 1;
        } else {
            dimsUnknown[i] = (byte) 0;
        }
    }
    //set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, dimsUnknown, outputs, outputInfos, inBlockRepresentation, true);
    // configure mapper and the mapper output key value pairs
    job.setMapperClass(GMRMapper.class);
    if (numReducers == 0) {
        //map-only job: generic writables suffice for direct output
        job.setMapOutputKeyClass(Writable.class);
        job.setMapOutputValueClass(Writable.class);
    } else {
        job.setMapOutputKeyClass(MatrixIndexes.class);
        if (inBlockRepresentation)
            job.setMapOutputValueClass(TaggedMatrixBlock.class);
        else
            job.setMapOutputValueClass(TaggedMatrixPackedCell.class);
    }
    //set up combiner (only useful if there are reducers and aggregate instructions)
    if (numReducers != 0 && aggInstructionsInReducer != null && !aggInstructionsInReducer.isEmpty()) {
        job.setCombinerClass(GMRCombiner.class);
    }
    //configure reducer
    job.setReducerClass(GMRReducer.class);
    //note: a former per-input MatrixCharacteristics computation intended for a
    //"local" vs "cluster" mode decision was dead code (never read) and was removed
    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);
    RunningJob runjob = JobClient.runJob(job);
    //collect non-zero counts per result from job counters
    Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
    for (int i = 0; i < resultIndexes.length; i++) stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
    //cleanups: merge dims files written for unknown-dimension results, then delete them
    String dir = dimsUnknownFilePrefix + "/" + runjob.getID().toString() + "_dimsFile";
    stats = MapReduceTool.processDimsFiles(dir, stats);
    MapReduceTool.deleteFileIfExistOnHDFS(dir);
    if (resetDistCache)
        MRBaseForCommonInstructions.resetDistCache();
    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}
Also used : Group(org.apache.hadoop.mapred.Counters.Group) PickByCountInstruction(org.apache.sysml.runtime.instructions.mr.PickByCountInstruction) TaggedMatrixPackedCell(org.apache.sysml.runtime.matrix.data.TaggedMatrixPackedCell) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) JobConf(org.apache.hadoop.mapred.JobConf) DMLConfig(org.apache.sysml.conf.DMLConfig) TaggedMatrixBlock(org.apache.sysml.runtime.matrix.data.TaggedMatrixBlock) NumItemsByEachReducerMetaData(org.apache.sysml.runtime.matrix.data.NumItemsByEachReducerMetaData) MatrixChar_N_ReducerGroups(org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration.MatrixChar_N_ReducerGroups) RunningJob(org.apache.hadoop.mapred.RunningJob)

Example 2 with NumItemsByEachReducerMetaData

use of org.apache.sysml.runtime.matrix.data.NumItemsByEachReducerMetaData in project incubator-systemml by apache.

The class CacheableData, method toString.

public String toString() {
    // Renders: "<SimpleClassName>: <hdfsFileName>, <metadata...>, dirty|not-dirty"
    StringBuilder sb = new StringBuilder();
    sb.append(getClass().getSimpleName()).append(": ");
    sb.append(_hdfsFileName).append(", ");
    if (_metaData instanceof NumItemsByEachReducerMetaData) {
        sb.append("NumItemsByEachReducerMetaData");
    } else {
        try {
            MatrixFormatMetaData fmtMeta = (MatrixFormatMetaData) _metaData;
            if (fmtMeta == null) {
                sb.append("null, null");
            } else {
                MatrixCharacteristics dims = ((MatrixDimensionsMetaData) _metaData).getMatrixCharacteristics();
                sb.append(dims.toString());
                InputInfo info = fmtMeta.getInputInfo();
                if (info != null) {
                    sb.append(", ").append(InputInfo.inputInfoToString(info));
                } else {
                    sb.append("null");
                }
            }
        } catch (Exception ex) {
            // best effort: metadata rendering failures must not break toString
            LOG.error(ex);
        }
    }
    sb.append(", ").append(isDirty() ? "dirty" : "not-dirty");
    return sb.toString();
}
Also used : InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) MatrixDimensionsMetaData(org.apache.sysml.runtime.matrix.MatrixDimensionsMetaData) NumItemsByEachReducerMetaData(org.apache.sysml.runtime.matrix.data.NumItemsByEachReducerMetaData) MatrixFormatMetaData(org.apache.sysml.runtime.matrix.MatrixFormatMetaData) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) IOException(java.io.IOException) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics)

Example 3 with NumItemsByEachReducerMetaData

use of org.apache.sysml.runtime.matrix.data.NumItemsByEachReducerMetaData in project incubator-systemml by apache.

The class MRJobInstruction, method populateInputs.

/**
	 * Auxiliary data structures that store information required to spawn MR jobs.
	 * These data structures are populated by pulling out information from symbol
	 * table. More specifically, from information stored in <code>inputMatrices</code>
	 * and <code>outputMatrices</code>.   
	 */
private void populateInputs() {
    // inputVars can also contain scalar variables, hence the auxiliary arrays
    // are sized by inputMatrices.length rather than inputVars.length.
    final int numInputs = inputMatrices.length;
    inputs = new String[numInputs];
    inputInfos = new InputInfo[numInputs];
    rlens = new long[numInputs];
    clens = new long[numInputs];
    brlens = new int[numInputs];
    bclens = new int[numInputs];
    partitioned = new boolean[numInputs];
    pformats = new PDataPartitionFormat[numInputs];
    psizes = new int[numInputs];
    // copy per-input file names, dimensions, block sizes, format and partition info
    for (int idx = 0; idx < numInputs; idx++) {
        inputs[idx] = inputMatrices[idx].getFileName();
        MatrixCharacteristics dims = inputMatrices[idx].getMatrixCharacteristics();
        rlens[idx] = dims.getRows();
        clens[idx] = dims.getCols();
        brlens[idx] = dims.getRowsPerBlock();
        bclens[idx] = dims.getColsPerBlock();
        if (inputMatrices[idx].getMetaData() instanceof MatrixFormatMetaData) {
            inputInfos[idx] = ((MatrixFormatMetaData) inputMatrices[idx].getMetaData()).getInputInfo();
        } else if (inputMatrices[idx].getMetaData() instanceof NumItemsByEachReducerMetaData) {
            // sort output: use the dedicated input info and attach the per-reducer counts
            inputInfos[idx] = InputInfo.InputInfoForSortOutput;
            inputInfos[idx].metadata = inputMatrices[idx].getMetaData();
        }
        partitioned[idx] = inputMatrices[idx].isPartitioned();
        pformats[idx] = inputMatrices[idx].getPartitionFormat();
        psizes[idx] = inputMatrices[idx].getPartitionSize();
    }
}
Also used : MatrixFormatMetaData(org.apache.sysml.runtime.matrix.MatrixFormatMetaData) NumItemsByEachReducerMetaData(org.apache.sysml.runtime.matrix.data.NumItemsByEachReducerMetaData) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics)

Example 4 with NumItemsByEachReducerMetaData

use of org.apache.sysml.runtime.matrix.data.NumItemsByEachReducerMetaData in project incubator-systemml by apache.

The class QuantilePickCPInstruction, method processInstruction.

@Override
public void processInstruction(ExecutionContext ec) throws DMLRuntimeException {
    // Dispatches a quantile-pick operation (_type) in either in-memory (_inmem)
    // or MR mode. MR variants read precomputed sort output described by
    // NumItemsByEachReducerMetaData instead of materializing the matrix.
    switch(_type) {
        case VALUEPICK:
            //INMEM VALUEPICK: pick value(s) directly from the in-memory sorted matrix
            if (_inmem) {
                MatrixBlock matBlock = ec.getMatrixInput(input1.getName());
                if (input2.getDataType() == DataType.SCALAR) {
                    // single scalar quantile -> scalar result
                    ScalarObject quantile = ec.getScalarInput(input2.getName(), input2.getValueType(), input2.isLiteral());
                    double picked = matBlock.pickValue(quantile.getDoubleValue());
                    ec.setScalarOutput(output.getName(), new DoubleObject(picked));
                } else {
                    // matrix of quantiles -> matrix result
                    MatrixBlock quantiles = ec.getMatrixInput(input2.getName());
                    MatrixBlock resultBlock = (MatrixBlock) matBlock.pickValues(quantiles, new MatrixBlock());
                    quantiles = null;
                    ec.releaseMatrixInput(input2.getName());
                    ec.setMatrixOutput(output.getName(), resultBlock);
                }
                // release input1 only after all picking is done
                ec.releaseMatrixInput(input1.getName());
            } else //MR VALUEPICK: pick from HDFS sort output via its reducer-count metadata
            {
                MatrixObject mat = ec.getMatrixObject(input1.getName());
                String fname = mat.getFileName();
                MetaData mdata = mat.getMetaData();
                ScalarObject pickindex = ec.getScalarInput(input2.getName(), input2.getValueType(), input2.isLiteral());
                if (mdata != null) {
                    try {
                        double picked = MapReduceTool.pickValue(fname, (NumItemsByEachReducerMetaData) mdata, pickindex.getDoubleValue());
                        ec.setVariable(output.getName(), new DoubleObject(picked));
                    } catch (Exception e) {
                        throw new DMLRuntimeException(e);
                    }
                } else {
                    throw new DMLRuntimeException("Unexpected error while executing ValuePickCP: otherMetaData for file (" + fname + ") not found.");
                }
            }
            break;
        case MEDIAN:
            //INMEM MEDIAN
            if (_inmem) {
                double picked = ec.getMatrixInput(input1.getName()).median();
                ec.setScalarOutput(output.getName(), new DoubleObject(picked));
                ec.releaseMatrixInput(input1.getName());
                break;
            } else //MR MEDIAN: compute from HDFS sort output via its metadata
            {
                MatrixObject mat1 = (MatrixObject) ec.getVariable(input1.getName());
                String fname1 = mat1.getFileName();
                MetaData mdata1 = mat1.getMetaData();
                if (mdata1 != null) {
                    try {
                        double median = MapReduceTool.median(fname1, (NumItemsByEachReducerMetaData) mdata1);
                        ec.setVariable(output.getName(), new DoubleObject(median));
                    } catch (Exception e) {
                        throw new DMLRuntimeException(e);
                    }
                } else {
                    // note: message says "ValuePickCP" although this is the MEDIAN case
                    throw new DMLRuntimeException("Unexpected error while executing ValuePickCP: otherMetaData for file (" + fname1 + ") not found.");
                }
            }
            break;
        case IQM:
            //INMEM IQM: inter-quartile mean of the in-memory matrix
            if (_inmem) {
                MatrixBlock matBlock1 = ec.getMatrixInput(input1.getName());
                double iqm = matBlock1.interQuartileMean();
                ec.releaseMatrixInput(input1.getName());
                ec.setScalarOutput(output.getName(), new DoubleObject(iqm));
            } else //MR IQM: correct a precomputed inter-quartile sum at the quartile boundaries
            {
                MatrixObject inputMatrix = (MatrixObject) ec.getVariable(input1.getName());
                ScalarObject iqsum = ec.getScalarInput(input2.getName(), input2.getValueType(), input2.isLiteral());
                // q25/q75 entries: [0]=value, [1]=weight, [2]=cumulative weight (presumably — TODO confirm against pickValueWeight)
                double[] q25 = null;
                double[] q75 = null;
                try {
                    q25 = MapReduceTool.pickValueWeight(inputMatrix.getFileName(), (NumItemsByEachReducerMetaData) inputMatrix.getMetaData(), 0.25, false);
                    q75 = MapReduceTool.pickValueWeight(inputMatrix.getFileName(), (NumItemsByEachReducerMetaData) inputMatrix.getMetaData(), 0.75, false);
                } catch (IOException e1) {
                    throw new DMLRuntimeException(e1);
                }
                double sumwt = UtilFunctions.getTotalLength((NumItemsByEachReducerMetaData) ec.getMetaData(input1.getName()));
                double q25d = sumwt * 0.25;
                double q75d = sumwt * 0.75;
                // iqsum = interQuartileSum that includes complete portions of q25 and q75
                //   . exclude top portion of q25 and bottom portion of q75 
                double q25entry_weight = q25[0] * q25[1];
                double q25portion_include = (q25[2] - q25d) * q25[0];
                double q25portion_exclude = q25entry_weight - q25portion_include;
                double q75portion_exclude = (q75[2] - q75d) * q75[0];
                // normalize the corrected sum by half the total weight (the IQ range)
                double mriqm = (iqsum.getDoubleValue() - q25portion_exclude - q75portion_exclude) / (sumwt * 0.5);
                ec.setScalarOutput(output.getName(), new DoubleObject(mriqm));
            }
            break;
        default:
            throw new DMLRuntimeException("Unsupported qpick operation type: " + _type);
    }
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) NumItemsByEachReducerMetaData(org.apache.sysml.runtime.matrix.data.NumItemsByEachReducerMetaData) MetaData(org.apache.sysml.runtime.matrix.MetaData) IOException(java.io.IOException) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) IOException(java.io.IOException) NumItemsByEachReducerMetaData(org.apache.sysml.runtime.matrix.data.NumItemsByEachReducerMetaData) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)

Aggregations

NumItemsByEachReducerMetaData (org.apache.sysml.runtime.matrix.data.NumItemsByEachReducerMetaData)4 IOException (java.io.IOException)2 DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)2 MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics)2 MatrixFormatMetaData (org.apache.sysml.runtime.matrix.MatrixFormatMetaData)2 InputInfo (org.apache.sysml.runtime.matrix.data.InputInfo)2 Group (org.apache.hadoop.mapred.Counters.Group)1 JobConf (org.apache.hadoop.mapred.JobConf)1 RunningJob (org.apache.hadoop.mapred.RunningJob)1 DMLConfig (org.apache.sysml.conf.DMLConfig)1 MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject)1 PickByCountInstruction (org.apache.sysml.runtime.instructions.mr.PickByCountInstruction)1 MatrixDimensionsMetaData (org.apache.sysml.runtime.matrix.MatrixDimensionsMetaData)1 MetaData (org.apache.sysml.runtime.matrix.MetaData)1 MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)1 TaggedMatrixBlock (org.apache.sysml.runtime.matrix.data.TaggedMatrixBlock)1 TaggedMatrixPackedCell (org.apache.sysml.runtime.matrix.data.TaggedMatrixPackedCell)1 MatrixChar_N_ReducerGroups (org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration.MatrixChar_N_ReducerGroups)1