Search in sources :

Example 51 with InputInfo

use of org.apache.sysml.runtime.matrix.data.InputInfo in project incubator-systemml by apache.

the class VariableCPInstruction method parseInstruction.

public static VariableCPInstruction parseInstruction(String str) {
    String[] parts = InstructionUtils.getInstructionPartsWithValueType(str);
    String opcode = parts[0];
    VariableOperationCode voc = getVariableOperationCode(opcode);
    if (voc == VariableOperationCode.CreateVariable) {
        if (// && parts.length != 10 )
        parts.length < 5)
            throw new DMLRuntimeException("Invalid number of operands in createvar instruction: " + str);
    } else if (voc == VariableOperationCode.MoveVariable) {
        // mvvar tempA A; or mvvar mvar5 "data/out.mtx" "binary"
        if (parts.length != 3 && parts.length != 4)
            throw new DMLRuntimeException("Invalid number of operands in mvvar instruction: " + str);
    } else if (voc == VariableOperationCode.Write) {
        // Write instructions for csv files also include three additional parameters (hasHeader, delimiter, sparse)
        if (parts.length != 5 && parts.length != 8)
            throw new DMLRuntimeException("Invalid number of operands in write instruction: " + str);
    } else {
        if (voc != VariableOperationCode.RemoveVariable)
            // no output
            InstructionUtils.checkNumFields(parts, getArity(voc));
    }
    CPOperand in1 = null, in2 = null, in3 = null, in4 = null, out = null;
    switch(voc) {
        case CreateVariable:
            // variable name
            DataType dt = DataType.valueOf(parts[4]);
            ValueType vt = dt == DataType.MATRIX ? ValueType.DOUBLE : ValueType.STRING;
            int extSchema = (dt == DataType.FRAME && parts.length >= 13) ? 1 : 0;
            in1 = new CPOperand(parts[1], vt, dt);
            // file name
            in2 = new CPOperand(parts[2], ValueType.STRING, DataType.SCALAR);
            // file name override flag (always literal)
            in3 = new CPOperand(parts[3], ValueType.BOOLEAN, DataType.SCALAR);
            // format
            String fmt = parts[5];
            if (fmt.equalsIgnoreCase("csv")) {
                // 14 inputs: createvar corresponding to READ -- includes properties hasHeader, delim, fill, and fillValue
                if (parts.length < 15 + extSchema || parts.length > 17 + extSchema)
                    throw new DMLRuntimeException("Invalid number of operands in createvar instruction: " + str);
            } else {
                if (parts.length != 6 && parts.length != 12 + extSchema)
                    throw new DMLRuntimeException("Invalid number of operands in createvar instruction: " + str);
            }
            OutputInfo oi = OutputInfo.stringToOutputInfo(fmt);
            InputInfo ii = OutputInfo.getMatchingInputInfo(oi);
            MatrixCharacteristics mc = new MatrixCharacteristics();
            if (parts.length == 6) {
            // do nothing
            } else if (parts.length >= 11) {
                // matrix characteristics
                mc.setDimension(Long.parseLong(parts[6]), Long.parseLong(parts[7]));
                mc.setBlockSize(Integer.parseInt(parts[8]), Integer.parseInt(parts[9]));
                mc.setNonZeros(Long.parseLong(parts[10]));
            } else {
                throw new DMLRuntimeException("Invalid number of operands in createvar instruction: " + str);
            }
            MetaDataFormat iimd = new MetaDataFormat(mc, oi, ii);
            UpdateType updateType = UpdateType.COPY;
            if (parts.length >= 12)
                updateType = UpdateType.valueOf(parts[11].toUpperCase());
            // handle frame schema
            String schema = (dt == DataType.FRAME && parts.length >= 13) ? parts[parts.length - 1] : null;
            if (fmt.equalsIgnoreCase("csv")) {
                // Cretevar instructions for CSV format either has 13 or 14 inputs.
                // 13 inputs: createvar corresponding to WRITE -- includes properties hasHeader, delim, and sparse
                // 14 inputs: createvar corresponding to READ -- includes properties hasHeader, delim, fill, and fillValue
                FileFormatProperties fmtProperties = null;
                if (parts.length == 15 + extSchema) {
                    boolean hasHeader = Boolean.parseBoolean(parts[12]);
                    String delim = parts[13];
                    boolean sparse = Boolean.parseBoolean(parts[14]);
                    fmtProperties = new CSVFileFormatProperties(hasHeader, delim, sparse);
                } else {
                    boolean hasHeader = Boolean.parseBoolean(parts[12]);
                    String delim = parts[13];
                    boolean fill = Boolean.parseBoolean(parts[14]);
                    double fillValue = UtilFunctions.parseToDouble(parts[15]);
                    String naStrings = null;
                    if (parts.length == 17 + extSchema)
                        naStrings = parts[16];
                    fmtProperties = new CSVFileFormatProperties(hasHeader, delim, fill, fillValue, naStrings);
                }
                return new VariableCPInstruction(VariableOperationCode.CreateVariable, in1, in2, in3, iimd, updateType, fmtProperties, schema, opcode, str);
            } else {
                return new VariableCPInstruction(VariableOperationCode.CreateVariable, in1, in2, in3, iimd, updateType, schema, opcode, str);
            }
        case AssignVariable:
            in1 = new CPOperand(parts[1]);
            in2 = new CPOperand(parts[2]);
            break;
        case CopyVariable:
            // Value types are not given here
            in1 = new CPOperand(parts[1], ValueType.UNKNOWN, DataType.UNKNOWN);
            in2 = new CPOperand(parts[2], ValueType.UNKNOWN, DataType.UNKNOWN);
            break;
        case MoveVariable:
            in1 = new CPOperand(parts[1], ValueType.UNKNOWN, DataType.UNKNOWN);
            in2 = new CPOperand(parts[2], ValueType.UNKNOWN, DataType.UNKNOWN);
            if (parts.length > 3)
                in3 = new CPOperand(parts[3], ValueType.UNKNOWN, DataType.UNKNOWN);
            break;
        case RemoveVariable:
            VariableCPInstruction rminst = new VariableCPInstruction(getVariableOperationCode(opcode), null, null, null, out, opcode, str);
            for (int i = 1; i < parts.length; i++) rminst.addInput(new CPOperand(parts[i], ValueType.UNKNOWN, DataType.SCALAR));
            return rminst;
        case RemoveVariableAndFile:
            in1 = new CPOperand(parts[1]);
            in2 = new CPOperand(parts[2]);
            // second argument must be a boolean
            if (in2.getValueType() != ValueType.BOOLEAN)
                throw new DMLRuntimeException("Unexpected value type for second argument in: " + str);
            break;
        case CastAsScalarVariable:
        case CastAsMatrixVariable:
        case CastAsFrameVariable:
        case CastAsDoubleVariable:
        case CastAsIntegerVariable:
        case CastAsBooleanVariable:
            // first operand is a variable name => string value type
            in1 = new CPOperand(parts[1]);
            // output variable name
            out = new CPOperand(parts[2]);
            break;
        case Write:
            in1 = new CPOperand(parts[1]);
            in2 = new CPOperand(parts[2]);
            in3 = new CPOperand(parts[3]);
            FileFormatProperties fprops = null;
            if (in3.getName().equalsIgnoreCase("csv")) {
                boolean hasHeader = Boolean.parseBoolean(parts[4]);
                String delim = parts[5];
                boolean sparse = Boolean.parseBoolean(parts[6]);
                fprops = new CSVFileFormatProperties(hasHeader, delim, sparse);
                // description
                in4 = new CPOperand(parts[7]);
            } else {
                fprops = new FileFormatProperties();
                // description
                in4 = new CPOperand(parts[4]);
            }
            VariableCPInstruction inst = new VariableCPInstruction(getVariableOperationCode(opcode), in1, in2, in3, out, null, fprops, null, null, opcode, str);
            inst.addInput(in4);
            return inst;
        case Read:
            in1 = new CPOperand(parts[1]);
            in2 = new CPOperand(parts[2]);
            out = null;
            break;
        case SetFileName:
            // variable name
            in1 = new CPOperand(parts[1]);
            // file name
            in2 = new CPOperand(parts[2], ValueType.UNKNOWN, DataType.UNKNOWN);
            // option: remote or local
            in3 = new CPOperand(parts[3], ValueType.UNKNOWN, DataType.UNKNOWN);
            // return new VariableCPInstruction(getVariableOperationCode(opcode), in1, in2, in3, str);
            break;
    }
    return new VariableCPInstruction(getVariableOperationCode(opcode), in1, in2, in3, out, opcode, str);
}
Also used : MetaDataFormat(org.apache.sysml.runtime.matrix.MetaDataFormat) CSVFileFormatProperties(org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties) ValueType(org.apache.sysml.parser.Expression.ValueType) UpdateType(org.apache.sysml.runtime.controlprogram.caching.MatrixObject.UpdateType) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) OutputInfo(org.apache.sysml.runtime.matrix.data.OutputInfo) CSVFileFormatProperties(org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties) FileFormatProperties(org.apache.sysml.runtime.matrix.data.FileFormatProperties) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) DataType(org.apache.sysml.parser.Expression.DataType)

Example 52 with InputInfo

use of org.apache.sysml.runtime.matrix.data.InputInfo in project incubator-systemml by apache.

the class CMCOVMR method runJob.

public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens, long[] clens, int[] brlens, int[] bclens, String instructionsInMapper, String cmNcomInstructions, int numReducers, int replication, byte[] resultIndexes, String[] outputs, OutputInfo[] outputInfos) throws Exception {
    JobConf job = new JobConf(CMCOVMR.class);
    job.setJobName("CM-COV-MR");
    // whether use block representation or cell representation
    MRJobConfiguration.setMatrixValueClassForCM_N_COM(job, true);
    // added for handling recordreader instruction
    String[] realinputs = inputs;
    InputInfo[] realinputInfos = inputInfos;
    long[] realrlens = rlens;
    long[] realclens = clens;
    int[] realbrlens = brlens;
    int[] realbclens = bclens;
    byte[] realIndexes = new byte[inputs.length];
    for (byte b = 0; b < realIndexes.length; b++) realIndexes[b] = b;
    // set up the input files and their format information
    MRJobConfiguration.setUpMultipleInputs(job, realIndexes, realinputs, realinputInfos, realbrlens, realbclens, true, ConvertTarget.WEIGHTEDCELL);
    // set up the dimensions of input matrices
    MRJobConfiguration.setMatricesDimensions(job, realIndexes, realrlens, realclens);
    // set up the block size
    MRJobConfiguration.setBlocksSizes(job, realIndexes, realbrlens, realbclens);
    // set up unary instructions that will perform in the mapper
    MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper);
    // set up the aggregate instructions that will happen in the combiner and reducer
    MRJobConfiguration.setCM_N_COMInstructions(job, cmNcomInstructions);
    // set up the replication factor for the results
    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
    // set up custom map/reduce configurations
    DMLConfig config = ConfigurationManager.getDMLConfig();
    MRJobConfiguration.setupCustomMRConfigurations(job, config);
    // set up what matrices are needed to pass from the mapper to reducer
    HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, instructionsInMapper, null, cmNcomInstructions, resultIndexes);
    // set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, new byte[resultIndexes.length], outputs, outputInfos, false);
    // configure mapper and the mapper output key value pairs
    job.setMapperClass(CMCOVMRMapper.class);
    job.setMapOutputKeyClass(TaggedFirstSecondIndexes.class);
    job.setMapOutputValueClass(CM_N_COVCell.class);
    job.setOutputKeyComparatorClass(TaggedFirstSecondIndexes.Comparator.class);
    job.setPartitionerClass(TaggedFirstSecondIndexes.TagPartitioner.class);
    // configure reducer
    job.setReducerClass(CMCOVMRReducer.class);
    // job.setReducerClass(PassThroughReducer.class);
    MatrixCharacteristics[] stats = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, instructionsInMapper, null, null, cmNcomInstructions, resultIndexes, mapoutputIndexes, false).stats;
    // set up the number of reducers
    // each output tag is a group
    MRJobConfiguration.setNumReducers(job, mapoutputIndexes.size(), numReducers);
    // Print the complete instruction
    if (LOG.isTraceEnabled())
        inst.printCompleteMRJobInstruction(stats);
    // By default, the job executes in "cluster" mode.
    // Determine if we can optimize and run it in "local" mode.
    MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
    for (int i = 0; i < inputs.length; i++) {
        inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
    }
    // set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);
    RunningJob runjob = JobClient.runJob(job);
    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}
Also used : DMLConfig(org.apache.sysml.conf.DMLConfig) TaggedFirstSecondIndexes(org.apache.sysml.runtime.matrix.data.TaggedFirstSecondIndexes) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) RunningJob(org.apache.hadoop.mapred.RunningJob) JobConf(org.apache.hadoop.mapred.JobConf)

Example 53 with InputInfo

use of org.apache.sysml.runtime.matrix.data.InputInfo in project incubator-systemml by apache.

the class DataGenMR method runJob.

/**
 * <p>Starts a Rand MapReduce job which will produce one or more random objects.</p>
 *
 * @param inst MR job instruction
 * @param dataGenInstructions array of data gen instructions
 * @param instructionsInMapper instructions in mapper
 * @param aggInstructionsInReducer aggregate instructions in reducer
 * @param otherInstructionsInReducer other instructions in reducer
 * @param numReducers number of reducers
 * @param replication file replication
 * @param resultIndexes result indexes for each random object
 * @param dimsUnknownFilePrefix file path prefix when dimensions unknown
 * @param outputs output file for each random object
 * @param outputInfos output information for each random object
 * @return matrix characteristics for each random object
 * @throws Exception if Exception occurs
 */
public static JobReturn runJob(MRJobInstruction inst, String[] dataGenInstructions, String instructionsInMapper, String aggInstructionsInReducer, String otherInstructionsInReducer, int numReducers, int replication, byte[] resultIndexes, String dimsUnknownFilePrefix, String[] outputs, OutputInfo[] outputInfos) throws Exception {
    JobConf job = new JobConf(DataGenMR.class);
    job.setJobName("DataGen-MR");
    // whether use block representation or cell representation
    MRJobConfiguration.setMatrixValueClass(job, true);
    byte[] realIndexes = new byte[dataGenInstructions.length];
    for (byte b = 0; b < realIndexes.length; b++) realIndexes[b] = b;
    String[] inputs = new String[dataGenInstructions.length];
    InputInfo[] inputInfos = new InputInfo[dataGenInstructions.length];
    long[] rlens = new long[dataGenInstructions.length];
    long[] clens = new long[dataGenInstructions.length];
    int[] brlens = new int[dataGenInstructions.length];
    int[] bclens = new int[dataGenInstructions.length];
    FileSystem fs = FileSystem.get(job);
    String dataGenInsStr = "";
    int numblocks = 0;
    int maxbrlen = -1, maxbclen = -1;
    double maxsparsity = -1;
    for (int i = 0; i < dataGenInstructions.length; i++) {
        dataGenInsStr = dataGenInsStr + Lop.INSTRUCTION_DELIMITOR + dataGenInstructions[i];
        MRInstruction mrins = MRInstructionParser.parseSingleInstruction(dataGenInstructions[i]);
        MRType mrtype = mrins.getMRInstructionType();
        DataGenMRInstruction genInst = (DataGenMRInstruction) mrins;
        rlens[i] = genInst.getRows();
        clens[i] = genInst.getCols();
        brlens[i] = genInst.getRowsInBlock();
        bclens[i] = genInst.getColsInBlock();
        maxbrlen = Math.max(maxbrlen, brlens[i]);
        maxbclen = Math.max(maxbclen, bclens[i]);
        if (mrtype == MRType.Rand) {
            RandInstruction randInst = (RandInstruction) mrins;
            inputs[i] = LibMatrixDatagen.generateUniqueSeedPath(genInst.getBaseDir());
            maxsparsity = Math.max(maxsparsity, randInst.getSparsity());
            PrintWriter pw = null;
            try {
                pw = new PrintWriter(fs.create(new Path(inputs[i])));
                // for obj reuse and preventing repeated buffer re-allocations
                StringBuilder sb = new StringBuilder();
                // seed generation
                Well1024a bigrand = LibMatrixDatagen.setupSeedsForRand(randInst.getSeed());
                for (long r = 0; r < Math.max(rlens[i], 1); r += brlens[i]) {
                    long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r));
                    for (long c = 0; c < Math.max(clens[i], 1); c += bclens[i]) {
                        long curBlockColSize = Math.min(bclens[i], (clens[i] - c));
                        sb.append((r / brlens[i]) + 1);
                        sb.append(',');
                        sb.append((c / bclens[i]) + 1);
                        sb.append(',');
                        sb.append(curBlockRowSize);
                        sb.append(',');
                        sb.append(curBlockColSize);
                        sb.append(',');
                        sb.append(bigrand.nextLong());
                        pw.println(sb.toString());
                        sb.setLength(0);
                        numblocks++;
                    }
                }
            } finally {
                IOUtilFunctions.closeSilently(pw);
            }
            inputInfos[i] = InputInfo.TextCellInputInfo;
        } else if (mrtype == MRType.Seq) {
            SeqInstruction seqInst = (SeqInstruction) mrins;
            inputs[i] = genInst.getBaseDir() + System.currentTimeMillis() + ".seqinput";
            // always dense
            maxsparsity = 1.0;
            double from = seqInst.fromValue;
            double to = seqInst.toValue;
            double incr = seqInst.incrValue;
            // handle default 1 to -1 for special case of from>to
            incr = LibMatrixDatagen.updateSeqIncr(from, to, incr);
            // Correctness checks on (from, to, incr)
            boolean neg = (from > to);
            if (incr == 0)
                throw new DMLRuntimeException("Invalid value for \"increment\" in seq().");
            if (neg != (incr < 0))
                throw new DMLRuntimeException("Wrong sign for the increment in a call to seq()");
            // Compute the number of rows in the sequence
            long numrows = UtilFunctions.getSeqLength(from, to, incr);
            if (rlens[i] > 0) {
                if (numrows != rlens[i])
                    throw new DMLRuntimeException("Unexpected error while processing sequence instruction. Expected number of rows does not match given number: " + rlens[i] + " != " + numrows);
            } else {
                rlens[i] = numrows;
            }
            if (clens[i] > 0 && clens[i] != 1)
                throw new DMLRuntimeException("Unexpected error while processing sequence instruction. Number of columns (" + clens[i] + ") must be equal to 1.");
            else
                clens[i] = 1;
            PrintWriter pw = null;
            try {
                pw = new PrintWriter(fs.create(new Path(inputs[i])));
                StringBuilder sb = new StringBuilder();
                double temp = from;
                double block_from, block_to;
                for (long r = 0; r < rlens[i]; r += brlens[i]) {
                    long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r));
                    // block (bid_i,bid_j) generates a sequence from the interval [block_from, block_to] (inclusive of both end points of the interval)
                    long bid_i = ((r / brlens[i]) + 1);
                    long bid_j = 1;
                    block_from = temp;
                    block_to = temp + (curBlockRowSize - 1) * incr;
                    // next block starts from here
                    temp = block_to + incr;
                    sb.append(bid_i);
                    sb.append(',');
                    sb.append(bid_j);
                    sb.append(',');
                    sb.append(block_from);
                    sb.append(',');
                    sb.append(block_to);
                    sb.append(',');
                    sb.append(incr);
                    pw.println(sb.toString());
                    sb.setLength(0);
                    numblocks++;
                }
            } finally {
                IOUtilFunctions.closeSilently(pw);
            }
            inputInfos[i] = InputInfo.TextCellInputInfo;
        } else {
            throw new DMLRuntimeException("Unexpected Data Generation Instruction Type: " + mrtype);
        }
    }
    // remove the first ","
    dataGenInsStr = dataGenInsStr.substring(1);
    RunningJob runjob;
    MatrixCharacteristics[] stats;
    try {
        // set up the block size
        MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);
        // set up the input files and their format information
        MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, false, ConvertTarget.BLOCK);
        // set up the dimensions of input matrices
        MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);
        MRJobConfiguration.setDimsUnknownFilePrefix(job, dimsUnknownFilePrefix);
        // set up the block size
        MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);
        // set up the rand Instructions
        MRJobConfiguration.setRandInstructions(job, dataGenInsStr);
        // set up unary instructions that will perform in the mapper
        MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper);
        // set up the aggregate instructions that will happen in the combiner and reducer
        MRJobConfiguration.setAggregateInstructions(job, aggInstructionsInReducer);
        // set up the instructions that will happen in the reducer, after the aggregation instrucions
        MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);
        // set up the replication factor for the results
        job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
        // set up map/reduce memory configurations (if in AM context)
        DMLConfig config = ConfigurationManager.getDMLConfig();
        DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);
        // set up custom map/reduce configurations
        MRJobConfiguration.setupCustomMRConfigurations(job, config);
        // determine degree of parallelism (nmappers: 1<=n<=capacity)
        // TODO use maxsparsity whenever we have a way of generating sparse rand data
        int capacity = InfrastructureAnalyzer.getRemoteParallelMapTasks();
        long dfsblocksize = InfrastructureAnalyzer.getHDFSBlockSize();
        // correction max number of mappers on yarn clusters
        if (InfrastructureAnalyzer.isYarnEnabled())
            capacity = (int) Math.max(capacity, YarnClusterAnalyzer.getNumCores());
        int nmapers = Math.max(Math.min((int) (8 * maxbrlen * maxbclen * (long) numblocks / dfsblocksize), capacity), 1);
        job.setNumMapTasks(nmapers);
        // set up what matrices are needed to pass from the mapper to reducer
        HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, dataGenInsStr, instructionsInMapper, null, aggInstructionsInReducer, otherInstructionsInReducer, resultIndexes);
        MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, dataGenInsStr, instructionsInMapper, null, aggInstructionsInReducer, null, otherInstructionsInReducer, resultIndexes, mapoutputIndexes, false);
        stats = ret.stats;
        // set up the number of reducers
        MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers);
        // print the complete MRJob instruction
        if (LOG.isTraceEnabled())
            inst.printCompleteMRJobInstruction(stats);
        // Update resultDimsUnknown based on computed "stats"
        byte[] resultDimsUnknown = new byte[resultIndexes.length];
        for (int i = 0; i < resultIndexes.length; i++) {
            if (stats[i].getRows() == -1 || stats[i].getCols() == -1) {
                resultDimsUnknown[i] = (byte) 1;
            } else {
                resultDimsUnknown[i] = (byte) 0;
            }
        }
        boolean mayContainCtable = instructionsInMapper.contains("ctabletransform") || instructionsInMapper.contains("groupedagg");
        // set up the multiple output files, and their format information
        MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos, true, mayContainCtable);
        // configure mapper and the mapper output key value pairs
        job.setMapperClass(DataGenMapper.class);
        if (numReducers == 0) {
            job.setMapOutputKeyClass(Writable.class);
            job.setMapOutputValueClass(Writable.class);
        } else {
            job.setMapOutputKeyClass(MatrixIndexes.class);
            job.setMapOutputValueClass(TaggedMatrixBlock.class);
        }
        // set up combiner
        if (numReducers != 0 && aggInstructionsInReducer != null && !aggInstructionsInReducer.isEmpty())
            job.setCombinerClass(GMRCombiner.class);
        // configure reducer
        job.setReducerClass(GMRReducer.class);
        // job.setReducerClass(PassThroughReducer.class);
        // By default, the job executes in "cluster" mode.
        // Determine if we can optimize and run it in "local" mode.
        MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
        for (int i = 0; i < inputs.length; i++) {
            inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
        }
        // set unique working dir
        MRJobConfiguration.setUniqueWorkingDir(job);
        runjob = JobClient.runJob(job);
        /* Process different counters */
        Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
        for (int i = 0; i < resultIndexes.length; i++) {
            // number of non-zeros
            stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
        }
        String dir = dimsUnknownFilePrefix + "/" + runjob.getID().toString() + "_dimsFile";
        stats = MapReduceTool.processDimsFiles(dir, stats);
        MapReduceTool.deleteFileIfExistOnHDFS(dir);
    } finally {
        for (String input : inputs) MapReduceTool.deleteFileIfExistOnHDFS(new Path(input), job);
    }
    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}
Also used : Group(org.apache.hadoop.mapred.Counters.Group) DataGenMRInstruction(org.apache.sysml.runtime.instructions.mr.DataGenMRInstruction) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) GMRCombiner(org.apache.sysml.runtime.matrix.mapred.GMRCombiner) FileSystem(org.apache.hadoop.fs.FileSystem) DataGenMRInstruction(org.apache.sysml.runtime.instructions.mr.DataGenMRInstruction) MRInstruction(org.apache.sysml.runtime.instructions.mr.MRInstruction) JobConf(org.apache.hadoop.mapred.JobConf) PrintWriter(java.io.PrintWriter) Path(org.apache.hadoop.fs.Path) DMLConfig(org.apache.sysml.conf.DMLConfig) SeqInstruction(org.apache.sysml.runtime.instructions.mr.SeqInstruction) RandInstruction(org.apache.sysml.runtime.instructions.mr.RandInstruction) MRType(org.apache.sysml.runtime.instructions.mr.MRInstruction.MRType) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) MatrixChar_N_ReducerGroups(org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration.MatrixChar_N_ReducerGroups) RunningJob(org.apache.hadoop.mapred.RunningJob) Well1024a(org.apache.commons.math3.random.Well1024a)

Example 54 with InputInfo

use of org.apache.sysml.runtime.matrix.data.InputInfo in project systemml by apache.

the class ResultMergeLocalMemory method createNewMatrixObject.

private MatrixObject createNewMatrixObject(MatrixBlock data) {
    ValueType vt = _output.getValueType();
    MetaDataFormat metadata = (MetaDataFormat) _output.getMetaData();
    MatrixObject moNew = new MatrixObject(vt, _outputFName);
    // create deep copy of metadata obj
    MatrixCharacteristics mcOld = metadata.getMatrixCharacteristics();
    OutputInfo oiOld = metadata.getOutputInfo();
    InputInfo iiOld = metadata.getInputInfo();
    MatrixCharacteristics mc = new MatrixCharacteristics(mcOld.getRows(), mcOld.getCols(), mcOld.getRowsPerBlock(), mcOld.getColsPerBlock());
    mc.setNonZeros(data.getNonZeros());
    MetaDataFormat meta = new MetaDataFormat(mc, oiOld, iiOld);
    moNew.setMetaData(meta);
    // adjust dense/sparse representation
    data.examSparsity();
    // release new output
    moNew.acquireModify(data);
    moNew.release();
    return moNew;
}
Also used : OutputInfo(org.apache.sysml.runtime.matrix.data.OutputInfo) MetaDataFormat(org.apache.sysml.runtime.matrix.MetaDataFormat) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) ValueType(org.apache.sysml.parser.Expression.ValueType) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics)

Example 55 with InputInfo

use of org.apache.sysml.runtime.matrix.data.InputInfo in project systemml by apache.

the class ResultMergeRemoteMR method executeParallelMerge.

@Override
public MatrixObject executeParallelMerge(int par) {
    // always create new matrix object (required for nested parallelism)
    MatrixObject moNew = null;
    if (LOG.isTraceEnabled())
        LOG.trace("ResultMerge (remote, mr): Execute serial merge for output " + _output.hashCode() + " (fname=" + _output.getFileName() + ")");
    try {
        // collect all relevant inputs
        Collection<String> srcFnames = new LinkedList<>();
        ArrayList<MatrixObject> inMO = new ArrayList<>();
        for (MatrixObject in : _inputs) {
            // check for empty inputs (no iterations executed)
            if (in != null && in != _output) {
                // ensure that input file resides on disk
                in.exportData();
                // add to merge list
                srcFnames.add(in.getFileName());
                inMO.add(in);
            }
        }
        if (!srcFnames.isEmpty()) {
            // ensure that outputfile (for comparison) resides on disk
            _output.exportData();
            // actual merge
            MetaDataFormat metadata = (MetaDataFormat) _output.getMetaData();
            MatrixCharacteristics mcOld = metadata.getMatrixCharacteristics();
            String fnameCompare = _output.getFileName();
            if (mcOld.getNonZeros() == 0)
                // no compare required
                fnameCompare = null;
            executeMerge(fnameCompare, _outputFName, srcFnames.toArray(new String[0]), metadata.getInputInfo(), metadata.getOutputInfo(), mcOld.getRows(), mcOld.getCols(), mcOld.getRowsPerBlock(), mcOld.getColsPerBlock());
            // create new output matrix (e.g., to prevent potential export<->read file access conflict
            moNew = new MatrixObject(_output.getValueType(), _outputFName);
            OutputInfo oiOld = metadata.getOutputInfo();
            InputInfo iiOld = metadata.getInputInfo();
            MatrixCharacteristics mc = new MatrixCharacteristics(mcOld);
            mc.setNonZeros(_isAccum ? -1 : computeNonZeros(_output, inMO));
            MetaDataFormat meta = new MetaDataFormat(mc, oiOld, iiOld);
            moNew.setMetaData(meta);
        } else {
            // return old matrix, to prevent copy
            moNew = _output;
        }
    } catch (Exception ex) {
        throw new DMLRuntimeException(ex);
    }
    return moNew;
}
Also used : OutputInfo(org.apache.sysml.runtime.matrix.data.OutputInfo) MetaDataFormat(org.apache.sysml.runtime.matrix.MetaDataFormat) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) ArrayList(java.util.ArrayList) LinkedList(java.util.LinkedList) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)

Aggregations

InputInfo (org.apache.sysml.runtime.matrix.data.InputInfo)74 DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)38 OutputInfo (org.apache.sysml.runtime.matrix.data.OutputInfo)30 MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics)26 MetaDataFormat (org.apache.sysml.runtime.matrix.MetaDataFormat)20 MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)20 IOException (java.io.IOException)17 JobConf (org.apache.hadoop.mapred.JobConf)13 RDDObject (org.apache.sysml.runtime.instructions.spark.data.RDDObject)13 MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject)12 JavaPairRDD (org.apache.spark.api.java.JavaPairRDD)11 RunningJob (org.apache.hadoop.mapred.RunningJob)10 Path (org.apache.hadoop.fs.Path)9 FrameBlock (org.apache.sysml.runtime.matrix.data.FrameBlock)9 MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes)9 DMLConfig (org.apache.sysml.conf.DMLConfig)8 ValueType (org.apache.sysml.parser.Expression.ValueType)8 SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext)7 CSVFileFormatProperties (org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties)7 Group (org.apache.hadoop.mapred.Counters.Group)6