
Example 1 with CSVWriteInstruction

Use of org.apache.sysml.runtime.instructions.mr.CSVWriteInstruction in project incubator-systemml by apache.

In class CSVWriteMapper, method configure:

public void configure(JobConf job) {
    super.configure(job);
    try {
        CSVWriteInstruction[] ins = MRJobConfiguration.getCSVWriteInstructions(job);
        // group output tags by input matrix index: one input may feed several CSV outputs
        for (CSVWriteInstruction in : ins) {
            ArrayList<Byte> outputs = inputOutputMap.get(in.input);
            if (outputs == null) {
                outputs = new ArrayList<Byte>();
                inputOutputMap.put(in.input, outputs);
            }
            outputs.add(in.output);
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
Also used: CSVWriteInstruction (org.apache.sysml.runtime.instructions.mr.CSVWriteInstruction), IOException (java.io.IOException)
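
The configure method above just builds an input-to-outputs multimap: each input matrix index maps to the list of CSV output tags it feeds. A minimal standalone sketch of the same grouping idiom, with hypothetical tag values and plain byte pairs standing in for CSVWriteInstruction:

import java.util.ArrayList;
import java.util.HashMap;

public class GroupingDemo {
    public static void main(String[] args) {
        // hypothetical {input tag, output tag} pairs
        byte[][] ins = { { 0, 1 }, { 0, 2 }, { 3, 4 } };
        HashMap<Byte, ArrayList<Byte>> inputOutputMap = new HashMap<Byte, ArrayList<Byte>>();
        for (byte[] in : ins) {
            ArrayList<Byte> outputs = inputOutputMap.get(in[0]);
            if (outputs == null) {
                outputs = new ArrayList<Byte>();
                inputOutputMap.put(in[0], outputs);
            }
            outputs.add(in[1]);
        }
        System.out.println(inputOutputMap); // prints {0=[1, 2], 3=[4]}
    }
}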

Example 2 with CSVWriteInstruction

Use of org.apache.sysml.runtime.instructions.mr.CSVWriteInstruction in project incubator-systemml by apache.

In class CSVWriteReducer, method configure:

@Override
public void configure(JobConf job) {
    super.configure(job);
    // map each output tag to its write instruction and track the largest tag
    byte maxIndex = 0;
    HashMap<Byte, CSVWriteInstruction> out2Ins = new HashMap<Byte, CSVWriteInstruction>();
    try {
        CSVWriteInstruction[] ins = MRJobConfiguration.getCSVWriteInstructions(job);
        for (CSVWriteInstruction in : ins) {
            out2Ins.put(in.output, in);
            if (in.output > maxIndex)
                maxIndex = in.output;
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
    int numPartitions = job.getNumReduceTasks();
    int taskID = MapReduceTool.getUniqueTaskId(job);
    // arrays below are indexed by output tag; for efficiency only, they may contain unused slots
    rowIndexes = new long[maxIndex + 1];
    colIndexes = new long[maxIndex + 1];
    maxRowIndexes = new long[maxIndex + 1];
    minRowIndexes = new long[maxIndex + 1];
    numColBlocks = new long[maxIndex + 1];
    lastBlockNCols = new int[maxIndex + 1];
    colsPerBlock = new int[maxIndex + 1];
    delims = new String[maxIndex + 1];
    sparses = new boolean[maxIndex + 1];
    tagToResultIndex = new int[maxIndex + 1];
    for (int i = 0; i < resultIndexes.length; i++) {
        byte ri = resultIndexes[i];
        tagToResultIndex[ri] = i;
        CSVWriteInstruction in = out2Ins.get(ri);
        MatrixCharacteristics dim = MRJobConfiguration.getMatrixCharacteristicsForInput(job, in.input);
        delims[ri] = in.delim;
        sparses[ri] = in.sparse;
        // column blocks per row = ceil(cols / colsPerBlock); the remainder gives the last block's width
        numColBlocks[ri] = (long) Math.ceil((double) dim.getCols() / (double) dim.getColsPerBlock());
        lastBlockNCols[ri] = (int) (dim.getCols() % dim.getColsPerBlock());
        colsPerBlock[ri] = dim.getColsPerBlock();
        // split rows evenly across reducers: this task covers rows [rstep*taskID, min(rstep*(taskID+1), rows))
        long rstep = (long) Math.ceil((double) dim.getRows() / (double) numPartitions);
        minRowIndexes[ri] = rowIndexes[ri] = rstep * taskID;
        maxRowIndexes[ri] = Math.min(rstep * (taskID + 1), dim.getRows());
        colIndexes[ri] = 0;
    }
    zeroBlock.setData(new MatrixBlock());
}
Also used: MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock), HashMap (java.util.HashMap), CSVWriteInstruction (org.apache.sysml.runtime.instructions.mr.CSVWriteInstruction), IOException (java.io.IOException), MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics)
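
The row-range arithmetic in configure determines which slice of each output matrix a given reduce task writes. A standalone sketch of that computation, using hypothetical row and partition counts (no Hadoop classes needed):

public class RowRangeDemo {
    public static void main(String[] args) {
        long rows = 10;        // hypothetical matrix row count
        int numPartitions = 3; // hypothetical number of reduce tasks
        long rstep = (long) Math.ceil((double) rows / (double) numPartitions);
        for (int taskID = 0; taskID < numPartitions; taskID++) {
            long min = rstep * taskID;                       // first row this reducer owns
            long max = Math.min(rstep * (taskID + 1), rows); // exclusive upper bound, clamped
            System.out.println("reducer " + taskID + ": rows [" + min + ", " + max + ")");
        }
    }
}

With rows = 10 and three reducers, rstep is 4 and the ranges come out as [0, 4), [4, 8), and [8, 10): the last task simply gets the remainder.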

Example 3 with CSVWriteInstruction

Use of org.apache.sysml.runtime.instructions.mr.CSVWriteInstruction in project incubator-systemml by apache.

In class WriteCSVMR, method runJob:

public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens, long[] clens, int[] brlens, int[] bclens, String csvWriteInstructions, int numReducers, int replication, byte[] resultIndexes, String[] outputs) throws Exception {
    JobConf job = new JobConf(WriteCSVMR.class);
    job.setJobName("WriteCSV-MR");
    byte[] realIndexes = new byte[inputs.length];
    for (byte b = 0; b < realIndexes.length; b++) realIndexes[b] = b;
    //set up the input files and their format information
    MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, true, ConvertTarget.CSVWRITE);
    //set up the dimensions of input matrices
    MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);
    //set up the block size
    MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);
    MRJobConfiguration.setCSVWriteInstructions(job, csvWriteInstructions);
    //set up the replication factor for the results
    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
    //set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);
    //set up custom map/reduce configurations 
    DMLConfig config = ConfigurationManager.getDMLConfig();
    MRJobConfiguration.setupCustomMRConfigurations(job, config);
    long maxRlen = 0;
    for (long rlen : rlens)
        if (rlen > maxRlen)
            maxRlen = rlen;
    //set up the number of reducers (according to output size)
    int numRed = determineNumReducers(rlens, clens, config.getIntValue(DMLConfig.NUM_REDUCERS), (int) maxRlen);
    job.setNumReduceTasks(numRed);
    byte[] resultDimsUnknown = new byte[resultIndexes.length];
    MatrixCharacteristics[] stats = new MatrixCharacteristics[resultIndexes.length];
    OutputInfo[] outputInfos = new OutputInfo[outputs.length];
    HashMap<Byte, Integer> indexmap = new HashMap<Byte, Integer>();
    for (int i = 0; i < stats.length; i++) {
        indexmap.put(resultIndexes[i], i);
        resultDimsUnknown[i] = (byte) 0;
        stats[i] = new MatrixCharacteristics();
        outputInfos[i] = OutputInfo.CSVOutputInfo;
    }
    CSVWriteInstruction[] ins = MRInstructionParser.parseCSVWriteInstructions(csvWriteInstructions);
    for (CSVWriteInstruction in : ins) stats[indexmap.get(in.output)].set(rlens[in.input], clens[in.input], -1, -1);
    // Print the complete instruction
    if (LOG.isTraceEnabled())
        inst.printCompleteMRJobInstruction(stats);
    //set up what matrices are needed to pass from the mapper to reducer
    MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, "", "", csvWriteInstructions, resultIndexes);
    //set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos, true, true);
    // configure mapper and the mapper output key value pairs
    job.setMapperClass(CSVWriteMapper.class);
    job.setMapOutputKeyClass(TaggedFirstSecondIndexes.class);
    job.setMapOutputValueClass(MatrixBlock.class);
    //configure reducer: the range partitioner routes each contiguous row range to a single
    //reducer, matching the per-task row ranges computed in CSVWriteReducer.configure
    job.setReducerClass(CSVWriteReducer.class);
    job.setOutputKeyComparatorClass(TaggedFirstSecondIndexes.Comparator.class);
    job.setPartitionerClass(TaggedFirstSecondIndexes.FirstIndexRangePartitioner.class);
    //job.setOutputFormat(UnPaddedOutputFormat.class);
    MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
    for (int i = 0; i < inputs.length; i++) {
        inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
    }
    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);
    RunningJob runjob = JobClient.runJob(job);
    /* Process different counters */
    Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
    for (int i = 0; i < resultIndexes.length; i++) {
        // number of non-zeros
        stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
    }
    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}
Also used: Group (org.apache.hadoop.mapred.Counters.Group), DMLConfig (org.apache.sysml.conf.DMLConfig), HashMap (java.util.HashMap), TaggedFirstSecondIndexes (org.apache.sysml.runtime.matrix.data.TaggedFirstSecondIndexes), OutputInfo (org.apache.sysml.runtime.matrix.data.OutputInfo), RunningJob (org.apache.hadoop.mapred.RunningJob), JobConf (org.apache.hadoop.mapred.JobConf), CSVWriteInstruction (org.apache.sysml.runtime.instructions.mr.CSVWriteInstruction)
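
Before the job runs, each result's dimensions are copied from the input that its write instruction reads, via the indexmap lookup in runJob. A minimal sketch of that wiring with hypothetical tags, where plain arrays stand in for CSVWriteInstruction and MatrixCharacteristics:

import java.util.HashMap;

public class StatsWiringDemo {
    public static void main(String[] args) {
        long[] rlens = { 100, 200 };           // hypothetical input row counts
        long[] clens = { 10, 20 };             // hypothetical input column counts
        byte[] resultIndexes = { 2, 3 };       // output tags, in result order
        byte[][] ins = { { 0, 2 }, { 1, 3 } }; // {input tag, output tag} pairs

        HashMap<Byte, Integer> indexmap = new HashMap<Byte, Integer>();
        for (int i = 0; i < resultIndexes.length; i++)
            indexmap.put(resultIndexes[i], i);

        // each output inherits the dimensions of the input it writes
        long[][] stats = new long[resultIndexes.length][2];
        for (byte[] in : ins) {
            int pos = indexmap.get(in[1]);
            stats[pos][0] = rlens[in[0]];
            stats[pos][1] = clens[in[0]];
        }
        for (int i = 0; i < stats.length; i++)
            System.out.println("output " + resultIndexes[i] + ": " + stats[i][0] + " x " + stats[i][1]);
    }
}

This prints "output 2: 100 x 10" and "output 3: 200 x 20", mirroring how stats[indexmap.get(in.output)] is filled from rlens[in.input] and clens[in.input] above.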

Aggregations

CSVWriteInstruction (org.apache.sysml.runtime.instructions.mr.CSVWriteInstruction): 3 uses
IOException (java.io.IOException): 2 uses
HashMap (java.util.HashMap): 2 uses
Group (org.apache.hadoop.mapred.Counters.Group): 1 use
JobConf (org.apache.hadoop.mapred.JobConf): 1 use
RunningJob (org.apache.hadoop.mapred.RunningJob): 1 use
DMLConfig (org.apache.sysml.conf.DMLConfig): 1 use
MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics): 1 use
MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock): 1 use
OutputInfo (org.apache.sysml.runtime.matrix.data.OutputInfo): 1 use
TaggedFirstSecondIndexes (org.apache.sysml.runtime.matrix.data.TaggedFirstSecondIndexes): 1 use