Search in sources :

Example 31 with Group

use of org.apache.hadoop.mapred.Counters.Group in project systemml by apache.

the class SortMR method runJob.

/**
 * Configures and runs the "SortMR" MapReduce job and post-processes its per-reducer counters.
 *
 * <p>When the sort instruction is of type {@code SortKeys.OperationTypes.Indexes}, the job writes
 * to a temporary output which is then passed to {@code runStitchupJob} to produce {@code output};
 * otherwise the job writes to {@code output} directly. After the job ran, one counter per reducer
 * is read from the {@code NUM_VALUES_PREFIX} group. If the counters sum up to less than
 * {@code rlen * clen}, the difference is added to the count of the partition index returned by
 * {@code SamplingSortMRInputFormat.writePartitionFile}.
 *
 * @param inst          job instruction, only used for trace logging
 * @param input         input path
 * @param inputInfo     input format information
 * @param rlen          number of rows
 * @param clen          number of columns
 * @param brlen         rows per block
 * @param bclen         columns per block
 * @param combineInst   combine instruction; ignored when null or blank
 * @param sortInst      sort instruction
 * @param numReducers   requested number of reducers (overridden with 1 in local mode)
 * @param replication   value set for {@code MRConfigurationNames.DFS_REPLICATION}
 * @param output        final output path
 * @param outputInfo    supplies the output key/value classes
 * @param valueIsWeight stored in the job configuration under {@code VALUE_IS_WEIGHT}
 * @return the job result; for index sorts it carries the stitch-up success flag, otherwise the
 *         per-reducer counts, the zero partition index and the number of re-added zeros
 * @throws Exception if job setup, execution or cleanup fails
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
public static JobReturn runJob(MRJobInstruction inst, String input, InputInfo inputInfo, long rlen, long clen, int brlen, int bclen, String combineInst, String sortInst, int numReducers, int replication, String output, OutputInfo outputInfo, boolean valueIsWeight) throws Exception {
    final boolean sortIndexes = SortKeys.OperationTypes.Indexes == getSortInstructionType(sortInst);
    // index sorts need a second (stitch-up) pass, so the first pass goes to a temporary location
    final String tmpOutput;
    if (sortIndexes) {
        tmpOutput = MRJobConfiguration.constructTempOutputFilename();
    } else {
        tmpOutput = output;
    }
    JobConf job = new JobConf(SortMR.class);
    job.setJobName("SortMR");

    // partition file
    String pfname = MRJobConfiguration.setUpSortPartitionFilename(job);
    Path partitionFile = new Path(pfname);
    URI partitionUri = new URI(partitionFile.toString());

    // input/output paths
    Path rawInput = new Path(input);
    Path qualifiedInput = rawInput.makeQualified(rawInput.getFileSystem(job));
    FileInputFormat.setInputPaths(job, qualifiedInput);
    Path outpath = new Path(tmpOutput);
    FileOutputFormat.setOutputPath(job, outpath);
    MapReduceTool.deleteFileIfExistOnHDFS(outpath, job);

    // number of reducers
    if (InfrastructureAnalyzer.isLocalMode(job)) {
        // local mode always uses a single reducer
        job.setNumReduceTasks(1);
    } else {
        MRJobConfiguration.setNumReducers(job, numReducers, numReducers);
        // on cp-side qpick instructions for quantile/iqm/median (~128MB)
        if (getSortInstructionType(sortInst) != SortKeys.OperationTypes.Indexes) {
            job.setNumReduceTasks((int) Math.max(job.getNumReduceTasks(), rlen / 10000000));
        }
    }

    // input format
    job.setInputFormat(SamplingSortMRInputFormat.class);
    SamplingSortMRInputFormat.setTargetKeyValueClasses(job, (Class<? extends WritableComparable>) outputInfo.outputKeyClass, outputInfo.outputValueClass);

    // instructions and meta information
    if (combineInst != null && !combineInst.trim().isEmpty()) {
        job.set(COMBINE_INSTRUCTION, combineInst);
    }
    job.set(SORT_INSTRUCTION, sortInst);
    job.setBoolean(VALUE_IS_WEIGHT, valueIsWeight);
    final boolean descending = getSortInstructionDescending(sortInst);
    job.setBoolean(SORT_DECREASING, descending);
    MRJobConfiguration.setBlockSize(job, (byte) 0, brlen, bclen);
    MRJobConfiguration.setInputInfo(job, (byte) 0, inputInfo, brlen, bclen, ConvertTarget.CELL);
    int partitionWith0 = SamplingSortMRInputFormat.writePartitionFile(job, partitionFile);

    // mapper/reducer/output classes
    if (getSortInstructionType(sortInst) == SortKeys.OperationTypes.Indexes) {
        MRJobConfiguration.setInputInfo(job, (byte) 0, inputInfo, brlen, bclen, ConvertTarget.CELL);
        job.setOutputFormat(OutputInfo.BinaryBlockOutputInfo.outputFormatClass);
        job.setMapperClass(IndexSortMapper.class);
        job.setReducerClass(IndexSortReducer.class);
        job.setMapOutputKeyClass(descending ? IndexSortComparableDesc.class : IndexSortComparable.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setOutputKeyClass(MatrixIndexes.class);
        job.setOutputValueClass(MatrixBlock.class);
    } else {
        // default case: SORT w/wo weights; key/value classes come from the caller's output info
        MRJobConfiguration.setInputInfo(job, (byte) 0, inputInfo, brlen, bclen, ConvertTarget.CELL);
        job.setOutputFormat(CompactOutputFormat.class);
        job.setMapperClass(ValueSortMapper.class);
        job.setReducerClass(ValueSortReducer.class);
        job.setOutputKeyClass(outputInfo.outputKeyClass);
        job.setOutputValueClass(outputInfo.outputValueClass);
    }
    job.setPartitionerClass(TotalOrderPartitioner.class);

    // ship the partition file via the distributed cache
    DistributedCache.addCacheFile(partitionUri, job);
    DistributedCache.createSymlink(job);

    // replication factor
    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);

    // custom map/reduce configurations
    DMLConfig config = ConfigurationManager.getDMLConfig();
    MRJobConfiguration.setupCustomMRConfigurations(job, config);

    MatrixCharacteristics[] stats = { new MatrixCharacteristics(rlen, clen, brlen, bclen) };

    // print the complete instruction
    if (LOG.isTraceEnabled()) {
        inst.printCompleteMRJobInstruction(stats);
    }

    // unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);

    // run the job and fetch the per-reducer value counters
    RunningJob runjob = JobClient.runJob(job);
    Group valueCounters = runjob.getCounters().getGroup(NUM_VALUES_PREFIX);
    numReducers = job.getNumReduceTasks();

    // one counter per reducer, named by the reducer index
    long[] counts = new long[numReducers];
    long total = 0;
    for (int r = 0; r < numReducers; r++) {
        long reducerCount = valueCounters.getCounter(Integer.toString(r));
        counts[r] = reducerCount;
        total += reducerCount;
    }

    // add missing 0s back to the results
    final long numCells = rlen * clen;
    long missing0s = 0;
    if (total < numCells) {
        if (partitionWith0 < 0) {
            throw new RuntimeException("no partition contains 0, which is wrong!");
        }
        missing0s = numCells - total;
        counts[partitionWith0] += missing0s;
    } else {
        partitionWith0 = -1;
    }

    if (!sortIndexes) {
        MapReduceTool.deleteFileIfExistOnHDFS(pfname);
        return new JobReturn(stats[0], counts, partitionWith0, missing0s, runjob.isSuccessful());
    }

    // run builtin job for shifting partially sorted blocks according to global offsets
    // we do this in this custom form since it would not fit into the current structure
    // of systemml to output two intermediates (partially sorted data, offsets) out of a
    // single SortKeys lop
    boolean success = runjob.isSuccessful()
        && runStitchupJob(tmpOutput, rlen, clen, brlen, bclen, counts, numReducers, replication, output);
    MapReduceTool.deleteFileIfExistOnHDFS(tmpOutput);
    MapReduceTool.deleteFileIfExistOnHDFS(pfname);
    return new JobReturn(stats[0], OutputInfo.BinaryBlockOutputInfo, success);
}
Also used : Path(org.apache.hadoop.fs.Path) Group(org.apache.hadoop.mapred.Counters.Group) DMLConfig(org.apache.sysml.conf.DMLConfig) IndexSortComparableDesc(org.apache.sysml.runtime.matrix.sort.IndexSortComparableDesc) URI(java.net.URI) RunningJob(org.apache.hadoop.mapred.RunningJob) IndexSortComparable(org.apache.sysml.runtime.matrix.sort.IndexSortComparable) JobConf(org.apache.hadoop.mapred.JobConf)

Example 32 with Group

use of org.apache.hadoop.mapred.Counters.Group in project systemml by apache.

the class WriteCSVMR method runJob.

/**
 * Configures and runs the "WriteCSV-MR" MapReduce job and collects statistics for each result.
 *
 * <p>Every result is declared with {@code OutputInfo.CSVOutputInfo}. Its dimensions are copied
 * from the input referenced by the matching CSV write instruction, and its number of non-zeros is
 * read from the {@code MRJobConfiguration.NUM_NONZERO_CELLS} counter group once the job finished.
 *
 * @param inst                 job instruction, only used for trace logging
 * @param inputs               input paths
 * @param inputInfos           input format information, one per input
 * @param rlens                number of rows, one per input
 * @param clens                number of columns, one per input
 * @param brlens               rows per block, one per input
 * @param bclens               columns per block, one per input
 * @param csvWriteInstructions CSV write instructions in their string form
 * @param numReducers          not read by this method; the reducer count is derived via
 *                             {@code determineNumReducers}
 * @param replication          value set for {@code MRConfigurationNames.DFS_REPLICATION}
 * @param resultIndexes        indexes of the results
 * @param outputs              output paths
 * @return the per-result statistics, the output infos and the job's success flag
 * @throws IOException if any input has zero rows or zero columns
 * @throws Exception   if job setup or execution fails
 */
public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens, long[] clens, int[] brlens, int[] bclens, String csvWriteInstructions, int numReducers, int replication, byte[] resultIndexes, String[] outputs) throws Exception {
    JobConf job = new JobConf(WriteCSVMR.class);
    job.setJobName("WriteCSV-MR");
    // check for valid output dimensions
    for (int i = 0; i < rlens.length; i++) {
        if (rlens[i] == 0 || clens[i] == 0) {
            throw new IOException("Write of matrices with zero" + " rows or columns not supported (" + rlens[i] + "x" + clens[i] + ").");
        }
    }
    byte[] realIndexes = new byte[inputs.length];
    for (byte b = 0; b < realIndexes.length; b++) {
        realIndexes[b] = b;
    }
    // set up the input files and their format information
    MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, true, ConvertTarget.CSVWRITE);
    // set up the dimensions of input matrices
    MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);
    // set up the block size
    MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);
    MRJobConfiguration.setCSVWriteInstructions(job, csvWriteInstructions);
    // set up the replication factor for the results
    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
    // set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION) {
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);
    }
    // set up custom map/reduce configurations
    DMLConfig config = ConfigurationManager.getDMLConfig();
    MRJobConfiguration.setupCustomMRConfigurations(job, config);
    long maxRlen = 0;
    for (long rlen : rlens) {
        if (rlen > maxRlen) {
            maxRlen = rlen;
        }
    }
    // Clamp instead of a plain narrowing cast: row counts >= 2^31 would otherwise wrap around
    // to an arbitrary (possibly zero or negative) int.
    int maxRlenClamped = (int) Math.min(maxRlen, Integer.MAX_VALUE);
    // set up the number of reducers (according to output size)
    int numRed = determineNumReducers(rlens, clens, config.getIntValue(DMLConfig.NUM_REDUCERS), maxRlenClamped);
    job.setNumReduceTasks(numRed);
    byte[] resultDimsUnknown = new byte[resultIndexes.length];
    MatrixCharacteristics[] stats = new MatrixCharacteristics[resultIndexes.length];
    OutputInfo[] outputInfos = new OutputInfo[outputs.length];
    // maps a result index to its position in stats
    HashMap<Byte, Integer> indexmap = new HashMap<>();
    for (int i = 0; i < stats.length; i++) {
        indexmap.put(resultIndexes[i], i);
        resultDimsUnknown[i] = (byte) 0;
        stats[i] = new MatrixCharacteristics();
        outputInfos[i] = OutputInfo.CSVOutputInfo;
    }
    // each result takes the dimensions of the input it is written from; block sizes are set to -1
    CSVWriteInstruction[] ins = MRInstructionParser.parseCSVWriteInstructions(csvWriteInstructions);
    for (CSVWriteInstruction in : ins) {
        stats[indexmap.get(in.output)].set(rlens[in.input], clens[in.input], -1, -1);
    }
    // Print the complete instruction
    if (LOG.isTraceEnabled()) {
        inst.printCompleteMRJobInstruction(stats);
    }
    // set up what matrices are needed to pass from the mapper to reducer
    MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, "", "", csvWriteInstructions, resultIndexes);
    // set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos, true, true);
    // configure mapper and the mapper output key value pairs
    job.setMapperClass(CSVWriteMapper.class);
    job.setMapOutputKeyClass(TaggedFirstSecondIndexes.class);
    job.setMapOutputValueClass(MatrixBlock.class);
    // configure reducer
    job.setReducerClass(CSVWriteReducer.class);
    job.setOutputKeyComparatorClass(TaggedFirstSecondIndexes.Comparator.class);
    job.setPartitionerClass(TaggedFirstSecondIndexes.FirstIndexRangePartitioner.class);
    // set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);
    RunningJob runjob = JobClient.runJob(job);
    // read the number of non-zeros per result; counters are named by the result position
    Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
    for (int i = 0; i < resultIndexes.length; i++) {
        stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
    }
    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}
Also used : Group(org.apache.hadoop.mapred.Counters.Group) DMLConfig(org.apache.sysml.conf.DMLConfig) HashMap(java.util.HashMap) IOException(java.io.IOException) TaggedFirstSecondIndexes(org.apache.sysml.runtime.matrix.data.TaggedFirstSecondIndexes) OutputInfo(org.apache.sysml.runtime.matrix.data.OutputInfo) RunningJob(org.apache.hadoop.mapred.RunningJob) JobConf(org.apache.hadoop.mapred.JobConf) CSVWriteInstruction(org.apache.sysml.runtime.instructions.mr.CSVWriteInstruction)

Example 33 with Group

use of org.apache.hadoop.mapred.Counters.Group in project hive by apache.

the class HiveHistoryImpl method setTaskCounters.

/**
 * Records the given counters in the history entries of a task and of its query.
 *
 * <p>All counters are serialized as a comma-separated list of
 * {@code <group display name>.<counter display name>:<value>} and stored on the task under
 * {@code Keys.TASK_COUNTERS}. Counters for which {@code getRowCountTableName} returns a table
 * name are additionally serialized as {@code <table>~<value>} and stored under
 * {@code Keys.ROWS_INSERTED} on the task and, if a query entry exists, on the query.
 *
 * <p>Nothing is recorded when no task entry exists for {@code queryId:taskId} or when
 * {@code ctrs} is null. A missing query entry only skips the query-level updates.
 *
 * @param queryId id of the query the task belongs to
 * @param taskId  id of the task
 * @param ctrs    counters to record; may be null
 */
@Override
public void setTaskCounters(String queryId, String taskId, Counters ctrs) {
    String id = queryId + ":" + taskId;
    TaskInfo ti = taskInfoMap.get(id);
    if ((ti == null) || (ctrs == null)) {
        return;
    }
    // May be null when no entry exists for this query; every query-level write below is guarded
    // so that a missing entry neither aborts counter collection nor raises an uncaught NPE.
    QueryInfo ji = queryInfoMap.get(queryId);
    StringBuilder taskCounters = new StringBuilder();
    StringBuilder rowsInserted = new StringBuilder();
    try {
        for (Group group : ctrs) {
            for (Counter counter : group) {
                if (taskCounters.length() > 0) {
                    taskCounters.append(',');
                }
                taskCounters.append(group.getDisplayName());
                taskCounters.append('.');
                taskCounters.append(counter.getDisplayName());
                taskCounters.append(':');
                taskCounters.append(counter.getCounter());
                String tab = getRowCountTableName(counter.getDisplayName());
                if (tab != null) {
                    if (rowsInserted.length() > 0) {
                        rowsInserted.append(",");
                    }
                    rowsInserted.append(tab);
                    rowsInserted.append('~');
                    rowsInserted.append(counter.getCounter());
                    if (ji != null) {
                        ji.rowCountMap.put(tab, counter.getCounter());
                    }
                }
            }
        }
    } catch (Exception e) {
        // best effort: whatever was collected before the failure is still recorded below
        LOG.warn("Failed to set task counters", e);
    }
    if (rowsInserted.length() > 0) {
        String rows = rowsInserted.toString();
        ti.hm.put(Keys.ROWS_INSERTED.name(), rows);
        if (ji != null) {
            ji.hm.put(Keys.ROWS_INSERTED.name(), rows);
        }
    }
    if (taskCounters.length() > 0) {
        ti.hm.put(Keys.TASK_COUNTERS.name(), taskCounters.toString());
    }
}
Also used : Group(org.apache.hadoop.mapred.Counters.Group) Counter(org.apache.hadoop.mapred.Counters.Counter) IOException(java.io.IOException)

Aggregations

Group (org.apache.hadoop.mapred.Counters.Group)33 JobConf (org.apache.hadoop.mapred.JobConf)26 RunningJob (org.apache.hadoop.mapred.RunningJob)26 DMLConfig (org.apache.sysml.conf.DMLConfig)23 Path (org.apache.hadoop.fs.Path)14 MatrixChar_N_ReducerGroups (org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration.MatrixChar_N_ReducerGroups)11 IOException (java.io.IOException)7 DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)6 InputInfo (org.apache.sysml.runtime.matrix.data.InputInfo)6 FrameworkCounterGroup (org.apache.hadoop.mapreduce.counters.FrameworkCounterGroup)5 Test (org.junit.Test)5 Counter (org.apache.hadoop.mapred.Counters.Counter)4 LocalVariableMap (org.apache.sysml.runtime.controlprogram.LocalVariableMap)4 TaggedMatrixBlock (org.apache.sysml.runtime.matrix.data.TaggedMatrixBlock)4 GroupFactory (org.apache.hadoop.mapred.Counters.GroupFactory)3 FrameworkGroupFactory (org.apache.hadoop.mapreduce.counters.CounterGroupFactory.FrameworkGroupFactory)3 PrintWriter (java.io.PrintWriter)2 URI (java.net.URI)2 HashMap (java.util.HashMap)2 Well1024a (org.apache.commons.math3.random.Well1024a)2