
Example 21 with RunningJob

use of org.apache.hadoop.mapred.RunningJob in project incubator-systemml by apache.

the class GenTfMtdMR method runJob.

public static long runJob(String inputPath, String txMtdPath, String specWithIDs, String smallestFile, String partOffsetsFile, CSVFileFormatProperties inputDataProperties, long numCols, int replication, String headerLine) throws IOException, ClassNotFoundException, InterruptedException {
    JobConf job = new JobConf(GenTfMtdMR.class);
    job.setJobName("GenTfMTD");
    /* Setup MapReduce Job */
    job.setJarByClass(GenTfMtdMR.class);
    // set relevant classes
    job.setMapperClass(GTFMTDMapper.class);
    job.setReducerClass(GTFMTDReducer.class);
    // set input and output properties
    job.setInputFormat(TextInputFormat.class);
    job.setOutputFormat(NullOutputFormat.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(DistinctValue.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
    FileInputFormat.addInputPath(job, new Path(inputPath));
    // delete outputPath, if exists already.
    Path outPath = new Path(txMtdPath);
    FileSystem fs = IOUtilFunctions.getFileSystem(outPath, job);
    fs.delete(outPath, true);
    FileOutputFormat.setOutputPath(job, outPath);
    job.set(MRJobConfiguration.TF_HAS_HEADER, Boolean.toString(inputDataProperties.hasHeader()));
    job.set(MRJobConfiguration.TF_DELIM, inputDataProperties.getDelim());
    if (inputDataProperties.getNAStrings() != null)
        // Adding "dummy" string to handle the case of na_strings = ""
        job.set(MRJobConfiguration.TF_NA_STRINGS, TfUtils.prepNAStrings(inputDataProperties.getNAStrings()));
    job.set(MRJobConfiguration.TF_SPEC, specWithIDs);
    job.set(MRJobConfiguration.TF_SMALLEST_FILE, smallestFile);
    job.setLong(MRJobConfiguration.TF_NUM_COLS, numCols);
    job.set(MRJobConfiguration.TF_HEADER, headerLine);
    job.set(MRJobConfiguration.OUTPUT_MATRICES_DIRS_CONFIG, txMtdPath);
    // offsets file to store part-file names and offsets for each input split
    job.set(MRJobConfiguration.TF_OFFSETS_FILE, partOffsetsFile);
    //turn off adaptivemr
    job.setBoolean("adaptivemr.map.enable", false);
    // Run the job
    RunningJob runjob = JobClient.runJob(job);
    Counters c = runjob.getCounters();
    long tx_numRows = c.findCounter(MRJobConfiguration.DataTransformCounters.TRANSFORMED_NUM_ROWS).getCounter();
    return tx_numRows;
}
Also used : Path(org.apache.hadoop.fs.Path) FileSystem(org.apache.hadoop.fs.FileSystem) RunningJob(org.apache.hadoop.mapred.RunningJob) Counters(org.apache.hadoop.mapred.Counters) JobConf(org.apache.hadoop.mapred.JobConf)
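
The pattern above recurs in every example on this page: configure a JobConf, submit it synchronously with JobClient.runJob, and read job-level statistics back from the returned RunningJob. Below is a minimal, self-contained sketch of that counter-reading pattern using only the stock mapred API; the class name CounterReadSketch and the RecordCounters enum are illustrative and not part of SystemML or Hadoop.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.Counters;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RunningJob;

public class CounterReadSketch {

    // Hypothetical counter; a real mapper would call reporter.incrCounter(RecordCounters.VALID_ROWS, 1).
    public enum RecordCounters { VALID_ROWS }

    public static long run(String in, String out) throws Exception {
        JobConf job = new JobConf(CounterReadSketch.class);
        job.setJobName("CounterReadSketch");
        // Defaults (TextInputFormat, identity mapper/reducer, TextOutputFormat) are enough for this sketch.
        FileInputFormat.addInputPath(job, new Path(in));
        FileOutputFormat.setOutputPath(job, new Path(out));

        // JobClient.runJob blocks until completion and returns a RunningJob handle.
        RunningJob runjob = JobClient.runJob(job);

        // Read an application-defined counter, exactly as GenTfMtdMR does with TRANSFORMED_NUM_ROWS.
        Counters counters = runjob.getCounters();
        return counters.findCounter(RecordCounters.VALID_ROWS).getCounter();
    }
}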

Example 22 with RunningJob

use of org.apache.hadoop.mapred.RunningJob in project incubator-systemml by apache.

the class ApplyTfCSVMR method runJob.

public static JobReturn runJob(String inputPath, String spec, String mapsPath, String tmpPath, String outputPath, String partOffsetsFile, CSVFileFormatProperties inputDataProperties, long numCols, int replication, String headerLine) throws IOException, ClassNotFoundException, InterruptedException {
    JobConf job = new JobConf(ApplyTfCSVMR.class);
    job.setJobName("ApplyTfCSV");
    /* Setup MapReduce Job */
    job.setJarByClass(ApplyTfCSVMR.class);
    // set relevant classes
    job.setMapperClass(ApplyTfCSVMapper.class);
    job.setNumReduceTasks(0);
    // Add transformation metadata file as well as partOffsetsFile to Distributed cache
    DistributedCache.addCacheFile((new Path(mapsPath)).toUri(), job);
    DistributedCache.createSymlink(job);
    Path cachefile = new Path(partOffsetsFile);
    DistributedCache.addCacheFile(cachefile.toUri(), job);
    DistributedCache.createSymlink(job);
    // set input and output properties
    job.setInputFormat(TextInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);
    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
    FileInputFormat.addInputPath(job, new Path(inputPath));
    // delete outputPath, if exists already.
    Path outPath = new Path(outputPath);
    FileSystem fs = IOUtilFunctions.getFileSystem(outPath, job);
    fs.delete(outPath, true);
    FileOutputFormat.setOutputPath(job, outPath);
    job.set(MRJobConfiguration.TF_HAS_HEADER, Boolean.toString(inputDataProperties.hasHeader()));
    job.set(MRJobConfiguration.TF_DELIM, inputDataProperties.getDelim());
    if (inputDataProperties.getNAStrings() != null)
        // Adding "dummy" string to handle the case of na_strings = ""
        job.set(MRJobConfiguration.TF_NA_STRINGS, TfUtils.prepNAStrings(inputDataProperties.getNAStrings()));
    job.set(MRJobConfiguration.TF_SPEC, spec);
    job.set(MRJobConfiguration.TF_SMALLEST_FILE, CSVReblockMR.findSmallestFile(job, inputPath));
    job.set(MRJobConfiguration.OUTPUT_MATRICES_DIRS_CONFIG, outputPath);
    job.setLong(MRJobConfiguration.TF_NUM_COLS, numCols);
    job.set(MRJobConfiguration.TF_TXMTD_PATH, mapsPath);
    job.set(MRJobConfiguration.TF_HEADER, headerLine);
    job.set(CSVReblockMR.ROWID_FILE_NAME, cachefile.toString());
    job.set(MRJobConfiguration.TF_TMP_LOC, tmpPath);
    //turn off adaptivemr
    job.setBoolean("adaptivemr.map.enable", false);
    // Run the job
    RunningJob runjob = JobClient.runJob(job);
    // Since transform CSV produces part files w/ prefix transform-part-*,
    // delete all the "default" part-..... files
    deletePartFiles(fs, outPath);
    MatrixCharacteristics mc = new MatrixCharacteristics();
    return new JobReturn(new MatrixCharacteristics[] { mc }, runjob.isSuccessful());
}
Also used : Path(org.apache.hadoop.fs.Path) FileSystem(org.apache.hadoop.fs.FileSystem) RunningJob(org.apache.hadoop.mapred.RunningJob) JobConf(org.apache.hadoop.mapred.JobConf) JobReturn(org.apache.sysml.runtime.matrix.JobReturn) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics)
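
ApplyTfCSVMR ships the transform metadata directory and the part-offsets file to every task through the DistributedCache. The mapper side is not shown on this page; the following is a hedged sketch of how a mapper could locate those cached files in configure(), assuming the driver added them as in the code above. The class name CacheAwareMapper is illustrative, not the actual ApplyTfCSVMapper.

import java.io.IOException;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

// Illustrative mapper: resolves the files added with DistributedCache.addCacheFile in the driver.
public class CacheAwareMapper extends MapReduceBase
        implements Mapper<LongWritable, Text, NullWritable, Text> {

    private Path[] cachedFiles;

    @Override
    public void configure(JobConf job) {
        try {
            // Task-local paths of the cached files (transform metadata dir, part-offsets file).
            cachedFiles = DistributedCache.getLocalCacheFiles(job);
        } catch (IOException e) {
            throw new RuntimeException("Failed to resolve distributed cache files", e);
        }
    }

    @Override
    public void map(LongWritable key, Text value,
            OutputCollector<NullWritable, Text> out, Reporter reporter) throws IOException {
        // A real implementation would apply the transform metadata here; this sketch passes rows through.
        out.collect(NullWritable.get(), value);
    }
}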

Example 23 with RunningJob

use of org.apache.hadoop.mapred.RunningJob in project incubator-systemml by apache.

the class ApplyTfBBMR method runJob.

public static JobReturn runJob(String inputPath, String rblkInst, String otherInst, String spec, String mapsPath, String tmpPath, String outputPath, String partOffsetsFile, CSVFileFormatProperties inputDataProperties, long numRows, long numColsBefore, long numColsAfter, int replication, String headerLine) throws Exception {
    CSVReblockInstruction rblk = (CSVReblockInstruction) InstructionParser.parseSingleInstruction(rblkInst);
    long[] rlens = new long[] { numRows };
    long[] clens = new long[] { numColsAfter };
    int[] brlens = new int[] { rblk.brlen };
    int[] bclens = new int[] { rblk.bclen };
    byte[] realIndexes = new byte[] { rblk.input };
    byte[] resultIndexes = new byte[] { rblk.output };
    JobConf job = new JobConf(ApplyTfBBMR.class);
    job.setJobName("ApplyTfBB");
    /* Setup MapReduce Job */
    job.setJarByClass(ApplyTfBBMR.class);
    // set relevant classes
    job.setMapperClass(ApplyTfBBMapper.class);
    MRJobConfiguration.setUpMultipleInputs(job, realIndexes, new String[] { inputPath }, new InputInfo[] { InputInfo.CSVInputInfo }, brlens, bclens, false, ConvertTarget.CELL);
    MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);
    MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);
    MRJobConfiguration.setCSVReblockInstructions(job, rblkInst);
    //set up the instructions that will happen in the reducer, after the aggregation instructions
    MRJobConfiguration.setInstructionsInReducer(job, otherInst);
    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
    //set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);
    //set up what matrices are needed to pass from the mapper to reducer
    HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, null, rblkInst, null, otherInst, resultIndexes);
    MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, null, rblkInst, null, null, null, resultIndexes, mapoutputIndexes, false);
    //set up the number of reducers
    int numRed = WriteCSVMR.determineNumReducers(rlens, clens, ConfigurationManager.getNumReducers(), ret.numReducerGroups);
    job.setNumReduceTasks(numRed);
    //set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, new byte[] { rblk.output }, new byte[] { 0 }, new String[] { outputPath }, new OutputInfo[] { OutputInfo.BinaryBlockOutputInfo }, true, false);
    // configure mapper and the mapper output key value pairs
    job.setMapperClass(ApplyTfBBMapper.class);
    job.setMapOutputKeyClass(TaggedFirstSecondIndexes.class);
    job.setMapOutputValueClass(BlockRow.class);
    //configure reducer
    job.setReducerClass(CSVReblockReducer.class);
    //turn off adaptivemr
    job.setBoolean("adaptivemr.map.enable", false);
    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);
    // Add transformation metadata file as well as partOffsetsFile to Distributed cache
    DistributedCache.addCacheFile((new Path(mapsPath)).toUri(), job);
    DistributedCache.createSymlink(job);
    Path cachefile = new Path(new Path(partOffsetsFile), "part-00000");
    DistributedCache.addCacheFile(cachefile.toUri(), job);
    DistributedCache.createSymlink(job);
    job.set(MRJobConfiguration.TF_HAS_HEADER, Boolean.toString(inputDataProperties.hasHeader()));
    job.set(MRJobConfiguration.TF_DELIM, inputDataProperties.getDelim());
    // Adding "dummy" string to handle the case of na_strings = ""
    if (inputDataProperties.getNAStrings() != null)
        job.set(MRJobConfiguration.TF_NA_STRINGS, TfUtils.prepNAStrings(inputDataProperties.getNAStrings()));
    job.set(MRJobConfiguration.TF_SPEC, spec);
    job.set(MRJobConfiguration.TF_SMALLEST_FILE, CSVReblockMR.findSmallestFile(job, inputPath));
    job.set(MRJobConfiguration.OUTPUT_MATRICES_DIRS_CONFIG, outputPath);
    job.setLong(MRJobConfiguration.TF_NUM_COLS, numColsBefore);
    job.set(MRJobConfiguration.TF_TXMTD_PATH, mapsPath);
    job.set(MRJobConfiguration.TF_HEADER, headerLine);
    job.set(CSVReblockMR.ROWID_FILE_NAME, cachefile.toString());
    job.set(MRJobConfiguration.TF_TMP_LOC, tmpPath);
    RunningJob runjob = JobClient.runJob(job);
    MapReduceTool.deleteFileIfExistOnHDFS(cachefile, job);
    Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
    for (int i = 0; i < resultIndexes.length; i++) {
        ret.stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
    }
    return new JobReturn(ret.stats, runjob.isSuccessful());
}
Also used : Path(org.apache.hadoop.fs.Path) Group(org.apache.hadoop.mapred.Counters.Group) CSVReblockInstruction(org.apache.sysml.runtime.instructions.mr.CSVReblockInstruction) JobReturn(org.apache.sysml.runtime.matrix.JobReturn) MatrixChar_N_ReducerGroups(org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration.MatrixChar_N_ReducerGroups) RunningJob(org.apache.hadoop.mapred.RunningJob) JobConf(org.apache.hadoop.mapred.JobConf)

Example 24 with RunningJob

use of org.apache.hadoop.mapred.RunningJob in project incubator-systemml by apache.

the class SortMR method runStitchupJob.

private static boolean runStitchupJob(String input, long rlen, long clen, int brlen, int bclen, long[] counts, int numReducers, int replication, String output) throws Exception {
    JobConf job = new JobConf(SortMR.class);
    job.setJobName("SortIndexesMR");
    //setup input/output paths
    Path inpath = new Path(input);
    Path outpath = new Path(output);
    FileInputFormat.setInputPaths(job, inpath);
    FileOutputFormat.setOutputPath(job, outpath);
    MapReduceTool.deleteFileIfExistOnHDFS(outpath, job);
    //set number of reducers (1 if local mode)
    if (InfrastructureAnalyzer.isLocalMode(job))
        job.setNumReduceTasks(1);
    else
        MRJobConfiguration.setNumReducers(job, numReducers, numReducers);
    //setup input/output format
    InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
    OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
    job.setInputFormat(iinfo.inputFormatClass);
    job.setOutputFormat(oinfo.outputFormatClass);
    CompactInputFormat.setKeyValueClasses(job, MatrixIndexes.class, MatrixBlock.class);
    //setup mapper/reducer/output classes
    MRJobConfiguration.setInputInfo(job, (byte) 0, InputInfo.BinaryBlockInputInfo, brlen, bclen, ConvertTarget.BLOCK);
    job.setMapperClass(IndexSortStitchupMapper.class);
    job.setReducerClass(IndexSortStitchupReducer.class);
    job.setOutputKeyClass(oinfo.outputKeyClass);
    job.setOutputValueClass(oinfo.outputValueClass);
    MRJobConfiguration.setBlockSize(job, (byte) 0, brlen, bclen);
    MRJobConfiguration.setMatricesDimensions(job, new byte[] { 0 }, new long[] { rlen }, new long[] { clen });
    //compute shifted prefix sum of offsets and put into configuration
    long[] cumsumCounts = new long[counts.length];
    long sum = 0;
    for (int i = 0; i < counts.length; i++) {
        cumsumCounts[i] = sum;
        sum += counts[i];
    }
    job.set(SORT_INDEXES_OFFSETS, Arrays.toString(cumsumCounts));
    //setup replication factor
    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);
    //run mr job
    RunningJob runJob = JobClient.runJob(job);
    return runJob.isSuccessful();
}
Also used : Path(org.apache.hadoop.fs.Path) OutputInfo(org.apache.sysml.runtime.matrix.data.OutputInfo) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) RunningJob(org.apache.hadoop.mapred.RunningJob) JobConf(org.apache.hadoop.mapred.JobConf)
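
All of the SystemML examples here use the blocking JobClient.runJob, but a RunningJob handle is also what the non-blocking path returns. The sketch below shows asynchronous submission and progress polling with the stock mapred API; it is not taken from SystemML, and AsyncSubmitSketch is an illustrative name.

import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RunningJob;

public class AsyncSubmitSketch {
    public static boolean submitAndWait(JobConf job) throws Exception {
        JobClient client = new JobClient(job);
        // submitJob returns immediately with a RunningJob handle, unlike the blocking JobClient.runJob.
        RunningJob runjob = client.submitJob(job);
        while (!runjob.isComplete()) {
            System.out.printf("job %s: map %.0f%%, reduce %.0f%%%n",
                    runjob.getID(), runjob.mapProgress() * 100, runjob.reduceProgress() * 100);
            Thread.sleep(5000);
        }
        return runjob.isSuccessful();
    }
}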

Example 25 with RunningJob

use of org.apache.hadoop.mapred.RunningJob in project incubator-systemml by apache.

the class GMR method runJob.

/**
	 * Execute job.
	 * 
	 * @param inst MR job instruction
	 * @param inputs input matrices, the inputs are indexed by 0, 1, 2, .. based on the position in this string
	 * @param inputInfos the input format information for the input matrices
	 * @param rlens array of number of rows
	 * @param clens array of number of columns
	 * @param brlens array of number of rows in block
	 * @param bclens array of number of columns in block
	 * @param partitioned boolean array of partitioned status
	 * @param pformats array of data partition formats
	 * @param psizes does nothing
	 * @param recordReaderInstruction record reader instruction
	 * @param instructionsInMapper in Mapper, the set of unary operations that need to be performed on each input matrix
	 * @param aggInstructionsInReducer in Reducer, right after sorting, the set of aggregate operations
	 * that need to be performed on each input matrix
	 * @param otherInstructionsInReducer the mixed operations that need to be performed on matrices after the aggregate operations
	 * @param numReducers the number of reducers
	 * @param replication the replication factor for the output
	 * @param jvmReuse if true, reuse JVM
	 * @param resultIndexes the indexes of the result matrices that need to be output
	 * @param dimsUnknownFilePrefix file path prefix when dimensions unknown
	 * @param outputs the names for the output directories, one for each result index
	 * @param outputInfos output format information for the output matrices
	 * @return job return object
	 * @throws Exception if Exception occurs
	 */
@SuppressWarnings({ "unchecked", "rawtypes" })
public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens, long[] clens, int[] brlens, int[] bclens, boolean[] partitioned, PDataPartitionFormat[] pformats, int[] psizes, String recordReaderInstruction, String instructionsInMapper, String aggInstructionsInReducer, String otherInstructionsInReducer, int numReducers, int replication, boolean jvmReuse, byte[] resultIndexes, String dimsUnknownFilePrefix, String[] outputs, OutputInfo[] outputInfos) throws Exception {
    JobConf job = new JobConf(GMR.class);
    job.setJobName("G-MR");
    boolean inBlockRepresentation = MRJobConfiguration.deriveRepresentation(inputInfos);
    //whether use block representation or cell representation
    MRJobConfiguration.setMatrixValueClass(job, inBlockRepresentation);
    //added for handling recordreader instruction
    String[] realinputs = inputs;
    InputInfo[] realinputInfos = inputInfos;
    long[] realrlens = rlens;
    long[] realclens = clens;
    int[] realbrlens = brlens;
    int[] realbclens = bclens;
    byte[] realIndexes = new byte[inputs.length];
    for (byte b = 0; b < realIndexes.length; b++) realIndexes[b] = b;
    if (recordReaderInstruction != null && !recordReaderInstruction.isEmpty()) {
        assert (inputs.length <= 2);
        PickByCountInstruction ins = (PickByCountInstruction) PickByCountInstruction.parseInstruction(recordReaderInstruction);
        PickFromCompactInputFormat.setKeyValueClasses(job, (Class<? extends WritableComparable>) inputInfos[ins.input1].inputKeyClass, inputInfos[ins.input1].inputValueClass);
        job.setInputFormat(PickFromCompactInputFormat.class);
        PickFromCompactInputFormat.setZeroValues(job, (NumItemsByEachReducerMetaData) inputInfos[ins.input1].metadata);
        if (ins.isValuePick) {
            double[] probs = MapReduceTool.readColumnVectorFromHDFS(inputs[ins.input2], inputInfos[ins.input2], rlens[ins.input2], clens[ins.input2], brlens[ins.input2], bclens[ins.input2]);
            PickFromCompactInputFormat.setPickRecordsInEachPartFile(job, (NumItemsByEachReducerMetaData) inputInfos[ins.input1].metadata, probs);
            realinputs = new String[inputs.length - 1];
            realinputInfos = new InputInfo[inputs.length - 1];
            realrlens = new long[inputs.length - 1];
            realclens = new long[inputs.length - 1];
            realbrlens = new int[inputs.length - 1];
            realbclens = new int[inputs.length - 1];
            realIndexes = new byte[inputs.length - 1];
            byte realIndex = 0;
            for (byte i = 0; i < inputs.length; i++) {
                if (i == ins.input2)
                    continue;
                realinputs[realIndex] = inputs[i];
                realinputInfos[realIndex] = inputInfos[i];
                if (i == ins.input1) {
                    realrlens[realIndex] = rlens[ins.input2];
                    realclens[realIndex] = clens[ins.input2];
                    realbrlens[realIndex] = 1;
                    realbclens[realIndex] = 1;
                    realIndexes[realIndex] = ins.output;
                } else {
                    realrlens[realIndex] = rlens[i];
                    realclens[realIndex] = clens[i];
                    realbrlens[realIndex] = brlens[i];
                    realbclens[realIndex] = bclens[i];
                    realIndexes[realIndex] = i;
                }
                realIndex++;
            }
        } else {
            //PickFromCompactInputFormat.setPickRecordsInEachPartFile(job, (NumItemsByEachReducerMetaData) inputInfos[ins.input1].metadata, ins.cst, 1-ins.cst);
            PickFromCompactInputFormat.setRangePickPartFiles(job, (NumItemsByEachReducerMetaData) inputInfos[ins.input1].metadata, ins.cst, 1 - ins.cst);
            realrlens[ins.input1] = UtilFunctions.getLengthForInterQuantile((NumItemsByEachReducerMetaData) inputInfos[ins.input1].metadata, ins.cst);
            realclens[ins.input1] = clens[ins.input1];
            realbrlens[ins.input1] = 1;
            realbclens[ins.input1] = 1;
            realIndexes[ins.input1] = ins.output;
        }
    }
    boolean resetDistCache = setupDistributedCache(job, instructionsInMapper, otherInstructionsInReducer, realinputs, realrlens, realclens);
    //set up the input files and their format information
    boolean[] distCacheOnly = getDistCacheOnlyInputs(realIndexes, recordReaderInstruction, instructionsInMapper, aggInstructionsInReducer, otherInstructionsInReducer);
    MRJobConfiguration.setUpMultipleInputs(job, realIndexes, realinputs, realinputInfos, realbrlens, realbclens, distCacheOnly, true, inBlockRepresentation ? ConvertTarget.BLOCK : ConvertTarget.CELL);
    MRJobConfiguration.setInputPartitioningInfo(job, pformats);
    //set up the dimensions of input matrices
    MRJobConfiguration.setMatricesDimensions(job, realIndexes, realrlens, realclens);
    MRJobConfiguration.setDimsUnknownFilePrefix(job, dimsUnknownFilePrefix);
    //set up the block size
    MRJobConfiguration.setBlocksSizes(job, realIndexes, realbrlens, realbclens);
    //set up unary instructions that will perform in the mapper
    MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper);
    //set up the aggregate instructions that will happen in the combiner and reducer
    MRJobConfiguration.setAggregateInstructions(job, aggInstructionsInReducer);
    //set up the instructions that will happen in the reducer, after the aggregation instructions
    MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);
    //set up the replication factor for the results
    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
    //set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);
    //set up map/reduce memory configurations (if in AM context)
    DMLConfig config = ConfigurationManager.getDMLConfig();
    DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);
    //set up custom map/reduce configurations 
    MRJobConfiguration.setupCustomMRConfigurations(job, config);
    //set up jvm reuse (incl. reuse of loaded dist cache matrices)
    if (jvmReuse)
        job.setNumTasksToExecutePerJvm(-1);
    //set up what matrices are needed to pass from the mapper to reducer
    HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, instructionsInMapper, aggInstructionsInReducer, otherInstructionsInReducer, resultIndexes);
    MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, instructionsInMapper, aggInstructionsInReducer, null, otherInstructionsInReducer, resultIndexes, mapoutputIndexes, false);
    MatrixCharacteristics[] stats = ret.stats;
    //set up the number of reducers
    MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers);
    // Print the complete instruction
    if (LOG.isTraceEnabled())
        inst.printCompleteMRJobInstruction(stats);
    // Update resultDimsUnknown based on computed "stats"
    byte[] dimsUnknown = new byte[resultIndexes.length];
    for (int i = 0; i < resultIndexes.length; i++) {
        if (stats[i].getRows() == -1 || stats[i].getCols() == -1) {
            dimsUnknown[i] = (byte) 1;
        } else {
            dimsUnknown[i] = (byte) 0;
        }
    }
    //MRJobConfiguration.updateResultDimsUnknown(job,resultDimsUnknown);
    //set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, dimsUnknown, outputs, outputInfos, inBlockRepresentation, true);
    // configure mapper and the mapper output key value pairs
    job.setMapperClass(GMRMapper.class);
    if (numReducers == 0) {
        job.setMapOutputKeyClass(Writable.class);
        job.setMapOutputValueClass(Writable.class);
    } else {
        job.setMapOutputKeyClass(MatrixIndexes.class);
        if (inBlockRepresentation)
            job.setMapOutputValueClass(TaggedMatrixBlock.class);
        else
            job.setMapOutputValueClass(TaggedMatrixPackedCell.class);
    }
    //set up combiner
    if (numReducers != 0 && aggInstructionsInReducer != null && !aggInstructionsInReducer.isEmpty()) {
        job.setCombinerClass(GMRCombiner.class);
    }
    //configure reducer
    job.setReducerClass(GMRReducer.class);
    //job.setReducerClass(PassThroughReducer.class);
    // By default, the job executes in "cluster" mode.
    // Determine if we can optimize and run it in "local" mode.
    MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
    for (int i = 0; i < inputs.length; i++) {
        inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
    }
    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);
    RunningJob runjob = JobClient.runJob(job);
    Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
    for (int i = 0; i < resultIndexes.length; i++) stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
    //cleanups
    String dir = dimsUnknownFilePrefix + "/" + runjob.getID().toString() + "_dimsFile";
    stats = MapReduceTool.processDimsFiles(dir, stats);
    MapReduceTool.deleteFileIfExistOnHDFS(dir);
    if (resetDistCache)
        MRBaseForCommonInstructions.resetDistCache();
    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}
Also used : Group(org.apache.hadoop.mapred.Counters.Group) PickByCountInstruction(org.apache.sysml.runtime.instructions.mr.PickByCountInstruction) TaggedMatrixPackedCell(org.apache.sysml.runtime.matrix.data.TaggedMatrixPackedCell) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) JobConf(org.apache.hadoop.mapred.JobConf) DMLConfig(org.apache.sysml.conf.DMLConfig) TaggedMatrixBlock(org.apache.sysml.runtime.matrix.data.TaggedMatrixBlock) NumItemsByEachReducerMetaData(org.apache.sysml.runtime.matrix.data.NumItemsByEachReducerMetaData) MatrixChar_N_ReducerGroups(org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration.MatrixChar_N_ReducerGroups) RunningJob(org.apache.hadoop.mapred.RunningJob)
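
GMR and ApplyTfBBMR both recover per-output non-zero counts from a named counter group after the job finishes. A hedged sketch of that group-reading pattern in isolation is shown below; the group name "NZ_CELLS" and the class name are illustrative, whereas SystemML uses MRJobConfiguration.NUM_NONZERO_CELLS and stores the values in MatrixCharacteristics.

import org.apache.hadoop.mapred.Counters;
import org.apache.hadoop.mapred.Counters.Group;
import org.apache.hadoop.mapred.RunningJob;

public class NonZeroCounterSketch {
    // Illustrative group name; SystemML uses MRJobConfiguration.NUM_NONZERO_CELLS.
    private static final String NNZ_GROUP = "NZ_CELLS";

    public static long[] readNonZeros(RunningJob runjob, int numResults) throws Exception {
        Counters counters = runjob.getCounters();
        Group group = counters.getGroup(NNZ_GROUP);
        long[] nnz = new long[numResults];
        // Each result index i is used as the counter name when the reducers increment the group.
        for (int i = 0; i < numResults; i++)
            nnz[i] = group.getCounter(Integer.toString(i));
        return nnz;
    }
}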

Aggregations

RunningJob (org.apache.hadoop.mapred.RunningJob) 61
JobConf (org.apache.hadoop.mapred.JobConf) 45
Path (org.apache.hadoop.fs.Path) 35
FileSystem (org.apache.hadoop.fs.FileSystem) 24
JobClient (org.apache.hadoop.mapred.JobClient) 20
IOException (java.io.IOException) 15
Counters (org.apache.hadoop.mapred.Counters) 14
Group (org.apache.hadoop.mapred.Counters.Group) 13
DMLConfig (org.apache.sysml.conf.DMLConfig) 13
Configuration (org.apache.hadoop.conf.Configuration) 7
MatrixChar_N_ReducerGroups (org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration.MatrixChar_N_ReducerGroups) 7
DataOutputStream (java.io.DataOutputStream) 6
File (java.io.File) 5
FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream) 5
FileStatus (org.apache.hadoop.fs.FileStatus) 5
Text (org.apache.hadoop.io.Text) 5
DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException) 5
InputInfo (org.apache.sysml.runtime.matrix.data.InputInfo) 5
Test (org.junit.Test) 5
URI (java.net.URI) 4