
Example 26 with RunningJob

use of org.apache.hadoop.mapred.RunningJob in project incubator-systemml by apache.

the class GroupedAggMR method runJob.

public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens, long[] clens, int[] brlens, int[] bclens, String grpAggInstructions, String simpleReduceInstructions, /*only scalar or reorg instructions allowed*/
int numReducers, int replication, byte[] resultIndexes, String dimsUnknownFilePrefix, String[] outputs, OutputInfo[] outputInfos) throws Exception {
    JobConf job = new JobConf(GroupedAggMR.class);
    job.setJobName("GroupedAgg-MR");
    //whether to use block representation or cell representation
    //MRJobConfiguration.setMatrixValueClassForCM_N_COM(job, true);
    MRJobConfiguration.setMatrixValueClass(job, false);
    //added for handling recordreader instruction
    String[] realinputs = inputs;
    InputInfo[] realinputInfos = inputInfos;
    long[] realrlens = rlens;
    long[] realclens = clens;
    int[] realbrlens = brlens;
    int[] realbclens = bclens;
    byte[] realIndexes = new byte[inputs.length];
    for (byte b = 0; b < realIndexes.length; b++) realIndexes[b] = b;
    //set up the input files and their format information
    MRJobConfiguration.setUpMultipleInputs(job, realIndexes, realinputs, realinputInfos, realbrlens, realbclens, true, ConvertTarget.WEIGHTEDCELL);
    //set up the dimensions of input matrices
    MRJobConfiguration.setMatricesDimensions(job, realIndexes, realrlens, realclens);
    MRJobConfiguration.setDimsUnknownFilePrefix(job, dimsUnknownFilePrefix);
    //set up the block size
    MRJobConfiguration.setBlocksSizes(job, realIndexes, realbrlens, realbclens);
    //set up the grouped aggregate instructions that will happen in the combiner and reducer
    MRJobConfiguration.setGroupedAggInstructions(job, grpAggInstructions);
    //set up the instructions that will happen in the reducer, after the aggregation instructions
    MRJobConfiguration.setInstructionsInReducer(job, simpleReduceInstructions);
    //set up the number of reducers
    MRJobConfiguration.setNumReducers(job, numReducers, numReducers);
    //set up the replication factor for the results
    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
    //set up custom map/reduce configurations 
    DMLConfig config = ConfigurationManager.getDMLConfig();
    MRJobConfiguration.setupCustomMRConfigurations(job, config);
    //set up what matrices are needed to pass from the mapper to reducer
    MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, null, null, grpAggInstructions, resultIndexes);
    MatrixCharacteristics[] stats = new MatrixCharacteristics[resultIndexes.length];
    for (int i = 0; i < resultIndexes.length; i++) stats[i] = new MatrixCharacteristics();
    // Print the complete instruction
    if (LOG.isTraceEnabled())
        inst.printCompleteMRJobInstruction(stats);
    byte[] resultDimsUnknown = new byte[resultIndexes.length];
    // mark all result dimensions as unknown; they are resolved later via the dims file
    for (int i = 0; i < resultIndexes.length; i++) resultDimsUnknown[i] = (byte) 2;
    //set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos, false);
    // configure mapper and the mapper output key value pairs
    job.setMapperClass(GroupedAggMRMapper.class);
    job.setCombinerClass(GroupedAggMRCombiner.class);
    job.setMapOutputKeyClass(TaggedMatrixIndexes.class);
    job.setMapOutputValueClass(WeightedCell.class);
    //configure reducer
    job.setReducerClass(GroupedAggMRReducer.class);
    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);
    //execute job
    RunningJob runjob = JobClient.runJob(job);
    //get important output statistics 
    Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
    for (int i = 0; i < resultIndexes.length; i++) {
        // number of non-zeros
        stats[i] = new MatrixCharacteristics();
        stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
    }
    String dir = dimsUnknownFilePrefix + "/" + runjob.getID().toString() + "_dimsFile";
    stats = MapReduceTool.processDimsFiles(dir, stats);
    MapReduceTool.deleteFileIfExistOnHDFS(dir);
    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}
Also used : Group(org.apache.hadoop.mapred.Counters.Group) DMLConfig(org.apache.sysml.conf.DMLConfig) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) RunningJob(org.apache.hadoop.mapred.RunningJob) JobConf(org.apache.hadoop.mapred.JobConf)
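
These examples all read job statistics the same way: block on JobClient.runJob(job), then pull one counter per result index from a named counter group. A minimal standalone sketch of that pattern, assuming a hypothetical group name "nnz-group" (the examples use MRJobConfiguration.NUM_NONZERO_CELLS):

import org.apache.hadoop.mapred.Counters.Group;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RunningJob;

public class CounterReadSketch {
    //runs the configured job and reads one long counter per result index;
    //"nnz-group" is a placeholder group name, not the SystemML constant
    public static long[] runAndReadCounters(JobConf job, int numResults) throws Exception {
        RunningJob runjob = JobClient.runJob(job); //blocks until the job completes
        Group group = runjob.getCounters().getGroup("nnz-group");
        long[] nnz = new long[numResults];
        for (int i = 0; i < numResults; i++)
            nnz[i] = group.getCounter(Integer.toString(i)); //counters are keyed by result index
        return nnz;
    }
}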

Example 27 with RunningJob

use of org.apache.hadoop.mapred.RunningJob in project incubator-systemml by apache.

the class MMCJMR method runJob.

public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens, long[] clens, int[] brlens, int[] bclens, String instructionsInMapper, String aggInstructionsInReducer, String aggBinInstrction, int numReducers, int replication, String output, OutputInfo outputinfo) throws Exception {
    JobConf job = new JobConf(MMCJMR.class);
    // TODO: check w/ yuanyuan. This job always runs in blocked mode, and hence derivation is not necessary.
    boolean inBlockRepresentation = MRJobConfiguration.deriveRepresentation(inputInfos);
    // by default, assume that dimensions of MMCJ's output are known at compile time
    byte resultDimsUnknown = (byte) 0;
    MatrixCharacteristics[] stats = commonSetup(job, inBlockRepresentation, inputs, inputInfos, rlens, clens, brlens, bclens, instructionsInMapper, aggInstructionsInReducer, aggBinInstrction, numReducers, replication, resultDimsUnknown, output, outputinfo);
    // Print the complete instruction
    if (LOG.isTraceEnabled())
        inst.printCompleteMRJobInstruction(stats);
    // There is always a single output
    if (stats[0].getRows() == -1 || stats[0].getCols() == -1) {
        resultDimsUnknown = (byte) 1;
        // if the dimensions are unknown, then setup done in commonSetup() must be updated
        byte[] resultIndexes = new byte[] { MRInstructionParser.parseSingleInstruction(aggBinInstrction).output };
        byte[] resultDimsUnknown_Array = new byte[] { resultDimsUnknown };
        //set up the multiple output files, and their format information
        MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown_Array, new String[] { output }, new OutputInfo[] { outputinfo }, inBlockRepresentation);
    }
    AggregateBinaryInstruction ins = (AggregateBinaryInstruction) MRInstructionParser.parseSingleInstruction(aggBinInstrction);
    MatrixCharacteristics dim1 = MRJobConfiguration.getMatrixCharactristicsForBinAgg(job, ins.input1);
    MatrixCharacteristics dim2 = MRJobConfiguration.getMatrixCharactristicsForBinAgg(job, ins.input2);
    if (dim1.getRowsPerBlock() > dim1.getRows())
        dim1.setRowsPerBlock((int) dim1.getRows());
    if (dim1.getColsPerBlock() > dim1.getCols())
        dim1.setColsPerBlock((int) dim1.getCols());
    if (dim2.getRowsPerBlock() > dim2.getRows())
        dim2.setRowsPerBlock((int) dim2.getRows());
    if (dim2.getColsPerBlock() > dim2.getCols())
        dim2.setColsPerBlock((int) dim2.getCols());
    long blockSize1 = 77 + 8 * dim1.getRowsPerBlock() * dim1.getColsPerBlock();
    long blockSize2 = 77 + 8 * dim2.getRowsPerBlock() * dim2.getColsPerBlock();
    long blockSizeResult = 77 + 8 * dim1.getRowsPerBlock() * dim2.getColsPerBlock();
    long cacheSize = -1;
    //cache the first input if it has fewer row blocks than the second has column blocks
    if (dim1.getRows() < dim2.getCols()) {
        long numBlocks = (long) Math.ceil((double) dim1.getRows() / (double) dim1.getRowsPerBlock());
        cacheSize = numBlocks * (20 + blockSize1) + 32;
    } else {
        //otherwise cache the second input
        long numBlocks = (long) Math.ceil((double) dim2.getCols() / (double) dim2.getColsPerBlock());
        cacheSize = numBlocks * (20 + blockSize2) + 32;
    }
    //add known memory consumption (will be subtracted from output buffer)
    cacheSize += 2 * Math.max(blockSize1, blockSize2) //the cached key-value pair (plus input instance)
            + blockSizeResult //the cached single result
            + MRJobConfiguration.getMiscMemRequired(job); //misc memory required by hadoop
    MRJobConfiguration.setMMCJCacheSize(job, (int) cacheSize);
    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);
    //run mmcj job
    RunningJob runjob = JobClient.runJob(job);
    /* Process different counters */
    // NOTE: MMCJ job always has only a single output. 
    // Hence, no need to scan resultIndexes[] like other jobs
    int outputIndex = 0;
    Byte outputMatrixID = MRInstructionParser.parseSingleInstruction(aggBinInstrction).output;
    Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
    // number of non-zeros
    stats[outputIndex].setNonZeros(group.getCounter(Byte.toString(outputMatrixID)));
    return new JobReturn(stats[outputIndex], outputinfo, runjob.isSuccessful());
}
Also used : Group(org.apache.hadoop.mapred.Counters.Group) AggregateBinaryInstruction(org.apache.sysml.runtime.instructions.mr.AggregateBinaryInstruction) RunningJob(org.apache.hadoop.mapred.RunningJob) JobConf(org.apache.hadoop.mapred.JobConf)
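
The cache-size arithmetic above is self-contained: a dense block costs 77 header bytes plus 8 bytes per cell, each cached key-value pair adds 20 bytes, and the cached list adds 32. A standalone sketch that reproduces the estimate with the same constants (copied from the code above, not re-derived); the dimensions in main are made up for illustration:

public class MMCJCacheSizeSketch {
    //77-byte block header + 8 bytes per double cell, as in MMCJMR above
    static long blockFootprint(int rowsPerBlock, int colsPerBlock) {
        return 77 + 8L * rowsPerBlock * colsPerBlock;
    }

    //cache whichever side needs fewer blocks: row blocks of the first input
    //vs column blocks of the second input
    static long cacheSize(long rows1, int brlen1, long blockSize1, long cols2, int bclen2, long blockSize2) {
        boolean cacheFirst = rows1 < cols2;
        long numBlocks = cacheFirst ? (long) Math.ceil((double) rows1 / brlen1) : (long) Math.ceil((double) cols2 / bclen2);
        long blockSize = cacheFirst ? blockSize1 : blockSize2;
        return numBlocks * (20 + blockSize) + 32; //20-byte pair overhead, 32-byte list overhead
    }

    public static void main(String[] args) {
        //hypothetical 10,000 x 10,000 inputs in 1,000 x 1,000 blocks: ~8MB per block
        long bs = blockFootprint(1000, 1000);
        System.out.println(cacheSize(10000, 1000, bs, 10000, 1000, bs)); //10 cached blocks
    }
}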

Example 28 with RunningJob

use of org.apache.hadoop.mapred.RunningJob in project incubator-systemml by apache.

the class CleanupMR method runJob.

public static boolean runJob(DMLConfig conf) throws Exception {
    boolean ret = false;
    try {
        JobConf job;
        job = new JobConf(CleanupMR.class);
        job.setJobName("Cleanup-MR");
        //set up SystemML local tmp dir
        String dir = conf.getTextValue(DMLConfig.LOCAL_TMP_DIR);
        MRJobConfiguration.setSystemMLLocalTmpDir(job, dir);
        //set mappers, reducers 
        int numNodes = InfrastructureAnalyzer.getRemoteParallelNodes();
        //map-only
        job.setMapperClass(CleanupMapper.class);
        //numMappers
        job.setNumMapTasks(numNodes);
        job.setNumReduceTasks(0);
        //set input/output format, input path
        String inFileName = conf.getTextValue(DMLConfig.SCRATCH_SPACE) + "/cleanup_tasks";
        job.setInputFormat(NLineInputFormat.class);
        job.setOutputFormat(NullOutputFormat.class);
        Path path = new Path(inFileName);
        FileInputFormat.setInputPaths(job, path);
        writeCleanupTasksToFile(path, numNodes);
        //disable automatic tasks timeouts and speculative task exec
        job.setInt(MRConfigurationNames.MR_TASK_TIMEOUT, 0);
        job.setMapSpeculativeExecution(false);
        /////
        // execute the MR job			
        RunningJob runjob = JobClient.runJob(job);
        ret = runjob.isSuccessful();
    } catch (Exception ex) {
        //don't raise an exception, just log an error message gracefully.
        LOG.error("Failed to run cleanup MR job. ", ex);
    }
    return ret;
}
Also used : Path(org.apache.hadoop.fs.Path) RunningJob(org.apache.hadoop.mapred.RunningJob) JobConf(org.apache.hadoop.mapred.JobConf) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) IOException(java.io.IOException)
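
CleanupMR shows the generic map-only skeleton: NLineInputFormat turns each line of a task file into its own split, NullOutputFormat discards all output, and zero reducers keep the work map-side. A minimal sketch of that skeleton, assuming a hypothetical mapper class and input path:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.lib.NLineInputFormat;
import org.apache.hadoop.mapred.lib.NullOutputFormat;

public class MapOnlyJobSketch {
    //mapperClass and "/tmp/tasks" are placeholders for illustration
    public static boolean run(Class<? extends Mapper> mapperClass) throws Exception {
        JobConf job = new JobConf(MapOnlyJobSketch.class);
        job.setJobName("MapOnly-Sketch");
        job.setMapperClass(mapperClass);
        job.setNumReduceTasks(0); //map-only
        job.setInputFormat(NLineInputFormat.class); //one input line per map task
        job.setOutputFormat(NullOutputFormat.class); //mappers act by side effect only
        FileInputFormat.setInputPaths(job, new Path("/tmp/tasks"));
        RunningJob runjob = JobClient.runJob(job);
        return runjob.isSuccessful();
    }
}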

Example 29 with RunningJob

use of org.apache.hadoop.mapred.RunningJob in project incubator-systemml by apache.

the class CombineMR method runJob.

public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens, long[] clens, int[] brlens, int[] bclens, String combineInstructions, int numReducers, int replication, byte[] resultIndexes, String[] outputs, OutputInfo[] outputInfos) throws Exception {
    JobConf job;
    job = new JobConf(CombineMR.class);
    job.setJobName("Standalone-MR");
    boolean inBlockRepresentation = MRJobConfiguration.deriveRepresentation(inputInfos);
    //whether to use block representation or cell representation
    MRJobConfiguration.setMatrixValueClass(job, inBlockRepresentation);
    byte[] inputIndexes = new byte[inputs.length];
    for (byte b = 0; b < inputs.length; b++) inputIndexes[b] = b;
    //set up the input files and their format information
    MRJobConfiguration.setUpMultipleInputs(job, inputIndexes, inputs, inputInfos, brlens, bclens, true, inBlockRepresentation ? ConvertTarget.BLOCK : ConvertTarget.CELL);
    //set up the dimensions of input matrices
    MRJobConfiguration.setMatricesDimensions(job, inputIndexes, rlens, clens);
    //set up the block size
    MRJobConfiguration.setBlocksSizes(job, inputIndexes, brlens, bclens);
    //set up unary instructions that will perform in the mapper
    MRJobConfiguration.setInstructionsInMapper(job, "");
    //set up the aggregate instructions that will happen in the combiner and reducer
    MRJobConfiguration.setAggregateInstructions(job, "");
    //set up the instructions that will happen in the reducer, after the aggregation instructions
    MRJobConfiguration.setInstructionsInReducer(job, "");
    MRJobConfiguration.setCombineInstructions(job, combineInstructions);
    //set up the replication factor for the results
    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
    //set up custom map/reduce configurations 
    DMLConfig config = ConfigurationManager.getDMLConfig();
    MRJobConfiguration.setupCustomMRConfigurations(job, config);
    //set up what matrices are needed to pass from the mapper to reducer
    HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, inputIndexes, null, null, combineInstructions, resultIndexes);
    //set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, null, outputs, outputInfos, inBlockRepresentation);
    // configure mapper and the mapper output key value pairs
    job.setMapperClass(GMRMapper.class);
    job.setMapOutputKeyClass(MatrixIndexes.class);
    if (inBlockRepresentation)
        job.setMapOutputValueClass(TaggedMatrixBlock.class);
    else
        job.setMapOutputValueClass(TaggedMatrixCell.class);
    //configure reducer
    job.setReducerClass(InnerReducer.class);
    //job.setReducerClass(PassThroughReducer.class);
    MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, inputIndexes, null, null, null, combineInstructions, resultIndexes, mapoutputIndexes, false);
    MatrixCharacteristics[] stats = ret.stats;
    //set up the number of reducers
    MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers);
    // Print the complete instruction
    if (LOG.isTraceEnabled())
        inst.printCompleteMRJobInstruction(stats);
    // By default, the job executes in "cluster" mode.
    // Determine if we can optimize and run it in "local" mode.
    MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
    for (int i = 0; i < inputs.length; i++) {
        inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
    }
    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);
    RunningJob runjob = JobClient.runJob(job);
    return new JobReturn(stats, runjob.isSuccessful());
}
Also used : DMLConfig(org.apache.sysml.conf.DMLConfig) TaggedMatrixBlock(org.apache.sysml.runtime.matrix.data.TaggedMatrixBlock) MatrixChar_N_ReducerGroups(org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration.MatrixChar_N_ReducerGroups) TaggedMatrixCell(org.apache.sysml.runtime.matrix.data.TaggedMatrixCell) RunningJob(org.apache.hadoop.mapred.RunningJob) JobConf(org.apache.hadoop.mapred.JobConf)
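
Every example here blocks on JobClient.runJob(job). The RunningJob handle also supports non-blocking submission with progress polling; a small sketch of that alternative (the five-second poll interval is an arbitrary choice):

import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RunningJob;

public class AsyncSubmitSketch {
    //submits without blocking and polls progress until completion
    public static boolean runAndPoll(JobConf job) throws Exception {
        JobClient client = new JobClient(job);
        RunningJob runjob = client.submitJob(job); //returns immediately
        while (!runjob.isComplete()) {
            System.out.printf("map %.0f%% reduce %.0f%%%n",
                100 * runjob.mapProgress(), 100 * runjob.reduceProgress());
            Thread.sleep(5000); //arbitrary poll interval
        }
        return runjob.isSuccessful();
    }
}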

Example 30 with RunningJob

use of org.apache.hadoop.mapred.RunningJob in project incubator-systemml by apache.

the class CSVReblockMR method runCSVReblockJob.

private static JobReturn runCSVReblockJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens, long[] clens, int[] brlens, int[] bclens, String reblockInstructions, String otherInstructionsInReducer, int numReducers, int replication, byte[] resultIndexes, String[] outputs, OutputInfo[] outputInfos, Path counterFile, String[] smallestFiles) throws Exception {
    JobConf job;
    job = new JobConf(ReblockMR.class);
    job.setJobName("CSV-Reblock-MR");
    byte[] realIndexes = new byte[inputs.length];
    for (byte b = 0; b < realIndexes.length; b++) realIndexes[b] = b;
    //set up the input files and their format information
    MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, false, ConvertTarget.CELL);
    job.setStrings(SMALLEST_FILE_NAME_PER_INPUT, smallestFiles);
    //set up the dimensions of input matrices
    MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);
    //set up the block size
    MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);
    //set up the aggregate instructions that will happen in the combiner and reducer
    MRJobConfiguration.setCSVReblockInstructions(job, reblockInstructions);
    //set up the instructions that will happen in the reducer, after the aggregation instructions
    MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);
    //set up the replication factor for the results
    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
    //set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);
    //set up custom map/reduce configurations 
    DMLConfig config = ConfigurationManager.getDMLConfig();
    MRJobConfiguration.setupCustomMRConfigurations(job, config);
    //set up what matrices are needed to pass from the mapper to reducer
    HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, null, reblockInstructions, null, otherInstructionsInReducer, resultIndexes);
    MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, null, reblockInstructions, null, null, otherInstructionsInReducer, resultIndexes, mapoutputIndexes, false);
    MatrixCharacteristics[] stats = ret.stats;
    //set up the number of reducers
    int numRed = WriteCSVMR.determineNumReducers(rlens, clens, config.getIntValue(DMLConfig.NUM_REDUCERS), ret.numReducerGroups);
    job.setNumReduceTasks(numRed);
    // Print the complete instruction
    //if (LOG.isTraceEnabled())
    //	inst.printCompleteMRJobInstruction(stats);
    // Update resultDimsUnknown based on computed "stats"
    byte[] resultDimsUnknown = new byte[resultIndexes.length];
    for (int i = 0; i < resultIndexes.length; i++) {
        if (stats[i].getRows() == -1 || stats[i].getCols() == -1) {
            resultDimsUnknown[i] = (byte) 1;
        } else {
            resultDimsUnknown[i] = (byte) 0;
        }
    }
    //set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos, true, true);
    // configure mapper and the mapper output key value pairs
    job.setMapperClass(CSVReblockMapper.class);
    job.setMapOutputKeyClass(TaggedFirstSecondIndexes.class);
    job.setMapOutputValueClass(BlockRow.class);
    //configure reducer
    job.setReducerClass(CSVReblockReducer.class);
    //turn off adaptivemr
    job.setBoolean("adaptivemr.map.enable", false);
    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);
    Path cachefile = new Path(counterFile, "part-00000");
    DistributedCache.addCacheFile(cachefile.toUri(), job);
    DistributedCache.createSymlink(job);
    job.set(ROWID_FILE_NAME, cachefile.toString());
    RunningJob runjob = JobClient.runJob(job);
    MapReduceTool.deleteFileIfExistOnHDFS(counterFile, job);
    /* Process different counters */
    Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
    for (int i = 0; i < resultIndexes.length; i++) {
        // number of non-zeros
        stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
    //	System.out.println("result #"+resultIndexes[i]+" ===>\n"+stats[i]);
    }
    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}
Also used : Path(org.apache.hadoop.fs.Path) Group(org.apache.hadoop.mapred.Counters.Group) DMLConfig(org.apache.sysml.conf.DMLConfig) MatrixChar_N_ReducerGroups(org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration.MatrixChar_N_ReducerGroups) RunningJob(org.apache.hadoop.mapred.RunningJob) JobConf(org.apache.hadoop.mapred.JobConf)
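
The DistributedCache calls at the end of runCSVReblockJob ship the row-offset counter file to every task. A minimal sketch of both sides of that mechanism; the path "/tmp/rowid/part-00000" is a placeholder:

import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;

public class CacheFileSketch {
    //driver side: register the file and create a symlink in each task's working dir
    public static void attach(JobConf job) throws Exception {
        Path cachefile = new Path("/tmp/rowid/part-00000"); //placeholder path
        DistributedCache.addCacheFile(cachefile.toUri(), job);
        DistributedCache.createSymlink(job);
    }

    //task side: shipped files surface as node-local paths
    public static Path[] localCopies(JobConf job) throws Exception {
        return DistributedCache.getLocalCacheFiles(job);
    }
}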

Aggregations

RunningJob (org.apache.hadoop.mapred.RunningJob): 61
JobConf (org.apache.hadoop.mapred.JobConf): 45
Path (org.apache.hadoop.fs.Path): 35
FileSystem (org.apache.hadoop.fs.FileSystem): 24
JobClient (org.apache.hadoop.mapred.JobClient): 20
IOException (java.io.IOException): 15
Counters (org.apache.hadoop.mapred.Counters): 14
Group (org.apache.hadoop.mapred.Counters.Group): 13
DMLConfig (org.apache.sysml.conf.DMLConfig): 13
Configuration (org.apache.hadoop.conf.Configuration): 7
MatrixChar_N_ReducerGroups (org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration.MatrixChar_N_ReducerGroups): 7
DataOutputStream (java.io.DataOutputStream): 6
File (java.io.File): 5
FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream): 5
FileStatus (org.apache.hadoop.fs.FileStatus): 5
Text (org.apache.hadoop.io.Text): 5
DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException): 5
InputInfo (org.apache.sysml.runtime.matrix.data.InputInfo): 5
Test (org.junit.Test): 5
URI (java.net.URI): 4