Search in sources :

Example 11 with DMLConfig

use of org.apache.sysml.conf.DMLConfig in project incubator-systemml by apache.

the class CSVReblockMR method runAssignRowIDMRJob.

/**
 * Configures and runs the "Assign-RowID-MR" job and collects the per-input
 * row and column counters it reports.
 *
 * <p>The job uses {@code CSVAssignRowIDMapper} / {@code CSVAssignRowIDReducer}
 * with a single reduce task and writes {@code ByteWritable}/{@code OffsetCount}
 * pairs as a sequence file to a temporary output location, which is returned
 * as {@code counterFile}.
 *
 * @param inputs input files handed to the multiple-inputs setup
 * @param inputInfos format information, one entry per input
 * @param brlens rows per block, one entry per input
 * @param bclens columns per block, one entry per input
 * @param reblockInstructions CSV reblock instructions stored in the job configuration
 * @param replication DFS replication factor for the results
 * @param smallestFiles value stored under {@code SMALLEST_FILE_NAME_PER_INPUT}
 * @param transform whether transform-related properties are set on the job
 * @param naStrings NA strings for transform; skipped when {@code null}
 * @param spec transform specification; only used when {@code transform} is true
 * @return the counter file path plus row/column counts for every input
 * @throws Exception if job setup or execution fails
 */
public static AssignRowIDMRReturn runAssignRowIDMRJob(String[] inputs, InputInfo[] inputInfos, int[] brlens, int[] bclens, String reblockInstructions, int replication, String[] smallestFiles, boolean transform, String naStrings, String spec) throws Exception {
    AssignRowIDMRReturn result = new AssignRowIDMRReturn();
    JobConf job = new JobConf(CSVReblockMR.class);
    job.setJobName("Assign-RowID-MR");

    // every input is tagged with its own position in the inputs array
    byte[] inputTags = new byte[inputs.length];
    for (byte tag = 0; tag < inputTags.length; tag++) {
        inputTags[tag] = tag;
    }

    // inputs and their formats
    MRJobConfiguration.setUpMultipleInputs(job, inputTags, inputs, inputInfos, brlens, bclens, false, ConvertTarget.CELL);
    job.setStrings(SMALLEST_FILE_NAME_PER_INPUT, smallestFiles);

    // reblock instructions and replication factor of the results
    MRJobConfiguration.setCSVReblockInstructions(job, reblockInstructions);
    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);

    // user-defined map/reduce configurations from the DML config
    DMLConfig dmlConfig = ConfigurationManager.getDMLConfig();
    MRJobConfiguration.setupCustomMRConfigurations(job, dmlConfig);

    // a single reducer handles all mapper output
    job.setNumReduceTasks(1);

    // mapper with its intermediate key/value types, then the reducer
    job.setMapperClass(CSVAssignRowIDMapper.class);
    job.setMapOutputKeyClass(ByteWritable.class);
    job.setMapOutputValueClass(OffsetCount.class);
    job.setReducerClass(CSVAssignRowIDReducer.class);

    // adaptive MR is switched off for this job
    job.setBoolean("adaptivemr.map.enable", false);

    // unique working directory
    MRJobConfiguration.setUniqueWorkingDir(job);

    // output: sequence file at a temporary location
    Path counterPath = new Path(MRJobConfiguration.constructTempOutputFilename());
    result.counterFile = counterPath;
    job.setOutputFormat(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(job, result.counterFile);
    job.setOutputKeyClass(ByteWritable.class);
    job.setOutputValueClass(OffsetCount.class);

    // transform-related properties
    job.setBoolean(MRJobConfiguration.TF_TRANSFORM, transform);
    if (transform) {
        if (naStrings != null) {
            // prepNAStrings adds a "dummy" string to handle the case of na_strings = ""
            job.set(MRJobConfiguration.TF_NA_STRINGS, TfUtils.prepNAStrings(naStrings));
        }
        job.set(MRJobConfiguration.TF_SPEC, spec);
    }

    // run the job (blocking) and read back the dimension counters
    RunningJob finished = JobClient.runJob(job);
    Group rowCounters = finished.getCounters().getGroup(NUM_ROWS_IN_MATRIX);
    Group colCounters = finished.getCounters().getGroup(NUM_COLS_IN_MATRIX);

    int numInputs = inputs.length;
    result.rlens = new long[numInputs];
    result.clens = new long[numInputs];
    for (int idx = 0; idx < numInputs; idx++) {
        // counters are keyed by the decimal input index
        String counterName = String.valueOf(idx);
        result.rlens[idx] = rowCounters.getCounter(counterName);
        result.clens[idx] = colCounters.getCounter(counterName);
    }
    return result;
}
Also used : Path(org.apache.hadoop.fs.Path) Group(org.apache.hadoop.mapred.Counters.Group) DMLConfig(org.apache.sysml.conf.DMLConfig) RunningJob(org.apache.hadoop.mapred.RunningJob) JobConf(org.apache.hadoop.mapred.JobConf)

Example 12 with DMLConfig

use of org.apache.sysml.conf.DMLConfig in project incubator-systemml by apache.

the class ParForDependencyAnalysisTest method runTest.

/**
 * Parses the given DML script, runs parse-tree validation on it and asserts
 * that a {@code LanguageException} was raised exactly when expected.
 *
 * <p>Any other exception is printed and rethrown wrapped in a
 * {@code RuntimeException}, which fails the test.
 *
 * @param scriptFilename script file name, resolved relative to {@code HOME};
 *        the part before the last ".dml" is used as the test name
 * @param expectedException whether validation is expected to raise a
 *        {@code LanguageException}
 */
private void runTest(String scriptFilename, boolean expectedException) {
    boolean raisedException = false;
    try {
        // Tell the superclass about the name of this test, so that the superclass can
        // create temporary directories.
        int index = scriptFilename.lastIndexOf(".dml");
        String testName = scriptFilename.substring(0, index > 0 ? index : scriptFilename.length());
        TestConfiguration testConfig = new TestConfiguration(TEST_CLASS_DIR, testName, new String[] {});
        addTestConfiguration(testName, testConfig);
        loadTestConfiguration(testConfig);
        DMLConfig conf = new DMLConfig(getCurConfigFile().getPath());
        ConfigurationManager.setLocalConfig(conf);
        HashMap<String, String> argVals = new HashMap<String, String>();
        //read script line by line; a builder avoids quadratic string concatenation
        StringBuilder script = new StringBuilder();
        try (BufferedReader in = new BufferedReader(new FileReader(HOME + scriptFilename))) {
            String line;
            while ((line = in.readLine()) != null) {
                script.append(line).append('\n');
            }
        }
        String dmlScriptString = script.toString();
        //parsing and dependency analysis
        ParserWrapper parser = ParserFactory.createParser(org.apache.sysml.api.mlcontext.ScriptType.DML);
        DMLProgram prog = parser.parse(DMLScript.DML_FILE_PATH_ANTLR_PARSER, dmlScriptString, argVals);
        DMLTranslator dmlt = new DMLTranslator(prog);
        dmlt.validateParseTree(prog);
    } catch (LanguageException ex) {
        raisedException = true;
        // only report the stack trace when the exception was not anticipated
        if (!expectedException) {
            ex.printStackTrace();
        }
    } catch (Exception ex2) {
        ex2.printStackTrace();
        throw new RuntimeException(ex2);
    }
    //check correctness
    Assert.assertEquals(expectedException, raisedException);
}
Also used : DMLConfig(org.apache.sysml.conf.DMLConfig) HashMap(java.util.HashMap) TestConfiguration(org.apache.sysml.test.integration.TestConfiguration) DMLTranslator(org.apache.sysml.parser.DMLTranslator) LanguageException(org.apache.sysml.parser.LanguageException) LanguageException(org.apache.sysml.parser.LanguageException) BufferedReader(java.io.BufferedReader) DMLProgram(org.apache.sysml.parser.DMLProgram) FileReader(java.io.FileReader) ParserWrapper(org.apache.sysml.parser.ParserWrapper)

Example 13 with DMLConfig

use of org.apache.sysml.conf.DMLConfig in project incubator-systemml by apache.

the class RemoteDPParForMR method runJob.

/**
 * Configures and runs the "ParFor-DPEMR" MapReduce job: the mappers
 * ({@code DataPartitionerRemoteMapper}) partition the input matrix and the
 * reducers ({@code RemoteDPParWorkerReducer}) execute the given program.
 *
 * <p>The result file is deleted before the job starts and again once the
 * results have been read (or the job has failed).
 *
 * @param pfid parfor id, appended to the job name
 * @param itervar iteration variable name, stored in the partitioning info
 * @param matrixvar matrix variable name, stored in the partitioning info
 * @param program serialized program blocks to run in the reducers
 * @param resultFile output path of the job
 * @param input input matrix (file name, dimensions and block sizes are read from it)
 * @param dpf partition format
 * @param oi output info of the partitioning intermediates; must be binary block or binary cell
 * @param tSparseCol flag stored in the partitioning info
 * @param enableCPCaching whether caching is enabled in the remote workers
 * @param numReducers number of reduce tasks
 * @param replication DFS replication factor for the results
 * @return job success flag, number of tasks/iterations and the result variables
 * @throws DMLRuntimeException if job setup, execution, result reading or cleanup fails
 */
public static RemoteParForJobReturn runJob(long pfid, String itervar, String matrixvar, String program,
String resultFile, MatrixObject input, //config params
PartitionFormat dpf, OutputInfo oi, boolean tSparseCol, //config params
boolean enableCPCaching, int numReducers, int replication) //opt params
throws DMLRuntimeException {
    RemoteParForJobReturn ret = null;
    String jobname = "ParFor-DPEMR";
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
    JobConf job = new JobConf(RemoteDPParForMR.class);
    job.setJobName(jobname + pfid);
    //maintain dml script counters
    Statistics.incrementNoOfCompiledMRJobs();
    // remembers the primary failure so that a cleanup failure cannot mask it
    DMLRuntimeException failure = null;
    try {
        /////
        //configure the MR job
        //set arbitrary CP program blocks that will perform in the reducers
        MRJobConfiguration.setProgramBlocks(job, program);
        //enable/disable caching
        MRJobConfiguration.setParforCachingConfig(job, enableCPCaching);
        //setup input matrix
        Path path = new Path(input.getFileName());
        long rlen = input.getNumRows();
        long clen = input.getNumColumns();
        int brlen = (int) input.getNumRowsPerBlock();
        int bclen = (int) input.getNumColumnsPerBlock();
        MRJobConfiguration.setPartitioningInfo(job, rlen, clen, brlen, bclen, InputInfo.BinaryBlockInputInfo, oi, dpf._dpf, dpf._N, input.getFileName(), itervar, matrixvar, tSparseCol);
        job.setInputFormat(InputInfo.BinaryBlockInputInfo.inputFormatClass);
        FileInputFormat.setInputPaths(job, path);
        //set mapper and reducers classes
        job.setMapperClass(DataPartitionerRemoteMapper.class);
        job.setReducerClass(RemoteDPParWorkerReducer.class);
        //set output format
        job.setOutputFormat(SequenceFileOutputFormat.class);
        //set output path
        MapReduceTool.deleteFileIfExistOnHDFS(resultFile);
        FileOutputFormat.setOutputPath(job, new Path(resultFile));
        //set the output key, value schema
        //parfor partitioning outputs (intermediates)
        job.setMapOutputKeyClass(LongWritable.class);
        if (oi == OutputInfo.BinaryBlockOutputInfo)
            job.setMapOutputValueClass(PairWritableBlock.class);
        else if (oi == OutputInfo.BinaryCellOutputInfo)
            job.setMapOutputValueClass(PairWritableCell.class);
        else
            throw new DMLRuntimeException("Unsupported intermrediate output info: " + oi);
        //parfor exec output
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        //////
        //set optimization parameters
        //set the number of mappers and reducers 
        job.setNumReduceTasks(numReducers);
        //disable automatic tasks timeouts and speculative task exec
        job.setInt(MRConfigurationNames.MR_TASK_TIMEOUT, 0);
        job.setMapSpeculativeExecution(false);
        //set up preferred custom serialization framework for binary block format
        if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
            MRJobConfiguration.addBinaryBlockSerializationFramework(job);
        //set up map/reduce memory configurations (if in AM context)
        DMLConfig config = ConfigurationManager.getDMLConfig();
        DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);
        //set up custom map/reduce configurations 
        MRJobConfiguration.setupCustomMRConfigurations(job, config);
        //disable JVM reuse
        //-1 for unlimited 
        job.setNumTasksToExecutePerJvm(1);
        //set the replication factor for the results
        job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
        //set the max number of retries per map task
        //note: currently disabled to use cluster config
        //job.setInt(MRConfigurationNames.MR_MAP_MAXATTEMPTS, max_retry);
        //set unique working dir
        MRJobConfiguration.setUniqueWorkingDir(job);
        /////
        // execute the MR job			
        RunningJob runjob = JobClient.runJob(job);
        // Process different counters 
        Statistics.incrementNoOfExecutedMRJobs();
        Group pgroup = runjob.getCounters().getGroup(ParForProgramBlock.PARFOR_COUNTER_GROUP_NAME);
        int numTasks = (int) pgroup.getCounter(Stat.PARFOR_NUMTASKS.toString());
        int numIters = (int) pgroup.getCounter(Stat.PARFOR_NUMITERS.toString());
        if (DMLScript.STATISTICS && !InfrastructureAnalyzer.isLocalMode()) {
            Statistics.incrementJITCompileTime(pgroup.getCounter(Stat.PARFOR_JITCOMPILE.toString()));
            Statistics.incrementJVMgcCount(pgroup.getCounter(Stat.PARFOR_JVMGC_COUNT.toString()));
            Statistics.incrementJVMgcTime(pgroup.getCounter(Stat.PARFOR_JVMGC_TIME.toString()));
            Group cgroup = runjob.getCounters().getGroup(CacheableData.CACHING_COUNTER_GROUP_NAME.toString());
            CacheStatistics.incrementMemHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_MEM.toString()));
            CacheStatistics.incrementFSBuffHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FSBUFF.toString()));
            CacheStatistics.incrementFSHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FS.toString()));
            CacheStatistics.incrementHDFSHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_HDFS.toString()));
            CacheStatistics.incrementFSBuffWrites((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FSBUFF.toString()));
            CacheStatistics.incrementFSWrites((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FS.toString()));
            CacheStatistics.incrementHDFSWrites((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_HDFS.toString()));
            CacheStatistics.incrementAcquireRTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQR.toString()));
            CacheStatistics.incrementAcquireMTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQM.toString()));
            CacheStatistics.incrementReleaseTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_RLS.toString()));
            CacheStatistics.incrementExportTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_EXP.toString()));
        }
        // read all files of result variables and prepare for return
        LocalVariableMap[] results = readResultFile(job, resultFile);
        ret = new RemoteParForJobReturn(runjob.isSuccessful(), numTasks, numIters, results);
    } catch (Exception ex) {
        failure = new DMLRuntimeException(ex);
        throw failure;
    } finally {
        // remove created files 
        try {
            MapReduceTool.deleteFileIfExistOnHDFS(new Path(resultFile), job);
        } catch (IOException ex) {
            // An exception thrown from finally would replace the in-flight one;
            // keep the job failure as the primary error and attach the cleanup error.
            if (failure != null)
                failure.addSuppressed(ex);
            else
                throw new DMLRuntimeException(ex);
        }
    }
    if (DMLScript.STATISTICS) {
        long t1 = System.nanoTime();
        Statistics.maintainCPHeavyHitters("MR-Job_" + jobname, t1 - t0);
    }
    return ret;
}
Also used : Path(org.apache.hadoop.fs.Path) Group(org.apache.hadoop.mapred.Counters.Group) DMLConfig(org.apache.sysml.conf.DMLConfig) IOException(java.io.IOException) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) IOException(java.io.IOException) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) PairWritableBlock(org.apache.sysml.runtime.controlprogram.parfor.util.PairWritableBlock) LocalVariableMap(org.apache.sysml.runtime.controlprogram.LocalVariableMap) RunningJob(org.apache.hadoop.mapred.RunningJob) JobConf(org.apache.hadoop.mapred.JobConf)

Example 14 with DMLConfig

use of org.apache.sysml.conf.DMLConfig in project incubator-systemml by apache.

the class ResultMergeRemoteMR method executeMerge.

/**
 * Runs the "ParFor-RMMR" MapReduce job that merges the given source files
 * into {@code fnameNew}, optionally using {@code fname} as compare input.
 *
 * @param fname compare file; {@code null} means merge without compare
 * @param fnameNew output file of the merge (deleted first if it exists)
 * @param srcFnames source files to merge
 * @param ii input format of the sources (and the compare file)
 * @param oi output format; selects key/value classes and, for binary block,
 *        the custom partitioning, grouping and sorting classes
 * @param rlen number of rows
 * @param clen number of columns
 * @param brlen rows per block
 * @param bclen columns per block
 * @throws DMLRuntimeException if job setup or execution fails
 */
@SuppressWarnings({ "unused", "deprecation" })
protected void executeMerge(String fname, String fnameNew, String[] srcFnames, InputInfo ii, OutputInfo oi, long rlen, long clen, int brlen, int bclen) throws DMLRuntimeException {
    String jobname = "ParFor-RMMR";
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
    JobConf job = new JobConf(ResultMergeRemoteMR.class);
    job.setJobName(jobname + _pfid);
    //maintain dml script counters
    Statistics.incrementNoOfCompiledMRJobs();
    //warning for textcell/binarycell without compare
    boolean withCompare = (fname != null);
    if ((oi == OutputInfo.TextCellOutputInfo || oi == OutputInfo.BinaryCellOutputInfo) && !withCompare && ResultMergeLocalFile.ALLOW_COPY_CELLFILES)
        LOG.warn("Result merge for " + OutputInfo.outputInfoToString(oi) + " without compare can be realized more efficiently with LOCAL_FILE than REMOTE_MR.");
    try {
        Path pathCompare = null;
        Path pathNew = new Path(fnameNew);
        //configure the MR job
        if (withCompare) {
            FileSystem fs = IOUtilFunctions.getFileSystem(pathNew, job);
            pathCompare = new Path(fname).makeQualified(fs);
            MRJobConfiguration.setResultMergeInfo(job, pathCompare.toString(), ii, LocalFileUtils.getWorkingDir(LocalFileUtils.CATEGORY_RESULTMERGE), rlen, clen, brlen, bclen);
        } else {
            // same block sizes as in the compare branch (brlen, bclen); the
            // previous code passed bclen twice, which is wrong for non-square blocks
            MRJobConfiguration.setResultMergeInfo(job, "null", ii, LocalFileUtils.getWorkingDir(LocalFileUtils.CATEGORY_RESULTMERGE), rlen, clen, brlen, bclen);
        }
        //set mappers, reducers, combiners
        job.setMapperClass(ResultMergeRemoteMapper.class);
        job.setReducerClass(ResultMergeRemoteReducer.class);
        if (oi == OutputInfo.TextCellOutputInfo) {
            job.setMapOutputKeyClass(MatrixIndexes.class);
            job.setMapOutputValueClass(TaggedMatrixCell.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(Text.class);
        } else if (oi == OutputInfo.BinaryCellOutputInfo) {
            job.setMapOutputKeyClass(MatrixIndexes.class);
            job.setMapOutputValueClass(TaggedMatrixCell.class);
            job.setOutputKeyClass(MatrixIndexes.class);
            job.setOutputValueClass(MatrixCell.class);
        } else if (oi == OutputInfo.BinaryBlockOutputInfo) {
            //setup partitioning, grouping, sorting for composite key (old API)
            //partitioning
            job.setPartitionerClass(ResultMergeRemotePartitioning.class);
            //grouping
            job.setOutputValueGroupingComparator(ResultMergeRemoteGrouping.class);
            //sorting
            job.setOutputKeyComparatorClass(ResultMergeRemoteSorting.class);
            job.setMapOutputKeyClass(ResultMergeTaggedMatrixIndexes.class);
            job.setMapOutputValueClass(TaggedMatrixBlock.class);
            job.setOutputKeyClass(MatrixIndexes.class);
            job.setOutputValueClass(MatrixBlock.class);
        }
        //set input format 
        job.setInputFormat(ii.inputFormatClass);
        //set the input path: the compare file (if any) comes first, then all sources
        Path[] paths = null;
        if (withCompare) {
            paths = new Path[srcFnames.length + 1];
            paths[0] = pathCompare;
            for (int i = 1; i < paths.length; i++) paths[i] = new Path(srcFnames[i - 1]);
        } else {
            paths = new Path[srcFnames.length];
            for (int i = 0; i < paths.length; i++) paths[i] = new Path(srcFnames[i]);
        }
        FileInputFormat.setInputPaths(job, paths);
        //set output format
        job.setOutputFormat(oi.outputFormatClass);
        //set output path
        MapReduceTool.deleteFileIfExistOnHDFS(fnameNew);
        FileOutputFormat.setOutputPath(job, pathNew);
        //////
        //set optimization parameters
        //set the number of mappers and reducers 
        //job.setNumMapTasks( _numMappers ); //use default num mappers
        //cap the configured number of reducers by the number of reducer groups
        long reducerGroups;
        if (oi == OutputInfo.BinaryBlockOutputInfo)
            reducerGroups = Math.max(rlen / brlen, 1) * Math.max(clen / bclen, 1);
        else
            //textcell/binarycell
            reducerGroups = Math.max((rlen * clen) / StagingFileUtils.CELL_BUFFER_SIZE, 1);
        job.setNumReduceTasks((int) Math.min(_numReducers, reducerGroups));
        //disable automatic tasks timeouts and speculative task exec
        job.setInt(MRConfigurationNames.MR_TASK_TIMEOUT, 0);
        job.setMapSpeculativeExecution(false);
        //set up preferred custom serialization framework for binary block format
        if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
            MRJobConfiguration.addBinaryBlockSerializationFramework(job);
        //set up custom map/reduce configurations 
        DMLConfig config = ConfigurationManager.getDMLConfig();
        MRJobConfiguration.setupCustomMRConfigurations(job, config);
        //enables the reuse of JVMs (multiple tasks per MR task)
        if (_jvmReuse)
            //unlimited
            job.setNumTasksToExecutePerJvm(-1);
        //enables compression - not conclusive for different codecs (empirically good compression ratio, but significantly slower)
        //job.set(MRConfigurationNames.MR_MAP_OUTPUT_COMPRESS, "true");
        //job.set(MRConfigurationNames.MR_MAP_OUTPUT_COMPRESS_CODEC, "org.apache.hadoop.io.compress.GzipCodec");
        //set the replication factor for the results
        job.setInt(MRConfigurationNames.DFS_REPLICATION, _replication);
        //set the max number of retries per map task
        //  disabled job-level configuration to respect cluster configuration
        //  note: this refers to hadoop2, hence it never had effect on mr1
        //job.setInt(MRConfigurationNames.MR_MAP_MAXATTEMPTS, _max_retry);
        //set unique working dir
        MRJobConfiguration.setUniqueWorkingDir(job);
        /////
        // execute the MR job	
        JobClient.runJob(job);
        //maintain dml script counters
        Statistics.incrementNoOfExecutedMRJobs();
    } catch (Exception ex) {
        throw new DMLRuntimeException(ex);
    }
    if (DMLScript.STATISTICS) {
        long t1 = System.nanoTime();
        Statistics.maintainCPHeavyHitters("MR-Job_" + jobname, t1 - t0);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) DMLConfig(org.apache.sysml.conf.DMLConfig) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) TaggedMatrixCell(org.apache.sysml.runtime.matrix.data.TaggedMatrixCell) FileSystem(org.apache.hadoop.fs.FileSystem) MatrixCell(org.apache.sysml.runtime.matrix.data.MatrixCell) TaggedMatrixCell(org.apache.sysml.runtime.matrix.data.TaggedMatrixCell) JobConf(org.apache.hadoop.mapred.JobConf)

Example 15 with DMLConfig

use of org.apache.sysml.conf.DMLConfig in project incubator-systemml by apache.

the class DataGenMR method runJob.

/**
	 * <p>Starts a Rand MapReduce job which will produce one or more random objects.</p>
	 * 
	 * @param inst MR job instruction
	 * @param dataGenInstructions array of data gen instructions
	 * @param instructionsInMapper instructions in mapper
	 * @param aggInstructionsInReducer aggregate instructions in reducer
	 * @param otherInstructionsInReducer other instructions in reducer
	 * @param numReducers number of reducers
	 * @param replication file replication
	 * @param resultIndexes result indexes for each random object
	 * @param dimsUnknownFilePrefix file path prefix when dimensions unknown
	 * @param outputs output file for each random object
	 * @param outputInfos output information for each random object
	 * @return matrix characteristics for each random object
	 * @throws Exception if Exception occurs
	 */
public static JobReturn runJob(MRJobInstruction inst, String[] dataGenInstructions, String instructionsInMapper, String aggInstructionsInReducer, String otherInstructionsInReducer, int numReducers, int replication, byte[] resultIndexes, String dimsUnknownFilePrefix, String[] outputs, OutputInfo[] outputInfos) throws Exception {
    JobConf job = new JobConf(DataGenMR.class);
    job.setJobName("DataGen-MR");
    //whether use block representation or cell representation
    MRJobConfiguration.setMatrixValueClass(job, true);
    byte[] realIndexes = new byte[dataGenInstructions.length];
    for (byte b = 0; b < realIndexes.length; b++) realIndexes[b] = b;
    String[] inputs = new String[dataGenInstructions.length];
    InputInfo[] inputInfos = new InputInfo[dataGenInstructions.length];
    long[] rlens = new long[dataGenInstructions.length];
    long[] clens = new long[dataGenInstructions.length];
    int[] brlens = new int[dataGenInstructions.length];
    int[] bclens = new int[dataGenInstructions.length];
    FileSystem fs = FileSystem.get(job);
    String dataGenInsStr = "";
    int numblocks = 0;
    int maxbrlen = -1, maxbclen = -1;
    double maxsparsity = -1;
    for (int i = 0; i < dataGenInstructions.length; i++) {
        dataGenInsStr = dataGenInsStr + Lop.INSTRUCTION_DELIMITOR + dataGenInstructions[i];
        MRInstruction mrins = MRInstructionParser.parseSingleInstruction(dataGenInstructions[i]);
        MRINSTRUCTION_TYPE mrtype = mrins.getMRInstructionType();
        DataGenMRInstruction genInst = (DataGenMRInstruction) mrins;
        rlens[i] = genInst.getRows();
        clens[i] = genInst.getCols();
        brlens[i] = genInst.getRowsInBlock();
        bclens[i] = genInst.getColsInBlock();
        maxbrlen = Math.max(maxbrlen, brlens[i]);
        maxbclen = Math.max(maxbclen, bclens[i]);
        if (mrtype == MRINSTRUCTION_TYPE.Rand) {
            RandInstruction randInst = (RandInstruction) mrins;
            inputs[i] = LibMatrixDatagen.generateUniqueSeedPath(genInst.getBaseDir());
            maxsparsity = Math.max(maxsparsity, randInst.getSparsity());
            PrintWriter pw = null;
            try {
                pw = new PrintWriter(fs.create(new Path(inputs[i])));
                //for obj reuse and preventing repeated buffer re-allocations
                StringBuilder sb = new StringBuilder();
                //seed generation
                Well1024a bigrand = LibMatrixDatagen.setupSeedsForRand(randInst.getSeed());
                LongStream nnz = LibMatrixDatagen.computeNNZperBlock(rlens[i], clens[i], brlens[i], bclens[i], randInst.getSparsity());
                PrimitiveIterator.OfLong nnzIter = nnz.iterator();
                for (long r = 0; r < rlens[i]; r += brlens[i]) {
                    long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r));
                    for (long c = 0; c < clens[i]; c += bclens[i]) {
                        long curBlockColSize = Math.min(bclens[i], (clens[i] - c));
                        sb.append((r / brlens[i]) + 1);
                        sb.append(',');
                        sb.append((c / bclens[i]) + 1);
                        sb.append(',');
                        sb.append(curBlockRowSize);
                        sb.append(',');
                        sb.append(curBlockColSize);
                        sb.append(',');
                        sb.append(nnzIter.nextLong());
                        sb.append(',');
                        sb.append(bigrand.nextLong());
                        pw.println(sb.toString());
                        sb.setLength(0);
                        numblocks++;
                    }
                }
            } finally {
                IOUtilFunctions.closeSilently(pw);
            }
            inputInfos[i] = InputInfo.TextCellInputInfo;
        } else if (mrtype == MRINSTRUCTION_TYPE.Seq) {
            SeqInstruction seqInst = (SeqInstruction) mrins;
            inputs[i] = genInst.getBaseDir() + System.currentTimeMillis() + ".seqinput";
            //always dense
            maxsparsity = 1.0;
            double from = seqInst.fromValue;
            double to = seqInst.toValue;
            double incr = seqInst.incrValue;
            //handle default 1 to -1 for special case of from>to
            incr = LibMatrixDatagen.updateSeqIncr(from, to, incr);
            // Correctness checks on (from, to, incr)
            boolean neg = (from > to);
            if (incr == 0)
                throw new DMLRuntimeException("Invalid value for \"increment\" in seq().");
            if (neg != (incr < 0))
                throw new DMLRuntimeException("Wrong sign for the increment in a call to seq()");
            // Compute the number of rows in the sequence
            long numrows = 1 + (long) Math.floor((to - from) / incr);
            if (rlens[i] > 0) {
                if (numrows != rlens[i])
                    throw new DMLRuntimeException("Unexpected error while processing sequence instruction. Expected number of rows does not match given number: " + rlens[i] + " != " + numrows);
            } else {
                rlens[i] = numrows;
            }
            if (clens[i] > 0 && clens[i] != 1)
                throw new DMLRuntimeException("Unexpected error while processing sequence instruction. Number of columns (" + clens[i] + ") must be equal to 1.");
            else
                clens[i] = 1;
            PrintWriter pw = null;
            try {
                pw = new PrintWriter(fs.create(new Path(inputs[i])));
                StringBuilder sb = new StringBuilder();
                double temp = from;
                double block_from, block_to;
                for (long r = 0; r < rlens[i]; r += brlens[i]) {
                    long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r));
                    // block (bid_i,bid_j) generates a sequence from the interval [block_from, block_to] (inclusive of both end points of the interval) 
                    long bid_i = ((r / brlens[i]) + 1);
                    long bid_j = 1;
                    block_from = temp;
                    block_to = temp + (curBlockRowSize - 1) * incr;
                    // next block starts from here
                    temp = block_to + incr;
                    sb.append(bid_i);
                    sb.append(',');
                    sb.append(bid_j);
                    sb.append(',');
                    sb.append(block_from);
                    sb.append(',');
                    sb.append(block_to);
                    sb.append(',');
                    sb.append(incr);
                    pw.println(sb.toString());
                    sb.setLength(0);
                    numblocks++;
                }
            } finally {
                IOUtilFunctions.closeSilently(pw);
            }
            inputInfos[i] = InputInfo.TextCellInputInfo;
        } else {
            throw new DMLRuntimeException("Unexpected Data Generation Instruction Type: " + mrtype);
        }
    }
    //remove the first ","
    dataGenInsStr = dataGenInsStr.substring(1);
    RunningJob runjob;
    MatrixCharacteristics[] stats;
    try {
        //set up the block size
        MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);
        //set up the input files and their format information
        MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, false, ConvertTarget.BLOCK);
        //set up the dimensions of input matrices
        MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);
        MRJobConfiguration.setDimsUnknownFilePrefix(job, dimsUnknownFilePrefix);
        //set up the block size
        MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);
        //set up the rand Instructions
        MRJobConfiguration.setRandInstructions(job, dataGenInsStr);
        //set up unary instructions that will perform in the mapper
        MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper);
        //set up the aggregate instructions that will happen in the combiner and reducer
        MRJobConfiguration.setAggregateInstructions(job, aggInstructionsInReducer);
        //set up the instructions that will happen in the reducer, after the aggregation instrucions
        MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);
        //set up the replication factor for the results
        job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
        //set up map/reduce memory configurations (if in AM context)
        DMLConfig config = ConfigurationManager.getDMLConfig();
        DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);
        //set up custom map/reduce configurations 
        MRJobConfiguration.setupCustomMRConfigurations(job, config);
        //determine degree of parallelism (nmappers: 1<=n<=capacity)
        //TODO use maxsparsity whenever we have a way of generating sparse rand data
        int capacity = InfrastructureAnalyzer.getRemoteParallelMapTasks();
        long dfsblocksize = InfrastructureAnalyzer.getHDFSBlockSize();
        //correction max number of mappers on yarn clusters
        if (InfrastructureAnalyzer.isYarnEnabled())
            capacity = (int) Math.max(capacity, YarnClusterAnalyzer.getNumCores());
        int nmapers = Math.max(Math.min((int) (8 * maxbrlen * maxbclen * (long) numblocks / dfsblocksize), capacity), 1);
        job.setNumMapTasks(nmapers);
        //set up what matrices are needed to pass from the mapper to reducer
        HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, dataGenInsStr, instructionsInMapper, null, aggInstructionsInReducer, otherInstructionsInReducer, resultIndexes);
        MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, dataGenInsStr, instructionsInMapper, null, aggInstructionsInReducer, null, otherInstructionsInReducer, resultIndexes, mapoutputIndexes, false);
        stats = ret.stats;
        //set up the number of reducers
        MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers);
        // print the complete MRJob instruction
        if (LOG.isTraceEnabled())
            inst.printCompleteMRJobInstruction(stats);
        // Update resultDimsUnknown based on computed "stats"
        byte[] resultDimsUnknown = new byte[resultIndexes.length];
        for (int i = 0; i < resultIndexes.length; i++) {
            if (stats[i].getRows() == -1 || stats[i].getCols() == -1) {
                resultDimsUnknown[i] = (byte) 1;
            } else {
                resultDimsUnknown[i] = (byte) 0;
            }
        }
        boolean mayContainCtable = instructionsInMapper.contains("ctabletransform") || instructionsInMapper.contains("groupedagg");
        //set up the multiple output files, and their format information
        MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos, true, mayContainCtable);
        // configure mapper and the mapper output key value pairs
        job.setMapperClass(DataGenMapper.class);
        if (numReducers == 0) {
            job.setMapOutputKeyClass(Writable.class);
            job.setMapOutputValueClass(Writable.class);
        } else {
            job.setMapOutputKeyClass(MatrixIndexes.class);
            job.setMapOutputValueClass(TaggedMatrixBlock.class);
        }
        //set up combiner
        if (numReducers != 0 && aggInstructionsInReducer != null && !aggInstructionsInReducer.isEmpty())
            job.setCombinerClass(GMRCombiner.class);
        //configure reducer
        job.setReducerClass(GMRReducer.class);
        //job.setReducerClass(PassThroughReducer.class);
        // By default, the job executes in "cluster" mode.
        // Determine if we can optimize and run it in "local" mode.
        MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
        for (int i = 0; i < inputs.length; i++) {
            inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
        }
        //set unique working dir
        MRJobConfiguration.setUniqueWorkingDir(job);
        runjob = JobClient.runJob(job);
        /* Process different counters */
        Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
        for (int i = 0; i < resultIndexes.length; i++) {
            // number of non-zeros
            stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
        }
        String dir = dimsUnknownFilePrefix + "/" + runjob.getID().toString() + "_dimsFile";
        stats = MapReduceTool.processDimsFiles(dir, stats);
        MapReduceTool.deleteFileIfExistOnHDFS(dir);
    } finally {
        for (String input : inputs) MapReduceTool.deleteFileIfExistOnHDFS(new Path(input), job);
    }
    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}
Also used : Group(org.apache.hadoop.mapred.Counters.Group) DataGenMRInstruction(org.apache.sysml.runtime.instructions.mr.DataGenMRInstruction) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) GMRCombiner(org.apache.sysml.runtime.matrix.mapred.GMRCombiner) FileSystem(org.apache.hadoop.fs.FileSystem) DataGenMRInstruction(org.apache.sysml.runtime.instructions.mr.DataGenMRInstruction) MRInstruction(org.apache.sysml.runtime.instructions.mr.MRInstruction) JobConf(org.apache.hadoop.mapred.JobConf) PrintWriter(java.io.PrintWriter) Path(org.apache.hadoop.fs.Path) DMLConfig(org.apache.sysml.conf.DMLConfig) PrimitiveIterator(java.util.PrimitiveIterator) SeqInstruction(org.apache.sysml.runtime.instructions.mr.SeqInstruction) LongStream(java.util.stream.LongStream) RandInstruction(org.apache.sysml.runtime.instructions.mr.RandInstruction) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) MatrixChar_N_ReducerGroups(org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration.MatrixChar_N_ReducerGroups) MRINSTRUCTION_TYPE(org.apache.sysml.runtime.instructions.mr.MRInstruction.MRINSTRUCTION_TYPE) RunningJob(org.apache.hadoop.mapred.RunningJob) Well1024a(org.apache.commons.math3.random.Well1024a)

Aggregations

DMLConfig (org.apache.sysml.conf.DMLConfig): 31; JobConf (org.apache.hadoop.mapred.JobConf): 17; RunningJob (org.apache.hadoop.mapred.RunningJob): 13; Group (org.apache.hadoop.mapred.Counters.Group): 11; DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException): 11; Path (org.apache.hadoop.fs.Path): 10; MatrixChar_N_ReducerGroups (org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration.MatrixChar_N_ReducerGroups): 7; IOException (java.io.IOException): 6; DMLProgram (org.apache.sysml.parser.DMLProgram): 6; DMLTranslator (org.apache.sysml.parser.DMLTranslator): 5; FileSystem (org.apache.hadoop.fs.FileSystem): 4; ParserWrapper (org.apache.sysml.parser.ParserWrapper): 4; InputInfo (org.apache.sysml.runtime.matrix.data.InputInfo): 4; HashMap (java.util.HashMap): 3; LanguageException (org.apache.sysml.parser.LanguageException): 3; TaggedFirstSecondIndexes (org.apache.sysml.runtime.matrix.data.TaggedFirstSecondIndexes): 3; BufferedReader (java.io.BufferedReader): 2; FileReader (java.io.FileReader): 2; ArrayList (java.util.ArrayList): 2; CompilerConfig (org.apache.sysml.conf.CompilerConfig): 2