
Example 1 with JobReturn

use of org.apache.sysml.runtime.matrix.JobReturn in project incubator-systemml by apache.

the class DataTransform method mrDataTransform.

/**
	 * Main method to create and/or apply transformation metadata using MapReduce.
	 * 
	 * @param jobinst MR job instruction
	 * @param inputs array of input matrices
	 * @param shuffleInst shuffle instructions
	 * @param otherInst other instructions
	 * @param resultIndices byte array of result indices
	 * @param outputs array of output matrices
	 * @param numReducers number of reducers
	 * @param replication HDFS file replication factor
	 * @return MR job result
	 * @throws Exception if IOException occurs
	 */
public static JobReturn mrDataTransform(MRJobInstruction jobinst, MatrixObject[] inputs, String shuffleInst, String otherInst, byte[] resultIndices, MatrixObject[] outputs, int numReducers, int replication) throws Exception {
    String[] insts = shuffleInst.split(Instruction.INSTRUCTION_DELIM);
    // Parse transform instruction (the first instruction) to obtain relevant fields
    TransformOperands oprnds = new TransformOperands(insts[0], inputs[0]);
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    // find the first file in alphabetical ordering of part files in directory inputPath 
    String smallestFile = CSVReblockMR.findSmallestFile(job, oprnds.inputPath);
    // find column names
    FileSystem fs = IOUtilFunctions.getFileSystem(smallestFile);
    String headerLine = readHeaderLine(fs, oprnds.inputCSVProperties, smallestFile);
    HashMap<String, Integer> colNamesToIds = processColumnNames(fs, oprnds.inputCSVProperties, headerLine, smallestFile);
    String outHeader = getOutputHeader(fs, headerLine, oprnds);
    int numColumns = colNamesToIds.size();
    int numColumnsTf = 0;
    long numRowsTf = 0;
    ArrayList<Integer> csvoutputs = new ArrayList<Integer>();
    ArrayList<Integer> bboutputs = new ArrayList<Integer>();
    // divide output objects based on output format (CSV or BinaryBlock)
    for (int i = 0; i < outputs.length; i++) {
        if (outputs[i].getFileFormatProperties() != null && outputs[i].getFileFormatProperties().getFileFormat() == FileFormatProperties.FileFormat.CSV)
            csvoutputs.add(i);
        else
            bboutputs.add(i);
    }
    boolean isCSV = (csvoutputs.size() > 0);
    boolean isBB = (bboutputs.size() > 0);
    String tmpPath = MRJobConfiguration.constructTempOutputFilename();
    checkIfOutputOverlapsWithTxMtd(outputs, oprnds, isCSV, isBB, csvoutputs, bboutputs, fs);
    JobReturn retCSV = null, retBB = null;
    if (!oprnds.isApply) {
        // build specification file with column IDs instead of column names
        String specWithIDs = processSpecFile(fs, oprnds.inputPath, smallestFile, colNamesToIds, oprnds.inputCSVProperties, oprnds.spec);
        // enable GC on colNamesToIds
        colNamesToIds = null;
        // Build transformation metadata, including recode maps, bin definitions, etc.
        // Also, generate part offsets file (counters file), which is to be used in csv-reblock
        String partOffsetsFile = MRJobConfiguration.constructTempOutputFilename();
        numRowsTf = GenTfMtdMR.runJob(oprnds.inputPath, oprnds.txMtdPath, specWithIDs, smallestFile, partOffsetsFile, oprnds.inputCSVProperties, numColumns, replication, outHeader);
        if (numRowsTf == 0)
            throw new DMLRuntimeException(ERROR_MSG_ZERO_ROWS);
        // store the specFileWithIDs as transformation metadata
        MapReduceTool.writeStringToHDFS(specWithIDs, oprnds.txMtdPath + "/" + "spec.json");
        numColumnsTf = getNumColumnsTf(fs, outHeader, oprnds.inputCSVProperties.getDelim(), oprnds.txMtdPath);
        // Apply transformation metadata, and perform actual transformation 
        if (isCSV)
            retCSV = ApplyTfCSVMR.runJob(oprnds.inputPath, specWithIDs, oprnds.txMtdPath, tmpPath, outputs[csvoutputs.get(0)].getFileName(), partOffsetsFile, oprnds.inputCSVProperties, numColumns, replication, outHeader);
        if (isBB) {
            DMLConfig conf = ConfigurationManager.getDMLConfig();
            int blockSize = conf.getIntValue(DMLConfig.DEFAULT_BLOCK_SIZE);
            CSVReblockInstruction rblk = prepDummyReblockInstruction(oprnds.inputCSVProperties, blockSize);
            AssignRowIDMRReturn ret1 = CSVReblockMR.runAssignRowIDMRJob(new String[] { oprnds.inputPath }, new InputInfo[] { InputInfo.CSVInputInfo }, new int[] { blockSize }, new int[] { blockSize }, rblk.toString(), replication, new String[] { smallestFile }, true, oprnds.inputCSVProperties.getNAStrings(), specWithIDs);
            if (ret1.rlens[0] == 0)
                throw new DMLRuntimeException(ERROR_MSG_ZERO_ROWS);
            retBB = ApplyTfBBMR.runJob(oprnds.inputPath, insts[1], otherInst, specWithIDs, oprnds.txMtdPath, tmpPath, outputs[bboutputs.get(0)].getFileName(), ret1.counterFile.toString(), oprnds.inputCSVProperties, numRowsTf, numColumns, numColumnsTf, replication, outHeader);
        }
        MapReduceTool.deleteFileIfExistOnHDFS(new Path(partOffsetsFile), job);
    } else {
        // enable GC on colNamesToIds
        colNamesToIds = null;
        // copy given transform metadata (applyTxPath) to specified location (txMtdPath)
        MapReduceTool.deleteFileIfExistOnHDFS(new Path(oprnds.txMtdPath), job);
        MapReduceTool.copyFileOnHDFS(oprnds.applyTxPath, oprnds.txMtdPath);
        // path to specification file
        String specWithIDs = (oprnds.spec != null) ? oprnds.spec : MapReduceTool.readStringFromHDFSFile(oprnds.txMtdPath + "/" + "spec.json");
        numColumnsTf = getNumColumnsTf(fs, outHeader, oprnds.inputCSVProperties.getDelim(), oprnds.txMtdPath);
        if (isCSV) {
            DMLConfig conf = ConfigurationManager.getDMLConfig();
            int blockSize = conf.getIntValue(DMLConfig.DEFAULT_BLOCK_SIZE);
            CSVReblockInstruction rblk = prepDummyReblockInstruction(oprnds.inputCSVProperties, blockSize);
            AssignRowIDMRReturn ret1 = CSVReblockMR.runAssignRowIDMRJob(new String[] { oprnds.inputPath }, new InputInfo[] { InputInfo.CSVInputInfo }, new int[] { blockSize }, new int[] { blockSize }, rblk.toString(), replication, new String[] { smallestFile }, true, oprnds.inputCSVProperties.getNAStrings(), specWithIDs);
            numRowsTf = ret1.rlens[0];
            if (ret1.rlens[0] == 0)
                throw new DMLRuntimeException(ERROR_MSG_ZERO_ROWS);
            // Apply transformation metadata, and perform actual transformation 
            retCSV = ApplyTfCSVMR.runJob(oprnds.inputPath, specWithIDs, oprnds.applyTxPath, tmpPath, outputs[csvoutputs.get(0)].getFileName(), ret1.counterFile.toString(), oprnds.inputCSVProperties, numColumns, replication, outHeader);
        }
        if (isBB) {
            // compute part offsets file
            CSVReblockInstruction rblk = (CSVReblockInstruction) InstructionParser.parseSingleInstruction(insts[1]);
            CSVReblockInstruction newrblk = (CSVReblockInstruction) rblk.clone((byte) 0);
            AssignRowIDMRReturn ret1 = CSVReblockMR.runAssignRowIDMRJob(new String[] { oprnds.inputPath }, new InputInfo[] { InputInfo.CSVInputInfo }, new int[] { newrblk.brlen }, new int[] { newrblk.bclen }, newrblk.toString(), replication, new String[] { smallestFile }, true, oprnds.inputCSVProperties.getNAStrings(), specWithIDs);
            numRowsTf = ret1.rlens[0];
            if (ret1.rlens[0] == 0)
                throw new DMLRuntimeException(ERROR_MSG_ZERO_ROWS);
            // apply transformation metadata, as well as reblock the resulting data
            retBB = ApplyTfBBMR.runJob(oprnds.inputPath, insts[1], otherInst, specWithIDs, oprnds.txMtdPath, tmpPath, outputs[bboutputs.get(0)].getFileName(), ret1.counterFile.toString(), oprnds.inputCSVProperties, ret1.rlens[0], ret1.clens[0], numColumnsTf, replication, outHeader);
        }
    }
    // copy auxiliary data (old and new header lines) from temporary location to txMtdPath
    moveFilesFromTmp(fs, tmpPath, oprnds.txMtdPath);
    // generate matrix metadata file for outputs
    if (retCSV != null) {
        retCSV.getMatrixCharacteristics(0).setDimension(numRowsTf, numColumnsTf);
        // use the same header as the input
        CSVFileFormatProperties prop = new CSVFileFormatProperties(false,
                oprnds.inputCSVProperties.getDelim(), false, Double.NaN, null);
        MapReduceTool.writeMetaDataFile(outputs[csvoutputs.get(0)].getFileName() + ".mtd", ValueType.DOUBLE, retCSV.getMatrixCharacteristics(0), OutputInfo.CSVOutputInfo, prop);
        return retCSV;
    }
    if (retBB != null) {
        retBB.getMatrixCharacteristics(0).setDimension(numRowsTf, numColumnsTf);
        MapReduceTool.writeMetaDataFile(outputs[bboutputs.get(0)].getFileName() + ".mtd", ValueType.DOUBLE, retBB.getMatrixCharacteristics(0), OutputInfo.BinaryBlockOutputInfo);
        return retBB;
    }
    return null;
}
Also used : AssignRowIDMRReturn(org.apache.sysml.runtime.matrix.CSVReblockMR.AssignRowIDMRReturn) Path(org.apache.hadoop.fs.Path) DMLConfig(org.apache.sysml.conf.DMLConfig) CSVFileFormatProperties(org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties) CSVReblockInstruction(org.apache.sysml.runtime.instructions.mr.CSVReblockInstruction) ArrayList(java.util.ArrayList) JobReturn(org.apache.sysml.runtime.matrix.JobReturn) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) FileSystem(org.apache.hadoop.fs.FileSystem) JobConf(org.apache.hadoop.mapred.JobConf)
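
A minimal caller sketch for mrDataTransform, not part of the SystemML source: it assumes the MRJobInstruction, input/output MatrixObjects, and instruction strings are already prepared by the runtime, and relies only on getMatrixCharacteristics(0) (used above) plus the standard getRows()/getCols() accessors of MatrixCharacteristics.

// Hedged caller sketch: run the MR transform and report the dimensions of the
// first output. mrDataTransform may return null when no output was produced.
public static void runAndReport(MRJobInstruction jobinst, MatrixObject[] inputs,
        String shuffleInst, String otherInst, byte[] resultIndices,
        MatrixObject[] outputs, int numReducers, int replication) throws Exception {
    JobReturn ret = DataTransform.mrDataTransform(jobinst, inputs, shuffleInst,
            otherInst, resultIndices, outputs, numReducers, replication);
    if (ret != null) {
        MatrixCharacteristics mc = ret.getMatrixCharacteristics(0);
        System.out.println("transformed output: " + mc.getRows() + " x " + mc.getCols());
    }
}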

Example 2 with JobReturn

use of org.apache.sysml.runtime.matrix.JobReturn in project incubator-systemml by apache.

the class DataTransform method performTransform.

/**
	 * Main method to create and/or apply transformation metadata in-memory, on a single node.
	 * 
	 * @param job job configuration
	 * @param fs file system
	 * @param inputPath path to input files
	 * @param ncols number of columns
	 * @param prop csv file format properties
	 * @param specWithIDs JSON transform specification with IDs
	 * @param tfMtdPath transform metadata path
	 * @param isApply true to apply existing transformation metadata, false to construct it first
	 * @param result output matrix
	 * @param headerLine header line
	 * @param isBB true if binary block
	 * @param isCSV true if CSV
	 * @return MR job result
	 * @throws IOException if IOException occurs
	 * @throws DMLRuntimeException if DMLRuntimeException occurs
	 * @throws IllegalArgumentException if IllegalArgumentException occurs
	 * @throws JSONException if JSONException occurs
	 */
private static JobReturn performTransform(JobConf job, FileSystem fs, String inputPath, int ncols, CSVFileFormatProperties prop, String specWithIDs, String tfMtdPath, boolean isApply, MatrixObject result, String headerLine, boolean isBB, boolean isCSV) throws IOException, DMLRuntimeException, IllegalArgumentException, JSONException {
    String[] na = TfUtils.parseNAStrings(prop.getNAStrings());
    JSONObject spec = new JSONObject(specWithIDs);
    TfUtils agents = new TfUtils(headerLine, prop.hasHeader(), prop.getDelim(), na, spec, ncols, tfMtdPath, null, null);
    MVImputeAgent _mia = agents.getMVImputeAgent();
    RecodeAgent _ra = agents.getRecodeAgent();
    BinAgent _ba = agents.getBinAgent();
    DummycodeAgent _da = agents.getDummycodeAgent();
    // List of files to read
    ArrayList<Path> files = collectInputFiles(inputPath, fs);
    // ---------------------------------
    // Construct transformation metadata
    // ---------------------------------
    String line = null;
    String[] words = null;
    int numColumnsTf = 0;
    if (!isApply) {
        for (int fileNo = 0; fileNo < files.size(); fileNo++) {
            try (BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(files.get(fileNo))))) {
                if (fileNo == 0 && prop.hasHeader())
                    //ignore header
                    br.readLine();
                line = null;
                while ((line = br.readLine()) != null) {
                    agents.prepareTfMtd(line);
                }
            }
        }
        if (agents.getValid() == 0)
            throw new DMLRuntimeException(ERROR_MSG_ZERO_ROWS);
        _mia.outputTransformationMetadata(tfMtdPath, fs, agents);
        _ba.outputTransformationMetadata(tfMtdPath, fs, agents);
        _ra.outputTransformationMetadata(tfMtdPath, fs, agents);
        // prepare agents for the subsequent phase of applying transformation metadata
        // NO need to loadTxMtd for _ra, since the maps are already present in the memory
        Path tmp = new Path(tfMtdPath);
        _mia.loadTxMtd(job, fs, tmp, agents);
        _ba.loadTxMtd(job, fs, tmp, agents);
        _da.setRecodeMapsCP(_ra.getCPRecodeMaps());
        _da.setNumBins(_ba.getColList(), _ba.getNumBins());
        _da.loadTxMtd(job, fs, tmp, agents);
    } else {
        // Count the number of rows
        int[] rows = countNumRows(files, prop, fs, agents);
        agents.setTotal(rows[0]);
        agents.setValid(rows[1]);
        if (agents.getValid() == 0)
            throw new DMLRuntimeException("Number of rows in the transformed output (potentially, after ommitting the ones with missing values) is zero. Cannot proceed.");
        // Load transformation metadata
        // prepare agents for the subsequent phase of applying transformation metadata
        Path tmp = new Path(tfMtdPath);
        _mia.loadTxMtd(job, fs, tmp, agents);
        _ra.loadTxMtd(job, fs, tmp, agents);
        _ba.loadTxMtd(job, fs, tmp, agents);
        _da.setRecodeMaps(_ra.getRecodeMaps());
        _da.setNumBins(_ba.getColList(), _ba.getNumBins());
        _da.loadTxMtd(job, fs, tmp, agents);
    }
    // -----------------------------
    // Apply transformation metadata
    // -----------------------------
    numColumnsTf = getNumColumnsTf(fs, headerLine, prop.getDelim(), tfMtdPath);
    MapReduceTool.deleteFileIfExistOnHDFS(result.getFileName());
    BufferedWriter out = new BufferedWriter(new OutputStreamWriter(fs.create(new Path(result.getFileName()), true)));
    StringBuilder sb = new StringBuilder();
    try {
        MatrixBlock mb = null;
        if (isBB) {
            int estNNZ = (int) agents.getValid() * ncols;
            mb = new MatrixBlock((int) agents.getValid(), numColumnsTf, estNNZ);
            if (mb.isInSparseFormat())
                mb.allocateSparseRowsBlock();
            else
                mb.allocateDenseBlock();
        }
        // rowid to be used in filling the matrix block
        int rowID = 0;
        for (int fileNo = 0; fileNo < files.size(); fileNo++) {
            try (BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(files.get(fileNo))))) {
                if (fileNo == 0) {
                    if (prop.hasHeader())
                        // ignore the header line from data file
                        br.readLine();
                    //TODO: fix hard-wired header propagation to meta data column names
                    String dcdHeader = _da.constructDummycodedHeader(headerLine, agents.getDelim());
                    numColumnsTf = _da.genDcdMapsAndColTypes(fs, tfMtdPath, ncols, agents);
                    generateHeaderFiles(fs, tfMtdPath, headerLine, dcdHeader);
                }
                line = null;
                while ((line = br.readLine()) != null) {
                    words = agents.getWords(line);
                    if (!agents.omit(words)) {
                        words = agents.apply(words);
                        if (isCSV) {
                            out.write(agents.checkAndPrepOutputString(words, sb));
                            out.write("\n");
                        }
                        if (isBB) {
                            agents.check(words);
                            for (int c = 0; c < words.length; c++) {
                                if (words[c] == null || words[c].isEmpty())
                                    ;
                                else
                                    mb.appendValue(rowID, c, UtilFunctions.parseToDouble(words[c]));
                            }
                        }
                        rowID++;
                    }
                }
            }
        }
        if (mb != null) {
            mb.recomputeNonZeros();
            mb.examSparsity();
            result.acquireModify(mb);
            result.release();
            result.exportData();
        }
    } finally {
        IOUtilFunctions.closeSilently(out);
    }
    MatrixCharacteristics mc = new MatrixCharacteristics(agents.getValid(), numColumnsTf, (int) result.getNumRowsPerBlock(), (int) result.getNumColumnsPerBlock());
    JobReturn ret = new JobReturn(new MatrixCharacteristics[] { mc }, true);
    return ret;
}
Also used : Path(org.apache.hadoop.fs.Path) MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) InputStreamReader(java.io.InputStreamReader) JobReturn(org.apache.sysml.runtime.matrix.JobReturn) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) BufferedWriter(java.io.BufferedWriter) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) JSONObject(org.apache.wink.json4j.JSONObject) BufferedReader(java.io.BufferedReader) OutputStreamWriter(java.io.OutputStreamWriter)
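
The tail of performTransform is where the JobReturn is actually assembled. A stripped-down sketch of that construction follows (variable names are illustrative stand-ins for values computed earlier in the method); note that the single-node path always reports success, since any failure surfaces as an exception instead.

// Sketch mirroring the last lines of performTransform above: wrap the output's
// dimensions and block sizes into a single-entry MatrixCharacteristics array.
MatrixCharacteristics mc = new MatrixCharacteristics(
        numValidRows, numColumnsTf, blockRows, blockCols);
JobReturn ret = new JobReturn(new MatrixCharacteristics[] { mc }, /* successful */ true);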

Example 3 with JobReturn

use of org.apache.sysml.runtime.matrix.JobReturn in project incubator-systemml by apache.

the class DataTransform method cpDataTransform.

public static JobReturn cpDataTransform(TransformOperands oprnds, CacheableData<?>[] inputs, MatrixObject[] outputs) throws IOException, DMLRuntimeException, IllegalArgumentException, JSONException {
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    // find the first file in alphabetical ordering of part files in directory inputPath
    String smallestFile = CSVReblockMR.findSmallestFile(job, oprnds.inputPath);
    FileSystem fs = IOUtilFunctions.getFileSystem(smallestFile);
    // find column names
    String headerLine = readHeaderLine(fs, oprnds.inputCSVProperties, smallestFile);
    HashMap<String, Integer> colNamesToIds = processColumnNames(fs, oprnds.inputCSVProperties, headerLine, smallestFile);
    String outHeader = getOutputHeader(fs, headerLine, oprnds);
    ArrayList<Integer> csvoutputs = new ArrayList<Integer>();
    ArrayList<Integer> bboutputs = new ArrayList<Integer>();
    // divide output objects based on output format (CSV or BinaryBlock)
    for (int i = 0; i < outputs.length; i++) {
        if (outputs[i].getFileFormatProperties() != null && outputs[i].getFileFormatProperties().getFileFormat() == FileFormatProperties.FileFormat.CSV)
            csvoutputs.add(i);
        else
            bboutputs.add(i);
    }
    boolean isCSV = (csvoutputs.size() > 0);
    boolean isBB = (bboutputs.size() > 0);
    checkIfOutputOverlapsWithTxMtd(outputs, oprnds, isCSV, isBB, csvoutputs, bboutputs, fs);
    JobReturn ret = null;
    if (!oprnds.isApply) {
        // build specification file with column IDs instead of column names
        String specWithIDs = processSpecFile(fs, oprnds.inputPath, smallestFile, colNamesToIds, oprnds.inputCSVProperties, oprnds.spec);
        MapReduceTool.writeStringToHDFS(specWithIDs, oprnds.txMtdPath + "/" + "spec.json");
        ret = performTransform(job, fs, oprnds.inputPath, colNamesToIds.size(), oprnds.inputCSVProperties, specWithIDs, oprnds.txMtdPath, oprnds.isApply, outputs[0], outHeader, isBB, isCSV);
    } else {
        // copy given transform metadata (applyTxPath) to specified location (txMtdPath)
        MapReduceTool.deleteFileIfExistOnHDFS(new Path(oprnds.txMtdPath), job);
        MapReduceTool.copyFileOnHDFS(oprnds.applyTxPath, oprnds.txMtdPath);
        // path to specification file (optionally specified)
        String specWithIDs = (oprnds.spec != null) ? oprnds.spec : MapReduceTool.readStringFromHDFSFile(oprnds.txMtdPath + "/" + "spec.json");
        ret = performTransform(job, fs, oprnds.inputPath, colNamesToIds.size(), oprnds.inputCSVProperties, specWithIDs, oprnds.txMtdPath, oprnds.isApply, outputs[0], outHeader, isBB, isCSV);
    }
    return ret;
}
Also used : Path(org.apache.hadoop.fs.Path) FileSystem(org.apache.hadoop.fs.FileSystem) ArrayList(java.util.ArrayList) JobConf(org.apache.hadoop.mapred.JobConf) JobReturn(org.apache.sysml.runtime.matrix.JobReturn)
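
The CSV/binary-block split at the top of cpDataTransform (and of mrDataTransform in Example 1) can be factored into a small helper. An illustrative version, not part of DataTransform, using only the accessors the loop above already calls (java.util.List is assumed to be imported):

// Partition output indices by requested file format; outputs without CSV
// file-format properties fall through to binary block, as in the loop above.
static void splitByOutputFormat(MatrixObject[] outputs,
        List<Integer> csvOutputs, List<Integer> bbOutputs) {
    for (int i = 0; i < outputs.length; i++) {
        FileFormatProperties p = outputs[i].getFileFormatProperties();
        if (p != null && p.getFileFormat() == FileFormatProperties.FileFormat.CSV)
            csvOutputs.add(i);
        else
            bbOutputs.add(i);
    }
}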

Example 4 with JobReturn

use of org.apache.sysml.runtime.matrix.JobReturn in project incubator-systemml by apache.

the class ApplyTfCSVMR method runJob.

public static JobReturn runJob(String inputPath, String spec, String mapsPath, String tmpPath, String outputPath, String partOffsetsFile, CSVFileFormatProperties inputDataProperties, long numCols, int replication, String headerLine) throws IOException, ClassNotFoundException, InterruptedException {
    JobConf job = new JobConf(ApplyTfCSVMR.class);
    job.setJobName("ApplyTfCSV");
    /* Setup MapReduce Job */
    job.setJarByClass(ApplyTfCSVMR.class);
    // set relevant classes
    job.setMapperClass(ApplyTfCSVMapper.class);
    job.setNumReduceTasks(0);
    // Add transformation metadata file as well as partOffsetsFile to Distributed cache
    DistributedCache.addCacheFile((new Path(mapsPath)).toUri(), job);
    DistributedCache.createSymlink(job);
    Path cachefile = new Path(partOffsetsFile);
    DistributedCache.addCacheFile(cachefile.toUri(), job);
    DistributedCache.createSymlink(job);
    // set input and output properties
    job.setInputFormat(TextInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);
    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
    FileInputFormat.addInputPath(job, new Path(inputPath));
    // delete outputPath, if exists already.
    Path outPath = new Path(outputPath);
    FileSystem fs = IOUtilFunctions.getFileSystem(outPath, job);
    fs.delete(outPath, true);
    FileOutputFormat.setOutputPath(job, outPath);
    job.set(MRJobConfiguration.TF_HAS_HEADER, Boolean.toString(inputDataProperties.hasHeader()));
    job.set(MRJobConfiguration.TF_DELIM, inputDataProperties.getDelim());
    // Adding "dummy" string to handle the case of na_strings = ""
    if (inputDataProperties.getNAStrings() != null)
        job.set(MRJobConfiguration.TF_NA_STRINGS, TfUtils.prepNAStrings(inputDataProperties.getNAStrings()));
    job.set(MRJobConfiguration.TF_SPEC, spec);
    job.set(MRJobConfiguration.TF_SMALLEST_FILE, CSVReblockMR.findSmallestFile(job, inputPath));
    job.set(MRJobConfiguration.OUTPUT_MATRICES_DIRS_CONFIG, outputPath);
    job.setLong(MRJobConfiguration.TF_NUM_COLS, numCols);
    job.set(MRJobConfiguration.TF_TXMTD_PATH, mapsPath);
    job.set(MRJobConfiguration.TF_HEADER, headerLine);
    job.set(CSVReblockMR.ROWID_FILE_NAME, cachefile.toString());
    job.set(MRJobConfiguration.TF_TMP_LOC, tmpPath);
    //turn off adaptivemr
    job.setBoolean("adaptivemr.map.enable", false);
    // Run the job
    RunningJob runjob = JobClient.runJob(job);
    // Since transform CSV produces part files w/ prefix transform-part-*,
    // delete all the "default" part-..... files
    deletePartFiles(fs, outPath);
    MatrixCharacteristics mc = new MatrixCharacteristics();
    return new JobReturn(new MatrixCharacteristics[] { mc }, runjob.isSuccessful());
}
Also used : Path(org.apache.hadoop.fs.Path) FileSystem(org.apache.hadoop.fs.FileSystem) RunningJob(org.apache.hadoop.mapred.RunningJob) JobConf(org.apache.hadoop.mapred.JobConf) JobReturn(org.apache.sysml.runtime.matrix.JobReturn) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics)
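
The MatrixCharacteristics wrapped into this JobReturn are empty; the caller is expected to fill them in. A sketch of that post-processing, following what mrDataTransform in Example 1 does with the result (numRowsTf, numColumnsTf, csvFileName, and csvProps are illustrative names for values the caller already holds):

// Complete the JobReturn produced by ApplyTfCSVMR.runJob: set the output
// dimensions and write the .mtd metadata file next to the CSV output.
retCSV.getMatrixCharacteristics(0).setDimension(numRowsTf, numColumnsTf);
MapReduceTool.writeMetaDataFile(csvFileName + ".mtd", ValueType.DOUBLE,
        retCSV.getMatrixCharacteristics(0), OutputInfo.CSVOutputInfo, csvProps);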

Example 5 with JobReturn

use of org.apache.sysml.runtime.matrix.JobReturn in project incubator-systemml by apache.

the class ApplyTfBBMR method runJob.

public static JobReturn runJob(String inputPath, String rblkInst, String otherInst, String spec, String mapsPath, String tmpPath, String outputPath, String partOffsetsFile, CSVFileFormatProperties inputDataProperties, long numRows, long numColsBefore, long numColsAfter, int replication, String headerLine) throws Exception {
    CSVReblockInstruction rblk = (CSVReblockInstruction) InstructionParser.parseSingleInstruction(rblkInst);
    long[] rlens = new long[] { numRows };
    long[] clens = new long[] { numColsAfter };
    int[] brlens = new int[] { rblk.brlen };
    int[] bclens = new int[] { rblk.bclen };
    byte[] realIndexes = new byte[] { rblk.input };
    byte[] resultIndexes = new byte[] { rblk.output };
    JobConf job = new JobConf(ApplyTfBBMR.class);
    job.setJobName("ApplyTfBB");
    /* Setup MapReduce Job */
    job.setJarByClass(ApplyTfBBMR.class);
    // set relevant classes
    job.setMapperClass(ApplyTfBBMapper.class);
    MRJobConfiguration.setUpMultipleInputs(job, realIndexes, new String[] { inputPath }, new InputInfo[] { InputInfo.CSVInputInfo }, brlens, bclens, false, ConvertTarget.CELL);
    MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);
    MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);
    MRJobConfiguration.setCSVReblockInstructions(job, rblkInst);
    //set up the instructions that will happen in the reducer, after the aggregation instructions
    MRJobConfiguration.setInstructionsInReducer(job, otherInst);
    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
    //set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);
    //set up what matrices are needed to pass from the mapper to reducer
    HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, null, rblkInst, null, otherInst, resultIndexes);
    MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, null, rblkInst, null, null, null, resultIndexes, mapoutputIndexes, false);
    //set up the number of reducers
    int numRed = WriteCSVMR.determineNumReducers(rlens, clens, ConfigurationManager.getNumReducers(), ret.numReducerGroups);
    job.setNumReduceTasks(numRed);
    //set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, new byte[] { rblk.output }, new byte[] { 0 }, new String[] { outputPath }, new OutputInfo[] { OutputInfo.BinaryBlockOutputInfo }, true, false);
    // configure mapper and the mapper output key value pairs
    job.setMapperClass(ApplyTfBBMapper.class);
    job.setMapOutputKeyClass(TaggedFirstSecondIndexes.class);
    job.setMapOutputValueClass(BlockRow.class);
    //configure reducer
    job.setReducerClass(CSVReblockReducer.class);
    //turn off adaptivemr
    job.setBoolean("adaptivemr.map.enable", false);
    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);
    // Add transformation metadata file as well as partOffsetsFile to Distributed cache
    DistributedCache.addCacheFile((new Path(mapsPath)).toUri(), job);
    DistributedCache.createSymlink(job);
    Path cachefile = new Path(new Path(partOffsetsFile), "part-00000");
    DistributedCache.addCacheFile(cachefile.toUri(), job);
    DistributedCache.createSymlink(job);
    job.set(MRJobConfiguration.TF_HAS_HEADER, Boolean.toString(inputDataProperties.hasHeader()));
    job.set(MRJobConfiguration.TF_DELIM, inputDataProperties.getDelim());
    // Adding "dummy" string to handle the case of na_strings = ""
    if (inputDataProperties.getNAStrings() != null)
        job.set(MRJobConfiguration.TF_NA_STRINGS, TfUtils.prepNAStrings(inputDataProperties.getNAStrings()));
    job.set(MRJobConfiguration.TF_SPEC, spec);
    job.set(MRJobConfiguration.TF_SMALLEST_FILE, CSVReblockMR.findSmallestFile(job, inputPath));
    job.set(MRJobConfiguration.OUTPUT_MATRICES_DIRS_CONFIG, outputPath);
    job.setLong(MRJobConfiguration.TF_NUM_COLS, numColsBefore);
    job.set(MRJobConfiguration.TF_TXMTD_PATH, mapsPath);
    job.set(MRJobConfiguration.TF_HEADER, headerLine);
    job.set(CSVReblockMR.ROWID_FILE_NAME, cachefile.toString());
    job.set(MRJobConfiguration.TF_TMP_LOC, tmpPath);
    RunningJob runjob = JobClient.runJob(job);
    MapReduceTool.deleteFileIfExistOnHDFS(cachefile, job);
    Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
    for (int i = 0; i < resultIndexes.length; i++) {
        ret.stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
    }
    return new JobReturn(ret.stats, runjob.isSuccessful());
}
Also used : Path(org.apache.hadoop.fs.Path) Group(org.apache.hadoop.mapred.Counters.Group) CSVReblockInstruction(org.apache.sysml.runtime.instructions.mr.CSVReblockInstruction) JobReturn(org.apache.sysml.runtime.matrix.JobReturn) MatrixChar_N_ReducerGroups(org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration.MatrixChar_N_ReducerGroups) RunningJob(org.apache.hadoop.mapred.RunningJob) JobConf(org.apache.hadoop.mapred.JobConf)
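
Since runJob copies the per-output non-zero counts from the NUM_NONZERO_CELLS counter group into ret.stats, a caller can read them back from the returned JobReturn. A minimal sketch, assuming getNonZeros() is the usual MatrixCharacteristics counterpart of the setNonZeros() call above, and that all job parameters are prepared as in Example 1:

// Hedged caller sketch: run the binary-block transform and read back the
// non-zero count recorded for the first (and only) result index.
JobReturn ret = ApplyTfBBMR.runJob(inputPath, rblkInst, otherInst, spec, mapsPath,
        tmpPath, outputPath, partOffsetsFile, inputDataProperties,
        numRows, numColsBefore, numColsAfter, replication, headerLine);
long nnz = ret.getMatrixCharacteristics(0).getNonZeros();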

Aggregations

JobReturn (org.apache.sysml.runtime.matrix.JobReturn): 15 usages
Path (org.apache.hadoop.fs.Path): 6 usages
DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException): 6 usages
MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics): 6 usages
JobConf (org.apache.hadoop.mapred.JobConf): 4 usages
MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock): 4 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 3 usages
MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject): 3 usages
ArrayList (java.util.ArrayList): 2 usages
LinkedList (java.util.LinkedList): 2 usages
RunningJob (org.apache.hadoop.mapred.RunningJob): 2 usages
CSVReblockInstruction (org.apache.sysml.runtime.instructions.mr.CSVReblockInstruction): 2 usages
BufferedReader (java.io.BufferedReader): 1 usage
BufferedWriter (java.io.BufferedWriter): 1 usage
IOException (java.io.IOException): 1 usage
InputStreamReader (java.io.InputStreamReader): 1 usage
OutputStreamWriter (java.io.OutputStreamWriter): 1 usage
Group (org.apache.hadoop.mapred.Counters.Group): 1 usage
DMLConfig (org.apache.sysml.conf.DMLConfig): 1 usage
FrameObject (org.apache.sysml.runtime.controlprogram.caching.FrameObject): 1 usage