
Example 1 with CSVReblockInstruction

use of org.apache.sysml.runtime.instructions.mr.CSVReblockInstruction in project incubator-systemml by apache.

the class DataTransform method mrDataTransform.

/**
	 * Main method to create and/or apply transformation metadata using MapReduce.
	 * 
	 * @param jobinst MR job instruction
	 * @param inputs array of input matrices
	 * @param shuffleInst shuffle instructions
	 * @param otherInst other instructions
	 * @param resultIndices byte array of result indices
	 * @param outputs array of output matrices
	 * @param numReducers number of reducers
	 * @param replication HDFS file replication factor
	 * @return MR job result
	 * @throws Exception if an IOException occurs
	 */
public static JobReturn mrDataTransform(MRJobInstruction jobinst, MatrixObject[] inputs, String shuffleInst, String otherInst, byte[] resultIndices, MatrixObject[] outputs, int numReducers, int replication) throws Exception {
    String[] insts = shuffleInst.split(Instruction.INSTRUCTION_DELIM);
    // Parse transform instruction (the first instruction) to obtain relevant fields
    TransformOperands oprnds = new TransformOperands(insts[0], inputs[0]);
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    // find the first file in alphabetical ordering of part files in directory inputPath 
    String smallestFile = CSVReblockMR.findSmallestFile(job, oprnds.inputPath);
    // find column names
    FileSystem fs = IOUtilFunctions.getFileSystem(smallestFile);
    String headerLine = readHeaderLine(fs, oprnds.inputCSVProperties, smallestFile);
    HashMap<String, Integer> colNamesToIds = processColumnNames(fs, oprnds.inputCSVProperties, headerLine, smallestFile);
    String outHeader = getOutputHeader(fs, headerLine, oprnds);
    int numColumns = colNamesToIds.size();
    int numColumnsTf = 0;
    long numRowsTf = 0;
    ArrayList<Integer> csvoutputs = new ArrayList<Integer>();
    ArrayList<Integer> bboutputs = new ArrayList<Integer>();
    // divide output objects based on output format (CSV or BinaryBlock)
    for (int i = 0; i < outputs.length; i++) {
        if (outputs[i].getFileFormatProperties() != null && outputs[i].getFileFormatProperties().getFileFormat() == FileFormatProperties.FileFormat.CSV)
            csvoutputs.add(i);
        else
            bboutputs.add(i);
    }
    boolean isCSV = (csvoutputs.size() > 0);
    boolean isBB = (bboutputs.size() > 0);
    String tmpPath = MRJobConfiguration.constructTempOutputFilename();
    checkIfOutputOverlapsWithTxMtd(outputs, oprnds, isCSV, isBB, csvoutputs, bboutputs, fs);
    JobReturn retCSV = null, retBB = null;
    if (!oprnds.isApply) {
        // build specification file with column IDs instead of column names
        String specWithIDs = processSpecFile(fs, oprnds.inputPath, smallestFile, colNamesToIds, oprnds.inputCSVProperties, oprnds.spec);
        // enable GC on colNamesToIds
        colNamesToIds = null;
        // Build transformation metadata, including recode maps, bin definitions, etc.
        // Also, generate part offsets file (counters file), which is to be used in csv-reblock
        String partOffsetsFile = MRJobConfiguration.constructTempOutputFilename();
        numRowsTf = GenTfMtdMR.runJob(oprnds.inputPath, oprnds.txMtdPath, specWithIDs, smallestFile, partOffsetsFile, oprnds.inputCSVProperties, numColumns, replication, outHeader);
        if (numRowsTf == 0)
            throw new DMLRuntimeException(ERROR_MSG_ZERO_ROWS);
        // store the specFileWithIDs as transformation metadata
        MapReduceTool.writeStringToHDFS(specWithIDs, oprnds.txMtdPath + "/" + "spec.json");
        numColumnsTf = getNumColumnsTf(fs, outHeader, oprnds.inputCSVProperties.getDelim(), oprnds.txMtdPath);
        // Apply transformation metadata, and perform actual transformation 
        if (isCSV)
            retCSV = ApplyTfCSVMR.runJob(oprnds.inputPath, specWithIDs, oprnds.txMtdPath, tmpPath, outputs[csvoutputs.get(0)].getFileName(), partOffsetsFile, oprnds.inputCSVProperties, numColumns, replication, outHeader);
        if (isBB) {
            DMLConfig conf = ConfigurationManager.getDMLConfig();
            int blockSize = conf.getIntValue(DMLConfig.DEFAULT_BLOCK_SIZE);
            CSVReblockInstruction rblk = prepDummyReblockInstruction(oprnds.inputCSVProperties, blockSize);
            AssignRowIDMRReturn ret1 = CSVReblockMR.runAssignRowIDMRJob(new String[] { oprnds.inputPath }, new InputInfo[] { InputInfo.CSVInputInfo }, new int[] { blockSize }, new int[] { blockSize }, rblk.toString(), replication, new String[] { smallestFile }, true, oprnds.inputCSVProperties.getNAStrings(), specWithIDs);
            if (ret1.rlens[0] == 0)
                throw new DMLRuntimeException(ERROR_MSG_ZERO_ROWS);
            retBB = ApplyTfBBMR.runJob(oprnds.inputPath, insts[1], otherInst, specWithIDs, oprnds.txMtdPath, tmpPath, outputs[bboutputs.get(0)].getFileName(), ret1.counterFile.toString(), oprnds.inputCSVProperties, numRowsTf, numColumns, numColumnsTf, replication, outHeader);
        }
        MapReduceTool.deleteFileIfExistOnHDFS(new Path(partOffsetsFile), job);
    } else {
        // enable GC on colNamesToIds
        colNamesToIds = null;
        // copy given transform metadata (applyTxPath) to specified location (txMtdPath)
        MapReduceTool.deleteFileIfExistOnHDFS(new Path(oprnds.txMtdPath), job);
        MapReduceTool.copyFileOnHDFS(oprnds.applyTxPath, oprnds.txMtdPath);
        // path to specification file
        String specWithIDs = (oprnds.spec != null) ? oprnds.spec : MapReduceTool.readStringFromHDFSFile(oprnds.txMtdPath + "/" + "spec.json");
        numColumnsTf = getNumColumnsTf(fs, outHeader, oprnds.inputCSVProperties.getDelim(), oprnds.txMtdPath);
        if (isCSV) {
            DMLConfig conf = ConfigurationManager.getDMLConfig();
            int blockSize = conf.getIntValue(DMLConfig.DEFAULT_BLOCK_SIZE);
            CSVReblockInstruction rblk = prepDummyReblockInstruction(oprnds.inputCSVProperties, blockSize);
            AssignRowIDMRReturn ret1 = CSVReblockMR.runAssignRowIDMRJob(new String[] { oprnds.inputPath }, new InputInfo[] { InputInfo.CSVInputInfo }, new int[] { blockSize }, new int[] { blockSize }, rblk.toString(), replication, new String[] { smallestFile }, true, oprnds.inputCSVProperties.getNAStrings(), specWithIDs);
            numRowsTf = ret1.rlens[0];
            if (ret1.rlens[0] == 0)
                throw new DMLRuntimeException(ERROR_MSG_ZERO_ROWS);
            // Apply transformation metadata, and perform actual transformation 
            retCSV = ApplyTfCSVMR.runJob(oprnds.inputPath, specWithIDs, oprnds.applyTxPath, tmpPath, outputs[csvoutputs.get(0)].getFileName(), ret1.counterFile.toString(), oprnds.inputCSVProperties, numColumns, replication, outHeader);
        }
        if (isBB) {
            // compute part offsets file
            CSVReblockInstruction rblk = (CSVReblockInstruction) InstructionParser.parseSingleInstruction(insts[1]);
            CSVReblockInstruction newrblk = (CSVReblockInstruction) rblk.clone((byte) 0);
            AssignRowIDMRReturn ret1 = CSVReblockMR.runAssignRowIDMRJob(new String[] { oprnds.inputPath }, new InputInfo[] { InputInfo.CSVInputInfo }, new int[] { newrblk.brlen }, new int[] { newrblk.bclen }, newrblk.toString(), replication, new String[] { smallestFile }, true, oprnds.inputCSVProperties.getNAStrings(), specWithIDs);
            numRowsTf = ret1.rlens[0];
            if (ret1.rlens[0] == 0)
                throw new DMLRuntimeException(ERROR_MSG_ZERO_ROWS);
            // apply transformation metadata, as well as reblock the resulting data
            retBB = ApplyTfBBMR.runJob(oprnds.inputPath, insts[1], otherInst, specWithIDs, oprnds.txMtdPath, tmpPath, outputs[bboutputs.get(0)].getFileName(), ret1.counterFile.toString(), oprnds.inputCSVProperties, ret1.rlens[0], ret1.clens[0], numColumnsTf, replication, outHeader);
        }
    }
    // copy auxiliary data (old and new header lines) from temporary location to txMtdPath
    moveFilesFromTmp(fs, tmpPath, oprnds.txMtdPath);
    // generate matrix metadata file for outputs
    if (retCSV != null) {
        retCSV.getMatrixCharacteristics(0).setDimension(numRowsTf, numColumnsTf);
        // use the same header as the input
        CSVFileFormatProperties prop = new CSVFileFormatProperties(false, oprnds.inputCSVProperties.getDelim(), false, Double.NaN, null);
        MapReduceTool.writeMetaDataFile(outputs[csvoutputs.get(0)].getFileName() + ".mtd", ValueType.DOUBLE, retCSV.getMatrixCharacteristics(0), OutputInfo.CSVOutputInfo, prop);
        return retCSV;
    }
    if (retBB != null) {
        retBB.getMatrixCharacteristics(0).setDimension(numRowsTf, numColumnsTf);
        MapReduceTool.writeMetaDataFile(outputs[bboutputs.get(0)].getFileName() + ".mtd", ValueType.DOUBLE, retBB.getMatrixCharacteristics(0), OutputInfo.BinaryBlockOutputInfo);
        return retBB;
    }
    return null;
}
Also used : AssignRowIDMRReturn(org.apache.sysml.runtime.matrix.CSVReblockMR.AssignRowIDMRReturn) Path(org.apache.hadoop.fs.Path) DMLConfig(org.apache.sysml.conf.DMLConfig) CSVFileFormatProperties(org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties) CSVReblockInstruction(org.apache.sysml.runtime.instructions.mr.CSVReblockInstruction) ArrayList(java.util.ArrayList) JobReturn(org.apache.sysml.runtime.matrix.JobReturn) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) FileSystem(org.apache.hadoop.fs.FileSystem) JobConf(org.apache.hadoop.mapred.JobConf)
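A recurring pattern in these examples is parsing a serialized reblock instruction and reading its block-size and operand fields. The sketch below is not project code; it is a minimal standalone restatement that uses only the calls and fields visible in the examples on this page, and it assumes rblkInst is a well-formed instruction string produced by the compiler.

import org.apache.sysml.runtime.instructions.InstructionParser;
import org.apache.sysml.runtime.instructions.mr.CSVReblockInstruction;

public class CSVReblockInstructionSketch {

    public static void inspect(String rblkInst) throws Exception {
        // parse the serialized instruction, as DataTransform and ApplyTfBBMR do
        CSVReblockInstruction rblk = (CSVReblockInstruction) InstructionParser.parseSingleInstruction(rblkInst);
        int brlen = rblk.brlen;   // rows per block
        int bclen = rblk.bclen;   // columns per block
        byte in = rblk.input;     // input matrix index
        byte out = rblk.output;   // output matrix index
        System.out.printf("reblock %d -> %d with %d x %d blocks%n", in, out, brlen, bclen);
    }
}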

Example 2 with CSVReblockInstruction

use of org.apache.sysml.runtime.instructions.mr.CSVReblockInstruction in project incubator-systemml by apache.

the class ApplyTfBBMapper method configure.

@Override
public void configure(JobConf job) {
    super.configure(job);
    try {
        _partFileWithHeader = TfUtils.isPartFileWithHeader(job);
        tfmapper = new TfUtils(job);
        tfmapper.loadTfMetadata(job, true);
        // Load relevant information for CSV Reblock
        ByteWritable key = new ByteWritable();
        OffsetCount value = new OffsetCount();
        Path p = new Path(job.get(CSVReblockMR.ROWID_FILE_NAME));
        Path path = new Path(job.get(MRConfigurationNames.MR_MAP_INPUT_FILE));
        FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
        String thisfile = path.makeQualified(fs).toString();
        SequenceFile.Reader reader = null;
        try {
            reader = new SequenceFile.Reader(fs, p, job);
            while (reader.next(key, value)) {
                // "key" needn't be checked since the offset file has information about a single CSV input (the raw data file)
                if (thisfile.equals(value.filename))
                    offsetMap.put(value.fileOffset, value.count);
            }
        } finally {
            IOUtilFunctions.closeSilently(reader);
        }
        idxRow = new CSVReblockMapper.IndexedBlockRow();
        int maxBclen = 0;
        for (ArrayList<CSVReblockInstruction> insv : csv_reblock_instructions)
            for (CSVReblockInstruction in : insv)
                if (maxBclen < in.bclen)
                    maxBclen = in.bclen;
        // always dense, since dense is the common CSV use case
        idxRow.getRow().data.reset(1, maxBclen, false);
    } catch (IOException e) {
        throw new RuntimeException(e);
    } catch (JSONException e) {
        throw new RuntimeException(e);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) CSVReblockMapper(org.apache.sysml.runtime.matrix.mapred.CSVReblockMapper) IndexedBlockRow(org.apache.sysml.runtime.matrix.mapred.CSVReblockMapper.IndexedBlockRow) CSVReblockInstruction(org.apache.sysml.runtime.instructions.mr.CSVReblockInstruction) JSONException(org.apache.wink.json4j.JSONException) IOException(java.io.IOException) OffsetCount(org.apache.sysml.runtime.matrix.CSVReblockMR.OffsetCount) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) SequenceFile(org.apache.hadoop.io.SequenceFile) FileSystem(org.apache.hadoop.fs.FileSystem) ByteWritable(org.apache.hadoop.io.ByteWritable)
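The offset map populated here is what ApplyTfBBMapper.map (Example 4) consults: for each input split, the AssignRowID job records the byte offset at which the split starts and the number of rows that precede it, so a mapper can turn split-local row positions into global row indexes. A tiny standalone illustration of that lookup, with made-up numbers:

import java.util.HashMap;
import java.util.Map;

public class OffsetMapSketch {

    public static void main(String[] args) {
        // byte offset of a split -> global row index of its first row (hypothetical values)
        Map<Long, Long> offsetMap = new HashMap<>();
        offsetMap.put(0L, 0L);        // split starting at byte 0 begins at global row 0
        offsetMap.put(65536L, 1200L); // split starting at byte 65536 begins at global row 1200
        long splitStart = 65536L;     // rawKey.get() of the first record in this split
        long localRow = 37L;          // row position within the split
        long globalRow = offsetMap.get(splitStart) + localRow;
        System.out.println("global row index: " + globalRow); // prints 1237
    }
}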

Example 3 with CSVReblockInstruction

use of org.apache.sysml.runtime.instructions.mr.CSVReblockInstruction in project incubator-systemml by apache.

the class ApplyTfBBMR method runJob.

public static JobReturn runJob(String inputPath, String rblkInst, String otherInst, String spec, String mapsPath, String tmpPath, String outputPath, String partOffsetsFile, CSVFileFormatProperties inputDataProperties, long numRows, long numColsBefore, long numColsAfter, int replication, String headerLine) throws Exception {
    CSVReblockInstruction rblk = (CSVReblockInstruction) InstructionParser.parseSingleInstruction(rblkInst);
    long[] rlens = new long[] { numRows };
    long[] clens = new long[] { numColsAfter };
    int[] brlens = new int[] { rblk.brlen };
    int[] bclens = new int[] { rblk.bclen };
    byte[] realIndexes = new byte[] { rblk.input };
    byte[] resultIndexes = new byte[] { rblk.output };
    JobConf job = new JobConf(ApplyTfBBMR.class);
    job.setJobName("ApplyTfBB");
    /* Setup MapReduce Job */
    job.setJarByClass(ApplyTfBBMR.class);
    // set relevant classes
    job.setMapperClass(ApplyTfBBMapper.class);
    MRJobConfiguration.setUpMultipleInputs(job, realIndexes, new String[] { inputPath }, new InputInfo[] { InputInfo.CSVInputInfo }, brlens, bclens, false, ConvertTarget.CELL);
    MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);
    MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);
    MRJobConfiguration.setCSVReblockInstructions(job, rblkInst);
    // set up the instructions that will happen in the reducer, after the aggregation instructions
    MRJobConfiguration.setInstructionsInReducer(job, otherInst);
    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
    //set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);
    //set up what matrices are needed to pass from the mapper to reducer
    HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, null, rblkInst, null, otherInst, resultIndexes);
    MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, null, rblkInst, null, null, null, resultIndexes, mapoutputIndexes, false);
    //set up the number of reducers
    int numRed = WriteCSVMR.determineNumReducers(rlens, clens, ConfigurationManager.getNumReducers(), ret.numReducerGroups);
    job.setNumReduceTasks(numRed);
    //set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, new byte[] { rblk.output }, new byte[] { 0 }, new String[] { outputPath }, new OutputInfo[] { OutputInfo.BinaryBlockOutputInfo }, true, false);
    // configure the mapper output key/value pairs (the mapper class was already set above)
    job.setMapOutputKeyClass(TaggedFirstSecondIndexes.class);
    job.setMapOutputValueClass(BlockRow.class);
    //configure reducer
    job.setReducerClass(CSVReblockReducer.class);
    //turn off adaptivemr
    job.setBoolean("adaptivemr.map.enable", false);
    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);
    // Add transformation metadata file as well as partOffsetsFile to Distributed cache
    DistributedCache.addCacheFile((new Path(mapsPath)).toUri(), job);
    DistributedCache.createSymlink(job);
    Path cachefile = new Path(new Path(partOffsetsFile), "part-00000");
    DistributedCache.addCacheFile(cachefile.toUri(), job);
    DistributedCache.createSymlink(job);
    job.set(MRJobConfiguration.TF_HAS_HEADER, Boolean.toString(inputDataProperties.hasHeader()));
    job.set(MRJobConfiguration.TF_DELIM, inputDataProperties.getDelim());
    // Adding "dummy" string to handle the case of na_strings = ""
    if (inputDataProperties.getNAStrings() != null)
        job.set(MRJobConfiguration.TF_NA_STRINGS, TfUtils.prepNAStrings(inputDataProperties.getNAStrings()));
    job.set(MRJobConfiguration.TF_SPEC, spec);
    job.set(MRJobConfiguration.TF_SMALLEST_FILE, CSVReblockMR.findSmallestFile(job, inputPath));
    job.set(MRJobConfiguration.OUTPUT_MATRICES_DIRS_CONFIG, outputPath);
    job.setLong(MRJobConfiguration.TF_NUM_COLS, numColsBefore);
    job.set(MRJobConfiguration.TF_TXMTD_PATH, mapsPath);
    job.set(MRJobConfiguration.TF_HEADER, headerLine);
    job.set(CSVReblockMR.ROWID_FILE_NAME, cachefile.toString());
    job.set(MRJobConfiguration.TF_TMP_LOC, tmpPath);
    RunningJob runjob = JobClient.runJob(job);
    MapReduceTool.deleteFileIfExistOnHDFS(cachefile, job);
    Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
    for (int i = 0; i < resultIndexes.length; i++) {
        ret.stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
    }
    return new JobReturn(ret.stats, runjob.isSuccessful());
}
Also used : Path(org.apache.hadoop.fs.Path) Group(org.apache.hadoop.mapred.Counters.Group) CSVReblockInstruction(org.apache.sysml.runtime.instructions.mr.CSVReblockInstruction) JobReturn(org.apache.sysml.runtime.matrix.JobReturn) MatrixChar_N_ReducerGroups(org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration.MatrixChar_N_ReducerGroups) RunningJob(org.apache.hadoop.mapred.RunningJob) JobConf(org.apache.hadoop.mapred.JobConf)
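The loop at the end of runJob reads per-output nonzero counts back from the job's counters; the group is MRJobConfiguration.NUM_NONZERO_CELLS and the counters are keyed by result index. That readback is plain old-API Hadoop and can be factored out as below (a sketch, not project code):

import java.io.IOException;
import org.apache.hadoop.mapred.Counters.Group;
import org.apache.hadoop.mapred.RunningJob;

public class CounterReadbackSketch {

    // returns the count recorded for one result index under the given counter group
    public static long nonZeros(RunningJob runjob, String groupName, int resultIndex) throws IOException {
        Group group = runjob.getCounters().getGroup(groupName);
        return group.getCounter(Integer.toString(resultIndex));
    }
}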

Example 4 with CSVReblockInstruction

use of org.apache.sysml.runtime.instructions.mr.CSVReblockInstruction in project incubator-systemml by apache.

the class ApplyTfBBMapper method map.

@Override
public void map(LongWritable rawKey, Text rawValue, OutputCollector<TaggedFirstSecondIndexes, CSVReblockMR.BlockRow> out, Reporter reporter) throws IOException {
    if (_first) {
        rowOffset = offsetMap.get(rawKey.get());
        _reporter = reporter;
        _first = false;
    }
    // output the header line
    if (rawKey.get() == 0 && _partFileWithHeader) {
        tfmapper.processHeaderLine();
        if (tfmapper.hasHeader())
            return;
    }
    // parse the input line and apply transformation
    String[] words = tfmapper.getWords(rawValue);
    if (!tfmapper.omit(words)) {
        words = tfmapper.apply(words);
        try {
            tfmapper.check(words);
            // Perform CSV Reblock
            CSVReblockInstruction ins = csv_reblock_instructions.get(0).get(0);
            idxRow = CSVReblockMapper.processRow(idxRow, words, rowOffset, num, ins.output, ins.brlen, ins.bclen, ins.fill, ins.fillValue, out);
        } catch (DMLRuntimeException e) {
            throw new RuntimeException(e.getMessage() + ":" + rawValue.toString());
        }
        num++;
    }
}
Also used : DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) CSVReblockInstruction(org.apache.sysml.runtime.instructions.mr.CSVReblockInstruction)
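One detail worth calling out: with the old-API text input format, the LongWritable key is the byte offset of the line within the file, so rawKey.get() == 0 combined with _partFileWithHeader identifies the header line, which is consumed for metadata but never transformed. A one-method restatement of that check (assumed semantics, not project code):

public class HeaderCheckSketch {

    // true only for the first line of the part file that carries the header
    static boolean isHeaderLine(long byteOffset, boolean partFileWithHeader) {
        return byteOffset == 0 && partFileWithHeader;
    }
}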

Example 5 with CSVReblockInstruction

use of org.apache.sysml.runtime.instructions.mr.CSVReblockInstruction in project incubator-systemml by apache.

the class MapperBase method configure.

@Override
public void configure(JobConf job) {
    super.configure(job);
    // get the indexes of the matrices this mapper's input file represents
    // (one matrix file can occur multiple times in a statement)
    try {
        representativeMatrixes = MRJobConfiguration.getInputMatrixIndexesInMapper(job);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    // get input converter information
    inputConverter = MRJobConfiguration.getInputConverter(job, representativeMatrixes.get(0));
    DataGenMRInstruction[] allDataGenIns;
    MRInstruction[] allMapperIns;
    ReblockInstruction[] allReblockIns;
    CSVReblockInstruction[] allCSVReblockIns;
    try {
        allDataGenIns = MRJobConfiguration.getDataGenInstructions(job);
        // parse the instructions on the matrices that this file represents
        allMapperIns = MRJobConfiguration.getInstructionsInMapper(job);
        // parse the reblock instructions on the matrices that this file represents
        allReblockIns = MRJobConfiguration.getReblockInstructions(job);
        allCSVReblockIns = MRJobConfiguration.getCSVReblockInstructions(job);
    } catch (DMLRuntimeException e) {
        throw new RuntimeException(e);
    }
    // get all the output indexes
    byte[] outputs = MRJobConfiguration.getOutputIndexesInMapper(job);
    // get the dimension of all the representative matrices
    rlens = new long[representativeMatrixes.size()];
    clens = new long[representativeMatrixes.size()];
    for (int i = 0; i < representativeMatrixes.size(); i++) {
        rlens[i] = MRJobConfiguration.getNumRows(job, representativeMatrixes.get(i));
        clens[i] = MRJobConfiguration.getNumColumns(job, representativeMatrixes.get(i));
    }
    // get the block sizes of the representative matrices
    brlens = new int[representativeMatrixes.size()];
    bclens = new int[representativeMatrixes.size()];
    for (int i = 0; i < representativeMatrixes.size(); i++) {
        brlens[i] = MRJobConfiguration.getNumRowsPerBlock(job, representativeMatrixes.get(i));
        bclens[i] = MRJobConfiguration.getNumColumnsPerBlock(job, representativeMatrixes.get(i));
    }
    rbounds = new long[representativeMatrixes.size()];
    cbounds = new long[representativeMatrixes.size()];
    lastblockrlens = new int[representativeMatrixes.size()];
    lastblockclens = new int[representativeMatrixes.size()];
    // calculate upper boundaries for key value pairs
    if (valueClass.equals(MatrixBlock.class)) {
        for (int i = 0; i < representativeMatrixes.size(); i++) {
            rbounds[i] = (long) Math.max(Math.ceil((double) rlens[i] / brlens[i]), 1);
            cbounds[i] = (long) Math.max(Math.ceil((double) clens[i] / bclens[i]), 1);
            lastblockrlens[i] = (int) (rlens[i] % brlens[i]);
            lastblockclens[i] = (int) (clens[i] % bclens[i]);
            if (lastblockrlens[i] == 0)
                lastblockrlens[i] = brlens[i];
            if (lastblockclens[i] == 0)
                lastblockclens[i] = bclens[i];
        }
    } else {
        for (int i = 0; i < representativeMatrixes.size(); i++) {
            rbounds[i] = rlens[i];
            cbounds[i] = clens[i];
            lastblockrlens[i] = 1;
            lastblockclens[i] = 1;
        }
    }
    // load data from distributed cache (if required, reuse if jvm_reuse)
    try {
        setupDistCacheFiles(job);
    } catch (IOException ex) {
        throw new RuntimeException(ex);
    }
    // collect unary instructions for each representative matrix
    HashSet<Byte> set = new HashSet<>();
    for (int i = 0; i < representativeMatrixes.size(); i++) {
        set.clear();
        set.add(representativeMatrixes.get(i));
        // collect the relevant datagen instructions for this representative matrix
        ArrayList<DataGenMRInstruction> dataGensForThisMatrix = new ArrayList<>();
        if (allDataGenIns != null) {
            for (DataGenMRInstruction ins : allDataGenIns) {
                if (set.contains(ins.getInput())) {
                    dataGensForThisMatrix.add(ins);
                    set.add(ins.output);
                }
            }
        }
        if (dataGensForThisMatrix.size() > 1)
            throw new RuntimeException("only expects at most one rand instruction per input");
        if (dataGensForThisMatrix.isEmpty())
            dataGen_instructions.add(null);
        else
            dataGen_instructions.add(dataGensForThisMatrix.get(0));
        // collect the relevant instructions for this representative matrix
        ArrayList<MRInstruction> opsForThisMatrix = new ArrayList<>();
        if (allMapperIns != null) {
            for (MRInstruction ins : allMapperIns) {
                try {
                    boolean toAdd = false;
                    for (byte input : ins.getInputIndexes()) {
                        if (set.contains(input)) {
                            toAdd = true;
                            break;
                        }
                    }
                    if (toAdd) {
                        opsForThisMatrix.add(ins);
                        set.add(ins.output);
                    }
                } catch (DMLRuntimeException e) {
                    throw new RuntimeException(e);
                }
            }
        }
        mapper_instructions.add(opsForThisMatrix);
        // collect the relevant reblock instructions for this representative matrix
        ArrayList<ReblockInstruction> reblocksForThisMatrix = new ArrayList<>();
        if (allReblockIns != null) {
            for (ReblockInstruction ins : allReblockIns) {
                if (set.contains(ins.input)) {
                    reblocksForThisMatrix.add(ins);
                    set.add(ins.output);
                }
            }
        }
        reblock_instructions.add(reblocksForThisMatrix);
        // collect the relevant CSV reblock instructions for this representative matrix
        ArrayList<CSVReblockInstruction> csvReblocksForThisMatrix = new ArrayList<>();
        if (allCSVReblockIns != null) {
            for (CSVReblockInstruction ins : allCSVReblockIns) {
                if (set.contains(ins.input)) {
                    csvReblocksForThisMatrix.add(ins);
                    set.add(ins.output);
                }
            }
        }
        csv_reblock_instructions.add(csvReblocksForThisMatrix);
        // collect the output indexes for this representative matrix
        ArrayList<Byte> outsForThisMatrix = new ArrayList<>();
        for (byte output : outputs) {
            if (set.contains(output))
                outsForThisMatrix.add(output);
        }
        outputIndexes.add(outsForThisMatrix);
    }
}
Also used : CSVReblockInstruction(org.apache.sysml.runtime.instructions.mr.CSVReblockInstruction) ArrayList(java.util.ArrayList) IOException(java.io.IOException) DataGenMRInstruction(org.apache.sysml.runtime.instructions.mr.DataGenMRInstruction) ReblockInstruction(org.apache.sysml.runtime.instructions.mr.ReblockInstruction) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) PMMJMRInstruction(org.apache.sysml.runtime.instructions.mr.PMMJMRInstruction) MRInstruction(org.apache.sysml.runtime.instructions.mr.MRInstruction) HashSet(java.util.HashSet)
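The boundary arithmetic in the middle of configure() is easy to skim past: rbounds and cbounds are the number of blocks per dimension, while lastblockrlens and lastblockclens hold the dimensions of the (possibly partial) trailing block. A standalone restatement with concrete, made-up numbers, using the same formulas as above:

public class BlockBoundsSketch {

    public static void main(String[] args) {
        long rlen = 1000; // matrix rows (hypothetical)
        int brlen = 300;  // rows per block (hypothetical)
        // number of row-blocks: ceil(1000 / 300) = 4, never less than 1
        long rbound = (long) Math.max(Math.ceil((double) rlen / brlen), 1);
        // trailing block holds 1000 % 300 = 100 rows; a remainder of 0
        // would mean the last block is full-sized
        int lastblockrlen = (int) (rlen % brlen);
        if (lastblockrlen == 0)
            lastblockrlen = brlen;
        System.out.println(rbound + " row-blocks, last block has " + lastblockrlen + " rows");
    }
}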

Aggregations

CSVReblockInstruction (org.apache.sysml.runtime.instructions.mr.CSVReblockInstruction): 12 usages
IOException (java.io.IOException): 7 usages
Path (org.apache.hadoop.fs.Path): 7 usages
DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException): 7 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 6 usages
ReblockInstruction (org.apache.sysml.runtime.instructions.mr.ReblockInstruction): 4 usages
ArrayList (java.util.ArrayList): 3 usages
ByteWritable (org.apache.hadoop.io.ByteWritable): 3 usages
SequenceFile (org.apache.hadoop.io.SequenceFile): 3 usages
OffsetCount (org.apache.sysml.runtime.matrix.CSVReblockMR.OffsetCount): 3 usages
HashSet (java.util.HashSet): 2 usages
JobConf (org.apache.hadoop.mapred.JobConf): 2 usages
DataGenMRInstruction (org.apache.sysml.runtime.instructions.mr.DataGenMRInstruction): 2 usages
MRInstruction (org.apache.sysml.runtime.instructions.mr.MRInstruction): 2 usages
PMMJMRInstruction (org.apache.sysml.runtime.instructions.mr.PMMJMRInstruction): 2 usages
JobReturn (org.apache.sysml.runtime.matrix.JobReturn): 2 usages
Group (org.apache.hadoop.mapred.Counters.Group): 1 usage
RunningJob (org.apache.hadoop.mapred.RunningJob): 1 usage
DMLConfig (org.apache.sysml.conf.DMLConfig): 1 usage
AssignRowIDMRReturn (org.apache.sysml.runtime.matrix.CSVReblockMR.AssignRowIDMRReturn): 1 usage