Use of org.apache.sysml.runtime.matrix.CSVReblockMR.AssignRowIDMRReturn in project incubator-systemml by apache.
The following example is taken from the class DataTransform, method mrDataTransform.
/**
* Main method to create and/or apply transformation metadata using MapReduce.
*
* @param jobinst MR job instruction
* @param inputs array of input matrices
* @param shuffleInst shuffle instructions
* @param otherInst other instructions
* @param resultIndices byte array of result indices
* @param outputs array of output matrices
* @param numReducers number of reducers
* @param replication file replication factor
* @return MR job result
* @throws Exception if an error occurs while creating or applying the transformation metadata
*/
public static JobReturn mrDataTransform(MRJobInstruction jobinst, MatrixObject[] inputs, String shuffleInst, String otherInst, byte[] resultIndices, MatrixObject[] outputs, int numReducers, int replication) throws Exception {
String[] insts = shuffleInst.split(Instruction.INSTRUCTION_DELIM);
// Parse transform instruction (the first instruction) to obtain relevant fields
TransformOperands oprnds = new TransformOperands(insts[0], inputs[0]);
JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
// find the first file in alphabetical ordering of part files in directory inputPath
String smallestFile = CSVReblockMR.findSmallestFile(job, oprnds.inputPath);
// find column names
FileSystem fs = IOUtilFunctions.getFileSystem(smallestFile);
String headerLine = readHeaderLine(fs, oprnds.inputCSVProperties, smallestFile);
HashMap<String, Integer> colNamesToIds = processColumnNames(fs, oprnds.inputCSVProperties, headerLine, smallestFile);
String outHeader = getOutputHeader(fs, headerLine, oprnds);
int numColumns = colNamesToIds.size();
int numColumnsTf = 0;
long numRowsTf = 0;
ArrayList<Integer> csvoutputs = new ArrayList<Integer>();
ArrayList<Integer> bboutputs = new ArrayList<Integer>();
// divide output objects based on output format (CSV or BinaryBlock)
for (int i = 0; i < outputs.length; i++) {
if (outputs[i].getFileFormatProperties() != null && outputs[i].getFileFormatProperties().getFileFormat() == FileFormatProperties.FileFormat.CSV)
csvoutputs.add(i);
else
bboutputs.add(i);
}
boolean isCSV = (csvoutputs.size() > 0);
boolean isBB = (bboutputs.size() > 0);
String tmpPath = MRJobConfiguration.constructTempOutputFilename();
checkIfOutputOverlapsWithTxMtd(outputs, oprnds, isCSV, isBB, csvoutputs, bboutputs, fs);
JobReturn retCSV = null, retBB = null;
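// Two modes: when isApply is false, transformation metadata (recode maps, bin definitions, etc.)
// is first derived from the input data; when isApply is true, previously generated metadata
// from applyTxPath is reused.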
if (!oprnds.isApply) {
// build specification file with column IDs instead of column names
String specWithIDs = processSpecFile(fs, oprnds.inputPath, smallestFile, colNamesToIds, oprnds.inputCSVProperties, oprnds.spec);
// enable GC on colNamesToIds
colNamesToIds = null;
// Build transformation metadata, including recode maps, bin definitions, etc.
// Also, generate part offsets file (counters file), which is to be used in csv-reblock
String partOffsetsFile = MRJobConfiguration.constructTempOutputFilename();
numRowsTf = GenTfMtdMR.runJob(oprnds.inputPath, oprnds.txMtdPath, specWithIDs, smallestFile, partOffsetsFile, oprnds.inputCSVProperties, numColumns, replication, outHeader);
if (numRowsTf == 0)
throw new DMLRuntimeException(ERROR_MSG_ZERO_ROWS);
// store the specFileWithIDs as transformation metadata
MapReduceTool.writeStringToHDFS(specWithIDs, oprnds.txMtdPath + "/" + "spec.json");
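// number of columns in the transformed output (may differ from numColumns, e.g., due to dummy coding)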
numColumnsTf = getNumColumnsTf(fs, outHeader, oprnds.inputCSVProperties.getDelim(), oprnds.txMtdPath);
// Apply transformation metadata, and perform actual transformation
if (isCSV)
retCSV = ApplyTfCSVMR.runJob(oprnds.inputPath, specWithIDs, oprnds.txMtdPath, tmpPath, outputs[csvoutputs.get(0)].getFileName(), partOffsetsFile, oprnds.inputCSVProperties, numColumns, replication, outHeader);
if (isBB) {
DMLConfig conf = ConfigurationManager.getDMLConfig();
int blockSize = conf.getIntValue(DMLConfig.DEFAULT_BLOCK_SIZE);
CSVReblockInstruction rblk = prepDummyReblockInstruction(oprnds.inputCSVProperties, blockSize);
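// Run the row-ID assignment MR job over the CSV input: the returned AssignRowIDMRReturn
// carries the detected row counts (rlens), column counts (clens), and a counter file with
// per-part row offsets that is fed to the subsequent reblock/apply jobs.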
AssignRowIDMRReturn ret1 = CSVReblockMR.runAssignRowIDMRJob(new String[] { oprnds.inputPath }, new InputInfo[] { InputInfo.CSVInputInfo }, new int[] { blockSize }, new int[] { blockSize }, rblk.toString(), replication, new String[] { smallestFile }, true, oprnds.inputCSVProperties.getNAStrings(), specWithIDs);
if (ret1.rlens[0] == 0)
throw new DMLRuntimeException(ERROR_MSG_ZERO_ROWS);
retBB = ApplyTfBBMR.runJob(oprnds.inputPath, insts[1], otherInst, specWithIDs, oprnds.txMtdPath, tmpPath, outputs[bboutputs.get(0)].getFileName(), ret1.counterFile.toString(), oprnds.inputCSVProperties, numRowsTf, numColumns, numColumnsTf, replication, outHeader);
}
MapReduceTool.deleteFileIfExistOnHDFS(new Path(partOffsetsFile), job);
} else {
// enable GC on colNamesToIds
colNamesToIds = null;
// copy given transform metadata (applyTxPath) to specified location (txMtdPath)
MapReduceTool.deleteFileIfExistOnHDFS(new Path(oprnds.txMtdPath), job);
MapReduceTool.copyFileOnHDFS(oprnds.applyTxPath, oprnds.txMtdPath);
// path to specification file
String specWithIDs = (oprnds.spec != null) ? oprnds.spec : MapReduceTool.readStringFromHDFSFile(oprnds.txMtdPath + "/" + "spec.json");
numColumnsTf = getNumColumnsTf(fs, outHeader, oprnds.inputCSVProperties.getDelim(), oprnds.txMtdPath);
if (isCSV) {
DMLConfig conf = ConfigurationManager.getDMLConfig();
int blockSize = conf.getIntValue(DMLConfig.DEFAULT_BLOCK_SIZE);
CSVReblockInstruction rblk = prepDummyReblockInstruction(oprnds.inputCSVProperties, blockSize);
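// In apply mode there is no GenTfMtdMR counters file, so the row-ID job is run here to
// obtain the row count and the part-offsets (counter) file for ApplyTfCSVMR.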
AssignRowIDMRReturn ret1 = CSVReblockMR.runAssignRowIDMRJob(new String[] { oprnds.inputPath }, new InputInfo[] { InputInfo.CSVInputInfo }, new int[] { blockSize }, new int[] { blockSize }, rblk.toString(), replication, new String[] { smallestFile }, true, oprnds.inputCSVProperties.getNAStrings(), specWithIDs);
numRowsTf = ret1.rlens[0];
if (ret1.rlens[0] == 0)
throw new DMLRuntimeException(ERROR_MSG_ZERO_ROWS);
// Apply transformation metadata, and perform actual transformation
retCSV = ApplyTfCSVMR.runJob(oprnds.inputPath, specWithIDs, oprnds.applyTxPath, tmpPath, outputs[csvoutputs.get(0)].getFileName(), ret1.counterFile.toString(), oprnds.inputCSVProperties, numColumns, replication, outHeader);
}
if (isBB) {
// compute part offsets file
CSVReblockInstruction rblk = (CSVReblockInstruction) InstructionParser.parseSingleInstruction(insts[1]);
CSVReblockInstruction newrblk = (CSVReblockInstruction) rblk.clone((byte) 0);
AssignRowIDMRReturn ret1 = CSVReblockMR.runAssignRowIDMRJob(new String[] { oprnds.inputPath }, new InputInfo[] { InputInfo.CSVInputInfo }, new int[] { newrblk.brlen }, new int[] { newrblk.bclen }, newrblk.toString(), replication, new String[] { smallestFile }, true, oprnds.inputCSVProperties.getNAStrings(), specWithIDs);
numRowsTf = ret1.rlens[0];
if (ret1.rlens[0] == 0)
throw new DMLRuntimeException(ERROR_MSG_ZERO_ROWS);
// apply transformation metadata, as well as reblock the resulting data
retBB = ApplyTfBBMR.runJob(oprnds.inputPath, insts[1], otherInst, specWithIDs, oprnds.txMtdPath, tmpPath, outputs[bboutputs.get(0)].getFileName(), ret1.counterFile.toString(), oprnds.inputCSVProperties, ret1.rlens[0], ret1.clens[0], numColumnsTf, replication, outHeader);
}
}
// copy auxiliary data (old and new header lines) from temporary location to txMtdPath
moveFilesFromTmp(fs, tmpPath, oprnds.txMtdPath);
// generate matrix metadata file for outputs
if (retCSV != null) {
retCSV.getMatrixCharacteristics(0).setDimension(numRowsTf, numColumnsTf);
// use the same header as the input
CSVFileFormatProperties prop = new CSVFileFormatProperties(false, oprnds.inputCSVProperties.getDelim(), false, Double.NaN, null);
MapReduceTool.writeMetaDataFile(outputs[csvoutputs.get(0)].getFileName() + ".mtd", ValueType.DOUBLE, retCSV.getMatrixCharacteristics(0), OutputInfo.CSVOutputInfo, prop);
return retCSV;
}
if (retBB != null) {
retBB.getMatrixCharacteristics(0).setDimension(numRowsTf, numColumnsTf);
MapReduceTool.writeMetaDataFile(outputs[bboutputs.get(0)].getFileName() + ".mtd", ValueType.DOUBLE, retBB.getMatrixCharacteristics(0), OutputInfo.BinaryBlockOutputInfo);
return retBB;
}
return null;
}
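For orientation, below is a minimal, self-contained sketch of the AssignRowIDMRReturn usage pattern seen above. The input path, block size, smallest part file, NA strings, and transform specification are hypothetical placeholders; only the call shape and the consumed fields (rlens, clens, counterFile) are taken from the code above, and import locations beyond CSVReblockMR are assumed.
import org.apache.sysml.runtime.matrix.CSVReblockMR;
import org.apache.sysml.runtime.matrix.CSVReblockMR.AssignRowIDMRReturn;
import org.apache.sysml.runtime.matrix.data.InputInfo;

public class AssignRowIDExample {
    public static void main(String[] args) throws Exception {
        // Hypothetical inputs; in mrDataTransform these come from the transform operands.
        String inputPath = "hdfs:/tmp/data.csv";          // input CSV directory (placeholder)
        String smallestFile = inputPath + "/part-00000";  // smallest part file, used as header source (placeholder)
        int blockSize = 1000;                             // block size used for brlens/bclens (placeholder)
        int replication = 1;                              // file replication factor (placeholder)
        String reblockInst = "...";                       // CSV reblock instruction string (placeholder)
        String naStrings = null;                          // NA strings, as from CSVFileFormatProperties.getNAStrings() (placeholder)
        String spec = "...";                              // transform specification with column IDs (placeholder)

        // Same call shape as in mrDataTransform above.
        AssignRowIDMRReturn ret = CSVReblockMR.runAssignRowIDMRJob(
                new String[] { inputPath },
                new InputInfo[] { InputInfo.CSVInputInfo },
                new int[] { blockSize }, new int[] { blockSize },
                reblockInst, replication,
                new String[] { smallestFile },
                true, naStrings, spec);

        // Fields consumed by the transform code above:
        long numRows = ret.rlens[0];                      // detected row count of the first input
        long numCols = ret.clens[0];                      // detected column count of the first input
        String partOffsets = ret.counterFile.toString();  // per-part row offsets, fed to reblock/apply jobs
        System.out.println(numRows + " x " + numCols + ", offsets at " + partOffsets);
    }
}
The boolean flag and the instruction string mirror the transform-specific invocation above; see CSVReblockMR for the exact parameter semantics.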