Use of org.apache.sysml.runtime.matrix.JobReturn in project incubator-systemml by apache: class DataTransform, method mrDataTransform.
/**
* Main method to create and/or apply transformation metadata using MapReduce.
*
* @param jobinst MR job instruction
* @param inputs array of input matrices
* @param shuffleInst shuffle instructions
* @param otherInst other instructions
* @param resultIndices byte array of result indices
* @param outputs array of output matrices
* @param numReducers number of reducers
* @param replication output file replication factor
* @return MR job result
* @throws Exception if an error occurs while creating or applying the transformation metadata
*/
public static JobReturn mrDataTransform(MRJobInstruction jobinst, MatrixObject[] inputs, String shuffleInst, String otherInst, byte[] resultIndices, MatrixObject[] outputs, int numReducers, int replication) throws Exception {
String[] insts = shuffleInst.split(Instruction.INSTRUCTION_DELIM);
// Parse transform instruction (the first instruction) to obtain relevant fields
TransformOperands oprnds = new TransformOperands(insts[0], inputs[0]);
JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
// find the first file in alphabetical ordering of part files in directory inputPath
String smallestFile = CSVReblockMR.findSmallestFile(job, oprnds.inputPath);
// find column names
FileSystem fs = IOUtilFunctions.getFileSystem(smallestFile);
String headerLine = readHeaderLine(fs, oprnds.inputCSVProperties, smallestFile);
HashMap<String, Integer> colNamesToIds = processColumnNames(fs, oprnds.inputCSVProperties, headerLine, smallestFile);
String outHeader = getOutputHeader(fs, headerLine, oprnds);
int numColumns = colNamesToIds.size();
int numColumnsTf = 0;
long numRowsTf = 0;
ArrayList<Integer> csvoutputs = new ArrayList<Integer>();
ArrayList<Integer> bboutputs = new ArrayList<Integer>();
// divide output objects based on output format (CSV or BinaryBlock)
for (int i = 0; i < outputs.length; i++) {
if (outputs[i].getFileFormatProperties() != null && outputs[i].getFileFormatProperties().getFileFormat() == FileFormatProperties.FileFormat.CSV)
csvoutputs.add(i);
else
bboutputs.add(i);
}
boolean isCSV = (csvoutputs.size() > 0);
boolean isBB = (bboutputs.size() > 0);
String tmpPath = MRJobConfiguration.constructTempOutputFilename();
checkIfOutputOverlapsWithTxMtd(outputs, oprnds, isCSV, isBB, csvoutputs, bboutputs, fs);
JobReturn retCSV = null, retBB = null;
if (!oprnds.isApply) {
// build specification file with column IDs instead of column names
String specWithIDs = processSpecFile(fs, oprnds.inputPath, smallestFile, colNamesToIds, oprnds.inputCSVProperties, oprnds.spec);
// enable GC on colNamesToIds
colNamesToIds = null;
// Build transformation metadata, including recode maps, bin definitions, etc.
// Also, generate part offsets file (counters file), which is to be used in csv-reblock
String partOffsetsFile = MRJobConfiguration.constructTempOutputFilename();
numRowsTf = GenTfMtdMR.runJob(oprnds.inputPath, oprnds.txMtdPath, specWithIDs, smallestFile, partOffsetsFile, oprnds.inputCSVProperties, numColumns, replication, outHeader);
if (numRowsTf == 0)
throw new DMLRuntimeException(ERROR_MSG_ZERO_ROWS);
// store specWithIDs as transformation metadata
MapReduceTool.writeStringToHDFS(specWithIDs, oprnds.txMtdPath + "/" + "spec.json");
numColumnsTf = getNumColumnsTf(fs, outHeader, oprnds.inputCSVProperties.getDelim(), oprnds.txMtdPath);
// Apply transformation metadata, and perform actual transformation
if (isCSV)
retCSV = ApplyTfCSVMR.runJob(oprnds.inputPath, specWithIDs, oprnds.txMtdPath, tmpPath, outputs[csvoutputs.get(0)].getFileName(), partOffsetsFile, oprnds.inputCSVProperties, numColumns, replication, outHeader);
if (isBB) {
DMLConfig conf = ConfigurationManager.getDMLConfig();
int blockSize = conf.getIntValue(DMLConfig.DEFAULT_BLOCK_SIZE);
CSVReblockInstruction rblk = prepDummyReblockInstruction(oprnds.inputCSVProperties, blockSize);
AssignRowIDMRReturn ret1 = CSVReblockMR.runAssignRowIDMRJob(new String[] { oprnds.inputPath }, new InputInfo[] { InputInfo.CSVInputInfo }, new int[] { blockSize }, new int[] { blockSize }, rblk.toString(), replication, new String[] { smallestFile }, true, oprnds.inputCSVProperties.getNAStrings(), specWithIDs);
if (ret1.rlens[0] == 0)
throw new DMLRuntimeException(ERROR_MSG_ZERO_ROWS);
retBB = ApplyTfBBMR.runJob(oprnds.inputPath, insts[1], otherInst, specWithIDs, oprnds.txMtdPath, tmpPath, outputs[bboutputs.get(0)].getFileName(), ret1.counterFile.toString(), oprnds.inputCSVProperties, numRowsTf, numColumns, numColumnsTf, replication, outHeader);
}
MapReduceTool.deleteFileIfExistOnHDFS(new Path(partOffsetsFile), job);
} else {
// enable GC on colNamesToIds
colNamesToIds = null;
// copy given transform metadata (applyTxPath) to specified location (txMtdPath)
MapReduceTool.deleteFileIfExistOnHDFS(new Path(oprnds.txMtdPath), job);
MapReduceTool.copyFileOnHDFS(oprnds.applyTxPath, oprnds.txMtdPath);
// path to specification file
String specWithIDs = (oprnds.spec != null) ? oprnds.spec : MapReduceTool.readStringFromHDFSFile(oprnds.txMtdPath + "/" + "spec.json");
numColumnsTf = getNumColumnsTf(fs, outHeader, oprnds.inputCSVProperties.getDelim(), oprnds.txMtdPath);
if (isCSV) {
DMLConfig conf = ConfigurationManager.getDMLConfig();
int blockSize = conf.getIntValue(DMLConfig.DEFAULT_BLOCK_SIZE);
CSVReblockInstruction rblk = prepDummyReblockInstruction(oprnds.inputCSVProperties, blockSize);
AssignRowIDMRReturn ret1 = CSVReblockMR.runAssignRowIDMRJob(new String[] { oprnds.inputPath }, new InputInfo[] { InputInfo.CSVInputInfo }, new int[] { blockSize }, new int[] { blockSize }, rblk.toString(), replication, new String[] { smallestFile }, true, oprnds.inputCSVProperties.getNAStrings(), specWithIDs);
numRowsTf = ret1.rlens[0];
if (ret1.rlens[0] == 0)
throw new DMLRuntimeException(ERROR_MSG_ZERO_ROWS);
// Apply transformation metadata, and perform actual transformation
retCSV = ApplyTfCSVMR.runJob(oprnds.inputPath, specWithIDs, oprnds.applyTxPath, tmpPath, outputs[csvoutputs.get(0)].getFileName(), ret1.counterFile.toString(), oprnds.inputCSVProperties, numColumns, replication, outHeader);
}
if (isBB) {
// compute part offsets file
CSVReblockInstruction rblk = (CSVReblockInstruction) InstructionParser.parseSingleInstruction(insts[1]);
CSVReblockInstruction newrblk = (CSVReblockInstruction) rblk.clone((byte) 0);
AssignRowIDMRReturn ret1 = CSVReblockMR.runAssignRowIDMRJob(new String[] { oprnds.inputPath }, new InputInfo[] { InputInfo.CSVInputInfo }, new int[] { newrblk.brlen }, new int[] { newrblk.bclen }, newrblk.toString(), replication, new String[] { smallestFile }, true, oprnds.inputCSVProperties.getNAStrings(), specWithIDs);
numRowsTf = ret1.rlens[0];
if (ret1.rlens[0] == 0)
throw new DMLRuntimeException(ERROR_MSG_ZERO_ROWS);
// apply transformation metadata, as well as reblock the resulting data
retBB = ApplyTfBBMR.runJob(oprnds.inputPath, insts[1], otherInst, specWithIDs, oprnds.txMtdPath, tmpPath, outputs[bboutputs.get(0)].getFileName(), ret1.counterFile.toString(), oprnds.inputCSVProperties, ret1.rlens[0], ret1.clens[0], numColumnsTf, replication, outHeader);
}
}
// copy auxiliary data (old and new header lines) from temporary location to txMtdPath
moveFilesFromTmp(fs, tmpPath, oprnds.txMtdPath);
// generate matrix metadata file for outputs
if (retCSV != null) {
retCSV.getMatrixCharacteristics(0).setDimension(numRowsTf, numColumnsTf);
// use the same header as the input
CSVFileFormatProperties prop = new CSVFileFormatProperties(false, oprnds.inputCSVProperties.getDelim(), false, Double.NaN, null);
MapReduceTool.writeMetaDataFile(outputs[csvoutputs.get(0)].getFileName() + ".mtd", ValueType.DOUBLE, retCSV.getMatrixCharacteristics(0), OutputInfo.CSVOutputInfo, prop);
return retCSV;
}
if (retBB != null) {
retBB.getMatrixCharacteristics(0).setDimension(numRowsTf, numColumnsTf);
MapReduceTool.writeMetaDataFile(outputs[bboutputs.get(0)].getFileName() + ".mtd", ValueType.DOUBLE, retBB.getMatrixCharacteristics(0), OutputInfo.BinaryBlockOutputInfo);
return retBB;
}
return null;
}
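The spec.json write above (MapReduceTool.writeStringToHDFS) persists the resolved, ID-based specification alongside the rest of the transformation metadata under txMtdPath. As a rough illustration only, a helper with that behavior can be written with plain Hadoop FileSystem calls; the class and method names below are hypothetical, not the project's actual utility.

import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class HdfsStringWriter {
    // Write a small string (e.g., the resolved spec.json) to an HDFS path,
    // overwriting any existing file at that location.
    public static void writeString(FileSystem fs, String path, String content) throws IOException {
        Path p = new Path(path);
        try (BufferedWriter bw = new BufferedWriter(
                new OutputStreamWriter(fs.create(p, true), StandardCharsets.UTF_8))) {
            bw.write(content);
        }
    }
}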
Use of org.apache.sysml.runtime.matrix.JobReturn in project incubator-systemml by apache: class DataTransform, method performTransform.
/**
* Main method to create and/or apply transformation metadata in-memory, on a single node.
*
* @param job job configuration
* @param fs file system
* @param inputPath path to input files
* @param ncols number of columns
* @param prop csv file format properties
* @param specWithIDs JSON transform specification with IDs
* @param tfMtdPath transform metadata path
* @param isApply true if applying existing transformation metadata; false if it must first be created
* @param result output matrix
* @param headerLine header line
* @param isBB true if binary block
* @param isCSV true if CSV
* @return MR job result
* @throws IOException if IOException occurs
* @throws DMLRuntimeException if DMLRuntimeException occurs
* @throws IllegalArgumentException if IllegalArgumentException occurs
* @throws JSONException if JSONException occurs
*/
private static JobReturn performTransform(JobConf job, FileSystem fs, String inputPath, int ncols, CSVFileFormatProperties prop, String specWithIDs, String tfMtdPath, boolean isApply, MatrixObject result, String headerLine, boolean isBB, boolean isCSV) throws IOException, DMLRuntimeException, IllegalArgumentException, JSONException {
String[] na = TfUtils.parseNAStrings(prop.getNAStrings());
JSONObject spec = new JSONObject(specWithIDs);
TfUtils agents = new TfUtils(headerLine, prop.hasHeader(), prop.getDelim(), na, spec, ncols, tfMtdPath, null, null);
MVImputeAgent _mia = agents.getMVImputeAgent();
RecodeAgent _ra = agents.getRecodeAgent();
BinAgent _ba = agents.getBinAgent();
DummycodeAgent _da = agents.getDummycodeAgent();
// List of files to read
ArrayList<Path> files = collectInputFiles(inputPath, fs);
// ---------------------------------
// Construct transformation metadata
// ---------------------------------
String line = null;
String[] words = null;
int numColumnsTf = 0;
if (!isApply) {
for (int fileNo = 0; fileNo < files.size(); fileNo++) {
try (BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(files.get(fileNo))))) {
if (fileNo == 0 && prop.hasHeader())
//ignore header
br.readLine();
line = null;
while ((line = br.readLine()) != null) {
agents.prepareTfMtd(line);
}
}
}
if (agents.getValid() == 0)
throw new DMLRuntimeException(ERROR_MSG_ZERO_ROWS);
_mia.outputTransformationMetadata(tfMtdPath, fs, agents);
_ba.outputTransformationMetadata(tfMtdPath, fs, agents);
_ra.outputTransformationMetadata(tfMtdPath, fs, agents);
// prepare agents for the subsequent phase of applying transformation metadata
// NO need to loadTxMtd for _ra, since the maps are already present in the memory
Path tmp = new Path(tfMtdPath);
_mia.loadTxMtd(job, fs, tmp, agents);
_ba.loadTxMtd(job, fs, tmp, agents);
_da.setRecodeMapsCP(_ra.getCPRecodeMaps());
_da.setNumBins(_ba.getColList(), _ba.getNumBins());
_da.loadTxMtd(job, fs, tmp, agents);
} else {
// Count the number of rows
int[] rows = countNumRows(files, prop, fs, agents);
agents.setTotal(rows[0]);
agents.setValid(rows[1]);
if (agents.getValid() == 0)
throw new DMLRuntimeException("Number of rows in the transformed output (potentially, after ommitting the ones with missing values) is zero. Cannot proceed.");
// Load transformation metadata
// prepare agents for the subsequent phase of applying transformation metadata
Path tmp = new Path(tfMtdPath);
_mia.loadTxMtd(job, fs, tmp, agents);
_ra.loadTxMtd(job, fs, tmp, agents);
_ba.loadTxMtd(job, fs, tmp, agents);
_da.setRecodeMaps(_ra.getRecodeMaps());
_da.setNumBins(_ba.getColList(), _ba.getNumBins());
_da.loadTxMtd(job, fs, tmp, agents);
}
// -----------------------------
// Apply transformation metadata
// -----------------------------
numColumnsTf = getNumColumnsTf(fs, headerLine, prop.getDelim(), tfMtdPath);
MapReduceTool.deleteFileIfExistOnHDFS(result.getFileName());
BufferedWriter out = new BufferedWriter(new OutputStreamWriter(fs.create(new Path(result.getFileName()), true)));
StringBuilder sb = new StringBuilder();
try {
MatrixBlock mb = null;
if (isBB) {
int estNNZ = (int) agents.getValid() * ncols;
mb = new MatrixBlock((int) agents.getValid(), numColumnsTf, estNNZ);
if (mb.isInSparseFormat())
mb.allocateSparseRowsBlock();
else
mb.allocateDenseBlock();
}
// rowid to be used in filling the matrix block
int rowID = 0;
for (int fileNo = 0; fileNo < files.size(); fileNo++) {
try (BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(files.get(fileNo))))) {
if (fileNo == 0) {
if (prop.hasHeader())
// ignore the header line from data file
br.readLine();
//TODO: fix hard-wired header propagation to meta data column names
String dcdHeader = _da.constructDummycodedHeader(headerLine, agents.getDelim());
numColumnsTf = _da.genDcdMapsAndColTypes(fs, tfMtdPath, ncols, agents);
generateHeaderFiles(fs, tfMtdPath, headerLine, dcdHeader);
}
line = null;
while ((line = br.readLine()) != null) {
words = agents.getWords(line);
if (!agents.omit(words)) {
words = agents.apply(words);
if (isCSV) {
out.write(agents.checkAndPrepOutputString(words, sb));
out.write("\n");
}
if (isBB) {
agents.check(words);
for (int c = 0; c < words.length; c++) {
if (words[c] != null && !words[c].isEmpty())
mb.appendValue(rowID, c, UtilFunctions.parseToDouble(words[c]));
}
}
rowID++;
}
}
}
}
if (mb != null) {
mb.recomputeNonZeros();
mb.examSparsity();
result.acquireModify(mb);
result.release();
result.exportData();
}
} finally {
IOUtilFunctions.closeSilently(out);
}
MatrixCharacteristics mc = new MatrixCharacteristics(agents.getValid(), numColumnsTf, (int) result.getNumRowsPerBlock(), (int) result.getNumColumnsPerBlock());
JobReturn ret = new JobReturn(new MatrixCharacteristics[] { mc }, true);
return ret;
}
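Stripped of the SystemML agents, the single-node path above is a read-transform-write loop over all part files: skip the header of the first file, transform each surviving line, and append it to the output. The sketch below shows that skeleton with a hypothetical per-line function standing in for the omit/apply logic; it is illustrative, not the project's code.

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.List;
import java.util.function.UnaryOperator;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class LineTransformSketch {
    // Read every input part file, optionally skip the header of the first one,
    // apply a per-line transform (stand-in for agents.apply), and write CSV output.
    public static long transform(FileSystem fs, List<Path> files, Path out,
            boolean hasHeader, UnaryOperator<String> perLine) throws IOException {
        long rows = 0;
        try (BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(fs.create(out, true)))) {
            for (int i = 0; i < files.size(); i++) {
                try (BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(files.get(i))))) {
                    if (i == 0 && hasHeader)
                        br.readLine(); // skip the header line of the first file
                    String line;
                    while ((line = br.readLine()) != null) {
                        String tfLine = perLine.apply(line);
                        if (tfLine != null) { // null models an omitted row
                            bw.write(tfLine);
                            bw.write("\n");
                            rows++;
                        }
                    }
                }
            }
        }
        return rows;
    }
}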
Use of org.apache.sysml.runtime.matrix.JobReturn in project incubator-systemml by apache: class DataTransform, method cpDataTransform.
public static JobReturn cpDataTransform(TransformOperands oprnds, CacheableData<?>[] inputs, MatrixObject[] outputs) throws IOException, DMLRuntimeException, IllegalArgumentException, JSONException {
JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
// find the first file in alphabetical ordering of part files in directory inputPath
String smallestFile = CSVReblockMR.findSmallestFile(job, oprnds.inputPath);
FileSystem fs = IOUtilFunctions.getFileSystem(smallestFile);
// find column names
String headerLine = readHeaderLine(fs, oprnds.inputCSVProperties, smallestFile);
HashMap<String, Integer> colNamesToIds = processColumnNames(fs, oprnds.inputCSVProperties, headerLine, smallestFile);
String outHeader = getOutputHeader(fs, headerLine, oprnds);
ArrayList<Integer> csvoutputs = new ArrayList<Integer>();
ArrayList<Integer> bboutputs = new ArrayList<Integer>();
// divide output objects based on output format (CSV or BinaryBlock)
for (int i = 0; i < outputs.length; i++) {
if (outputs[i].getFileFormatProperties() != null && outputs[i].getFileFormatProperties().getFileFormat() == FileFormatProperties.FileFormat.CSV)
csvoutputs.add(i);
else
bboutputs.add(i);
}
boolean isCSV = (csvoutputs.size() > 0);
boolean isBB = (bboutputs.size() > 0);
checkIfOutputOverlapsWithTxMtd(outputs, oprnds, isCSV, isBB, csvoutputs, bboutputs, fs);
JobReturn ret = null;
if (!oprnds.isApply) {
// build specification file with column IDs instead of column names
String specWithIDs = processSpecFile(fs, oprnds.inputPath, smallestFile, colNamesToIds, oprnds.inputCSVProperties, oprnds.spec);
MapReduceTool.writeStringToHDFS(specWithIDs, oprnds.txMtdPath + "/" + "spec.json");
ret = performTransform(job, fs, oprnds.inputPath, colNamesToIds.size(), oprnds.inputCSVProperties, specWithIDs, oprnds.txMtdPath, oprnds.isApply, outputs[0], outHeader, isBB, isCSV);
} else {
// copy given transform metadata (applyTxPath) to specified location (txMtdPath)
MapReduceTool.deleteFileIfExistOnHDFS(new Path(oprnds.txMtdPath), job);
MapReduceTool.copyFileOnHDFS(oprnds.applyTxPath, oprnds.txMtdPath);
// path to specification file (optionally specified)
String specWithIDs = (oprnds.spec != null) ? oprnds.spec : MapReduceTool.readStringFromHDFSFile(oprnds.txMtdPath + "/" + "spec.json");
ret = performTransform(job, fs, oprnds.inputPath, colNamesToIds.size(), oprnds.inputCSVProperties, specWithIDs, oprnds.txMtdPath, oprnds.isApply, outputs[0], outHeader, isBB, isCSV);
}
return ret;
}
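Both cpDataTransform and mrDataTransform first call CSVReblockMR.findSmallestFile to pick the alphabetically first part file, which is then used to read the header. The actual implementation is not shown here; as an assumption of what such a lookup does, a minimal version could look like the following.

import java.io.IOException;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class SmallestPartFile {
    // Return the full path of the lexicographically first file under inputPath.
    public static String find(FileSystem fs, String inputPath) throws IOException {
        FileStatus[] parts = fs.listStatus(new Path(inputPath));
        String smallest = null;
        for (FileStatus stat : parts) {
            String name = stat.getPath().toString();
            if (!stat.isDirectory() && (smallest == null || name.compareTo(smallest) < 0))
                smallest = name;
        }
        return smallest;
    }
}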
Use of org.apache.sysml.runtime.matrix.JobReturn in project incubator-systemml by apache: class ApplyTfCSVMR, method runJob.
public static JobReturn runJob(String inputPath, String spec, String mapsPath, String tmpPath, String outputPath, String partOffsetsFile, CSVFileFormatProperties inputDataProperties, long numCols, int replication, String headerLine) throws IOException, ClassNotFoundException, InterruptedException {
JobConf job = new JobConf(ApplyTfCSVMR.class);
job.setJobName("ApplyTfCSV");
/* Setup MapReduce Job */
job.setJarByClass(ApplyTfCSVMR.class);
// set relevant classes
job.setMapperClass(ApplyTfCSVMapper.class);
job.setNumReduceTasks(0);
// Add transformation metadata file as well as partOffsetsFile to Distributed cache
DistributedCache.addCacheFile((new Path(mapsPath)).toUri(), job);
DistributedCache.createSymlink(job);
Path cachefile = new Path(partOffsetsFile);
DistributedCache.addCacheFile(cachefile.toUri(), job);
DistributedCache.createSymlink(job);
// set input and output properties
job.setInputFormat(TextInputFormat.class);
job.setOutputFormat(TextOutputFormat.class);
job.setMapOutputKeyClass(NullWritable.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(Text.class);
job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
FileInputFormat.addInputPath(job, new Path(inputPath));
// delete outputPath, if exists already.
Path outPath = new Path(outputPath);
FileSystem fs = IOUtilFunctions.getFileSystem(outPath, job);
fs.delete(outPath, true);
FileOutputFormat.setOutputPath(job, outPath);
job.set(MRJobConfiguration.TF_HAS_HEADER, Boolean.toString(inputDataProperties.hasHeader()));
job.set(MRJobConfiguration.TF_DELIM, inputDataProperties.getDelim());
if (inputDataProperties.getNAStrings() != null)
// Adding "dummy" string to handle the case of na_strings = ""
job.set(MRJobConfiguration.TF_NA_STRINGS, TfUtils.prepNAStrings(inputDataProperties.getNAStrings()));
job.set(MRJobConfiguration.TF_SPEC, spec);
job.set(MRJobConfiguration.TF_SMALLEST_FILE, CSVReblockMR.findSmallestFile(job, inputPath));
job.set(MRJobConfiguration.OUTPUT_MATRICES_DIRS_CONFIG, outputPath);
job.setLong(MRJobConfiguration.TF_NUM_COLS, numCols);
job.set(MRJobConfiguration.TF_TXMTD_PATH, mapsPath);
job.set(MRJobConfiguration.TF_HEADER, headerLine);
job.set(CSVReblockMR.ROWID_FILE_NAME, cachefile.toString());
job.set(MRJobConfiguration.TF_TMP_LOC, tmpPath);
//turn off adaptivemr
job.setBoolean("adaptivemr.map.enable", false);
// Run the job
RunningJob runjob = JobClient.runJob(job);
// Since the transform CSV job produces part files with the prefix transform-part-*,
// delete all the "default" part-* files
deletePartFiles(fs, outPath);
MatrixCharacteristics mc = new MatrixCharacteristics();
return new JobReturn(new MatrixCharacteristics[] { mc }, runjob.isSuccessful());
}
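ApplyTfCSVMR is a classic map-only job under the old mapred API: one mapper class, zero reducers, text input/output, and JobClient.runJob. For comparison, a bare-bones version of that pattern with an identity mapper and hypothetical paths (none of the SystemML-specific configuration) might look like this.

import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;

public class MapOnlyJobSketch {
    public static boolean run(String inputPath, String outputPath) throws IOException {
        JobConf job = new JobConf(MapOnlyJobSketch.class);
        job.setJobName("MapOnlySketch");
        job.setMapperClass(IdentityMapper.class);
        job.setNumReduceTasks(0); // map-only: mapper output goes straight to the output path
        job.setInputFormat(TextInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));
        RunningJob runjob = JobClient.runJob(job);
        return runjob.isSuccessful();
    }
}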
Use of org.apache.sysml.runtime.matrix.JobReturn in project incubator-systemml by apache: class ApplyTfBBMR, method runJob.
public static JobReturn runJob(String inputPath, String rblkInst, String otherInst, String spec, String mapsPath, String tmpPath, String outputPath, String partOffsetsFile, CSVFileFormatProperties inputDataProperties, long numRows, long numColsBefore, long numColsAfter, int replication, String headerLine) throws Exception {
CSVReblockInstruction rblk = (CSVReblockInstruction) InstructionParser.parseSingleInstruction(rblkInst);
long[] rlens = new long[] { numRows };
long[] clens = new long[] { numColsAfter };
int[] brlens = new int[] { rblk.brlen };
int[] bclens = new int[] { rblk.bclen };
byte[] realIndexes = new byte[] { rblk.input };
byte[] resultIndexes = new byte[] { rblk.output };
JobConf job = new JobConf(ApplyTfBBMR.class);
job.setJobName("ApplyTfBB");
/* Setup MapReduce Job */
job.setJarByClass(ApplyTfBBMR.class);
// set relevant classes
job.setMapperClass(ApplyTfBBMapper.class);
MRJobConfiguration.setUpMultipleInputs(job, realIndexes, new String[] { inputPath }, new InputInfo[] { InputInfo.CSVInputInfo }, brlens, bclens, false, ConvertTarget.CELL);
MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);
MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);
MRJobConfiguration.setCSVReblockInstructions(job, rblkInst);
//set up the instructions that will happen in the reducer, after the aggregation instructions
MRJobConfiguration.setInstructionsInReducer(job, otherInst);
job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
//set up preferred custom serialization framework for binary block format
if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
MRJobConfiguration.addBinaryBlockSerializationFramework(job);
//set up what matrices are needed to pass from the mapper to reducer
HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, null, rblkInst, null, otherInst, resultIndexes);
MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, null, rblkInst, null, null, null, resultIndexes, mapoutputIndexes, false);
//set up the number of reducers
int numRed = WriteCSVMR.determineNumReducers(rlens, clens, ConfigurationManager.getNumReducers(), ret.numReducerGroups);
job.setNumReduceTasks(numRed);
//set up the multiple output files, and their format information
MRJobConfiguration.setUpMultipleOutputs(job, new byte[] { rblk.output }, new byte[] { 0 }, new String[] { outputPath }, new OutputInfo[] { OutputInfo.BinaryBlockOutputInfo }, true, false);
// configure mapper and the mapper output key value pairs
job.setMapperClass(ApplyTfBBMapper.class);
job.setMapOutputKeyClass(TaggedFirstSecondIndexes.class);
job.setMapOutputValueClass(BlockRow.class);
//configure reducer
job.setReducerClass(CSVReblockReducer.class);
//turn off adaptivemr
job.setBoolean("adaptivemr.map.enable", false);
//set unique working dir
MRJobConfiguration.setUniqueWorkingDir(job);
// Add transformation metadata file as well as partOffsetsFile to Distributed cache
DistributedCache.addCacheFile((new Path(mapsPath)).toUri(), job);
DistributedCache.createSymlink(job);
Path cachefile = new Path(new Path(partOffsetsFile), "part-00000");
DistributedCache.addCacheFile(cachefile.toUri(), job);
DistributedCache.createSymlink(job);
job.set(MRJobConfiguration.TF_HAS_HEADER, Boolean.toString(inputDataProperties.hasHeader()));
job.set(MRJobConfiguration.TF_DELIM, inputDataProperties.getDelim());
// Adding "dummy" string to handle the case of na_strings = ""
if (inputDataProperties.getNAStrings() != null)
job.set(MRJobConfiguration.TF_NA_STRINGS, TfUtils.prepNAStrings(inputDataProperties.getNAStrings()));
job.set(MRJobConfiguration.TF_SPEC, spec);
job.set(MRJobConfiguration.TF_SMALLEST_FILE, CSVReblockMR.findSmallestFile(job, inputPath));
job.set(MRJobConfiguration.OUTPUT_MATRICES_DIRS_CONFIG, outputPath);
job.setLong(MRJobConfiguration.TF_NUM_COLS, numColsBefore);
job.set(MRJobConfiguration.TF_TXMTD_PATH, mapsPath);
job.set(MRJobConfiguration.TF_HEADER, headerLine);
job.set(CSVReblockMR.ROWID_FILE_NAME, cachefile.toString());
job.set(MRJobConfiguration.TF_TMP_LOC, tmpPath);
RunningJob runjob = JobClient.runJob(job);
MapReduceTool.deleteFileIfExistOnHDFS(cachefile, job);
Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
for (int i = 0; i < resultIndexes.length; i++) {
ret.stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
}
return new JobReturn(ret.stats, runjob.isSuccessful());
}
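Both runJob methods push the transformation metadata path and the part-offsets file to the workers through the distributed cache (DistributedCache.addCacheFile plus createSymlink). The mapper-side counterpart, retrieving those local copies in configure(), is not part of this snippet; below is a sketch of how a mapper typically does it under the same API (the class name and pass-through map logic are hypothetical).

import java.io.IOException;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

public class CacheAwareMapper extends MapReduceBase
        implements Mapper<LongWritable, Text, NullWritable, Text> {

    private Path[] cachedFiles;

    @Override
    public void configure(JobConf job) {
        try {
            // Local paths of the files added via DistributedCache.addCacheFile on the client.
            cachedFiles = DistributedCache.getLocalCacheFiles(job);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    @Override
    public void map(LongWritable key, Text value,
            OutputCollector<NullWritable, Text> out, Reporter reporter) throws IOException {
        // Pass rows through unchanged; a real mapper would consult cachedFiles (the metadata) here.
        out.collect(NullWritable.get(), value);
    }
}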