Use of org.apache.sysml.runtime.matrix.data.NumItemsByEachReducerMetaData in project incubator-systemml by apache.
The class GMR, method runJob.
/**
 * Execute job.
 *
 * @param inst MR job instruction
 * @param inputs input matrices; the inputs are indexed 0, 1, 2, ... based on their position in this array
 * @param inputInfos the input format information for the input matrices
 * @param rlens array of numbers of rows
 * @param clens array of numbers of columns
 * @param brlens array of numbers of rows per block
 * @param bclens array of numbers of columns per block
 * @param partitioned boolean array of partitioned status
 * @param pformats array of data partition formats
 * @param psizes array of data partition sizes (currently unused)
 * @param recordReaderInstruction record reader instruction
 * @param instructionsInMapper in Mapper, the set of unary operations that need to be performed on each input matrix
 * @param aggInstructionsInReducer in Reducer, right after sorting, the set of aggregate operations
 *        that need to be performed on each input matrix
 * @param otherInstructionsInReducer the mixed operations that need to be performed on matrices after the aggregate operations
 * @param numReducers the number of reducers
 * @param replication the replication factor for the output
 * @param jvmReuse if true, reuse JVM
 * @param resultIndexes the indexes of the result matrices that need to be output
 * @param dimsUnknownFilePrefix file path prefix used when dimensions are unknown
 * @param outputs the names of the output directories, one for each result index
 * @param outputInfos output format information for the output matrices
 * @return job return object
 * @throws Exception if an error occurs during job execution
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens, long[] clens, int[] brlens, int[] bclens, boolean[] partitioned, PDataPartitionFormat[] pformats, int[] psizes, String recordReaderInstruction, String instructionsInMapper, String aggInstructionsInReducer, String otherInstructionsInReducer, int numReducers, int replication, boolean jvmReuse, byte[] resultIndexes, String dimsUnknownFilePrefix, String[] outputs, OutputInfo[] outputInfos) throws Exception {
    JobConf job = new JobConf(GMR.class);
    job.setJobName("G-MR");
    //whether to use block representation or cell representation
    boolean inBlockRepresentation = MRJobConfiguration.deriveRepresentation(inputInfos);
    MRJobConfiguration.setMatrixValueClass(job, inBlockRepresentation);
    //added for handling the record reader instruction
    String[] realinputs = inputs;
    InputInfo[] realinputInfos = inputInfos;
    long[] realrlens = rlens;
    long[] realclens = clens;
    int[] realbrlens = brlens;
    int[] realbclens = bclens;
    byte[] realIndexes = new byte[inputs.length];
    for (byte b = 0; b < realIndexes.length; b++)
        realIndexes[b] = b;
    if (recordReaderInstruction != null && !recordReaderInstruction.isEmpty()) {
        assert (inputs.length <= 2);
        PickByCountInstruction ins = (PickByCountInstruction) PickByCountInstruction.parseInstruction(recordReaderInstruction);
        PickFromCompactInputFormat.setKeyValueClasses(job, (Class<? extends WritableComparable>) inputInfos[ins.input1].inputKeyClass, inputInfos[ins.input1].inputValueClass);
        job.setInputFormat(PickFromCompactInputFormat.class);
        PickFromCompactInputFormat.setZeroValues(job, (NumItemsByEachReducerMetaData) inputInfos[ins.input1].metadata);
        if (ins.isValuePick) {
            double[] probs = MapReduceTool.readColumnVectorFromHDFS(inputs[ins.input2], inputInfos[ins.input2], rlens[ins.input2], clens[ins.input2], brlens[ins.input2], bclens[ins.input2]);
            PickFromCompactInputFormat.setPickRecordsInEachPartFile(job, (NumItemsByEachReducerMetaData) inputInfos[ins.input1].metadata, probs);
            realinputs = new String[inputs.length - 1];
            realinputInfos = new InputInfo[inputs.length - 1];
            realrlens = new long[inputs.length - 1];
            realclens = new long[inputs.length - 1];
            realbrlens = new int[inputs.length - 1];
            realbclens = new int[inputs.length - 1];
            realIndexes = new byte[inputs.length - 1];
            byte realIndex = 0;
            for (byte i = 0; i < inputs.length; i++) {
                if (i == ins.input2)
                    continue;
                realinputs[realIndex] = inputs[i];
                realinputInfos[realIndex] = inputInfos[i];
                if (i == ins.input1) {
                    realrlens[realIndex] = rlens[ins.input2];
                    realclens[realIndex] = clens[ins.input2];
                    realbrlens[realIndex] = 1;
                    realbclens[realIndex] = 1;
                    realIndexes[realIndex] = ins.output;
                } else {
                    realrlens[realIndex] = rlens[i];
                    realclens[realIndex] = clens[i];
                    realbrlens[realIndex] = brlens[i];
                    realbclens[realIndex] = bclens[i];
                    realIndexes[realIndex] = i;
                }
                realIndex++;
            }
        } else {
            //PickFromCompactInputFormat.setPickRecordsInEachPartFile(job, (NumItemsByEachReducerMetaData) inputInfos[ins.input1].metadata, ins.cst, 1-ins.cst);
            PickFromCompactInputFormat.setRangePickPartFiles(job, (NumItemsByEachReducerMetaData) inputInfos[ins.input1].metadata, ins.cst, 1 - ins.cst);
            realrlens[ins.input1] = UtilFunctions.getLengthForInterQuantile((NumItemsByEachReducerMetaData) inputInfos[ins.input1].metadata, ins.cst);
            realclens[ins.input1] = clens[ins.input1];
            realbrlens[ins.input1] = 1;
            realbclens[ins.input1] = 1;
            realIndexes[ins.input1] = ins.output;
        }
    }
    boolean resetDistCache = setupDistributedCache(job, instructionsInMapper, otherInstructionsInReducer, realinputs, realrlens, realclens);
    //set up the input files and their format information
    boolean[] distCacheOnly = getDistCacheOnlyInputs(realIndexes, recordReaderInstruction, instructionsInMapper, aggInstructionsInReducer, otherInstructionsInReducer);
    MRJobConfiguration.setUpMultipleInputs(job, realIndexes, realinputs, realinputInfos, realbrlens, realbclens, distCacheOnly, true, inBlockRepresentation ? ConvertTarget.BLOCK : ConvertTarget.CELL);
    MRJobConfiguration.setInputPartitioningInfo(job, pformats);
    //set up the dimensions of the input matrices
    MRJobConfiguration.setMatricesDimensions(job, realIndexes, realrlens, realclens);
    MRJobConfiguration.setDimsUnknownFilePrefix(job, dimsUnknownFilePrefix);
    //set up the block size
    MRJobConfiguration.setBlocksSizes(job, realIndexes, realbrlens, realbclens);
    //set up the unary instructions that will be performed in the mapper
    MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper);
    //set up the aggregate instructions that will happen in the combiner and reducer
    MRJobConfiguration.setAggregateInstructions(job, aggInstructionsInReducer);
    //set up the instructions that will happen in the reducer, after the aggregation instructions
    MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);
    //set up the replication factor for the results
    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
    //set up the preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);
    //set up map/reduce memory configurations (if in AM context)
    DMLConfig config = ConfigurationManager.getDMLConfig();
    DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);
    //set up custom map/reduce configurations
    MRJobConfiguration.setupCustomMRConfigurations(job, config);
    //set up JVM reuse (incl. reuse of loaded dist cache matrices)
    if (jvmReuse)
        job.setNumTasksToExecutePerJvm(-1);
    //set up which matrices need to be passed from the mapper to the reducer
    HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, instructionsInMapper, aggInstructionsInReducer, otherInstructionsInReducer, resultIndexes);
    MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, instructionsInMapper, aggInstructionsInReducer, null, otherInstructionsInReducer, resultIndexes, mapoutputIndexes, false);
    MatrixCharacteristics[] stats = ret.stats;
    //set up the number of reducers
    MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers);
    // Print the complete instruction
    if (LOG.isTraceEnabled())
        inst.printCompleteMRJobInstruction(stats);
    // Update resultDimsUnknown based on computed "stats"
    byte[] dimsUnknown = new byte[resultIndexes.length];
    for (int i = 0; i < resultIndexes.length; i++) {
        if (stats[i].getRows() == -1 || stats[i].getCols() == -1) {
            dimsUnknown[i] = (byte) 1;
        } else {
            dimsUnknown[i] = (byte) 0;
        }
    }
    //MRJobConfiguration.updateResultDimsUnknown(job,resultDimsUnknown);
    //set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, dimsUnknown, outputs, outputInfos, inBlockRepresentation, true);
    // configure mapper and the mapper output key value pairs
    job.setMapperClass(GMRMapper.class);
    if (numReducers == 0) {
        job.setMapOutputKeyClass(Writable.class);
        job.setMapOutputValueClass(Writable.class);
    } else {
        job.setMapOutputKeyClass(MatrixIndexes.class);
        if (inBlockRepresentation)
            job.setMapOutputValueClass(TaggedMatrixBlock.class);
        else
            job.setMapOutputValueClass(TaggedMatrixPackedCell.class);
    }
    //set up combiner
    if (numReducers != 0 && aggInstructionsInReducer != null && !aggInstructionsInReducer.isEmpty()) {
        job.setCombinerClass(GMRCombiner.class);
    }
    //configure reducer
    job.setReducerClass(GMRReducer.class);
    //job.setReducerClass(PassThroughReducer.class);
    // By default, the job executes in "cluster" mode.
    // Determine if we can optimize and run it in "local" mode.
    MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
    for (int i = 0; i < inputs.length; i++) {
        inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
    }
    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);
    RunningJob runjob = JobClient.runJob(job);
    Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
    for (int i = 0; i < resultIndexes.length; i++)
        stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
    //cleanups
    String dir = dimsUnknownFilePrefix + "/" + runjob.getID().toString() + "_dimsFile";
    stats = MapReduceTool.processDimsFiles(dir, stats);
    MapReduceTool.deleteFileIfExistOnHDFS(dir);
    if (resetDistCache)
        MRBaseForCommonInstructions.resetDistCache();
    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}
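In the value-pick branch above, the pick vector (ins.input2) is consumed by PickFromCompactInputFormat rather than fed to the job as a regular input, so runJob compacts all of the parallel input arrays to drop that entry while keeping the remaining inputs in order. A minimal standalone sketch of the same remapping, using illustrative names that are not part of the project:

public class CompactInputsSketch {

    // Drop the entry at position 'skip' from a parallel array; surviving entries
    // keep their relative order, mirroring the realIndex loop in runJob above.
    static String[] dropEntry(String[] arr, int skip) {
        String[] out = new String[arr.length - 1];
        int j = 0;
        for (int i = 0; i < arr.length; i++) {
            if (i == skip)
                continue; // the pick vector is read via the input format, not as a job input
            out[j++] = arr[i];
        }
        return out;
    }

    public static void main(String[] args) {
        String[] inputs = { "hdfs:/tmp/sorted", "hdfs:/tmp/quantiles" };
        // prints: hdfs:/tmp/sorted
        System.out.println(String.join(", ", dropEntry(inputs, 1)));
    }
}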
Use of org.apache.sysml.runtime.matrix.data.NumItemsByEachReducerMetaData in project incubator-systemml by apache.
The class CacheableData, method toString.
public String toString() {
    StringBuilder str = new StringBuilder();
    str.append(getClass().getSimpleName());
    str.append(": ");
    str.append(_hdfsFileName + ", ");
    if (_metaData instanceof NumItemsByEachReducerMetaData) {
        str.append("NumItemsByEachReducerMetaData");
    } else {
        try {
            MatrixFormatMetaData md = (MatrixFormatMetaData) _metaData;
            if (md != null) {
                MatrixCharacteristics mc = ((MatrixDimensionsMetaData) _metaData).getMatrixCharacteristics();
                str.append(mc.toString());
                InputInfo ii = md.getInputInfo();
                if (ii == null)
                    str.append("null");
                else {
                    str.append(", ");
                    str.append(InputInfo.inputInfoToString(ii));
                }
            } else {
                str.append("null, null");
            }
        } catch (Exception ex) {
            LOG.error(ex);
        }
    }
    str.append(", ");
    str.append(isDirty() ? "dirty" : "not-dirty");
    return str.toString();
}
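For a matrix object backed by regular format metadata, the resulting string has the shape "<simple class name>: <hdfs file>, <matrix characteristics>, <input format>, <dirty flag>". An illustrative example of such a line (the exact MatrixCharacteristics and InputInfo renderings are assumptions, not captured from a real run):

MatrixObject: hdfs:/user/systemml/X, [1000 x 100, nnz=5000, blocks (1000 x 1000)], binaryblock, not-dirty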
Use of org.apache.sysml.runtime.matrix.data.NumItemsByEachReducerMetaData in project incubator-systemml by apache.
The class MRJobInstruction, method populateInputs.
/**
 * Auxiliary data structures that store information required to spawn MR jobs.
 * These data structures are populated by pulling out information from the
 * symbol table, more specifically from information stored in
 * <code>inputMatrices</code> and <code>outputMatrices</code>.
 */
private void populateInputs() {
    // Since inputVars can potentially contain scalar variables,
    // auxiliary data structures of size <code>inputMatrices.length</code>
    // are allocated instead of size <code>inputVars.length</code>
    // Allocate space
    inputs = new String[inputMatrices.length];
    inputInfos = new InputInfo[inputMatrices.length];
    rlens = new long[inputMatrices.length];
    clens = new long[inputMatrices.length];
    brlens = new int[inputMatrices.length];
    bclens = new int[inputMatrices.length];
    partitioned = new boolean[inputMatrices.length];
    pformats = new PDataPartitionFormat[inputMatrices.length];
    psizes = new int[inputMatrices.length];
    // populate information
    for (int i = 0; i < inputMatrices.length; i++) {
        inputs[i] = inputMatrices[i].getFileName();
        MatrixCharacteristics mc = inputMatrices[i].getMatrixCharacteristics();
        rlens[i] = mc.getRows();
        clens[i] = mc.getCols();
        brlens[i] = mc.getRowsPerBlock();
        bclens[i] = mc.getColsPerBlock();
        if (inputMatrices[i].getMetaData() instanceof MatrixFormatMetaData) {
            inputInfos[i] = ((MatrixFormatMetaData) inputMatrices[i].getMetaData()).getInputInfo();
        } else if (inputMatrices[i].getMetaData() instanceof NumItemsByEachReducerMetaData) {
            inputInfos[i] = InputInfo.InputInfoForSortOutput;
            inputInfos[i].metadata = inputMatrices[i].getMetaData();
        }
        partitioned[i] = inputMatrices[i].isPartitioned();
        pformats[i] = inputMatrices[i].getPartitionFormat();
        psizes[i] = inputMatrices[i].getPartitionSize();
    }
}
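The instanceof chain at the end of the loop is the only place where sort output is special-cased. A minimal sketch of that dispatch in isolation, assuming the imports of the surrounding class (the helper method itself is illustrative, not project code):

// Resolve the InputInfo for one input: format-bearing metadata reports its own
// InputInfo, while sort output (NumItemsByEachReducerMetaData) is read through
// InputInfo.InputInfoForSortOutput with the counts-per-reducer metadata attached.
static InputInfo resolveInputInfo(MetaData md) {
    if (md instanceof MatrixFormatMetaData)
        return ((MatrixFormatMetaData) md).getInputInfo();
    if (md instanceof NumItemsByEachReducerMetaData) {
        InputInfo ii = InputInfo.InputInfoForSortOutput;
        ii.metadata = md; // consumed later by the PickFromCompactInputFormat setup in GMR.runJob
        return ii;
    }
    return null; // scalar or unknown metadata: no file input to configure
}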
Use of org.apache.sysml.runtime.matrix.data.NumItemsByEachReducerMetaData in project incubator-systemml by apache.
The class QuantilePickCPInstruction, method processInstruction.
@Override
public void processInstruction(ExecutionContext ec) throws DMLRuntimeException {
    switch(_type) {
        case VALUEPICK:
            if (_inmem) {
                //INMEM VALUEPICK
                MatrixBlock matBlock = ec.getMatrixInput(input1.getName());
                if (input2.getDataType() == DataType.SCALAR) {
                    ScalarObject quantile = ec.getScalarInput(input2.getName(), input2.getValueType(), input2.isLiteral());
                    double picked = matBlock.pickValue(quantile.getDoubleValue());
                    ec.setScalarOutput(output.getName(), new DoubleObject(picked));
                } else {
                    MatrixBlock quantiles = ec.getMatrixInput(input2.getName());
                    MatrixBlock resultBlock = (MatrixBlock) matBlock.pickValues(quantiles, new MatrixBlock());
                    quantiles = null;
                    ec.releaseMatrixInput(input2.getName());
                    ec.setMatrixOutput(output.getName(), resultBlock);
                }
                ec.releaseMatrixInput(input1.getName());
            } else {
                //MR VALUEPICK
                MatrixObject mat = ec.getMatrixObject(input1.getName());
                String fname = mat.getFileName();
                MetaData mdata = mat.getMetaData();
                ScalarObject pickindex = ec.getScalarInput(input2.getName(), input2.getValueType(), input2.isLiteral());
                if (mdata != null) {
                    try {
                        double picked = MapReduceTool.pickValue(fname, (NumItemsByEachReducerMetaData) mdata, pickindex.getDoubleValue());
                        ec.setVariable(output.getName(), new DoubleObject(picked));
                    } catch (Exception e) {
                        throw new DMLRuntimeException(e);
                    }
                } else {
                    throw new DMLRuntimeException("Unexpected error while executing ValuePickCP: otherMetaData for file (" + fname + ") not found.");
                }
            }
            break;
        case MEDIAN:
            if (_inmem) {
                //INMEM MEDIAN
                double picked = ec.getMatrixInput(input1.getName()).median();
                ec.setScalarOutput(output.getName(), new DoubleObject(picked));
                ec.releaseMatrixInput(input1.getName());
                break;
            } else {
                //MR MEDIAN
                MatrixObject mat1 = (MatrixObject) ec.getVariable(input1.getName());
                String fname1 = mat1.getFileName();
                MetaData mdata1 = mat1.getMetaData();
                if (mdata1 != null) {
                    try {
                        double median = MapReduceTool.median(fname1, (NumItemsByEachReducerMetaData) mdata1);
                        ec.setVariable(output.getName(), new DoubleObject(median));
                    } catch (Exception e) {
                        throw new DMLRuntimeException(e);
                    }
                } else {
                    throw new DMLRuntimeException("Unexpected error while executing ValuePickCP: otherMetaData for file (" + fname1 + ") not found.");
                }
            }
            break;
        case IQM:
            if (_inmem) {
                //INMEM IQM
                MatrixBlock matBlock1 = ec.getMatrixInput(input1.getName());
                double iqm = matBlock1.interQuartileMean();
                ec.releaseMatrixInput(input1.getName());
                ec.setScalarOutput(output.getName(), new DoubleObject(iqm));
            } else {
                //MR IQM
                MatrixObject inputMatrix = (MatrixObject) ec.getVariable(input1.getName());
                ScalarObject iqsum = ec.getScalarInput(input2.getName(), input2.getValueType(), input2.isLiteral());
                double[] q25 = null;
                double[] q75 = null;
                try {
                    q25 = MapReduceTool.pickValueWeight(inputMatrix.getFileName(), (NumItemsByEachReducerMetaData) inputMatrix.getMetaData(), 0.25, false);
                    q75 = MapReduceTool.pickValueWeight(inputMatrix.getFileName(), (NumItemsByEachReducerMetaData) inputMatrix.getMetaData(), 0.75, false);
                } catch (IOException e1) {
                    throw new DMLRuntimeException(e1);
                }
                double sumwt = UtilFunctions.getTotalLength((NumItemsByEachReducerMetaData) ec.getMetaData(input1.getName()));
                double q25d = sumwt * 0.25;
                double q75d = sumwt * 0.75;
                // iqsum = interQuartileSum that includes complete portions of q25 and q75;
                // exclude the top portion of q25 and the bottom portion of q75
                double q25entry_weight = q25[0] * q25[1];
                double q25portion_include = (q25[2] - q25d) * q25[0];
                double q25portion_exclude = q25entry_weight - q25portion_include;
                double q75portion_exclude = (q75[2] - q75d) * q75[0];
                double mriqm = (iqsum.getDoubleValue() - q25portion_exclude - q75portion_exclude) / (sumwt * 0.5);
                ec.setScalarOutput(output.getName(), new DoubleObject(mriqm));
            }
            break;
        default:
            throw new DMLRuntimeException("Unsupported qpick operation type: " + _type);
    }
}
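The MR IQM branch corrects the precomputed inter-quartile sum at both quartile boundaries before normalizing by half the total weight. A worked numeric sketch of that correction (the reading of the q25/q75 triples as {value, weight, cumulative weight} is inferred from their use above, not confirmed elsewhere):

// Toy weighted list with total weight 8, so q25d = 2 and q75d = 6.
double sumwt = 8.0, q25d = 2.0, q75d = 6.0;
double[] q25 = { 3.0, 2.0, 3.0 };  // 0.25-quantile entry: value 3, weight 2, cumulative weight 3
double[] q75 = { 9.0, 1.0, 6.5 };  // 0.75-quantile entry: value 9, weight 1, cumulative weight 6.5
double iqsum = 24.0;               // inter-quartile sum including both boundary entries in full
double q25portion_include = (q25[2] - q25d) * q25[0];             // (3 - 2) * 3 = 3 stays
double q25portion_exclude = q25[0] * q25[1] - q25portion_include; // 6 - 3 = 3 removed
double q75portion_exclude = (q75[2] - q75d) * q75[0];             // (6.5 - 6) * 9 = 4.5 removed
double iqm = (iqsum - q25portion_exclude - q75portion_exclude) / (sumwt * 0.5);
// iqm = (24 - 3 - 4.5) / 4 = 4.125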