use of org.apache.sysml.runtime.matrix.data.InputInfo in project incubator-systemml by apache.
the class DynamicReadMatrixRcCP method execute.
/**
 * Reads a matrix from HDFS and exposes it, together with a return code, as the
 * outputs of this external function.
 *
 * Function inputs by position: 0 = file name, 1 = number of rows,
 * 2 = number of columns, 3 = input format name.
 *
 * On success the return code scalar is "0"; if anything fails it is "1" and
 * no exception is propagated to the caller.
 */
@Override
public void execute() {
    try {
        final String fname = ((Scalar) this.getFunctionInput(0)).getValue();
        final int m = Integer.parseInt(((Scalar) this.getFunctionInput(1)).getValue());
        final int n = Integer.parseInt(((Scalar) this.getFunctionInput(2)).getValue());
        final String format = ((Scalar) this.getFunctionInput(3)).getValue();

        final InputInfo ii = InputInfo.stringToInputInfo(format);
        final OutputInfo oi = OutputInfo.BinaryBlockOutputInfo;

        // the result matrix is registered under a temporary output file name
        final String fnameTmp = createOutputFilePathAndName("TMP");
        _ret = new Matrix(fnameTmp, m, n, ValueType.Double);

        // read the whole matrix into memory and hand the block over to the result
        final int brlen = ConfigurationManager.getBlocksize();
        final int bclen = ConfigurationManager.getBlocksize();
        final MatrixBlock mbTmp = DataConverter.readMatrixFromHDFS(fname, ii, m, n, brlen, bclen);
        _ret.setMatrixDoubleArray(mbTmp, oi, ii);

        // NOTE: The packagesupport wrapper creates a new MatrixObjectNew with the given
        // matrix block, which leaves the new object in a dirty state. Hence, the resulting
        // intermediate plan variable is exported in front of MR jobs, and during this export
        // the format is changed to binary block (the contract of external functions),
        // no matter in which format the original matrix was.
        _rc = new Scalar(ScalarValueType.Integer, "0");
    } catch (Exception e) {
        // failure is reported through the return code instead of an exception
        _rc = new Scalar(ScalarValueType.Integer, "1");
    }
}
use of org.apache.sysml.runtime.matrix.data.InputInfo in project incubator-systemml by apache.
the class MRJobConfiguration method setUpMultipleInputs.
/**
 * Registers the input matrices of an MR job: stores the input directories and
 * their indexes in the job configuration, optionally sets the per-input
 * converter information, and adds every distinct, non-broadcast input path
 * to the job together with its input format.
 *
 * @param job job configuration to modify
 * @param inputIndexes index of each input matrix
 * @param inputs input paths
 * @param inputInfos input format information, one entry per input
 * @param brlens number of rows per block, one entry per input
 * @param bclens number of columns per block, one entry per input
 * @param distCacheOnly true for inputs that are only required in the distributed cache
 * @param setConverter if true, the input info is set for every input
 * @param target conversion target passed on when the input info is set
 * @throws Exception if the number of inputs and input infos differ
 */
public static void setUpMultipleInputs(JobConf job, byte[] inputIndexes, String[] inputs, InputInfo[] inputInfos, int[] brlens, int[] bclens, boolean[] distCacheOnly, boolean setConverter, ConvertTarget target) throws Exception {
    final int numInputs = inputs.length;
    if (numInputs != inputInfos.length) {
        throw new Exception("number of inputs and inputInfos does not match");
    }

    // names of the input matrices and their indexes
    job.setStrings(INPUT_MATRICIES_DIRS_CONFIG, inputs);
    MRJobConfiguration.setMapFunctionInputMatrixIndexes(job, inputIndexes);

    // converter infos (converter determined implicitly)
    if (setConverter) {
        for (int k = 0; k < numInputs; k++) {
            setInputInfo(job, inputIndexes[k], inputInfos[k], brlens[k], bclens[k], target);
        }
    }

    // keep each path once and drop pure broadcast variables
    ArrayList<Path> uniquePaths = new ArrayList<>();
    ArrayList<InputInfo> uniqueInfos = new ArrayList<>();
    for (int k = 0; k < numInputs; k++) {
        Path candidate = new Path(inputs[k]);
        boolean alreadyListed = uniquePaths.contains(candidate);
        if (!alreadyListed && !distCacheOnly[k]) {
            uniquePaths.add(candidate);
            uniqueInfos.add(inputInfos[k]);
        }
    }

    boolean useCombineFormat = false;
    if (OptimizerUtils.ALLOW_COMBINE_FILE_INPUT_FORMAT) {
        // total size over all given inputs
        double totalBytes = 0;
        for (String input : inputs) {
            totalBytes += MapReduceTool.getFilesizeOnHDFS(new Path(input));
        }

        // The max split size is raised to 2x the HDFS block size only if
        // (1) the degree of parallelism is not hurt,
        // (2) the sort buffer is large enough (relevant for pass-through of,
        //     potentially modified, inputs to the reducers), and
        // (3) there is a single input besides broadcasts (internal runtime
        //     assumptions relate meta data to inputs).
        long sortBufferSize = InfrastructureAnalyzer.getRemoteMaxMemorySortBuffer();
        long hdfsBlockSize = InfrastructureAnalyzer.getHDFSBlockSize();
        long doubledSplitSize = hdfsBlockSize * 2;
        // generic config api is used for backwards compatibility
        double spillPercent = Double.parseDouble(job.get(MRConfigurationNames.MR_MAP_SORT_SPILL_PERCENT, "1.0"));
        int numMappers = OptimizerUtils.getNumMappers();

        boolean parallelismKept = numMappers < totalBytes / doubledSplitSize;
        boolean sortBufferSuffices = sortBufferSize * spillPercent >= doubledSplitSize;
        boolean singleInput = uniquePaths.size() == 1;
        if (parallelismKept && sortBufferSuffices && singleInput) {
            job.setLong(MRConfigurationNames.MR_INPUT_FILEINPUTFORMAT_SPLIT_MAXSIZE, doubledSplitSize);
            useCombineFormat = true;
        }
    }

    // add the remaining inputs to the job (incl input format configuration)
    for (int k = 0; k < uniquePaths.size(); k++) {
        Path path = uniquePaths.get(k);
        InputInfo info = uniqueInfos.get(k);
        // for binaryblock, CombineSequenceFileInputFormat is used to reduce task latency
        if (useCombineFormat && info == InputInfo.BinaryBlockInputInfo) {
            MultipleInputs.addInputPath(job, path, CombineSequenceFileInputFormat.class);
        } else {
            MultipleInputs.addInputPath(job, path, info.inputFormatClass);
        }
    }
}
use of org.apache.sysml.runtime.matrix.data.InputInfo in project incubator-systemml by apache.
the class GMR method runJob.
/**
 * Configures and runs the generic MapReduce job ("G-MR") and returns the
 * characteristics of its results.
 *
 * The method builds a JobConf, optionally rewrites the input meta data for a
 * record reader (pick-by-count) instruction, configures inputs, instructions,
 * outputs and mapper/combiner/reducer classes, runs the job synchronously and
 * finally collects the non-zero counters and the dimension files of the results.
 *
 * @param inst MR job instruction (only used to print the complete instruction when trace logging is enabled)
 * @param inputs input matrices, the inputs are indexed by 0, 1, 2, .. based on the position in this string
 * @param inputInfos the input format information for the input matrices
 * @param rlens array of number of rows
 * @param clens array of number of columns
 * @param brlens array of number of rows in block
 * @param bclens array of number of columns in block
 * @param partitioned boolean array of partitioned status (not referenced in this method body)
 * @param pformats array of data partition formats
 * @param psizes not referenced in this method body
 * @param recordReaderInstruction record reader instruction; may be null or empty
 * @param instructionsInMapper in Mapper, the set of unary operations that need to be performed on each input matrix
 * @param aggInstructionsInReducer in Reducer, right after sorting, the set of aggregate operations
 * that need to be performed on each input matrix
 * @param otherInstructionsInReducer the mixed operations that need to be performed on matrices after the aggregate operations
 * @param numReducers the number of reducers; 0 configures generic Writable map output classes and no combiner
 * @param replication the replication factor for the output
 * @param jvmReuse if true, the number of tasks to execute per JVM is set to -1
 * @param resultIndexes the indexes of the result matrices that needs to be outputted
 * @param dimsUnknownFilePrefix file path prefix when dimensions unknown
 * @param outputs the names for the output directories, one for each result index
 * @param outputInfos output format information for the output matrices
 * @return job return object holding the result characteristics, the output infos and the job success flag
 * @throws Exception if Exception occurs
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens, long[] clens, int[] brlens, int[] bclens, boolean[] partitioned, PDataPartitionFormat[] pformats, int[] psizes, String recordReaderInstruction, String instructionsInMapper, String aggInstructionsInReducer, String otherInstructionsInReducer, int numReducers, int replication, boolean jvmReuse, byte[] resultIndexes, String dimsUnknownFilePrefix, String[] outputs, OutputInfo[] outputInfos) throws Exception {
JobConf job = new JobConf(GMR.class);
job.setJobName("G-MR");
boolean inBlockRepresentation = MRJobConfiguration.deriveRepresentation(inputInfos);
// whether use block representation or cell representation
MRJobConfiguration.setMatrixValueClass(job, inBlockRepresentation);
// added for handling recordreader instruction
// NOTE: the "real*" variables start out as aliases of the caller's arrays (same references);
// only the value-pick branch below replaces them with freshly allocated arrays.
String[] realinputs = inputs;
InputInfo[] realinputInfos = inputInfos;
long[] realrlens = rlens;
long[] realclens = clens;
int[] realbrlens = brlens;
int[] realbclens = bclens;
// by default, input i gets index i
byte[] realIndexes = new byte[inputs.length];
for (byte b = 0; b < realIndexes.length; b++) realIndexes[b] = b;
if (recordReaderInstruction != null && !recordReaderInstruction.isEmpty()) {
// only checked when assertions are enabled
assert (inputs.length <= 2);
PickByCountInstruction ins = (PickByCountInstruction) PickByCountInstruction.parseInstruction(recordReaderInstruction);
// the record reader replaces the regular input format with PickFromCompactInputFormat,
// configured from the key/value classes and meta data of input1
PickFromCompactInputFormat.setKeyValueClasses(job, (Class<? extends WritableComparable>) inputInfos[ins.input1].inputKeyClass, inputInfos[ins.input1].inputValueClass);
job.setInputFormat(PickFromCompactInputFormat.class);
PickFromCompactInputFormat.setZeroValues(job, (MetaDataNumItemsByEachReducer) inputInfos[ins.input1].metadata);
if (ins.isValuePick) {
// value pick: input2 is read here as a column vector and passed to the input format,
// so it is removed from the job inputs below
double[] probs = MapReduceTool.readColumnVectorFromHDFS(inputs[ins.input2], inputInfos[ins.input2], rlens[ins.input2], clens[ins.input2], brlens[ins.input2], bclens[ins.input2]);
PickFromCompactInputFormat.setPickRecordsInEachPartFile(job, (MetaDataNumItemsByEachReducer) inputInfos[ins.input1].metadata, probs);
// new arrays with one entry less (all inputs except input2)
realinputs = new String[inputs.length - 1];
realinputInfos = new InputInfo[inputs.length - 1];
realrlens = new long[inputs.length - 1];
realclens = new long[inputs.length - 1];
realbrlens = new int[inputs.length - 1];
realbclens = new int[inputs.length - 1];
realIndexes = new byte[inputs.length - 1];
byte realIndex = 0;
for (byte i = 0; i < inputs.length; i++) {
if (i == ins.input2)
continue;
realinputs[realIndex] = inputs[i];
realinputInfos[realIndex] = inputInfos[i];
if (i == ins.input1) {
// input1 takes the dimensions of input2, block size 1x1, and the index of the instruction output
realrlens[realIndex] = rlens[ins.input2];
realclens[realIndex] = clens[ins.input2];
realbrlens[realIndex] = 1;
realbclens[realIndex] = 1;
realIndexes[realIndex] = ins.output;
} else {
// all other inputs keep their own meta data and index
realrlens[realIndex] = rlens[i];
realclens[realIndex] = clens[i];
realbrlens[realIndex] = brlens[i];
realbclens[realIndex] = bclens[i];
realIndexes[realIndex] = i;
}
realIndex++;
}
} else {
// PickFromCompactInputFormat.setPickRecordsInEachPartFile(job, (NumItemsByEachReducerMetaData) inputInfos[ins.input1].metadata, ins.cst, 1-ins.cst);
// range pick between ins.cst and 1 - ins.cst
PickFromCompactInputFormat.setRangePickPartFiles(job, (MetaDataNumItemsByEachReducer) inputInfos[ins.input1].metadata, ins.cst, 1 - ins.cst);
// NOTE(review): in this branch realrlens/realclens/realbrlens/realbclens still reference the
// caller's rlens/clens/brlens/bclens, so the writes below modify the arrays passed in by the
// caller (the realclens assignment is a self-assignment). Confirm that callers expect this.
realrlens[ins.input1] = UtilFunctions.getLengthForInterQuantile((MetaDataNumItemsByEachReducer) inputInfos[ins.input1].metadata, ins.cst);
realclens[ins.input1] = clens[ins.input1];
realbrlens[ins.input1] = 1;
realbclens[ins.input1] = 1;
realIndexes[ins.input1] = ins.output;
}
}
// the returned flag decides whether the distributed cache is reset after the job (see end of method)
boolean resetDistCache = setupDistributedCache(job, instructionsInMapper, otherInstructionsInReducer, realinputs, realrlens, realclens);
// set up the input files and their format information
boolean[] distCacheOnly = getDistCacheOnlyInputs(realIndexes, recordReaderInstruction, instructionsInMapper, aggInstructionsInReducer, otherInstructionsInReducer);
MRJobConfiguration.setUpMultipleInputs(job, realIndexes, realinputs, realinputInfos, realbrlens, realbclens, distCacheOnly, true, inBlockRepresentation ? ConvertTarget.BLOCK : ConvertTarget.CELL);
MRJobConfiguration.setInputPartitioningInfo(job, pformats);
// set up the dimensions of input matrices
MRJobConfiguration.setMatricesDimensions(job, realIndexes, realrlens, realclens);
MRJobConfiguration.setDimsUnknownFilePrefix(job, dimsUnknownFilePrefix);
// set up the block size
MRJobConfiguration.setBlocksSizes(job, realIndexes, realbrlens, realbclens);
// set up unary instructions that will perform in the mapper
MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper);
// set up the aggregate instructions that will happen in the combiner and reducer
MRJobConfiguration.setAggregateInstructions(job, aggInstructionsInReducer);
// set up the instructions that will happen in the reducer, after the aggregation instructions
MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);
// set up the replication factor for the results
job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
// set up preferred custom serialization framework for binary block format
if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
MRJobConfiguration.addBinaryBlockSerializationFramework(job);
// set up map/reduce memory configurations (if in AM context)
DMLConfig config = ConfigurationManager.getDMLConfig();
DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);
// set up custom map/reduce configurations
MRJobConfiguration.setupCustomMRConfigurations(job, config);
// set up jvm reuse (incl. reuse of loaded dist cache matrices)
if (jvmReuse)
job.setNumTasksToExecutePerJvm(-1);
// set up what matrices are needed to pass from the mapper to reducer
HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, instructionsInMapper, aggInstructionsInReducer, otherInstructionsInReducer, resultIndexes);
// yields the result characteristics (stats) and the number of reducer groups used below
MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, instructionsInMapper, aggInstructionsInReducer, null, otherInstructionsInReducer, resultIndexes, mapoutputIndexes, false);
MatrixCharacteristics[] stats = ret.stats;
// set up the number of reducers
MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers);
// Print the complete instruction
if (LOG.isTraceEnabled())
inst.printCompleteMRJobInstruction(stats);
// Update resultDimsUnknown based on computed "stats"
// (flag is 1 if the row or the column count of result i is -1, otherwise 0)
byte[] dimsUnknown = new byte[resultIndexes.length];
for (int i = 0; i < resultIndexes.length; i++) {
if (stats[i].getRows() == -1 || stats[i].getCols() == -1) {
dimsUnknown[i] = (byte) 1;
} else {
dimsUnknown[i] = (byte) 0;
}
}
// MRJobConfiguration.updateResultDimsUnknown(job,resultDimsUnknown);
// set up the multiple output files, and their format information
MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, dimsUnknown, outputs, outputInfos, inBlockRepresentation, true);
// configure mapper and the mapper output key value pairs
job.setMapperClass(GMRMapper.class);
if (numReducers == 0) {
// no reducers requested: generic Writable key/value classes for the map output
job.setMapOutputKeyClass(Writable.class);
job.setMapOutputValueClass(Writable.class);
} else {
// with reducers: map output is keyed by matrix indexes, value class depends on the representation
job.setMapOutputKeyClass(MatrixIndexes.class);
if (inBlockRepresentation)
job.setMapOutputValueClass(TaggedMatrixBlock.class);
else
job.setMapOutputValueClass(TaggedMatrixPackedCell.class);
}
// set up combiner
// (only with reducers and a non-empty set of aggregate instructions)
if (numReducers != 0 && aggInstructionsInReducer != null && !aggInstructionsInReducer.isEmpty()) {
job.setCombinerClass(GMRCombiner.class);
}
// configure reducer
job.setReducerClass(GMRReducer.class);
// job.setReducerClass(PassThroughReducer.class);
// By default, the job executes in "cluster" mode.
// Determine if we can optimize and run it in "local" mode.
// NOTE(review): inputStats is populated here but never read afterwards in this method;
// it is built from the original rlens/clens/brlens/bclens, not from the "real*" arrays.
MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
for (int i = 0; i < inputs.length; i++) {
inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
}
// set unique working dir
MRJobConfiguration.setUniqueWorkingDir(job);
// submit the job and obtain its handle
RunningJob runjob = JobClient.runJob(job);
// number of non-zeros per result, read from the job counter named after the result position i
Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
for (int i = 0; i < resultIndexes.length; i++) stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
// cleanups
// (the per-job dims file directory is processed into stats and then deleted)
String dir = dimsUnknownFilePrefix + "/" + runjob.getID().toString() + "_dimsFile";
stats = MapReduceTool.processDimsFiles(dir, stats);
MapReduceTool.deleteFileIfExistOnHDFS(dir);
if (resetDistCache)
MRBaseForCommonInstructions.resetDistCache();
return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}
use of org.apache.sysml.runtime.matrix.data.InputInfo in project incubator-systemml by apache.
the class GroupedAggMR method runJob.
/**
 * Configures and runs the grouped aggregate MapReduce job ("GroupedAgg-MR")
 * in cell representation and returns the characteristics of its results.
 *
 * @param inst MR job instruction (only used to print the complete instruction when trace logging is enabled)
 * @param inputs input matrices, indexed by their position in this array
 * @param inputInfos the input format information for the input matrices
 * @param rlens array of number of rows
 * @param clens array of number of columns
 * @param brlens array of number of rows in block
 * @param bclens array of number of columns in block
 * @param grpAggInstructions grouped aggregate instructions for the combiner and reducer
 * @param simpleReduceInstructions reducer instructions after the aggregation (only scalar or reorg instructions allowed)
 * @param numReducers the number of reducers
 * @param replication the replication factor for the output
 * @param resultIndexes the indexes of the result matrices
 * @param dimsUnknownFilePrefix file path prefix when dimensions unknown
 * @param outputs the names for the output directories, one for each result index
 * @param outputInfos output format information for the output matrices
 * @return job return object holding the result characteristics, the output infos and the job success flag
 * @throws Exception if Exception occurs
 */
public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens, long[] clens, int[] brlens, int[] bclens, String grpAggInstructions, String simpleReduceInstructions, /*only scalar or reorg instructions allowed*/
int numReducers, int replication, byte[] resultIndexes, String dimsUnknownFilePrefix, String[] outputs, OutputInfo[] outputInfos) throws Exception {
    final int numResults = resultIndexes.length;

    JobConf job = new JobConf(GroupedAggMR.class);
    job.setJobName("GroupedAgg-MR");

    // this job always works on the cell representation
    MRJobConfiguration.setMatrixValueClass(job, false);

    // input i is addressed by index i
    byte[] inputIndexes = new byte[inputs.length];
    for (byte idx = 0; idx < inputIndexes.length; idx++) {
        inputIndexes[idx] = idx;
    }

    // input files and their format information
    MRJobConfiguration.setUpMultipleInputs(job, inputIndexes, inputs, inputInfos, brlens, bclens, true, ConvertTarget.WEIGHTEDCELL);

    // dimensions and block sizes of the input matrices
    MRJobConfiguration.setMatricesDimensions(job, inputIndexes, rlens, clens);
    MRJobConfiguration.setDimsUnknownFilePrefix(job, dimsUnknownFilePrefix);
    MRJobConfiguration.setBlocksSizes(job, inputIndexes, brlens, bclens);

    // grouped aggregate instructions (combiner and reducer), followed by the
    // instructions that run in the reducer after the aggregation
    MRJobConfiguration.setGroupedAggInstructions(job, grpAggInstructions);
    MRJobConfiguration.setInstructionsInReducer(job, simpleReduceInstructions);

    // number of reducers and replication factor for the results
    MRJobConfiguration.setNumReducers(job, numReducers, numReducers);
    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);

    // custom map/reduce configurations
    DMLConfig config = ConfigurationManager.getDMLConfig();
    MRJobConfiguration.setupCustomMRConfigurations(job, config);

    // matrices that need to be passed from the mapper to the reducer
    MRJobConfiguration.setUpOutputIndexesForMapper(job, inputIndexes, null, null, grpAggInstructions, resultIndexes);

    MatrixCharacteristics[] stats = new MatrixCharacteristics[numResults];
    for (int r = 0; r < numResults; r++) {
        stats[r] = new MatrixCharacteristics();
    }

    // print the complete instruction
    if (LOG.isTraceEnabled()) {
        inst.printCompleteMRJobInstruction(stats);
    }

    // every result is flagged with the dims-unknown code 2
    byte[] resultDimsUnknown = new byte[numResults];
    for (int r = 0; r < numResults; r++) {
        resultDimsUnknown[r] = (byte) 2;
    }

    // output files and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos, false);

    // mapper, combiner and the mapper output key/value classes
    job.setMapperClass(GroupedAggMRMapper.class);
    job.setCombinerClass(GroupedAggMRCombiner.class);
    job.setMapOutputKeyClass(TaggedMatrixIndexes.class);
    job.setMapOutputValueClass(WeightedCell.class);

    // reducer
    job.setReducerClass(GroupedAggMRReducer.class);

    // unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);

    // execute job
    RunningJob runjob = JobClient.runJob(job);

    // fresh characteristics per result, carrying the number of non-zeros from the job counters
    Group nnzCounters = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
    for (int r = 0; r < numResults; r++) {
        MatrixCharacteristics mc = new MatrixCharacteristics();
        stats[r] = mc;
        mc.setNonZeros(nnzCounters.getCounter(Integer.toString(r)));
    }

    // the per-job dims file directory is processed into the stats and then removed
    String dir = dimsUnknownFilePrefix + "/" + runjob.getID().toString() + "_dimsFile";
    stats = MapReduceTool.processDimsFiles(dir, stats);
    MapReduceTool.deleteFileIfExistOnHDFS(dir);

    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}
use of org.apache.sysml.runtime.matrix.data.InputInfo in project incubator-systemml by apache.
the class SortMR method runStitchupJob.
/**
 * Configures and runs the "SortIndexesMR" stitch-up job over a binary block
 * input and reports whether the job succeeded.
 *
 * The per-partition counts are turned into shifted prefix sums (the first
 * offset is 0) and stored in the job configuration under SORT_INDEXES_OFFSETS.
 *
 * @param input input path
 * @param rlen number of rows
 * @param clen number of columns
 * @param brlen number of rows in block
 * @param bclen number of columns in block
 * @param counts counts from which the offsets are computed
 * @param numReducers the number of reducers (ignored in local mode, where 1 is used)
 * @param replication the replication factor for the output
 * @param output output path; an existing file at this path is deleted first
 * @return true if the job was successful
 * @throws Exception if Exception occurs
 */
private static boolean runStitchupJob(String input, long rlen, long clen, int brlen, int bclen, long[] counts, int numReducers, int replication, String output) throws Exception {
    JobConf job = new JobConf(SortMR.class);
    job.setJobName("SortIndexesMR");

    // input/output paths (an existing output is removed)
    Path source = new Path(input);
    Path target = new Path(output);
    FileInputFormat.setInputPaths(job, source);
    FileOutputFormat.setOutputPath(job, target);
    MapReduceTool.deleteFileIfExistOnHDFS(target, job);

    // number of reducers (1 if local mode)
    boolean localMode = InfrastructureAnalyzer.isLocalMode(job);
    if (!localMode) {
        MRJobConfiguration.setNumReducers(job, numReducers, numReducers);
    } else {
        job.setNumReduceTasks(1);
    }

    // input/output format: binary block on both sides
    final InputInfo blockIn = InputInfo.BinaryBlockInputInfo;
    final OutputInfo blockOut = OutputInfo.BinaryBlockOutputInfo;
    job.setInputFormat(blockIn.inputFormatClass);
    job.setOutputFormat(blockOut.outputFormatClass);
    CompactInputFormat.setKeyValueClasses(job, MatrixIndexes.class, MatrixBlock.class);

    // mapper/reducer/output classes
    MRJobConfiguration.setInputInfo(job, (byte) 0, blockIn, brlen, bclen, ConvertTarget.BLOCK);
    job.setMapperClass(IndexSortStitchupMapper.class);
    job.setReducerClass(IndexSortStitchupReducer.class);
    job.setOutputKeyClass(blockOut.outputKeyClass);
    job.setOutputValueClass(blockOut.outputValueClass);
    MRJobConfiguration.setBlockSize(job, (byte) 0, brlen, bclen);
    MRJobConfiguration.setMatricesDimensions(job, new byte[] { 0 }, new long[] { rlen }, new long[] { clen });

    // shifted prefix sum of the counts: offsets[i] = counts[0] + ... + counts[i-1]
    long[] offsets = new long[counts.length];
    long running = 0;
    int pos = 0;
    for (long count : counts) {
        offsets[pos++] = running;
        running += count;
    }
    job.set(SORT_INDEXES_OFFSETS, Arrays.toString(offsets));

    // replication factor
    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);

    // unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);

    // run mr job
    return JobClient.runJob(job).isSuccessful();
}
Aggregations