Use of org.apache.sysml.runtime.matrix.CSVReblockMR.OffsetCount in project incubator-systemml by apache.
In class TfUtils, method getPartFileID.
/**
 * Function to generate custom file names (transform-part-.....) for
 * mappers' output for the ApplyTfCSV job. The idea is to find the index
 * of (thisfile, fileoffset) in the list of all offsets from the
 * counters/offsets file, which was generated by either the GenTfMtdMR
 * or the AssignRowIDMR job.
 *
 * @param job job configuration
 * @param offset file offset
 * @return part file id (e.g., 00001, 00002)
 * @throws IOException if IOException occurs
 */
public String getPartFileID(JobConf job, long offset) throws IOException {
    Reader reader = null;
    int id = 0;
    try {
        // scan the offsets file, counting entries until (thisfile, offset) is found
        reader = initOffsetsReader(job);
        ByteWritable key = new ByteWritable();
        OffsetCount value = new OffsetCount();
        String thisFile = TfUtils.getPartFileName(job);
        while (reader.next(key, value)) {
            if (thisFile.equals(value.filename) && value.fileOffset == offset)
                break;
            id++;
        }
    } finally {
        IOUtilFunctions.closeSilently(reader);
    }
    // left-pad the numeric id with zeros to a fixed width of five characters
    String sid = Integer.toString(id);
    char[] carr = new char[5 - sid.length()];
    Arrays.fill(carr, '0');
    String ret = (new String(carr)).concat(sid);
    return ret;
}
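The trailing padding logic builds the fixed-width id by hand; a format string achieves the same result in one line, and also avoids the NegativeArraySizeException the manual version would throw for ids of six or more digits. A minimal equivalent sketch (a simplification, not the project's code):

    // equivalent fixed-width padding via a format string
    String sid = String.format("%05d", id);  // e.g., 7 -> "00007"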
Use of org.apache.sysml.runtime.matrix.CSVReblockMR.OffsetCount in project incubator-systemml by apache.
In class GTFMTDReducer, method reduce.
@Override
public void reduce(IntWritable key, Iterator<DistinctValue> values, OutputCollector<Text, LongWritable> output, Reporter reporter) throws IOException {
    FileSystem fs = FileSystem.get(_rJob);
    int colID = key.get();
    if (colID < 0) {
        // process mapper output for MV and Bin agents
        colID = colID * -1;
        _agents.getMVImputeAgent().mergeAndOutputTransformationMetadata(values, _agents.getTfMtdDir(), colID, fs, _agents);
    } else if (colID == _agents.getNumCols() + 1) {
        // process mapper output for OFFSET_FILE
        ArrayList<OffsetCount> list = new ArrayList<OffsetCount>();
        while (values.hasNext())
            list.add(new OffsetCount(values.next().getOffsetCount()));
        long numTfRows = generateOffsetsFile(list);
        reporter.incrCounter(MRJobConfiguration.DataTransformCounters.TRANSFORMED_NUM_ROWS, numTfRows);
    } else {
        // process mapper output for Recode agent
        _agents.getRecodeAgent().mergeAndOutputTransformationMetadata(values, _agents.getTfMtdDir(), colID, fs, _agents);
    }
}
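generateOffsetsFile is not shown in this listing; based on the sorting logic in CSVAssignRowIDReducer further down, it presumably sorts the collected OffsetCount entries and rewrites per-part row counts as cumulative offsets before persisting them. A minimal sketch of that pattern, under that assumption (not the project's actual code):

    // Assumed behavior (sketch): sort by (filename, fileOffset), then replace each
    // per-part row count with the running total of rows seen before that part.
    Collections.sort(list);
    long numRows = 0;
    for (OffsetCount oc : list) {
        long cnt = oc.count;  // rows contributed by this part
        oc.count = numRows;   // cumulative row offset at which this part starts
        numRows += cnt;
    }
    // numRows would be the total number of transformed rows returned to the caller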
Use of org.apache.sysml.runtime.matrix.CSVReblockMR.OffsetCount in project incubator-systemml by apache.
In class ApplyTfBBMapper, method configure.
@Override
public void configure(JobConf job) {
    super.configure(job);
    try {
        _partFileWithHeader = TfUtils.isPartFileWithHeader(job);
        tfmapper = new TfUtils(job);
        tfmapper.loadTfMetadata(job, true);
        // load relevant information for CSV Reblock
        ByteWritable key = new ByteWritable();
        OffsetCount value = new OffsetCount();
        Path p = new Path(job.get(CSVReblockMR.ROWID_FILE_NAME));
        Path path = new Path(job.get(MRConfigurationNames.MR_MAP_INPUT_FILE));
        FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
        String thisfile = path.makeQualified(fs).toString();
        SequenceFile.Reader reader = null;
        try {
            reader = new SequenceFile.Reader(fs, p, job);
            while (reader.next(key, value)) {
                // "key" needn't be checked since the offset file has information
                // about a single CSV input (the raw data file)
                if (thisfile.equals(value.filename))
                    offsetMap.put(value.fileOffset, value.count);
            }
        } finally {
            IOUtilFunctions.closeSilently(reader);
        }
        idxRow = new CSVReblockMapper.IndexedBlockRow();
        int maxBclen = 0;
        for (ArrayList<CSVReblockInstruction> insv : csv_reblock_instructions)
            for (CSVReblockInstruction in : insv) {
                if (maxBclen < in.bclen)
                    maxBclen = in.bclen;
            }
        // always dense since common csv usecase
        idxRow.getRow().data.reset(1, maxBclen, false);
    } catch (IOException e) {
        throw new RuntimeException(e);
    } catch (JSONException e) {
        throw new RuntimeException(e);
    }
}
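The offsetMap built above maps each input split's byte offset to the number of rows that precede it; presumably the mapper later uses it to turn split-local line numbers into global row indexes. A hypothetical sketch of that lookup (the helper name and usage are assumptions, not project code):

    // Hypothetical sketch: convert a split-local row number into a global row
    // index, assuming offsetMap maps each split's starting byte offset to the
    // number of CSV rows that precede that split.
    static long globalRowIndex(HashMap<Long, Long> offsetMap, long splitStart, long localRowNum) {
        return offsetMap.get(splitStart) + localRowNum;
    }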
Use of org.apache.sysml.runtime.matrix.CSVReblockMR.OffsetCount in project incubator-systemml by apache.
In class CSVAssignRowIDReducer, method reduce.
@Override
@SuppressWarnings("unchecked")
public void reduce(ByteWritable key, Iterator<OffsetCount> values, OutputCollector<ByteWritable, OffsetCount> out, Reporter report) throws IOException {
    // need to sort the values by filename and fileoffset
    while (values.hasNext())
        list.add(new OffsetCount(values.next()));
    Collections.sort(list);
    long lineOffset = 0;
    for (OffsetCount oc : list) {
        long count = oc.count;
        oc.count = lineOffset;
        out.collect(key, oc);
        lineOffset += count;
    }
    report.incrCounter(CSVReblockMR.NUM_ROWS_IN_MATRIX, key.toString(), lineOffset);
    list.clear();
}
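The loop is an exclusive prefix sum: each OffsetCount arrives carrying a per-split row count and leaves carrying the global line offset at which that split starts. A small standalone illustration with made-up counts:

    // Illustration with made-up counts: three sorted splits holding 100, 50 and 75 rows.
    long[] counts = {100, 50, 75};
    long lineOffset = 0;
    for (long c : counts) {
        System.out.println("split starts at global row " + lineOffset); // 0, 100, 150
        lineOffset += c;
    }
    // lineOffset == 225, the value reported to the NUM_ROWS_IN_MATRIX counter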
Use of org.apache.sysml.runtime.matrix.CSVReblockMR.OffsetCount in project incubator-systemml by apache.
In class CSVReblockMapper, method configure.
@Override
@SuppressWarnings("deprecation")
public void configure(JobConf job) {
    super.configure(job);
    // get the number of columns per block
    // load the offset mapping
    byte matrixIndex = representativeMatrixes.get(0);
    try {
        Path thisPath = new Path(job.get(MRConfigurationNames.MR_MAP_INPUT_FILE));
        FileSystem fs = IOUtilFunctions.getFileSystem(thisPath, job);
        thisPath = thisPath.makeQualified(fs);
        String filename = thisPath.toString();
        Path headerPath = new Path(job.getStrings(CSVReblockMR.SMALLEST_FILE_NAME_PER_INPUT)[matrixIndex]).makeQualified(fs);
        if (headerPath.toString().equals(filename))
            headerFile = true;
        ByteWritable key = new ByteWritable();
        OffsetCount value = new OffsetCount();
        Path p = new Path(job.get(CSVReblockMR.ROWID_FILE_NAME));
        SequenceFile.Reader reader = null;
        try {
            reader = new SequenceFile.Reader(fs, p, job);
            while (reader.next(key, value)) {
                if (key.get() == matrixIndex && filename.equals(value.filename))
                    offsetMap.put(value.fileOffset, value.count);
            }
        } finally {
            IOUtilFunctions.closeSilently(reader);
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    CSVReblockInstruction ins = csv_reblock_instructions.get(0).get(0);
    _delim = ins.delim;
    ignoreFirstLine = ins.hasHeader;
    idxRow = new IndexedBlockRow();
    int maxBclen = 0;
    for (ArrayList<CSVReblockInstruction> insv : csv_reblock_instructions)
        for (CSVReblockInstruction in : insv) {
            if (maxBclen < in.bclen)
                maxBclen = in.bclen;
        }
    // always dense since common csv usecase
    idxRow.getRow().data.reset(1, maxBclen, false);
}
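This scan of the ROWID sequence file is nearly identical to the one in ApplyTfBBMapper.configure above; the shared pattern could be factored into a helper such as the hypothetical sketch below (readOffsets is not a method in the project):

    // Hypothetical helper: collect (fileOffset -> cumulative row count) entries
    // for one input file from the ROWID sequence file.
    static HashMap<Long, Long> readOffsets(FileSystem fs, Path p, JobConf job,
            byte matrixIndex, String filename) throws IOException {
        HashMap<Long, Long> map = new HashMap<Long, Long>();
        ByteWritable key = new ByteWritable();
        OffsetCount value = new OffsetCount();
        SequenceFile.Reader reader = null;
        try {
            reader = new SequenceFile.Reader(fs, p, job);
            while (reader.next(key, value)) {
                if (key.get() == matrixIndex && filename.equals(value.filename))
                    map.put(value.fileOffset, value.count);
            }
        } finally {
            IOUtilFunctions.closeSilently(reader);
        }
        return map;
    }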