Use of org.apache.hadoop.mapred.InputSplit in project incubator-systemml by apache.
The class RemoteParForColocatedNLineInputFormat, method getSplits.
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    InputSplit[] tmp = super.getSplits(job, numSplits);
    // get partitioning information
    MatrixCharacteristics mc = MRJobConfiguration.getPartitionedMatrixSize(job);
    PDataPartitionFormat dpf = MRJobConfiguration.getPartitioningFormat(job);
    PartitionFormat pf = new PartitionFormat(dpf, -1);
    int blen = (int) (pf.isRowwise() ? pf.getNumRows(mc) : pf.getNumColumns(mc));
    String fname = MRJobConfiguration.getPartitioningFilename(job);
    // create wrapper splits
    InputSplit[] ret = new InputSplit[tmp.length];
    for (int i = 0; i < tmp.length; i++) {
        // check for robustness of subsequent cast
        if (tmp[i] instanceof FileSplit)
            ret[i] = new RemoteParForColocatedFileSplit((FileSplit) tmp[i], fname, blen);
        else
            ret[i] = tmp[i];
    }
    return ret;
}
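The wrapper splits returned above must remain serializable, because Hadoop ships mapred splits to tasks as Writables. Below is a minimal sketch of what such a wrapper can look like; the class name TaggedFileSplit, its single extra block-length field, and its constructor shape are illustrative assumptions, not the actual RemoteParForColocatedFileSplit implementation.

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileSplit;

// Hypothetical wrapper split: keeps the delegate FileSplit's path/offset/locations
// and adds one extra field, so the Writable contract is extended accordingly.
public class TaggedFileSplit extends FileSplit {
    private int blen = -1;

    public TaggedFileSplit() {
        // no-arg construction path required for Writable deserialization
        super((Path) null, 0L, 0L, (String[]) null);
    }

    public TaggedFileSplit(FileSplit fs, int blen) throws IOException {
        super(fs.getPath(), fs.getStart(), fs.getLength(), fs.getLocations());
        this.blen = blen;
    }

    public int getBlockLength() {
        return blen;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        super.write(out);
        out.writeInt(blen);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        super.readFields(in);
        blen = in.readInt();
    }
}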
Use of org.apache.hadoop.mapred.InputSplit in project incubator-systemml by apache.
The class ResultMergeLocalFile, method mergeTextCellWithoutComp.
private static void mergeTextCellWithoutComp(String fnameNew, MatrixObject outMo, ArrayList<MatrixObject> inMO) {
    try {
        // delete target file if already exists
        MapReduceTool.deleteFileIfExistOnHDFS(fnameNew);
        if (ALLOW_COPY_CELLFILES) {
            copyAllFiles(fnameNew, inMO);
            // we're done
            return;
        }
        // actual merge
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        Path path = new Path(fnameNew);
        FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
        BufferedWriter out = new BufferedWriter(new OutputStreamWriter(fs.create(path, true)));
        String valueStr = null;
        try {
            // read/write all inputs
            for (MatrixObject in : inMO) {
                if (LOG.isTraceEnabled())
                    LOG.trace("ResultMerge (local, file): Merge input " + in.hashCode() + " (fname=" + in.getFileName() + ") via stream merge");
                JobConf tmpJob = new JobConf(ConfigurationManager.getCachedJobConf());
                Path tmpPath = new Path(in.getFileName());
                FileInputFormat.addInputPath(tmpJob, tmpPath);
                TextInputFormat informat = new TextInputFormat();
                informat.configure(tmpJob);
                InputSplit[] splits = informat.getSplits(tmpJob, 1);
                LongWritable key = new LongWritable();
                Text value = new Text();
                for (InputSplit split : splits) {
                    RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, tmpJob, Reporter.NULL);
                    try {
                        while (reader.next(key, value)) {
                            valueStr = value.toString().trim();
                            out.write(valueStr + "\n");
                        }
                    } finally {
                        IOUtilFunctions.closeSilently(reader);
                    }
                }
            }
        } finally {
            IOUtilFunctions.closeSilently(out);
        }
    } catch (Exception ex) {
        throw new DMLRuntimeException("Unable to merge text cell results.", ex);
    }
}
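Both finally blocks delegate to IOUtilFunctions.closeSilently so that a failing close cannot mask the exception that triggered the unwinding. A minimal approximation of that idiom is sketched below; it is an assumption about what the utility likely does, not the SystemML source. Note that the old-API RecordReader does not implement Closeable, which is why a second overload is needed.

import java.io.Closeable;
import java.io.IOException;
import org.apache.hadoop.mapred.RecordReader;

// Sketch of a "close silently" utility: close in finally without throwing,
// so the primary exception from the try block is never hidden.
public final class CloseUtil {
    public static void closeSilently(Closeable io) {
        try {
            if (io != null)
                io.close();
        } catch (IOException ex) {
            // intentionally ignored: a failed close must not hide the original error
        }
    }

    public static void closeSilently(RecordReader<?, ?> reader) {
        try {
            if (reader != null)
                reader.close();
        } catch (IOException ex) {
            // intentionally ignored
        }
    }
}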
Use of org.apache.hadoop.mapred.InputSplit in project incubator-systemml by apache.
The class FrameReaderTextCSV, method readCSVFrameFromHDFS.
protected void readCSVFrameFromHDFS(Path path, JobConf job, FileSystem fs, FrameBlock dest, ValueType[] schema, String[] names, long rlen, long clen) throws IOException {
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);
    splits = IOUtilFunctions.sortInputSplits(splits);
    for (int i = 0, rpos = 0; i < splits.length; i++)
        rpos = readCSVFrameFromInputSplit(splits[i], informat, job, dest, schema, names, rlen, clen, rpos, i == 0);
}
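The running row offset rpos only lines up with the destination FrameBlock if the splits are visited in file order, which is what sortInputSplits presumably guarantees. A hedged sketch of such an ordering follows; the comparator is an assumption about the behavior, not the SystemML implementation.

import java.util.Arrays;
import java.util.Comparator;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;

// Orders file splits by path and then by byte offset, so a row position carried
// across splits matches the physical row order of the input files.
public class SplitOrdering {
    public static InputSplit[] sortByPathAndOffset(InputSplit[] splits) {
        InputSplit[] ret = splits.clone();
        Arrays.sort(ret, new Comparator<InputSplit>() {
            @Override
            public int compare(InputSplit a, InputSplit b) {
                FileSplit fa = (FileSplit) a, fb = (FileSplit) b;
                int cmp = fa.getPath().toString().compareTo(fb.getPath().toString());
                return (cmp != 0) ? cmp : Long.compare(fa.getStart(), fb.getStart());
            }
        });
        return ret;
    }
}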
Use of org.apache.hadoop.mapred.InputSplit in project incubator-systemml by apache.
The class FrameReaderTextCSV, method computeCSVSize.
protected Pair<Integer, Integer> computeCSVSize(Path path, JobConf job, FileSystem fs) throws IOException {
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);
    splits = IOUtilFunctions.sortInputSplits(splits);
    // compute number of columns
    int ncol = IOUtilFunctions.countNumColumnsCSV(splits, informat, job, _props.getDelim());
    // compute number of rows
    int nrow = 0;
    for (int i = 0; i < splits.length; i++) {
        RecordReader<LongWritable, Text> reader = informat.getRecordReader(splits[i], job, Reporter.NULL);
        LongWritable key = new LongWritable();
        Text value = new Text();
        try {
            // ignore header of first split
            if (i == 0 && _props.hasHeader())
                reader.next(key, value);
            // count remaining number of rows, ignore meta data
            while (reader.next(key, value)) {
                String val = value.toString();
                nrow += (val.startsWith(TfUtils.TXMTD_MVPREFIX) || val.startsWith(TfUtils.TXMTD_NDPREFIX)) ? 0 : 1;
            }
        } finally {
            IOUtilFunctions.closeSilently(reader);
        }
    }
    return new Pair<>(nrow, ncol);
}
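The column count is delegated to IOUtilFunctions.countNumColumnsCSV. A naive stand-in that inspects only the first record of the first split is sketched below; the class and method names are illustrative, and a real CSV reader must additionally handle quoted fields that contain the delimiter as well as metadata rows.

import java.io.IOException;
import java.util.regex.Pattern;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class NaiveColumnCounter {
    // Reads the first line of the first split and counts delimiter-separated fields.
    public static int countColumns(InputSplit[] splits, TextInputFormat informat, JobConf job, String delim) throws IOException {
        LongWritable key = new LongWritable();
        Text value = new Text();
        RecordReader<LongWritable, Text> reader = informat.getRecordReader(splits[0], job, Reporter.NULL);
        try {
            return reader.next(key, value)
                ? value.toString().split(Pattern.quote(delim), -1).length : 0;
        } finally {
            reader.close();
        }
    }
}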
Use of org.apache.hadoop.mapred.InputSplit in project incubator-systemml by apache.
The class FrameReaderTextCSVParallel, method computeCSVSize.
@Override
protected Pair<Integer, Integer> computeCSVSize(Path path, JobConf job, FileSystem fs) throws IOException {
    int numThreads = OptimizerUtils.getParallelTextReadParallelism();
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, numThreads);
    // compute number of columns
    int ncol = IOUtilFunctions.countNumColumnsCSV(splits, informat, job, _props.getDelim());
    // compute number of rows
    int nrow = 0;
    ExecutorService pool = CommonThreadPool.get(numThreads);
    try {
        ArrayList<CountRowsTask> tasks = new ArrayList<>();
        for (int i = 0; i < splits.length; i++)
            tasks.add(new CountRowsTask(splits[i], informat, job, _props.hasHeader(), i == 0));
        List<Future<Long>> cret = pool.invokeAll(tasks);
        for (Future<Long> count : cret)
            nrow += count.get().intValue();
    } catch (Exception e) {
        throw new IOException("Failed parallel read of text csv input.", e);
    } finally {
        pool.shutdown();
    }
    return new Pair<>(nrow, ncol);
}
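The body of CountRowsTask is not shown above; a plausible shape is a Callable<Long> that reads one split with the shared TextInputFormat and skips the header only for the first split of the file. The sketch below is an assumption about that shape, not the actual SystemML class.

import java.io.IOException;
import java.util.concurrent.Callable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

// Hypothetical row-counting task: one instance per input split, executed by the thread pool.
public class CountRowsTaskSketch implements Callable<Long> {
    private final InputSplit split;
    private final TextInputFormat informat;
    private final JobConf job;
    private final boolean hasHeader;
    private final boolean isFirstSplit;

    public CountRowsTaskSketch(InputSplit split, TextInputFormat informat, JobConf job, boolean hasHeader, boolean isFirstSplit) {
        this.split = split;
        this.informat = informat;
        this.job = job;
        this.hasHeader = hasHeader;
        this.isFirstSplit = isFirstSplit;
    }

    @Override
    public Long call() throws IOException {
        long nrow = 0;
        LongWritable key = new LongWritable();
        Text value = new Text();
        RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
        try {
            // the header line belongs only to the first split of the file
            if (isFirstSplit && hasHeader)
                reader.next(key, value);
            while (reader.next(key, value))
                nrow++;
        } finally {
            reader.close();
        }
        return nrow;
    }
}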