Use of org.apache.hadoop.mapred.InputSplit in project carbondata by apache.
The class MapredCarbonInputFormat, method getSplits.
@Override
public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException {
  org.apache.hadoop.mapreduce.JobContext jobContext = Job.getInstance(jobConf);
  List<org.apache.hadoop.mapreduce.InputSplit> splitList = super.getSplits(jobContext);
  InputSplit[] splits = new InputSplit[splitList.size()];
  CarbonInputSplit split = null;
  for (int i = 0; i < splitList.size(); i++) {
    split = (CarbonInputSplit) splitList.get(i);
    splits[i] = new CarbonHiveInputSplit(split.getSegmentId(), split.getPath(), split.getStart(),
        split.getLength(), split.getLocations(), split.getNumberOfBlocklets(), split.getVersion(),
        split.getBlockStorageIdMap());
  }
  return splits;
}
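The method above adapts CarbonData's new-API (org.apache.hadoop.mapreduce) splits to the old org.apache.hadoop.mapred API expected by callers such as Hive. The sketch below is not CarbonData code; it shows the same bridge pattern in isolation using only stock Hadoop classes (new-API TextInputFormat and FileSplit wrapped into old-API FileSplit), with a hypothetical /tmp/data input path.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class SplitBridgeSketch {
  // Minimal sketch (assumed setup, not CarbonData code): compute splits with a
  // new-API InputFormat and re-wrap them as old-API splits.
  public static org.apache.hadoop.mapred.InputSplit[] bridge(JobConf jobConf) throws Exception {
    Job job = Job.getInstance(jobConf);
    // addInputPath is inherited from the new-API FileInputFormat
    TextInputFormat.addInputPath(job, new Path("/tmp/data"));  // hypothetical input path
    java.util.List<org.apache.hadoop.mapreduce.InputSplit> newSplits =
        new TextInputFormat().getSplits(job);
    org.apache.hadoop.mapred.InputSplit[] oldSplits =
        new org.apache.hadoop.mapred.InputSplit[newSplits.size()];
    for (int i = 0; i < newSplits.size(); i++) {
      FileSplit fs = (FileSplit) newSplits.get(i);
      // copy path, offset, length and host locations into the old-API split type
      oldSplits[i] = new org.apache.hadoop.mapred.FileSplit(
          fs.getPath(), fs.getStart(), fs.getLength(), fs.getLocations());
    }
    return oldSplits;
  }
}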
Use of org.apache.hadoop.mapred.InputSplit in project cdap by caskdata.
The class DatasetInputFormat, method getSplits.
@Override
public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException {
  try (DatasetAccessor datasetAccessor = new DatasetAccessor(jobConf)) {
    try {
      datasetAccessor.initialize();
    } catch (Exception e) {
      throw new IOException("Could not get dataset", e);
    }
    try (RecordScannable recordScannable = datasetAccessor.getDataset()) {
      Job job = new Job(jobConf);
      JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(job);
      Path[] tablePaths = FileInputFormat.getInputPaths(jobContext);
      List<Split> dsSplits = recordScannable.getSplits();
      InputSplit[] inputSplits = new InputSplit[dsSplits.size()];
      for (int i = 0; i < dsSplits.size(); i++) {
        inputSplits[i] = new DatasetInputSplit(dsSplits.get(i), tablePaths[0]);
      }
      return inputSplits;
    }
  }
}
Use of org.apache.hadoop.mapred.InputSplit in project drill by apache.
The class HiveSubScan, method deserializeInputSplit.
public static InputSplit deserializeInputSplit(String base64, String className)
    throws IOException, ReflectiveOperationException {
  Constructor<?> constructor = Class.forName(className).getDeclaredConstructor();
  if (constructor == null) {
    throw new ReflectiveOperationException("Class " + className + " does not implement a default constructor.");
  }
  constructor.setAccessible(true);
  InputSplit split = (InputSplit) constructor.newInstance();
  ByteArrayDataInput byteArrayDataInput = ByteStreams.newDataInput(Base64.decodeBase64(base64));
  split.readFields(byteArrayDataInput);
  return split;
}
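For reference, the inverse operation can be written with the same libraries the snippet already relies on (Guava's ByteStreams and commons-codec's Base64). The sketch below is not taken from Drill; it is a minimal counterpart that produces the Base64 string deserializeInputSplit() consumes.

// Minimal counterpart sketch (not Drill code): serialize an InputSplit to a Base64 string.
// ByteArrayDataOutput/ByteStreams are from Guava, Base64 from commons-codec,
// matching the classes used in deserializeInputSplit() above.
public static String serializeInputSplit(InputSplit split) throws IOException {
  ByteArrayDataOutput out = ByteStreams.newDataOutput();
  // mapred.InputSplit extends Writable, so it knows how to write its own fields
  split.write(out);
  return Base64.encodeBase64String(out.toByteArray());
}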
Use of org.apache.hadoop.mapred.InputSplit in project incubator-systemml by apache.
The class ResultMergeLocalFile, method createTextCellStagingFile.
private static void createTextCellStagingFile(String fnameStaging, MatrixObject mo, long ID)
    throws IOException, DMLRuntimeException {
  JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
  Path path = new Path(mo.getFileName());
  FileInputFormat.addInputPath(job, path);
  TextInputFormat informat = new TextInputFormat();
  informat.configure(job);
  InputSplit[] splits = informat.getSplits(job, 1);
  LinkedList<Cell> buffer = new LinkedList<>();
  LongWritable key = new LongWritable();
  Text value = new Text();
  MatrixCharacteristics mc = mo.getMatrixCharacteristics();
  int brlen = mc.getRowsPerBlock();
  int bclen = mc.getColsPerBlock();
  // long row = -1, col = -1; //FIXME needs reconsideration whenever textcell is used actively
  // NOTE MB: Originally, we used long row, col but this led reproducibly to JIT compilation
  // errors during runtime; experienced under WINDOWS, Intel x86-64, IBM JDK 64bit/32bit.
  // It works fine with int row, col but we require long for larger matrices.
  // Since textcell is never used for result merge (hybrid/hadoop: binaryblock, singlenode: binarycell),
  // we just propose to exclude it with -Xjit:exclude={package.method*}(count=0,optLevel=0)
  FastStringTokenizer st = new FastStringTokenizer(' ');
  for (InputSplit split : splits) {
    RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
    try {
      while (reader.next(key, value)) {
        // reset tokenizer
        st.reset(value.toString());
        long row = st.nextLong();
        long col = st.nextLong();
        double lvalue = Double.parseDouble(st.nextToken());
        Cell tmp = new Cell(row, col, lvalue);
        buffer.addLast(tmp);
        // periodic flush
        if (buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE) {
          appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
          buffer.clear();
        }
      }
      // final flush
      if (!buffer.isEmpty()) {
        appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
        buffer.clear();
      }
    } finally {
      IOUtilFunctions.closeSilently(reader);
    }
  }
}
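For context, each input line read here is a space-separated i-j-v triple: a 1-based row index, a 1-based column index, and the cell value (the bounds check in the next snippet, row < 1 || col < 1, reflects the 1-based indexing). A minimal illustration of the tokenizer calls used above, with a made-up sample line (FastStringTokenizer is SystemML's own utility class):

// Parse one hypothetical text-cell line, "3 7 4.25", the way the loop above does.
FastStringTokenizer st = new FastStringTokenizer(' ');
st.reset("3 7 4.25");                              // row 3, column 7, value 4.25
long row = st.nextLong();                          // 3
long col = st.nextLong();                          // 7
double val = Double.parseDouble(st.nextToken());   // 4.25 (st.nextDouble() is used in the next snippet)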
Use of org.apache.hadoop.mapred.InputSplit in project incubator-systemml by apache.
The class DataPartitionerLocal, method partitionTextCell.
private void partitionTextCell(String fname, String fnameStaging, String fnameNew,
    long rlen, long clen, int brlen, int bclen) {
  long row = -1;
  long col = -1;
  try {
    // STEP 1: read matrix from HDFS and write blocks to local staging area
    // check and add input path
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    Path path = new Path(fname);
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);
    LinkedList<Cell> buffer = new LinkedList<>();
    LongWritable key = new LongWritable();
    Text value = new Text();
    FastStringTokenizer st = new FastStringTokenizer(' ');
    for (InputSplit split : splits) {
      RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
      try {
        while (reader.next(key, value)) {
          // reset tokenizer
          st.reset(value.toString());
          row = st.nextLong();
          col = st.nextLong();
          double lvalue = st.nextDouble();
          Cell tmp = new Cell(row, col, lvalue);
          buffer.addLast(tmp);
          // periodic flush
          if (buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE) {
            appendCellBufferToStagingArea(fnameStaging, buffer, brlen, bclen);
            buffer.clear();
          }
        }
        // final flush
        if (!buffer.isEmpty()) {
          appendCellBufferToStagingArea(fnameStaging, buffer, brlen, bclen);
          buffer.clear();
        }
      } finally {
        IOUtilFunctions.closeSilently(reader);
      }
    }
    // STEP 2: read matrix blocks from staging area and write matrix to HDFS
    String[] fnamesPartitions = new File(fnameStaging).list();
    if (PARALLEL) {
      int len = Math.min(fnamesPartitions.length, _par);
      Thread[] threads = new Thread[len];
      for (int i = 0; i < len; i++) {
        int start = i * (int) Math.ceil(((double) fnamesPartitions.length) / len);
        int end = (i + 1) * (int) Math.ceil(((double) fnamesPartitions.length) / len) - 1;
        end = Math.min(end, fnamesPartitions.length - 1);
        threads[i] = new Thread(new DataPartitionerWorkerTextCell(job, fnameNew, fnameStaging,
            fnamesPartitions, start, end));
        threads[i].start();
      }
      for (Thread t : threads)
        t.join();
    } else {
      for (String pdir : fnamesPartitions)
        writeTextCellFileToHDFS(job, fnameNew, fnameStaging + "/" + pdir);
    }
  } catch (Exception e) {
    // post-mortem error handling and bounds checking
    if (row < 1 || row > rlen || col < 1 || col > clen) {
      throw new DMLRuntimeException("Matrix cell [" + row + "," + col + "] "
          + "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
    } else {
      throw new DMLRuntimeException("Unable to partition text cell matrix.", e);
    }
  }
}
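The start/end arithmetic in the PARALLEL branch assigns each worker thread a contiguous, ceil-sized range of staging directories. A small worked example with assumed values (10 staging directories, up to 4 threads), showing only that arithmetic:

// Assumed values for illustration: 10 staging directories, at most 4 worker threads.
int numDirs = 10;
int len = Math.min(numDirs, 4);                        // 4 threads
int chunk = (int) Math.ceil((double) numDirs / len);   // ceil(10/4) = 3
for (int i = 0; i < len; i++) {
  int start = i * chunk;
  int end = Math.min((i + 1) * chunk - 1, numDirs - 1);
  System.out.println("thread " + i + " handles directories [" + start + ", " + end + "]");
}
// prints: thread 0 -> [0, 2], thread 1 -> [3, 5], thread 2 -> [6, 8], thread 3 -> [9, 9]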