Use of org.apache.hadoop.mapred.InputSplit in project hive by apache.
Class SymlinkTextInputFormat, method getRecordReader:
@Override
public RecordReader<LongWritable, Text> getRecordReader(InputSplit split, JobConf job,
    Reporter reporter) throws IOException {
  InputSplit targetSplit = ((SymlinkTextInputSplit) split).getTargetSplit();
  // The target data is in TextInputFormat.
  TextInputFormat inputFormat = new TextInputFormat();
  inputFormat.configure(job);
  RecordReader innerReader = null;
  try {
    innerReader = inputFormat.getRecordReader(targetSplit, job, reporter);
  } catch (Exception e) {
    innerReader = HiveIOExceptionHandlerUtil.handleRecordReaderCreationException(e, job);
  }
  HiveRecordReader rr = new HiveRecordReader(innerReader, job);
  rr.initIOContext((FileSplit) targetSplit, job, TextInputFormat.class, innerReader);
  return rr;
}
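A hedged usage sketch, not taken from the Hive source: it assumes a SymlinkTextInputSplit and a configured JobConf are available from the surrounding job, passes Reporter.NULL for simplicity, and the helper name countRows is illustrative. Only the standard org.apache.hadoop.mapred reader API shown above is used.

// Illustrative only: drains a reader produced by SymlinkTextInputFormat.
static long countRows(SymlinkTextInputFormat format, InputSplit split, JobConf job)
    throws IOException {
  RecordReader<LongWritable, Text> reader =
      format.getRecordReader(split, job, Reporter.NULL);
  LongWritable key = reader.createKey();
  Text value = reader.createValue();
  long rows = 0;
  try {
    while (reader.next(key, value)) {
      rows++; // each value is one line of the target file the symlink points to
    }
  } finally {
    reader.close();
  }
  return rows;
}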
Use of org.apache.hadoop.mapred.InputSplit in project hive by apache.
Class BucketizedHiveInputFormat, method getSplits:
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  init(job);
  Path[] dirs = getInputPaths(job);
  JobConf newjob = new JobConf(job);
  ArrayList<InputSplit> result = new ArrayList<InputSplit>();
  int numOrigSplits = 0;
  // For each dir, run getSplits on every file under it individually,
  // and then create a BucketizedHiveInputSplit on it.
  for (Path dir : dirs) {
    PartitionDesc part = getPartitionDescFromPath(pathToPartitionInfo, dir);
    // create a new InputFormat instance if this is the first time to see this
    // class
    Class inputFormatClass = part.getInputFileFormatClass();
    InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
    newjob.setInputFormat(inputFormat.getClass());
    FileStatus[] listStatus = listStatus(newjob, dir);
    for (FileStatus status : listStatus) {
      LOG.info("block size: " + status.getBlockSize());
      LOG.info("file length: " + status.getLen());
      FileInputFormat.setInputPaths(newjob, status.getPath());
      InputSplit[] iss = inputFormat.getSplits(newjob, 0);
      if (iss != null && iss.length > 0) {
        numOrigSplits += iss.length;
        result.add(new BucketizedHiveInputSplit(iss, inputFormatClass.getName()));
      }
    }
  }
  LOG.info(result.size() + " bucketized splits generated from " + numOrigSplits
      + " original splits.");
  return result.toArray(new BucketizedHiveInputSplit[result.size()]);
}
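The key point above is that getSplits is invoked once per file, so every element added to result groups all splits of a single file. Below is a minimal sketch of the same per-file grouping, assuming a plain TextInputFormat and a hypothetical helper name splitsPerFile; it is not the Hive implementation.

// Illustrative only: collects the splits of each file under dir separately,
// mirroring the per-file getSplits loop above with a plain TextInputFormat.
static List<InputSplit[]> splitsPerFile(JobConf job, Path dir) throws IOException {
  FileSystem fs = dir.getFileSystem(job);
  TextInputFormat format = new TextInputFormat();
  format.configure(job);
  List<InputSplit[]> perFile = new ArrayList<>();
  for (FileStatus status : fs.listStatus(dir)) {
    if (status.isDirectory()) {
      continue; // the real code relies on listStatus() filtering; kept simple here
    }
    JobConf perFileJob = new JobConf(job);
    FileInputFormat.setInputPaths(perFileJob, status.getPath());
    InputSplit[] splits = format.getSplits(perFileJob, 0);
    if (splits != null && splits.length > 0) {
      perFile.add(splits);
    }
  }
  return perFile;
}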
Use of org.apache.hadoop.mapred.InputSplit in project hive by apache.
Class TestOrcSplitElimination, method verifySplits:
private void verifySplits(HashSet<FsWithHash> originalHs, InputSplit[] splits) {
  if (originalHs.size() != splits.length) {
    String s = "Expected [";
    for (FsWithHash fwh : originalHs) {
      s += toString(fwh.fs) + ", ";
    }
    s += "], actual [";
    for (InputSplit fs : splits) {
      s += toString((FileSplit) fs) + ", ";
    }
    fail(s + "]");
  }
  for (int i = 0; i < splits.length; ++i) {
    FileSplit fs = (FileSplit) splits[i];
    if (!originalHs.contains(new FsWithHash(fs))) {
      String s = " in [";
      for (FsWithHash fwh : originalHs) {
        s += toString(fwh.fs) + ", ";
      }
      fail("Cannot find " + toString(fs) + s + "]");
    }
  }
}
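verifySplits relies on FsWithHash giving FileSplit value semantics inside the HashSet. A sketch of such a wrapper follows, under the assumption that equality is defined by path, start and length; the class name FileSplitKey is hypothetical and the real FsWithHash helper in the test may differ.

// Illustrative sketch of a FileSplit wrapper with value equality, in the spirit
// of the FsWithHash helper used above.
static final class FileSplitKey {
  final FileSplit fs;
  FileSplitKey(FileSplit fs) { this.fs = fs; }

  @Override
  public boolean equals(Object o) {
    if (!(o instanceof FileSplitKey)) return false;
    FileSplit other = ((FileSplitKey) o).fs;
    return fs.getPath().equals(other.getPath())
        && fs.getStart() == other.getStart()
        && fs.getLength() == other.getLength();
  }

  @Override
  public int hashCode() {
    return Objects.hash(fs.getPath(), fs.getStart(), fs.getLength());
  }
}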
Use of org.apache.hadoop.mapred.InputSplit in project hive by apache.
Class TestOrcSplitElimination, method testFooterExternalCacheImpl:
private void testFooterExternalCacheImpl(boolean isPpd) throws IOException {
  ObjectInspector inspector = createIO();
  writeFile(inspector, testFilePath);
  writeFile(inspector, testFilePath2);
  GenericUDF udf = new GenericUDFOPEqualOrLessThan();
  List<ExprNodeDesc> childExpr = Lists.newArrayList();
  createTestSarg(inspector, udf, childExpr);
  setupExternalCacheConfig(isPpd, testFilePath + "," + testFilePath2);
  // Get the base values w/o cache.
  conf.setBoolean(ConfVars.HIVE_ORC_MS_FOOTER_CACHE_ENABLED.varname, false);
  OrcInputFormatForTest.clearLocalCache();
  OrcInputFormat in0 = new OrcInputFormat();
  InputSplit[] originals = in0.getSplits(conf, -1);
  assertEquals(10, originals.length);
  HashSet<FsWithHash> originalHs = new HashSet<>();
  for (InputSplit original : originals) {
    originalHs.add(new FsWithHash((FileSplit) original));
  }
  // Populate the cache.
  conf.setBoolean(ConfVars.HIVE_ORC_MS_FOOTER_CACHE_ENABLED.varname, true);
  OrcInputFormatForTest in = new OrcInputFormatForTest();
  OrcInputFormatForTest.clearLocalCache();
  OrcInputFormatForTest.caches.resetCounts();
  OrcInputFormatForTest.caches.cache.clear();
  InputSplit[] splits = in.getSplits(conf, -1);
  // Puts, gets, hits, unused, unused.
  @SuppressWarnings("static-access")
  AtomicInteger[] counts = {
      in.caches.putCount,
      isPpd ? in.caches.getByExprCount : in.caches.getCount,
      isPpd ? in.caches.getHitByExprCount : in.caches.getHitCount,
      isPpd ? in.caches.getCount : in.caches.getByExprCount,
      isPpd ? in.caches.getHitCount : in.caches.getHitByExprCount };
  verifySplits(originalHs, splits);
  verifyCallCounts(counts, 2, 2, 0);
  assertEquals(2, OrcInputFormatForTest.caches.cache.size());
  // Verify we can get from cache.
  OrcInputFormatForTest.clearLocalCache();
  OrcInputFormatForTest.caches.resetCounts();
  splits = in.getSplits(conf, -1);
  verifySplits(originalHs, splits);
  verifyCallCounts(counts, 0, 2, 2);
  // Verify ORC SARG still works.
  OrcInputFormatForTest.clearLocalCache();
  OrcInputFormatForTest.caches.resetCounts();
  childExpr.set(1, new ExprNodeConstantDesc(5));
  conf.set("hive.io.filter.expr.serialized",
      SerializationUtilities.serializeExpression(
          new ExprNodeGenericFuncDesc(inspector, udf, childExpr)));
  splits = in.getSplits(conf, -1);
  InputSplit[] filtered = { originals[0], originals[4], originals[5], originals[9] };
  originalHs = new HashSet<>();
  for (InputSplit original : filtered) {
    originalHs.add(new FsWithHash((FileSplit) original));
  }
  verifySplits(originalHs, splits);
  verifyCallCounts(counts, 0, 2, 2);
  // Verify corrupted cache value gets replaced.
  OrcInputFormatForTest.clearLocalCache();
  OrcInputFormatForTest.caches.resetCounts();
  Map.Entry<Long, MockExternalCaches.MockItem> e =
      OrcInputFormatForTest.caches.cache.entrySet().iterator().next();
  Long key = e.getKey();
  byte[] someData = new byte[8];
  ByteBuffer toCorrupt = e.getValue().data;
  System.arraycopy(toCorrupt.array(), toCorrupt.arrayOffset(), someData, 0, someData.length);
  toCorrupt.putLong(0, 0L);
  splits = in.getSplits(conf, -1);
  verifySplits(originalHs, splits);
  if (!isPpd) {
    // Recovery is not implemented yet for PPD path.
    ByteBuffer restored = OrcInputFormatForTest.caches.cache.get(key).data;
    byte[] newData = new byte[someData.length];
    System.arraycopy(restored.array(), restored.arrayOffset(), newData, 0, newData.length);
    assertArrayEquals(someData, newData);
  }
}
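The test never shows verifyCallCounts, but the "Puts, gets, hits, unused, unused" ordering of the counts array suggests a shape like the sketch below; this is an assumption, not the actual helper from the test.

// Assumed shape of the verifyCallCounts helper referenced above: the first three
// slots of the counts array are the active put/get/hit counters and the last two
// are expected to remain unused. The real implementation may differ.
private void verifyCallCounts(AtomicInteger[] counts, int puts, int gets, int hits) {
  assertEquals("puts", puts, counts[0].get());
  assertEquals("gets", gets, counts[1].get());
  assertEquals("hits", hits, counts[2].get());
  assertEquals("unused gets", 0, counts[3].get());
  assertEquals("unused hits", 0, counts[4].get());
}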
Use of org.apache.hadoop.mapred.InputSplit in project mongo-hadoop by mongodb.
Class MongoInputFormat, method getSplits:
public InputSplit[] getSplits(final JobConf job, final int numSplits) throws IOException {
    try {
        MongoSplitter splitterImpl = MongoSplitterFactory.getSplitter(job);
        LOG.info("Using " + splitterImpl + " to calculate splits. (old mapreduce API)");
        final List<org.apache.hadoop.mapreduce.InputSplit> splits = splitterImpl.calculateSplits();
        return splits.toArray(new InputSplit[splits.size()]);
    } catch (SplitFailedException spfe) {
        throw new IOException(spfe);
    }
}
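The toArray call above copies mapreduce splits into a mapred InputSplit[], which only works when the split objects returned by the splitter also implement the old mapred interface, as mongo-hadoop's MongoInputSplit appears to do. A minimal sketch of that dual-API pattern follows; the class name DualApiSplit is illustrative, not the mongo-hadoop class.

// Illustrative only: a split that satisfies both MapReduce APIs, which is what
// allows the array copy above to succeed without an ArrayStoreException.
public class DualApiSplit extends org.apache.hadoop.mapreduce.InputSplit
        implements org.apache.hadoop.mapred.InputSplit {

    private long length;
    private String[] locations = new String[0];

    @Override
    public long getLength() { return length; }           // satisfies both APIs

    @Override
    public String[] getLocations() { return locations; } // satisfies both APIs

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(length); // locations omitted from serialization for brevity
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        length = in.readLong();
    }
}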