
Example 11 with InputSplit

Use of org.apache.hadoop.mapred.InputSplit in project hive by apache.

From the class SymlinkTextInputFormat, method getRecordReader:

@Override
public RecordReader<LongWritable, Text> getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException {
    InputSplit targetSplit = ((SymlinkTextInputSplit) split).getTargetSplit();
    // The target data is in TextInputFormat.
    TextInputFormat inputFormat = new TextInputFormat();
    inputFormat.configure(job);
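    // Open the reader on the underlying target split; if creation fails, the error
    // is routed through Hive's I/O exception handler, which can either rethrow or
    // supply a substitute reader.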
    RecordReader innerReader = null;
    try {
        innerReader = inputFormat.getRecordReader(targetSplit, job, reporter);
    } catch (Exception e) {
        innerReader = HiveIOExceptionHandlerUtil.handleRecordReaderCreationException(e, job);
    }
    HiveRecordReader rr = new HiveRecordReader(innerReader, job);
    rr.initIOContext((FileSplit) targetSplit, job, TextInputFormat.class, innerReader);
    return rr;
}
Also used: TextInputFormat (org.apache.hadoop.mapred.TextInputFormat), RecordReader (org.apache.hadoop.mapred.RecordReader), InputSplit (org.apache.hadoop.mapred.InputSplit), IOException (java.io.IOException)
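
For orientation, here is a minimal sketch of how a caller could drive the returned reader with the classic mapred iteration idiom; process is a hypothetical per-record handler, not part of the snippet above:

// Hypothetical driver; the caller supplies the split, job, and reporter.
public static void readAll(InputSplit split, JobConf job, Reporter reporter) throws IOException {
    RecordReader<LongWritable, Text> reader =
            new SymlinkTextInputFormat().getRecordReader(split, job, reporter);
    LongWritable key = reader.createKey();
    Text value = reader.createValue();
    while (reader.next(key, value)) {
        // Each value holds one line of the file the symlink points at.
        process(value); // hypothetical per-record handler
    }
    reader.close();
}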

Example 12 with InputSplit

Use of org.apache.hadoop.mapred.InputSplit in project hive by apache.

From the class BucketizedHiveInputFormat, method getSplits:

@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    init(job);
    Path[] dirs = getInputPaths(job);
    JobConf newjob = new JobConf(job);
    ArrayList<InputSplit> result = new ArrayList<InputSplit>();
    int numOrigSplits = 0;
    // For each input directory, list its files, compute each file's splits,
    // and then create a BucketizedHiveInputSplit on it
    for (Path dir : dirs) {
        PartitionDesc part = getPartitionDescFromPath(pathToPartitionInfo, dir);
        // create a new InputFormat instance if this is the first time this class is seen
        Class inputFormatClass = part.getInputFileFormatClass();
        InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
        newjob.setInputFormat(inputFormat.getClass());
        FileStatus[] listStatus = listStatus(newjob, dir);
        for (FileStatus status : listStatus) {
            LOG.info("block size: " + status.getBlockSize());
            LOG.info("file length: " + status.getLen());
            FileInputFormat.setInputPaths(newjob, status.getPath());
            InputSplit[] iss = inputFormat.getSplits(newjob, 0);
            if (iss != null && iss.length > 0) {
                numOrigSplits += iss.length;
                result.add(new BucketizedHiveInputSplit(iss, inputFormatClass.getName()));
            }
        }
    }
    LOG.info(result.size() + " bucketized splits generated from " + numOrigSplits + " original splits.");
    return result.toArray(new BucketizedHiveInputSplit[result.size()]);
}
Also used: Path (org.apache.hadoop.fs.Path), FileStatus (org.apache.hadoop.fs.FileStatus), ArrayList (java.util.ArrayList), FileInputFormat (org.apache.hadoop.mapred.FileInputFormat), InputFormat (org.apache.hadoop.mapred.InputFormat), PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc), JobConf (org.apache.hadoop.mapred.JobConf), InputSplit (org.apache.hadoop.mapred.InputSplit)
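
Since all of a file's splits are packed into a single BucketizedHiveInputSplit, each resulting split corresponds to exactly one input file. A minimal sketch of invoking the method follows; the path is made up, and in practice Hive calls this during job submission after the query plan has populated pathToPartitionInfo:

// Hypothetical driver; init(job) expects Hive's plan information to be present.
public static void printSplits(JobConf job) throws IOException {
    FileInputFormat.setInputPaths(job, new Path("/warehouse/bucketed_table")); // made-up path
    BucketizedHiveInputFormat inputFormat = new BucketizedHiveInputFormat();
    InputSplit[] splits = inputFormat.getSplits(job, 0); // numSplits hint is ignored above
    for (InputSplit split : splits) {
        // One split per input file, covering all of that file's original splits.
        System.out.println(split.getLength() + " bytes on " + split.getLocations().length + " hosts");
    }
}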

Example 13 with InputSplit

Use of org.apache.hadoop.mapred.InputSplit in project hive by apache.

From the class TestOrcSplitElimination, method verifySplits:

private void verifySplits(HashSet<FsWithHash> originalHs, InputSplit[] splits) {
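    // Fail with a readable listing of expected vs. actual splits when the two sets differ.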
    if (originalHs.size() != splits.length) {
        String s = "Expected [";
        for (FsWithHash fwh : originalHs) {
            s += toString(fwh.fs) + ", ";
        }
        s += "], actual [";
        for (InputSplit fs : splits) {
            s += toString((FileSplit) fs) + ", ";
        }
        fail(s + "]");
    }
    for (int i = 0; i < splits.length; ++i) {
        FileSplit fs = (FileSplit) splits[i];
        if (!originalHs.contains(new FsWithHash(fs))) {
            String s = " in [";
            for (FsWithHash fwh : originalHs) {
                s += toString(fwh.fs) + ", ";
            }
            fail("Cannot find " + toString(fs) + s);
        }
    }
}
Also used: FileSplit (org.apache.hadoop.mapred.FileSplit), InputSplit (org.apache.hadoop.mapred.InputSplit)
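
The helper relies on FsWithHash giving FileSplit value semantics so that HashSet.contains compares by content rather than by identity. The actual Hive class is not shown in this snippet; a plausible shape, offered purely as an assumption, is:

// Hypothetical stand-in for FsWithHash: compares splits by path, offset, and length.
private static final class FsWithHash {
    final FileSplit fs;

    FsWithHash(FileSplit fs) {
        this.fs = fs;
    }

    @Override
    public int hashCode() {
        return java.util.Objects.hash(fs.getPath(), fs.getStart(), fs.getLength());
    }

    @Override
    public boolean equals(Object o) {
        if (!(o instanceof FsWithHash)) {
            return false;
        }
        FileSplit other = ((FsWithHash) o).fs;
        return fs.getPath().equals(other.getPath())
                && fs.getStart() == other.getStart()
                && fs.getLength() == other.getLength();
    }
}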

Example 14 with InputSplit

Use of org.apache.hadoop.mapred.InputSplit in project hive by apache.

From the class TestOrcSplitElimination, method testFooterExternalCacheImpl:

private void testFooterExternalCacheImpl(boolean isPpd) throws IOException {
    ObjectInspector inspector = createIO();
    writeFile(inspector, testFilePath);
    writeFile(inspector, testFilePath2);
    GenericUDF udf = new GenericUDFOPEqualOrLessThan();
    List<ExprNodeDesc> childExpr = Lists.newArrayList();
    createTestSarg(inspector, udf, childExpr);
    setupExternalCacheConfig(isPpd, testFilePath + "," + testFilePath2);
    // Get the base values w/o cache.
    conf.setBoolean(ConfVars.HIVE_ORC_MS_FOOTER_CACHE_ENABLED.varname, false);
    OrcInputFormatForTest.clearLocalCache();
    OrcInputFormat in0 = new OrcInputFormat();
    InputSplit[] originals = in0.getSplits(conf, -1);
    assertEquals(10, originals.length);
    HashSet<FsWithHash> originalHs = new HashSet<>();
    for (InputSplit original : originals) {
        originalHs.add(new FsWithHash((FileSplit) original));
    }
    // Populate the cache.
    conf.setBoolean(ConfVars.HIVE_ORC_MS_FOOTER_CACHE_ENABLED.varname, true);
    OrcInputFormatForTest in = new OrcInputFormatForTest();
    OrcInputFormatForTest.clearLocalCache();
    OrcInputFormatForTest.caches.resetCounts();
    OrcInputFormatForTest.caches.cache.clear();
    InputSplit[] splits = in.getSplits(conf, -1);
    // Puts, gets, hits, unused, unused.
    @SuppressWarnings("static-access") AtomicInteger[] counts = { in.caches.putCount, isPpd ? in.caches.getByExprCount : in.caches.getCount, isPpd ? in.caches.getHitByExprCount : in.caches.getHitCount, isPpd ? in.caches.getCount : in.caches.getByExprCount, isPpd ? in.caches.getHitCount : in.caches.getHitByExprCount };
    verifySplits(originalHs, splits);
    verifyCallCounts(counts, 2, 2, 0);
    assertEquals(2, OrcInputFormatForTest.caches.cache.size());
    // Verify we can get from cache.
    OrcInputFormatForTest.clearLocalCache();
    OrcInputFormatForTest.caches.resetCounts();
    splits = in.getSplits(conf, -1);
    verifySplits(originalHs, splits);
    verifyCallCounts(counts, 0, 2, 2);
    // Verify ORC SARG still works.
    OrcInputFormatForTest.clearLocalCache();
    OrcInputFormatForTest.caches.resetCounts();
    childExpr.set(1, new ExprNodeConstantDesc(5));
    conf.set("hive.io.filter.expr.serialized", SerializationUtilities.serializeExpression(new ExprNodeGenericFuncDesc(inspector, udf, childExpr)));
    splits = in.getSplits(conf, -1);
    InputSplit[] filtered = { originals[0], originals[4], originals[5], originals[9] };
    originalHs = new HashSet<>();
    for (InputSplit original : filtered) {
        originalHs.add(new FsWithHash((FileSplit) original));
    }
    verifySplits(originalHs, splits);
    verifyCallCounts(counts, 0, 2, 2);
    // Verify corrupted cache value gets replaced.
    OrcInputFormatForTest.clearLocalCache();
    OrcInputFormatForTest.caches.resetCounts();
    Map.Entry<Long, MockExternalCaches.MockItem> e = OrcInputFormatForTest.caches.cache.entrySet().iterator().next();
    Long key = e.getKey();
    byte[] someData = new byte[8];
    ByteBuffer toCorrupt = e.getValue().data;
    System.arraycopy(toCorrupt.array(), toCorrupt.arrayOffset(), someData, 0, someData.length);
    toCorrupt.putLong(0, 0L);
    splits = in.getSplits(conf, -1);
    verifySplits(originalHs, splits);
    if (!isPpd) {
        // Recovery is not implemented yet for PPD path.
        ByteBuffer restored = OrcInputFormatForTest.caches.cache.get(key).data;
        byte[] newData = new byte[someData.length];
        System.arraycopy(restored.array(), restored.arrayOffset(), newData, 0, newData.length);
        assertArrayEquals(someData, newData);
    }
}
Also used: GenericUDFOPEqualOrLessThan (org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrLessThan), FileSplit (org.apache.hadoop.mapred.FileSplit), ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc), InputSplit (org.apache.hadoop.mapred.InputSplit), HashSet (java.util.HashSet), ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector), ExprNodeConstantDesc (org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc), ExprNodeGenericFuncDesc (org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc), ByteBuffer (java.nio.ByteBuffer), GenericUDF (org.apache.hadoop.hive.ql.udf.generic.GenericUDF), AtomicInteger (java.util.concurrent.atomic.AtomicInteger), Map (java.util.Map), ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap), HashMap (java.util.HashMap)
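
verifyCallCounts is not shown in this snippet. Going only by the counts-array comment above ({puts, gets, hits, unused, unused}), a sketch of what it might check, offered strictly as an assumption:

// Hypothetical checker matching the assumed {puts, gets, hits, unused, unused} layout.
private void verifyCallCounts(AtomicInteger[] counts, int puts, int gets, int hits) {
    assertEquals("puts", puts, counts[0].get());
    assertEquals("gets", gets, counts[1].get());
    assertEquals("hits", hits, counts[2].get());
    // The counters for the code path not under test should never be touched.
    assertEquals("unused gets", 0, counts[3].get());
    assertEquals("unused hits", 0, counts[4].get());
}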

Example 15 with InputSplit

Use of org.apache.hadoop.mapred.InputSplit in project mongo-hadoop by mongodb.

From the class MongoInputFormat, method getSplits:

public InputSplit[] getSplits(final JobConf job, final int numSplits) throws IOException {
    try {
        MongoSplitter splitterImpl = MongoSplitterFactory.getSplitter(job);
        LOG.info("Using " + splitterImpl + " to calculate splits. (old mapreduce API)");
        final List<org.apache.hadoop.mapreduce.InputSplit> splits = splitterImpl.calculateSplits();
        return splits.toArray(new InputSplit[splits.size()]);
    } catch (SplitFailedException spfe) {
        throw new IOException(spfe);
    }
}
Also used: MongoSplitter (com.mongodb.hadoop.splitter.MongoSplitter), IOException (java.io.IOException), MongoInputSplit (com.mongodb.hadoop.input.MongoInputSplit), InputSplit (org.apache.hadoop.mapred.InputSplit), SplitFailedException (com.mongodb.hadoop.splitter.SplitFailedException)
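
Note that a List of mapreduce-API splits is copied into a mapred InputSplit[]; this can only succeed because the concrete elements (MongoInputSplit, per the imports above) satisfy both APIs. A minimal driver sketch, with a made-up connection URI:

// Hypothetical driver for the old-API entry point; the collection URI is made up.
public static int countSplits() throws IOException {
    JobConf job = new JobConf();
    job.set("mongo.input.uri", "mongodb://localhost:27017/test.example");
    InputSplit[] splits = new MongoInputFormat().getSplits(job, 0); // numSplits hint is unused
    return splits.length;
}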

Aggregations

InputSplit (org.apache.hadoop.mapred.InputSplit): 161
Path (org.apache.hadoop.fs.Path): 57
JobConf (org.apache.hadoop.mapred.JobConf): 56
Test (org.junit.Test): 49
IOException (java.io.IOException): 47
ArrayList (java.util.ArrayList): 29
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector): 27
FileSplit (org.apache.hadoop.mapred.FileSplit): 24
FileSystem (org.apache.hadoop.fs.FileSystem): 21
TextInputFormat (org.apache.hadoop.mapred.TextInputFormat): 21
InputFormat (org.apache.hadoop.mapred.InputFormat): 19
RecordWriter (org.apache.hadoop.mapred.RecordWriter): 19
NullWritable (org.apache.hadoop.io.NullWritable): 18
Text (org.apache.hadoop.io.Text): 18
Configuration (org.apache.hadoop.conf.Configuration): 14
LongWritable (org.apache.hadoop.io.LongWritable): 11
FileInputFormat (org.apache.hadoop.mapred.FileInputFormat): 10
Properties (java.util.Properties): 9
TaskLocationHint (org.apache.tez.dag.api.TaskLocationHint): 9
HashMap (java.util.HashMap): 8