Example 6 with FileSplit

Use of org.apache.hadoop.mapred.FileSplit in project hive by apache.

From the class AbstractTestParquetDirect, method read:

public static List<ArrayWritable> read(Path parquetFile) throws IOException {
    List<ArrayWritable> records = new ArrayList<ArrayWritable>();
    // Open a reader over a single FileSplit covering the whole file; no host
    // locations are needed for this local test read.
    FileSplit split = new FileSplit(parquetFile, 0, fileLength(parquetFile), (String[]) null);
    RecordReader<NullWritable, ArrayWritable> reader =
            new MapredParquetInputFormat().getRecordReader(split, new JobConf(), null);
    NullWritable alwaysNull = reader.createKey();
    ArrayWritable record = reader.createValue();
    while (reader.next(alwaysNull, record)) {
        records.add(record);
        // allocate a fresh value so the record just added isn't overwritten by the next read
        record = reader.createValue();
    }
    return records;
}
Also used : ArrayWritable(org.apache.hadoop.io.ArrayWritable) ArrayList(java.util.ArrayList) FileSplit(org.apache.hadoop.mapred.FileSplit) JobConf(org.apache.hadoop.mapred.JobConf) NullWritable(org.apache.hadoop.io.NullWritable)
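
As a quick usage note, this helper is typically driven from a test that writes a Parquet file first and then asserts on the returned rows. A minimal sketch, where the test path and the expected record count are assumptions used purely for illustration:

Path testFile = new Path("/tmp/parquet-direct-test");  // hypothetical path, for illustration only
List<ArrayWritable> records = read(testFile);
// Each ArrayWritable is an independent row object, because read() allocates a
// fresh value per iteration instead of reusing one buffer.
assertEquals(100, records.size());  // the expected count is an assumption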

Example 7 with FileSplit

Use of org.apache.hadoop.mapred.FileSplit in project hive by apache.

From the class TestOrcSplitElimination, method verifySplits:

private void verifySplits(HashSet<FsWithHash> originalHs, InputSplit[] splits) {
    if (originalHs.size() != splits.length) {
        String s = "Expected [";
        for (FsWithHash fwh : originalHs) {
            s += toString(fwh.fs) + ", ";
        }
        s += "], actual [";
        for (InputSplit fs : splits) {
            s += toString((FileSplit) fs) + ", ";
        }
        fail(s + "]");
    }
    for (int i = 0; i < splits.length; ++i) {
        FileSplit fs = (FileSplit) splits[i];
        if (!originalHs.contains(new FsWithHash(fs))) {
            String s = " in [";
            for (FsWithHash fwh : originalHs) {
                s += toString(fwh.fs) + ", ";
            }
            fail("Cannot find " + toString(fs) + s + "]");
        }
    }
}
Also used : FileSplit(org.apache.hadoop.mapred.FileSplit) InputSplit(org.apache.hadoop.mapred.InputSplit)
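
The FsWithHash wrapper used as the set element is not shown in this snippet. A minimal sketch of what such a wrapper typically looks like, assuming equality over path, start offset, and length (the real class in the Hive test may compare more fields):

// Hypothetical wrapper giving FileSplit value semantics so splits can live in
// a HashSet; requires java.util.Objects. The field choice is an assumption.
private static final class FsWithHash {
    final FileSplit fs;
    FsWithHash(FileSplit fs) {
        this.fs = fs;
    }
    @Override
    public int hashCode() {
        return Objects.hash(fs.getPath(), fs.getStart(), fs.getLength());
    }
    @Override
    public boolean equals(Object o) {
        if (!(o instanceof FsWithHash)) {
            return false;
        }
        FileSplit other = ((FsWithHash) o).fs;
        return fs.getPath().equals(other.getPath())
                && fs.getStart() == other.getStart()
                && fs.getLength() == other.getLength();
    }
}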

Example 8 with FileSplit

Use of org.apache.hadoop.mapred.FileSplit in project hive by apache.

From the class TestOrcSplitElimination, method testFooterExternalCacheImpl:

private void testFooterExternalCacheImpl(boolean isPpd) throws IOException {
    ObjectInspector inspector = createIO();
    writeFile(inspector, testFilePath);
    writeFile(inspector, testFilePath2);
    GenericUDF udf = new GenericUDFOPEqualOrLessThan();
    List<ExprNodeDesc> childExpr = Lists.newArrayList();
    createTestSarg(inspector, udf, childExpr);
    setupExternalCacheConfig(isPpd, testFilePath + "," + testFilePath2);
    // Get the base values w/o cache.
    conf.setBoolean(ConfVars.HIVE_ORC_MS_FOOTER_CACHE_ENABLED.varname, false);
    OrcInputFormatForTest.clearLocalCache();
    OrcInputFormat in0 = new OrcInputFormat();
    InputSplit[] originals = in0.getSplits(conf, -1);
    assertEquals(10, originals.length);
    HashSet<FsWithHash> originalHs = new HashSet<>();
    for (InputSplit original : originals) {
        originalHs.add(new FsWithHash((FileSplit) original));
    }
    // Populate the cache.
    conf.setBoolean(ConfVars.HIVE_ORC_MS_FOOTER_CACHE_ENABLED.varname, true);
    OrcInputFormatForTest in = new OrcInputFormatForTest();
    OrcInputFormatForTest.clearLocalCache();
    OrcInputFormatForTest.caches.resetCounts();
    OrcInputFormatForTest.caches.cache.clear();
    InputSplit[] splits = in.getSplits(conf, -1);
    // Puts, gets, hits, unused, unused.
    @SuppressWarnings("static-access")
    AtomicInteger[] counts = {
        in.caches.putCount,
        isPpd ? in.caches.getByExprCount : in.caches.getCount,
        isPpd ? in.caches.getHitByExprCount : in.caches.getHitCount,
        isPpd ? in.caches.getCount : in.caches.getByExprCount,
        isPpd ? in.caches.getHitCount : in.caches.getHitByExprCount
    };
    verifySplits(originalHs, splits);
    verifyCallCounts(counts, 2, 2, 0);
    assertEquals(2, OrcInputFormatForTest.caches.cache.size());
    // Verify we can get from cache.
    OrcInputFormatForTest.clearLocalCache();
    OrcInputFormatForTest.caches.resetCounts();
    splits = in.getSplits(conf, -1);
    verifySplits(originalHs, splits);
    verifyCallCounts(counts, 0, 2, 2);
    // Verify ORC SARG still works.
    OrcInputFormatForTest.clearLocalCache();
    OrcInputFormatForTest.caches.resetCounts();
    childExpr.set(1, new ExprNodeConstantDesc(5));
    conf.set("hive.io.filter.expr.serialized", SerializationUtilities.serializeExpression(new ExprNodeGenericFuncDesc(inspector, udf, childExpr)));
    splits = in.getSplits(conf, -1);
    InputSplit[] filtered = { originals[0], originals[4], originals[5], originals[9] };
    originalHs = new HashSet<>();
    for (InputSplit original : filtered) {
        originalHs.add(new FsWithHash((FileSplit) original));
    }
    verifySplits(originalHs, splits);
    verifyCallCounts(counts, 0, 2, 2);
    // Verify corrupted cache value gets replaced.
    OrcInputFormatForTest.clearLocalCache();
    OrcInputFormatForTest.caches.resetCounts();
    Map.Entry<Long, MockExternalCaches.MockItem> e = OrcInputFormatForTest.caches.cache.entrySet().iterator().next();
    Long key = e.getKey();
    byte[] someData = new byte[8];
    ByteBuffer toCorrupt = e.getValue().data;
    System.arraycopy(toCorrupt.array(), toCorrupt.arrayOffset(), someData, 0, someData.length);
    toCorrupt.putLong(0, 0L);
    splits = in.getSplits(conf, -1);
    verifySplits(originalHs, splits);
    if (!isPpd) {
        // Recovery is not implemented yet for PPD path.
        ByteBuffer restored = OrcInputFormatForTest.caches.cache.get(key).data;
        byte[] newData = new byte[someData.length];
        System.arraycopy(restored.array(), restored.arrayOffset(), newData, 0, newData.length);
        assertArrayEquals(someData, newData);
    }
}
Also used : GenericUDFOPEqualOrLessThan(org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrLessThan) FileSplit(org.apache.hadoop.mapred.FileSplit) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) InputSplit(org.apache.hadoop.mapred.InputSplit) HashSet(java.util.HashSet) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) ExprNodeConstantDesc(org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc) ExprNodeGenericFuncDesc(org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc) ByteBuffer(java.nio.ByteBuffer) GenericUDF(org.apache.hadoop.hive.ql.udf.generic.GenericUDF) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) Map(java.util.Map) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) HashMap(java.util.HashMap)
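
Stripped of the counter bookkeeping, the cache behaviour being exercised reduces to the pattern below. This is only a sketch using the classes and conf key that already appear in the test above; the file writing and SARG setup are as in the method itself:

// Sketch: compute the baseline splits with the metastore footer cache off,
// then recompute with it on; the resulting FileSplits should be identical.
conf.setBoolean(ConfVars.HIVE_ORC_MS_FOOTER_CACHE_ENABLED.varname, false);
InputSplit[] uncached = new OrcInputFormat().getSplits(conf, -1);

conf.setBoolean(ConfVars.HIVE_ORC_MS_FOOTER_CACHE_ENABLED.varname, true);
InputSplit[] cached = new OrcInputFormatForTest().getSplits(conf, -1);
// The first cached call populates the external cache (puts), and a repeated
// call is answered from it (hits), which is what the counters above verify.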

Example 9 with FileSplit

Use of org.apache.hadoop.mapred.FileSplit in project hive by apache.

From the class TestParquetRowGroupFilter, method testRowGroupFilterTakeEffect:

@Test
public void testRowGroupFilterTakeEffect() throws Exception {
    // define schema
    columnNames = "intCol";
    columnTypes = "int";
    StructObjectInspector inspector = getObjectInspector(columnNames, columnTypes);
    MessageType fileSchema = MessageTypeParser.parseMessageType("message hive_schema {\n" + "  optional int32 intCol;\n" + "}\n");
    conf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, "intCol");
    conf.set("columns", "intCol");
    conf.set("columns.types", "int");
    // create Parquet file with specific data
    Path testPath = writeDirect("RowGroupFilterTakeEffect", fileSchema, new DirectWriter() {

        @Override
        public void write(RecordConsumer consumer) {
            for (int i = 0; i < 100; i++) {
                consumer.startMessage();
                consumer.startField("int", 0);
                consumer.addInteger(i);
                consumer.endField("int", 0);
                consumer.endMessage();
            }
        }
    });
    // > 50
    GenericUDF udf = new GenericUDFOPGreaterThan();
    List<ExprNodeDesc> children = Lists.newArrayList();
    ExprNodeColumnDesc columnDesc = new ExprNodeColumnDesc(Integer.class, "intCol", "T", false);
    ExprNodeConstantDesc constantDesc = new ExprNodeConstantDesc(50);
    children.add(columnDesc);
    children.add(constantDesc);
    ExprNodeGenericFuncDesc genericFuncDesc = new ExprNodeGenericFuncDesc(inspector, udf, children);
    String searchArgumentStr = SerializationUtilities.serializeExpression(genericFuncDesc);
    conf.set(TableScanDesc.FILTER_EXPR_CONF_STR, searchArgumentStr);
    // Read the whole file as one split; this is a local test, so no host locations are passed.
    FileSplit split = new FileSplit(testPath, 0, fileLength(testPath), (String[]) null);
    ParquetRecordReaderWrapper recordReader = (ParquetRecordReaderWrapper)
            new MapredParquetInputFormat().getRecordReader(split, conf, null);
    Assert.assertEquals("row group is not filtered correctly", 1, recordReader.getFiltedBlocks().size());
    // > 100
    constantDesc = new ExprNodeConstantDesc(100);
    children.set(1, constantDesc);
    genericFuncDesc = new ExprNodeGenericFuncDesc(inspector, udf, children);
    searchArgumentStr = SerializationUtilities.serializeExpression(genericFuncDesc);
    conf.set(TableScanDesc.FILTER_EXPR_CONF_STR, searchArgumentStr);
    recordReader = (ParquetRecordReaderWrapper)
            new MapredParquetInputFormat().getRecordReader(split, conf, null);
    Assert.assertEquals("row group is not filtered correctly", 0, recordReader.getFiltedBlocks().size());
}
Also used : Path(org.apache.hadoop.fs.Path) GenericUDFOPGreaterThan(org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPGreaterThan) ExprNodeConstantDesc(org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc) ExprNodeGenericFuncDesc(org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc) ParquetRecordReaderWrapper(org.apache.hadoop.hive.ql.io.parquet.read.ParquetRecordReaderWrapper) RecordConsumer(org.apache.parquet.io.api.RecordConsumer) FileSplit(org.apache.hadoop.mapred.FileSplit) GenericUDF(org.apache.hadoop.hive.ql.udf.generic.GenericUDF) ExprNodeColumnDesc(org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) MessageType(org.apache.parquet.schema.MessageType) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) Test(org.junit.Test)
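
The serialize-and-set pattern is not specific to the greater-than predicate; any generic UDF comparison can be pushed down the same way. Below is a sketch for an equality filter on the same column, assuming GenericUDFOPEqual is available alongside the other generic UDFs used above:

// Sketch: build intCol = 50 with the same ExprNode machinery and hand it to
// the reader through the same filter-expression conf key.
GenericUDF eqUdf = new GenericUDFOPEqual();  // assumed to exist like the other OP UDFs
List<ExprNodeDesc> eqChildren = Lists.newArrayList();
eqChildren.add(new ExprNodeColumnDesc(Integer.class, "intCol", "T", false));
eqChildren.add(new ExprNodeConstantDesc(50));
ExprNodeGenericFuncDesc eqFunc = new ExprNodeGenericFuncDesc(inspector, eqUdf, eqChildren);
conf.set(TableScanDesc.FILTER_EXPR_CONF_STR, SerializationUtilities.serializeExpression(eqFunc));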

Example 10 with FileSplit

Use of org.apache.hadoop.mapred.FileSplit in project hive by apache.

From the class SplitGrouper, method schemaEvolved:

private boolean schemaEvolved(InputSplit s, InputSplit prevSplit, boolean groupAcrossFiles, MapWork work) throws IOException {
    boolean retval = false;
    Path path = ((FileSplit) s).getPath();
    PartitionDesc pd = HiveFileFormatUtils.getPartitionDescFromPathRecursively(work.getPathToPartitionInfo(), path, cache);
    String currentDeserializerClass = pd.getDeserializerClassName();
    Class<?> currentInputFormatClass = pd.getInputFileFormatClass();
    Class<?> previousInputFormatClass = null;
    String previousDeserializerClass = null;
    if (prevSplit != null) {
        Path prevPath = ((FileSplit) prevSplit).getPath();
        if (!groupAcrossFiles) {
            return !path.equals(prevPath);
        }
        PartitionDesc prevPD = HiveFileFormatUtils.getPartitionDescFromPathRecursively(work.getPathToPartitionInfo(), prevPath, cache);
        previousDeserializerClass = prevPD.getDeserializerClassName();
        previousInputFormatClass = prevPD.getInputFileFormatClass();
    }
    if ((currentInputFormatClass != previousInputFormatClass) || (!currentDeserializerClass.equals(previousDeserializerClass))) {
        retval = true;
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("Adding split " + path + " to src new group? " + retval);
    }
    return retval;
}
Also used : Path(org.apache.hadoop.fs.Path) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) FileSplit(org.apache.hadoop.mapred.FileSplit)
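
A caller typically walks an ordered list of splits and starts a new group whenever this check returns true. A minimal sketch of such a loop (hypothetical, not the actual SplitGrouper grouping code):

// Hypothetical consumer of schemaEvolved(): cut the split sequence into groups
// at every point where the input format or deserializer changes.
List<List<InputSplit>> groups = new ArrayList<>();
List<InputSplit> current = new ArrayList<>();
InputSplit prev = null;
for (InputSplit s : splits) {
    if (prev != null && schemaEvolved(s, prev, groupAcrossFiles, work)) {
        groups.add(current);
        current = new ArrayList<>();
    }
    current.add(s);
    prev = s;
}
if (!current.isEmpty()) {
    groups.add(current);
}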

Aggregations

FileSplit (org.apache.hadoop.mapred.FileSplit): 61
Path (org.apache.hadoop.fs.Path): 36
InputSplit (org.apache.hadoop.mapred.InputSplit): 21
JobConf (org.apache.hadoop.mapred.JobConf): 15
File (java.io.File): 10
IOException (java.io.IOException): 10
FileStatus (org.apache.hadoop.fs.FileStatus): 10
FileSystem (org.apache.hadoop.fs.FileSystem): 10
Configuration (org.apache.hadoop.conf.Configuration): 9
RecordReader (org.apache.hadoop.mapred.RecordReader): 8
ArrayList (java.util.ArrayList): 7
Properties (java.util.Properties): 7
StructField (org.apache.hadoop.hive.serde2.objectinspector.StructField): 7
Test (org.junit.Test): 7
ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector): 5
NullWritable (org.apache.hadoop.io.NullWritable): 5
NodeControllerInfo (org.apache.hyracks.api.client.NodeControllerInfo): 4
ClusterTopology (org.apache.hyracks.api.topology.ClusterTopology): 4
RecordCursor (com.facebook.presto.spi.RecordCursor): 3
INTEGER (com.facebook.presto.spi.type.IntegerType.INTEGER): 3