
Example 6 with KeyValuesReader

Use of org.apache.tez.runtime.library.api.KeyValuesReader in project hive by apache.

From class KeyValuesInputMerger, method next().

/**
 * @return true if there are more key-values and advances to next key-values
 * @throws IOException
 */
@Override
public boolean next() throws IOException {
    // add the previous nextKVReader back to queue
    if (!nextKVReaders.isEmpty()) {
        for (KeyValuesReader kvReader : nextKVReaders) {
            addToQueue(kvReader);
        }
        nextKVReaders.clear();
    }
    KeyValuesReader nextKVReader = null;
    // get the new nextKVReader with lowest key
    nextKVReader = pQueue.poll();
    if (nextKVReader != null) {
        nextKVReaders.add(nextKVReader);
    }
    while (pQueue.peek() != null) {
        KeyValuesReader equalValueKVReader = pQueue.poll();
        if (pQueue.comparator().compare(nextKVReader, equalValueKVReader) == 0) {
            nextKVReaders.add(equalValueKVReader);
        } else {
            pQueue.add(equalValueKVReader);
            break;
        }
    }
    return !(nextKVReaders.isEmpty());
}
Also used : KeyValuesReader(org.apache.tez.runtime.library.api.KeyValuesReader)
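
For orientation, the merger above implements the standard KeyValuesReader contract (next(), getCurrentKey(), getCurrentValues()), which callers consume in a simple loop. A minimal sketch, with placeholder processing logic in the loop body:

// Minimal sketch of consuming any KeyValuesReader, such as the merged reader above.
// Assumes java.io.IOException and the KeyValuesReader import listed above.
void drain(KeyValuesReader reader) throws IOException {
    while (reader.next()) {                              // advance to the next key
        Object key = reader.getCurrentKey();             // key shared by the values that follow
        for (Object value : reader.getCurrentValues()) { // grouped values for that key
            // process (key, value) here
        }
    }
}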

Example 7 with KeyValuesReader

Use of org.apache.tez.runtime.library.api.KeyValuesReader in project hive by apache.

From class ReduceRecordSource, method init().

void init(JobConf jconf, Operator<?> reducer, boolean vectorized, TableDesc keyTableDesc, TableDesc valueTableDesc, Reader reader, boolean handleGroupKey, byte tag, VectorizedRowBatchCtx batchContext, long vectorizedVertexNum, int vectorizedTestingReducerBatchSize) throws Exception {
    this.vectorizedVertexNum = vectorizedVertexNum;
    if (vectorizedTestingReducerBatchSize > VectorizedRowBatch.DEFAULT_SIZE) {
        // For now, we don't go higher than the default batch size unless we do more work
        // to verify every vectorized operator downstream can handle a larger batch size.
        vectorizedTestingReducerBatchSize = VectorizedRowBatch.DEFAULT_SIZE;
    }
    this.vectorizedTestingReducerBatchSize = vectorizedTestingReducerBatchSize;
    ObjectInspector keyObjectInspector;
    this.reducer = reducer;
    this.vectorized = vectorized;
    this.keyTableDesc = keyTableDesc;
    if (reader instanceof KeyValueReader) {
        this.reader = new KeyValuesFromKeyValue((KeyValueReader) reader);
    } else {
        this.reader = new KeyValuesFromKeyValues((KeyValuesReader) reader);
    }
    this.handleGroupKey = handleGroupKey;
    this.tag = tag;
    try {
        inputKeySerDe = ReflectionUtils.newInstance(keyTableDesc.getSerDeClass(), null);
        inputKeySerDe.initialize(null, keyTableDesc.getProperties(), null);
        keyObjectInspector = inputKeySerDe.getObjectInspector();
        if (vectorized) {
            keyStructInspector = (StructObjectInspector) keyObjectInspector;
            firstValueColumnOffset = keyStructInspector.getAllStructFieldRefs().size();
        }
        // We should initialize the SerDe with the TypeInfo when available.
        this.valueTableDesc = valueTableDesc;
        inputValueSerDe = (AbstractSerDe) ReflectionUtils.newInstance(valueTableDesc.getSerDeClass(), null);
        inputValueSerDe.initialize(null, valueTableDesc.getProperties(), null);
        valueObjectInspector = inputValueSerDe.getObjectInspector();
        ArrayList<ObjectInspector> ois = new ArrayList<ObjectInspector>();
        if (vectorized) {
            /* vectorization only works with struct object inspectors */
            valueStructInspectors = (StructObjectInspector) valueObjectInspector;
            final int totalColumns = firstValueColumnOffset + valueStructInspectors.getAllStructFieldRefs().size();
            rowObjectInspector = Utilities.constructVectorizedReduceRowOI(keyStructInspector, valueStructInspectors);
            batch = batchContext.createVectorizedRowBatch();
            // Setup vectorized deserialization for the key and value.
            BinarySortableSerDe binarySortableSerDe = (BinarySortableSerDe) inputKeySerDe;
            keyBinarySortableDeserializeToRow = new VectorDeserializeRow<BinarySortableDeserializeRead>(new BinarySortableDeserializeRead(VectorizedBatchUtil.typeInfosFromStructObjectInspector(keyStructInspector), (batchContext.getRowdataTypePhysicalVariations().length > firstValueColumnOffset) ? Arrays.copyOfRange(batchContext.getRowdataTypePhysicalVariations(), 0, firstValueColumnOffset) : batchContext.getRowdataTypePhysicalVariations(), /* useExternalBuffer */
            true, binarySortableSerDe.getSortOrders(), binarySortableSerDe.getNullMarkers(), binarySortableSerDe.getNotNullMarkers()));
            keyBinarySortableDeserializeToRow.init(0);
            final int valuesSize = valueStructInspectors.getAllStructFieldRefs().size();
            if (valuesSize > 0) {
                valueLazyBinaryDeserializeToRow = new VectorDeserializeRow<LazyBinaryDeserializeRead>(new LazyBinaryDeserializeRead(VectorizedBatchUtil.typeInfosFromStructObjectInspector(valueStructInspectors), (batchContext.getRowdataTypePhysicalVariations().length >= totalColumns) ? Arrays.copyOfRange(batchContext.getRowdataTypePhysicalVariations(), firstValueColumnOffset, totalColumns) : null, /* useExternalBuffer */
                true));
                valueLazyBinaryDeserializeToRow.init(firstValueColumnOffset);
                // Create data buffers for value bytes column vectors.
                for (int i = firstValueColumnOffset; i < batch.numCols; i++) {
                    ColumnVector colVector = batch.cols[i];
                    if (colVector instanceof BytesColumnVector) {
                        BytesColumnVector bytesColumnVector = (BytesColumnVector) colVector;
                        bytesColumnVector.initBuffer();
                    }
                }
            }
        } else {
            ois.add(keyObjectInspector);
            ois.add(valueObjectInspector);
            rowObjectInspector = ObjectInspectorFactory.getStandardStructObjectInspector(Utilities.reduceFieldNameList, ois);
        }
    } catch (Throwable e) {
        abort = true;
        if (e instanceof OutOfMemoryError) {
            // Don't create a new object if we are already out of memory
            throw (OutOfMemoryError) e;
        } else {
            throw new RuntimeException("Reduce operator initialization failed", e);
        }
    }
    perfLogger.perfLogEnd(CLASS_NAME, PerfLogger.TEZ_INIT_OPERATORS);
}
Also used : ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) BinarySortableSerDe(org.apache.hadoop.hive.serde2.binarysortable.BinarySortableSerDe) KeyValueReader(org.apache.tez.runtime.library.api.KeyValueReader) ArrayList(java.util.ArrayList) BinarySortableDeserializeRead(org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableDeserializeRead) BytesColumnVector(org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector) ColumnVector(org.apache.hadoop.hive.ql.exec.vector.ColumnVector) KeyValuesReader(org.apache.tez.runtime.library.api.KeyValuesReader) LazyBinaryDeserializeRead(org.apache.hadoop.hive.serde2.lazybinary.fast.LazyBinaryDeserializeRead)
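
The init method above hides the difference between single-value and grouped inputs behind a common KeyValuesReader view. Hive's KeyValuesFromKeyValue wrapper is not shown here; the following is only a rough sketch of the adapter idea, with a hypothetical class name, and it ignores the group-key handling the real wrapper performs:

// Hypothetical adapter sketch: expose a plain KeyValueReader through the
// KeyValuesReader API by yielding exactly one value per key. This illustrates
// the wrapping idea only; it is not Hive's KeyValuesFromKeyValue.
// Assumes java.io.IOException, java.util.Collections, and the Tez reader imports above.
class SingleValueKeyValuesReader extends KeyValuesReader {
    private final KeyValueReader kvReader;

    SingleValueKeyValuesReader(KeyValueReader kvReader) {
        this.kvReader = kvReader;
    }

    @Override
    public boolean next() throws IOException {
        return kvReader.next();
    }

    @Override
    public Object getCurrentKey() throws IOException {
        return kvReader.getCurrentKey();
    }

    @Override
    public Iterable<Object> getCurrentValues() throws IOException {
        return Collections.singletonList(kvReader.getCurrentValue());
    }
}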

Example 8 with KeyValuesReader

Use of org.apache.tez.runtime.library.api.KeyValuesReader in project hive by apache.

From class ReduceRecordSource, method init(). This variant differs from the one in Example 7: it omits the vectorizedTestingReducerBatchSize parameter, initializes the key and value deserializers through SerDeUtils.initializeSerDe, and builds VectorExpressionWriter lists for the vectorized path, so it appears to come from a different Hive revision of the same method.

void init(JobConf jconf, Operator<?> reducer, boolean vectorized, TableDesc keyTableDesc, TableDesc valueTableDesc, Reader reader, boolean handleGroupKey, byte tag, VectorizedRowBatchCtx batchContext, long vectorizedVertexNum) throws Exception {
    this.vectorizedVertexNum = vectorizedVertexNum;
    ObjectInspector keyObjectInspector;
    this.reducer = reducer;
    this.vectorized = vectorized;
    this.keyTableDesc = keyTableDesc;
    if (reader instanceof KeyValueReader) {
        this.reader = new KeyValuesFromKeyValue((KeyValueReader) reader);
    } else {
        this.reader = new KeyValuesFromKeyValues((KeyValuesReader) reader);
    }
    this.handleGroupKey = handleGroupKey;
    this.tag = tag;
    try {
        inputKeyDeserializer = ReflectionUtils.newInstance(keyTableDesc.getDeserializerClass(), null);
        SerDeUtils.initializeSerDe(inputKeyDeserializer, null, keyTableDesc.getProperties(), null);
        keyObjectInspector = inputKeyDeserializer.getObjectInspector();
        if (vectorized) {
            keyStructInspector = (StructObjectInspector) keyObjectInspector;
            firstValueColumnOffset = keyStructInspector.getAllStructFieldRefs().size();
        }
        // We should initialize the SerDe with the TypeInfo when available.
        this.valueTableDesc = valueTableDesc;
        inputValueDeserializer = (AbstractSerDe) ReflectionUtils.newInstance(valueTableDesc.getDeserializerClass(), null);
        SerDeUtils.initializeSerDe(inputValueDeserializer, null, valueTableDesc.getProperties(), null);
        valueObjectInspector = inputValueDeserializer.getObjectInspector();
        ArrayList<ObjectInspector> ois = new ArrayList<ObjectInspector>();
        if (vectorized) {
            /* vectorization only works with struct object inspectors */
            valueStructInspectors = (StructObjectInspector) valueObjectInspector;
            final int totalColumns = firstValueColumnOffset + valueStructInspectors.getAllStructFieldRefs().size();
            valueStringWriters = new ArrayList<VectorExpressionWriter>(totalColumns);
            valueStringWriters.addAll(Arrays.asList(VectorExpressionWriterFactory.genVectorStructExpressionWritables(keyStructInspector)));
            valueStringWriters.addAll(Arrays.asList(VectorExpressionWriterFactory.genVectorStructExpressionWritables(valueStructInspectors)));
            rowObjectInspector = Utilities.constructVectorizedReduceRowOI(keyStructInspector, valueStructInspectors);
            batch = batchContext.createVectorizedRowBatch();
            // Setup vectorized deserialization for the key and value.
            BinarySortableSerDe binarySortableSerDe = (BinarySortableSerDe) inputKeyDeserializer;
            keyBinarySortableDeserializeToRow = new VectorDeserializeRow<BinarySortableDeserializeRead>(new BinarySortableDeserializeRead(VectorizedBatchUtil.typeInfosFromStructObjectInspector(keyStructInspector), /* useExternalBuffer */
            true, binarySortableSerDe.getSortOrders()));
            keyBinarySortableDeserializeToRow.init(0);
            final int valuesSize = valueStructInspectors.getAllStructFieldRefs().size();
            if (valuesSize > 0) {
                valueLazyBinaryDeserializeToRow = new VectorDeserializeRow<LazyBinaryDeserializeRead>(new LazyBinaryDeserializeRead(VectorizedBatchUtil.typeInfosFromStructObjectInspector(valueStructInspectors), /* useExternalBuffer */
                true));
                valueLazyBinaryDeserializeToRow.init(firstValueColumnOffset);
                // Create data buffers for value bytes column vectors.
                for (int i = firstValueColumnOffset; i < batch.numCols; i++) {
                    ColumnVector colVector = batch.cols[i];
                    if (colVector instanceof BytesColumnVector) {
                        BytesColumnVector bytesColumnVector = (BytesColumnVector) colVector;
                        bytesColumnVector.initBuffer();
                    }
                }
            }
        } else {
            ois.add(keyObjectInspector);
            ois.add(valueObjectInspector);
            rowObjectInspector = ObjectInspectorFactory.getStandardStructObjectInspector(Utilities.reduceFieldNameList, ois);
        }
    } catch (Throwable e) {
        abort = true;
        if (e instanceof OutOfMemoryError) {
            // Don't create a new object if we are already out of memory
            throw (OutOfMemoryError) e;
        } else {
            throw new RuntimeException("Reduce operator initialization failed", e);
        }
    }
    perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.TEZ_INIT_OPERATORS);
}
Also used : ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) BinarySortableSerDe(org.apache.hadoop.hive.serde2.binarysortable.BinarySortableSerDe) KeyValueReader(org.apache.tez.runtime.library.api.KeyValueReader) ArrayList(java.util.ArrayList) BinarySortableDeserializeRead(org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableDeserializeRead) VectorExpressionWriter(org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpressionWriter) BytesColumnVector(org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector) ColumnVector(org.apache.hadoop.hive.ql.exec.vector.ColumnVector) KeyValuesReader(org.apache.tez.runtime.library.api.KeyValuesReader) LazyBinaryDeserializeRead(org.apache.hadoop.hive.serde2.lazybinary.fast.LazyBinaryDeserializeRead)

Example 9 with KeyValuesReader

Use of org.apache.tez.runtime.library.api.KeyValuesReader in project tez by apache.

From class TestSortedGroupedMergedInput, method testSkippedKey().

@Test(timeout = 5000)
public void testSkippedKey() throws Exception {
    SortedTestKeyValuesReader kvsReader1 = new SortedTestKeyValuesReader(new int[] { 1, 2, 3 }, new int[][] { { 1, 1 }, { 2, 2 }, { 3, 3 } });
    SortedTestKeyValuesReader kvsReader2 = new SortedTestKeyValuesReader(new int[] { 1, 2, 3 }, new int[][] { { 1, 1 }, { 2, 2 }, { 3, 3 } });
    SortedTestKeyValuesReader kvsReader3 = new SortedTestKeyValuesReader(new int[] { 1, 2, 3 }, new int[][] { { 1, 1 }, { 2, 2 }, { 3, 3 } });
    SortedTestInput sInput1 = new SortedTestInput(kvsReader1);
    SortedTestInput sInput2 = new SortedTestInput(kvsReader2);
    SortedTestInput sInput3 = new SortedTestInput(kvsReader3);
    List<Input> sInputs = new LinkedList<Input>();
    sInputs.add(sInput1);
    sInputs.add(sInput2);
    sInputs.add(sInput3);
    OrderedGroupedMergedKVInput input = new OrderedGroupedMergedKVInput(createMergedInputContext(), sInputs);
    KeyValuesReader kvsReader = input.getReader();
    int keyCount = 0;
    while (kvsReader.next()) {
        keyCount++;
        if (keyCount == 2) {
            continue;
        }
        Integer key = (Integer) kvsReader.getCurrentKey();
        assertEquals(Integer.valueOf(keyCount), key);
        Iterator<Object> valuesIter = kvsReader.getCurrentValues().iterator();
        int valCount = 0;
        while (valuesIter.hasNext()) {
            valCount++;
            Integer val = (Integer) valuesIter.next();
            assertEquals(Integer.valueOf(keyCount), val);
        }
        assertEquals(6, valCount);
    }
    getNextFromFinishedReader(kvsReader);
}
Also used : Input(org.apache.tez.runtime.api.Input) KeyValuesReader(org.apache.tez.runtime.library.api.KeyValuesReader) LinkedList(java.util.LinkedList) Test(org.junit.Test)
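
The test above checks that next() still advances correctly when a key's values are left unread. The same pattern in application code looks roughly like the sketch below; shouldProcess is a hypothetical predicate supplied by the caller, not part of the Tez API:

// Sketch: skip an entire key without iterating its values; the reader must
// still position itself correctly on the following next() call.
// Assumes java.io.IOException and java.util.function.Predicate are imported.
void consumeSelectively(KeyValuesReader reader, Predicate<Object> shouldProcess) throws IOException {
    while (reader.next()) {
        Object key = reader.getCurrentKey();
        if (!shouldProcess.test(key)) {
            continue;                                    // values of this key are never touched
        }
        for (Object value : reader.getCurrentValues()) {
            // process (key, value)
        }
    }
}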

Example 10 with KeyValuesReader

Use of org.apache.tez.runtime.library.api.KeyValuesReader in project tez by apache.

From class TestSortedGroupedMergedInput, method testPartialValuesSkip().

@Test(timeout = 5000)
public void testPartialValuesSkip() throws Exception {
    SortedTestKeyValuesReader kvsReader1 = new SortedTestKeyValuesReader(new int[] { 1, 2, 3 }, new int[][] { { 1, 1 }, { 2, 2 }, { 3, 3 } });
    SortedTestKeyValuesReader kvsReader2 = new SortedTestKeyValuesReader(new int[] { 1, 2, 3 }, new int[][] { { 1, 1 }, { 2, 2 }, { 3, 3 } });
    SortedTestKeyValuesReader kvsReader3 = new SortedTestKeyValuesReader(new int[] { 1, 2, 3 }, new int[][] { { 1, 1 }, { 2, 2 }, { 3, 3 } });
    SortedTestInput sInput1 = new SortedTestInput(kvsReader1);
    SortedTestInput sInput2 = new SortedTestInput(kvsReader2);
    SortedTestInput sInput3 = new SortedTestInput(kvsReader3);
    List<Input> sInputs = new LinkedList<Input>();
    sInputs.add(sInput1);
    sInputs.add(sInput2);
    sInputs.add(sInput3);
    OrderedGroupedMergedKVInput input = new OrderedGroupedMergedKVInput(createMergedInputContext(), sInputs);
    KeyValuesReader kvsReader = input.getReader();
    int keyCount = 0;
    while (kvsReader.next()) {
        keyCount++;
        Integer key = (Integer) kvsReader.getCurrentKey();
        assertEquals(Integer.valueOf(keyCount), key);
        Iterator<Object> valuesIter = kvsReader.getCurrentValues().iterator();
        int valCount = 0;
        while (valuesIter.hasNext()) {
            valCount++;
            if (keyCount == 2 && valCount == 3) {
                break;
            }
            Integer val = (Integer) valuesIter.next();
            assertEquals(Integer.valueOf(keyCount), val);
        }
        if (keyCount == 2) {
            assertEquals(3, valCount);
        } else {
            assertEquals(6, valCount);
        }
    }
    getNextFromFinishedReader(kvsReader);
}
Also used : Input(org.apache.tez.runtime.api.Input) KeyValuesReader(org.apache.tez.runtime.library.api.KeyValuesReader) LinkedList(java.util.LinkedList) Test(org.junit.Test)
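
Similarly, this test covers breaking out of a key's value iterator before it is exhausted. A minimal sketch of that usage, with an illustrative maxValuesPerKey limit that is not part of the Tez API:

// Sketch: read only the first maxValuesPerKey values of each key, then move on.
// Assumes java.io.IOException is imported.
void consumePartially(KeyValuesReader reader, int maxValuesPerKey) throws IOException {
    while (reader.next()) {
        Object key = reader.getCurrentKey();
        int seen = 0;
        for (Object value : reader.getCurrentValues()) {
            if (++seen > maxValuesPerKey) {
                break;                                   // remaining values for this key are skipped
            }
            // process (key, value)
        }
    }
}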

Aggregations

KeyValuesReader (org.apache.tez.runtime.library.api.KeyValuesReader) 13
LinkedList (java.util.LinkedList) 9
Input (org.apache.tez.runtime.api.Input) 9
Test (org.junit.Test) 8
ArrayList (java.util.ArrayList) 2
BytesColumnVector (org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector) 2
ColumnVector (org.apache.hadoop.hive.ql.exec.vector.ColumnVector) 2
BinarySortableSerDe (org.apache.hadoop.hive.serde2.binarysortable.BinarySortableSerDe) 2
BinarySortableDeserializeRead (org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableDeserializeRead) 2
LazyBinaryDeserializeRead (org.apache.hadoop.hive.serde2.lazybinary.fast.LazyBinaryDeserializeRead) 2
ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) 2
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) 2
MergedInputContext (org.apache.tez.runtime.api.MergedInputContext) 2
KeyValueReader (org.apache.tez.runtime.library.api.KeyValueReader) 2
IOException (java.io.IOException) 1
VectorExpressionWriter (org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpressionWriter) 1
RawComparator (org.apache.hadoop.io.RawComparator) 1
ProgressHelper (org.apache.tez.common.ProgressHelper) 1
MROutputLegacy (org.apache.tez.mapreduce.output.MROutputLegacy) 1
LogicalInput (org.apache.tez.runtime.api.LogicalInput) 1