Example 66 with ColumnVector

Use of org.apache.hadoop.hive.ql.exec.vector.ColumnVector in project hive by apache: class ListIndexColColumn, method evaluate().

@Override
public void evaluate(VectorizedRowBatch batch) {
    if (childExpressions != null) {
        super.evaluateChildren(batch);
    }
    ColumnVector outV = batch.cols[outputColumnNum];
    ListColumnVector listV = (ListColumnVector) batch.cols[listColumnNum];
    ColumnVector childV = listV.child;
    LongColumnVector indexColumnVector = (LongColumnVector) batch.cols[indexColumnNum];
    long[] indexV = indexColumnVector.vector;
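    // Note: this version assumes the index column holds no NULLs and no
    // negative values; only an index past the end of a list yields a NULL output.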
    // We do not need to do a column reset since we are carefully changing the output.
    outV.isRepeating = false;
    if (listV.isRepeating) {
        if (listV.isNull[0]) {
            outV.isNull[0] = true;
            outV.noNulls = false;
            outV.isRepeating = true;
        } else {
            if (indexColumnVector.isRepeating) {
                if (indexV[0] >= listV.lengths[0]) {
                    outV.isNull[0] = true;
                    outV.noNulls = false;
                } else {
                    outV.isNull[0] = false;
                    outV.setElement(0, (int) (listV.offsets[0] + indexV[0]), childV);
                }
                outV.isRepeating = true;
            } else {
                for (int i = 0; i < batch.size; i++) {
                    int j = (batch.selectedInUse) ? batch.selected[i] : i;
                    if (indexV[j] >= listV.lengths[0]) {
                        outV.isNull[j] = true;
                        outV.noNulls = false;
                    } else {
                        outV.isNull[j] = false;
                        outV.setElement(j, (int) (listV.offsets[0] + indexV[j]), childV);
                    }
                }
            }
        }
    } else {
        for (int i = 0; i < batch.size; i++) {
            int j = (batch.selectedInUse) ? batch.selected[i] : i;
            if (listV.isNull[j] || indexV[j] >= listV.lengths[j]) {
                outV.isNull[j] = true;
                outV.noNulls = false;
            } else {
                outV.isNull[j] = false;
                outV.setElement(j, (int) (listV.offsets[j] + indexV[j]), childV);
            }
        }
    }
}
Also used: ColumnVector (org.apache.hadoop.hive.ql.exec.vector.ColumnVector), ListColumnVector (org.apache.hadoop.hive.ql.exec.vector.ListColumnVector), LongColumnVector (org.apache.hadoop.hive.ql.exec.vector.LongColumnVector)
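
The lookup above relies on ListColumnVector's flattened layout: all lists share one child vector, with offsets[j] and lengths[j] describing row j's slice of it. The following is a minimal illustrative sketch of that layout (the demo class and its values are invented for illustration, not taken from the Hive sources):

import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;

public class ListLayoutDemo {
    public static void main(String[] args) {
        // Two rows: row 0 = [10, 20, 30], row 1 = [40, 50].
        LongColumnVector elements = new LongColumnVector(5);
        elements.vector[0] = 10; elements.vector[1] = 20; elements.vector[2] = 30;
        elements.vector[3] = 40; elements.vector[4] = 50;

        ListColumnVector lists = new ListColumnVector(2, elements);
        lists.offsets[0] = 0; lists.lengths[0] = 3;  // row 0 occupies child[0..2]
        lists.offsets[1] = 3; lists.lengths[1] = 2;  // row 1 occupies child[3..4]
        lists.childCount = 5;

        // ListIndexColColumn's per-row lookup is exactly this address
        // arithmetic, guarded by the bounds check index < lengths[row]:
        int row = 0, index = 1;
        long value = elements.vector[(int) (lists.offsets[row] + index)];
        System.out.println(value);  // prints 20: row 0's element at index 1
    }
}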

Example 67 with ColumnVector

Use of org.apache.hadoop.hive.ql.exec.vector.ColumnVector in project hive by apache: class ListIndexColScalar, method evaluate().

@Override
public void evaluate(VectorizedRowBatch batch) {
    if (childExpressions != null) {
        super.evaluateChildren(batch);
    }
    ColumnVector outV = batch.cols[outputColumnNum];
    ListColumnVector listV = (ListColumnVector) batch.cols[listColumnNum];
    ColumnVector childV = listV.child;
    if (listV.isRepeating) {
        if (listV.isNull[0]) {
            outV.isNull[0] = true;
            outV.noNulls = false;
        } else {
            if (index >= listV.lengths[0]) {
                outV.isNull[0] = true;
                outV.noNulls = false;
            } else {
                outV.isNull[0] = false;
                outV.setElement(0, (int) (listV.offsets[0] + index), childV);
            }
        }
        outV.isRepeating = true;
    } else {
        for (int i = 0; i < batch.size; i++) {
            int j = (batch.selectedInUse) ? batch.selected[i] : i;
            if (listV.isNull[j] || index >= listV.lengths[j]) {
                outV.isNull[j] = true;
                outV.noNulls = false;
            } else {
                outV.isNull[j] = false;
                outV.setElement(j, (int) (listV.offsets[j] + index), childV);
            }
        }
        outV.isRepeating = false;
    }
}
Also used: ColumnVector (org.apache.hadoop.hive.ql.exec.vector.ColumnVector), ListColumnVector (org.apache.hadoop.hive.ql.exec.vector.ListColumnVector)
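
Both variants lean on the isRepeating convention: when a vector is flagged repeating, only entry 0 is populated and it logically stands for every row in the batch. A short sketch of how a reader honors that flag (an illustrative snippet, not Hive code):

import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;

public class RepeatingDemo {
    public static void main(String[] args) {
        LongColumnVector v = new LongColumnVector(1024);
        v.isRepeating = true;
        v.noNulls = true;
        // Only entry 0 is meaningful; it stands for all rows.
        v.vector[0] = 42;

        // Readers must consult the flag before indexing past entry 0:
        int row = 7;
        long valueAtRow = v.vector[v.isRepeating ? 0 : row];
        System.out.println(valueAtRow);  // prints 42
    }
}

This is also why ListIndexColScalar can settle a repeating list with a single bounds check against lengths[0] and then mark the output repeating.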

Example 68 with ColumnVector

Use of org.apache.hadoop.hive.ql.exec.vector.ColumnVector in project hive by apache: class TestVectorGenericDateExpressions, method testDateAddColCol().

private void testDateAddColCol(PrimitiveCategory colType1, boolean isPositive) throws HiveException {
    LongColumnVector date1 = newRandomLongColumnVector(10000, size);
    LongColumnVector days2 = newRandomLongColumnVector(1000, size);
    ColumnVector col1 = castTo(date1, colType1);
    LongColumnVector output = new LongColumnVector(size);
    VectorizedRowBatch batch = new VectorizedRowBatch(3, size);
    batch.cols[0] = col1;
    batch.cols[1] = days2;
    batch.cols[2] = output;
    validateDateAdd(batch, date1, days2, colType1, isPositive);
    TestVectorizedRowBatch.addRandomNulls(date1);
    batch.cols[0] = castTo(date1, colType1);
    validateDateAdd(batch, date1, days2, colType1, isPositive);
    TestVectorizedRowBatch.addRandomNulls(days2);
    batch.cols[1] = days2;
    validateDateAdd(batch, date1, days2, colType1, isPositive);
}
Also used: VectorizedRowBatch (org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch), TestVectorizedRowBatch (org.apache.hadoop.hive.ql.exec.vector.TestVectorizedRowBatch), ColumnVector (org.apache.hadoop.hive.ql.exec.vector.ColumnVector), LongColumnVector (org.apache.hadoop.hive.ql.exec.vector.LongColumnVector), TimestampColumnVector (org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector), BytesColumnVector (org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector)
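
The test leans on helpers such as newRandomLongColumnVector(range, size), whose bodies are not shown here. The following is only an assumed reconstruction of what such a helper plausibly does, namely fill a LongColumnVector with non-negative values below range:

import java.util.Random;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;

public class RandomVectorSketch {
    // Hypothetical stand-in; the real helper lives in Hive's test code.
    static LongColumnVector newRandomLongColumnVector(int range, int size) {
        LongColumnVector v = new LongColumnVector(size);
        Random random = new Random();
        for (int i = 0; i < size; i++) {
            v.vector[i] = Math.abs(random.nextInt() % range);  // in [0, range)
        }
        v.noNulls = true;
        return v;
    }
}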

Example 69 with ColumnVector

Use of org.apache.hadoop.hive.ql.exec.vector.ColumnVector in project hive by apache: class SerDeEncodedDataReader, method processOneSlice().

/**
 * Unlike the other overload of processOneSlice, doesn't cache data.
 */
private boolean processOneSlice(Vectors diskData, boolean[] splitIncludes, int stripeIx, StripeData cacheData, long startTime) throws IOException, InterruptedException {
    if (diskData == null) {
        // The other overload should have been used.
        throw new AssertionError();
    }
    // LlapIoImpl.LOG.debug("diskData " + diskData);
    logProcessOneSlice(stripeIx, diskData, cacheData);
    if (cacheData == null && diskData.getRowCount() == 0) {
        // Nothing to process.
        return true;
    }
    ColumnEncoding[] cacheEncodings = cacheData == null ? null : cacheData.getEncodings();
    LlapSerDeDataBuffer[][][] cacheBuffers = cacheData == null ? null : cacheData.getData();
    if (cacheData != null) {
        // Don't validate column count - no encodings for vectors.
        validateCacheAndDisk(cacheData, diskData.getRowCount(), -1, diskData);
    }
    SerDeStripeMetadata metadata = new SerDeStripeMetadata(stripeIx);
    metadata.setEncodings(Arrays.asList(cacheEncodings == null ? new ColumnEncoding[splitIncludes.length] : cacheEncodings));
    metadata.setRowCount(diskData.getRowCount());
    if (LlapIoImpl.LOG.isTraceEnabled()) {
        LlapIoImpl.LOG.trace("Derived stripe metadata for this split is " + metadata);
    }
    consumer.setStripeMetadata(metadata);
    OrcEncodedColumnBatch ecb = useObjectPools ? ECB_POOL.take() : new OrcEncodedColumnBatch();
    ecb.init(fileKey, metadata.getStripeIx(), OrcEncodedColumnBatch.ALL_RGS, writerIncludes.length);
    int vectorsIx = 0;
    for (int colIx = 0; colIx < writerIncludes.length; ++colIx) {
        // Skip the 0-th column, since it won't have a vector after reading the text source.
        if (colIx == 0)
            continue;
        if (!writerIncludes[colIx])
            continue;
        if (splitIncludes[colIx]) {
            List<ColumnVector> vectors = diskData.getVectors(vectorsIx++);
            if (LlapIoImpl.LOG.isTraceEnabled()) {
                LlapIoImpl.LOG.trace("Processing vectors for column " + colIx + ": " + vectors);
            }
            ecb.initColumnWithVectors(colIx, vectors);
        } else {
            ecb.initColumn(colIx, OrcEncodedColumnBatch.MAX_DATA_STREAMS);
            processColumnCacheData(cacheBuffers, ecb, colIx);
        }
    }
    if (processStop()) {
        recordReaderTime(startTime);
        return false;
    }
    return sendEcbToConsumer(ecb, cacheData != null, null);
}
Also used: ColumnEncoding (org.apache.orc.OrcProto.ColumnEncoding), OrcEncodedColumnBatch (org.apache.hadoop.hive.ql.io.orc.encoded.Reader.OrcEncodedColumnBatch), LlapSerDeDataBuffer (org.apache.hadoop.hive.llap.cache.SerDeLowLevelCacheImpl.LlapSerDeDataBuffer), SerDeStripeMetadata (org.apache.hadoop.hive.llap.io.decode.GenericColumnVectorProducer.SerDeStripeMetadata), ColumnVector (org.apache.hadoop.hive.ql.exec.vector.ColumnVector)

Example 70 with ColumnVector

Use of org.apache.hadoop.hive.ql.exec.vector.ColumnVector in project hive by apache: class OrcEncodedDataConsumer, method decodeBatch().

@Override
protected void decodeBatch(OrcEncodedColumnBatch batch, Consumer<ColumnVectorBatch> downstreamConsumer) throws InterruptedException {
    long startTime = counters.startTimeCounter();
    int currentStripeIndex = batch.getBatchKey().stripeIx;
    boolean sameStripe = currentStripeIndex == previousStripeIndex;
    try {
        ConsumerStripeMetadata stripeMetadata = stripes.get(currentStripeIndex);
        // Get the non-null row count from the root column, to determine the max number of vector batches
        int rgIdx = batch.getBatchKey().rgIx;
        long nonNullRowCount = -1;
        if (rgIdx == OrcEncodedColumnBatch.ALL_RGS) {
            nonNullRowCount = stripeMetadata.getRowCount();
        } else {
            OrcProto.RowIndexEntry rowIndex = stripeMetadata.getRowIndexEntry(0, rgIdx);
            nonNullRowCount = getRowCount(rowIndex);
        }
        int maxBatchesRG = (int) ((nonNullRowCount / VectorizedRowBatch.DEFAULT_SIZE) + 1);
        int batchSize = VectorizedRowBatch.DEFAULT_SIZE;
        TypeDescription fileSchema = fileMetadata.getSchema();
        if (columnReaders == null || !sameStripe) {
            createColumnReaders(batch, stripeMetadata, fileSchema);
        } else {
            repositionInStreams(this.columnReaders, batch, sameStripe, stripeMetadata);
        }
        previousStripeIndex = currentStripeIndex;
        for (int i = 0; i < maxBatchesRG; i++) {
            // for last batch in row group, adjust the batch size
            if (i == maxBatchesRG - 1) {
                batchSize = (int) (nonNullRowCount % VectorizedRowBatch.DEFAULT_SIZE);
                if (batchSize == 0)
                    break;
            }
            ColumnVectorBatch cvb = cvbPool.take();
            // assert cvb.cols.length == batch.getColumnIxs().length; // Must be constant per split.
            cvb.size = batchSize;
            for (int idx = 0; idx < columnReaders.length; ++idx) {
                TreeReader reader = columnReaders[idx];
                if (cvb.cols[idx] == null) {
                    // ORC stores rows inside a root struct (Hive writes it this way).
                    // When we populate column vectors we skip over the root struct.
                    cvb.cols[idx] = createColumn(batchSchemas[idx], VectorizedRowBatch.DEFAULT_SIZE);
                }
                trace.logTreeReaderNextVector(idx);
                /*
           * Currently, ORC's TreeReaderFactory class does this:
           *
           *     public void nextBatch(VectorizedRowBatch batch,
           *              int batchSize) throws IOException {
           *       batch.cols[0].reset();
           *       batch.cols[0].ensureSize(batchSize, false);
           *       nextVector(batch.cols[0], null, batchSize);
           *     }
           *
           * CONCERN:
           *     For better performance, we'd like to *not* do a ColumnVector.reset()
           *     which zeroes out isNull.  Why?  Because there are common cases where
           *     ORC will *immediately* copy its null flags into the isNull array.  This is a
           *     waste.
           *
           *     For correctness, we must do it for now.
           *
           *     The best solution is for ORC to manage the noNulls and isNull array itself
           *     because it knows what NULLs the next set of rows contains.
           *
           *     Its management of the fields of ColumnVector is a little different than what we
           *     must do for vector expressions.  For those, we must maintain the invariant that if
           *     noNulls is true there are no NULLs in any part of the isNull array.  This is
           *     because the next vector expression relies on the invariant.
           *
           *     Given that ORC (or any other producer) is providing *read-only* batches to the
           *     consumer, what is important is that the isNull array through batch.size has
           *     integrity with the noNulls flag.  So, if ORC is giving us 100 rows (for example)
           *     and none of them are NULL, it can safely set or make sure the first 100 isNull
           *     entries are false and safely set noNulls to true.  Any other NULLs (true entries)
           *     in isNull are irrelevant because ORC owns the batch.  It just needs to make sure
           *     it doesn't get confused.
           *
           */
                ColumnVector cv = cvb.cols[idx];
                cv.reset();
                cv.ensureSize(batchSize, false);
                reader.nextVector(cv, null, batchSize);
            }
            // we are done reading a batch, send it to consumer for processing
            downstreamConsumer.consumeData(cvb);
            counters.incrCounter(LlapIOCounters.ROWS_EMITTED, batchSize);
        }
        LlapIoImpl.ORC_LOGGER.debug("Done with decode");
        counters.incrTimeCounter(LlapIOCounters.DECODE_TIME_NS, startTime);
        counters.incrCounter(LlapIOCounters.NUM_VECTOR_BATCHES, maxBatchesRG);
        counters.incrCounter(LlapIOCounters.NUM_DECODED_BATCHES);
    } catch (IOException e) {
        // Caller will return the batch.
        downstreamConsumer.setError(e);
    }
}
Also used: ConsumerStripeMetadata (org.apache.hadoop.hive.llap.io.metadata.ConsumerStripeMetadata), OrcProto (org.apache.orc.OrcProto), TypeDescription (org.apache.orc.TypeDescription), TreeReader (org.apache.orc.impl.TreeReaderFactory.TreeReader), SettableTreeReader (org.apache.hadoop.hive.ql.io.orc.encoded.EncodedTreeReaderFactory.SettableTreeReader), StructTreeReader (org.apache.orc.impl.TreeReaderFactory.StructTreeReader), ColumnVectorBatch (org.apache.hadoop.hive.llap.io.api.impl.ColumnVectorBatch), IOException (java.io.IOException), DecimalColumnVector (org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector), BytesColumnVector (org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector), LongColumnVector (org.apache.hadoop.hive.ql.exec.vector.LongColumnVector), ColumnVector (org.apache.hadoop.hive.ql.exec.vector.ColumnVector), StructColumnVector (org.apache.hadoop.hive.ql.exec.vector.StructColumnVector), MapColumnVector (org.apache.hadoop.hive.ql.exec.vector.MapColumnVector), TimestampColumnVector (org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector), ListColumnVector (org.apache.hadoop.hive.ql.exec.vector.ListColumnVector), UnionColumnVector (org.apache.hadoop.hive.ql.exec.vector.UnionColumnVector), DoubleColumnVector (org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector)
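
The CONCERN comment above pins down a contract worth restating: for a read-only batch handed to a consumer, noNulls == true must mean there are no true entries in isNull up through batch.size (or entry 0 alone, if the vector is repeating); anything past that range is irrelevant. A hedged sketch of a checker for that invariant (an illustrative helper, not part of the Hive API):

import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;

public class NullInvariantSketch {
    // Hypothetical validation helper for the invariant described above.
    static boolean hasConsistentNullFlags(ColumnVector cv, int size) {
        if (!cv.noNulls) {
            return true;  // no promise made; readers consult isNull per row
        }
        int limit = cv.isRepeating ? 1 : size;
        for (int i = 0; i < limit; i++) {
            if (cv.isNull[i]) {
                return false;  // noNulls promised no NULLs through batch.size
            }
        }
        return true;  // entries past `size` belong to the producer
    }
}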

Aggregations

ColumnVector (org.apache.hadoop.hive.ql.exec.vector.ColumnVector): 72 uses
LongColumnVector (org.apache.hadoop.hive.ql.exec.vector.LongColumnVector): 41 uses
BytesColumnVector (org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector): 30 uses
TimestampColumnVector (org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector): 20 uses
DoubleColumnVector (org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector): 19 uses
DecimalColumnVector (org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector): 14 uses
VectorizedRowBatch (org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch): 11 uses
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 6 uses
IOException (java.io.IOException): 5 uses
ListColumnVector (org.apache.hadoop.hive.ql.exec.vector.ListColumnVector): 5 uses
TestVectorizedRowBatch (org.apache.hadoop.hive.ql.exec.vector.TestVectorizedRowBatch): 5 uses
StructColumnVector (org.apache.hadoop.hive.ql.exec.vector.StructColumnVector): 4 uses
Timestamp (java.sql.Timestamp): 3 uses
ArrayList (java.util.ArrayList): 3 uses
MapColumnVector (org.apache.hadoop.hive.ql.exec.vector.MapColumnVector): 3 uses
BinarySortableSerDe (org.apache.hadoop.hive.serde2.binarysortable.BinarySortableSerDe): 3 uses
BinarySortableDeserializeRead (org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableDeserializeRead): 3 uses
LazyBinaryDeserializeRead (org.apache.hadoop.hive.serde2.lazybinary.fast.LazyBinaryDeserializeRead): 3 uses
ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector): 3 uses
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector): 3 uses