
Example 21 with ColumnVector

Use of org.apache.hadoop.hive.ql.exec.vector.ColumnVector in project hive by apache.

Class VectorDeserializeOrcWriter, method flushBatch().

private void flushBatch() throws IOException {
    addBatchToWriter();
    if (!isAsync) {
        for (int c = 0; c < sourceBatch.cols.length; ++c) {
            // This resets vectors in both batches.
            ColumnVector colVector = sourceBatch.cols[c];
            if (colVector != null) {
                colVector.reset();
                colVector.init();
            }
        }
        sourceBatch.selectedInUse = false;
        sourceBatch.size = 0;
        sourceBatch.endOfFile = false;
        propagateSourceBatchFieldsToDest();
    } else {
        // In addBatchToWriter, we have passed the batch to both ORC and operator pipeline
        // (neither ever changes the vectors). We need a fresh set of vectors to write to.
        // TODO: for now, create this from scratch. Ideally we should return the vectors from ops.
        //       We could also have the ORC thread create it for us in its spare time...
        this.sourceBatch = vrbCtx.createVectorizedRowBatch();
        if (usesSourceIncludes) {
            this.destinationBatch = new VectorizedRowBatch(sourceIncludes.size());
            int inclBatchIx = 0;
            for (Integer columnId : sourceIncludes) {
                destinationBatch.cols[inclBatchIx++] = sourceBatch.cols[columnId];
            }
            destinationBatch.setPartitionInfo(sourceIncludes.size(), 0);
        } else {
            this.destinationBatch = sourceBatch;
        }
    }
}
Also used : VectorizedRowBatch(org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch) ColumnVector(org.apache.hadoop.hive.ql.exec.vector.ColumnVector)
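
The synchronous branch above recycles the source batch in place: each column is reset and re-initialized, and the batch-level flags are cleared. A minimal sketch of that reuse pattern on its own, assuming only the public ColumnVector/VectorizedRowBatch members shown above:

import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;

// Sketch only: recycle a VectorizedRowBatch between writes, as the
// non-async path of flushBatch() does.
static void recycleBatch(VectorizedRowBatch batch) {
    for (ColumnVector col : batch.cols) {
        if (col != null) {
            col.reset();  // clear null/repeating state
            col.init();   // re-initialize any per-type buffers
        }
    }
    batch.selectedInUse = false;
    batch.size = 0;
    batch.endOfFile = false;
}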

Example 22 with ColumnVector

Use of org.apache.hadoop.hive.ql.exec.vector.ColumnVector in project h2o-3 by h2oai.

Class OrcTestUtils, method compareFrameContents().

static int compareFrameContents(String fileName, Set<String> failedFiles, Frame h2oFrame, Reader orcReader, String[] colTypes, String[] colNames, boolean[] toInclude) {
    // get all stripe info
    List<StripeInformation> stripesInfo = orcReader.getStripes();
    int wrongTests = 0;
    if (stripesInfo.size() == 0) {
        // Orc file contains no data
        assertEquals("Orc file is empty.  H2O frame row number should be zero: ", 0, h2oFrame.numRows());
    } else {
        // row index into H2O frame
        Long startRowIndex = 0L;
        for (StripeInformation oneStripe : stripesInfo) {
            try {
                RecordReader perStripe = orcReader.rows(oneStripe.getOffset(), oneStripe.getDataLength(), toInclude, null, colNames);
                // read orc file stripes in vectorizedRowBatch
                VectorizedRowBatch batch = perStripe.nextBatch(null);
                boolean done = false;
                Long rowCounts = 0L;
                // row number of current stripe
                Long rowNumber = oneStripe.getNumberOfRows();
                while (!done) {
                    // row number of current batch
                    long currentBatchRow = batch.count();
                    ColumnVector[] dataVectors = batch.cols;
                    int colIndex = 0;
                    for (int cIdx = 0; cIdx < batch.numCols; cIdx++) {
                        // read one column at a time;
                        if (toInclude[cIdx + 1]) {
                            compare1Cloumn(dataVectors[cIdx], colTypes[colIndex].toLowerCase(), colIndex, currentBatchRow, h2oFrame.vec(colNames[colIndex]), startRowIndex);
                            colIndex++;
                        }
                    }
                    // record number of rows of data actually read
                    rowCounts = rowCounts + currentBatchRow;
                    startRowIndex = startRowIndex + currentBatchRow;
                    // read all rows of the stripe already?
                    if (rowCounts >= rowNumber)
                        done = true;
                    // not done yet, get next batch
                    if (!done)
                        batch = perStripe.nextBatch(batch);
                }
                perStripe.close();
            } catch (Throwable e) {
                failedFiles.add(fileName);
                e.printStackTrace();
                wrongTests += 1;
            }
        }
    }
    return wrongTests;
}
Also used : VectorizedRowBatch(org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch) RecordReader(org.apache.hadoop.hive.ql.io.orc.RecordReader) StripeInformation(org.apache.hadoop.hive.ql.io.orc.StripeInformation) DecimalColumnVector(org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector) BytesColumnVector(org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector) LongColumnVector(org.apache.hadoop.hive.ql.exec.vector.LongColumnVector) ColumnVector(org.apache.hadoop.hive.ql.exec.vector.ColumnVector) DoubleColumnVector(org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector)
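
The per-column comparison helper, compare1Cloumn, is not shown here. A hedged sketch of the kind of per-row read it would need, honoring isRepeating and the null flags of a LongColumnVector (hypothetical helper, not from the h2o-3 source):

import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;

// Hypothetical helper: read one row out of a LongColumnVector.
// Repeating vectors store a single value at index 0; null entries
// are flagged in isNull when noNulls is false.
static Long readLong(LongColumnVector col, int row) {
    int idx = col.isRepeating ? 0 : row;
    if (!col.noNulls && col.isNull[idx]) {
        return null;  // SQL NULL
    }
    return col.vector[idx];
}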

Example 23 with ColumnVector

Use of org.apache.hadoop.hive.ql.exec.vector.ColumnVector in project hive by apache.

Class EncodedTreeReaderFactory, method createEncodedTreeReader().

private static TreeReader createEncodedTreeReader(TypeDescription schema, List<OrcProto.ColumnEncoding> encodings, OrcEncodedColumnBatch batch, CompressionCodec codec, TreeReaderFactory.Context context) throws IOException {
    int columnIndex = schema.getId();
    ColumnStreamData[] streamBuffers = null;
    List<ColumnVector> vectors = null;
    if (batch.hasData(columnIndex)) {
        streamBuffers = batch.getColumnData(columnIndex);
    } else if (batch.hasVectors(columnIndex)) {
        vectors = batch.getColumnVectors(columnIndex);
    } else {
        throw new AssertionError("Batch has no data for " + columnIndex + ": " + batch);
    }
    // EncodedColumnBatch is already decompressed, we don't really need to pass codec.
    // But we need to know if the original data is compressed or not. This is used to skip
    // positions in row index properly. If the file is originally compressed,
    // then 1st position (compressed offset) in row index should be skipped to get
    // uncompressed offset, else 1st position should not be skipped.
    // TODO: there should be a better way to do this, code just needs to be modified
    OrcProto.ColumnEncoding columnEncoding = encodings.get(columnIndex);
    // stream buffers are arranged in enum order of stream kind
    ColumnStreamData present = null, data = null, dictionary = null, lengths = null, secondary = null;
    if (streamBuffers != null) {
        present = streamBuffers[OrcProto.Stream.Kind.PRESENT_VALUE];
        data = streamBuffers[OrcProto.Stream.Kind.DATA_VALUE];
        dictionary = streamBuffers[OrcProto.Stream.Kind.DICTIONARY_DATA_VALUE];
        lengths = streamBuffers[OrcProto.Stream.Kind.LENGTH_VALUE];
        secondary = streamBuffers[OrcProto.Stream.Kind.SECONDARY_VALUE];
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("columnIndex: {} columnType: {} streamBuffers.length: {} vectors: {} columnEncoding: {}" + " present: {} data: {} dictionary: {} lengths: {} secondary: {} tz: {}", columnIndex, schema, streamBuffers == null ? 0 : streamBuffers.length, vectors == null ? 0 : vectors.size(), columnEncoding, present != null, data, dictionary != null, lengths != null, secondary != null, context.getWriterTimezone());
    }
    // TODO: get rid of the builders - they serve no purpose... just call ctors directly.
    switch(schema.getCategory()) {
        case BINARY:
        case BOOLEAN:
        case BYTE:
        case SHORT:
        case INT:
        case LONG:
        case FLOAT:
        case DOUBLE:
        case CHAR:
        case VARCHAR:
        case STRING:
        case DECIMAL:
        case TIMESTAMP:
        case DATE:
            return getPrimitiveTreeReader(columnIndex, schema, codec, columnEncoding, present, data, dictionary, lengths, secondary, context, vectors);
        case LIST:
            // Not currently supported.
            assert vectors == null;
            TypeDescription elementType = schema.getChildren().get(0);
            TreeReader elementReader = createEncodedTreeReader(elementType, encodings, batch, codec, context);
            return ListStreamReader.builder().setColumnIndex(columnIndex).setColumnEncoding(columnEncoding).setCompressionCodec(codec).setPresentStream(present).setLengthStream(lengths).setElementReader(elementReader).setContext(context).build();
        case MAP:
            // Not currently supported.
            assert vectors == null;
            TypeDescription keyType = schema.getChildren().get(0);
            TypeDescription valueType = schema.getChildren().get(1);
            TreeReader keyReader = createEncodedTreeReader(keyType, encodings, batch, codec, context);
            TreeReader valueReader = createEncodedTreeReader(valueType, encodings, batch, codec, context);
            return MapStreamReader.builder().setColumnIndex(columnIndex).setColumnEncoding(columnEncoding).setCompressionCodec(codec).setPresentStream(present).setLengthStream(lengths).setKeyReader(keyReader).setValueReader(valueReader).setContext(context).build();
        case STRUCT:
            {
                // Not currently supported.
                assert vectors == null;
                int childCount = schema.getChildren().size();
                TreeReader[] childReaders = new TreeReader[childCount];
                for (int i = 0; i < childCount; i++) {
                    TypeDescription childType = schema.getChildren().get(i);
                    childReaders[i] = createEncodedTreeReader(childType, encodings, batch, codec, context);
                }
                return StructStreamReader.builder().setColumnIndex(columnIndex).setCompressionCodec(codec).setColumnEncoding(columnEncoding).setPresentStream(present).setChildReaders(childReaders).setContext(context).build();
            }
        case UNION:
            {
                // Not currently supported.
                assert vectors == null;
                int childCount = schema.getChildren().size();
                TreeReader[] childReaders = new TreeReader[childCount];
                for (int i = 0; i < childCount; i++) {
                    TypeDescription childType = schema.getChildren().get(i);
                    childReaders[i] = createEncodedTreeReader(childType, encodings, batch, codec, context);
                }
                return UnionStreamReader.builder().setColumnIndex(columnIndex).setCompressionCodec(codec).setColumnEncoding(columnEncoding).setPresentStream(present).setDataStream(data).setChildReaders(childReaders).setContext(context).build();
            }
        default:
            throw new UnsupportedOperationException("Data type not supported: " + schema);
    }
}
Also used : OrcProto(org.apache.orc.OrcProto) TypeDescription(org.apache.orc.TypeDescription) ColumnStreamData(org.apache.hadoop.hive.common.io.encoded.EncodedColumnBatch.ColumnStreamData) ColumnVector(org.apache.hadoop.hive.ql.exec.vector.ColumnVector)
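
The factory above relies on the convention that a column's stream buffers are laid out in enum order of OrcProto.Stream.Kind, which is why it indexes the array with the generated *_VALUE constants. A small sketch of that lookup in isolation, assuming streamBuffers comes from OrcEncodedColumnBatch.getColumnData(columnIndex) as above:

import org.apache.hadoop.hive.common.io.encoded.EncodedColumnBatch.ColumnStreamData;
import org.apache.orc.OrcProto;

// Sketch only: look up a column's stream by its protobuf enum number
// (PRESENT_VALUE = 0, DATA_VALUE = 1, ...), as createEncodedTreeReader does.
static ColumnStreamData streamOf(ColumnStreamData[] streamBuffers, OrcProto.Stream.Kind kind) {
    return streamBuffers[kind.getNumber()];
}

For example, streamOf(streamBuffers, OrcProto.Stream.Kind.PRESENT) would return the null-bitmap stream that becomes the present argument above.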

Example 24 with ColumnVector

Use of org.apache.hadoop.hive.ql.exec.vector.ColumnVector in project hive by apache.

Class VectorUDFDateAddColCol, method evaluate().

@Override
public void evaluate(VectorizedRowBatch batch) {
    if (childExpressions != null) {
        super.evaluateChildren(batch);
    }
    ColumnVector inputColVector1 = batch.cols[colNum1];
    LongColumnVector inputColVector2 = (LongColumnVector) batch.cols[colNum2];
    int[] sel = batch.selected;
    int n = batch.size;
    long[] vector2 = inputColVector2.vector;
    LongColumnVector outV = (LongColumnVector) batch.cols[outputColumn];
    long[] outputVector = outV.vector;
    if (n <= 0) {
        // Nothing to do
        return;
    }
    // Handle null
    NullUtil.propagateNullsColCol(inputColVector1, inputColVector2, outV, batch.selected, batch.size, batch.selectedInUse);
    switch(inputTypes[0]) {
        case DATE:
            // Now disregard null in second pass.
            if ((inputColVector1.isRepeating) && (inputColVector2.isRepeating)) {
                // All must be selected otherwise size would be zero
                // Repeating property will not change.
                outV.isRepeating = true;
                outputVector[0] = evaluateDate(inputColVector1, 0, vector2[0]);
            } else if (batch.selectedInUse) {
                for (int j = 0; j != n; j++) {
                    int i = sel[j];
                    outputVector[i] = evaluateDate(inputColVector1, i, vector2[i]);
                }
            } else {
                for (int i = 0; i != n; i++) {
                    outputVector[i] = evaluateDate(inputColVector1, i, vector2[i]);
                }
            }
            break;
        case TIMESTAMP:
            // Now disregard null in second pass.
            if ((inputColVector1.isRepeating) && (inputColVector2.isRepeating)) {
                // All must be selected otherwise size would be zero
                // Repeating property will not change.
                outV.isRepeating = true;
                outputVector[0] = evaluateTimestamp(inputColVector1, 0, vector2[0]);
            } else if (batch.selectedInUse) {
                for (int j = 0; j != n; j++) {
                    int i = sel[j];
                    outputVector[i] = evaluateTimestamp(inputColVector1, i, vector2[i]);
                }
            } else {
                for (int i = 0; i != n; i++) {
                    outputVector[i] = evaluateTimestamp(inputColVector1, i, vector2[i]);
                }
            }
            break;
        case STRING:
        case CHAR:
        case VARCHAR:
            // Now disregard null in second pass.
            if ((inputColVector1.isRepeating) && (inputColVector2.isRepeating)) {
                // All must be selected otherwise size would be zero
                // Repeating property will not change.
                outV.isRepeating = true;
                evaluateString((BytesColumnVector) inputColVector1, outV, 0, vector2[0]);
            } else if (batch.selectedInUse) {
                for (int j = 0; j != n; j++) {
                    int i = sel[j];
                    evaluateString((BytesColumnVector) inputColVector1, outV, i, vector2[i]);
                }
            } else {
                for (int i = 0; i != n; i++) {
                    evaluateString((BytesColumnVector) inputColVector1, outV, i, vector2[i]);
                }
            }
            break;
        default:
            throw new Error("Unsupported input type " + inputTypes[0].name());
    }
}
Also used : BytesColumnVector(org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector) LongColumnVector(org.apache.hadoop.hive.ql.exec.vector.LongColumnVector) ColumnVector(org.apache.hadoop.hive.ql.exec.vector.ColumnVector) TimestampColumnVector(org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector) BytesColumnVector(org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector) LongColumnVector(org.apache.hadoop.hive.ql.exec.vector.LongColumnVector)
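
Each per-type branch above follows the same three-way split: a repeating input yields a repeating output computed once, otherwise the loop runs over selected[] when selectedInUse is set, or densely over the first batch.size rows. A stripped-down sketch of that skeleton, ignoring nulls for brevity (a generic template, not the Hive implementation):

import java.util.function.LongUnaryOperator;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;

// Sketch only: the repeating / selected / dense branching used by vectorized
// expressions such as VectorUDFDateAddColCol.evaluate(). Null handling omitted.
static void applyUnary(VectorizedRowBatch batch, int inCol, int outCol, LongUnaryOperator fn) {
    LongColumnVector in = (LongColumnVector) batch.cols[inCol];
    LongColumnVector out = (LongColumnVector) batch.cols[outCol];
    int n = batch.size;
    if (n == 0) {
        return;  // nothing selected in this batch
    }
    if (in.isRepeating) {
        out.isRepeating = true;
        out.vector[0] = fn.applyAsLong(in.vector[0]);
    } else if (batch.selectedInUse) {
        for (int j = 0; j < n; j++) {
            int i = batch.selected[j];
            out.vector[i] = fn.applyAsLong(in.vector[i]);
        }
    } else {
        for (int i = 0; i < n; i++) {
            out.vector[i] = fn.applyAsLong(in.vector[i]);
        }
    }
}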

Example 25 with ColumnVector

Use of org.apache.hadoop.hive.ql.exec.vector.ColumnVector in project hive by apache.

Class VectorUDFDateAddColScalar, method evaluate().

@Override
public void evaluate(VectorizedRowBatch batch) {
    if (childExpressions != null) {
        super.evaluateChildren(batch);
    }
    LongColumnVector outV = (LongColumnVector) batch.cols[outputColumn];
    ColumnVector inputCol = batch.cols[this.colNum];
    /* every line below this is identical for evaluateLong & evaluateString */
    final int n = inputCol.isRepeating ? 1 : batch.size;
    int[] sel = batch.selected;
    final boolean selectedInUse = (inputCol.isRepeating == false) && batch.selectedInUse;
    if (batch.size == 0) {
        /* n != batch.size when isRepeating */
        return;
    }
    /* true for all algebraic UDFs with no state */
    outV.isRepeating = inputCol.isRepeating;
    switch(inputTypes[0]) {
        case DATE:
            if (inputCol.noNulls) {
                outV.noNulls = true;
                if (selectedInUse) {
                    for (int j = 0; j < n; j++) {
                        int i = sel[j];
                        outV.vector[i] = evaluateDate(inputCol, i);
                    }
                } else {
                    for (int i = 0; i < n; i++) {
                        outV.vector[i] = evaluateDate(inputCol, i);
                    }
                }
            } else {
                // Handle case with nulls. Don't do function if the value is null, to save time,
                // because calling the function can be expensive.
                outV.noNulls = false;
                if (selectedInUse) {
                    for (int j = 0; j < n; j++) {
                        int i = sel[j];
                        outV.isNull[i] = inputCol.isNull[i];
                        if (!inputCol.isNull[i]) {
                            outV.vector[i] = evaluateDate(inputCol, i);
                        }
                    }
                } else {
                    for (int i = 0; i < n; i++) {
                        outV.isNull[i] = inputCol.isNull[i];
                        if (!inputCol.isNull[i]) {
                            outV.vector[i] = evaluateDate(inputCol, i);
                        }
                    }
                }
            }
            break;
        case TIMESTAMP:
            if (inputCol.noNulls) {
                outV.noNulls = true;
                if (batch.selectedInUse) {
                    for (int j = 0; j < n; j++) {
                        int i = sel[j];
                        outV.vector[i] = evaluateTimestamp(inputCol, i);
                    }
                } else {
                    for (int i = 0; i < n; i++) {
                        outV.vector[i] = evaluateTimestamp(inputCol, i);
                    }
                }
            } else {
                // Handle case with nulls. Don't do function if the value is null, to save time,
                // because calling the function can be expensive.
                outV.noNulls = false;
                if (batch.selectedInUse) {
                    for (int j = 0; j < n; j++) {
                        int i = sel[j];
                        outV.isNull[i] = inputCol.isNull[i];
                        if (!inputCol.isNull[i]) {
                            outV.vector[i] = evaluateTimestamp(inputCol, i);
                        }
                    }
                } else {
                    for (int i = 0; i < n; i++) {
                        outV.isNull[i] = inputCol.isNull[i];
                        if (!inputCol.isNull[i]) {
                            outV.vector[i] = evaluateTimestamp(inputCol, i);
                        }
                    }
                }
            }
            break;
        case STRING:
        case CHAR:
        case VARCHAR:
            if (inputCol.noNulls) {
                outV.noNulls = true;
                if (batch.selectedInUse) {
                    for (int j = 0; j < n; j++) {
                        int i = sel[j];
                        evaluateString(inputCol, outV, i);
                    }
                } else {
                    for (int i = 0; i < n; i++) {
                        evaluateString(inputCol, outV, i);
                    }
                }
            } else {
                // Handle case with nulls. Don't do function if the value is null, to save time,
                // because calling the function can be expensive.
                outV.noNulls = false;
                if (batch.selectedInUse) {
                    for (int j = 0; j < n; j++) {
                        int i = sel[j];
                        outV.isNull[i] = inputCol.isNull[i];
                        if (!inputCol.isNull[i]) {
                            evaluateString(inputCol, outV, i);
                        }
                    }
                } else {
                    for (int i = 0; i < n; i++) {
                        outV.isNull[i] = inputCol.isNull[i];
                        if (!inputCol.isNull[i]) {
                            evaluateString(inputCol, outV, i);
                        }
                    }
                }
            }
            break;
        default:
            throw new Error("Unsupported input type " + inputTypes[0].name());
    }
}
Also used : LongColumnVector(org.apache.hadoop.hive.ql.exec.vector.LongColumnVector) ColumnVector(org.apache.hadoop.hive.ql.exec.vector.ColumnVector) TimestampColumnVector(org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector) BytesColumnVector(org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector) LongColumnVector(org.apache.hadoop.hive.ql.exec.vector.LongColumnVector)
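
When the input can contain nulls, the pattern adds one more step: copy the input null flags to the output and evaluate only the non-null rows, as each noNulls == false branch above does. A minimal hedged sketch of just that step, where n is assumed to be batch.size (or 1 when the input vector is repeating):

import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;

// Sketch only: propagate null flags and skip the (potentially expensive)
// per-row function for null entries, mirroring the null-handling branches above.
static void copyNullsAndApply(LongColumnVector in, LongColumnVector out, int n) {
    out.noNulls = false;
    for (int i = 0; i < n; i++) {
        out.isNull[i] = in.isNull[i];
        if (!in.isNull[i]) {
            out.vector[i] = in.vector[i] + 1;  // placeholder for the real evaluation
        }
    }
}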

Aggregations

ColumnVector (org.apache.hadoop.hive.ql.exec.vector.ColumnVector): 43
LongColumnVector (org.apache.hadoop.hive.ql.exec.vector.LongColumnVector): 24
BytesColumnVector (org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector): 19
TimestampColumnVector (org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector): 14
DoubleColumnVector (org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector): 11
VectorizedRowBatch (org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch): 9
DecimalColumnVector (org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector): 8
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 4
TestVectorizedRowBatch (org.apache.hadoop.hive.ql.exec.vector.TestVectorizedRowBatch): 3
Output (org.apache.hadoop.hive.serde2.ByteStream.Output): 3
BinarySortableSerializeWrite (org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableSerializeWrite): 3
Test (org.junit.Test): 3
ParseException (java.text.ParseException): 2
IOException (java.io.IOException): 1
Timestamp (java.sql.Timestamp): 1
ArrayList (java.util.ArrayList): 1
ColumnStreamData (org.apache.hadoop.hive.common.io.encoded.EncodedColumnBatch.ColumnStreamData): 1
LlapDataBuffer (org.apache.hadoop.hive.llap.cache.LlapDataBuffer): 1
SerDeStripeMetadata (org.apache.hadoop.hive.llap.io.decode.GenericColumnVectorProducer.SerDeStripeMetadata): 1
JoinUtil (org.apache.hadoop.hive.ql.exec.JoinUtil): 1