Example 6 with StructColumnVector

use of org.apache.hadoop.hive.ql.exec.vector.StructColumnVector in project hive by apache.

the class TestInputOutputFormat method testColumnProjectionWithAcid.

/**
   * Test column projection when using ACID.
   */
@Test
public void testColumnProjectionWithAcid() throws Exception {
    Path baseDir = new Path(workDir, "base_00100");
    testFilePath = new Path(baseDir, "bucket_00000");
    fs.mkdirs(baseDir);
    fs.delete(testFilePath, true);
    TypeDescription fileSchema = TypeDescription.fromString("struct<operation:int," + "originalTransaction:bigint,bucket:int,rowId:bigint," + "currentTransaction:bigint," + "row:struct<a:int,b:struct<c:int>,d:string>>");
    Writer writer = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf).fileSystem(fs).setSchema(fileSchema).compress(org.apache.orc.CompressionKind.NONE));
    VectorizedRowBatch batch = fileSchema.createRowBatch(1000);
    batch.size = 1000;
    StructColumnVector scv = (StructColumnVector) batch.cols[5];
    // operation
    batch.cols[0].isRepeating = true;
    ((LongColumnVector) batch.cols[0]).vector[0] = 0;
    // original transaction
    batch.cols[1].isRepeating = true;
    ((LongColumnVector) batch.cols[1]).vector[0] = 1;
    // bucket
    batch.cols[2].isRepeating = true;
    ((LongColumnVector) batch.cols[2]).vector[0] = 0;
    // current transaction
    batch.cols[4].isRepeating = true;
    ((LongColumnVector) batch.cols[4]).vector[0] = 1;
    LongColumnVector lcv = (LongColumnVector) ((StructColumnVector) scv.fields[1]).fields[0];
    for (int r = 0; r < 1000; r++) {
        // row id
        ((LongColumnVector) batch.cols[3]).vector[r] = r;
        // a
        ((LongColumnVector) scv.fields[0]).vector[r] = r * 42;
        // b.c
        lcv.vector[r] = r * 10001;
        // d
        ((BytesColumnVector) scv.fields[2]).setVal(r, Integer.toHexString(r).getBytes(StandardCharsets.UTF_8));
    }
    writer.addRowBatch(batch);
    writer.addUserMetadata(OrcRecordUpdater.ACID_KEY_INDEX_NAME, ByteBuffer.wrap("0,0,999".getBytes(StandardCharsets.UTF_8)));
    writer.close();
    long fileLength = fs.getFileStatus(testFilePath).getLen();
    // test with same schema with include
    conf.set(ValidTxnList.VALID_TXNS_KEY, "100:99:");
    conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, "a,b,d");
    conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, "int,struct<c:int>,string");
    conf.set(ColumnProjectionUtils.READ_ALL_COLUMNS, "false");
    conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0,2");
    OrcSplit split = new OrcSplit(testFilePath, null, 0, fileLength, new String[0], null, false, true, new ArrayList<AcidInputFormat.DeltaMetaData>(), fileLength, fileLength);
    OrcInputFormat inputFormat = new OrcInputFormat();
    AcidInputFormat.RowReader<OrcStruct> reader = inputFormat.getReader(split, new AcidInputFormat.Options(conf));
    int record = 0;
    RecordIdentifier id = reader.createKey();
    OrcStruct struct = reader.createValue();
    while (reader.next(id, struct)) {
        assertEquals("id " + record, record, id.getRowId());
        assertEquals("bucket " + record, 0, id.getBucketId());
        assertEquals("trans " + record, 1, id.getTransactionId());
        assertEquals("a " + record, 42 * record, ((IntWritable) struct.getFieldValue(0)).get());
        assertEquals(null, struct.getFieldValue(1));
        assertEquals("d " + record, Integer.toHexString(record), struct.getFieldValue(2).toString());
        record += 1;
    }
    assertEquals(1000, record);
    reader.close();
    // test with schema evolution and include
    conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, "a,b,d,f");
    conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, "int,struct<c:int,e:string>,string,int");
    conf.set(ColumnProjectionUtils.READ_ALL_COLUMNS, "false");
    conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0,2,3");
    split = new OrcSplit(testFilePath, null, 0, fileLength, new String[0], null, false, true, new ArrayList<AcidInputFormat.DeltaMetaData>(), fileLength, fileLength);
    inputFormat = new OrcInputFormat();
    reader = inputFormat.getReader(split, new AcidInputFormat.Options(conf));
    record = 0;
    id = reader.createKey();
    struct = reader.createValue();
    while (reader.next(id, struct)) {
        assertEquals("id " + record, record, id.getRowId());
        assertEquals("bucket " + record, 0, id.getBucketId());
        assertEquals("trans " + record, 1, id.getTransactionId());
        assertEquals("a " + record, 42 * record, ((IntWritable) struct.getFieldValue(0)).get());
        assertEquals(null, struct.getFieldValue(1));
        assertEquals("d " + record, Integer.toHexString(record), struct.getFieldValue(2).toString());
        assertEquals("f " + record, null, struct.getFieldValue(3));
        record += 1;
    }
    assertEquals(1000, record);
    reader.close();
}
Also used : Path(org.apache.hadoop.fs.Path) ArrayList(java.util.ArrayList) AcidInputFormat(org.apache.hadoop.hive.ql.io.AcidInputFormat) RecordIdentifier(org.apache.hadoop.hive.ql.io.RecordIdentifier) VectorizedRowBatch(org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch) StructColumnVector(org.apache.hadoop.hive.ql.exec.vector.StructColumnVector) BytesColumnVector(org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector) TypeDescription(org.apache.orc.TypeDescription) RecordWriter(org.apache.hadoop.mapred.RecordWriter) LongColumnVector(org.apache.hadoop.hive.ql.exec.vector.LongColumnVector) Test(org.junit.Test)
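
The constant ACID metadata columns above (operation, bucket, currentTransaction) use the isRepeating shortcut: a repeating column stores a single value in slot 0 that logically applies to every row in the batch, so the loop only has to fill the truly varying columns. A minimal standalone sketch of that contract (the class name is illustrative, not Hive code):

import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;

public class RepeatingColumnSketch {
    public static void main(String[] args) {
        LongColumnVector operation = new LongColumnVector(1000);
        operation.isRepeating = true;
        // Slot 0 stands in for all 1000 rows; no per-row writes needed.
        operation.vector[0] = 0;
        // Readers must map any row index back to slot 0 when isRepeating is set.
        int row = 537;
        long value = operation.vector[operation.isRepeating ? 0 : row];
        System.out.println(value); // prints 0
    }
}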

Example 7 with StructColumnVector

use of org.apache.hadoop.hive.ql.exec.vector.StructColumnVector in project hive by apache.

the class RecordReaderImpl method copyStructColumn.

void copyStructColumn(ColumnVector destination, ColumnVector source, int sourceOffset, int length) {
    StructColumnVector castedSource = (StructColumnVector) source;
    StructColumnVector castedDestination = (StructColumnVector) destination;
    castedDestination.isRepeating = castedSource.isRepeating;
    castedDestination.noNulls = castedSource.noNulls;
    if (source.isRepeating) {
        castedDestination.isNull[0] = castedSource.isNull[0];
        for (int c = 0; c < castedSource.fields.length; ++c) {
            copyColumn(castedDestination.fields[c], castedSource.fields[c], 0, 1);
        }
    } else {
        if (!castedSource.noNulls) {
            for (int r = 0; r < length; ++r) {
                castedDestination.isNull[r] = castedSource.isNull[sourceOffset + r];
            }
        } else {
            for (int c = 0; c < castedSource.fields.length; ++c) {
                copyColumn(castedDestination.fields[c], castedSource.fields[c], sourceOffset, length);
            }
        }
    }
}
Also used : StructColumnVector(org.apache.hadoop.hive.ql.exec.vector.StructColumnVector)
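
copyStructColumn only handles the struct shell and delegates each child to copyColumn. As a hedged sketch of the same copy contract applied to a leaf vector (copyLongColumn is a hypothetical helper, not Hive code): a repeating source carries only slot 0, otherwise null flags and values are copied row by row starting at sourceOffset.

import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;

public class CopyLongColumnSketch {
    // Assumes the destination was allocated with at least `length` rows.
    static void copyLongColumn(LongColumnVector dst, LongColumnVector src,
                               int sourceOffset, int length) {
        dst.isRepeating = src.isRepeating;
        dst.noNulls = src.noNulls;
        if (src.isRepeating) {
            // A repeating vector keeps its single value and null flag in slot 0.
            dst.isNull[0] = src.isNull[0];
            dst.vector[0] = src.vector[0];
        } else {
            for (int r = 0; r < length; ++r) {
                dst.isNull[r] = src.isNull[sourceOffset + r];
                dst.vector[r] = src.vector[sourceOffset + r];
            }
        }
    }
}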

Example 8 with StructColumnVector

use of org.apache.hadoop.hive.ql.exec.vector.StructColumnVector in project hive by apache.

the class VectorizedOrcAcidRowBatchReader method next.

@Override
public boolean next(NullWritable key, VectorizedRowBatch value) throws IOException {
    try {
        // required in next()
        if (addPartitionCols) {
            if (partitionValues != null) {
                rbCtx.addPartitionColsToBatch(value, partitionValues);
            }
            addPartitionCols = false;
        }
        if (!baseReader.nextBatch(vectorizedRowBatchBase)) {
            return false;
        }
    } catch (Exception e) {
        throw new IOException("error iterating", e);
    }
    // Once we have read the VectorizedRowBatchBase from the file, there are two kinds of cases
    // for which we might have to discard rows from the batch:
    // Case 1- when the row is created by a transaction that is not valid, or
    // Case 2- when the row has been deleted.
    // We will go through the batch to discover rows which match any of the cases and specifically
    // remove them from the selected vector. Of course, selectedInUse should also be set.
    BitSet selectedBitSet = new BitSet(vectorizedRowBatchBase.size);
    if (vectorizedRowBatchBase.selectedInUse) {
        // When selectedInUse is true, start with every bit set to false and selectively set
        // certain bits to true based on the selected[] vector.
        selectedBitSet.set(0, vectorizedRowBatchBase.size, false);
        for (int j = 0; j < vectorizedRowBatchBase.size; ++j) {
            int i = vectorizedRowBatchBase.selected[j];
            selectedBitSet.set(i);
        }
    } else {
        // When selectedInUse is set to false, everything in the batch is selected.
        selectedBitSet.set(0, vectorizedRowBatchBase.size, true);
    }
    // Case 1- find rows which belong to transactions that are not valid.
    findRecordsWithInvalidTransactionIds(vectorizedRowBatchBase, selectedBitSet);
    // Case 2- find rows which have been deleted.
    this.deleteEventRegistry.findDeletedRecords(vectorizedRowBatchBase, selectedBitSet);
    if (selectedBitSet.cardinality() == vectorizedRowBatchBase.size) {
        // None of the cases above matched and everything is selected. Hence, we will use the
        // same values for the selected and selectedInUse.
        value.size = vectorizedRowBatchBase.size;
        value.selected = vectorizedRowBatchBase.selected;
        value.selectedInUse = vectorizedRowBatchBase.selectedInUse;
    } else {
        value.size = selectedBitSet.cardinality();
        value.selectedInUse = true;
        value.selected = new int[selectedBitSet.cardinality()];
        // This loop fills up the selected[] vector with all the index positions that are selected.
        for (int setBitIndex = selectedBitSet.nextSetBit(0), selectedItr = 0; setBitIndex >= 0; setBitIndex = selectedBitSet.nextSetBit(setBitIndex + 1), ++selectedItr) {
            value.selected[selectedItr] = setBitIndex;
        }
    }
    // Finally, link up the columnVector from the base VectorizedRowBatch to outgoing batch.
    // NOTE: We only link up the user columns and not the ACID metadata columns because this
    // vectorized code path is not being used in cases of update/delete, when the metadata columns
    // would be expected to be passed up the operator pipeline. This is because
    // currently the update/delete specifically disable vectorized code paths.
    // This happens at ql/exec/Utilities.java::3293 when it checks for mapWork.getVectorMode()
    StructColumnVector payloadStruct = (StructColumnVector) vectorizedRowBatchBase.cols[OrcRecordUpdater.ROW];
    // Transfer columnVector objects from base batch to outgoing batch.
    System.arraycopy(payloadStruct.fields, 0, value.cols, 0, value.getDataColumnCount());
    progress = baseReader.getProgress();
    return true;
}
Also used : StructColumnVector(org.apache.hadoop.hive.ql.exec.vector.StructColumnVector) BitSet(java.util.BitSet) IOException(java.io.IOException)
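
The loop in the else branch is the heart of the filtering step: every bit that survived the transaction-validity and delete checks becomes an entry in selected[]. The same conversion in isolation (toSelected is a hypothetical name, not a Hive API):

import java.util.BitSet;

public class SelectedVectorSketch {
    // Turns the surviving bit indexes into a VectorizedRowBatch-style
    // selected[] array, in ascending row order.
    static int[] toSelected(BitSet bits) {
        int[] selected = new int[bits.cardinality()];
        int out = 0;
        for (int i = bits.nextSetBit(0); i >= 0; i = bits.nextSetBit(i + 1)) {
            selected[out++] = i;
        }
        return selected;
    }
}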

Example 9 with StructColumnVector

use of org.apache.hadoop.hive.ql.exec.vector.StructColumnVector in project hive by apache.

the class VectorizedColumnReaderTestBase method structRead.

protected void structRead(boolean isDictionaryEncoding) throws Exception {
    Configuration conf = new Configuration();
    conf.set(IOConstants.COLUMNS, "struct_field");
    conf.set(IOConstants.COLUMNS_TYPES, "struct<a:int,b:double>");
    conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
    conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
    String schema = "message hive_schema {\n" + "group struct_field {\n" + "  optional int32 a;\n" + "  optional double b;\n" + "}\n" + "}\n";
    VectorizedParquetRecordReader reader = createParquetReader(schema, conf);
    VectorizedRowBatch previous = reader.createValue();
    int c = 0;
    try {
        while (reader.next(NullWritable.get(), previous)) {
            StructColumnVector vector = (StructColumnVector) previous.cols[0];
            LongColumnVector cv = (LongColumnVector) vector.fields[0];
            DoubleColumnVector dv = (DoubleColumnVector) vector.fields[1];
            for (int i = 0; i < cv.vector.length; i++) {
                if (c == nElements) {
                    break;
                }
                assertEquals(getIntValue(isDictionaryEncoding, c), cv.vector[i]);
                assertEquals(getDoubleValue(isDictionaryEncoding, c), dv.vector[i], 0);
                assertFalse(vector.isNull[i]);
                assertFalse(vector.isRepeating);
                c++;
            }
        }
        assertEquals("It doesn't exit at expected position", nElements, c);
    } finally {
        reader.close();
    }
}
Also used : VectorizedRowBatch(org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch) DoubleColumnVector(org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector) Configuration(org.apache.hadoop.conf.Configuration) VectorizedParquetRecordReader(org.apache.hadoop.hive.ql.io.parquet.vector.VectorizedParquetRecordReader) StructColumnVector(org.apache.hadoop.hive.ql.exec.vector.StructColumnVector) LongColumnVector(org.apache.hadoop.hive.ql.exec.vector.LongColumnVector)
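
The message schema string has to be brace-balanced before it reaches the reader. A hedged sketch using Parquet's own parser to sanity-check such a string up front (SchemaParseSketch is illustrative; MessageTypeParser is the real parquet-column API):

import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class SchemaParseSketch {
    public static void main(String[] args) {
        // An unbalanced brace would throw here, before any reader is built.
        MessageType type = MessageTypeParser.parseMessageType(
            "message hive_schema {\n"
          + "  group struct_field {\n"
          + "    optional int32 a;\n"
          + "    optional double b;\n"
          + "  }\n"
          + "}\n");
        System.out.println(type); // prints the normalized schema
    }
}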

Example 10 with StructColumnVector

use of org.apache.hadoop.hive.ql.exec.vector.StructColumnVector in project hive by apache.

the class VectorizedColumnReaderTestBase method structReadSomeNull.

protected void structReadSomeNull(boolean isDictionaryEncoding) throws Exception {
    Configuration conf = new Configuration();
    conf.set(IOConstants.COLUMNS, "struct_field_some_null");
    conf.set(IOConstants.COLUMNS_TYPES, "struct<f:int,g:double>");
    conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
    conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
    String schema = "message hive_schema {\n" + "group struct_field_some_null {\n" + "  optional int32 f;\n" + "  optional double g;\n" + "}\n";
    VectorizedParquetRecordReader reader = createParquetReader(schema, conf);
    VectorizedRowBatch previous = reader.createValue();
    int c = 0;
    try {
        while (reader.next(NullWritable.get(), previous)) {
            StructColumnVector sv = (StructColumnVector) previous.cols[0];
            LongColumnVector fv = (LongColumnVector) sv.fields[0];
            DoubleColumnVector gv = (DoubleColumnVector) sv.fields[1];
            for (int i = 0; i < fv.vector.length; i++) {
                if (c == nElements) {
                    break;
                }
                assertEquals(c % 2 == 0, fv.isNull[i]);
                assertEquals(c % 3 == 0, gv.isNull[i]);
                assertEquals(c % /* 2*3 = */ 6 == 0, sv.isNull[i]);
                if (!sv.isNull[i]) {
                    if (!fv.isNull[i]) {
                        assertEquals(getIntValue(isDictionaryEncoding, c), fv.vector[i]);
                    }
                    if (!gv.isNull[i]) {
                        assertEquals(getDoubleValue(isDictionaryEncoding, c), gv.vector[i], 0);
                    }
                }
                assertFalse(fv.isRepeating);
                c++;
            }
        }
        assertEquals("It doesn't exit at expected position", nElements, c);
    } finally {
        reader.close();
    }
}
Also used : VectorizedRowBatch(org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch) DoubleColumnVector(org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector) Configuration(org.apache.hadoop.conf.Configuration) VectorizedParquetRecordReader(org.apache.hadoop.hive.ql.io.parquet.vector.VectorizedParquetRecordReader) StructColumnVector(org.apache.hadoop.hive.ql.exec.vector.StructColumnVector) LongColumnVector(org.apache.hadoop.hive.ql.exec.vector.LongColumnVector)
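
The assertions above exercise the nesting rule for null handling: a child value is only meaningful when neither the struct's own isNull flag nor the child's is set, and isRepeating can redirect either level back to slot 0. A minimal read-helper sketch (readChildLong is a hypothetical name, not a Hive API):

import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;

public class StructReadSketch {
    static Long readChildLong(StructColumnVector struct, int childIndex, int r) {
        int row = struct.isRepeating ? 0 : r;
        if (!struct.noNulls && struct.isNull[row]) {
            return null; // the whole struct is null at this row
        }
        LongColumnVector child = (LongColumnVector) struct.fields[childIndex];
        int childRow = child.isRepeating ? 0 : r;
        if (!child.noNulls && child.isNull[childRow]) {
            return null; // the child field is null at this row
        }
        return child.vector[childRow];
    }
}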

Aggregations

StructColumnVector (org.apache.hadoop.hive.ql.exec.vector.StructColumnVector) 11
LongColumnVector (org.apache.hadoop.hive.ql.exec.vector.LongColumnVector) 6
VectorizedRowBatch (org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch) 6
Configuration (org.apache.hadoop.conf.Configuration) 4
VectorizedParquetRecordReader (org.apache.hadoop.hive.ql.io.parquet.vector.VectorizedParquetRecordReader) 4
DoubleColumnVector (org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector) 3
TypeDescription (org.apache.orc.TypeDescription) 3
BytesColumnVector (org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector) 2
StructTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo) 2
RecordWriter (org.apache.hadoop.mapred.RecordWriter) 2
Test (org.junit.Test) 2
IOException (java.io.IOException) 1
ArrayList (java.util.ArrayList) 1
BitSet (java.util.BitSet) 1
Path (org.apache.hadoop.fs.Path) 1
ColumnVector (org.apache.hadoop.hive.ql.exec.vector.ColumnVector) 1
AcidInputFormat (org.apache.hadoop.hive.ql.io.AcidInputFormat) 1
RecordIdentifier (org.apache.hadoop.hive.ql.io.RecordIdentifier) 1
CharTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo) 1
ListTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo) 1