
Example 1 with StructColumnVector

use of org.apache.hadoop.hive.ql.exec.vector.StructColumnVector in project hive by apache.

the class VectorizedColumnReaderTestBase method nestedStructRead0.

protected void nestedStructRead0(boolean isDictionaryEncoding) throws Exception {
    Configuration conf = new Configuration();
    conf.set(IOConstants.COLUMNS, "nested_struct_field");
    conf.set(IOConstants.COLUMNS_TYPES, "struct<nsf:struct<c:int,d:int>,e:double>");
    conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
    conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
    String schema = "message hive_schema {\n" + "group nested_struct_field {\n" + "  optional group nsf {\n" + "    optional int32 c;\n" + "    optional int32 d;\n" + "  }" + "optional double e;\n" + "}\n";
    VectorizedParquetRecordReader reader = createParquetReader(schema, conf);
    VectorizedRowBatch previous = reader.createValue();
    int c = 0;
    try {
        while (reader.next(NullWritable.get(), previous)) {
            StructColumnVector vector = (StructColumnVector) previous.cols[0];
            StructColumnVector sv = (StructColumnVector) vector.fields[0];
            LongColumnVector cv = (LongColumnVector) sv.fields[0];
            LongColumnVector dv = (LongColumnVector) sv.fields[1];
            DoubleColumnVector ev = (DoubleColumnVector) vector.fields[1];
            for (int i = 0; i < cv.vector.length; i++) {
                if (c == nElements) {
                    break;
                }
                assertEquals(getIntValue(isDictionaryEncoding, c), cv.vector[i]);
                assertEquals(getIntValue(isDictionaryEncoding, c), dv.vector[i]);
                assertEquals(getDoubleValue(isDictionaryEncoding, c), ev.vector[i], 0);
                assertFalse(vector.isNull[i]);
                assertFalse(vector.isRepeating);
                c++;
            }
        }
        assertEquals("It doesn't exit at expected position", nElements, c);
    } finally {
        reader.close();
    }
}
Also used: VectorizedRowBatch (org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch), DoubleColumnVector (org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector), Configuration (org.apache.hadoop.conf.Configuration), VectorizedParquetRecordReader (org.apache.hadoop.hive.ql.io.parquet.vector.VectorizedParquetRecordReader), StructColumnVector (org.apache.hadoop.hive.ql.exec.vector.StructColumnVector), LongColumnVector (org.apache.hadoop.hive.ql.exec.vector.LongColumnVector)
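For orientation, here is a minimal, self-contained sketch of the vector layout those casts rely on (illustrative only, not part of the Hive test; the class name NestedStructVectorSketch is made up): the Hive type struct<nsf:struct<c:int,d:int>,e:double> maps to a StructColumnVector whose fields array holds one child vector per member, with int members carried in LongColumnVector and the double in DoubleColumnVector.

import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;

public class NestedStructVectorSketch {
    public static void main(String[] args) {
        int n = VectorizedRowBatch.DEFAULT_SIZE;
        // Inner struct nsf:struct<c:int,d:int>; Hive stores ints in LongColumnVector.
        StructColumnVector nsf =
            new StructColumnVector(n, new LongColumnVector(n), new LongColumnVector(n));
        // Outer struct<nsf:..., e:double>.
        StructColumnVector outer =
            new StructColumnVector(n, nsf, new DoubleColumnVector(n));
        // The same casts as in nestedStructRead0 above.
        LongColumnVector cv = (LongColumnVector) ((StructColumnVector) outer.fields[0]).fields[0];
        DoubleColumnVector ev = (DoubleColumnVector) outer.fields[1];
        cv.vector[0] = 42;
        ev.vector[0] = 42.0;
        System.out.println(cv.vector[0] + " / " + ev.vector[0]);
    }
}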

Example 2 with StructColumnVector

use of org.apache.hadoop.hive.ql.exec.vector.StructColumnVector in project hive by apache.

the class BatchToRowReader method nextStruct.

public StructType nextStruct(ColumnVector vector, int row, StructTypeInfo schema, Object previous) {
    if (vector.isRepeating) {
        row = 0;
    }
    if (vector.noNulls || !vector.isNull[row]) {
        List<TypeInfo> childrenTypes = schema.getAllStructFieldTypeInfos();
        StructType result = createStructObject(previous, childrenTypes);
        StructColumnVector struct = (StructColumnVector) vector;
        for (int f = 0; f < childrenTypes.size(); ++f) {
            setStructCol(result, f, nextValue(struct.fields[f], row, childrenTypes.get(f), getStructCol(result, f)));
        }
        return result;
    } else {
        return null;
    }
}
Also used: StructColumnVector (org.apache.hadoop.hive.ql.exec.vector.StructColumnVector), MapTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo), ListTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo), StructTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo), PrimitiveTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo), TypeInfo (org.apache.hadoop.hive.serde2.typeinfo.TypeInfo), UnionTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo), VarcharTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo), CharTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo)
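nextStruct is one arm of a recursive walk: nextValue dispatches on each child's type category and recurses back into nextStruct for nested structs. A hypothetical simplification of that dispatch (nextValueSketch is invented for illustration; the real nextValue in BatchToRowReader covers every Hive category):

import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;

Object nextValueSketch(ColumnVector vector, int row, TypeInfo type, Object previous) {
    switch (type.getCategory()) {
        case STRUCT:
            // recurse into the method shown above
            return nextStruct(vector, row, (StructTypeInfo) type, previous);
        case PRIMITIVE:
            // repeating vectors keep their single value at index 0
            int r = vector.isRepeating ? 0 : row;
            if (!vector.noNulls && vector.isNull[r]) {
                return null;
            }
            // assume a long-backed primitive for this sketch
            return ((LongColumnVector) vector).vector[r];
        default:
            throw new UnsupportedOperationException("sketch covers struct and long only");
    }
}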

Example 3 with StructColumnVector

use of org.apache.hadoop.hive.ql.exec.vector.StructColumnVector in project hive by apache.

the class VectorizedStructColumnReader method readBatch.

@Override
public void readBatch(int total, ColumnVector column, TypeInfo columnType) throws IOException {
    StructColumnVector structColumnVector = (StructColumnVector) column;
    StructTypeInfo structTypeInfo = (StructTypeInfo) columnType;
    ColumnVector[] vectors = structColumnVector.fields;
    for (int i = 0; i < vectors.length; i++) {
        fieldReaders.get(i).readBatch(total, vectors[i], structTypeInfo.getAllStructFieldTypeInfos().get(i));
        structColumnVector.isRepeating = structColumnVector.isRepeating && vectors[i].isRepeating;
        for (int j = 0; j < vectors[i].isNull.length; j++) {
            structColumnVector.isNull[j] = (i == 0) ? vectors[i].isNull[j] : structColumnVector.isNull[j] && vectors[i].isNull[j];
        }
        structColumnVector.noNulls = (i == 0) ? vectors[i].noNulls : structColumnVector.noNulls && vectors[i].noNulls;
    }
}
Also used: StructColumnVector (org.apache.hadoop.hive.ql.exec.vector.StructColumnVector), StructTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo), ColumnVector (org.apache.hadoop.hive.ql.exec.vector.ColumnVector)
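The fold above has a subtle shape: the struct is marked repeating only while every child repeats, and a row's isNull is seeded from the first child and then ANDed with each later child, so a struct row reads as null only when all of its children are null there. A standalone check of that null semantics (assumed setup, not Hive test code):

import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;

public class StructFlagFoldSketch {
    // The same fold as readBatch above, extracted so it can run on its own.
    static void fold(StructColumnVector struct) {
        ColumnVector[] kids = struct.fields;
        for (int i = 0; i < kids.length; i++) {
            struct.isRepeating = struct.isRepeating && kids[i].isRepeating;
            for (int j = 0; j < kids[i].isNull.length; j++) {
                struct.isNull[j] = (i == 0) ? kids[i].isNull[j] : struct.isNull[j] && kids[i].isNull[j];
            }
            struct.noNulls = (i == 0) ? kids[i].noNulls : struct.noNulls && kids[i].noNulls;
        }
    }

    public static void main(String[] args) {
        LongColumnVector a = new LongColumnVector(2);
        LongColumnVector b = new LongColumnVector(2);
        // a is null at row 0 only; b is null at rows 0 and 1
        a.noNulls = false;
        a.isNull[0] = true;
        b.noNulls = false;
        b.isNull[0] = true;
        b.isNull[1] = true;
        StructColumnVector s = new StructColumnVector(2, a, b);
        fold(s);
        // prints "true false": the struct is null only where both children are
        System.out.println(s.isNull[0] + " " + s.isNull[1]);
    }
}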

Example 4 with StructColumnVector

use of org.apache.hadoop.hive.ql.exec.vector.StructColumnVector in project hive by apache.

the class RecordReaderImpl method nextStruct.

static OrcStruct nextStruct(ColumnVector vector, int row, TypeDescription schema, Object previous) {
    if (vector.isRepeating) {
        row = 0;
    }
    if (vector.noNulls || !vector.isNull[row]) {
        OrcStruct result;
        List<TypeDescription> childrenTypes = schema.getChildren();
        int numChildren = childrenTypes.size();
        if (previous == null || previous.getClass() != OrcStruct.class) {
            result = new OrcStruct(numChildren);
        } else {
            result = (OrcStruct) previous;
            result.setNumFields(numChildren);
        }
        StructColumnVector struct = (StructColumnVector) vector;
        for (int f = 0; f < numChildren; ++f) {
            result.setFieldValue(f, nextValue(struct.fields[f], row, childrenTypes.get(f), result.getFieldValue(f)));
        }
        return result;
    } else {
        return null;
    }
}
Also used: StructColumnVector (org.apache.hadoop.hive.ql.exec.vector.StructColumnVector), TypeDescription (org.apache.orc.TypeDescription)
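Here TypeDescription plays the role StructTypeInfo plays in Example 2: nextStruct pairs struct.fields[f] with schema.getChildren().get(f). A small standalone sketch of that schema side, using the public org.apache.orc API (the class name OrcSchemaChildrenSketch is made up):

import java.util.List;
import org.apache.orc.TypeDescription;

public class OrcSchemaChildrenSketch {
    public static void main(String[] args) {
        TypeDescription schema =
            TypeDescription.fromString("struct<a:int,b:struct<c:int>,d:string>");
        // getChildren() and getFieldNames() are parallel lists for structs,
        // which is what lets nextStruct index children by position f.
        List<TypeDescription> children = schema.getChildren();
        List<String> names = schema.getFieldNames();
        for (int f = 0; f < children.size(); f++) {
            System.out.println(names.get(f) + " : " + children.get(f));
        }
    }
}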

Example 5 with StructColumnVector

use of org.apache.hadoop.hive.ql.exec.vector.StructColumnVector in project hive by apache.

the class TestInputOutputFormat method testSchemaEvolution.

/**
   * Test schema evolution when using the reader directly.
   */
@Test
public void testSchemaEvolution() throws Exception {
    TypeDescription fileSchema = TypeDescription.fromString("struct<a:int,b:struct<c:int>,d:string>");
    Writer writer = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf).fileSystem(fs).setSchema(fileSchema).compress(org.apache.orc.CompressionKind.NONE));
    VectorizedRowBatch batch = fileSchema.createRowBatch(1000);
    batch.size = 1000;
    LongColumnVector lcv = ((LongColumnVector) ((StructColumnVector) batch.cols[1]).fields[0]);
    for (int r = 0; r < 1000; r++) {
        ((LongColumnVector) batch.cols[0]).vector[r] = r * 42;
        lcv.vector[r] = r * 10001;
        ((BytesColumnVector) batch.cols[2]).setVal(r, Integer.toHexString(r).getBytes(StandardCharsets.UTF_8));
    }
    writer.addRowBatch(batch);
    writer.close();
    TypeDescription readerSchema = TypeDescription.fromString("struct<a:int,b:struct<c:int,future1:int>,d:string,future2:int>");
    Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
    RecordReader rows = reader.rowsOptions(new Reader.Options().schema(readerSchema));
    batch = readerSchema.createRowBatch();
    lcv = ((LongColumnVector) ((StructColumnVector) batch.cols[1]).fields[0]);
    LongColumnVector future1 = ((LongColumnVector) ((StructColumnVector) batch.cols[1]).fields[1]);
    assertEquals(true, rows.nextBatch(batch));
    assertEquals(1000, batch.size);
    assertEquals(true, future1.isRepeating);
    assertEquals(true, future1.isNull[0]);
    assertEquals(true, batch.cols[3].isRepeating);
    assertEquals(true, batch.cols[3].isNull[0]);
    for (int r = 0; r < batch.size; ++r) {
        assertEquals("row " + r, r * 42, ((LongColumnVector) batch.cols[0]).vector[r]);
        assertEquals("row " + r, r * 10001, lcv.vector[r]);
        assertEquals("row " + r, r * 10001, lcv.vector[r]);
        assertEquals("row " + r, Integer.toHexString(r), ((BytesColumnVector) batch.cols[2]).toString(r));
    }
    assertEquals(false, rows.nextBatch(batch));
    rows.close();
    // try it again with an include vector
    rows = reader.rowsOptions(new Reader.Options().schema(readerSchema).include(new boolean[] { false, true, true, true, false, false, true }));
    batch = readerSchema.createRowBatch();
    lcv = ((LongColumnVector) ((StructColumnVector) batch.cols[1]).fields[0]);
    future1 = ((LongColumnVector) ((StructColumnVector) batch.cols[1]).fields[1]);
    assertEquals(true, rows.nextBatch(batch));
    assertEquals(1000, batch.size);
    assertEquals(true, future1.isRepeating);
    assertEquals(true, future1.isNull[0]);
    assertEquals(true, batch.cols[3].isRepeating);
    assertEquals(true, batch.cols[3].isNull[0]);
    assertEquals(true, batch.cols[2].isRepeating);
    assertEquals(true, batch.cols[2].isNull[0]);
    for (int r = 0; r < batch.size; ++r) {
        assertEquals("row " + r, r * 42, ((LongColumnVector) batch.cols[0]).vector[r]);
        assertEquals("row " + r, r * 10001, lcv.vector[r]);
    }
    assertEquals(false, rows.nextBatch(batch));
    rows.close();
}
Also used: VectorizedRowBatch (org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch), StructColumnVector (org.apache.hadoop.hive.ql.exec.vector.StructColumnVector), BytesColumnVector (org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector), TypeDescription (org.apache.orc.TypeDescription), RecordWriter (org.apache.hadoop.mapred.RecordWriter), LongColumnVector (org.apache.hadoop.hive.ql.exec.vector.LongColumnVector), Test (org.junit.Test)
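The 7-slot include array above is indexed by ORC column id, which TypeDescription assigns in a preorder walk of the schema: 0 is the root struct, 1 is a, 2 is b, 3 is b.c, 4 is b.future1, 5 is d, 6 is future2, so { false, true, true, true, false, false, true } selects a, b, b.c, and future2. A sketch that prints this mapping (walk is a made-up helper; it assumes all parents are structs, which holds for this schema):

import org.apache.orc.TypeDescription;

public class IncludeVectorSketch {
    static void walk(TypeDescription type, String name) {
        System.out.println(type.getId() + " -> " + name);
        if (type.getChildren() != null) {
            for (int i = 0; i < type.getChildren().size(); i++) {
                // getFieldNames() parallels getChildren() for struct types
                walk(type.getChildren().get(i), name + "." + type.getFieldNames().get(i));
            }
        }
    }

    public static void main(String[] args) {
        walk(TypeDescription.fromString(
            "struct<a:int,b:struct<c:int,future1:int>,d:string,future2:int>"), "root");
    }
}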

Aggregations

StructColumnVector (org.apache.hadoop.hive.ql.exec.vector.StructColumnVector) 11
LongColumnVector (org.apache.hadoop.hive.ql.exec.vector.LongColumnVector) 6
VectorizedRowBatch (org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch) 6
Configuration (org.apache.hadoop.conf.Configuration) 4
VectorizedParquetRecordReader (org.apache.hadoop.hive.ql.io.parquet.vector.VectorizedParquetRecordReader) 4
DoubleColumnVector (org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector) 3
TypeDescription (org.apache.orc.TypeDescription) 3
BytesColumnVector (org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector) 2
StructTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo) 2
RecordWriter (org.apache.hadoop.mapred.RecordWriter) 2
Test (org.junit.Test) 2
IOException (java.io.IOException) 1
ArrayList (java.util.ArrayList) 1
BitSet (java.util.BitSet) 1
Path (org.apache.hadoop.fs.Path) 1
ColumnVector (org.apache.hadoop.hive.ql.exec.vector.ColumnVector) 1
AcidInputFormat (org.apache.hadoop.hive.ql.io.AcidInputFormat) 1
RecordIdentifier (org.apache.hadoop.hive.ql.io.RecordIdentifier) 1
CharTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo) 1
ListTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo) 1