Example 21 with TypeDescription

Use of org.apache.orc.TypeDescription in project hive by apache.

From the class OrcInputFormat, method createOptionsForReader.

static Reader.Options createOptionsForReader(Configuration conf) {
    // Do we have schema on read in the configuration variables?
    TypeDescription schema = OrcInputFormat.getDesiredRowTypeDescr(conf, true, Integer.MAX_VALUE);
    Reader.Options readerOptions = new Reader.Options().schema(schema);
    // TODO: Convert genIncludedColumns and setSearchArgument to use TypeDescription.
    final List<OrcProto.Type> schemaTypes = OrcUtils.getOrcTypes(schema);
    readerOptions.include(OrcInputFormat.genIncludedColumns(schema, conf));
    // TODO: the last param is bogus. Why is this hardcoded?
    OrcInputFormat.setSearchArgument(readerOptions, schemaTypes, conf, true);
    return readerOptions;
}
Also used: TypeDescription(org.apache.orc.TypeDescription), StatsProvidingRecordReader(org.apache.hadoop.hive.ql.io.StatsProvidingRecordReader), BatchToRowReader(org.apache.hadoop.hive.ql.io.BatchToRowReader)
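
For orientation, here is a minimal usage sketch showing how options built this way feed an ORC row reader. It is an illustration under stated assumptions rather than Hive code: the path argument is hypothetical, createOptionsForReader is package-private so a real caller would sit in the same package, and the desired schema is assumed non-null (i.e., the schema evolution properties are set). The rowsOptions and nextBatch calls mirror the reader API used in Example 25 below.

// Illustrative driver (hypothetical); assumes it lives in the same package as OrcInputFormat.
static void readWithOptions(Configuration conf, FileSystem fs, Path path) throws IOException {
    Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf).filesystem(fs));
    Reader.Options options = OrcInputFormat.createOptionsForReader(conf);
    // Assumes schema evolution is configured, so the desired schema is non-null.
    TypeDescription schema = OrcInputFormat.getDesiredRowTypeDescr(conf, false, Integer.MAX_VALUE);
    RecordReader rows = reader.rowsOptions(options);
    VectorizedRowBatch batch = schema.createRowBatch();
    while (rows.nextBatch(batch)) {
        // consume batch.size rows here
    }
    rows.close();
}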

Example 22 with TypeDescription

Use of org.apache.orc.TypeDescription in project hive by apache.

From the class OrcInputFormat, method getDesiredRowTypeDescr.

/**
 * Generate the desired schema for reading the file.
 * @param conf the configuration
 * @param isAcidRead is this an acid format?
 * @param dataColumns the desired number of data columns for vectorized read
 * @return the desired schema or null if schema evolution isn't enabled
 * @throws IllegalArgumentException if this is an acid read but the schema evolution properties are not set
 */
public static TypeDescription getDesiredRowTypeDescr(Configuration conf, boolean isAcidRead, int dataColumns) {
    String columnNameProperty = null;
    String columnTypeProperty = null;
    ArrayList<String> schemaEvolutionColumnNames = null;
    ArrayList<TypeDescription> schemaEvolutionTypeDescrs = null;
    boolean haveSchemaEvolutionProperties = false;
    if (isAcidRead || HiveConf.getBoolVar(conf, ConfVars.HIVE_SCHEMA_EVOLUTION)) {
        columnNameProperty = conf.get(IOConstants.SCHEMA_EVOLUTION_COLUMNS);
        columnTypeProperty = conf.get(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES);
        haveSchemaEvolutionProperties = (columnNameProperty != null && columnTypeProperty != null);
        if (haveSchemaEvolutionProperties) {
            schemaEvolutionColumnNames = Lists.newArrayList(columnNameProperty.split(","));
            if (schemaEvolutionColumnNames.size() == 0) {
                haveSchemaEvolutionProperties = false;
            } else {
                schemaEvolutionTypeDescrs = typeDescriptionsFromHiveTypeProperty(columnTypeProperty, dataColumns);
                if (schemaEvolutionTypeDescrs.size() != Math.min(dataColumns, schemaEvolutionColumnNames.size())) {
                    haveSchemaEvolutionProperties = false;
                }
            }
        } else if (isAcidRead) {
            throw new IllegalArgumentException(ErrorMsg.SCHEMA_REQUIRED_TO_READ_ACID_TABLES.getErrorCodedMsg());
        }
    }
    if (haveSchemaEvolutionProperties) {
        if (LOG.isInfoEnabled()) {
            LOG.info("Using schema evolution configuration variables schema.evolution.columns " + schemaEvolutionColumnNames.toString() + " / schema.evolution.columns.types " + schemaEvolutionTypeDescrs.toString() + " (isAcidRead " + isAcidRead + ")");
        }
    } else {
        // Try the regular properties.
        columnNameProperty = conf.get(serdeConstants.LIST_COLUMNS);
        columnTypeProperty = conf.get(serdeConstants.LIST_COLUMN_TYPES);
        if (columnTypeProperty == null || columnNameProperty == null) {
            return null;
        }
        schemaEvolutionColumnNames = Lists.newArrayList(columnNameProperty.split(","));
        if (schemaEvolutionColumnNames.size() == 0) {
            return null;
        }
        schemaEvolutionTypeDescrs = typeDescriptionsFromHiveTypeProperty(columnTypeProperty, dataColumns);
        if (schemaEvolutionTypeDescrs.size() != Math.min(dataColumns, schemaEvolutionColumnNames.size())) {
            return null;
        }
        // Find the first virtual column and clip it, and everything after it, off.
        int virtualColumnClipNum = -1;
        int columnNum = 0;
        for (String columnName : schemaEvolutionColumnNames) {
            if (VirtualColumn.VIRTUAL_COLUMN_NAMES.contains(columnName)) {
                virtualColumnClipNum = columnNum;
                break;
            }
            columnNum++;
        }
        if (virtualColumnClipNum != -1 && virtualColumnClipNum < dataColumns) {
            schemaEvolutionColumnNames = Lists.newArrayList(schemaEvolutionColumnNames.subList(0, virtualColumnClipNum));
            schemaEvolutionTypeDescrs = Lists.newArrayList(schemaEvolutionTypeDescrs.subList(0, virtualColumnClipNum));
        }
        if (LOG.isInfoEnabled()) {
            LOG.info("Using column configuration variables columns " + schemaEvolutionColumnNames.toString() + " / columns.types " + schemaEvolutionTypeDescrs.toString() + " (isAcidRead " + isAcidRead + ")");
        }
    }
    // Desired schema does not include virtual columns or partition columns.
    TypeDescription result = TypeDescription.createStruct();
    for (int i = 0; i < schemaEvolutionTypeDescrs.size(); i++) {
        result.addField(schemaEvolutionColumnNames.get(i), schemaEvolutionTypeDescrs.get(i));
    }
    return result;
}
Also used: TypeDescription(org.apache.orc.TypeDescription)
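
To make the return value concrete, here is a hedged sketch that hand-builds the desired schema the method would produce for columns a,b,d with types int,struct<c:int>,string (the same shapes used in the tests below), using only the public TypeDescription builder API:

// Hand-built equivalent of the desired schema for "a,b,d" / "int,struct<c:int>,string".
TypeDescription result = TypeDescription.createStruct()
    .addField("a", TypeDescription.createInt())
    .addField("b", TypeDescription.createStruct()
        .addField("c", TypeDescription.createInt()))
    .addField("d", TypeDescription.createString());
// The string form produces the same schema:
TypeDescription same = TypeDescription.fromString("struct<a:int,b:struct<c:int>,d:string>");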

Example 23 with TypeDescription

Use of org.apache.orc.TypeDescription in project hive by apache.

From the class OrcEncodedDataConsumer, method decodeBatch.

@Override
protected void decodeBatch(OrcEncodedColumnBatch batch, Consumer<ColumnVectorBatch> downstreamConsumer) throws InterruptedException {
    long startTime = counters.startTimeCounter();
    int currentStripeIndex = batch.getBatchKey().stripeIx;
    boolean sameStripe = currentStripeIndex == previousStripeIndex;
    try {
        ConsumerStripeMetadata stripeMetadata = stripes.get(currentStripeIndex);
        // Get the non-null row count from the root column to determine the max number of vector batches.
        int rgIdx = batch.getBatchKey().rgIx;
        long nonNullRowCount = -1;
        if (rgIdx == OrcEncodedColumnBatch.ALL_RGS) {
            nonNullRowCount = stripeMetadata.getRowCount();
        } else {
            OrcProto.RowIndexEntry rowIndex = stripeMetadata.getRowIndexEntry(0, rgIdx);
            nonNullRowCount = getRowCount(rowIndex);
        }
        int maxBatchesRG = (int) ((nonNullRowCount / VectorizedRowBatch.DEFAULT_SIZE) + 1);
        int batchSize = VectorizedRowBatch.DEFAULT_SIZE;
        TypeDescription fileSchema = fileMetadata.getSchema();
        if (columnReaders == null || !sameStripe) {
            createColumnReaders(batch, stripeMetadata, fileSchema);
        } else {
            repositionInStreams(this.columnReaders, batch, sameStripe, stripeMetadata);
        }
        previousStripeIndex = currentStripeIndex;
        for (int i = 0; i < maxBatchesRG; i++) {
            // for last batch in row group, adjust the batch size
            if (i == maxBatchesRG - 1) {
                batchSize = (int) (nonNullRowCount % VectorizedRowBatch.DEFAULT_SIZE);
                if (batchSize == 0)
                    break;
            }
            ColumnVectorBatch cvb = cvbPool.take();
            // assert cvb.cols.length == batch.getColumnIxs().length; // Must be constant per split.
            cvb.size = batchSize;
            for (int idx = 0; idx < columnReaders.length; ++idx) {
                TreeReader reader = columnReaders[idx];
                if (cvb.cols[idx] == null) {
                    // ORC stores rows inside a root struct (Hive writes them this way).
                    // When we populate column vectors, we skip over the root struct.
                    cvb.cols[idx] = createColumn(batchSchemas[idx], VectorizedRowBatch.DEFAULT_SIZE);
                }
                trace.logTreeReaderNextVector(idx);
                /*
                 * Currently, ORC's TreeReaderFactory class does this:
                 *
                 *     public void nextBatch(VectorizedRowBatch batch,
                 *              int batchSize) throws IOException {
                 *       batch.cols[0].reset();
                 *       batch.cols[0].ensureSize(batchSize, false);
                 *       nextVector(batch.cols[0], null, batchSize);
                 *     }
                 *
                 * CONCERN:
                 *     For better performance, we'd like to *not* do a ColumnVector.reset(),
                 *     which zeroes out isNull.  Why?  Because there are common cases where
                 *     ORC will *immediately* copy its null flags into the isNull array, which
                 *     makes the reset a waste.
                 *
                 *     For correctness, though, we must do it for now.
                 *
                 *     The best solution is for ORC to manage the noNulls flag and isNull array
                 *     itself, because it knows what NULLs the next set of rows contains.
                 *
                 *     Its management of the ColumnVector fields is a little different from what
                 *     we must do for vector expressions.  For those, we must maintain the
                 *     invariant that if noNulls is true there are no NULLs in any part of the
                 *     isNull array, because the next vector expression relies on that invariant.
                 *
                 *     Given that ORC (or any other producer) is providing *read-only* batches to
                 *     the consumer, what matters is that the isNull array through batch.size has
                 *     integrity with the noNulls flag.  So, if ORC is giving us 100 rows (for
                 *     example) and none of them are NULL, it can safely make sure the first 100
                 *     isNull entries are false and set noNulls to true.  Any other true entries
                 *     in isNull are irrelevant because ORC owns the batch; it just needs to make
                 *     sure it doesn't get confused by them.
                 */
                ColumnVector cv = cvb.cols[idx];
                cv.reset();
                cv.ensureSize(batchSize, false);
                reader.nextVector(cv, null, batchSize);
            }
            // we are done reading a batch, send it to consumer for processing
            downstreamConsumer.consumeData(cvb);
            counters.incrCounter(LlapIOCounters.ROWS_EMITTED, batchSize);
        }
        LlapIoImpl.ORC_LOGGER.debug("Done with decode");
        counters.incrTimeCounter(LlapIOCounters.DECODE_TIME_NS, startTime);
        counters.incrCounter(LlapIOCounters.NUM_VECTOR_BATCHES, maxBatchesRG);
        counters.incrCounter(LlapIOCounters.NUM_DECODED_BATCHES);
    } catch (IOException e) {
        // Caller will return the batch.
        downstreamConsumer.setError(e);
    }
}
Also used: ConsumerStripeMetadata(org.apache.hadoop.hive.llap.io.metadata.ConsumerStripeMetadata), OrcProto(org.apache.orc.OrcProto), TypeDescription(org.apache.orc.TypeDescription), TreeReader(org.apache.orc.impl.TreeReaderFactory.TreeReader), SettableTreeReader(org.apache.hadoop.hive.ql.io.orc.encoded.EncodedTreeReaderFactory.SettableTreeReader), StructTreeReader(org.apache.orc.impl.TreeReaderFactory.StructTreeReader), ColumnVectorBatch(org.apache.hadoop.hive.llap.io.api.impl.ColumnVectorBatch), IOException(java.io.IOException), DecimalColumnVector(org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector), BytesColumnVector(org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector), LongColumnVector(org.apache.hadoop.hive.ql.exec.vector.LongColumnVector), ColumnVector(org.apache.hadoop.hive.ql.exec.vector.ColumnVector), StructColumnVector(org.apache.hadoop.hive.ql.exec.vector.StructColumnVector), MapColumnVector(org.apache.hadoop.hive.ql.exec.vector.MapColumnVector), TimestampColumnVector(org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector), ListColumnVector(org.apache.hadoop.hive.ql.exec.vector.ListColumnVector), UnionColumnVector(org.apache.hadoop.hive.ql.exec.vector.UnionColumnVector), DoubleColumnVector(org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector)
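
To ground the noNulls/isNull contract described in the long comment above, here is an illustrative sketch of what a producer that owns its batches could do instead of a full reset(). This is not ORC or Hive code; markPrefixNonNull is a hypothetical helper, and it relies only on the public ColumnVector fields isNull and noNulls.

// Hypothetical producer-side helper: when none of the next batchSize rows are
// NULL, it is enough to clear the isNull prefix that consumers will read and
// then set noNulls. Entries past batchSize may stay stale, because the producer
// owns the batch and consumers treat it as read-only up to batch.size.
static void markPrefixNonNull(ColumnVector cv, int batchSize) {
    java.util.Arrays.fill(cv.isNull, 0, batchSize, false);
    cv.noNulls = true;
}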

Example 24 with TypeDescription

Use of org.apache.orc.TypeDescription in project hive by apache.

From the class TestInputOutputFormat, method testColumnProjectionWithAcid.

/**
 * Test column projection when using ACID.
 */
@Test
public void testColumnProjectionWithAcid() throws Exception {
    Path baseDir = new Path(workDir, "base_00100");
    testFilePath = new Path(baseDir, "bucket_00000");
    fs.mkdirs(baseDir);
    fs.delete(testFilePath, true);
    TypeDescription fileSchema = TypeDescription.fromString("struct<operation:int," + "originalTransaction:bigint,bucket:int,rowId:bigint," + "currentTransaction:bigint," + "row:struct<a:int,b:struct<c:int>,d:string>>");
    Writer writer = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf).fileSystem(fs).setSchema(fileSchema).compress(org.apache.orc.CompressionKind.NONE));
    VectorizedRowBatch batch = fileSchema.createRowBatch(1000);
    batch.size = 1000;
    StructColumnVector scv = (StructColumnVector) batch.cols[5];
    // operation
    batch.cols[0].isRepeating = true;
    ((LongColumnVector) batch.cols[0]).vector[0] = 0;
    // original transaction
    batch.cols[1].isRepeating = true;
    ((LongColumnVector) batch.cols[1]).vector[0] = 1;
    // bucket
    batch.cols[2].isRepeating = true;
    ((LongColumnVector) batch.cols[2]).vector[0] = 0;
    // current transaction
    batch.cols[4].isRepeating = true;
    ((LongColumnVector) batch.cols[4]).vector[0] = 1;
    LongColumnVector lcv = (LongColumnVector) ((StructColumnVector) scv.fields[1]).fields[0];
    for (int r = 0; r < 1000; r++) {
        // row id
        ((LongColumnVector) batch.cols[3]).vector[r] = r;
        // a
        ((LongColumnVector) scv.fields[0]).vector[r] = r * 42;
        // b.c
        lcv.vector[r] = r * 10001;
        // d
        ((BytesColumnVector) scv.fields[2]).setVal(r, Integer.toHexString(r).getBytes(StandardCharsets.UTF_8));
    }
    writer.addRowBatch(batch);
    writer.addUserMetadata(OrcRecordUpdater.ACID_KEY_INDEX_NAME, ByteBuffer.wrap("0,0,999".getBytes(StandardCharsets.UTF_8)));
    writer.close();
    long fileLength = fs.getFileStatus(testFilePath).getLen();
    // test with same schema with include
    conf.set(ValidWriteIdList.VALID_WRITEIDS_KEY, "tbl:100:99:");
    conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, "a,b,d");
    conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, "int,struct<c:int>,string");
    conf.set(ColumnProjectionUtils.READ_ALL_COLUMNS, "false");
    conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0,2");
    OrcSplit split = new OrcSplit(testFilePath, null, 0, fileLength, new String[0], null, false, true, new ArrayList<AcidInputFormat.DeltaMetaData>(), fileLength, fileLength, workDir);
    OrcInputFormat inputFormat = new OrcInputFormat();
    AcidInputFormat.RowReader<OrcStruct> reader = inputFormat.getReader(split, new AcidInputFormat.Options(conf));
    int record = 0;
    RecordIdentifier id = reader.createKey();
    OrcStruct struct = reader.createValue();
    while (reader.next(id, struct)) {
        assertEquals("id " + record, record, id.getRowId());
        assertEquals("bucket " + record, 0, id.getBucketProperty());
        assertEquals("writeid " + record, 1, id.getWriteId());
        assertEquals("a " + record, 42 * record, ((IntWritable) struct.getFieldValue(0)).get());
        assertEquals(null, struct.getFieldValue(1));
        assertEquals("d " + record, Integer.toHexString(record), struct.getFieldValue(2).toString());
        record += 1;
    }
    assertEquals(1000, record);
    reader.close();
    // test with schema evolution and include
    conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, "a,b,d,f");
    conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, "int,struct<c:int,e:string>,string,int");
    conf.set(ColumnProjectionUtils.READ_ALL_COLUMNS, "false");
    conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0,2,3");
    split = new OrcSplit(testFilePath, null, 0, fileLength, new String[0], null, false, true, new ArrayList<AcidInputFormat.DeltaMetaData>(), fileLength, fileLength, workDir);
    inputFormat = new OrcInputFormat();
    reader = inputFormat.getReader(split, new AcidInputFormat.Options(conf));
    record = 0;
    id = reader.createKey();
    struct = reader.createValue();
    while (reader.next(id, struct)) {
        assertEquals("id " + record, record, id.getRowId());
        assertEquals("bucket " + record, 0, id.getBucketProperty());
        assertEquals("writeid " + record, 1, id.getWriteId());
        assertEquals("a " + record, 42 * record, ((IntWritable) struct.getFieldValue(0)).get());
        assertEquals(null, struct.getFieldValue(1));
        assertEquals("d " + record, Integer.toHexString(record), struct.getFieldValue(2).toString());
        assertEquals("f " + record, null, struct.getFieldValue(3));
        record += 1;
    }
    assertEquals(1000, record);
    reader.close();
}
Also used: ArrayList(java.util.ArrayList), AcidInputFormat(org.apache.hadoop.hive.ql.io.AcidInputFormat), RecordIdentifier(org.apache.hadoop.hive.ql.io.RecordIdentifier), VectorizedRowBatch(org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch), StructColumnVector(org.apache.hadoop.hive.ql.exec.vector.StructColumnVector), BytesColumnVector(org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector), TypeDescription(org.apache.orc.TypeDescription), RecordWriter(org.apache.hadoop.mapred.RecordWriter), LongColumnVector(org.apache.hadoop.hive.ql.exec.vector.LongColumnVector), Test(org.junit.Test)
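
One detail worth spelling out: ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR holds top-level column ids over the schema-evolution columns, so with columns a,b,d the id string "0,2" projects a and d, which is why the test asserts that getFieldValue(1) (column b) is null. A minimal sketch of that mapping; projectedColumns is a hypothetical helper, and the java.util types are fully qualified so no imports are needed:

// Hypothetical helper mapping READ_COLUMN_IDS ("0,2") to column names ("a,b,d").
static java.util.List<String> projectedColumns(String idCsv, String nameCsv) {
    String[] names = nameCsv.split(",");
    java.util.List<String> selected = new java.util.ArrayList<>();
    for (String id : idCsv.split(",")) {
        selected.add(names[Integer.parseInt(id)]);
    }
    return selected;  // ["a", "d"] for ("0,2", "a,b,d")
}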

Example 25 with TypeDescription

Use of org.apache.orc.TypeDescription in project hive by apache.

From the class TestInputOutputFormat, method testSchemaEvolution.

/**
 * Test schema evolution when using the reader directly.
 */
@Test
public void testSchemaEvolution() throws Exception {
    TypeDescription fileSchema = TypeDescription.fromString("struct<a:int,b:struct<c:int>,d:string>");
    Writer writer = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf).fileSystem(fs).setSchema(fileSchema).compress(org.apache.orc.CompressionKind.NONE));
    VectorizedRowBatch batch = fileSchema.createRowBatch(1000);
    batch.size = 1000;
    LongColumnVector lcv = ((LongColumnVector) ((StructColumnVector) batch.cols[1]).fields[0]);
    for (int r = 0; r < 1000; r++) {
        ((LongColumnVector) batch.cols[0]).vector[r] = r * 42;
        lcv.vector[r] = r * 10001;
        ((BytesColumnVector) batch.cols[2]).setVal(r, Integer.toHexString(r).getBytes(StandardCharsets.UTF_8));
    }
    writer.addRowBatch(batch);
    writer.close();
    TypeDescription readerSchema = TypeDescription.fromString("struct<a:int,b:struct<c:int,future1:int>,d:string,future2:int>");
    Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
    RecordReader rows = reader.rowsOptions(new Reader.Options().schema(readerSchema));
    batch = readerSchema.createRowBatch();
    lcv = ((LongColumnVector) ((StructColumnVector) batch.cols[1]).fields[0]);
    LongColumnVector future1 = ((LongColumnVector) ((StructColumnVector) batch.cols[1]).fields[1]);
    assertEquals(true, rows.nextBatch(batch));
    assertEquals(1000, batch.size);
    assertEquals(true, future1.isRepeating);
    assertEquals(true, future1.isNull[0]);
    assertEquals(true, batch.cols[3].isRepeating);
    assertEquals(true, batch.cols[3].isNull[0]);
    for (int r = 0; r < batch.size; ++r) {
        assertEquals("row " + r, r * 42, ((LongColumnVector) batch.cols[0]).vector[r]);
        assertEquals("row " + r, r * 10001, lcv.vector[r]);
        assertEquals("row " + r, r * 10001, lcv.vector[r]);
        assertEquals("row " + r, Integer.toHexString(r), ((BytesColumnVector) batch.cols[2]).toString(r));
    }
    assertEquals(false, rows.nextBatch(batch));
    rows.close();
    // try it again with an include vector
    rows = reader.rowsOptions(new Reader.Options().schema(readerSchema).include(new boolean[] { false, true, true, true, false, false, true }));
    batch = readerSchema.createRowBatch();
    lcv = ((LongColumnVector) ((StructColumnVector) batch.cols[1]).fields[0]);
    future1 = ((LongColumnVector) ((StructColumnVector) batch.cols[1]).fields[1]);
    assertEquals(true, rows.nextBatch(batch));
    assertEquals(1000, batch.size);
    assertEquals(true, future1.isRepeating);
    assertEquals(true, future1.isNull[0]);
    assertEquals(true, batch.cols[3].isRepeating);
    assertEquals(true, batch.cols[3].isNull[0]);
    assertEquals(true, batch.cols[2].isRepeating);
    assertEquals(true, batch.cols[2].isNull[0]);
    for (int r = 0; r < batch.size; ++r) {
        assertEquals("row " + r, r * 42, ((LongColumnVector) batch.cols[0]).vector[r]);
        assertEquals("row " + r, r * 10001, lcv.vector[r]);
    }
    assertEquals(false, rows.nextBatch(batch));
    rows.close();
}
Also used: VectorizedRowBatch(org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch), StructColumnVector(org.apache.hadoop.hive.ql.exec.vector.StructColumnVector), BytesColumnVector(org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector), TypeDescription(org.apache.orc.TypeDescription), RecordWriter(org.apache.hadoop.mapred.RecordWriter), LongColumnVector(org.apache.hadoop.hive.ql.exec.vector.LongColumnVector), Test(org.junit.Test)
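
The include array in the second read is indexed by flattened column id over the reader schema: 0 is the root struct, 1 is a, 2 is b, 3 is b.c, 4 is b.future1, 5 is d, and 6 is future2. So { false, true, true, true, false, false, true } selects a, b, b.c, and future2 while skipping b.future1 and d, which is why batch.cols[2] (column d) comes back repeating and null. A hedged sketch that prints this id-to-path mapping using only the public TypeDescription API (printIds is a hypothetical helper):

// Hypothetical helper: print the flattened column id of every column in a schema.
static void printIds(TypeDescription type, String path) {
    System.out.println(type.getId() + " -> " + path);
    if (type.getCategory() == TypeDescription.Category.STRUCT) {
        java.util.List<String> fields = type.getFieldNames();
        java.util.List<TypeDescription> children = type.getChildren();
        for (int i = 0; i < children.size(); i++) {
            printIds(children.get(i), path + "." + fields.get(i));
        }
    }
}
// printIds(readerSchema, "root") yields ids 0 through 6 in the order listed above.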

Aggregations

TypeDescription (org.apache.orc.TypeDescription): 26
Test (org.junit.Test): 7
ArrayList (java.util.ArrayList): 6
BytesColumnVector (org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector): 6
LongColumnVector (org.apache.hadoop.hive.ql.exec.vector.LongColumnVector): 6
VectorizedRowBatch (org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch): 6
StructColumnVector (org.apache.hadoop.hive.ql.exec.vector.StructColumnVector): 5
ListObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector): 4
MapObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector): 4
ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector): 4
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector): 4
BinaryObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector): 4
BooleanObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector): 4
ByteObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector): 4
DoubleObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector): 4
FloatObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.FloatObjectInspector): 4
Configuration (org.apache.hadoop.conf.Configuration): 3
Path (org.apache.hadoop.fs.Path): 3
ColumnVector (org.apache.hadoop.hive.ql.exec.vector.ColumnVector): 3
ListColumnVector (org.apache.hadoop.hive.ql.exec.vector.ListColumnVector): 3