
Example 6 with RecordIdentifier

Use of org.apache.hadoop.hive.ql.io.RecordIdentifier in project hive by apache.

From class TestMutations, method testMulti.

@Test
public void testMulti() throws Exception {
    Table table = partitionedTableBuilder.addPartition(ASIA_INDIA).create(metaStoreClient);
    MutatorClient client = new MutatorClientBuilder().addSinkTable(table.getDbName(), table.getTableName(), true).metaStoreUri(metaStoreUri).build();
    client.connect();
    Transaction transaction = client.newTransaction();
    List<AcidTable> destinations = client.getTables();
    transaction.begin();
    MutatorFactory mutatorFactory = new ReflectiveMutatorFactory(conf, MutableRecord.class, RECORD_ID_COLUMN, BUCKET_COLUMN_INDEXES);
    MutatorCoordinator coordinator = new MutatorCoordinatorBuilder().metaStoreUri(metaStoreUri).table(destinations.get(0)).mutatorFactory(mutatorFactory).build();
    BucketIdResolver bucketIdResolver = mutatorFactory.newBucketIdResolver(destinations.get(0).getTotalBuckets());
    MutableRecord asiaIndiaRecord1 = (MutableRecord) bucketIdResolver.attachBucketIdToRecord(new MutableRecord(1, "Hello streaming"));
    MutableRecord europeUkRecord1 = (MutableRecord) bucketIdResolver.attachBucketIdToRecord(new MutableRecord(2, "Hello streaming"));
    MutableRecord europeFranceRecord1 = (MutableRecord) bucketIdResolver.attachBucketIdToRecord(new MutableRecord(3, "Hello streaming"));
    MutableRecord europeFranceRecord2 = (MutableRecord) bucketIdResolver.attachBucketIdToRecord(new MutableRecord(4, "Bonjour streaming"));
    coordinator.insert(ASIA_INDIA, asiaIndiaRecord1);
    coordinator.insert(EUROPE_UK, europeUkRecord1);
    coordinator.insert(EUROPE_FRANCE, europeFranceRecord1);
    coordinator.insert(EUROPE_FRANCE, europeFranceRecord2);
    coordinator.close();
    transaction.commit();
    // ASIA_INDIA
    StreamingAssert streamingAssertions = assertionFactory.newStreamingAssert(table, ASIA_INDIA);
    streamingAssertions.assertMinWriteId(1L);
    streamingAssertions.assertMaxWriteId(1L);
    streamingAssertions.assertExpectedFileCount(1);
    List<Record> readRecords = streamingAssertions.readRecords();
    assertThat(readRecords.size(), is(1));
    assertThat(readRecords.get(0).getRow(), is("{1, Hello streaming}"));
    assertThat(readRecords.get(0).getRecordIdentifier(), is(new RecordIdentifier(1L, encodeBucket(0), 0L)));
    // EUROPE_UK
    streamingAssertions = assertionFactory.newStreamingAssert(table, EUROPE_UK);
    streamingAssertions.assertMinWriteId(1L);
    streamingAssertions.assertMaxWriteId(1L);
    streamingAssertions.assertExpectedFileCount(1);
    readRecords = streamingAssertions.readRecords();
    assertThat(readRecords.size(), is(1));
    assertThat(readRecords.get(0).getRow(), is("{2, Hello streaming}"));
    assertThat(readRecords.get(0).getRecordIdentifier(), is(new RecordIdentifier(1L, encodeBucket(0), 0L)));
    // EUROPE_FRANCE
    streamingAssertions = assertionFactory.newStreamingAssert(table, EUROPE_FRANCE);
    streamingAssertions.assertMinWriteId(1L);
    streamingAssertions.assertMaxWriteId(1L);
    streamingAssertions.assertExpectedFileCount(1);
    readRecords = streamingAssertions.readRecords();
    assertThat(readRecords.size(), is(2));
    assertThat(readRecords.get(0).getRow(), is("{3, Hello streaming}"));
    assertThat(readRecords.get(0).getRecordIdentifier(), is(new RecordIdentifier(1L, encodeBucket(0), 0L)));
    assertThat(readRecords.get(1).getRow(), is("{4, Bonjour streaming}"));
    assertThat(readRecords.get(1).getRecordIdentifier(), is(new RecordIdentifier(1L, encodeBucket(0), 1L)));
    client.close();
}
Also used : AcidTable(org.apache.hive.hcatalog.streaming.mutate.client.AcidTable) Table(org.apache.hadoop.hive.metastore.api.Table) AcidTable(org.apache.hive.hcatalog.streaming.mutate.client.AcidTable) MutatorCoordinator(org.apache.hive.hcatalog.streaming.mutate.worker.MutatorCoordinator) RecordIdentifier(org.apache.hadoop.hive.ql.io.RecordIdentifier) MutatorFactory(org.apache.hive.hcatalog.streaming.mutate.worker.MutatorFactory) Transaction(org.apache.hive.hcatalog.streaming.mutate.client.Transaction) MutatorCoordinatorBuilder(org.apache.hive.hcatalog.streaming.mutate.worker.MutatorCoordinatorBuilder) BucketIdResolver(org.apache.hive.hcatalog.streaming.mutate.worker.BucketIdResolver) Record(org.apache.hive.hcatalog.streaming.mutate.StreamingAssert.Record) MutatorClient(org.apache.hive.hcatalog.streaming.mutate.client.MutatorClient) MutatorClientBuilder(org.apache.hive.hcatalog.streaming.mutate.client.MutatorClientBuilder) Test(org.junit.Test)
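
The assertions above compare against RecordIdentifier values built from the (writeId, bucketProperty, rowId) triple. As a minimal sketch of that key type on its own (the bucket-property literal is illustrative; it stands in for whatever the test's encodeBucket(0) helper produces):

import org.apache.hadoop.hive.ql.io.RecordIdentifier;

public class RecordIdentifierSketch {
    public static void main(String[] args) {
        // The same (writeId, bucketProperty, rowId) triple the assertions check.
        RecordIdentifier first = new RecordIdentifier(1L, 536870912, 0L);
        RecordIdentifier second = new RecordIdentifier(1L, 536870912, 1L);
        // Keys order by writeId, then bucketProperty, then rowId.
        System.out.println(first.compareTo(second) < 0);   // true: same writeId and bucket, smaller rowId
        System.out.println(first.getWriteId() + " " + first.getBucketProperty() + " " + first.getRowId());
    }
}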

Example 7 with RecordIdentifier

Use of org.apache.hadoop.hive.ql.io.RecordIdentifier in project hive by apache.

From class TestInputOutputFormat, method testAcidReadPastLastStripeOffset.

@Test
public void testAcidReadPastLastStripeOffset() throws Exception {
    Path baseDir = new Path(workDir, "base_00100");
    testFilePath = new Path(baseDir, "bucket_00000");
    fs.mkdirs(baseDir);
    fs.delete(testFilePath, true);
    TypeDescription fileSchema = TypeDescription.fromString("struct<operation:int," + "originalTransaction:bigint,bucket:int,rowId:bigint," + "currentTransaction:bigint," + "row:struct<a:int,b:struct<c:int>,d:string>>");
    OrcRecordUpdater.KeyIndexBuilder indexBuilder = new OrcRecordUpdater.KeyIndexBuilder("test");
    OrcFile.WriterOptions options = OrcFile.writerOptions(conf).fileSystem(fs).setSchema(fileSchema).compress(org.apache.orc.CompressionKind.NONE).callback(indexBuilder).stripeSize(128);
    // Create ORC file with small stripe size so we can write multiple stripes.
    Writer writer = OrcFile.createWriter(testFilePath, options);
    VectorizedRowBatch batch = fileSchema.createRowBatch(TypeDescription.RowBatchVersion.USE_DECIMAL64, 1000);
    batch.size = 1000;
    StructColumnVector scv = (StructColumnVector) batch.cols[5];
    // operation
    batch.cols[0].isRepeating = true;
    ((LongColumnVector) batch.cols[0]).vector[0] = OrcRecordUpdater.INSERT_OPERATION;
    // original transaction
    batch.cols[1].isRepeating = true;
    ((LongColumnVector) batch.cols[1]).vector[0] = 1;
    // bucket
    batch.cols[2].isRepeating = true;
    ((LongColumnVector) batch.cols[2]).vector[0] = BucketCodec.V1.encode(new AcidOutputFormat.Options(conf).bucket(0).statementId(0));
    // current transaction
    batch.cols[4].isRepeating = true;
    ((LongColumnVector) batch.cols[4]).vector[0] = 1;
    LongColumnVector lcv = (LongColumnVector) ((StructColumnVector) scv.fields[1]).fields[0];
    for (int r = 0; r < 1000; r++) {
        // row id
        ((LongColumnVector) batch.cols[3]).vector[r] = r;
        // a
        ((LongColumnVector) scv.fields[0]).vector[r] = r * 42;
        // b.c
        lcv.vector[r] = r * 10001;
        // d
        ((BytesColumnVector) scv.fields[2]).setVal(r, Integer.toHexString(r).getBytes(StandardCharsets.UTF_8));
        indexBuilder.addKey(OrcRecordUpdater.INSERT_OPERATION, 1, (int) (((LongColumnVector) batch.cols[2]).vector[0]), r);
    }
    // Minimum 5000 rows per stripe.
    for (int idx = 0; idx < 8; ++idx) {
        writer.addRowBatch(batch);
        // bucket
        batch.cols[2].isRepeating = true;
        ((LongColumnVector) batch.cols[2]).vector[0] = BucketCodec.V1.encode(new AcidOutputFormat.Options(conf).bucket(0).statementId(idx + 1));
        for (long row_id : ((LongColumnVector) batch.cols[3]).vector) {
            indexBuilder.addKey(OrcRecordUpdater.INSERT_OPERATION, 1, (int) (((LongColumnVector) batch.cols[2]).vector[0]), row_id);
        }
    }
    writer.close();
    long fileLength = fs.getFileStatus(testFilePath).getLen();
    // Find the last stripe.
    List<StripeInformation> stripes;
    RecordIdentifier[] keyIndex;
    try (Reader orcReader = OrcFile.createReader(fs, testFilePath)) {
        stripes = orcReader.getStripes();
        keyIndex = OrcRecordUpdater.parseKeyIndex(orcReader);
    }
    StripeInformation lastStripe = stripes.get(stripes.size() - 1);
    long lastStripeOffset = lastStripe.getOffset();
    long lastStripeLength = lastStripe.getLength();
    Assert.assertEquals("Index length doesn't match number of stripes", stripes.size(), keyIndex.length);
    Assert.assertEquals("1st Index entry mismatch", new RecordIdentifier(1, 536870916, 999), keyIndex[0]);
    Assert.assertEquals("2nd Index entry mismatch", new RecordIdentifier(1, 536870920, 999), keyIndex[1]);
    // test with same schema with include
    conf.set(ValidTxnList.VALID_TXNS_KEY, "100:99:");
    conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, "a,b,d");
    conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, "int,struct<c:int>,string");
    conf.set(ColumnProjectionUtils.READ_ALL_COLUMNS, "false");
    conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0,2");
    LOG.info("Last stripe " + stripes.size() + ", offset " + lastStripeOffset + ", length " + lastStripeLength);
    // Specify an OrcSplit that starts beyond the offset of the last stripe.
    OrcSplit split = new OrcSplit(testFilePath, null, lastStripeOffset + 1, lastStripeLength, new String[0], null, false, true, new ArrayList<AcidInputFormat.DeltaMetaData>(), fileLength, fileLength, workDir, null);
    OrcInputFormat inputFormat = new OrcInputFormat();
    AcidInputFormat.RowReader<OrcStruct> reader = inputFormat.getReader(split, new AcidInputFormat.Options(conf));
    int record = 0;
    OrcRawRecordMerger.ReaderKey id = reader.createKey();
    OrcStruct struct = reader.createValue();
    // The split starts past the last stripe's offset, so no rows should be read. Thus 0 records.
    while (reader.next(id, struct)) {
        record += 1;
    }
    assertEquals(0, record);
    reader.close();
}
Also used : RecordIdentifier(org.apache.hadoop.hive.ql.io.RecordIdentifier) AcidInputFormat(org.apache.hadoop.hive.ql.io.AcidInputFormat) VectorizedRowBatch(org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch) StructColumnVector(org.apache.hadoop.hive.ql.exec.vector.StructColumnVector) BytesColumnVector(org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector) TypeDescription(org.apache.orc.TypeDescription) LongColumnVector(org.apache.hadoop.hive.ql.exec.vector.LongColumnVector) RecordWriter(org.apache.hadoop.mapred.RecordWriter) StripeInformation(org.apache.orc.StripeInformation) Test(org.junit.Test)
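
The bucket column written above is a bit-packed "bucket property", which is why the index assertions expect values such as 536870916 rather than plain bucket numbers. A small sketch of the packing, assuming BucketCodec.V1's decode methods work as their names suggest (the exact printed values depend on the codec's bit layout):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.ql.io.AcidOutputFormat;
import org.apache.hadoop.hive.ql.io.BucketCodec;

public class BucketCodecSketch {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Pack writer/bucket id 0 and statement id 4 into one int, as the test does per row batch.
        int bucketProperty = BucketCodec.V1.encode(
                new AcidOutputFormat.Options(conf).bucket(0).statementId(4));
        System.out.println(bucketProperty);                            // e.g. 536870916
        // Unpack the pieces again.
        System.out.println(BucketCodec.V1.decodeWriterId(bucketProperty));
        System.out.println(BucketCodec.V1.decodeStatementId(bucketProperty));
    }
}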

Example 8 with RecordIdentifier

Use of org.apache.hadoop.hive.ql.io.RecordIdentifier in project hive by apache.

From class OrcRawRecordMerger, method discoverOriginalKeyBounds.

/**
 * Find the key range for original bucket files.
 * For unbucketed tables the insert event data is still written to bucket_N file except that
 * N is just a writer ID - it still matches {@link RecordIdentifier#getBucketProperty()}.  For
 * 'original' files (unbucketed) the same applies.  A file 000000_0 encodes a taskId/writerId and
 * at read time we synthesize {@link RecordIdentifier#getBucketProperty()} to match the file name
 * and so the same bucketProperty is used here to create minKey/maxKey, i.e. these keys are valid
 * to filter data from delete_delta files even for unbucketed tables.
 * @param reader the reader
 * @param bucket the bucket number we are reading
 * @param options the options for reading with
 * @throws IOException
 */
private KeyInterval discoverOriginalKeyBounds(Reader reader, int bucket, Reader.Options options, Configuration conf, Options mergerOptions) throws IOException {
    long rowLength = 0;
    long rowOffset = 0;
    // this would usually be at block boundary
    long offset = options.getOffset();
    // this would usually be at block boundary
    long maxOffset = options.getMaxOffset();
    boolean isTail = true;
    RecordIdentifier minKey = null;
    RecordIdentifier maxKey = null;
    TransactionMetaData tfp = TransactionMetaData.findWriteIDForSynthetcRowIDs(mergerOptions.getBucketPath(), mergerOptions.getRootPath(), conf);
    int bucketProperty = encodeBucketId(conf, bucket, tfp.statementId);
    /**
     * options.getOffset() and getMaxOffset() would usually be at block boundary which doesn't
     * necessarily match stripe boundary.  So we want to come up with minKey to be one before the 1st
     * row of the first stripe that starts after getOffset() and maxKey to be the last row of the
     * stripe that contains getMaxOffset().  This breaks if getOffset() and getMaxOffset() are inside
     * the same stripe - in this case we have minKey & isTail=false but rowLength is never set.
     * (HIVE-16953)
     */
    for (StripeInformation stripe : reader.getStripes()) {
        if (offset > stripe.getOffset()) {
            rowOffset += stripe.getNumberOfRows();
        } else if (maxOffset > stripe.getOffset()) {
            rowLength += stripe.getNumberOfRows();
        } else {
            isTail = false;
            break;
        }
    }
    if (rowOffset > 0) {
        minKey = new RecordIdentifier(tfp.syntheticWriteId, bucketProperty, rowOffset - 1);
    }
    if (!isTail) {
        maxKey = new RecordIdentifier(tfp.syntheticWriteId, bucketProperty, rowOffset + rowLength - 1);
    }
    return new KeyInterval(minKey, maxKey);
}
Also used : RecordIdentifier(org.apache.hadoop.hive.ql.io.RecordIdentifier) StripeInformation(org.apache.orc.StripeInformation)
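
To make the stripe walk above concrete, here is a self-contained sketch of the same bookkeeping with hypothetical stripe offsets and row counts (plain arrays stand in for StripeInformation):

public class OriginalKeyBoundsSketch {
    public static void main(String[] args) {
        // Hypothetical stripes: byte offset and number of rows per stripe.
        long[] stripeOffsets = {3, 500, 1200, 2000};
        long[] stripeRows    = {1000, 1000, 1000, 1000};
        long offset = 500, maxOffset = 1200;           // split boundaries (usually block-aligned)

        long rowOffset = 0, rowLength = 0;
        boolean isTail = true;
        for (int i = 0; i < stripeOffsets.length; i++) {
            if (offset > stripeOffsets[i]) {
                rowOffset += stripeRows[i];            // rows entirely before the split
            } else if (maxOffset > stripeOffsets[i]) {
                rowLength += stripeRows[i];            // rows covered by the split
            } else {
                isTail = false;                        // there are stripes after the split
                break;
            }
        }
        // minKey is one row before the split; maxKey is the last row the split covers.
        System.out.println("minKey rowId = " + (rowOffset > 0 ? rowOffset - 1 : -1));        // 999
        System.out.println("maxKey rowId = " + (!isTail ? rowOffset + rowLength - 1 : -1));  // 1999
    }
}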

Example 9 with RecordIdentifier

Use of org.apache.hadoop.hive.ql.io.RecordIdentifier in project hive by apache.

From class VectorizedOrcAcidRowBatchReader, method findMinMaxKeys.

/**
 * A given ORC reader will always process one or more whole stripes but the
 * split boundaries may not line up with stripe boundaries if the InputFormat
 * doesn't understand ORC specifics. So first we need to figure out which
 * stripe(s) we are reading.
 *
 * Suppose txn1 writes 100K rows
 * and txn2 writes 100 rows so we have events
 * {1,0,0}....{1,0,100K},{2,0,0}...{2,0,100} in 2 files
 * After compaction we may have 2 stripes
 * {1,0,0}...{1,0,90K},{1,0,90001}...{2,0,100}
 *
 * Now suppose there is a delete stmt that deletes every row.  So when we load
 * the 2nd stripe, if we just look at stripe {@link ColumnStatistics},
 * minKey={1,0,100} and maxKey={2,0,90001}, all but the 1st 100 delete events
 * will get loaded.  But with {@link OrcRecordUpdater#ACID_KEY_INDEX_NAME},
 * minKey={1,0,90001} and maxKey={2,0,100} so we only load about 10K deletes.
 *
 * Also, even with Query Based compactor (once we have it), FileSinkOperator
 * uses OrcRecordWriter to write to file, so we should have the
 * hive.acid.index in place.
 *
 * If reading the 1st stripe, we don't have the start event, so we'll get it
 * from stats, which will strictly speaking be accurate only wrt writeId and
 * bucket but that is good enough.
 *
 * @return empty <code>KeyInterval</code> if KeyInterval could not be
 * determined
 */
private OrcRawRecordMerger.KeyInterval findMinMaxKeys(OrcSplit orcSplit, Configuration conf, Reader.Options deleteEventReaderOptions) throws IOException {
    final boolean noDeleteDeltas = orcSplit.getDeltas().size() == 0;
    if (!HiveConf.getBoolVar(conf, ConfVars.FILTER_DELETE_EVENTS) || noDeleteDeltas) {
        LOG.debug("findMinMaxKeys() " + ConfVars.FILTER_DELETE_EVENTS + "=false");
        return new OrcRawRecordMerger.KeyInterval(null, null);
    }
    try (VectorizedOrcAcidRowBatchReader.ReaderData orcReaderData = getOrcReaderData(orcSplit.getPath(), conf, cacheTag, orcSplit.getFileKey())) {
        if (orcSplit.isOriginal()) {
            /**
             * Among originals we may have files with _copy_N suffix.  To properly
             * generate a synthetic ROW___ID for them we need
             * {@link OffsetAndBucketProperty} which could be an expensive computation
             * if there are lots of copy_N files for a given bucketId. But unless
             * there are delete events, we often don't need synthetic ROW__IDs at all.
             * Kind of chicken-and-egg - deal with this later.
             * See {@link OrcRawRecordMerger#discoverOriginalKeyBounds(Reader, int,
             * Reader.Options, Configuration, OrcRawRecordMerger.Options)}
             */
            LOG.debug("findMinMaxKeys(original split)");
            return findOriginalMinMaxKeys(orcSplit, orcReaderData.orcTail, deleteEventReaderOptions);
        }
        List<StripeInformation> stripes = orcReaderData.orcTail.getStripes();
        final long splitStart = orcSplit.getStart();
        final long splitEnd = splitStart + orcSplit.getLength();
        int firstStripeIndex = -1;
        int lastStripeIndex = -1;
        for (int i = 0; i < stripes.size(); i++) {
            StripeInformation stripe = stripes.get(i);
            long stripeEnd = stripe.getOffset() + stripe.getLength();
            if (firstStripeIndex == -1 && stripe.getOffset() >= splitStart) {
                firstStripeIndex = i;
            }
            if (lastStripeIndex == -1 && splitEnd <= stripeEnd) {
                lastStripeIndex = i;
            }
        }
        if (lastStripeIndex == -1) {
            // split goes to the EOF which is > end of stripe since file has a footer
            assert stripes.get(stripes.size() - 1).getOffset() + stripes.get(stripes.size() - 1).getLength() < splitEnd;
            lastStripeIndex = stripes.size() - 1;
        }
        if (firstStripeIndex > lastStripeIndex || firstStripeIndex == -1) {
            /**
             * If the firstStripeIndex was set after the lastStripeIndex the split lies entirely within a single stripe.
             * In case the split lies entirely within the last stripe, the firstStripeIndex will never be found, hence the
             * second condition.
             * In this case, the reader for this split will not read any data.
             * See {@link org.apache.orc.impl.RecordReaderImpl}.
             * Create a KeyInterval such that no delete delta records are loaded into memory in the deleteEventRegistry.
             */
            long minRowId = 1;
            long maxRowId = 0;
            int minBucketProp = 1;
            int maxBucketProp = 0;
            OrcRawRecordMerger.KeyInterval keyIntervalTmp = new OrcRawRecordMerger.KeyInterval(new RecordIdentifier(1, minBucketProp, minRowId), new RecordIdentifier(0, maxBucketProp, maxRowId));
            setSARG(keyIntervalTmp, deleteEventReaderOptions, minBucketProp, maxBucketProp, minRowId, maxRowId);
            LOG.info("findMinMaxKeys(): " + keyIntervalTmp + " stripes(" + firstStripeIndex + "," + lastStripeIndex + ")");
            return keyIntervalTmp;
        }
        if (firstStripeIndex == -1 || lastStripeIndex == -1) {
            // this should not happen but... if we don't know which stripe(s) are
            // involved we can't figure out min/max bounds
            LOG.warn("Could not find stripe (" + firstStripeIndex + "," + lastStripeIndex + ")");
            return new OrcRawRecordMerger.KeyInterval(null, null);
        }
        RecordIdentifier[] keyIndex = OrcRecordUpdater.parseKeyIndex(orcReaderData.orcTail);
        if (keyIndex == null) {
            LOG.warn("Could not find keyIndex (" + firstStripeIndex + "," + lastStripeIndex + "," + stripes.size() + ")");
        }
        if (keyIndex != null && keyIndex.length != stripes.size()) {
            LOG.warn("keyIndex length doesn't match (" + firstStripeIndex + "," + lastStripeIndex + "," + stripes.size() + "," + keyIndex.length + ")");
            return new OrcRawRecordMerger.KeyInterval(null, null);
        }
        /**
         * If {@link OrcConf.ROW_INDEX_STRIDE} is set to 0, all column stats on the
         * ORC file are disabled: objects for them still exist but have
         * min/max set to MIN_LONG/MAX_LONG, so we only use column stats if they
         * are actually computed.  Streaming ingest and Minor compaction used to
         * set it to 0, so there are lots of legacy files with no (rather, bad)
         * column stats.
         */
        boolean columnStatsPresent = orcReaderData.orcTail.getFooter().getRowIndexStride() > 0;
        if (!columnStatsPresent) {
            LOG.debug("findMinMaxKeys() No ORC column stats");
        }
        List<StripeStatistics> stats = orcReaderData.reader.getVariantStripeStatistics(null);
        assert stripes.size() == stats.size() : "str.s=" + stripes.size() + " sta.s=" + stats.size();
        RecordIdentifier minKey = null;
        if (firstStripeIndex > 0 && keyIndex != null) {
            // valid keys are strictly > than this key
            minKey = keyIndex[firstStripeIndex - 1];
            // add 1 to make comparison >= to match the case of 0th stripe
            minKey.setRowId(minKey.getRowId() + 1);
        } else {
            if (columnStatsPresent) {
                minKey = getKeyInterval(stats.get(firstStripeIndex).getColumnStatistics()).getMinKey();
            }
        }
        RecordIdentifier maxKey = null;
        if (keyIndex != null) {
            maxKey = keyIndex[lastStripeIndex];
        } else {
            if (columnStatsPresent) {
                maxKey = getKeyInterval(stats.get(lastStripeIndex).getColumnStatistics()).getMaxKey();
            }
        }
        OrcRawRecordMerger.KeyInterval keyInterval = new OrcRawRecordMerger.KeyInterval(minKey, maxKey);
        LOG.info("findMinMaxKeys(): " + keyInterval + " stripes(" + firstStripeIndex + "," + lastStripeIndex + ")");
        long minBucketProp = Long.MAX_VALUE, maxBucketProp = Long.MIN_VALUE;
        long minRowId = Long.MAX_VALUE, maxRowId = Long.MIN_VALUE;
        if (columnStatsPresent) {
            /**
             * figure out min/max bucket, rowid for push down.  This is different from
             * min/max ROW__ID because ROW__ID comparison uses dictionary order on two
             * tuples (a,b,c), but PPD can only do
             * (a between (x,y) and b between(x1,y1) and c between(x2,y2))
             * Consider:
             * (0,536936448,0), (0,536936448,2), (10000001,536936448,0)
             * 1st is min ROW__ID, 3rd is max ROW__ID
             * and Delete events (0,536936448,2),....,(10000001,536936448,1000000)
             * So PPD based on min/max ROW_ID would have 0<= rowId <=0 which will
             * miss this delete event.  But we still want PPD to filter out data if
             * possible.
             *
             * So use stripe stats to find proper min/max for bucketProp and rowId;
             * writeId is the same in both cases.
             */
            for (int i = firstStripeIndex; i <= lastStripeIndex; i++) {
                OrcRawRecordMerger.KeyInterval key = getKeyInterval(stats.get(i).getColumnStatistics());
                if (key.getMinKey().getBucketProperty() < minBucketProp) {
                    minBucketProp = key.getMinKey().getBucketProperty();
                }
                if (key.getMaxKey().getBucketProperty() > maxBucketProp) {
                    maxBucketProp = key.getMaxKey().getBucketProperty();
                }
                if (key.getMinKey().getRowId() < minRowId) {
                    minRowId = key.getMinKey().getRowId();
                }
                if (key.getMaxKey().getRowId() > maxRowId) {
                    maxRowId = key.getMaxKey().getRowId();
                }
            }
        }
        if (minBucketProp == Long.MAX_VALUE)
            minBucketProp = Long.MIN_VALUE;
        if (maxBucketProp == Long.MIN_VALUE)
            maxBucketProp = Long.MAX_VALUE;
        if (minRowId == Long.MAX_VALUE)
            minRowId = Long.MIN_VALUE;
        if (maxRowId == Long.MIN_VALUE)
            maxRowId = Long.MAX_VALUE;
        setSARG(keyInterval, deleteEventReaderOptions, minBucketProp, maxBucketProp, minRowId, maxRowId);
        return keyInterval;
    }
}
Also used : StripeStatistics(org.apache.orc.StripeStatistics) RecordIdentifier(org.apache.hadoop.hive.ql.io.RecordIdentifier) StripeInformation(org.apache.orc.StripeInformation)
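
A minimal sketch of the keyIndex branch above, using a hypothetical two-stripe hive.acid.index (the bucket-property literal is illustrative): the last key of the previous stripe becomes the exclusive lower bound, and bumping its rowId by one turns it into an inclusive one.

import org.apache.hadoop.hive.ql.io.RecordIdentifier;

public class KeyIndexBoundsSketch {
    public static void main(String[] args) {
        // Hypothetical hive.acid.index: one entry per stripe, holding the last ROW__ID in that stripe.
        RecordIdentifier[] keyIndex = {
            new RecordIdentifier(1L, 536870912, 89999L),   // last key of stripe 0
            new RecordIdentifier(2L, 536870912, 99L)       // last key of stripe 1
        };
        int firstStripeIndex = 1, lastStripeIndex = 1;     // the split covers only stripe 1

        // Valid keys are strictly greater than the last key of the previous stripe,
        // so add 1 to rowId to make the comparison ">=", as findMinMaxKeys() does.
        RecordIdentifier minKey = keyIndex[firstStripeIndex - 1];
        minKey.setRowId(minKey.getRowId() + 1);
        RecordIdentifier maxKey = keyIndex[lastStripeIndex];
        System.out.println("minKey=" + minKey + " maxKey=" + maxKey);
    }
}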

Example 10 with RecordIdentifier

Use of org.apache.hadoop.hive.ql.io.RecordIdentifier in project hive by apache.

From class VectorizedOrcAcidRowBatchReader, method setSARG.

/**
 * Generates a SearchArgument to push down to delete_delta files.
 *
 * Note that bucket is a bit-packed int, so even though all delete events
 * for a given split have the same bucket ID, they need not have the same "bucket" value;
 * see {@link BucketCodec}.
 */
private void setSARG(OrcRawRecordMerger.KeyInterval keyInterval, Reader.Options deleteEventReaderOptions, long minBucketProp, long maxBucketProp, long minRowId, long maxRowId) {
    SearchArgument.Builder b = null;
    if (keyInterval.getMinKey() != null) {
        RecordIdentifier k = keyInterval.getMinKey();
        b = SearchArgumentFactory.newBuilder();
        // not(ot < 7) -> ot >=7
        b.startAnd().startNot().lessThan(OrcRecordUpdater.ORIGINAL_WRITEID_FIELD_NAME, PredicateLeaf.Type.LONG, k.getWriteId()).end();
        b.startNot().lessThan(OrcRecordUpdater.BUCKET_FIELD_NAME, PredicateLeaf.Type.LONG, minBucketProp).end();
        b.startNot().lessThan(OrcRecordUpdater.ROW_ID_FIELD_NAME, PredicateLeaf.Type.LONG, minRowId).end();
        b.end();
    }
    if (keyInterval.getMaxKey() != null) {
        RecordIdentifier k = keyInterval.getMaxKey();
        if (b == null) {
            b = SearchArgumentFactory.newBuilder();
        }
        b.startAnd().lessThanEquals(OrcRecordUpdater.ORIGINAL_WRITEID_FIELD_NAME, PredicateLeaf.Type.LONG, k.getWriteId());
        b.lessThanEquals(OrcRecordUpdater.BUCKET_FIELD_NAME, PredicateLeaf.Type.LONG, maxBucketProp);
        b.lessThanEquals(OrcRecordUpdater.ROW_ID_FIELD_NAME, PredicateLeaf.Type.LONG, maxRowId);
        b.end();
    }
    if (b != null) {
        deleteEventSarg = b.build();
        LOG.info("deleteReader SARG(" + deleteEventSarg + ") ");
        deleteEventReaderOptions.searchArgument(deleteEventSarg, new String[] { OrcRecordUpdater.ORIGINAL_WRITEID_FIELD_NAME, OrcRecordUpdater.BUCKET_FIELD_NAME, OrcRecordUpdater.ROW_ID_FIELD_NAME });
        return;
    }
    deleteEventReaderOptions.searchArgument(null, null);
}
Also used : RecordIdentifier(org.apache.hadoop.hive.ql.io.RecordIdentifier) SearchArgument(org.apache.hadoop.hive.ql.io.sarg.SearchArgument)
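
For reference, the shape of the SARG that setSARG builds can be reproduced in isolation. A sketch with illustrative bounds; the field-name literals mirror the ACID row schema shown in the earlier ORC test (originalTransaction, bucket, rowId), standing in for the OrcRecordUpdater constants used above:

import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory;

public class DeleteEventSargSketch {
    public static void main(String[] args) {
        SearchArgument.Builder b = SearchArgumentFactory.newBuilder();
        // Lower bound: SARGs have no ">=", so it is expressed as NOT(x < min), as in the code above.
        b.startAnd()
            .startNot().lessThan("originalTransaction", PredicateLeaf.Type.LONG, 1L).end()
            .startNot().lessThan("bucket", PredicateLeaf.Type.LONG, 536870912L).end()
            .startNot().lessThan("rowId", PredicateLeaf.Type.LONG, 0L).end()
            .end();
        // Upper bound: plain "<=" on each component of the max key.
        b.startAnd()
            .lessThanEquals("originalTransaction", PredicateLeaf.Type.LONG, 2L)
            .lessThanEquals("bucket", PredicateLeaf.Type.LONG, 536870920L)
            .lessThanEquals("rowId", PredicateLeaf.Type.LONG, 100000L)
            .end();
        SearchArgument deleteEventSarg = b.build();
        System.out.println(deleteEventSarg);
    }
}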

Aggregations

RecordIdentifier (org.apache.hadoop.hive.ql.io.RecordIdentifier): 40
Test (org.junit.Test): 13
Path (org.apache.hadoop.fs.Path): 9
AcidOutputFormat (org.apache.hadoop.hive.ql.io.AcidOutputFormat): 9
StripeInformation (org.apache.orc.StripeInformation): 9
Configuration (org.apache.hadoop.conf.Configuration): 7
BitSet (java.util.BitSet): 5
FileSystem (org.apache.hadoop.fs.FileSystem): 5
ValidReadTxnList (org.apache.hadoop.hive.common.ValidReadTxnList): 5
ValidWriteIdList (org.apache.hadoop.hive.common.ValidWriteIdList): 5
RecordUpdater (org.apache.hadoop.hive.ql.io.RecordUpdater): 5
ValidReaderWriteIdList (org.apache.hadoop.hive.common.ValidReaderWriteIdList): 4
Table (org.apache.hadoop.hive.metastore.api.Table): 4
VectorizedRowBatchCtx (org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx): 4
OrcInputFormat (org.apache.hadoop.hive.ql.io.orc.OrcInputFormat): 4
ReaderKey (org.apache.hadoop.hive.ql.io.orc.OrcRawRecordMerger.ReaderKey): 4
OrcStruct (org.apache.hadoop.hive.ql.io.orc.OrcStruct): 4
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector): 4
ArrayList (java.util.ArrayList): 3
AcidInputFormat (org.apache.hadoop.hive.ql.io.AcidInputFormat): 3