
Example 16 with RecordIdentifier

Use of org.apache.hadoop.hive.ql.io.RecordIdentifier in project hive by apache.

From class TestVectorizedOrcAcidRowBatchReader, method testDeleteEventFiltering3.

private void testDeleteEventFiltering3() throws Exception {
    boolean filterOn = HiveConf.getBoolVar(conf, HiveConf.ConfVars.FILTER_DELETE_EVENTS);
    boolean columnStatsPresent = OrcConf.ROW_INDEX_STRIDE.getLong(conf) != 0;
    // To create small stripes
    OrcConf.STRIPE_SIZE.setLong(conf, 1);
    // Need to use a bigger row than DummyRow for the writer to flush the stripes
    conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, BigRow.getColumnNamesProperty());
    conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, BigRow.getColumnTypesProperty());
    // Use OrcRecordUpdater.OrcOptions to set the batch size.
    OrcRecordUpdater.OrcOptions orcOptions = new OrcRecordUpdater.OrcOptions(conf);
    orcOptions.orcOptions(OrcFile.writerOptions(conf).batchSize(1));
    int bucket = 1;
    AcidOutputFormat.Options options = orcOptions.filesystem(fs).bucket(bucket).writingBase(true)
        .minimumWriteId(10000002).maximumWriteId(10000002).inspector(bigRowInspector)
        .reporter(Reporter.NULL).recordIdColumn(1).finalDestination(root);
    int bucketProperty = BucketCodec.V1.encode(options);
    // Create 3 stripes with 1 row each
    byte[] data = new byte[1000];
    RecordUpdater updater = new OrcRecordUpdater(root, options);
    updater.insert(10000002, new BigRow(data, 0, 0, bucket));
    updater.insert(10000002, new BigRow(data, 1, 0, bucket));
    updater.insert(10000002, new BigRow(data, 2, 0, bucket));
    updater.close(false);
    String acidFile = "base_10000002/bucket_00001";
    Path acidFilePath = new Path(root, acidFile);
    Reader reader = OrcFile.createReader(acidFilePath, OrcFile.readerOptions(conf));
    List<StripeInformation> stripes = reader.getStripes();
    // Make sure 3 stripes are created
    assertEquals(3, stripes.size());
    long fileLength = fs.getFileStatus(acidFilePath).getLen();
    // 1. Splits within a stripe
    // A split that's completely within the 2nd stripe
    StripeInformation stripe = stripes.get(1);
    OrcSplit split = new OrcSplit(acidFilePath, null, stripe.getOffset() + 50, stripe.getLength() - 100, new String[] { "localhost" }, null, false, true, getDeltaMetaDataWithBucketFile(1), fileLength, fileLength, root, null);
    validateKeyInterval(split, new RecordIdentifier(1, 1, 1), new RecordIdentifier(0, 0, 0), filterOn);
    // A split that's completely within the last stripe
    stripe = stripes.get(2);
    split = new OrcSplit(acidFilePath, null, stripe.getOffset() + 50, stripe.getLength() - 100, new String[] { "localhost" }, null, false, true, getDeltaMetaDataWithBucketFile(1), fileLength, fileLength, root, null);
    validateKeyInterval(split, new RecordIdentifier(1, 1, 1), new RecordIdentifier(0, 0, 0), filterOn);
    // 2. Splits starting at a stripe boundary
    // A split that starts where the 1st stripe starts and ends before the 1st stripe ends
    stripe = stripes.get(0);
    split = new OrcSplit(acidFilePath, null, stripe.getOffset(), stripe.getLength() - 50, new String[] { "localhost" }, null, false, true, getDeltaMetaDataWithBucketFile(1), fileLength, fileLength, root, null);
    // The key interval for the 1st stripe
    if (columnStatsPresent) {
        validateKeyInterval(split, new RecordIdentifier(10000002, bucketProperty, 0), new RecordIdentifier(10000002, bucketProperty, 0), filterOn);
    } else {
        validateKeyInterval(split, null, new RecordIdentifier(10000002, bucketProperty, 0), filterOn);
    }
    // A split that starts where the 2nd stripe starts and ends after the 2nd stripe ends
    stripe = stripes.get(1);
    split = new OrcSplit(acidFilePath, null, stripe.getOffset(), stripe.getLength() + 50, new String[] { "localhost" }, null, false, true, getDeltaMetaDataWithBucketFile(1), fileLength, fileLength, root, null);
    // The key interval for the last 2 stripes
    validateKeyInterval(split, new RecordIdentifier(10000002, bucketProperty, 1), new RecordIdentifier(10000002, bucketProperty, 2), filterOn);
    // 3. Splits ending at a stripe boundary
    // A split that starts before the last stripe starts and ends at the last stripe boundary
    stripe = stripes.get(2);
    split = new OrcSplit(acidFilePath, null, stripe.getOffset() - 50, stripe.getLength() + 50, new String[] { "localhost" }, null, false, true, getDeltaMetaDataWithBucketFile(1), fileLength, fileLength, root, null);
    // The key interval for the last stripe
    validateKeyInterval(split, new RecordIdentifier(10000002, bucketProperty, 2), new RecordIdentifier(10000002, bucketProperty, 2), filterOn);
    // A split that starts after the 1st stripe starts and ends where the last stripe ends
    split = new OrcSplit(acidFilePath, null, stripes.get(0).getOffset() + 50, reader.getContentLength() - 50, new String[] { "localhost" }, null, false, true, getDeltaMetaDataWithBucketFile(1), fileLength, fileLength, root, null);
    // The key interval for the last 2 stripes
    validateKeyInterval(split, new RecordIdentifier(10000002, bucketProperty, 1), new RecordIdentifier(10000002, bucketProperty, 2), filterOn);
    // A split that starts where the 1st stripe starts and ends where the last stripe ends
    split = new OrcSplit(acidFilePath, null, stripes.get(0).getOffset(), reader.getContentLength(), new String[] { "localhost" }, null, false, true, getDeltaMetaDataWithBucketFile(1), fileLength, fileLength, root, null);
    // The key interval for all 3 stripes
    if (columnStatsPresent) {
        validateKeyInterval(split, new RecordIdentifier(10000002, bucketProperty, 0), new RecordIdentifier(10000002, bucketProperty, 2), filterOn);
    } else {
        validateKeyInterval(split, null, new RecordIdentifier(10000002, bucketProperty, 2), filterOn);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) AcidOutputFormat(org.apache.hadoop.hive.ql.io.AcidOutputFormat) RecordIdentifier(org.apache.hadoop.hive.ql.io.RecordIdentifier) RecordUpdater(org.apache.hadoop.hive.ql.io.RecordUpdater) StripeInformation(org.apache.orc.StripeInformation)
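
The (1, 1, 1)/(0, 0, 0) pairs above mark a deliberately empty key interval: a split that lies strictly inside a stripe contains no stripe start, so there are no rows whose delete events need loading, and the "min" key is chosen to sort after the "max" key. A minimal standalone sketch of that convention with illustrative values (the real middle component comes from BucketCodec.V1.encode(options)):

import org.apache.hadoop.hive.ql.io.RecordIdentifier;

// Illustrative sketch, not part of the Hive test: RecordIdentifier orders by
// (writeId, bucketProperty, rowId), so a min key that sorts after the max key
// describes an interval that no delete event can fall into.
public class EmptyKeyIntervalSketch {
    public static void main(String[] args) {
        RecordIdentifier minKey = new RecordIdentifier(1, 1, 1);
        RecordIdentifier maxKey = new RecordIdentifier(0, 0, 0);
        System.out.println("interval is empty: " + (minKey.compareTo(maxKey) > 0));   // true

        // A non-empty interval with small illustrative numbers.
        int bucketProperty = 1;
        RecordIdentifier lo = new RecordIdentifier(10000002L, bucketProperty, 1);
        RecordIdentifier hi = new RecordIdentifier(10000002L, bucketProperty, 2);
        RecordIdentifier candidate = new RecordIdentifier(10000002L, bucketProperty, 2);
        boolean inRange = candidate.compareTo(lo) >= 0 && candidate.compareTo(hi) <= 0;
        System.out.println("candidate falls inside [lo, hi]: " + inRange);            // true
    }
}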

Example 17 with RecordIdentifier

Use of org.apache.hadoop.hive.ql.io.RecordIdentifier in project hive by apache.

From class TestVectorizedOrcAcidRowBatchReader, method testDeleteEventOriginalFiltering.

public void testDeleteEventOriginalFiltering() throws Exception {
    boolean filterOn = HiveConf.getBoolVar(conf, HiveConf.ConfVars.FILTER_DELETE_EVENTS);
    conf.setBoolean(hive_metastoreConstants.TABLE_IS_TRANSACTIONAL, false);
    // Create 3 original files with 3 rows each
    Properties properties = new Properties();
    properties.setProperty("columns", DummyOriginalRow.getColumnNamesProperty());
    properties.setProperty("columns.types", DummyOriginalRow.getColumnTypesProperty());
    OrcFile.WriterOptions writerOptions = OrcFile.writerOptions(properties, conf);
    writerOptions.inspector(originalInspector);
    Path testFilePath = new Path(root, "000000_0");
    Writer writer = OrcFile.createWriter(testFilePath, writerOptions);
    writer.addRow(new DummyOriginalRow(0));
    writer.addRow(new DummyOriginalRow(0));
    writer.addRow(new DummyOriginalRow(0));
    writer.close();
    testFilePath = new Path(root, "000000_0_copy_1");
    writer = OrcFile.createWriter(testFilePath, writerOptions);
    writer.addRow(new DummyOriginalRow(0));
    writer.addRow(new DummyOriginalRow(0));
    writer.addRow(new DummyOriginalRow(0));
    writer.close();
    testFilePath = new Path(root, "000000_0_copy_2");
    writer = OrcFile.createWriter(testFilePath, writerOptions);
    writer.addRow(new DummyOriginalRow(0));
    writer.addRow(new DummyOriginalRow(0));
    writer.addRow(new DummyOriginalRow(0));
    writer.close();
    conf.setBoolean(hive_metastoreConstants.TABLE_IS_TRANSACTIONAL, true);
    conf.set(ValidTxnList.VALID_TXNS_KEY, new ValidReadTxnList(new long[0], new BitSet(), 1000, Long.MAX_VALUE).writeToString());
    int bucket = 0;
    AcidOutputFormat.Options options = new AcidOutputFormat.Options(conf).filesystem(fs).bucket(bucket)
        .writingBase(false).minimumWriteId(1).maximumWriteId(1).inspector(inspector)
        .reporter(Reporter.NULL).recordIdColumn(1).finalDestination(root);
    int bucketProperty = BucketCodec.V1.encode(options);
    RecordUpdater updater = new OrcRecordUpdater(root, options);
    // delete 1 row from each of the original files
    // Delete the last record in this split to test boundary conditions. It should not be present in the delete event
    // registry for the next split
    updater.delete(options.getMinimumWriteId(), new DummyRow(-1, 2, 0, bucket));
    // Delete the first record in this split to test boundary conditions. It should not be present in the delete event
    // registry for the previous split
    updater.delete(options.getMinimumWriteId(), new DummyRow(-1, 3, 0, bucket));
    updater.delete(options.getMinimumWriteId(), new DummyRow(-1, 7, 0, bucket));
    updater.close(false);
    // HWM is not important - just make sure deltas created above are read as if committed
    conf.set(ValidWriteIdList.VALID_WRITEIDS_KEY, "tbl:2:" + Long.MAX_VALUE + "::");
    // Set vector mode to true in the map work so that split generation recognizes this as a vectorized
    // execution. Without this, the offsets for the synthetic row ids are not computed.
    MapWork mapWork = new MapWork();
    mapWork.setVectorMode(true);
    VectorizedRowBatchCtx vrbContext = new VectorizedRowBatchCtx();
    mapWork.setVectorizedRowBatchCtx(vrbContext);
    HiveConf.setVar(conf, HiveConf.ConfVars.PLAN, "//tmp");
    Utilities.setMapWork(conf, mapWork);
    // now we have 3 delete events total, but for each split we should only
    // load 1 into DeleteRegistry (if filtering is on)
    List<OrcInputFormat.SplitStrategy<?>> splitStrategies = getSplitStrategies();
    assertEquals(1, splitStrategies.size());
    List<OrcSplit> splits = ((OrcInputFormat.ACIDSplitStrategy) splitStrategies.get(0)).getSplits();
    assertEquals(3, splits.size());
    assertEquals(root.toUri().toString() + File.separator + "000000_0", splits.get(0).getPath().toUri().toString());
    assertTrue(splits.get(0).isOriginal());
    assertEquals(root.toUri().toString() + File.separator + "000000_0_copy_1", splits.get(1).getPath().toUri().toString());
    assertTrue(splits.get(1).isOriginal());
    assertEquals(root.toUri().toString() + File.separator + "000000_0_copy_2", splits.get(2).getPath().toUri().toString());
    assertTrue(splits.get(2).isOriginal());
    VectorizedOrcAcidRowBatchReader vectorizedReader = new VectorizedOrcAcidRowBatchReader(splits.get(0), conf, Reporter.NULL, vrbContext);
    ColumnizedDeleteEventRegistry deleteEventRegistry = (ColumnizedDeleteEventRegistry) vectorizedReader.getDeleteEventRegistry();
    assertEquals("number of delete events for stripe 1", filterOn ? 1 : 3, deleteEventRegistry.size());
    OrcRawRecordMerger.KeyInterval keyInterval = vectorizedReader.getKeyInterval();
    if (filterOn) {
        assertEquals(new OrcRawRecordMerger.KeyInterval(new RecordIdentifier(0, bucketProperty, 0), new RecordIdentifier(0, bucketProperty, 2)), keyInterval);
    } else {
        assertEquals(new OrcRawRecordMerger.KeyInterval(null, null), keyInterval);
    }
    vectorizedReader = new VectorizedOrcAcidRowBatchReader(splits.get(1), conf, Reporter.NULL, vrbContext);
    deleteEventRegistry = (ColumnizedDeleteEventRegistry) vectorizedReader.getDeleteEventRegistry();
    assertEquals("number of delete events for stripe 2", filterOn ? 1 : 3, deleteEventRegistry.size());
    keyInterval = vectorizedReader.getKeyInterval();
    if (filterOn) {
        assertEquals(new OrcRawRecordMerger.KeyInterval(new RecordIdentifier(0, bucketProperty, 3), new RecordIdentifier(0, bucketProperty, 5)), keyInterval);
    } else {
        assertEquals(new OrcRawRecordMerger.KeyInterval(null, null), keyInterval);
    }
    vectorizedReader = new VectorizedOrcAcidRowBatchReader(splits.get(2), conf, Reporter.NULL, vrbContext);
    deleteEventRegistry = (ColumnizedDeleteEventRegistry) vectorizedReader.getDeleteEventRegistry();
    assertEquals("number of delete events for stripe 3", filterOn ? 1 : 3, deleteEventRegistry.size());
    keyInterval = vectorizedReader.getKeyInterval();
    if (filterOn) {
        assertEquals(new OrcRawRecordMerger.KeyInterval(new RecordIdentifier(0, bucketProperty, 6), new RecordIdentifier(0, bucketProperty, 8)), keyInterval);
    } else {
        assertEquals(new OrcRawRecordMerger.KeyInterval(null, null), keyInterval);
    }
}
Also used : Properties(java.util.Properties) ValidReadTxnList(org.apache.hadoop.hive.common.ValidReadTxnList) AcidOutputFormat(org.apache.hadoop.hive.ql.io.AcidOutputFormat) RecordIdentifier(org.apache.hadoop.hive.ql.io.RecordIdentifier) RecordUpdater(org.apache.hadoop.hive.ql.io.RecordUpdater) Path(org.apache.hadoop.fs.Path) BitSet(java.util.BitSet) VectorizedRowBatchCtx(org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) ColumnizedDeleteEventRegistry(org.apache.hadoop.hive.ql.io.orc.VectorizedOrcAcidRowBatchReader.ColumnizedDeleteEventRegistry)
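
The asserted intervals follow from how row ids are synthesized for original (non-ACID) files: writeId 0, the encoded bucket property, and a row id offset by the rows of the preceding copy files, so the three files cover row ids 0-2, 3-5 and 6-8. A small sketch of that arithmetic (illustrative only; this is not the Hive split-generation code and bucketProperty is a placeholder):

import org.apache.hadoop.hive.ql.io.RecordIdentifier;

// Sketch of the synthetic row-id layout exercised by the test above.
public class SyntheticRowIdSketch {
    public static void main(String[] args) {
        int bucketProperty = 0;            // placeholder; the test uses BucketCodec.V1.encode(options)
        long[] rowsPerCopy = {3, 3, 3};    // 000000_0, 000000_0_copy_1, 000000_0_copy_2
        long offset = 0;
        for (int copy = 0; copy < rowsPerCopy.length; copy++) {
            RecordIdentifier first = new RecordIdentifier(0, bucketProperty, offset);
            RecordIdentifier last = new RecordIdentifier(0, bucketProperty, offset + rowsPerCopy[copy] - 1);
            System.out.println("copy file " + copy + " covers [" + first + ", " + last + "]");
            offset += rowsPerCopy[copy];
        }
    }
}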

Example 18 with RecordIdentifier

Use of org.apache.hadoop.hive.ql.io.RecordIdentifier in project hive by apache.

From class CompactorTest, method addFile.

private void addFile(Table t, Partition p, long minTxn, long maxTxn, int numRecords, FileType type, int numBuckets, boolean allBucketsPresent, long visibilityId) throws Exception {
    String partValue = (p == null) ? null : p.getValues().get(0);
    Path location = new Path(getLocation(t.getTableName(), partValue));
    String filename = null;
    switch(type) {
        case BASE:
            filename = AcidUtils.BASE_PREFIX + maxTxn + (visibilityId > 0 ? AcidUtils.VISIBILITY_PREFIX + visibilityId : "");
            break;
        // Fall through to delta
        case LENGTH_FILE:
        case DELTA:
            filename = makeDeltaDirName(minTxn, maxTxn);
            break;
        // handled below
        case LEGACY:
            break;
    }
    FileSystem fs = FileSystem.get(conf);
    for (int bucket = 0; bucket < numBuckets; bucket++) {
        // skip one
        if (bucket == 0 && !allBucketsPresent)
            continue;
        Path partFile = null;
        if (type == FileType.LEGACY) {
            partFile = new Path(location, String.format(AcidUtils.LEGACY_FILE_BUCKET_DIGITS, bucket) + "_0");
        } else {
            Path dir = new Path(location, filename);
            fs.mkdirs(dir);
            partFile = AcidUtils.createBucketFile(dir, bucket);
            if (type == FileType.LENGTH_FILE) {
                partFile = new Path(partFile.toString() + AcidUtils.DELTA_SIDE_FILE_SUFFIX);
            }
        }
        FSDataOutputStream out = fs.create(partFile);
        if (type == FileType.LENGTH_FILE) {
            // hmm - length files should store length in bytes...
            out.writeInt(numRecords);
        } else {
            for (int i = 0; i < numRecords; i++) {
                RecordIdentifier ri = new RecordIdentifier(maxTxn - 1, bucket, i);
                ri.write(out);
                out.writeBytes("mary had a little lamb its fleece was white as snow\n");
            }
        }
        out.close();
    }
}
Also used : Path(org.apache.hadoop.fs.Path) RecordIdentifier(org.apache.hadoop.hive.ql.io.RecordIdentifier) FileSystem(org.apache.hadoop.fs.FileSystem) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream)
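
Since addFile serializes each RecordIdentifier with write(out), reading the data back goes through the matching readFields() call from Hadoop's Writable contract. A minimal round-trip sketch (illustrative, assuming the public no-argument constructor that Writable implementations conventionally provide):

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

import org.apache.hadoop.hive.ql.io.RecordIdentifier;

// Round-trip sketch, not part of CompactorTest: write() pairs with readFields().
public class RecordIdentifierRoundTripSketch {
    public static void main(String[] args) throws Exception {
        RecordIdentifier written = new RecordIdentifier(41L, 0, 7L);
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        try (DataOutputStream out = new DataOutputStream(buffer)) {
            written.write(out);                              // same call as ri.write(out) in addFile
        }
        RecordIdentifier read = new RecordIdentifier();      // assumes the no-arg Writable constructor
        try (DataInputStream in = new DataInputStream(new ByteArrayInputStream(buffer.toByteArray()))) {
            read.readFields(in);
        }
        System.out.println("round-trip equal: " + written.equals(read));  // expected: true
    }
}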

Example 19 with RecordIdentifier

Use of org.apache.hadoop.hive.ql.io.RecordIdentifier in project hive by apache.

From class OrcRawRecordMerger, method discoverOriginalKeyBounds.

/**
   * Find the key range for original bucket files.
   * @param reader the reader
   * @param bucket the bucket number being read
   * @param options the read options, which supply the split's offset and max offset
   * @throws IOException
   */
private void discoverOriginalKeyBounds(Reader reader, int bucket, Reader.Options options) throws IOException {
    long rowLength = 0;
    long rowOffset = 0;
    long offset = options.getOffset();
    long maxOffset = options.getMaxOffset();
    boolean isTail = true;
    for (StripeInformation stripe : reader.getStripes()) {
        if (offset > stripe.getOffset()) {
            rowOffset += stripe.getNumberOfRows();
        } else if (maxOffset > stripe.getOffset()) {
            rowLength += stripe.getNumberOfRows();
        } else {
            isTail = false;
            break;
        }
    }
    if (rowOffset > 0) {
        minKey = new RecordIdentifier(0, bucket, rowOffset - 1);
    }
    if (!isTail) {
        maxKey = new RecordIdentifier(0, bucket, rowOffset + rowLength - 1);
    }
}
Also used : RecordIdentifier(org.apache.hadoop.hive.ql.io.RecordIdentifier) StripeInformation(org.apache.orc.StripeInformation)
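
A quick numeric walk-through of the loop above, with made-up stripe data rather than a real Reader: three 100-row stripes and a split that covers only the middle one leave rowOffset at 100 and rowLength at 100, so minKey gets row id 99 and maxKey gets row id 199.

import org.apache.hadoop.hive.ql.io.RecordIdentifier;

// Standalone sketch of discoverOriginalKeyBounds' arithmetic with illustrative numbers.
public class OriginalKeyBoundsSketch {
    public static void main(String[] args) {
        long[] stripeOffsets = {3, 1_000, 2_000};   // byte offset of each stripe (illustrative)
        long[] stripeRows = {100, 100, 100};        // rows per stripe (illustrative)
        long splitOffset = 1_000;                   // the split covers exactly the middle stripe
        long splitMaxOffset = 2_000;
        int bucket = 0;

        long rowOffset = 0, rowLength = 0;
        boolean isTail = true;
        for (int i = 0; i < stripeOffsets.length; i++) {
            if (splitOffset > stripeOffsets[i]) {
                rowOffset += stripeRows[i];         // rows entirely before the split
            } else if (splitMaxOffset > stripeOffsets[i]) {
                rowLength += stripeRows[i];         // rows covered by the split
            } else {
                isTail = false;                     // there are stripes after the split
                break;
            }
        }
        RecordIdentifier minKey = rowOffset > 0 ? new RecordIdentifier(0, bucket, rowOffset - 1) : null;
        RecordIdentifier maxKey = !isTail ? new RecordIdentifier(0, bucket, rowOffset + rowLength - 1) : null;
        System.out.println("minKey=" + minKey + " maxKey=" + maxKey);  // row ids 99 and 199
    }
}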

Example 20 with RecordIdentifier

Use of org.apache.hadoop.hive.ql.io.RecordIdentifier in project hive by apache.

From class TestRecordInspectorImpl, method testExtractRecordIdentifier.

@Test
public void testExtractRecordIdentifier() {
    RecordIdentifier recordIdentifier = new RecordIdentifier(10L, 4, 20L);
    MutableRecord record = new MutableRecord(1, "hello", recordIdentifier);
    assertThat(inspector.extractRecordIdentifier(record), is(recordIdentifier));
}
Also used : RecordIdentifier(org.apache.hadoop.hive.ql.io.RecordIdentifier) MutableRecord(org.apache.hive.hcatalog.streaming.mutate.MutableRecord) Test(org.junit.Test)
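
RecordIdentifier compares by value, which the KeyInterval assertions in Example 17 already rely on. A hypothetical standalone test making that property explicit (a sketch, not part of the Hive or HCatalog suites):

import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.MatcherAssert.assertThat;

import org.apache.hadoop.hive.ql.io.RecordIdentifier;
import org.junit.Test;

// Sketch only: two identifiers built from the same (writeId, bucket, rowId) triple are equal,
// which is what lets an extracted identifier be matched with is(recordIdentifier).
public class RecordIdentifierEqualitySketch {
    @Test
    public void identifiersWithSameFieldsAreEqual() {
        assertThat(new RecordIdentifier(10L, 4, 20L), is(new RecordIdentifier(10L, 4, 20L)));
    }
}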

Aggregations

RecordIdentifier (org.apache.hadoop.hive.ql.io.RecordIdentifier)40 Test (org.junit.Test)13 Path (org.apache.hadoop.fs.Path)9 AcidOutputFormat (org.apache.hadoop.hive.ql.io.AcidOutputFormat)9 StripeInformation (org.apache.orc.StripeInformation)9 Configuration (org.apache.hadoop.conf.Configuration)7 BitSet (java.util.BitSet)5 FileSystem (org.apache.hadoop.fs.FileSystem)5 ValidReadTxnList (org.apache.hadoop.hive.common.ValidReadTxnList)5 ValidWriteIdList (org.apache.hadoop.hive.common.ValidWriteIdList)5 RecordUpdater (org.apache.hadoop.hive.ql.io.RecordUpdater)5 ValidReaderWriteIdList (org.apache.hadoop.hive.common.ValidReaderWriteIdList)4 Table (org.apache.hadoop.hive.metastore.api.Table)4 VectorizedRowBatchCtx (org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx)4 OrcInputFormat (org.apache.hadoop.hive.ql.io.orc.OrcInputFormat)4 ReaderKey (org.apache.hadoop.hive.ql.io.orc.OrcRawRecordMerger.ReaderKey)4 OrcStruct (org.apache.hadoop.hive.ql.io.orc.OrcStruct)4 StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector)4 ArrayList (java.util.ArrayList)3 AcidInputFormat (org.apache.hadoop.hive.ql.io.AcidInputFormat)3