Example 11 with RecordIdentifier

Use of org.apache.hadoop.hive.ql.io.RecordIdentifier in project hive by apache.

From the class VectorizedOrcAcidRowBatchReader, the method findOriginalMinMaxKeys:

private OrcRawRecordMerger.KeyInterval findOriginalMinMaxKeys(OrcSplit orcSplit, OrcTail orcTail, Reader.Options deleteEventReaderOptions) {
    if (syntheticProps == null) {
        // If there aren't any delete delta files, then we don't need this anyway.
        return new OrcRawRecordMerger.KeyInterval(null, null);
    }
    long splitStart = orcSplit.getStart();
    long splitEnd = orcSplit.getStart() + orcSplit.getLength();
    long minRowId = syntheticProps.getRowIdOffset();
    long maxRowId = syntheticProps.getRowIdOffset();
    for (StripeInformation stripe : orcTail.getStripes()) {
        if (splitStart > stripe.getOffset()) {
            // This stripe starts before the current split starts, so it is not included in this split.
            minRowId += stripe.getNumberOfRows();
        }
        if (splitEnd > stripe.getOffset()) {
            // This stripe starts before the current split ends, so its rows count toward the max row id.
            maxRowId += stripe.getNumberOfRows();
        } else {
            // Remaining stripes are not included in this split.
            break;
        }
    }
    RecordIdentifier minKey = new RecordIdentifier(syntheticProps.getSyntheticWriteId(), syntheticProps.getBucketProperty(), minRowId);
    RecordIdentifier maxKey = new RecordIdentifier(syntheticProps.getSyntheticWriteId(), syntheticProps.getBucketProperty(), maxRowId > 0 ? maxRowId - 1 : 0);
    OrcRawRecordMerger.KeyInterval keyIntervalTmp = new OrcRawRecordMerger.KeyInterval(minKey, maxKey);
    if (minRowId >= maxRowId) {
        /**
         * The split lies entirely within a single stripe. In this case, the reader for this
         * split will not read any data (see {@link org.apache.orc.impl.RecordReaderImpl}).
         * We can return the min/max key interval as is; it will not read any of the delete
         * delta records into memory.
         */
        LOG.info("findOriginalMinMaxKeys(): This split starts and ends in the same stripe.");
    }
    LOG.info("findOriginalMinMaxKeys(): " + keyIntervalTmp);
    // Using the min/max ROW__ID from the original file works for predicate push-down (PPD) to the
    // delete deltas because the write id is the same in the min and the max ROW__ID.
    setSARG(keyIntervalTmp, deleteEventReaderOptions, minKey.getBucketProperty(), maxKey.getBucketProperty(), minKey.getRowId(), maxKey.getRowId());
    return keyIntervalTmp;
}
Also used: RecordIdentifier (org.apache.hadoop.hive.ql.io.RecordIdentifier), StripeInformation (org.apache.orc.StripeInformation)
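
To make the stripe walk above concrete, here is a minimal, self-contained sketch of the same arithmetic. StripeStub and every other name in it are hypothetical stand-ins (the real type is org.apache.orc.StripeInformation); only the min/max row-id bookkeeping mirrors findOriginalMinMaxKeys(), which then subtracts one from the max to get the inclusive upper bound.

import java.util.List;

// A minimal sketch, not Hive code: StripeStub stands in for org.apache.orc.StripeInformation.
public class SyntheticRowIdRange {

    record StripeStub(long offset, long numberOfRows) {}

    // Returns {minRowId, maxRowIdExclusive} for the stripes whose start falls in [splitStart, splitEnd).
    static long[] rowIdRange(List<StripeStub> stripes, long rowIdOffset, long splitStart, long splitEnd) {
        long minRowId = rowIdOffset;
        long maxRowId = rowIdOffset;
        for (StripeStub stripe : stripes) {
            if (splitStart > stripe.offset()) {
                // Stripe starts before the split: all of its rows precede the split.
                minRowId += stripe.numberOfRows();
            }
            if (splitEnd > stripe.offset()) {
                // Stripe starts before the split ends: its rows count toward the maximum.
                maxRowId += stripe.numberOfRows();
            } else {
                break; // Later stripes cannot belong to this split.
            }
        }
        return new long[] { minRowId, maxRowId };
    }

    public static void main(String[] args) {
        // Two 1000-row stripes at byte offsets 0 and 5000; the split covers only the second.
        List<StripeStub> stripes = List.of(new StripeStub(0, 1000), new StripeStub(5000, 1000));
        long[] r = rowIdRange(stripes, 0, 5000, 10_000);
        System.out.println("minRowId=" + r[0] + ", maxRowId(exclusive)=" + r[1]); // 1000, 2000
    }
}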

Example 12 with RecordIdentifier

Use of org.apache.hadoop.hive.ql.io.RecordIdentifier in project hive by apache.

From the class FixAcidKeyIndex, the method validate:

public static AcidKeyIndexValidationResult validate(Configuration conf, Path inputPath) throws IOException {
    AcidKeyIndexValidationResult result = new AcidKeyIndexValidationResult();
    FileSystem fs = inputPath.getFileSystem(conf);
    try (Reader reader = OrcFile.createReader(fs, inputPath);
        RecordReader rr = reader.rows()) {
        List<StripeInformation> stripes = reader.getStripes();
        RecordIdentifier[] keyIndex = OrcRecordUpdater.parseKeyIndex(reader);
        StructObjectInspector soi = (StructObjectInspector) reader.getObjectInspector();
        // struct<operation:int,originalTransaction:bigint,bucket:int,rowId:bigint,currentTransaction:bigint,row:struct<...>>
        List<? extends StructField> structFields = soi.getAllStructFieldRefs();
        StructField transactionField = structFields.get(1);
        LongObjectInspector transactionOI = (LongObjectInspector) transactionField.getFieldObjectInspector();
        StructField bucketField = structFields.get(2);
        IntObjectInspector bucketOI = (IntObjectInspector) bucketField.getFieldObjectInspector();
        StructField rowIdField = structFields.get(3);
        LongObjectInspector rowIdOI = (LongObjectInspector) rowIdField.getFieldObjectInspector();
        long rowsProcessed = 0;
        for (int i = 0; i < stripes.size(); i++) {
            rowsProcessed += stripes.get(i).getNumberOfRows();
            rr.seekToRow(rowsProcessed - 1);
            OrcStruct row = (OrcStruct) rr.next(null);
            long lastTransaction = transactionOI.get(soi.getStructFieldData(row, transactionField));
            int lastBucket = bucketOI.get(soi.getStructFieldData(row, bucketField));
            long lastRowId = rowIdOI.get(soi.getStructFieldData(row, rowIdField));
            RecordIdentifier recordIdentifier = new RecordIdentifier(lastTransaction, lastBucket, lastRowId);
            result.recordIdentifiers.add(recordIdentifier);
            if (stripes.size() != keyIndex.length || keyIndex[i] == null || recordIdentifier.compareTo(keyIndex[i]) != 0) {
                result.isValid = false;
            }
        }
    }
    return result;
}
Also used: LongObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector), IntObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector), RecordIdentifier (org.apache.hadoop.hive.ql.io.RecordIdentifier), StructField (org.apache.hadoop.hive.serde2.objectinspector.StructField), FileSystem (org.apache.hadoop.fs.FileSystem), StripeInformation (org.apache.orc.StripeInformation), StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector)
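
The keyIndex consumed above comes from the hive.acid.key.index metadata string, which records one last-ROW__ID triple per stripe. Here is a rough sketch of what that parsing involves, assuming the "writeId,bucket,rowId;" layout visible in the mocked string in testNewBase (Example 14); KeyStub is a hypothetical stand-in for RecordIdentifier, not the real OrcRecordUpdater.parseKeyIndex implementation.

import java.util.ArrayList;
import java.util.List;

// A sketch, not Hive code: KeyStub stands in for org.apache.hadoop.hive.ql.io.RecordIdentifier.
public class KeyIndexSketch {

    record KeyStub(long writeId, int bucket, long rowId) {}

    // Parses entries of the form "writeId,bucket,rowId;..." into one key per stripe.
    static List<KeyStub> parse(String keyIndex) {
        List<KeyStub> keys = new ArrayList<>();
        for (String entry : keyIndex.split(";")) {
            if (entry.isEmpty()) {
                continue; // tolerate a trailing ';'
            }
            String[] parts = entry.split(",");
            keys.add(new KeyStub(Long.parseLong(parts[0]),
                    Integer.parseInt(parts[1]),
                    Long.parseLong(parts[2])));
        }
        return keys;
    }

    public static void main(String[] args) {
        // The same index string that testNewBase (Example 14) mocks for its reader.
        System.out.println(parse("10,20,30;40,50,60;40,50,61"));
    }
}

The validation loop above then compares the actual last RecordIdentifier of each stripe against the parsed entry at the same index; any mismatch marks the file's key index as invalid.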

Example 13 with RecordIdentifier

Use of org.apache.hadoop.hive.ql.io.RecordIdentifier in project hive by apache.

From the class TestOrcRawRecordMerger, the method testOriginalReaderPair:

@Test
public void testOriginalReaderPair() throws Exception {
    int BUCKET = 10;
    ReaderKey key = new ReaderKey();
    Configuration conf = new Configuration();
    int bucketProperty = OrcRawRecordMerger.encodeBucketId(conf, BUCKET, 0);
    Reader reader = createMockOriginalReader();
    RecordIdentifier minKey = new RecordIdentifier(0, bucketProperty, 1);
    RecordIdentifier maxKey = new RecordIdentifier(0, bucketProperty, 3);
    boolean[] includes = new boolean[] { true, true };
    FileSystem fs = FileSystem.getLocal(conf);
    Path root = new Path(tmpDir, "testOriginalReaderPair");
    fs.makeQualified(root);
    fs.create(root);
    ReaderPair pair = new OrcRawRecordMerger.OriginalReaderPairToRead(key, reader, BUCKET, minKey, maxKey, new Reader.Options().include(includes), new OrcRawRecordMerger.Options().rootPath(root), conf, new ValidReaderWriteIdList(), 0);
    RecordReader recordReader = pair.getRecordReader();
    assertEquals(0, key.getWriteId());
    assertEquals(bucketProperty, key.getBucketProperty());
    assertEquals(2, key.getRowId());
    assertEquals(0, key.getCurrentWriteId());
    assertEquals("third", value(pair.nextRecord()));
    pair.next(pair.nextRecord());
    assertEquals(0, key.getWriteId());
    assertEquals(bucketProperty, key.getBucketProperty());
    assertEquals(3, key.getRowId());
    assertEquals(0, key.getCurrentWriteId());
    assertEquals("fourth", value(pair.nextRecord()));
    pair.next(pair.nextRecord());
    assertEquals(null, pair.nextRecord());
    Mockito.verify(recordReader).close();
}
Also used: Path (org.apache.hadoop.fs.Path), Configuration (org.apache.hadoop.conf.Configuration), RecordIdentifier (org.apache.hadoop.hive.ql.io.RecordIdentifier), ReaderPair (org.apache.hadoop.hive.ql.io.orc.OrcRawRecordMerger.ReaderPair), FileSystem (org.apache.hadoop.fs.FileSystem), ReaderKey (org.apache.hadoop.hive.ql.io.orc.OrcRawRecordMerger.ReaderKey), ValidReaderWriteIdList (org.apache.hadoop.hive.common.ValidReaderWriteIdList), Test (org.junit.Test)
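
The assertions above hinge on how the pair applies its key bounds: the minimum key behaves as an exclusive bound and the maximum as inclusive, which is why the first surfaced row is "third" (rowId 2, just past minKey's rowId 1) and iteration ends after "fourth" (rowId 3, equal to maxKey's rowId). A toy illustration of that range check, with KeyStub again a hypothetical stand-in for RecordIdentifier and the exclusive/inclusive semantics stated as an assumption:

import java.util.List;

// A toy sketch, not Hive code: KeyStub stands in for RecordIdentifier.
public class MinMaxKeyFilter {

    record KeyStub(long writeId, int bucket, long rowId) implements Comparable<KeyStub> {
        @Override
        public int compareTo(KeyStub o) {
            int c = Long.compare(writeId, o.writeId);
            if (c == 0) c = Integer.compare(bucket, o.bucket);
            if (c == 0) c = Long.compare(rowId, o.rowId);
            return c;
        }
    }

    // Assumed semantics, matching the test's observable behavior: minKey exclusive, maxKey inclusive.
    static boolean inRange(KeyStub key, KeyStub minKey, KeyStub maxKey) {
        return key.compareTo(minKey) > 0 && key.compareTo(maxKey) <= 0;
    }

    public static void main(String[] args) {
        KeyStub min = new KeyStub(0, 10, 1);
        KeyStub max = new KeyStub(0, 10, 3);
        for (long rowId = 0; rowId < 5; rowId++) {
            KeyStub key = new KeyStub(0, 10, rowId);
            // Prints true only for rowIds 2 and 3, mirroring "third" and "fourth" above.
            System.out.println(key + " in range: " + inRange(key, min, max));
        }
    }
}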

Example 14 with RecordIdentifier

Use of org.apache.hadoop.hive.ql.io.RecordIdentifier in project hive by apache.

From the class TestOrcRawRecordMerger, the method testNewBase:

@Test
public void testNewBase() throws Exception {
    Configuration conf = new Configuration();
    conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, "col1");
    conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, "string");
    HiveConf.setBoolVar(conf, HiveConf.ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN, true);
    Reader reader = Mockito.mock(Reader.class, settings);
    RecordReader recordReader = Mockito.mock(RecordReader.class, settings);
    List<OrcProto.Type> types = new ArrayList<OrcProto.Type>();
    OrcProto.Type.Builder typeBuilder = OrcProto.Type.newBuilder();
    typeBuilder.setKind(OrcProto.Type.Kind.STRUCT).addSubtypes(1).addSubtypes(2).addSubtypes(3).addSubtypes(4).addSubtypes(5).addSubtypes(6);
    typeBuilder.addAllFieldNames(Lists.newArrayList(OrcRecordUpdater.OPERATION_FIELD_NAME, OrcRecordUpdater.CURRENT_WRITEID_FIELD_NAME, OrcRecordUpdater.BUCKET_FIELD_NAME, OrcRecordUpdater.ROW_ID_FIELD_NAME, OrcRecordUpdater.CURRENT_WRITEID_FIELD_NAME, OrcRecordUpdater.ROW_FIELD_NAME));
    types.add(typeBuilder.build());
    types.add(null);
    types.add(null);
    types.add(null);
    types.add(null);
    types.add(null);
    typeBuilder.clearSubtypes();
    typeBuilder.addSubtypes(7);
    typeBuilder.addAllFieldNames(Lists.newArrayList("col1"));
    types.add(typeBuilder.build());
    typeBuilder.clear();
    typeBuilder.setKind(OrcProto.Type.Kind.STRING);
    types.add(typeBuilder.build());
    when(reader.getTypes()).thenReturn(types);
    when(reader.rowsOptions(any(Reader.Options.class), any())).thenReturn(recordReader);
    OrcStruct row1 = new OrcStruct(OrcRecordUpdater.FIELDS);
    setRow(row1, OrcRecordUpdater.INSERT_OPERATION, 10, 20, 20, 100, "first");
    OrcStruct row2 = new OrcStruct(OrcRecordUpdater.FIELDS);
    setRow(row2, OrcRecordUpdater.INSERT_OPERATION, 10, 20, 30, 110, "second");
    OrcStruct row3 = new OrcStruct(OrcRecordUpdater.FIELDS);
    setRow(row3, OrcRecordUpdater.INSERT_OPERATION, 10, 20, 40, 120, "third");
    OrcStruct row4 = new OrcStruct(OrcRecordUpdater.FIELDS);
    setRow(row4, OrcRecordUpdater.INSERT_OPERATION, 40, 50, 60, 130, "fourth");
    OrcStruct row5 = new OrcStruct(OrcRecordUpdater.FIELDS);
    setRow(row5, OrcRecordUpdater.INSERT_OPERATION, 40, 50, 61, 140, "fifth");
    when(recordReader.hasNext()).thenReturn(true, true, true, true, true, false);
    when(recordReader.getProgress()).thenReturn(1.0f);
    when(recordReader.next(null)).thenReturn(row1, row4);
    when(recordReader.next(row1)).thenReturn(row2);
    when(recordReader.next(row2)).thenReturn(row3);
    when(recordReader.next(row3)).thenReturn(row5);
    when(reader.hasMetadataValue(OrcRecordUpdater.ACID_KEY_INDEX_NAME)).thenReturn(true);
    when(reader.getMetadataValue(OrcRecordUpdater.ACID_KEY_INDEX_NAME)).thenReturn(ByteBuffer.wrap("10,20,30;40,50,60;40,50,61".getBytes("UTF-8")));
    when(reader.getStripes()).thenReturn(createStripes(2, 2, 1));
    OrcRawRecordMerger merger = new OrcRawRecordMerger(conf, false, reader, false, 10, createMaximalTxnList(), new Reader.Options().range(1000, 1000), null, new OrcRawRecordMerger.Options());
    RecordReader rr = merger.getCurrentReader().getRecordReader();
    assertEquals(0, merger.getOtherReaders().size());
    assertEquals("" + merger.getMinKey(), new RecordIdentifier(10, 20, 30), merger.getMinKey());
    assertEquals("" + merger.getMaxKey(), new RecordIdentifier(40, 50, 60), merger.getMaxKey());
    RecordIdentifier id = merger.createKey();
    OrcStruct event = merger.createValue();
    assertEquals(true, merger.next(id, event));
    assertEquals(10, id.getWriteId());
    assertEquals(20, id.getBucketProperty());
    assertEquals(40, id.getRowId());
    assertEquals("third", getValue(event));
    assertEquals(true, merger.next(id, event));
    assertEquals(40, id.getWriteId());
    assertEquals(50, id.getBucketProperty());
    assertEquals(60, id.getRowId());
    assertEquals("fourth", getValue(event));
    assertEquals(false, merger.next(id, event));
    assertEquals(1.0, merger.getProgress(), 0.01);
    merger.close();
    Mockito.verify(rr).close();
    Mockito.verify(rr).getProgress();
    StructObjectInspector eventObjectInspector = (StructObjectInspector) merger.getObjectInspector();
    List<? extends StructField> fields = eventObjectInspector.getAllStructFieldRefs();
    assertEquals(OrcRecordUpdater.FIELDS, fields.size());
    assertEquals(OrcRecordUpdater.OPERATION_FIELD_NAME, fields.get(OrcRecordUpdater.OPERATION).getFieldName());
    assertEquals(OrcRecordUpdater.CURRENT_WRITEID_FIELD_NAME, fields.get(OrcRecordUpdater.CURRENT_WRITEID).getFieldName());
    assertEquals(OrcRecordUpdater.ORIGINAL_WRITEID_FIELD_NAME, fields.get(OrcRecordUpdater.ORIGINAL_WRITEID).getFieldName());
    assertEquals(OrcRecordUpdater.BUCKET_FIELD_NAME, fields.get(OrcRecordUpdater.BUCKET).getFieldName());
    assertEquals(OrcRecordUpdater.ROW_ID_FIELD_NAME, fields.get(OrcRecordUpdater.ROW_ID).getFieldName());
    StructObjectInspector rowObjectInspector = (StructObjectInspector) fields.get(OrcRecordUpdater.ROW).getFieldObjectInspector();
    assertEquals("col1", rowObjectInspector.getAllStructFieldRefs().get(0).getFieldName());
}
Also used: Configuration (org.apache.hadoop.conf.Configuration), OrcProto (org.apache.orc.OrcProto), ArrayList (java.util.ArrayList), RecordIdentifier (org.apache.hadoop.hive.ql.io.RecordIdentifier), StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector), Test (org.junit.Test)
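
The minKey/maxKey assertions above fall out of combining the mocked stripes with the mocked acid key index: the merger effectively takes the last key of the last stripe before the split as its (exclusive) minimum, and the last key of the final stripe starting inside the split as its (inclusive) maximum. A simplified sketch of that selection, assuming the three mocked stripes start at byte offsets 0, 1000, and 2000 and never straddle the split boundaries; StripeStub and KeyStub are hypothetical stand-ins, not the merger's real discovery code.

import java.util.List;

// A simplified sketch, not Hive code: one key-index entry per stripe is assumed.
public class KeyBoundsSketch {

    record StripeStub(long offset) {}
    record KeyStub(long writeId, int bucket, long rowId) {}

    // Returns {minKey, maxKey}: minKey is the last key of the last stripe before the split
    // (null if the split starts at the first stripe), maxKey is the last key of the final
    // stripe that starts inside [splitStart, splitEnd).
    static KeyStub[] keyBounds(List<StripeStub> stripes, List<KeyStub> lastKeyPerStripe,
                               long splitStart, long splitEnd) {
        KeyStub min = null;
        KeyStub max = null;
        for (int i = 0; i < stripes.size(); i++) {
            long start = stripes.get(i).offset();
            if (start < splitStart) {
                min = lastKeyPerStripe.get(i);
            }
            if (start < splitEnd) {
                max = lastKeyPerStripe.get(i);
            }
        }
        return new KeyStub[] { min, max };
    }

    public static void main(String[] args) {
        // Three stripes and the key index "10,20,30;40,50,60;40,50,61"; split range is [1000, 2000).
        List<StripeStub> stripes = List.of(new StripeStub(0), new StripeStub(1000), new StripeStub(2000));
        List<KeyStub> keys = List.of(new KeyStub(10, 20, 30), new KeyStub(40, 50, 60), new KeyStub(40, 50, 61));
        KeyStub[] bounds = keyBounds(stripes, keys, 1000, 2000);
        // Prints min=(10,20,30) and max=(40,50,60), matching the assertions in the test.
        System.out.println("min=" + bounds[0] + ", max=" + bounds[1]);
    }
}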

Example 15 with RecordIdentifier

Use of org.apache.hadoop.hive.ql.io.RecordIdentifier in project hive by apache.

From the class TestOrcRawRecordMerger, the method testOrdering:

@Test
public void testOrdering() throws Exception {
    ReaderKey left = new ReaderKey(100, 200, 1200, 300);
    ReaderKey right = new ReaderKey();
    right.setValues(100, 200, 1000, 200, false);
    assertTrue(right.compareTo(left) < 0);
    assertTrue(left.compareTo(right) > 0);
    assertEquals(false, left.equals(right));
    left.set(right);
    assertTrue(right.compareTo(left) == 0);
    assertEquals(true, right.equals(left));
    right.setRowId(2000);
    assertTrue(right.compareTo(left) > 0);
    left.setValues(1, 2, 3, 4, false);
    right.setValues(100, 2, 3, 4, false);
    assertTrue(left.compareTo(right) < 0);
    assertTrue(right.compareTo(left) > 0);
    left.setValues(1, 2, 3, 4, false);
    right.setValues(1, 100, 3, 4, false);
    assertTrue(left.compareTo(right) < 0);
    assertTrue(right.compareTo(left) > 0);
    left.setValues(1, 2, 3, 100, false);
    right.setValues(1, 2, 3, 4, false);
    assertTrue(left.compareTo(right) < 0);
    assertTrue(right.compareTo(left) > 0);
    // ensure that we are consistent when comparing to the base class
    RecordIdentifier ri = new RecordIdentifier(1, 2, 3);
    assertEquals(1, ri.compareTo(left));
    assertEquals(-1, left.compareTo(ri));
    assertEquals(false, ri.equals(left));
    assertEquals(false, left.equals(ri));
}
Also used: RecordIdentifier (org.apache.hadoop.hive.ql.io.RecordIdentifier), ReaderKey (org.apache.hadoop.hive.ql.io.orc.OrcRawRecordMerger.ReaderKey), Test (org.junit.Test)
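
The last block of assertions encodes the one twist in ReaderKey's ordering over plain RecordIdentifier: keys with equal (writeId, bucket, rowId) tie-break descending on currentWriteId, so the most recent event for a row sorts first. A condensed sketch of that comparator; ReaderKeyStub is a hypothetical stand-in that ignores the boolean (delete-event) flag setValues() also takes.

// A condensed sketch, not Hive code: ReaderKeyStub stands in for OrcRawRecordMerger.ReaderKey.
public class ReaderKeyOrdering {

    record ReaderKeyStub(long writeId, int bucket, long rowId, long currentWriteId)
            implements Comparable<ReaderKeyStub> {
        @Override
        public int compareTo(ReaderKeyStub o) {
            int c = Long.compare(writeId, o.writeId);
            if (c == 0) c = Integer.compare(bucket, o.bucket);
            if (c == 0) c = Long.compare(rowId, o.rowId);
            if (c == 0) c = Long.compare(o.currentWriteId, currentWriteId); // descending: newest first
            return c;
        }
    }

    public static void main(String[] args) {
        ReaderKeyStub newer = new ReaderKeyStub(1, 2, 3, 100);
        ReaderKeyStub older = new ReaderKeyStub(1, 2, 3, 4);
        // Mirrors the test: the key with the higher currentWriteId compares as smaller.
        System.out.println(newer.compareTo(older) < 0); // true
    }
}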

Aggregations

RecordIdentifier (org.apache.hadoop.hive.ql.io.RecordIdentifier): 40 usages
Test (org.junit.Test): 13 usages
Path (org.apache.hadoop.fs.Path): 9 usages
AcidOutputFormat (org.apache.hadoop.hive.ql.io.AcidOutputFormat): 9 usages
StripeInformation (org.apache.orc.StripeInformation): 9 usages
Configuration (org.apache.hadoop.conf.Configuration): 7 usages
BitSet (java.util.BitSet): 5 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 5 usages
ValidReadTxnList (org.apache.hadoop.hive.common.ValidReadTxnList): 5 usages
ValidWriteIdList (org.apache.hadoop.hive.common.ValidWriteIdList): 5 usages
RecordUpdater (org.apache.hadoop.hive.ql.io.RecordUpdater): 5 usages
ValidReaderWriteIdList (org.apache.hadoop.hive.common.ValidReaderWriteIdList): 4 usages
Table (org.apache.hadoop.hive.metastore.api.Table): 4 usages
VectorizedRowBatchCtx (org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx): 4 usages
OrcInputFormat (org.apache.hadoop.hive.ql.io.orc.OrcInputFormat): 4 usages
ReaderKey (org.apache.hadoop.hive.ql.io.orc.OrcRawRecordMerger.ReaderKey): 4 usages
OrcStruct (org.apache.hadoop.hive.ql.io.orc.OrcStruct): 4 usages
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector): 4 usages
ArrayList (java.util.ArrayList): 3 usages
AcidInputFormat (org.apache.hadoop.hive.ql.io.AcidInputFormat): 3 usages