Use of org.apache.hadoop.hive.ql.io.RecordIdentifier in project hive by apache.
The class VectorizedOrcAcidRowBatchReader, method findOriginalMinMaxKeys.
private OrcRawRecordMerger.KeyInterval findOriginalMinMaxKeys(OrcSplit orcSplit, OrcTail orcTail,
    Reader.Options deleteEventReaderOptions) {
  if (syntheticProps == null) {
    // If there aren't any delete delta files, then we don't need this anyway.
    return new OrcRawRecordMerger.KeyInterval(null, null);
  }
  long splitStart = orcSplit.getStart();
  long splitEnd = orcSplit.getStart() + orcSplit.getLength();
  long minRowId = syntheticProps.getRowIdOffset();
  long maxRowId = syntheticProps.getRowIdOffset();
  for (StripeInformation stripe : orcTail.getStripes()) {
    if (splitStart > stripe.getOffset()) {
      // This stripe starts before the current split starts. This stripe is not included in this split.
      minRowId += stripe.getNumberOfRows();
    }
    if (splitEnd > stripe.getOffset()) {
      // This stripe starts before the current split ends.
      maxRowId += stripe.getNumberOfRows();
    } else {
      // Remaining stripes are not included in this split.
      break;
    }
  }
  RecordIdentifier minKey = new RecordIdentifier(syntheticProps.getSyntheticWriteId(),
      syntheticProps.getBucketProperty(), minRowId);
  RecordIdentifier maxKey = new RecordIdentifier(syntheticProps.getSyntheticWriteId(),
      syntheticProps.getBucketProperty(), maxRowId > 0 ? maxRowId - 1 : 0);
  OrcRawRecordMerger.KeyInterval keyIntervalTmp = new OrcRawRecordMerger.KeyInterval(minKey, maxKey);
  if (minRowId >= maxRowId) {
    /**
     * The split lies entirely within a single stripe. In this case, the reader for this split will not
     * read any data. See {@link org.apache.orc.impl.RecordReaderImpl#RecordReaderImpl}.
     * We can return the min/max key interval as is (it will not read any of the delete delta records
     * into memory).
     */
    LOG.info("findOriginalMinMaxKeys(): This split starts and ends in the same stripe.");
  }
  LOG.info("findOriginalMinMaxKeys(): " + keyIntervalTmp);
  // Using the min/max ROW__ID from the original file works for PPD to the delete deltas because the
  // write id is the same in the min and the max ROW__ID.
  setSARG(keyIntervalTmp, deleteEventReaderOptions, minKey.getBucketProperty(), maxKey.getBucketProperty(),
      minKey.getRowId(), maxKey.getRowId());
  return keyIntervalTmp;
}
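The interval above is derived purely from the ORC stripe row counts plus the synthetic ROW__ID offset computed for original (pre-ACID) files. As a minimal sketch with made-up numbers (not code from the project), a split that skips the first stripe of such a file and covers the next two stripes would produce the following interval, using only the RecordIdentifier and KeyInterval constructors shown above:

  // Hypothetical stripe layout: 1000, 1000 and 500 rows; the split covers stripes 1 and 2.
  long rowIdOffset = 0L;           // stands in for syntheticProps.getRowIdOffset()
  long syntheticWriteId = 1L;      // stands in for syntheticProps.getSyntheticWriteId()
  int bucketProperty = 536870912;  // stands in for syntheticProps.getBucketProperty()
  long[] stripeRows = { 1000L, 1000L, 500L };
  long minRowId = rowIdOffset + stripeRows[0];                                  // stripe 0 precedes splitStart
  long maxRowId = rowIdOffset + stripeRows[0] + stripeRows[1] + stripeRows[2];  // all stripes before splitEnd
  RecordIdentifier minKey = new RecordIdentifier(syntheticWriteId, bucketProperty, minRowId);
  RecordIdentifier maxKey = new RecordIdentifier(syntheticWriteId, bucketProperty, maxRowId - 1);
  OrcRawRecordMerger.KeyInterval interval = new OrcRawRecordMerger.KeyInterval(minKey, maxKey);

The resulting interval is then pushed down to the delete-delta readers via setSARG, which is safe here because both keys carry the same synthetic write id.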
Use of org.apache.hadoop.hive.ql.io.RecordIdentifier in project hive by apache.
The class FixAcidKeyIndex, method validate.
public static AcidKeyIndexValidationResult validate(Configuration conf, Path inputPath) throws IOException {
  AcidKeyIndexValidationResult result = new AcidKeyIndexValidationResult();
  FileSystem fs = inputPath.getFileSystem(conf);
  try (Reader reader = OrcFile.createReader(fs, inputPath);
       RecordReader rr = reader.rows()) {
    List<StripeInformation> stripes = reader.getStripes();
    RecordIdentifier[] keyIndex = OrcRecordUpdater.parseKeyIndex(reader);
    StructObjectInspector soi = (StructObjectInspector) reader.getObjectInspector();
    // struct<operation:int,originalTransaction:bigint,bucket:int,rowId:bigint,currentTransaction:bigint
    List<? extends StructField> structFields = soi.getAllStructFieldRefs();
    StructField transactionField = structFields.get(1);
    LongObjectInspector transactionOI = (LongObjectInspector) transactionField.getFieldObjectInspector();
    StructField bucketField = structFields.get(2);
    IntObjectInspector bucketOI = (IntObjectInspector) bucketField.getFieldObjectInspector();
    StructField rowIdField = structFields.get(3);
    LongObjectInspector rowIdOI = (LongObjectInspector) rowIdField.getFieldObjectInspector();
    long rowsProcessed = 0;
    for (int i = 0; i < stripes.size(); i++) {
      rowsProcessed += stripes.get(i).getNumberOfRows();
      // Read the last row of each stripe and compare it against that stripe's key index entry.
      rr.seekToRow(rowsProcessed - 1);
      OrcStruct row = (OrcStruct) rr.next(null);
      long lastTransaction = transactionOI.get(soi.getStructFieldData(row, transactionField));
      int lastBucket = bucketOI.get(soi.getStructFieldData(row, bucketField));
      long lastRowId = rowIdOI.get(soi.getStructFieldData(row, rowIdField));
      RecordIdentifier recordIdentifier = new RecordIdentifier(lastTransaction, lastBucket, lastRowId);
      result.recordIdentifiers.add(recordIdentifier);
      if (stripes.size() != keyIndex.length || keyIndex[i] == null
          || recordIdentifier.compareTo(keyIndex[i]) != 0) {
        result.isValid = false;
      }
    }
  }
  return result;
}
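The validator relies on RecordIdentifier's field-by-field comparison: a stripe passes only when the (writeId, bucket, rowId) triple read from its last row is identical to the corresponding entry parsed by OrcRecordUpdater.parseKeyIndex. A small sketch with illustrative values (not part of FixAcidKeyIndex):

  RecordIdentifier lastRowOfStripe = new RecordIdentifier(10L, 536870912, 999L);
  RecordIdentifier indexEntry = new RecordIdentifier(10L, 536870912, 999L);
  boolean matches = lastRowOfStripe.compareTo(indexEntry) == 0;  // true: index entry agrees with the data
  RecordIdentifier staleEntry = new RecordIdentifier(10L, 536870912, 499L);
  boolean stale = lastRowOfStripe.compareTo(staleEntry) != 0;    // true: a mismatch makes isValid false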
Use of org.apache.hadoop.hive.ql.io.RecordIdentifier in project hive by apache.
The class TestOrcRawRecordMerger, method testOriginalReaderPair.
@Test
public void testOriginalReaderPair() throws Exception {
  int BUCKET = 10;
  ReaderKey key = new ReaderKey();
  Configuration conf = new Configuration();
  int bucketProperty = OrcRawRecordMerger.encodeBucketId(conf, BUCKET, 0);
  Reader reader = createMockOriginalReader();
  RecordIdentifier minKey = new RecordIdentifier(0, bucketProperty, 1);
  RecordIdentifier maxKey = new RecordIdentifier(0, bucketProperty, 3);
  boolean[] includes = new boolean[] { true, true };
  FileSystem fs = FileSystem.getLocal(conf);
  Path root = new Path(tmpDir, "testOriginalReaderPair");
  fs.makeQualified(root);
  fs.create(root);
  ReaderPair pair = new OrcRawRecordMerger.OriginalReaderPairToRead(key, reader, BUCKET, minKey, maxKey,
      new Reader.Options().include(includes), new OrcRawRecordMerger.Options().rootPath(root), conf,
      new ValidReaderWriteIdList(), 0);
  RecordReader recordReader = pair.getRecordReader();
  assertEquals(0, key.getWriteId());
  assertEquals(bucketProperty, key.getBucketProperty());
  assertEquals(2, key.getRowId());
  assertEquals(0, key.getCurrentWriteId());
  assertEquals("third", value(pair.nextRecord()));
  pair.next(pair.nextRecord());
  assertEquals(0, key.getWriteId());
  assertEquals(bucketProperty, key.getBucketProperty());
  assertEquals(3, key.getRowId());
  assertEquals(0, key.getCurrentWriteId());
  assertEquals("fourth", value(pair.nextRecord()));
  pair.next(pair.nextRecord());
  assertEquals(null, pair.nextRecord());
  Mockito.verify(recordReader).close();
}
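Judging from the assertions, OriginalReaderPairToRead treats the min key as an exclusive bound and the max key as an inclusive one: with a minKey rowId of 1 and a maxKey rowId of 3, it returns the rows with rowIds 2 and 3 and then signals the end with a null nextRecord(). A predicate with the same effect on RecordIdentifier keys (an illustration of the observed behaviour, not the class's implementation):

  static boolean inKeyRange(RecordIdentifier key, RecordIdentifier minKey, RecordIdentifier maxKey) {
    return (minKey == null || key.compareTo(minKey) > 0)
        && (maxKey == null || key.compareTo(maxKey) <= 0);
  }
  // inKeyRange(new RecordIdentifier(0, bucketProperty, 1), minKey, maxKey) -> false (at the min bound, skipped)
  // inKeyRange(new RecordIdentifier(0, bucketProperty, 2), minKey, maxKey) -> true  ("third")
  // inKeyRange(new RecordIdentifier(0, bucketProperty, 3), minKey, maxKey) -> true  ("fourth")
  // inKeyRange(new RecordIdentifier(0, bucketProperty, 4), minKey, maxKey) -> false (past the max bound)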
Use of org.apache.hadoop.hive.ql.io.RecordIdentifier in project hive by apache.
The class TestOrcRawRecordMerger, method testNewBase.
@Test
public void testNewBase() throws Exception {
  Configuration conf = new Configuration();
  conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, "col1");
  conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, "string");
  HiveConf.setBoolVar(conf, HiveConf.ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN, true);
  Reader reader = Mockito.mock(Reader.class, settings);
  RecordReader recordReader = Mockito.mock(RecordReader.class, settings);
  List<OrcProto.Type> types = new ArrayList<OrcProto.Type>();
  OrcProto.Type.Builder typeBuilder = OrcProto.Type.newBuilder();
  typeBuilder.setKind(OrcProto.Type.Kind.STRUCT).addSubtypes(1).addSubtypes(2).addSubtypes(3)
      .addSubtypes(4).addSubtypes(5).addSubtypes(6);
  typeBuilder.addAllFieldNames(Lists.newArrayList(OrcRecordUpdater.OPERATION_FIELD_NAME,
      OrcRecordUpdater.CURRENT_WRITEID_FIELD_NAME, OrcRecordUpdater.BUCKET_FIELD_NAME,
      OrcRecordUpdater.ROW_ID_FIELD_NAME, OrcRecordUpdater.CURRENT_WRITEID_FIELD_NAME,
      OrcRecordUpdater.ROW_FIELD_NAME));
  types.add(typeBuilder.build());
  types.add(null);
  types.add(null);
  types.add(null);
  types.add(null);
  types.add(null);
  typeBuilder.clearSubtypes();
  typeBuilder.addSubtypes(7);
  typeBuilder.addAllFieldNames(Lists.newArrayList("col1"));
  types.add(typeBuilder.build());
  typeBuilder.clear();
  typeBuilder.setKind(OrcProto.Type.Kind.STRING);
  types.add(typeBuilder.build());
  when(reader.getTypes()).thenReturn(types);
  when(reader.rowsOptions(any(Reader.Options.class), any())).thenReturn(recordReader);
  OrcStruct row1 = new OrcStruct(OrcRecordUpdater.FIELDS);
  setRow(row1, OrcRecordUpdater.INSERT_OPERATION, 10, 20, 20, 100, "first");
  OrcStruct row2 = new OrcStruct(OrcRecordUpdater.FIELDS);
  setRow(row2, OrcRecordUpdater.INSERT_OPERATION, 10, 20, 30, 110, "second");
  OrcStruct row3 = new OrcStruct(OrcRecordUpdater.FIELDS);
  setRow(row3, OrcRecordUpdater.INSERT_OPERATION, 10, 20, 40, 120, "third");
  OrcStruct row4 = new OrcStruct(OrcRecordUpdater.FIELDS);
  setRow(row4, OrcRecordUpdater.INSERT_OPERATION, 40, 50, 60, 130, "fourth");
  OrcStruct row5 = new OrcStruct(OrcRecordUpdater.FIELDS);
  setRow(row5, OrcRecordUpdater.INSERT_OPERATION, 40, 50, 61, 140, "fifth");
  when(recordReader.hasNext()).thenReturn(true, true, true, true, true, false);
  when(recordReader.getProgress()).thenReturn(1.0f);
  when(recordReader.next(null)).thenReturn(row1, row4);
  when(recordReader.next(row1)).thenReturn(row2);
  when(recordReader.next(row2)).thenReturn(row3);
  when(recordReader.next(row3)).thenReturn(row5);
  when(reader.hasMetadataValue(OrcRecordUpdater.ACID_KEY_INDEX_NAME)).thenReturn(true);
  when(reader.getMetadataValue(OrcRecordUpdater.ACID_KEY_INDEX_NAME))
      .thenReturn(ByteBuffer.wrap("10,20,30;40,50,60;40,50,61".getBytes("UTF-8")));
  when(reader.getStripes()).thenReturn(createStripes(2, 2, 1));
  OrcRawRecordMerger merger = new OrcRawRecordMerger(conf, false, reader, false, 10,
      createMaximalTxnList(), new Reader.Options().range(1000, 1000), null,
      new OrcRawRecordMerger.Options());
  RecordReader rr = merger.getCurrentReader().getRecordReader();
  assertEquals(0, merger.getOtherReaders().size());
  assertEquals("" + merger.getMinKey(), new RecordIdentifier(10, 20, 30), merger.getMinKey());
  assertEquals("" + merger.getMaxKey(), new RecordIdentifier(40, 50, 60), merger.getMaxKey());
  RecordIdentifier id = merger.createKey();
  OrcStruct event = merger.createValue();
  assertEquals(true, merger.next(id, event));
  assertEquals(10, id.getWriteId());
  assertEquals(20, id.getBucketProperty());
  assertEquals(40, id.getRowId());
  assertEquals("third", getValue(event));
  assertEquals(true, merger.next(id, event));
  assertEquals(40, id.getWriteId());
  assertEquals(50, id.getBucketProperty());
  assertEquals(60, id.getRowId());
  assertEquals("fourth", getValue(event));
  assertEquals(false, merger.next(id, event));
  assertEquals(1.0, merger.getProgress(), 0.01);
  merger.close();
  Mockito.verify(rr).close();
  Mockito.verify(rr).getProgress();
  StructObjectInspector eventObjectInspector = (StructObjectInspector) merger.getObjectInspector();
  List<? extends StructField> fields = eventObjectInspector.getAllStructFieldRefs();
  assertEquals(OrcRecordUpdater.FIELDS, fields.size());
  assertEquals(OrcRecordUpdater.OPERATION_FIELD_NAME, fields.get(OrcRecordUpdater.OPERATION).getFieldName());
  assertEquals(OrcRecordUpdater.CURRENT_WRITEID_FIELD_NAME,
      fields.get(OrcRecordUpdater.CURRENT_WRITEID).getFieldName());
  assertEquals(OrcRecordUpdater.ORIGINAL_WRITEID_FIELD_NAME,
      fields.get(OrcRecordUpdater.ORIGINAL_WRITEID).getFieldName());
  assertEquals(OrcRecordUpdater.BUCKET_FIELD_NAME, fields.get(OrcRecordUpdater.BUCKET).getFieldName());
  assertEquals(OrcRecordUpdater.ROW_ID_FIELD_NAME, fields.get(OrcRecordUpdater.ROW_ID).getFieldName());
  StructObjectInspector rowObjectInspector =
      (StructObjectInspector) fields.get(OrcRecordUpdater.ROW).getFieldObjectInspector();
  assertEquals("col1", rowObjectInspector.getAllStructFieldRefs().get(0).getFieldName());
}
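The mocked acid key index string above ("10,20,30;40,50,60;40,50,61") encodes one (writeId, bucket, rowId) triple per stripe, with stripes separated by ';' and fields by ','. The project parses this with OrcRecordUpdater.parseKeyIndex (used in FixAcidKeyIndex above); the standalone sketch below only illustrates the layout and how it maps onto RecordIdentifier, not the project's parser:

  static RecordIdentifier[] decodeKeyIndex(String keyIndex) {
    String[] stripeEntries = keyIndex.split(";");
    RecordIdentifier[] keys = new RecordIdentifier[stripeEntries.length];
    for (int i = 0; i < stripeEntries.length; i++) {
      String[] parts = stripeEntries[i].split(",");
      keys[i] = new RecordIdentifier(Long.parseLong(parts[0]),
          Integer.parseInt(parts[1]), Long.parseLong(parts[2]));
    }
    return keys;
  }
  // decodeKeyIndex("10,20,30;40,50,60;40,50,61") yields the last key of each of the three stripes,
  // which lines up with the min key (10,20,30) and max key (40,50,60) asserted for this split.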
Use of org.apache.hadoop.hive.ql.io.RecordIdentifier in project hive by apache.
The class TestOrcRawRecordMerger, method testOrdering.
@Test
public void testOrdering() throws Exception {
  ReaderKey left = new ReaderKey(100, 200, 1200, 300);
  ReaderKey right = new ReaderKey();
  right.setValues(100, 200, 1000, 200, false);
  assertTrue(right.compareTo(left) < 0);
  assertTrue(left.compareTo(right) > 0);
  assertEquals(false, left.equals(right));
  left.set(right);
  assertTrue(right.compareTo(left) == 0);
  assertEquals(true, right.equals(left));
  right.setRowId(2000);
  assertTrue(right.compareTo(left) > 0);
  left.setValues(1, 2, 3, 4, false);
  right.setValues(100, 2, 3, 4, false);
  assertTrue(left.compareTo(right) < 0);
  assertTrue(right.compareTo(left) > 0);
  left.setValues(1, 2, 3, 4, false);
  right.setValues(1, 100, 3, 4, false);
  assertTrue(left.compareTo(right) < 0);
  assertTrue(right.compareTo(left) > 0);
  left.setValues(1, 2, 3, 100, false);
  right.setValues(1, 2, 3, 4, false);
  assertTrue(left.compareTo(right) < 0);
  assertTrue(right.compareTo(left) > 0);
  // ensure that we are consistent when comparing to the base class
  RecordIdentifier ri = new RecordIdentifier(1, 2, 3);
  assertEquals(1, ri.compareTo(left));
  assertEquals(-1, left.compareTo(ri));
  assertEquals(false, ri.equals(left));
  assertEquals(false, left.equals(ri));
}
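Taken together, these assertions exercise the comparison precedence: RecordIdentifier orders by write id, then bucket, then row id, and ReaderKey breaks remaining ties so that the key with the larger currentWriteId sorts first. A compact sketch of that effective ordering, inferred from the assertions above (not the classes' actual source; the boolean flag passed to setValues is ignored here):

  // key = { writeId, bucket, rowId, currentWriteId }
  static int sketchCompare(long[] a, long[] b) {
    for (int i = 0; i < 3; i++) {
      if (a[i] != b[i]) {
        return Long.compare(a[i], b[i]);
      }
    }
    return Long.compare(b[3], a[3]);  // reversed operands: the higher currentWriteId sorts first
  }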