Use of org.apache.hadoop.hive.ql.io.RecordIdentifier in project hive by apache.
The class BucketIdResolverImpl, method attachBucketIdToRecord.
@Override
public Object attachBucketIdToRecord(Object record) {
  int bucketId = computeBucketId(record);
  int bucketProperty = BucketCodec.V1.encode(new AcidOutputFormat.Options(null).bucket(bucketId));
  RecordIdentifier recordIdentifier = new RecordIdentifier(INVALID_TRANSACTION_ID, bucketProperty, INVALID_ROW_ID);
  structObjectInspector.setStructFieldData(record, recordIdentifierField, recordIdentifier);
  return record;
}
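
For context, a minimal standalone sketch of the same encoding step, using only calls visible in the snippet above (AcidOutputFormat.Options.bucket and BucketCodec.V1.encode); the -1 placeholders and the bucket id 3 are made-up values standing in for the INVALID_TRANSACTION_ID / INVALID_ROW_ID constants and for the result of computeBucketId.

import org.apache.hadoop.hive.ql.io.AcidOutputFormat;
import org.apache.hadoop.hive.ql.io.BucketCodec;
import org.apache.hadoop.hive.ql.io.RecordIdentifier;

public class BucketPropertySketch {
  public static void main(String[] args) {
    int bucketId = 3; // stand-in for computeBucketId(record)
    // Pack the raw bucket id into the encoded "bucket property" stored in the RecordIdentifier.
    int bucketProperty = BucketCodec.V1.encode(new AcidOutputFormat.Options(null).bucket(bucketId));
    // -1 write id / row id are placeholders; the resolver uses its INVALID_* constants the same way.
    RecordIdentifier recordIdentifier = new RecordIdentifier(-1L, bucketProperty, -1L);
    System.out.println(recordIdentifier);
  }
}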
Use of org.apache.hadoop.hive.ql.io.RecordIdentifier in project hive by apache.
The class OrcInputFormat, method getReader.
@Override
public RowReader<OrcStruct> getReader(InputSplit inputSplit, Options options) throws IOException {
  final OrcSplit split = (OrcSplit) inputSplit;
  // Retrieve the acidOperationalProperties for the table, initialized in HiveInputFormat.
  AcidUtils.AcidOperationalProperties acidOperationalProperties = AcidUtils.getAcidOperationalProperties(options.getConfiguration());
  if (!acidOperationalProperties.isSplitUpdate()) {
    throw new IllegalStateException("Expected SplitUpdate table: " + split.getPath());
  }
  Map<String, AcidInputFormat.DeltaMetaData> pathToDeltaMetaData = new HashMap<>();
  final Path[] deltas = VectorizedOrcAcidRowBatchReader.getDeleteDeltaDirsFromSplit(split, pathToDeltaMetaData);
  final Configuration conf = options.getConfiguration();
  final Reader reader = OrcInputFormat.createOrcReaderForSplit(conf, split);
  OrcRawRecordMerger.Options mergerOptions = new OrcRawRecordMerger.Options().isCompacting(false);
  mergerOptions.rootPath(split.getRootDir());
  mergerOptions.bucketPath(split.getPath());
  final int bucket;
  if (split.hasBase()) {
    AcidOutputFormat.Options acidIOOptions = AcidUtils.parseBaseOrDeltaBucketFilename(split.getPath(), conf);
    if (acidIOOptions.getBucketId() < 0) {
      LOG.warn("Can't determine bucket ID for " + split.getPath() + "; ignoring");
    }
    bucket = acidIOOptions.getBucketId();
  } else {
    bucket = (int) split.getStart();
    assert false : "We should never have a split w/o base in acid 2.0 for full acid: " + split.getPath();
  }
  // todo: createOptionsForReader() assumes it's !isOriginal.... why?
  final Reader.Options readOptions = OrcInputFormat.createOptionsForReader(conf);
  readOptions.range(split.getStart(), split.getLength());
  String txnString = conf.get(ValidWriteIdList.VALID_WRITEIDS_KEY);
  ValidWriteIdList validWriteIdList = (txnString == null) ? new ValidReaderWriteIdList() : new ValidReaderWriteIdList(txnString);
  if (LOG.isDebugEnabled()) {
    LOG.debug("getReader:: Read ValidWriteIdList: " + validWriteIdList.toString() + " isTransactionalTable: " + HiveConf.getBoolVar(conf, ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN));
    LOG.debug("Creating merger for {} and {}", split.getPath(), Arrays.toString(deltas));
  }
  boolean fetchDeletedRows = acidOperationalProperties.isFetchDeletedRows();
  Map<String, Integer> deltaToAttemptId = AcidUtils.getDeltaToAttemptIdMap(pathToDeltaMetaData, deltas, bucket);
  final OrcRawRecordMerger records;
  if (!fetchDeletedRows) {
    records = new OrcRawRecordMerger(conf, true, reader, split.isOriginal(), bucket, validWriteIdList, readOptions, deltas, mergerOptions, deltaToAttemptId);
  } else {
    records = new OrcRawRecordMerger(conf, true, reader, split.isOriginal(), bucket, validWriteIdList, readOptions, deltas, mergerOptions, deltaToAttemptId) {
      @Override
      protected boolean collapse(RecordIdentifier recordIdentifier) {
        ((ReaderKey) recordIdentifier).setValues(prevKey.getCurrentWriteId(), prevKey.getBucketProperty(), prevKey.getRowId(), prevKey.getCurrentWriteId(), true);
        return false;
      }
    };
  }
  return new OrcRowReader(records, readOptions);
}
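
A hedged usage sketch, not taken from the Hive sources: assuming the returned RowReader follows the usual mapred RecordReader contract (createKey/createValue/next/close), a caller can iterate the merged rows and see each row's RecordIdentifier alongside its OrcStruct value.

import java.io.IOException;
import org.apache.hadoop.hive.ql.io.AcidInputFormat;
import org.apache.hadoop.hive.ql.io.RecordIdentifier;
import org.apache.hadoop.hive.ql.io.orc.OrcStruct;

public class RowReaderUsageSketch {
  // Counts the rows produced by a reader such as the one returned by getReader() above.
  static long countRows(AcidInputFormat.RowReader<OrcStruct> rowReader) throws IOException {
    RecordIdentifier key = rowReader.createKey();
    OrcStruct value = rowReader.createValue();
    long n = 0;
    while (rowReader.next(key, value)) {
      // key now carries the acid identity (write id, bucket property, row id) of value
      n++;
    }
    rowReader.close();
    return n;
  }
}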
Use of org.apache.hadoop.hive.ql.io.RecordIdentifier in project hive by apache.
The class VectorizedOrcAcidRowBatchReader, method getKeyInterval.
/**
 * Calculates the min/max record key.
 * Structure in data is like this:
 * <op, owid, writerId, rowid, cwid, <f1, ... fn>>
 * The +1 is to account for the top level struct which has a
 * ColumnStatistics object in colStats. Top level struct is normally
 * dropped by the Reader (I guess because of orc.impl.SchemaEvolution)
 * @param colStats The statistics array
 * @return The KeyInterval holding the min/max record keys
 */
private static OrcRawRecordMerger.KeyInterval getKeyInterval(ColumnStatistics[] colStats) {
  IntegerColumnStatistics origWriteId = (IntegerColumnStatistics) colStats[OrcRecordUpdater.ORIGINAL_WRITEID + 1];
  IntegerColumnStatistics bucketProperty = (IntegerColumnStatistics) colStats[OrcRecordUpdater.BUCKET + 1];
  IntegerColumnStatistics rowId = (IntegerColumnStatistics) colStats[OrcRecordUpdater.ROW_ID + 1];
  // verify that the following casts to int are safe
  assert bucketProperty.getMaximum() <= Integer.MAX_VALUE : "was bucketProperty (max) changed to a long (" + bucketProperty.getMaximum() + ")?!";
  assert bucketProperty.getMinimum() <= Integer.MAX_VALUE : "was bucketProperty (min) changed to a long (" + bucketProperty.getMinimum() + ")?!";
  RecordIdentifier maxKey = new RecordIdentifier(origWriteId.getMaximum(), (int) bucketProperty.getMaximum(), rowId.getMaximum());
  RecordIdentifier minKey = new RecordIdentifier(origWriteId.getMinimum(), (int) bucketProperty.getMinimum(), rowId.getMinimum());
  return new OrcRawRecordMerger.KeyInterval(minKey, maxKey);
}
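
The KeyInterval built here is what lets the reader skip delete events that cannot apply to the split. Below is a minimal bounds-check sketch, assuming only the RecordIdentifier constructor and its natural ordering (write id, then bucket property, then row id); the helper name isPossiblyInSplit and the bucketProperty value are illustrative, not Hive API.

import org.apache.hadoop.hive.ql.io.RecordIdentifier;

public class KeyIntervalSketch {
  // Returns false only when the key sorts strictly outside [minKey, maxKey];
  // a null bound is treated as unbounded (a KeyInterval bound may legitimately be missing).
  static boolean isPossiblyInSplit(RecordIdentifier key, RecordIdentifier minKey, RecordIdentifier maxKey) {
    if (minKey != null && key.compareTo(minKey) < 0) {
      return false; // sorts before every row of the split
    }
    if (maxKey != null && key.compareTo(maxKey) > 0) {
      return false; // sorts after every row of the split
    }
    return true;
  }

  public static void main(String[] args) {
    int bucketProperty = 536870912; // arbitrary encoded bucket property, identical for all keys here
    RecordIdentifier minKey = new RecordIdentifier(1L, bucketProperty, 0L);
    RecordIdentifier maxKey = new RecordIdentifier(5L, bucketProperty, 999L);
    System.out.println(isPossiblyInSplit(new RecordIdentifier(3L, bucketProperty, 42L), minKey, maxKey)); // true
    System.out.println(isPossiblyInSplit(new RecordIdentifier(9L, bucketProperty, 0L), minKey, maxKey)); // false
  }
}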
Use of org.apache.hadoop.hive.ql.io.RecordIdentifier in project hive by apache.
The class OrcRawRecordMerger, method discoverKeyBounds.
/**
* Find the key range for the split (of the base). These are used to filter delta files since
* both are sorted by key.
* @param reader the reader
* @param options the options for reading with
* @throws IOException
*/
private KeyInterval discoverKeyBounds(Reader reader, Reader.Options options) throws IOException {
  RecordIdentifier[] keyIndex = OrcRecordUpdater.parseKeyIndex(reader);
  long offset = options.getOffset();
  long maxOffset = options.getMaxOffset();
  int firstStripe = 0;
  int stripeCount = 0;
  boolean isTail = true;
  RecordIdentifier minKey = null;
  RecordIdentifier maxKey = null;
  List<StripeInformation> stripes = reader.getStripes();
  for (StripeInformation stripe : stripes) {
    if (offset > stripe.getOffset()) {
      firstStripe += 1;
    } else if (maxOffset > stripe.getOffset()) {
      stripeCount += 1;
    } else {
      isTail = false;
      break;
    }
  }
  if (firstStripe != 0) {
    minKey = keyIndex[firstStripe - 1];
  }
  if (!isTail) {
    maxKey = keyIndex[firstStripe + stripeCount - 1];
  }
  return new KeyInterval(minKey, maxKey);
}
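
To make the firstStripe / stripeCount / isTail bookkeeping concrete, here is a standalone sketch of the same loop with plain longs in place of StripeInformation; the offsets are made-up illustration values.

public class StripeSelectionSketch {
  public static void main(String[] args) {
    // Stripe start offsets of a fictional ORC file and a split covering [1000, 3000).
    long[] stripeOffsets = {0L, 1000L, 2000L, 3000L};
    long offset = 1000L;
    long maxOffset = 3000L;
    int firstStripe = 0;
    int stripeCount = 0;
    boolean isTail = true;
    for (long stripeOffset : stripeOffsets) {
      if (offset > stripeOffset) {
        firstStripe += 1;   // stripe starts before the split: skip it
      } else if (maxOffset > stripeOffset) {
        stripeCount += 1;   // stripe starts inside the split: include it
      } else {
        isTail = false;     // more stripes remain after the split
        break;
      }
    }
    // Result: firstStripe = 1, stripeCount = 2, isTail = false, so
    // minKey = keyIndex[0] (last key before the split) and maxKey = keyIndex[2] (last key of the split).
    System.out.println(firstStripe + " " + stripeCount + " " + isTail);
  }
}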
Use of org.apache.hadoop.hive.ql.io.RecordIdentifier in project hive by apache.
The class TestFileSinkOperator, method setupData.
private void setupData(DataFormat format) {
  Class<?> rType;
  switch (format) {
    case WITH_PARTITION_VALUE:
      rType = RowWithPartVal.class;
      break;
    case WITH_RECORD_ID:
      rType = RowWithRecID.class;
      break;
    case WITH_RECORD_ID_AND_PARTITION_VALUE:
      rType = RowWithPartNRecID.class;
      break;
    default:
      throw new RuntimeException("Unknown type");
  }
  inspector = ObjectInspectorFactory.getReflectionObjectInspector(rType, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
  rows = new ArrayList<Row>();
  Row r;
  for (int i = 0; i < 10; i++) {
    switch (format) {
      case WITH_PARTITION_VALUE:
        r = new RowWithPartVal(new Text("mary had a little lamb"), (i < 5) ? new Text("Monday") : new Text("Tuesday"));
        break;
      case WITH_RECORD_ID:
        r = new RowWithRecID(new RecordIdentifier(1, 1, i), (i < 5) ? new Text("Monday") : new Text("Tuesday"));
        break;
      case WITH_RECORD_ID_AND_PARTITION_VALUE:
        r = new RowWithPartNRecID(new Text("its fleece was white as snow"), (i < 5) ? new Text("Monday") : new Text("Tuesday"), new RecordIdentifier(1, 1, i));
        break;
      default:
        throw new RuntimeException("Unknown data format");
    }
    rows.add(r);
  }
}
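
For reference, a self-contained sketch of the same reflection-based ObjectInspector call applied to a simple public bean; SimpleRow is a hypothetical stand-in for the test's Row implementations, not a Hive class.

import java.util.List;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.io.Text;

public class ReflectionInspectorSketch {
  public static class SimpleRow {
    public Text data;
    public Text partVal;
  }

  public static void main(String[] args) {
    // Same factory call as in setupData above, so the struct fields are discovered by reflection.
    StructObjectInspector oi = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(
        SimpleRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    List<? extends StructField> fields = oi.getAllStructFieldRefs();
    for (StructField f : fields) {
      System.out.println(f.getFieldName() + " : " + f.getFieldObjectInspector().getTypeName());
    }
  }
}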