Use of org.apache.hadoop.hive.ql.io.HiveKey in project Hive by Apache.
The class ReduceSinkOperator, method process.
@Override
@SuppressWarnings("unchecked")
public void process(Object row, int tag) throws HiveException {
  try {
    ObjectInspector rowInspector = inputObjInspectors[tag];
    if (firstRow) {
      firstRow = false;
      // TODO: this is fishy - we init object inspectors based on the first tag. We
      //       should either init for each tag, or if rowInspector doesn't really
      //       matter, then we can create this in ctor and get rid of firstRow.
      if (isLogInfoEnabled) {
        LOG.info("keys are " + conf.getOutputKeyColumnNames()
            + " num distributions: " + conf.getNumDistributionKeys());
      }
      keyObjectInspector = initEvaluatorsAndReturnStruct(keyEval, distinctColIndices,
          conf.getOutputKeyColumnNames(), numDistributionKeys, rowInspector);
      valueObjectInspector = initEvaluatorsAndReturnStruct(valueEval,
          conf.getOutputValueColumnNames(), rowInspector);
      partitionObjectInspectors = initEvaluators(partitionEval, rowInspector);
      if (bucketEval != null) {
        bucketObjectInspectors = initEvaluators(bucketEval, rowInspector);
      }
      int numKeys = numDistinctExprs > 0 ? numDistinctExprs : 1;
      int keyLen = numDistinctExprs > 0 ? numDistributionKeys + 1 : numDistributionKeys;
      cachedKeys = new Object[numKeys][keyLen];
      cachedValues = new Object[valueEval.length];
    }

    // Determine distKeyLength (w/o distincts), and then add the first if present.
    populateCachedDistributionKeys(row, 0);

    // replace bucketing columns with hashcode % numBuckets
    int bucketNumber = -1;
    if (bucketEval != null) {
      bucketNumber = computeBucketNumber(row, conf.getNumBuckets());
      cachedKeys[0][buckColIdxInKey] = new Text(String.valueOf(bucketNumber));
    }
    if (buckColIdxInKeyForSdpo != -1) {
      cachedKeys[0][buckColIdxInKeyForSdpo] = new Text(String.valueOf(bucketNumber));
    }

    HiveKey firstKey = toHiveKey(cachedKeys[0], tag, null);
    int distKeyLength = firstKey.getDistKeyLength();
    if (numDistinctExprs > 0) {
      populateCachedDistinctKeys(row, 0);
      firstKey = toHiveKey(cachedKeys[0], tag, distKeyLength);
    }

    final int hashCode;

    // distKeyLength doesn't include tag, but includes buckNum in cachedKeys[0]
    if (useUniformHash && partitionEval.length > 0) {
      hashCode = computeMurmurHash(firstKey);
    } else {
      hashCode = computeHashCode(row, bucketNumber);
    }

    firstKey.setHashCode(hashCode);

    /*
     * in case of TopN for windowing, we need to distinguish between rows with
     * null partition keys and rows with value 0 for partition keys.
     */
    boolean partKeyNull = conf.isPTFReduceSink() && partitionKeysAreNull(row);

    // Try to store the first key.
    // if TopNHashes aren't active, always forward
    // if TopNHashes are active, proceed if not already excluded (i.e order by limit)
    final int firstIndex = (reducerHash != null)
        ? reducerHash.tryStoreKey(firstKey, partKeyNull) : TopNHash.FORWARD;

    // Nothing to do.
    if (firstIndex == TopNHash.EXCLUDE) {
      return;
    }

    // Compute value and hashcode - we'd either store or forward them.
    BytesWritable value = makeValueWritable(row);

    if (firstIndex == TopNHash.FORWARD) {
      collect(firstKey, value);
    } else {
      // invariant: reducerHash != null
      assert firstIndex >= 0;
      reducerHash.storeValue(firstIndex, firstKey.hashCode(), value, false);
    }

    // All other distinct keys will just be forwarded. This could be optimized...
    for (int i = 1; i < numDistinctExprs; i++) {
      System.arraycopy(cachedKeys[0], 0, cachedKeys[i], 0, numDistributionKeys);
      populateCachedDistinctKeys(row, i);
      HiveKey hiveKey = toHiveKey(cachedKeys[i], tag, distKeyLength);
      hiveKey.setHashCode(hashCode);
      collect(hiveKey, value);
    }
  } catch (HiveException e) {
    throw e;
  } catch (Exception e) {
    throw new HiveException(e);
  }
}
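HiveKey itself is a thin wrapper around BytesWritable that carries an explicitly assigned hash code plus the length of the distribution-key prefix, which is the contract process() relies on through toHiveKey, getDistKeyLength and setHashCode. Below is a minimal sketch (not taken from Hive) of building such a key by hand; the buildKey helper, its class name, and the pre-serialized key bytes are assumptions of the example.

import org.apache.hadoop.hive.ql.io.HiveKey;

public class HiveKeySketch {
  // Hypothetical helper: packages pre-serialized key bytes into a HiveKey,
  // mirroring what toHiveKey(...) and setHashCode(...) do in process() above.
  public static HiveKey buildKey(byte[] serializedKey, int distKeyLength, int hashCode) {
    HiveKey key = new HiveKey();
    key.set(serializedKey, 0, serializedKey.length); // key bytes, via BytesWritable.set
    key.setDistKeyLength(distKeyLength);             // prefix length used for distribution
    key.setHashCode(hashCode);                       // hash is assigned, not derived from bytes
    return key;
  }
}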
Use of org.apache.hadoop.hive.ql.io.HiveKey in project Hive by Apache.
The class PTFTopNHash, method getVectorizedKeyToForward.
public HiveKey getVectorizedKeyToForward(int batchIndex) {
  prevIndexPartIsNull = indexesWithNullPartKey.contains(batchIndex);
  Key pk = new Key(prevIndexPartIsNull, batchIndexToResult[batchIndex]);
  TopNHash partHeap = partitionHeaps.get(pk);
  return partHeap.getVectorizedKeyToForward(batchIndex);
}
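The lookup above works because the per-partition heaps are keyed by the pair (partition columns null?, an int derived from the key). The following self-contained stand-in is not Hive code; the class and field names are hypothetical. It only illustrates why such a Key needs value-based equals/hashCode so the map lookup finds the same heap again.

import java.util.HashMap;
import java.util.Map;
import java.util.Objects;

// Hypothetical stand-in for the Key used by PTFTopNHash: heaps are partitioned by
// whether the PTF partition columns were null and by an int (hash / stored result).
public class PartitionHeapKeySketch {
  static final class Key {
    final boolean partColsIsNull;
    final int hash;

    Key(boolean partColsIsNull, int hash) {
      this.partColsIsNull = partColsIsNull;
      this.hash = hash;
    }

    @Override
    public boolean equals(Object o) {
      return o instanceof Key && ((Key) o).partColsIsNull == partColsIsNull
          && ((Key) o).hash == hash;
    }

    @Override
    public int hashCode() {
      return Objects.hash(partColsIsNull, hash);
    }
  }

  public static void main(String[] args) {
    Map<Key, String> heaps = new HashMap<>();
    heaps.put(new Key(false, 42), "heap for hash 42");
    // Rebuilding an equal Key finds the same heap, which is what
    // getVectorizedKeyToForward does for each batch index.
    System.out.println(heaps.get(new Key(false, 42)));
  }
}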
Use of org.apache.hadoop.hive.ql.io.HiveKey in project Hive by Apache.
The class PTFTopNHash, method _tryStoreKey.
public int _tryStoreKey(HiveKey key, boolean partColsIsNull, int batchIndex)
    throws HiveException, IOException {
  if (!isEnabled) {
    // short-circuit quickly - forward all rows
    return FORWARD;
  }
  if (topN == 0) {
    // short-circuit quickly - eat all rows
    return EXCLUDE;
  }
  // One per-partition heap per (partition-cols-null, key hash) combination.
  Key pk = new Key(partColsIsNull, key.hashCode());
  TopNHash partHeap = partitionHeaps.get(pk);
  if (partHeap == null) {
    partHeap = new TopNHash();
    partHeap.initialize(topN, memUsage, isMapGroupBy, collector, conf, hconf);
    if (batchIndex >= 0) {
      partHeap.startVectorizedBatch(batchSize);
    }
    partitionHeaps.put(pk, partHeap);
  }
  // Track the change in this heap's memory usage across the store attempt.
  usage = usage - partHeap.usage;
  int r = 0;
  if (batchIndex >= 0) {
    partHeap.tryStoreVectorizedKey(key, false, batchIndex);
  } else {
    r = partHeap.tryStoreKey(key, false);
  }
  usage = usage + partHeap.usage;
  updateLargest(partHeap);
  if (usage > threshold) {
    // Over budget: flush the largest per-partition heap, then recompute the largest.
    usage -= largestPartition.usage;
    largestPartition.flush();
    usage += largestPartition.usage;
    largestPartition = null;
    findLargest();
  }
  return r;
}
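The return code of _tryStoreKey follows the same protocol that ReduceSinkOperator.process handles above: EXCLUDE means drop the row, FORWARD means emit it right away, and a non-negative index means the key was stored and the value should be attached to that slot. The sketch below is a hypothetical caller, not Hive code; collectorOutput is an assumed stand-in for the operator's collect.

import org.apache.hadoop.hive.ql.exec.TopNHash;
import org.apache.hadoop.hive.ql.io.HiveKey;
import org.apache.hadoop.io.BytesWritable;

// Hypothetical caller mirroring the dispatch in ReduceSinkOperator.process above.
public abstract class TopNDispatchSketch {
  protected TopNHash reducerHash; // may be null when top-N filtering is disabled

  protected abstract void collectorOutput(HiveKey key, BytesWritable value) throws Exception;

  protected void dispatch(HiveKey key, BytesWritable value, boolean partKeyNull) throws Exception {
    int index = (reducerHash != null)
        ? reducerHash.tryStoreKey(key, partKeyNull) : TopNHash.FORWARD;
    if (index == TopNHash.EXCLUDE) {
      return; // the row cannot make it into the top N
    }
    if (index == TopNHash.FORWARD) {
      collectorOutput(key, value); // emit immediately
    } else {
      // non-negative index: the key was stored; attach the value to the same slot
      reducerHash.storeValue(index, key.hashCode(), value, false);
    }
  }
}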
Use of org.apache.hadoop.hive.ql.io.HiveKey in project Hive by Apache.
The class HiveKVResultCache, method next.
public synchronized Tuple2<HiveKey, BytesWritable> next() {
  Preconditions.checkState(hasNext());
  if (!readBufferUsed) {
    try {
      if (input == null && output != null) {
        // Close output stream if open
        output.close();
        output = null;

        FileInputStream fis = null;
        try {
          fis = new FileInputStream(tmpFile);
          input = new Input(fis);
        } finally {
          if (input == null && fis != null) {
            fis.close();
          }
        }
      }
      if (input != null) {
        // Load next batch from disk
        for (int i = 0; i < IN_MEMORY_NUM_ROWS; i++) {
          ObjectPair<HiveKey, BytesWritable> pair = readBuffer[i];
          pair.setFirst(readHiveKey(input));
          pair.setSecond(readValue(input));
        }
        if (input.eof()) {
          input.close();
          input = null;
        }
        rowsInReadBuffer = IN_MEMORY_NUM_ROWS;
        readBufferUsed = true;
        readCursor = 0;
      } else if (writeCursor == 1) {
        ObjectPair<HiveKey, BytesWritable> pair = writeBuffer[0];
        Tuple2<HiveKey, BytesWritable> row =
            new Tuple2<HiveKey, BytesWritable>(pair.getFirst(), pair.getSecond());
        pair.setFirst(null);
        pair.setSecond(null);
        writeCursor = 0;
        return row;
      } else {
        // No record on disk, more data in write buffer
        switchBufferAndResetCursor();
      }
    } catch (Exception e) {
      // Clean up the cache
      clear();
      throw new RuntimeException("Failed to load rows from disk", e);
    }
  }
  ObjectPair<HiveKey, BytesWritable> pair = readBuffer[readCursor];
  Tuple2<HiveKey, BytesWritable> row =
      new Tuple2<HiveKey, BytesWritable>(pair.getFirst(), pair.getSecond());
  pair.setFirst(null);
  pair.setSecond(null);
  if (++readCursor >= rowsInReadBuffer) {
    readBufferUsed = false;
    rowsInReadBuffer = 0;
    readCursor = 0;
  }
  return row;
}
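A typical consumer appends key/value pairs and later drains them in order. The sketch below is hypothetical, not Hive code: hasNext() and clear() are referenced by the next() method shown above, while add(HiveKey, BytesWritable) is assumed to be the class's write-path method; the drain helper and its class name are made up for illustration.

import org.apache.hadoop.hive.ql.exec.spark.HiveKVResultCache;
import org.apache.hadoop.hive.ql.io.HiveKey;
import org.apache.hadoop.io.BytesWritable;
import scala.Tuple2;

// Hypothetical consumer of the cache: rows are appended with add(...) and later
// drained with hasNext()/next(); clear() releases the buffers and the temp file.
public class ResultCacheDrainSketch {
  public static void drain(HiveKVResultCache cache, Iterable<Tuple2<HiveKey, BytesWritable>> rows) {
    for (Tuple2<HiveKey, BytesWritable> row : rows) {
      cache.add(row._1(), row._2()); // assumed write path; spills to disk when the buffer fills
    }
    while (cache.hasNext()) {
      Tuple2<HiveKey, BytesWritable> next = cache.next();
      // ... forward next._1() / next._2() downstream ...
    }
    cache.clear(); // drop buffers and delete the backing file
  }
}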
Use of org.apache.hadoop.hive.ql.io.HiveKey in project Hive by Apache.
The class SortByShuffler, method shuffle.
@Override
public JavaPairRDD<HiveKey, BytesWritable> shuffle(
    JavaPairRDD<HiveKey, BytesWritable> input, int numPartitions) {
  JavaPairRDD<HiveKey, BytesWritable> rdd;
  if (totalOrder) {
    // Total order: a global sort across all output partitions.
    if (numPartitions > 0) {
      if (numPartitions > 1 && input.getStorageLevel() == StorageLevel.NONE()) {
        input.persist(StorageLevel.DISK_ONLY());
        sparkPlan.addCachedRDDId(input.id());
      }
      rdd = input.sortByKey(true, numPartitions);
    } else {
      rdd = input.sortByKey(true);
    }
  } else {
    // Otherwise hash-partition by key and sort within each partition only.
    Partitioner partitioner = new HashPartitioner(numPartitions);
    rdd = input.repartitionAndSortWithinPartitions(partitioner);
  }
  return rdd;
}
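In the non-total-order branch, the HashPartitioner routes pairs by HiveKey.hashCode(), i.e. the hash explicitly assigned via setHashCode(...) in ReduceSinkOperator above, while the sort within each partition uses HiveKey's byte-order comparison inherited from BytesWritable. A minimal sketch of that branch in isolation follows; it is not Hive code, and the class and method names are made up.

import org.apache.hadoop.hive.ql.io.HiveKey;
import org.apache.hadoop.io.BytesWritable;
import org.apache.spark.HashPartitioner;
import org.apache.spark.api.java.JavaPairRDD;

// Hypothetical helper showing the partition-local sort path in isolation.
public class PartitionSortSketch {
  public static JavaPairRDD<HiveKey, BytesWritable> partitionAndSort(
      JavaPairRDD<HiveKey, BytesWritable> input, int numPartitions) {
    // Keys land in partitions according to their assigned hash code, then each
    // partition is sorted by the keys' serialized bytes.
    return input.repartitionAndSortWithinPartitions(new HashPartitioner(numPartitions));
  }
}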