Use of org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableSerializeWrite in project hive by apache.
The class TestVectorSerDeRow, method testVectorSerializeRow:
void testVectorSerializeRow(Random r, SerializationType serializationType) throws HiveException, IOException, SerDeException {
String[] emptyScratchTypeNames = new String[0];
VectorRandomRowSource source = new VectorRandomRowSource();
source.init(r);
VectorizedRowBatchCtx batchContext = new VectorizedRowBatchCtx();
batchContext.init(source.rowStructObjectInspector(), emptyScratchTypeNames);
VectorizedRowBatch batch = batchContext.createVectorizedRowBatch();
VectorAssignRow vectorAssignRow = new VectorAssignRow();
vectorAssignRow.init(source.typeNames());
int fieldCount = source.typeNames().size();
DeserializeRead deserializeRead;
SerializeWrite serializeWrite;
switch(serializationType) {
case BINARY_SORTABLE:
deserializeRead = new BinarySortableDeserializeRead(source.primitiveTypeInfos(), /* useExternalBuffer */ false);
serializeWrite = new BinarySortableSerializeWrite(fieldCount);
break;
case LAZY_BINARY:
deserializeRead = new LazyBinaryDeserializeRead(source.primitiveTypeInfos(), /* useExternalBuffer */ false);
serializeWrite = new LazyBinarySerializeWrite(fieldCount);
break;
case LAZY_SIMPLE:
{
StructObjectInspector rowObjectInspector = source.rowStructObjectInspector();
LazySerDeParameters lazySerDeParams = getSerDeParams(rowObjectInspector);
byte separator = (byte) '\t';
deserializeRead = new LazySimpleDeserializeRead(source.primitiveTypeInfos(), /* useExternalBuffer */ false, separator, lazySerDeParams);
serializeWrite = new LazySimpleSerializeWrite(fieldCount, separator, lazySerDeParams);
}
break;
default:
throw new Error("Unknown serialization type " + serializationType);
}
VectorSerializeRow vectorSerializeRow = new VectorSerializeRow(serializeWrite);
vectorSerializeRow.init(source.typeNames());
Object[][] randomRows = source.randomRows(100000);
int firstRandomRowIndex = 0;
for (int i = 0; i < randomRows.length; i++) {
Object[] row = randomRows[i];
vectorAssignRow.assignRow(batch, batch.size, row);
batch.size++;
if (batch.size == batch.DEFAULT_SIZE) {
serializeBatch(batch, vectorSerializeRow, deserializeRead, source, randomRows, firstRandomRowIndex);
firstRandomRowIndex = i + 1;
batch.reset();
}
}
if (batch.size > 0) {
serializeBatch(batch, vectorSerializeRow, deserializeRead, source, randomRows, firstRandomRowIndex);
}
}
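For context, a minimal standalone sketch of the BINARY_SORTABLE branch above: a matching writer/reader pair built over the same field types. The BinarySortablePair holder and its name are hypothetical, and the import paths are assumptions; only the constructors shown in the test are used.

import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableDeserializeRead;
import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableSerializeWrite;
import org.apache.hadoop.hive.serde2.fast.DeserializeRead;
import org.apache.hadoop.hive.serde2.fast.SerializeWrite;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;

// Hypothetical helper mirroring the BINARY_SORTABLE case above.
final class BinarySortablePair {
  final SerializeWrite serializeWrite;
  final DeserializeRead deserializeRead;

  BinarySortablePair(PrimitiveTypeInfo[] primitiveTypeInfos) {
    // The writer only needs the field count (default sort order and null markers).
    serializeWrite = new BinarySortableSerializeWrite(primitiveTypeInfos.length);
    // The reader needs the concrete field types; useExternalBuffer = false, as in the test.
    deserializeRead = new BinarySortableDeserializeRead(primitiveTypeInfos, /* useExternalBuffer */ false);
  }
}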
Use of org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableSerializeWrite in project hive by apache.
The class TestVectorMapJoinFastRowHashMap, method addAndVerifyRows:
private void addAndVerifyRows(VectorRandomRowSource valueSource, Object[][] rows, VectorMapJoinFastHashTable map, HashTableKeyType hashTableKeyType, VerifyFastRowHashMap verifyTable, String[] keyTypeNames, boolean doClipping, boolean useExactBytes) throws HiveException, IOException, SerDeException {
final int keyCount = keyTypeNames.length;
PrimitiveTypeInfo[] keyPrimitiveTypeInfos = new PrimitiveTypeInfo[keyCount];
PrimitiveCategory[] keyPrimitiveCategories = new PrimitiveCategory[keyCount];
ArrayList<ObjectInspector> keyPrimitiveObjectInspectorList = new ArrayList<ObjectInspector>(keyCount);
for (int i = 0; i < keyCount; i++) {
PrimitiveTypeInfo primitiveTypeInfo = (PrimitiveTypeInfo) TypeInfoUtils.getTypeInfoFromTypeString(keyTypeNames[i]);
keyPrimitiveTypeInfos[i] = primitiveTypeInfo;
PrimitiveCategory primitiveCategory = primitiveTypeInfo.getPrimitiveCategory();
keyPrimitiveCategories[i] = primitiveCategory;
keyPrimitiveObjectInspectorList.add(PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(primitiveTypeInfo));
}
boolean[] keyColumnSortOrderIsDesc = new boolean[keyCount];
Arrays.fill(keyColumnSortOrderIsDesc, false);
byte[] keyColumnNullMarker = new byte[keyCount];
Arrays.fill(keyColumnNullMarker, BinarySortableSerDe.ZERO);
byte[] keyColumnNotNullMarker = new byte[keyCount];
Arrays.fill(keyColumnNotNullMarker, BinarySortableSerDe.ONE);
BinarySortableSerializeWrite keySerializeWrite = new BinarySortableSerializeWrite(keyColumnSortOrderIsDesc, keyColumnNullMarker, keyColumnNotNullMarker);
PrimitiveTypeInfo[] valuePrimitiveTypeInfos = valueSource.primitiveTypeInfos();
final int columnCount = valuePrimitiveTypeInfos.length;
SerializeWrite valueSerializeWrite = new LazyBinarySerializeWrite(columnCount);
final int count = rows.length;
for (int i = 0; i < count; i++) {
Object[] valueRow = rows[i];
Output valueOutput = new Output();
((LazyBinarySerializeWrite) valueSerializeWrite).set(valueOutput);
for (int index = 0; index < columnCount; index++) {
Writable writable = (Writable) valueRow[index];
VerifyFastRow.serializeWrite(valueSerializeWrite, valuePrimitiveTypeInfos[index], writable);
}
byte[] value = Arrays.copyOf(valueOutput.getData(), valueOutput.getLength());
// Add a new key or add a value to an existing key?
byte[] key;
if (random.nextBoolean() || verifyTable.getCount() == 0) {
Object[] keyRow = VectorRandomRowSource.randomRow(keyCount, random, keyPrimitiveObjectInspectorList, keyPrimitiveCategories, keyPrimitiveTypeInfos);
Output keyOutput = new Output();
keySerializeWrite.set(keyOutput);
for (int index = 0; index < keyCount; index++) {
Writable writable = (Writable) keyRow[index];
VerifyFastRow.serializeWrite(keySerializeWrite, keyPrimitiveTypeInfos[index], writable);
}
key = Arrays.copyOf(keyOutput.getData(), keyOutput.getLength());
verifyTable.add(key, keyRow, value, valueRow);
} else {
key = verifyTable.addRandomExisting(value, valueRow, random);
}
// Wrap the serialized key and value bytes and insert them into the hash table.
BytesWritable keyWritable = new BytesWritable(key);
BytesWritable valueWritable = new BytesWritable(value);
map.putRow(keyWritable, valueWritable);
// verifyTable.verify(map);
}
verifyTable.verify(map, hashTableKeyType, valuePrimitiveTypeInfos, doClipping, useExactBytes, random);
}
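Distilled from the key setup above, a minimal sketch that serializes a hypothetical two-column (long, string) key using the explicit sort-order / null-marker constructor. The KeySketch class and makeKey method are illustrative names, not part of Hive; only calls already shown above are used.

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import org.apache.hadoop.hive.serde2.ByteStream.Output;
import org.apache.hadoop.hive.serde2.binarysortable.BinarySortableSerDe;
import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableSerializeWrite;

// Hypothetical sketch: build one binary-sortable key from a long and a string.
final class KeySketch {
  static byte[] makeKey(long id, String name) throws IOException {
    final int keyCount = 2;
    boolean[] sortOrderIsDesc = new boolean[keyCount]; // all ascending
    byte[] nullMarker = new byte[keyCount];
    Arrays.fill(nullMarker, BinarySortableSerDe.ZERO);
    byte[] notNullMarker = new byte[keyCount];
    Arrays.fill(notNullMarker, BinarySortableSerDe.ONE);

    BinarySortableSerializeWrite keyWrite =
        new BinarySortableSerializeWrite(sortOrderIsDesc, nullMarker, notNullMarker);
    Output keyOutput = new Output();
    keyWrite.set(keyOutput);
    keyWrite.writeLong(id);
    byte[] nameBytes = name.getBytes(StandardCharsets.UTF_8);
    keyWrite.writeString(nameBytes, 0, nameBytes.length);
    // Copy only the written prefix; the Output's backing array may be larger.
    return Arrays.copyOf(keyOutput.getData(), keyOutput.getLength());
  }
}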
Use of org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableSerializeWrite in project hive by apache.
The class VectorizationContext, method getStructInExpression:
private VectorExpression getStructInExpression(List<ExprNodeDesc> childExpr, ExprNodeDesc colExpr, TypeInfo colTypeInfo, List<ExprNodeDesc> inChildren, VectorExpressionDescriptor.Mode mode, TypeInfo returnType) throws HiveException {
VectorExpression expr = null;
StructTypeInfo structTypeInfo = (StructTypeInfo) colTypeInfo;
ArrayList<TypeInfo> fieldTypeInfos = structTypeInfo.getAllStructFieldTypeInfos();
final int fieldCount = fieldTypeInfos.size();
ColumnVector.Type[] fieldVectorColumnTypes = new ColumnVector.Type[fieldCount];
InConstantType[] fieldInConstantTypes = new InConstantType[fieldCount];
for (int f = 0; f < fieldCount; f++) {
TypeInfo fieldTypeInfo = fieldTypeInfos.get(f);
// Only primitive fields are supported for now.
if (fieldTypeInfo.getCategory() != Category.PRIMITIVE) {
return null;
}
// We are going to serialize using the 4 basic types.
ColumnVector.Type fieldVectorColumnType = getColumnVectorTypeFromTypeInfo(fieldTypeInfo);
fieldVectorColumnTypes[f] = fieldVectorColumnType;
// We currently evaluate the IN (..) constants in special ways.
PrimitiveCategory fieldPrimitiveCategory = ((PrimitiveTypeInfo) fieldTypeInfo).getPrimitiveCategory();
InConstantType inConstantType = getInConstantTypeFromPrimitiveCategory(fieldPrimitiveCategory);
fieldInConstantTypes[f] = inConstantType;
}
Output buffer = new Output();
BinarySortableSerializeWrite binarySortableSerializeWrite = new BinarySortableSerializeWrite(fieldCount);
final int inChildrenCount = inChildren.size();
byte[][] serializedInChildren = new byte[inChildrenCount][];
try {
for (int i = 0; i < inChildrenCount; i++) {
final ExprNodeDesc node = inChildren.get(i);
final Object[] constants;
if (node instanceof ExprNodeConstantDesc) {
ExprNodeConstantDesc constNode = (ExprNodeConstantDesc) node;
ConstantObjectInspector output = constNode.getWritableObjectInspector();
constants = ((List<?>) output.getWritableConstantValue()).toArray();
} else {
ExprNodeGenericFuncDesc exprNode = (ExprNodeGenericFuncDesc) node;
ExprNodeEvaluator<?> evaluator = ExprNodeEvaluatorFactory.get(exprNode);
ObjectInspector output = evaluator.initialize(exprNode.getWritableObjectInspector());
constants = (Object[]) evaluator.evaluate(null);
}
binarySortableSerializeWrite.set(buffer);
for (int f = 0; f < fieldCount; f++) {
Object constant = constants[f];
if (constant == null) {
binarySortableSerializeWrite.writeNull();
} else {
InConstantType inConstantType = fieldInConstantTypes[f];
switch(inConstantType) {
case STRING_FAMILY:
{
byte[] bytes;
if (constant instanceof Text) {
Text text = (Text) constant;
bytes = text.getBytes();
binarySortableSerializeWrite.writeString(bytes, 0, text.getLength());
} else {
throw new HiveException("Unexpected constant String type " + constant.getClass().getSimpleName());
}
}
break;
case INT_FAMILY:
{
long value;
if (constant instanceof IntWritable) {
value = ((IntWritable) constant).get();
} else if (constant instanceof LongWritable) {
value = ((LongWritable) constant).get();
} else {
throw new HiveException("Unexpected constant Long type " + constant.getClass().getSimpleName());
}
binarySortableSerializeWrite.writeLong(value);
}
break;
case FLOAT_FAMILY:
{
double value;
if (constant instanceof DoubleWritable) {
value = ((DoubleWritable) constant).get();
} else {
throw new HiveException("Unexpected constant Double type " + constant.getClass().getSimpleName());
}
binarySortableSerializeWrite.writeDouble(value);
}
break;
// UNDONE...
case DATE:
case TIMESTAMP:
case DECIMAL:
default:
throw new RuntimeException("Unexpected IN constant type " + inConstantType.name());
}
}
}
serializedInChildren[i] = Arrays.copyOfRange(buffer.getData(), 0, buffer.getLength());
}
} catch (Exception e) {
throw new HiveException(e);
}
// Create a single child representing the scratch column where we will
// generate the serialized keys of the batch.
int scratchBytesCol = ocm.allocateOutputColumn(TypeInfoFactory.stringTypeInfo);
Class<?> cl = (mode == VectorExpressionDescriptor.Mode.FILTER ? FilterStructColumnInList.class : StructColumnInList.class);
expr = createVectorExpression(cl, null, VectorExpressionDescriptor.Mode.PROJECTION, returnType);
((IStringInExpr) expr).setInListValues(serializedInChildren);
((IStructInExpr) expr).setScratchBytesColumn(scratchBytesCol);
((IStructInExpr) expr).setStructColumnExprs(this, colExpr.getChildren(), fieldVectorColumnTypes);
return expr;
}
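A minimal sketch of the per-constant loop above, reduced to a hypothetical (Long, Double) struct: null fields get writeNull(), non-null fields a typed write, and the serialized bytes are copied from offset 0. The class and method names are illustrative only.

import java.io.IOException;
import java.util.Arrays;
import org.apache.hadoop.hive.serde2.ByteStream.Output;
import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableSerializeWrite;

// Hypothetical sketch: serialize one IN-list struct constant to binary-sortable bytes.
final class StructConstantSketch {
  static byte[] serialize(Long a, Double b) throws IOException {
    BinarySortableSerializeWrite write = new BinarySortableSerializeWrite(/* fieldCount */ 2);
    Output buffer = new Output();
    // As in the loop above, set(buffer) is called before each constant and the
    // serialized bytes are copied starting at offset 0.
    write.set(buffer);
    if (a == null) {
      write.writeNull();
    } else {
      write.writeLong(a);
    }
    if (b == null) {
      write.writeNull();
    } else {
      write.writeDouble(b);
    }
    return Arrays.copyOfRange(buffer.getData(), 0, buffer.getLength());
  }
}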
Use of org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableSerializeWrite in project hive by apache.
The class VectorReduceSinkCommonOperator, method initializeOp:
@Override
protected void initializeOp(Configuration hconf) throws HiveException {
super.initializeOp(hconf);
if (LOG.isDebugEnabled()) {
// Determine the name of our map or reduce task for debug tracing.
BaseWork work = Utilities.getMapWork(hconf);
if (work == null) {
work = Utilities.getReduceWork(hconf);
}
taskName = work.getName();
}
String context = hconf.get(Operator.CONTEXT_NAME_KEY, "");
if (context != null && !context.isEmpty()) {
context = "_" + context.replace(" ", "_");
}
statsMap.put(Counter.RECORDS_OUT_INTERMEDIATE + context, recordCounter);
reduceSkipTag = conf.getSkipTag();
reduceTagByte = (byte) conf.getTag();
if (isLogInfoEnabled) {
LOG.info("Using tag = " + (int) reduceTagByte);
}
TableDesc keyTableDesc = conf.getKeySerializeInfo();
boolean[] columnSortOrder = getColumnSortOrder(keyTableDesc.getProperties(), reduceSinkKeyColumnMap.length);
byte[] columnNullMarker = getColumnNullMarker(keyTableDesc.getProperties(), reduceSinkKeyColumnMap.length, columnSortOrder);
byte[] columnNotNullMarker = getColumnNotNullMarker(keyTableDesc.getProperties(), reduceSinkKeyColumnMap.length, columnSortOrder);
keyBinarySortableSerializeWrite = new BinarySortableSerializeWrite(columnSortOrder, columnNullMarker, columnNotNullMarker);
// Create all nulls key.
try {
Output nullKeyOutput = new Output();
keyBinarySortableSerializeWrite.set(nullKeyOutput);
for (int i = 0; i < reduceSinkKeyColumnMap.length; i++) {
keyBinarySortableSerializeWrite.writeNull();
}
int nullBytesLength = nullKeyOutput.getLength();
nullBytes = new byte[nullBytesLength];
System.arraycopy(nullKeyOutput.getData(), 0, nullBytes, 0, nullBytesLength);
nullKeyHashCode = HashCodeUtil.calculateBytesHashCode(nullBytes, 0, nullBytesLength);
} catch (Exception e) {
throw new HiveException(e);
}
valueLazyBinarySerializeWrite = new LazyBinarySerializeWrite(reduceSinkValueColumnMap.length);
valueVectorSerializeRow = new VectorSerializeRow<LazyBinarySerializeWrite>(valueLazyBinarySerializeWrite);
valueVectorSerializeRow.init(reduceSinkValueTypeInfos, reduceSinkValueColumnMap);
valueOutput = new Output();
valueVectorSerializeRow.setOutput(valueOutput);
keyWritable = new HiveKey();
valueBytesWritable = new BytesWritable();
batchCounter = 0;
}
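The "all nulls key" setup above can be read as the following minimal sketch: write one null marker per key column, copy the bytes, and precompute their hash code. The NullKeySketch helper is hypothetical and the HashCodeUtil import path is an assumption.

import java.io.IOException;
import org.apache.hadoop.hive.serde2.ByteStream.Output;
import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableSerializeWrite;
import org.apache.hive.common.util.HashCodeUtil;

// Hypothetical sketch of building the all-nulls key and its hash code.
final class NullKeySketch {
  static int hashOfAllNullsKey(BinarySortableSerializeWrite keyWrite, int keyColumnCount)
      throws IOException {
    Output nullKeyOutput = new Output();
    keyWrite.set(nullKeyOutput);
    for (int i = 0; i < keyColumnCount; i++) {
      keyWrite.writeNull();
    }
    int nullBytesLength = nullKeyOutput.getLength();
    byte[] nullBytes = new byte[nullBytesLength];
    System.arraycopy(nullKeyOutput.getData(), 0, nullBytes, 0, nullBytesLength);
    return HashCodeUtil.calculateBytesHashCode(nullBytes, 0, nullBytesLength);
  }
}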
Use of org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableSerializeWrite in project hive by apache.
The class VectorMapJoinLeftSemiMultiKeyOperator, method process:
//---------------------------------------------------------------------------
// Process Multi-Key Left-Semi Join on a vectorized row batch.
//
@Override
public void process(Object row, int tag) throws HiveException {
try {
VectorizedRowBatch batch = (VectorizedRowBatch) row;
alias = (byte) tag;
if (needCommonSetup) {
// Our one time process method initialization.
commonSetup(batch);
/*
* Initialize Multi-Key members for this specialized class.
*/
keyVectorSerializeWrite = new VectorSerializeRow(new BinarySortableSerializeWrite(bigTableKeyColumnMap.length));
keyVectorSerializeWrite.init(bigTableKeyTypeInfos, bigTableKeyColumnMap);
currentKeyOutput = new Output();
saveKeyOutput = new Output();
needCommonSetup = false;
}
if (needHashTableSetup) {
// Set up our hash table specialization. This happens the first time the process
// method is called, or after a Hybrid Grace reload.
/*
* Get our Multi-Key hash set information for this specialized class.
*/
hashSet = (VectorMapJoinBytesHashSet) vectorMapJoinHashTable;
needHashTableSetup = false;
}
batchCounter++;
// For left semi joins, we may apply the filter(s) now.
for (VectorExpression ve : bigTableFilterExpressions) {
ve.evaluate(batch);
}
final int inputLogicalSize = batch.size;
if (inputLogicalSize == 0) {
if (isLogDebugEnabled) {
LOG.debug(CLASS_NAME + " batch #" + batchCounter + " empty");
}
return;
}
// Perform any key expressions. Results will go into scratch columns.
if (bigTableKeyExpressions != null) {
for (VectorExpression ve : bigTableKeyExpressions) {
ve.evaluate(batch);
}
}
/*
* Multi-Key specific declarations.
*/
// None.
/*
* Multi-Key Long check for repeating.
*/
// If all BigTable input columns to key expressions are isRepeating, then
// calculate key once; lookup once.
boolean allKeyInputColumnsRepeating;
if (bigTableKeyColumnMap.length == 0) {
allKeyInputColumnsRepeating = false;
} else {
allKeyInputColumnsRepeating = true;
for (int i = 0; i < bigTableKeyColumnMap.length; i++) {
if (!batch.cols[bigTableKeyColumnMap[i]].isRepeating) {
allKeyInputColumnsRepeating = false;
break;
}
}
}
if (allKeyInputColumnsRepeating) {
/*
* Repeating.
*/
// All key input columns are repeating. Generate key once. Lookup once.
// Since the key is repeated, we must use entry 0 regardless of selectedInUse.
/*
* Multi-Key specific repeated lookup.
*/
keyVectorSerializeWrite.setOutput(currentKeyOutput);
keyVectorSerializeWrite.serializeWrite(batch, 0);
JoinUtil.JoinResult joinResult;
if (keyVectorSerializeWrite.getHasAnyNulls()) {
joinResult = JoinUtil.JoinResult.NOMATCH;
} else {
byte[] keyBytes = currentKeyOutput.getData();
int keyLength = currentKeyOutput.getLength();
// LOG.debug(CLASS_NAME + " processOp all " + displayBytes(keyBytes, 0, keyLength));
joinResult = hashSet.contains(keyBytes, 0, keyLength, hashSetResults[0]);
}
if (isLogDebugEnabled) {
LOG.debug(CLASS_NAME + " batch #" + batchCounter + " repeated joinResult " + joinResult.name());
}
finishLeftSemiRepeated(batch, joinResult, hashSetResults[0]);
} else {
if (isLogDebugEnabled) {
LOG.debug(CLASS_NAME + " batch #" + batchCounter + " non-repeated");
}
// We remember any matching rows in allMatchs / allMatchCount. At the end of the loop,
// selected / batch.size will represent both matching and non-matching rows for outer join.
// Only deferred rows will have been removed from selected.
int[] selected = batch.selected;
boolean selectedInUse = batch.selectedInUse;
int hashSetResultCount = 0;
int allMatchCount = 0;
int spillCount = 0;
/*
* Multi-Key specific variables.
*/
Output temp;
// We optimize performance by only looking up the first key in a series of equal keys.
boolean haveSaveKey = false;
JoinUtil.JoinResult saveJoinResult = JoinUtil.JoinResult.NOMATCH;
// Logical loop over the rows in the batch since the batch may have selected in use.
for (int logical = 0; logical < inputLogicalSize; logical++) {
int batchIndex = (selectedInUse ? selected[logical] : logical);
/*
* Multi-Key get key.
*/
// Generate binary sortable key for current row in vectorized row batch.
keyVectorSerializeWrite.setOutput(currentKeyOutput);
keyVectorSerializeWrite.serializeWrite(batch, batchIndex);
boolean isAnyNull = keyVectorSerializeWrite.getHasAnyNulls();
if (isAnyNull || !haveSaveKey || !saveKeyOutput.arraysEquals(currentKeyOutput)) {
if (haveSaveKey) {
// Move on with our counts.
switch(saveJoinResult) {
case MATCH:
// We have extracted the existence from the hash set result, so we don't keep it.
break;
case SPILL:
// We keep the hash set result for its spill information.
hashSetResultCount++;
break;
case NOMATCH:
break;
}
}
if (isAnyNull) {
saveJoinResult = JoinUtil.JoinResult.NOMATCH;
haveSaveKey = false;
} else {
// Regardless of our matching result, we keep that information to make multiple use
// of it for a possible series of equal keys.
haveSaveKey = true;
/*
* Multi-Key specific save key and lookup.
*/
temp = saveKeyOutput;
saveKeyOutput = currentKeyOutput;
currentKeyOutput = temp;
/*
* Multi-key specific lookup key.
*/
byte[] keyBytes = saveKeyOutput.getData();
int keyLength = saveKeyOutput.getLength();
saveJoinResult = hashSet.contains(keyBytes, 0, keyLength, hashSetResults[hashSetResultCount]);
}
switch(saveJoinResult) {
case MATCH:
allMatchs[allMatchCount++] = batchIndex;
// VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH isSingleValue " + equalKeySeriesIsSingleValue[equalKeySeriesCount] + " currentKey " + currentKey);
break;
case SPILL:
spills[spillCount] = batchIndex;
spillHashMapResultIndices[spillCount] = hashSetResultCount;
spillCount++;
break;
case NOMATCH:
// VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH" + " currentKey " + currentKey);
break;
}
} else {
switch(saveJoinResult) {
case MATCH:
allMatchs[allMatchCount++] = batchIndex;
// VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH duplicate");
break;
case SPILL:
spills[spillCount] = batchIndex;
spillHashMapResultIndices[spillCount] = hashSetResultCount;
spillCount++;
break;
case NOMATCH:
// VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH duplicate");
break;
}
}
}
if (haveSaveKey) {
// Update our counts for the last key.
switch(saveJoinResult) {
case MATCH:
// We have extracted the existence from the hash set result, so we don't keep it.
break;
case SPILL:
// We keep the hash set result for its spill information.
hashSetResultCount++;
break;
case NOMATCH:
break;
}
}
if (isLogDebugEnabled) {
LOG.debug(CLASS_NAME + " allMatchs " + intArrayToRangesString(allMatchs, allMatchCount) + " spills " + intArrayToRangesString(spills, spillCount) + " spillHashMapResultIndices " + intArrayToRangesString(spillHashMapResultIndices, spillCount) + " hashMapResults " + Arrays.toString(Arrays.copyOfRange(hashSetResults, 0, hashSetResultCount)));
}
finishLeftSemi(batch, allMatchCount, spillCount, (VectorMapJoinHashTableResult[]) hashSetResults);
}
if (batch.size > 0) {
// Forward any remaining selected rows.
forwardBigTableBatch(batch);
}
} catch (IOException e) {
throw new HiveException(e);
} catch (Exception e) {
throw new HiveException(e);
}
}
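A minimal sketch of the repeated-key probe above, assuming every key column isRepeating so row 0 stands for the whole batch. The helper class and the mapjoin hashtable import paths are assumptions; the serialize/lookup calls mirror the ones used in process().

import java.io.IOException;
import org.apache.hadoop.hive.ql.exec.JoinUtil;
import org.apache.hadoop.hive.ql.exec.vector.VectorSerializeRow;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinBytesHashSet;
import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashSetResult;
import org.apache.hadoop.hive.serde2.ByteStream.Output;
import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableSerializeWrite;

// Hypothetical sketch: serialize one row's key columns and probe the bytes hash set.
final class RepeatedKeyProbeSketch {
  static JoinUtil.JoinResult probeRowZero(
      VectorSerializeRow<BinarySortableSerializeWrite> keyVectorSerializeWrite,
      VectorMapJoinBytesHashSet hashSet,
      VectorMapJoinHashSetResult hashSetResult,
      VectorizedRowBatch batch) throws IOException {
    Output keyOutput = new Output();
    keyVectorSerializeWrite.setOutput(keyOutput);
    // Row 0 stands for the whole batch when every key column isRepeating.
    keyVectorSerializeWrite.serializeWrite(batch, 0);
    if (keyVectorSerializeWrite.getHasAnyNulls()) {
      // A null key never matches in a left-semi join.
      return JoinUtil.JoinResult.NOMATCH;
    }
    return hashSet.contains(keyOutput.getData(), 0, keyOutput.getLength(), hashSetResult);
  }
}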