Use of org.apache.hyracks.api.dataflow.value.IBinaryHashFunction in project asterixdb by apache.
The class AMurmurHash3BinaryHashFunctionFamily, method createBinaryHashFunction.
// This hash function family promotes a numeric type to the DOUBLE type so that
// the same hash value is returned for the original numeric value, regardless of its numeric type
// (e.g., h( int64("1") ) = h( double("1.0") )).
@Override
public IBinaryHashFunction createBinaryHashFunction(final int seed) {
    return new IBinaryHashFunction() {

        private ArrayBackedValueStorage fieldValueBuffer = new ArrayBackedValueStorage();
        private DataOutput fieldValueBufferOutput = fieldValueBuffer.getDataOutput();
        private ATypeTag sourceTag = null;
        private boolean numericTypePromotionApplied = false;

        @Override
        public int hash(byte[] bytes, int offset, int length) throws HyracksDataException {
            // If a numeric type is encountered, promote it to the DOUBLE type.
            fieldValueBuffer.reset();
            sourceTag = EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(bytes[offset]);
            switch (sourceTag) {
                case TINYINT:
                case SMALLINT:
                case INTEGER:
                case BIGINT:
                    try {
                        IntegerToDoubleTypeConvertComputer.getInstance().convertType(bytes, offset + 1, length - 1,
                                fieldValueBufferOutput);
                    } catch (IOException e) {
                        throw new HyracksDataException(
                                "A numeric type promotion error occurred before hash(). Cannot continue. Detailed error message: "
                                        + e.getMessage());
                    }
                    numericTypePromotionApplied = true;
                    break;
                case FLOAT:
                    try {
                        FloatToDoubleTypeConvertComputer.getInstance().convertType(bytes, offset + 1, length - 1,
                                fieldValueBufferOutput);
                    } catch (IOException e) {
                        throw new HyracksDataException(
                                "A numeric type promotion error occurred before hash(). Cannot continue. Detailed error message: "
                                        + e.getMessage());
                    }
                    numericTypePromotionApplied = true;
                    break;
                default:
                    numericTypePromotionApplied = false;
                    break;
            }
            // If a numeric type promotion happened, hash the promoted (DOUBLE) value.
            if (numericTypePromotionApplied) {
                return MurmurHash3BinaryHash.hash(fieldValueBuffer.getByteArray(), fieldValueBuffer.getStartOffset(),
                        fieldValueBuffer.getLength(), seed);
            } else {
                // Usual case for non-numeric types and the DOUBLE type itself.
                return MurmurHash3BinaryHash.hash(bytes, offset, length, seed);
            }
        }
    };
}
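The practical effect of the promotion is easiest to see in isolation. Below is a minimal, self-contained sketch; PromotionSketch and hashBytes are hypothetical illustration names, with hashBytes standing in for MurmurHash3BinaryHash.hash. It shows that hashing the raw bytes of a long and of a double disagrees for the same numeric value, while hashing both through their double representation agrees, which is exactly what the promotion above guarantees.

import java.nio.ByteBuffer;

public class PromotionSketch {
    // Stand-in for MurmurHash3BinaryHash.hash(bytes, offset, length, seed);
    // any deterministic byte-array hash illustrates the point.
    static int hashBytes(byte[] b, int seed) {
        int h = seed;
        for (byte x : b) {
            h = h * 31 + x;
        }
        return h;
    }

    public static void main(String[] args) {
        byte[] asLong = ByteBuffer.allocate(8).putLong(1L).array();
        byte[] asDouble = ByteBuffer.allocate(8).putDouble(1.0).array();
        // Different bit patterns for the same value, so the raw hashes disagree:
        System.out.println(hashBytes(asLong, 0) == hashBytes(asDouble, 0)); // false
        // Promoting the long to a double first makes the hashes agree:
        byte[] promoted = ByteBuffer.allocate(8).putDouble((double) 1L).array();
        System.out.println(hashBytes(promoted, 0) == hashBytes(asDouble, 0)); // true
    }
}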
Use of org.apache.hyracks.api.dataflow.value.IBinaryHashFunction in project asterixdb by apache.
The class ListItemBinaryHashFunctionFactory, method createBinaryHashFunction.
public IBinaryHashFunction createBinaryHashFunction(final ATypeTag itemTypeTag, final boolean ignoreCase) {
    return new IBinaryHashFunction() {

        private IBinaryHashFunction lowerCaseStringHash = new PointableBinaryHashFunctionFactory(
                UTF8StringLowercasePointable.FACTORY).createBinaryHashFunction();
        private IBinaryHashFunction genericBinaryHash =
                MurmurHash3BinaryHashFunctionFamily.INSTANCE.createBinaryHashFunction(0);
        private GrowableArray taggedBytes = new GrowableArray();

        @Override
        public int hash(byte[] bytes, int offset, int length) throws HyracksDataException {
            ATypeTag tag = itemTypeTag;
            int skip = 0;
            if (itemTypeTag == ATypeTag.ANY) {
                tag = EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(bytes[offset]);
                skip = 1;
            }
            switch (tag) {
                case STRING: {
                    if (ignoreCase) {
                        return lowerCaseStringHash.hash(bytes, offset + skip, length - skip);
                    }
                }
                // falls through to the generic hash when ignoreCase is false
                default: {
                    if (itemTypeTag != ATypeTag.ANY) {
                        // add the itemTypeTag in front of the data
                        try {
                            resetTaggedBytes(bytes, offset, length);
                            return genericBinaryHash.hash(taggedBytes.getByteArray(), 0, length + 1);
                        } catch (IOException e) {
                            throw new HyracksDataException(e);
                        }
                    } else {
                        return genericBinaryHash.hash(bytes, offset, length);
                    }
                }
            }
        }

        private void resetTaggedBytes(byte[] data, int offset, int length) throws IOException {
            taggedBytes.reset();
            taggedBytes.getDataOutput().writeByte(itemTypeTag.serialize());
            taggedBytes.getDataOutput().write(data, offset, length);
        }
    };
}
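When the item type is known (not ANY), the stored item bytes carry no type tag, so the tag byte is prepended before hashing to keep the result consistent with the tagged form. A minimal standalone sketch of that step; TagSketch and prependTag are hypothetical names, and the tag value and item bytes are purely illustrative, not real AsterixDB serialization.

public class TagSketch {
    static byte[] prependTag(byte tag, byte[] data, int offset, int length) {
        byte[] tagged = new byte[length + 1];
        tagged[0] = tag;                                   // the serialized type tag
        System.arraycopy(data, offset, tagged, 1, length); // the untagged item bytes
        return tagged;
    }

    public static void main(String[] args) {
        byte stringTag = 13;                 // assumption: illustrative tag value only
        byte[] untaggedItem = { 0, 1, 'a' }; // sketch bytes; real strings use a UTF-8 length header
        byte[] tagged = prependTag(stringTag, untaggedItem, 0, untaggedItem.length);
        System.out.println(tagged.length == untaggedItem.length + 1); // true
    }
}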
Use of org.apache.hyracks.api.dataflow.value.IBinaryHashFunction in project asterixdb by apache.
The class FieldHashPartitionComputerFamily, method createPartitioner.
@Override
public ITuplePartitionComputer createPartitioner(int seed) {
    final IBinaryHashFunction[] hashFunctions = new IBinaryHashFunction[hashFunctionGeneratorFactories.length];
    for (int i = 0; i < hashFunctionGeneratorFactories.length; ++i) {
        hashFunctions[i] = hashFunctionGeneratorFactories[i].createBinaryHashFunction(seed);
    }
    return new ITuplePartitionComputer() {
        @Override
        public int partition(IFrameTupleAccessor accessor, int tIndex, int nParts) throws HyracksDataException {
            int h = 0;
            int startOffset = accessor.getTupleStartOffset(tIndex);
            int slotLength = accessor.getFieldSlotsLength();
            for (int j = 0; j < hashFields.length; ++j) {
                int fIdx = hashFields[j];
                IBinaryHashFunction hashFn = hashFunctions[j];
                int fStart = accessor.getFieldStartOffset(tIndex, fIdx);
                int fEnd = accessor.getFieldEndOffset(tIndex, fIdx);
                int fh = hashFn.hash(accessor.getBuffer().array(), startOffset + slotLength + fStart, fEnd - fStart);
                h += fh;
            }
            if (h < 0) {
                h = -(h + 1);
            }
            return h % nParts;
        }
    };
}
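The partition arithmetic deserves a note: the summed field hashes can overflow into negative territory, and -(h + 1) maps any negative value into the non-negative range, which is safe even for Integer.MIN_VALUE where Math.abs would fail. A standalone sketch of just that arithmetic; PartitionSketch and this partition method are illustration names, not the Hyracks API.

public class PartitionSketch {
    static int partition(int[] fieldHashes, int nParts) {
        int h = 0;
        for (int fh : fieldHashes) {
            h += fh; // may overflow and go negative; that is fine
        }
        if (h < 0) {
            h = -(h + 1); // maps [Integer.MIN_VALUE, -1] into [0, Integer.MAX_VALUE]
        }
        return h % nParts;
    }

    public static void main(String[] args) {
        System.out.println(partition(new int[] { Integer.MIN_VALUE }, 4)); // in [0, 3]
        System.out.println(partition(new int[] { 7, -3 }, 4));             // 0
    }
}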
Use of org.apache.hyracks.api.dataflow.value.IBinaryHashFunction in project asterixdb by apache.
The class RecordAddFieldsDescriptor, method createEvaluatorFactory.
@Override
public IScalarEvaluatorFactory createEvaluatorFactory(final IScalarEvaluatorFactory[] args) {
    return new IScalarEvaluatorFactory() {
        private static final long serialVersionUID = 1L;

        @Override
        public IScalarEvaluator createScalarEvaluator(final IHyracksTaskContext ctx) throws HyracksDataException {
            final PointableAllocator allocator = new PointableAllocator();
            final IVisitablePointable vp0 = allocator.allocateRecordValue(inRecType);
            final IVisitablePointable vp1 = allocator.allocateListValue(inListType);
            final IPointable argPtr0 = new VoidPointable();
            final IPointable argPtr1 = new VoidPointable();
            final IScalarEvaluator eval0 = args[0].createScalarEvaluator(ctx);
            final IScalarEvaluator eval1 = args[1].createScalarEvaluator(ctx);
            final ArrayBackedValueStorage fieldNamePointable = new ArrayBackedValueStorage();
            final ArrayBackedValueStorage fieldValuePointer = new ArrayBackedValueStorage();
            final PointableHelper pointableHelper = new PointableHelper();
            try {
                pointableHelper.serializeString("field-name", fieldNamePointable, true);
                pointableHelper.serializeString("field-value", fieldValuePointer, true);
            } catch (AsterixException e) {
                throw new HyracksDataException(e);
            }
            return new IScalarEvaluator() {
                // the default 32k frame size
                public static final int TABLE_FRAME_SIZE = 32768;
                // the default hash table size
                public static final int TABLE_SIZE = 100;
                private final RecordBuilder recordBuilder = new RecordBuilder();
                private final RuntimeRecordTypeInfo requiredRecordTypeInfo = new RuntimeRecordTypeInfo();
                private final IBinaryHashFunction putHashFunc =
                        ListItemBinaryHashFunctionFactory.INSTANCE.createBinaryHashFunction();
                private final IBinaryHashFunction getHashFunc =
                        ListItemBinaryHashFunctionFactory.INSTANCE.createBinaryHashFunction();
                private final BinaryEntry keyEntry = new BinaryEntry();
                private final BinaryEntry valEntry = new BinaryEntry();
                private final IVisitablePointable tempValReference = allocator.allocateEmpty();
                private final IBinaryComparator cmp =
                        ListItemBinaryComparatorFactory.INSTANCE.createBinaryComparator();
                private BinaryHashMap hashMap =
                        new BinaryHashMap(TABLE_SIZE, TABLE_FRAME_SIZE, putHashFunc, getHashFunc, cmp);
                private ArrayBackedValueStorage resultStorage = new ArrayBackedValueStorage();
                private DataOutput out = resultStorage.getDataOutput();

                @Override
                public void evaluate(IFrameTupleReference tuple, IPointable result) throws HyracksDataException {
                    resultStorage.reset();
                    recordBuilder.reset(outRecType);
                    requiredRecordTypeInfo.reset(outRecType);
                    eval0.evaluate(tuple, argPtr0);
                    eval1.evaluate(tuple, argPtr1);
                    // Make sure we got a valid record
                    byte typeTag0 = argPtr0.getByteArray()[argPtr0.getStartOffset()];
                    if (typeTag0 != ATypeTag.SERIALIZED_RECORD_TYPE_TAG) {
                        throw new TypeMismatchException(getIdentifier(), 0, typeTag0,
                                ATypeTag.SERIALIZED_RECORD_TYPE_TAG);
                    }
                    // Make sure we got a valid list
                    byte typeTag1 = argPtr1.getByteArray()[argPtr1.getStartOffset()];
                    if (typeTag1 != ATypeTag.SERIALIZED_ORDEREDLIST_TYPE_TAG) {
                        throw new TypeMismatchException(getIdentifier(), 1, typeTag1,
                                ATypeTag.SERIALIZED_ORDEREDLIST_TYPE_TAG);
                    }
                    vp0.set(argPtr0);
                    vp1.set(argPtr1);
                    ARecordVisitablePointable recordPointable = (ARecordVisitablePointable) vp0;
                    AListVisitablePointable listPointable = (AListVisitablePointable) vp1;
                    // Initialize our hash map; reuse the existing one when it is big enough,
                    // thus avoiding unnecessary object construction
                    int tableSize = recordPointable.getFieldNames().size() + listPointable.getItems().size();
                    if (hashMap == null || tableSize > TABLE_SIZE) {
                        hashMap = new BinaryHashMap(tableSize, TABLE_FRAME_SIZE, putHashFunc, getHashFunc, cmp);
                    } else {
                        hashMap.clear();
                    }
                    addFields(recordPointable, listPointable);
                    recordBuilder.write(out, true);
                    result.set(resultStorage);
                }

                private void addFields(ARecordVisitablePointable inputRecordPointer,
                        AListVisitablePointable listPointable) throws HyracksDataException {
                    List<IVisitablePointable> inputRecordFieldNames = inputRecordPointer.getFieldNames();
                    List<IVisitablePointable> inputRecordFieldValues = inputRecordPointer.getFieldValues();
                    List<IVisitablePointable> inputFields = listPointable.getItems();
                    IVisitablePointable namePointable = null;
                    IVisitablePointable valuePointable = null;
                    int numInputRecordFields = inputRecordFieldNames.size();
                    try {
                        // Add the original record's fields without duplicate checking
                        for (int i = 0; i < numInputRecordFields; ++i) {
                            IVisitablePointable fnp = inputRecordFieldNames.get(i);
                            IVisitablePointable fvp = inputRecordFieldValues.get(i);
                            int pos = requiredRecordTypeInfo.getFieldIndex(fnp.getByteArray(),
                                    fnp.getStartOffset() + 1, fnp.getLength() - 1);
                            if (pos >= 0) {
                                recordBuilder.addField(pos, fvp);
                            } else {
                                recordBuilder.addField(fnp, fvp);
                            }
                            keyEntry.set(fnp.getByteArray(), fnp.getStartOffset(), fnp.getLength());
                            valEntry.set(fvp.getByteArray(), fvp.getStartOffset(), fvp.getLength());
                            hashMap.put(keyEntry, valEntry);
                        }
                        // Get the fields from the list of records
                        for (int i = 0; i < inputFields.size(); i++) {
                            if (!PointableHelper.sameType(ATypeTag.OBJECT, inputFields.get(i))) {
                                throw new AsterixException("Expected a list of records, got "
                                        + PointableHelper.getTypeTag(inputFields.get(i)));
                            }
                            List<IVisitablePointable> names =
                                    ((ARecordVisitablePointable) inputFields.get(i)).getFieldNames();
                            List<IVisitablePointable> values =
                                    ((ARecordVisitablePointable) inputFields.get(i)).getFieldValues();
                            // Get the name and value of the field to be added.
                            // Use a loop to account for the case where the user switches the order of the fields.
                            IVisitablePointable fieldName;
                            for (int j = 0; j < names.size(); j++) {
                                fieldName = names.get(j);
                                // if fieldName is "field-name", then read the name
                                if (PointableHelper.byteArrayEqual(fieldNamePointable, fieldName)) {
                                    namePointable = values.get(j);
                                } else {
                                    // otherwise the fieldName is "field-value", so read the value
                                    valuePointable = values.get(j);
                                }
                            }
                            if (namePointable == null || valuePointable == null) {
                                throw new InvalidDataFormatException(getIdentifier(), "fields to be added");
                            }
                            // Check whether the field being added belongs to the required record type
                            int pos = requiredRecordTypeInfo.getFieldIndex(namePointable.getByteArray(),
                                    namePointable.getStartOffset() + 1, namePointable.getLength() - 1);
                            keyEntry.set(namePointable.getByteArray(), namePointable.getStartOffset(),
                                    namePointable.getLength());
                            // Check whether the name is already in the built record
                            BinaryEntry entry = hashMap.get(keyEntry);
                            if (entry != null) {
                                tempValReference.set(entry.getBuf(), entry.getOffset(), entry.getLength());
                                // If the values are not equal, throw a conflicting-duplicate-field error;
                                // otherwise ignore the repeated field
                                if (!PointableHelper.byteArrayEqual(valuePointable, tempValReference)) {
                                    throw new RuntimeDataException(ErrorCode.DUPLICATE_FIELD_NAME, getIdentifier());
                                }
                            } else {
                                if (pos > -1) {
                                    recordBuilder.addField(pos, valuePointable);
                                } else {
                                    recordBuilder.addField(namePointable, valuePointable);
                                }
                                valEntry.set(valuePointable.getByteArray(), valuePointable.getStartOffset(),
                                        valuePointable.getLength());
                                hashMap.put(keyEntry, valEntry);
                            }
                        }
                    } catch (AsterixException e) {
                        throw new HyracksDataException(e);
                    }
                }
            };
        }
    };
}
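The core of the duplicate handling above: every field name already added is remembered, a second occurrence is ignored when its value is byte-identical, and a conflicting value is rejected. A minimal sketch of the same policy using a plain java.util.HashMap instead of Hyracks' BinaryHashMap; FieldMapSketch is a hypothetical name, and real field names and values are serialized byte arrays rather than strings.

import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

public class FieldMapSketch {
    private final Map<String, byte[]> fields = new HashMap<>();

    void addField(String name, byte[] value) {
        byte[] existing = fields.get(name);
        if (existing == null) {
            fields.put(name, value);        // new field: accept it
        } else if (!Arrays.equals(existing, value)) {
            // same name, different value: conflicting duplicate field
            throw new IllegalStateException("Duplicate field name: " + name);
        }
        // same name, same value: ignore, as record-add-fields does
    }

    public static void main(String[] args) {
        FieldMapSketch m = new FieldMapSketch();
        m.addField("a", new byte[] { 1 });
        m.addField("a", new byte[] { 1 }); // byte-identical duplicate: ignored
        try {
            m.addField("a", new byte[] { 2 }); // conflicting duplicate: rejected
        } catch (IllegalStateException e) {
            System.out.println(e.getMessage()); // Duplicate field name: a
        }
    }
}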
Use of org.apache.hyracks.api.dataflow.value.IBinaryHashFunction in project asterixdb by apache.
The class ARecordSerializerDeserializer, method getFieldOffsetByName.
public static int getFieldOffsetByName(byte[] serRecord, int start, int len, byte[] fieldName, int nstart)
        throws HyracksDataException {
    // Return -1 if this is not a record, the record is empty (a record with len <= 5 has no fields),
    // or the record has no open part (the byte at start + 5 is the 'is expanded' flag).
    if (serRecord[start] != ATypeTag.SERIALIZED_RECORD_TYPE_TAG || len <= 5 || serRecord[start + 5] != 1) {
        return -1;
    }
    // 6 is the index of the first byte of the openPartOffset value.
    int openPartOffset = start + AInt32SerializerDeserializer.getInt(serRecord, start + 6);
    int numberOfOpenField = AInt32SerializerDeserializer.getInt(serRecord, openPartOffset);
    int fieldUtflength = UTF8StringUtil.getUTFLength(fieldName, nstart + 1);
    int fieldUtfMetaLen = UTF8StringUtil.getNumBytesToStoreLength(fieldUtflength);
    IBinaryHashFunction utf8HashFunction =
            BinaryHashFunctionFactoryProvider.UTF8STRING_POINTABLE_INSTANCE.createBinaryHashFunction();
    IBinaryComparator utf8BinaryComparator =
            BinaryComparatorFactoryProvider.UTF8STRING_POINTABLE_INSTANCE.createBinaryComparator();
    int fieldNameHashCode = utf8HashFunction.hash(fieldName, nstart + 1, fieldUtflength + fieldUtfMetaLen);
    int offset = openPartOffset + 4;
    int fieldOffset = -1;
    int mid = 0;
    int high = numberOfOpenField - 1;
    int low = 0;
    while (low <= high) {
        mid = (high + low) / 2;
        // 8 = hash code (4) + offset to the (name + tag + value) of the field (4).
        int h = AInt32SerializerDeserializer.getInt(serRecord, offset + (8 * mid));
        if (h == fieldNameHashCode) {
            fieldOffset = start + AInt32SerializerDeserializer.getInt(serRecord, offset + (8 * mid) + 4);
            // the UTF-8 comparator does not require the precise length; we can just pass an estimated limit.
            if (utf8BinaryComparator.compare(serRecord, fieldOffset, len, fieldName, nstart + 1,
                    fieldUtflength + fieldUtfMetaLen) == 0) {
                // since the names are equal, we can directly use the meta length and the UTF length.
                return fieldOffset + fieldUtfMetaLen + fieldUtflength;
            } else {
                // hash collision: scan forward over the remaining entries with the same hash code
                // (this else part has not been tested yet)
                for (int j = mid + 1; j < numberOfOpenField; j++) {
                    h = AInt32SerializerDeserializer.getInt(serRecord, offset + (8 * j));
                    if (h == fieldNameHashCode) {
                        fieldOffset = start + AInt32SerializerDeserializer.getInt(serRecord, offset + (8 * j) + 4);
                        if (utf8BinaryComparator.compare(serRecord, fieldOffset, len, fieldName, nstart + 1,
                                fieldUtflength) == 0) {
                            return fieldOffset + fieldUtfMetaLen + fieldUtflength;
                        }
                    } else {
                        break;
                    }
                }
            }
        }
        if (fieldNameHashCode > h) {
            low = mid + 1;
        } else {
            high = mid - 1;
        }
    }
    // no field with this name.
    return -1;
}
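The lookup above combines a binary search over (hash code, offset) pairs sorted by hash code with a forward scan that resolves hash collisions by comparing the actual field names. A self-contained sketch of that strategy; SortedHashIndexSketch and findOffset are illustration names, and, like the original, the scan only moves forward from the probe point, so it shares the original's untested-collision caveat.

public class SortedHashIndexSketch {
    // entries[i][0] = name hash code, entries[i][1] = field offset; sorted by hash code.
    static int findOffset(int[][] entries, int nameHash, java.util.function.IntPredicate namesEqualAt) {
        int low = 0, high = entries.length - 1;
        while (low <= high) {
            int mid = (low + high) >>> 1;
            int h = entries[mid][0];
            if (h == nameHash) {
                // scan forward over entries with the same hash until the names match
                for (int j = mid; j < entries.length && entries[j][0] == nameHash; j++) {
                    if (namesEqualAt.test(entries[j][1])) {
                        return entries[j][1];
                    }
                }
                return -1;
            }
            if (nameHash > h) {
                low = mid + 1;
            } else {
                high = mid - 1;
            }
        }
        return -1; // no field with this name
    }

    public static void main(String[] args) {
        int[][] entries = { { 5, 100 }, { 9, 140 }, { 9, 180 }, { 12, 220 } };
        // pretend the name we want lives at offset 180 (a hash collision on 9)
        System.out.println(findOffset(entries, 9, off -> off == 180)); // 180
    }
}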