use of org.apache.asterix.fuzzyjoin.similarity.SimilarityFiltersJaccard in project asterixdb by apache.
the class SimilarityJaccardPrefixEvaluator method evaluate.
@Override
public void evaluate(IFrameTupleReference tuple, IPointable result) throws HyracksDataException {
resultStorage.reset();
// similarity threshold
sim = 0;
evalThreshold.evaluate(tuple, inputVal);
float similarityThreshold = AFloatSerializerDeserializer.getFloat(inputVal.getByteArray(), inputVal.getStartOffset() + 1);
if (similarityThreshold != similarityThresholdCache || similarityFilters == null) {
similarityFilters = new SimilarityFiltersJaccard(similarityThreshold);
similarityThresholdCache = similarityThreshold;
}
evalLen1.evaluate(tuple, inputVal);
int length1 = ATypeHierarchy.getIntegerValue(BuiltinFunctions.SIMILARITY_JACCARD.getName(), 0, inputVal.getByteArray(), inputVal.getStartOffset());
evalLen2.evaluate(tuple, inputVal);
int length2 = ATypeHierarchy.getIntegerValue(BuiltinFunctions.SIMILARITY_JACCARD.getName(), 2, inputVal.getByteArray(), inputVal.getStartOffset());
//
if (similarityFilters.passLengthFilter(length1, length2)) {
// -- - tokens1 - --
int i;
tokens1.reset();
evalTokens1.evaluate(tuple, inputVal);
byte[] serList = inputVal.getByteArray();
int startOffset = inputVal.getStartOffset();
if (serList[startOffset] != ATypeTag.SERIALIZED_ORDEREDLIST_TYPE_TAG && serList[startOffset] != ATypeTag.SERIALIZED_UNORDEREDLIST_TYPE_TAG) {
throw new TypeMismatchException(BuiltinFunctions.SIMILARITY_JACCARD, 1, serList[startOffset], ATypeTag.SERIALIZED_ORDEREDLIST_TYPE_TAG, ATypeTag.SERIALIZED_UNORDEREDLIST_TYPE_TAG);
}
int lengthTokens1;
if (serList[startOffset] == ATypeTag.SERIALIZED_ORDEREDLIST_TYPE_TAG) {
lengthTokens1 = AOrderedListSerializerDeserializer.getNumberOfItems(inputVal.getByteArray(), startOffset);
// read tokens
for (i = 0; i < lengthTokens1; i++) {
int itemOffset;
int token;
try {
itemOffset = AOrderedListSerializerDeserializer.getItemOffset(serList, startOffset, i);
} catch (AsterixException e) {
throw new HyracksDataException(e);
}
token = ATypeHierarchy.getIntegerValueWithDifferentTypeTagPosition(BuiltinFunctions.SIMILARITY_JACCARD.getName(), 1, serList, itemOffset, startOffset + 1);
tokens1.add(token);
}
} else {
lengthTokens1 = AUnorderedListSerializerDeserializer.getNumberOfItems(inputVal.getByteArray(), startOffset);
// read tokens
for (i = 0; i < lengthTokens1; i++) {
int itemOffset;
int token;
try {
itemOffset = AUnorderedListSerializerDeserializer.getItemOffset(serList, startOffset, i);
} catch (AsterixException e) {
throw new HyracksDataException(e);
}
token = ATypeHierarchy.getIntegerValueWithDifferentTypeTagPosition(BuiltinFunctions.SIMILARITY_JACCARD.getName(), 1, serList, itemOffset, startOffset + 1);
tokens1.add(token);
}
}
// pad tokens
for (; i < length1; i++) {
tokens1.add(Integer.MAX_VALUE);
}
// -- - tokens2 - --
tokens2.reset();
evalTokens2.evaluate(tuple, inputVal);
serList = inputVal.getByteArray();
startOffset = inputVal.getStartOffset();
if (serList[startOffset] != ATypeTag.SERIALIZED_ORDEREDLIST_TYPE_TAG && serList[startOffset] != ATypeTag.SERIALIZED_UNORDEREDLIST_TYPE_TAG) {
throw new TypeMismatchException(BuiltinFunctions.SIMILARITY_JACCARD, 3, serList[startOffset], ATypeTag.SERIALIZED_ORDEREDLIST_TYPE_TAG, ATypeTag.SERIALIZED_UNORDEREDLIST_TYPE_TAG);
}
int lengthTokens2;
if (serList[startOffset] == ATypeTag.SERIALIZED_ORDEREDLIST_TYPE_TAG) {
lengthTokens2 = AOrderedListSerializerDeserializer.getNumberOfItems(inputVal.getByteArray(), startOffset);
// read tokens
for (i = 0; i < lengthTokens2; i++) {
int itemOffset;
int token;
try {
itemOffset = AOrderedListSerializerDeserializer.getItemOffset(serList, startOffset, i);
} catch (AsterixException e) {
throw new HyracksDataException(e);
}
token = ATypeHierarchy.getIntegerValueWithDifferentTypeTagPosition(BuiltinFunctions.SIMILARITY_JACCARD.getName(), 3, serList, itemOffset, startOffset + 1);
tokens2.add(token);
}
} else {
lengthTokens2 = AUnorderedListSerializerDeserializer.getNumberOfItems(inputVal.getByteArray(), startOffset);
// read tokens
for (i = 0; i < lengthTokens2; i++) {
int itemOffset;
int token;
try {
itemOffset = AUnorderedListSerializerDeserializer.getItemOffset(serList, startOffset, i);
} catch (AsterixException e) {
throw new HyracksDataException(e);
}
token = ATypeHierarchy.getIntegerValueWithDifferentTypeTagPosition(BuiltinFunctions.SIMILARITY_JACCARD.getName(), 3, serList, itemOffset, startOffset + 1);
tokens2.add(token);
}
}
// pad tokens
for (; i < length2; i++) {
tokens2.add(Integer.MAX_VALUE);
}
// -- - token prefix - --
evalTokenPrefix.evaluate(tuple, inputVal);
int tokenPrefix = ATypeHierarchy.getIntegerValue(BuiltinFunctions.SIMILARITY_JACCARD.getName(), 4, inputVal.getByteArray(), inputVal.getStartOffset());
//
// -- - position filter - --
//
SimilarityMetric.getPartialIntersectSize(tokens1.get(), 0, tokens1.length(), tokens2.get(), 0, tokens2.length(), tokenPrefix, parInter);
if (similarityFilters.passPositionFilter(parInter.intersectSize, parInter.posXStop, length1, parInter.posYStop, length2)) {
//
if (similarityFilters.passSuffixFilter(tokens1.get(), 0, tokens1.length(), parInter.posXStart, tokens2.get(), 0, tokens2.length(), parInter.posYStart)) {
sim = similarityFilters.passSimilarityFilter(tokens1.get(), 0, tokens1.length(), parInter.posXStop + 1, tokens2.get(), 0, tokens2.length(), parInter.posYStop + 1, parInter.intersectSize);
}
}
}
try {
writeResult();
} catch (IOException e) {
throw new HyracksDataException(e);
}
result.set(resultStorage);
}
use of org.apache.asterix.fuzzyjoin.similarity.SimilarityFiltersJaccard in project asterixdb by apache.
the class PrefixLenJaccardDescriptor method createEvaluatorFactory.
@Override
public IScalarEvaluatorFactory createEvaluatorFactory(final IScalarEvaluatorFactory[] args) {
return new IScalarEvaluatorFactory() {
private static final long serialVersionUID = 1L;
@Override
public IScalarEvaluator createScalarEvaluator(final IHyracksTaskContext ctx) throws HyracksDataException {
return new IScalarEvaluator() {
private final ArrayBackedValueStorage resultStorage = new ArrayBackedValueStorage();
private final DataOutput out = resultStorage.getDataOutput();
private final IPointable lenPtr = new VoidPointable();
private final IPointable thresholdPtr = new VoidPointable();
private final IScalarEvaluator evalLen = args[0].createScalarEvaluator(ctx);
private final IScalarEvaluator evalThreshold = args[1].createScalarEvaluator(ctx);
private float similarityThresholdCache;
private SimilarityFiltersJaccard similarityFilters;
// result
private final AMutableInt32 res = new AMutableInt32(0);
@SuppressWarnings("unchecked")
private final ISerializerDeserializer<AInt32> int32Serde = SerializerDeserializerProvider.INSTANCE.getSerializerDeserializer(BuiltinType.AINT32);
@Override
public void evaluate(IFrameTupleReference tuple, IPointable result) throws HyracksDataException {
resultStorage.reset();
evalLen.evaluate(tuple, lenPtr);
evalThreshold.evaluate(tuple, thresholdPtr);
// length
int length = ATypeHierarchy.getIntegerValue(getIdentifier().getName(), 0, lenPtr.getByteArray(), lenPtr.getStartOffset());
// similarity threshold
byte[] data = thresholdPtr.getByteArray();
int offset = thresholdPtr.getStartOffset();
if (data[offset] != ATypeTag.SERIALIZED_FLOAT_TYPE_TAG) {
throw new TypeMismatchException(getIdentifier(), 1, data[offset], ATypeTag.SERIALIZED_FLOAT_TYPE_TAG);
}
float similarityThreshold = AFloatSerializerDeserializer.getFloat(data, offset + 1);
if (similarityThreshold != similarityThresholdCache || similarityFilters == null) {
similarityFilters = new SimilarityFiltersJaccard(similarityThreshold);
}
int prefixLength = similarityFilters.getPrefixLength(length);
res.setValue(prefixLength);
try {
int32Serde.serialize(res, out);
} catch (IOException e) {
throw new HyracksDataException(e);
}
result.set(resultStorage);
}
};
}
};
}
Aggregations