Search in sources :

Example 1 with SimilarityFiltersJaccard

use of org.apache.asterix.fuzzyjoin.similarity.SimilarityFiltersJaccard in project asterixdb by apache.

In the class SimilarityJaccardPrefixEvaluator, the method evaluate.

/**
 * Runs the Jaccard filter chain (length, position, suffix, similarity) over the two token
 * lists produced for {@code tuple} and hands the serialized outcome back through
 * {@code result}.
 *
 * <p>{@code sim} is reset to 0 on entry and is only overwritten when the length, position and
 * suffix filters all pass; {@link #writeResult()} is invoked in every case.
 *
 * @param tuple  the tuple every argument evaluator is run against
 * @param result set to {@code resultStorage} once {@code writeResult()} has completed
 * @throws HyracksDataException if a token-list argument is neither an ordered nor an unordered
 *         list (raised as {@code TypeMismatchException}), or wrapping an
 *         {@code AsterixException} / {@code IOException} raised while reading items or writing
 *         the result
 */
@Override
public void evaluate(IFrameTupleReference tuple, IPointable result) throws HyracksDataException {
    resultStorage.reset();
    sim = 0;

    // Threshold: the float is read one byte past the start offset; the filter object is
    // rebuilt only when the threshold differs from the one it was created with.
    evalThreshold.evaluate(tuple, inputVal);
    float threshold = AFloatSerializerDeserializer.getFloat(inputVal.getByteArray(), inputVal.getStartOffset() + 1);
    if (similarityFilters == null || threshold != similarityThresholdCache) {
        similarityFilters = new SimilarityFiltersJaccard(threshold);
        similarityThresholdCache = threshold;
    }

    // Declared lengths of both sides (arguments 0 and 2).
    evalLen1.evaluate(tuple, inputVal);
    int declaredLen1 = ATypeHierarchy.getIntegerValue(BuiltinFunctions.SIMILARITY_JACCARD.getName(), 0, inputVal.getByteArray(), inputVal.getStartOffset());
    evalLen2.evaluate(tuple, inputVal);
    int declaredLen2 = ATypeHierarchy.getIntegerValue(BuiltinFunctions.SIMILARITY_JACCARD.getName(), 2, inputVal.getByteArray(), inputVal.getStartOffset());

    if (similarityFilters.passLengthFilter(declaredLen1, declaredLen2)) {
        // ---- first token list (argument 1) ----
        tokens1.reset();
        evalTokens1.evaluate(tuple, inputVal);
        byte[] listBytes1 = inputVal.getByteArray();
        int listStart1 = inputVal.getStartOffset();
        byte listTag1 = listBytes1[listStart1];
        boolean ordered1 = listTag1 == ATypeTag.SERIALIZED_ORDEREDLIST_TYPE_TAG;
        if (!ordered1 && listTag1 != ATypeTag.SERIALIZED_UNORDEREDLIST_TYPE_TAG) {
            throw new TypeMismatchException(BuiltinFunctions.SIMILARITY_JACCARD, 1, listTag1, ATypeTag.SERIALIZED_ORDEREDLIST_TYPE_TAG, ATypeTag.SERIALIZED_UNORDEREDLIST_TYPE_TAG);
        }
        int itemCount1 = ordered1
                ? AOrderedListSerializerDeserializer.getNumberOfItems(listBytes1, listStart1)
                : AUnorderedListSerializerDeserializer.getNumberOfItems(listBytes1, listStart1);
        int pos = 0;
        while (pos < itemCount1) {
            int itemOffset;
            try {
                itemOffset = ordered1
                        ? AOrderedListSerializerDeserializer.getItemOffset(listBytes1, listStart1, pos)
                        : AUnorderedListSerializerDeserializer.getItemOffset(listBytes1, listStart1, pos);
            } catch (AsterixException e) {
                throw new HyracksDataException(e);
            }
            int tokenValue = ATypeHierarchy.getIntegerValueWithDifferentTypeTagPosition(BuiltinFunctions.SIMILARITY_JACCARD.getName(), 1, listBytes1, itemOffset, listStart1 + 1);
            tokens1.add(tokenValue);
            pos++;
        }
        // Fill up to the declared length; the index carries on from where reading stopped.
        while (pos < declaredLen1) {
            tokens1.add(Integer.MAX_VALUE);
            pos++;
        }

        // ---- second token list (argument 3) ----
        tokens2.reset();
        evalTokens2.evaluate(tuple, inputVal);
        byte[] listBytes2 = inputVal.getByteArray();
        int listStart2 = inputVal.getStartOffset();
        byte listTag2 = listBytes2[listStart2];
        boolean ordered2 = listTag2 == ATypeTag.SERIALIZED_ORDEREDLIST_TYPE_TAG;
        if (!ordered2 && listTag2 != ATypeTag.SERIALIZED_UNORDEREDLIST_TYPE_TAG) {
            throw new TypeMismatchException(BuiltinFunctions.SIMILARITY_JACCARD, 3, listTag2, ATypeTag.SERIALIZED_ORDEREDLIST_TYPE_TAG, ATypeTag.SERIALIZED_UNORDEREDLIST_TYPE_TAG);
        }
        int itemCount2 = ordered2
                ? AOrderedListSerializerDeserializer.getNumberOfItems(listBytes2, listStart2)
                : AUnorderedListSerializerDeserializer.getNumberOfItems(listBytes2, listStart2);
        pos = 0;
        while (pos < itemCount2) {
            int itemOffset;
            try {
                itemOffset = ordered2
                        ? AOrderedListSerializerDeserializer.getItemOffset(listBytes2, listStart2, pos)
                        : AUnorderedListSerializerDeserializer.getItemOffset(listBytes2, listStart2, pos);
            } catch (AsterixException e) {
                throw new HyracksDataException(e);
            }
            int tokenValue = ATypeHierarchy.getIntegerValueWithDifferentTypeTagPosition(BuiltinFunctions.SIMILARITY_JACCARD.getName(), 3, listBytes2, itemOffset, listStart2 + 1);
            tokens2.add(tokenValue);
            pos++;
        }
        while (pos < declaredLen2) {
            tokens2.add(Integer.MAX_VALUE);
            pos++;
        }

        // ---- token prefix (argument 4) ----
        evalTokenPrefix.evaluate(tuple, inputVal);
        int prefixLen = ATypeHierarchy.getIntegerValue(BuiltinFunctions.SIMILARITY_JACCARD.getName(), 4, inputVal.getByteArray(), inputVal.getStartOffset());

        // Partial intersection fills parInter, which drives the remaining filters.
        SimilarityMetric.getPartialIntersectSize(tokens1.get(), 0, tokens1.length(), tokens2.get(), 0, tokens2.length(), prefixLen, parInter);

        // Position filter first, suffix filter only if it passed (short-circuit).
        if (similarityFilters.passPositionFilter(parInter.intersectSize, parInter.posXStop, declaredLen1, parInter.posYStop, declaredLen2)
                && similarityFilters.passSuffixFilter(tokens1.get(), 0, tokens1.length(), parInter.posXStart, tokens2.get(), 0, tokens2.length(), parInter.posYStart)) {
            sim = similarityFilters.passSimilarityFilter(tokens1.get(), 0, tokens1.length(), parInter.posXStop + 1, tokens2.get(), 0, tokens2.length(), parInter.posYStop + 1, parInter.intersectSize);
        }
    }

    try {
        writeResult();
    } catch (IOException e) {
        throw new HyracksDataException(e);
    }
    result.set(resultStorage);
}
Also used : AsterixException(org.apache.asterix.common.exceptions.AsterixException) SimilarityFiltersJaccard(org.apache.asterix.fuzzyjoin.similarity.SimilarityFiltersJaccard) TypeMismatchException(org.apache.asterix.runtime.exceptions.TypeMismatchException) IOException(java.io.IOException) HyracksDataException(org.apache.hyracks.api.exceptions.HyracksDataException)

Example 2 with SimilarityFiltersJaccard

use of org.apache.asterix.fuzzyjoin.similarity.SimilarityFiltersJaccard in project asterixdb by apache.

In the class PrefixLenJaccardDescriptor, the method createEvaluatorFactory.

/**
 * Builds the evaluator factory for this function.
 *
 * <p>Each evaluator it creates reads a length from {@code args[0]} and a float similarity
 * threshold from {@code args[1]}, asks {@link SimilarityFiltersJaccard#getPrefixLength} for the
 * matching prefix length and serializes that value as an {@code AInt32} into {@code result}.
 *
 * @param args evaluator factories for the arguments: index 0 is the length, index 1 the
 *             similarity threshold
 * @return a factory producing one stateful evaluator per task context
 */
@Override
public IScalarEvaluatorFactory createEvaluatorFactory(final IScalarEvaluatorFactory[] args) {
    return new IScalarEvaluatorFactory() {

        private static final long serialVersionUID = 1L;

        @Override
        public IScalarEvaluator createScalarEvaluator(final IHyracksTaskContext ctx) throws HyracksDataException {
            return new IScalarEvaluator() {

                private final ArrayBackedValueStorage resultStorage = new ArrayBackedValueStorage();

                private final DataOutput out = resultStorage.getDataOutput();

                private final IPointable lenPtr = new VoidPointable();

                private final IPointable thresholdPtr = new VoidPointable();

                private final IScalarEvaluator evalLen = args[0].createScalarEvaluator(ctx);

                private final IScalarEvaluator evalThreshold = args[1].createScalarEvaluator(ctx);

                // Threshold that similarityFilters was last built for.
                private float similarityThresholdCache;

                // Created on first use and rebuilt only when the threshold changes.
                private SimilarityFiltersJaccard similarityFilters;

                // result
                private final AMutableInt32 res = new AMutableInt32(0);

                @SuppressWarnings("unchecked")
                private final ISerializerDeserializer<AInt32> int32Serde = SerializerDeserializerProvider.INSTANCE.getSerializerDeserializer(BuiltinType.AINT32);

                /**
                 * Computes the prefix length for the current tuple.
                 *
                 * @throws HyracksDataException if the threshold argument does not carry the
                 *         float type tag (raised as {@code TypeMismatchException}), or wrapping
                 *         an {@code IOException} from serialization
                 */
                @Override
                public void evaluate(IFrameTupleReference tuple, IPointable result) throws HyracksDataException {
                    resultStorage.reset();
                    evalLen.evaluate(tuple, lenPtr);
                    evalThreshold.evaluate(tuple, thresholdPtr);
                    // length
                    int length = ATypeHierarchy.getIntegerValue(getIdentifier().getName(), 0, lenPtr.getByteArray(), lenPtr.getStartOffset());
                    // similarity threshold: type tag at offset, float value right after it
                    byte[] data = thresholdPtr.getByteArray();
                    int offset = thresholdPtr.getStartOffset();
                    if (data[offset] != ATypeTag.SERIALIZED_FLOAT_TYPE_TAG) {
                        throw new TypeMismatchException(getIdentifier(), 1, data[offset], ATypeTag.SERIALIZED_FLOAT_TYPE_TAG);
                    }
                    float similarityThreshold = AFloatSerializerDeserializer.getFloat(data, offset + 1);
                    if (similarityThreshold != similarityThresholdCache || similarityFilters == null) {
                        similarityFilters = new SimilarityFiltersJaccard(similarityThreshold);
                        // Remember the threshold; without this the comparison above always
                        // sees the initial 0.0f and the filters are rebuilt on every tuple.
                        similarityThresholdCache = similarityThreshold;
                    }
                    int prefixLength = similarityFilters.getPrefixLength(length);
                    res.setValue(prefixLength);
                    try {
                        int32Serde.serialize(res, out);
                    } catch (IOException e) {
                        throw new HyracksDataException(e);
                    }
                    result.set(resultStorage);
                }
            };
        }
    };
}
Also used : DataOutput(java.io.DataOutput) TypeMismatchException(org.apache.asterix.runtime.exceptions.TypeMismatchException) IPointable(org.apache.hyracks.data.std.api.IPointable) IOException(java.io.IOException) IScalarEvaluator(org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator) ISerializerDeserializer(org.apache.hyracks.api.dataflow.value.ISerializerDeserializer) HyracksDataException(org.apache.hyracks.api.exceptions.HyracksDataException) IScalarEvaluatorFactory(org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory) ArrayBackedValueStorage(org.apache.hyracks.data.std.util.ArrayBackedValueStorage) SimilarityFiltersJaccard(org.apache.asterix.fuzzyjoin.similarity.SimilarityFiltersJaccard) IHyracksTaskContext(org.apache.hyracks.api.context.IHyracksTaskContext) VoidPointable(org.apache.hyracks.data.std.primitive.VoidPointable) IFrameTupleReference(org.apache.hyracks.dataflow.common.data.accessors.IFrameTupleReference) AMutableInt32(org.apache.asterix.om.base.AMutableInt32)

Aggregations

IOException (java.io.IOException)2 SimilarityFiltersJaccard (org.apache.asterix.fuzzyjoin.similarity.SimilarityFiltersJaccard)2 TypeMismatchException (org.apache.asterix.runtime.exceptions.TypeMismatchException)2 HyracksDataException (org.apache.hyracks.api.exceptions.HyracksDataException)2 DataOutput (java.io.DataOutput)1 AsterixException (org.apache.asterix.common.exceptions.AsterixException)1 AMutableInt32 (org.apache.asterix.om.base.AMutableInt32)1 IScalarEvaluator (org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator)1 IScalarEvaluatorFactory (org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory)1 IHyracksTaskContext (org.apache.hyracks.api.context.IHyracksTaskContext)1 ISerializerDeserializer (org.apache.hyracks.api.dataflow.value.ISerializerDeserializer)1 IPointable (org.apache.hyracks.data.std.api.IPointable)1 VoidPointable (org.apache.hyracks.data.std.primitive.VoidPointable)1 ArrayBackedValueStorage (org.apache.hyracks.data.std.util.ArrayBackedValueStorage)1 IFrameTupleReference (org.apache.hyracks.dataflow.common.data.accessors.IFrameTupleReference)1