Search in sources :

Example 1 with SimilarityFilters

use of org.apache.asterix.fuzzyjoin.similarity.SimilarityFilters in project asterixdb by apache.

the class PrefixLenDescriptor method createEvaluatorFactory.

/**
 * Creates the evaluator factory for this function.
 *
 * <p>Each evaluator produced by the factory reads three arguments per tuple:
 * {@code args[0]} is a length (serialized int32), {@code args[1]} is the similarity
 * function name (serialized string) and {@code args[2]} is the similarity threshold
 * (serialized double). It obtains the matching {@link SimilarityFilters} from a
 * per-evaluator cache and writes {@code getPrefixLength(length)} as a serialized int32.
 *
 * @param args factories for the three argument evaluators, in the order described above
 * @return a factory whose evaluators throw {@link TypeMismatchException} when an argument
 *         does not carry the expected type tag
 */
@Override
public IScalarEvaluatorFactory createEvaluatorFactory(final IScalarEvaluatorFactory[] args) {
    return new IScalarEvaluatorFactory() {

        private static final long serialVersionUID = 1L;

        @Override
        public IScalarEvaluator createScalarEvaluator(final IHyracksTaskContext ctx) throws HyracksDataException {
            return new IScalarEvaluator() {

                private final ArrayBackedValueStorage resultStorage = new ArrayBackedValueStorage();

                private final DataOutput out = resultStorage.getDataOutput();

                // Shared scratch pointable: each argument is evaluated into it in turn,
                // so values must be extracted before the next evaluate() call.
                private final IPointable inputVal = new VoidPointable();

                private final IScalarEvaluator evalLen = args[0].createScalarEvaluator(ctx);

                private final IScalarEvaluator evalSimilarity = args[1].createScalarEvaluator(ctx);

                private final IScalarEvaluator evalThreshold = args[2].createScalarEvaluator(ctx);

                private final SimilarityFiltersCache similarityFiltersCache = new SimilarityFiltersCache();

                // result
                private final AMutableInt32 res = new AMutableInt32(0);

                @SuppressWarnings("unchecked")
                private final ISerializerDeserializer<AInt32> int32Serde = SerializerDeserializerProvider.INSTANCE.getSerializerDeserializer(BuiltinType.AINT32);

                @Override
                public void evaluate(IFrameTupleReference tuple, IPointable result) throws HyracksDataException {
                    resultStorage.reset();
                    // length (argument 0)
                    evalLen.evaluate(tuple, inputVal);
                    byte[] data = inputVal.getByteArray();
                    int offset = inputVal.getStartOffset();
                    if (data[offset] != ATypeTag.SERIALIZED_INT32_TYPE_TAG) {
                        throw new TypeMismatchException(getIdentifier(), 0, data[offset], ATypeTag.SERIALIZED_INT32_TYPE_TAG);
                    }
                    // The payload starts one byte after the type tag.
                    int length = IntegerPointable.getInteger(data, offset + 1);
                    // similarity threshold (argument 2)
                    evalThreshold.evaluate(tuple, inputVal);
                    data = inputVal.getByteArray();
                    offset = inputVal.getStartOffset();
                    if (data[offset] != ATypeTag.SERIALIZED_DOUBLE_TYPE_TAG) {
                        throw new TypeMismatchException(getIdentifier(), 2, data[offset], ATypeTag.SERIALIZED_DOUBLE_TYPE_TAG);
                    }
                    // The threshold is stored as a double but narrowed to float for the filter lookup.
                    float similarityThreshold = (float) ADoubleSerializerDeserializer.getDouble(data, offset + 1);
                    // similarity name (argument 1)
                    evalSimilarity.evaluate(tuple, inputVal);
                    data = inputVal.getByteArray();
                    offset = inputVal.getStartOffset();
                    int len = inputVal.getLength();
                    if (data[offset] != ATypeTag.SERIALIZED_STRING_TYPE_TAG) {
                        throw new TypeMismatchException(getIdentifier(), 1, data[offset], ATypeTag.SERIALIZED_STRING_TYPE_TAG);
                    }
                    // The cache is handed the raw serialized name (tag included) together with the threshold.
                    SimilarityFilters similarityFilters = similarityFiltersCache.get(similarityThreshold, data, offset, len);
                    int prefixLength = similarityFilters.getPrefixLength(length);
                    res.setValue(prefixLength);
                    try {
                        int32Serde.serialize(res, out);
                    } catch (IOException e) {
                        throw new HyracksDataException(e);
                    }
                    result.set(resultStorage);
                }
            };
        }
    };
}
Also used : DataOutput(java.io.DataOutput) TypeMismatchException(org.apache.asterix.runtime.exceptions.TypeMismatchException) SimilarityFiltersCache(org.apache.asterix.runtime.evaluators.common.SimilarityFiltersCache) SimilarityFilters(org.apache.asterix.fuzzyjoin.similarity.SimilarityFilters) IPointable(org.apache.hyracks.data.std.api.IPointable) IOException(java.io.IOException) IScalarEvaluator(org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator) ISerializerDeserializer(org.apache.hyracks.api.dataflow.value.ISerializerDeserializer) HyracksDataException(org.apache.hyracks.api.exceptions.HyracksDataException) IScalarEvaluatorFactory(org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory) ArrayBackedValueStorage(org.apache.hyracks.data.std.util.ArrayBackedValueStorage) IHyracksTaskContext(org.apache.hyracks.api.context.IHyracksTaskContext) VoidPointable(org.apache.hyracks.data.std.primitive.VoidPointable) IFrameTupleReference(org.apache.hyracks.dataflow.common.data.accessors.IFrameTupleReference) AMutableInt32(org.apache.asterix.om.base.AMutableInt32)

Example 2 with SimilarityFilters

use of org.apache.asterix.fuzzyjoin.similarity.SimilarityFilters in project asterixdb by apache.

the class SimilarityDescriptor method createEvaluatorFactory.

/**
 * Creates the evaluator factory for this function.
 *
 * <p>Each evaluator produced by the factory reads seven arguments per tuple:
 * <ol start="0">
 *   <li>{@code args[0]} – length of the first token list (serialized int32)</li>
 *   <li>{@code args[1]} – first token list (ordered or unordered list)</li>
 *   <li>{@code args[2]} – length of the second token list (serialized int32)</li>
 *   <li>{@code args[3]} – second token list (ordered or unordered list)</li>
 *   <li>{@code args[4]} – token prefix (read as an int32)</li>
 *   <li>{@code args[5]} – similarity function name (serialized string)</li>
 *   <li>{@code args[6]} – similarity threshold (serialized double)</li>
 * </ol>
 * The result is a serialized double: {@code 0} unless the length, position and suffix
 * filters all pass, in which case it is the value returned by
 * {@code SimilarityFilters.passSimilarityFilter}.
 *
 * @param args factories for the seven argument evaluators, in the order described above
 * @return a factory whose evaluators throw {@link TypeMismatchException} when a checked
 *         argument does not carry the expected type tag
 */
@Override
public IScalarEvaluatorFactory createEvaluatorFactory(final IScalarEvaluatorFactory[] args) {
    return new IScalarEvaluatorFactory() {

        private static final long serialVersionUID = 1L;

        @Override
        public IScalarEvaluator createScalarEvaluator(final IHyracksTaskContext ctx) throws HyracksDataException {
            return new IScalarEvaluator() {

                private final ArrayBackedValueStorage resultStorage = new ArrayBackedValueStorage();

                private final DataOutput out = resultStorage.getDataOutput();

                // Shared scratch pointable: each argument is evaluated into it in turn,
                // so values must be extracted before the next evaluate() call.
                private final IPointable inputVal = new VoidPointable();

                private final IScalarEvaluator evalLen1 = args[0].createScalarEvaluator(ctx);

                private final IScalarEvaluator evalTokens1 = args[1].createScalarEvaluator(ctx);

                private final IScalarEvaluator evalLen2 = args[2].createScalarEvaluator(ctx);

                private final IScalarEvaluator evalTokens2 = args[3].createScalarEvaluator(ctx);

                private final IScalarEvaluator evalTokenPrefix = args[4].createScalarEvaluator(ctx);

                private final IScalarEvaluator evalSimilarity = args[5].createScalarEvaluator(ctx);

                private final IScalarEvaluator evalThreshold = args[6].createScalarEvaluator(ctx);

                private final SimilarityFiltersCache similarityFiltersCache = new SimilarityFiltersCache();

                private final IntArray tokens1 = new IntArray();

                private final IntArray tokens2 = new IntArray();

                private final PartialIntersect parInter = new PartialIntersect();

                // result
                private final AMutableDouble res = new AMutableDouble(0);

                @SuppressWarnings("unchecked")
                private final ISerializerDeserializer<ADouble> doubleSerde = SerializerDeserializerProvider.INSTANCE.getSerializerDeserializer(BuiltinType.ADOUBLE);

                @Override
                public void evaluate(IFrameTupleReference tuple, IPointable result) throws HyracksDataException {
                    resultStorage.reset();
                    // similarity threshold (argument 6)
                    evalThreshold.evaluate(tuple, inputVal);
                    byte[] data = inputVal.getByteArray();
                    int offset = inputVal.getStartOffset();
                    if (data[offset] != ATypeTag.SERIALIZED_DOUBLE_TYPE_TAG) {
                        throw new TypeMismatchException(getIdentifier(), 6, data[offset], ATypeTag.SERIALIZED_DOUBLE_TYPE_TAG);
                    }
                    // The threshold is stored as a double but narrowed to float for the filter lookup.
                    float similarityThreshold = (float) ADoubleSerializerDeserializer.getDouble(data, offset + 1);
                    // similarity name (argument 5)
                    evalSimilarity.evaluate(tuple, inputVal);
                    data = inputVal.getByteArray();
                    offset = inputVal.getStartOffset();
                    int len = inputVal.getLength();
                    if (data[offset] != ATypeTag.SERIALIZED_STRING_TYPE_TAG) {
                        throw new TypeMismatchException(getIdentifier(), 5, data[offset], ATypeTag.SERIALIZED_STRING_TYPE_TAG);
                    }
                    SimilarityFilters similarityFilters = similarityFiltersCache.get(similarityThreshold, data, offset, len);
                    // list lengths (arguments 0 and 2)
                    int length1 = readInt32(tuple, evalLen1, 0);
                    int length2 = readInt32(tuple, evalLen2, 2);
                    float sim = 0;
                    // The token lists are only deserialized when the cheap length filter passes.
                    if (similarityFilters.passLengthFilter(length1, length2)) {
                        // token lists (arguments 1 and 3), padded up to the declared lengths
                        readTokens(tuple, evalTokens1, tokens1, length1, 1);
                        readTokens(tuple, evalTokens2, tokens2, length2, 3);
                        // -- - token prefix - --
                        // NOTE(review): the type tag of this argument is not verified before the
                        // int32 payload is read — confirm that callers always pass an int32.
                        evalTokenPrefix.evaluate(tuple, inputVal);
                        int tokenPrefix = IntegerPointable.getInteger(inputVal.getByteArray(), inputVal.getStartOffset() + 1);
                        //
                        // -- - position filter - --
                        //
                        SimilarityMetric.getPartialIntersectSize(tokens1.get(), 0, tokens1.length(), tokens2.get(), 0, tokens2.length(), tokenPrefix, parInter);
                        if (similarityFilters.passPositionFilter(parInter.intersectSize, parInter.posXStop, length1, parInter.posYStop, length2)) {
                            //
                            if (similarityFilters.passSuffixFilter(tokens1.get(), 0, tokens1.length(), parInter.posXStart, tokens2.get(), 0, tokens2.length(), parInter.posYStart)) {
                                sim = similarityFilters.passSimilarityFilter(tokens1.get(), 0, tokens1.length(), parInter.posXStop + 1, tokens2.get(), 0, tokens2.length(), parInter.posYStop + 1, parInter.intersectSize);
                            }
                        }
                    }
                    res.setValue(sim);
                    try {
                        doubleSerde.serialize(res, out);
                    } catch (IOException e) {
                        throw new HyracksDataException(e);
                    }
                    result.set(resultStorage);
                }

                /**
                 * Evaluates one argument and returns its int32 payload.
                 *
                 * @param tuple    the tuple the argument is evaluated against
                 * @param eval     evaluator of the argument
                 * @param argIndex position of the argument, reported if its type tag is not int32
                 */
                private int readInt32(IFrameTupleReference tuple, IScalarEvaluator eval, int argIndex)
                        throws HyracksDataException {
                    eval.evaluate(tuple, inputVal);
                    byte[] bytes = inputVal.getByteArray();
                    int start = inputVal.getStartOffset();
                    if (bytes[start] != ATypeTag.SERIALIZED_INT32_TYPE_TAG) {
                        throw new TypeMismatchException(getIdentifier(), argIndex, bytes[start], ATypeTag.SERIALIZED_INT32_TYPE_TAG);
                    }
                    // The payload starts one byte after the type tag.
                    return IntegerPointable.getInteger(bytes, start + 1);
                }

                /**
                 * Evaluates a list argument and loads its items into {@code tokens}, then pads
                 * with {@code Integer.MAX_VALUE} until {@code paddedLength} entries are present.
                 *
                 * @param tuple        the tuple the argument is evaluated against
                 * @param evalTokens   evaluator of the list argument
                 * @param tokens       destination array; reset before being filled
                 * @param paddedLength minimum number of entries {@code tokens} holds afterwards
                 * @param argIndex     position of the argument, reported if it is not a list
                 */
                private void readTokens(IFrameTupleReference tuple, IScalarEvaluator evalTokens, IntArray tokens,
                        int paddedLength, int argIndex) throws HyracksDataException {
                    tokens.reset();
                    evalTokens.evaluate(tuple, inputVal);
                    byte[] serList = inputVal.getByteArray();
                    int offset = inputVal.getStartOffset();
                    // The tag is read from the list's own buffer at its own start offset.
                    byte typeTag = serList[offset];
                    boolean ordered = typeTag == ATypeTag.SERIALIZED_ORDEREDLIST_TYPE_TAG;
                    if (!ordered && typeTag != ATypeTag.SERIALIZED_UNORDEREDLIST_TYPE_TAG) {
                        throw new TypeMismatchException(getIdentifier(), argIndex, typeTag, ATypeTag.SERIALIZED_ORDEREDLIST_TYPE_TAG, ATypeTag.SERIALIZED_UNORDEREDLIST_TYPE_TAG);
                    }
                    int numTokens = ordered
                            ? AOrderedListSerializerDeserializer.getNumberOfItems(serList, offset)
                            : AUnorderedListSerializerDeserializer.getNumberOfItems(serList, offset);
                    int i;
                    // read tokens
                    for (i = 0; i < numTokens; i++) {
                        int itemOffset;
                        try {
                            itemOffset = ordered
                                    ? AOrderedListSerializerDeserializer.getItemOffset(serList, offset, i)
                                    : AUnorderedListSerializerDeserializer.getItemOffset(serList, offset, i);
                        } catch (AsterixException e) {
                            throw new HyracksDataException(e);
                        }
                        tokens.add(IntegerPointable.getInteger(serList, itemOffset));
                    }
                    // pad tokens
                    for (; i < paddedLength; i++) {
                        tokens.add(Integer.MAX_VALUE);
                    }
                }
            };
        }
    };
}
Also used : DataOutput(java.io.DataOutput) TypeMismatchException(org.apache.asterix.runtime.exceptions.TypeMismatchException) SimilarityFiltersCache(org.apache.asterix.runtime.evaluators.common.SimilarityFiltersCache) SimilarityFilters(org.apache.asterix.fuzzyjoin.similarity.SimilarityFilters) IPointable(org.apache.hyracks.data.std.api.IPointable) IOException(java.io.IOException) IScalarEvaluator(org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator) ISerializerDeserializer(org.apache.hyracks.api.dataflow.value.ISerializerDeserializer) HyracksDataException(org.apache.hyracks.api.exceptions.HyracksDataException) IScalarEvaluatorFactory(org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory) ArrayBackedValueStorage(org.apache.hyracks.data.std.util.ArrayBackedValueStorage) AsterixException(org.apache.asterix.common.exceptions.AsterixException) IntArray(org.apache.asterix.fuzzyjoin.IntArray) IHyracksTaskContext(org.apache.hyracks.api.context.IHyracksTaskContext) VoidPointable(org.apache.hyracks.data.std.primitive.VoidPointable) PartialIntersect(org.apache.asterix.fuzzyjoin.similarity.PartialIntersect) AMutableDouble(org.apache.asterix.om.base.AMutableDouble) IFrameTupleReference(org.apache.hyracks.dataflow.common.data.accessors.IFrameTupleReference)

Aggregations

DataOutput (java.io.DataOutput)2 IOException (java.io.IOException)2 SimilarityFilters (org.apache.asterix.fuzzyjoin.similarity.SimilarityFilters)2 SimilarityFiltersCache (org.apache.asterix.runtime.evaluators.common.SimilarityFiltersCache)2 TypeMismatchException (org.apache.asterix.runtime.exceptions.TypeMismatchException)2 IScalarEvaluator (org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator)2 IScalarEvaluatorFactory (org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory)2 IHyracksTaskContext (org.apache.hyracks.api.context.IHyracksTaskContext)2 ISerializerDeserializer (org.apache.hyracks.api.dataflow.value.ISerializerDeserializer)2 HyracksDataException (org.apache.hyracks.api.exceptions.HyracksDataException)2 IPointable (org.apache.hyracks.data.std.api.IPointable)2 VoidPointable (org.apache.hyracks.data.std.primitive.VoidPointable)2 ArrayBackedValueStorage (org.apache.hyracks.data.std.util.ArrayBackedValueStorage)2 IFrameTupleReference (org.apache.hyracks.dataflow.common.data.accessors.IFrameTupleReference)2 AsterixException (org.apache.asterix.common.exceptions.AsterixException)1 IntArray (org.apache.asterix.fuzzyjoin.IntArray)1 PartialIntersect (org.apache.asterix.fuzzyjoin.similarity.PartialIntersect)1 AMutableDouble (org.apache.asterix.om.base.AMutableDouble)1 AMutableInt32 (org.apache.asterix.om.base.AMutableInt32)1