Search in sources :

Example 11 with ListField

use of edu.uci.ics.textdb.api.field.ListField in project textdb by TextDB.

the class SimilarityJoinPredicate method joinTuples.

/**
 * Joins an inner and an outer tuple on the similarity of their span values.
 *
 * Only spans whose attribute name equals the respective join attribute
 * (innerJoinAttrName / outerJoinAttrName) take part. Every pair of distinct
 * inner/outer span values is scored with similarityFunc; values belonging to
 * at least one pair scoring at or above similarityThreshold are kept.
 *
 * @param innerTuple   tuple from the inner side of the join
 * @param outerTuple   tuple from the outer side of the join
 * @param outputSchema schema handed to mergeTuples for the resulting tuple
 * @return the merged tuple, or null when the threshold is 0 or when no pair
 *         of span values is similar enough
 * @throws DataFlowException declared by the join contract
 */
@Override
public Tuple joinTuples(Tuple innerTuple, Tuple outerTuple, Schema outputSchema) throws DataFlowException {
    if (similarityThreshold == 0) {
        return null;
    }

    // keep only the spans that belong to the join attribute of each side
    ListField<Span> innerField = innerTuple.getField(SchemaConstants.SPAN_LIST);
    List<Span> innerCandidates = new ArrayList<>();
    for (Span candidate : innerField.getValue()) {
        if (candidate.getAttributeName().equals(innerJoinAttrName)) {
            innerCandidates.add(candidate);
        }
    }
    ListField<Span> outerField = outerTuple.getField(SchemaConstants.SPAN_LIST);
    List<Span> outerCandidates = new ArrayList<>();
    for (Span candidate : outerField.getValue()) {
        if (candidate.getAttributeName().equals(outerJoinAttrName)) {
            outerCandidates.add(candidate);
        }
    }

    // collapse to distinct values, because several spans may carry the same value
    Set<String> innerValues = new HashSet<>();
    for (Span candidate : innerCandidates) {
        innerValues.add(candidate.getValue());
    }
    Set<String> outerValues = new HashSet<>();
    for (Span candidate : outerCandidates) {
        outerValues.add(candidate.getValue());
    }

    // score every inner/outer value pair and remember both members of each similar pair
    Set<String> similarValues = new HashSet<>();
    for (String innerValue : innerValues) {
        for (String outerValue : outerValues) {
            boolean similarEnough =
                    this.similarityFunc.calculateSimilarity(innerValue, outerValue) >= this.similarityThreshold;
            if (similarEnough) {
                similarValues.add(innerValue);
                similarValues.add(outerValue);
            }
        }
    }

    // nothing was similar: there is no join result for this pair of tuples
    if (similarValues.isEmpty()) {
        return null;
    }

    // emit the participating spans, inner side first, each tagged with its side's prefix
    List<Span> joinedSpans = new ArrayList<>();
    for (Span candidate : innerCandidates) {
        if (!similarValues.contains(candidate.getValue())) {
            continue;
        }
        joinedSpans.add(addFieldPrefix(candidate, INNER_PREFIX));
    }
    for (Span candidate : outerCandidates) {
        if (!similarValues.contains(candidate.getValue())) {
            continue;
        }
        joinedSpans.add(addFieldPrefix(candidate, OUTER_PREFIX));
    }
    return mergeTuples(innerTuple, outerTuple, outputSchema, joinedSpans);
}
Also used : SchemaConstants(edu.uci.ics.textdb.api.constants.SchemaConstants) JsonProperty(com.fasterxml.jackson.annotation.JsonProperty) java.util(java.util) Attribute(edu.uci.ics.textdb.api.schema.Attribute) IDField(edu.uci.ics.textdb.api.field.IDField) DataFlowException(edu.uci.ics.textdb.api.exception.DataFlowException) Collectors(java.util.stream.Collectors) AttributeType(edu.uci.ics.textdb.api.schema.AttributeType) PredicateBase(edu.uci.ics.textdb.exp.common.PredicateBase) Schema(edu.uci.ics.textdb.api.schema.Schema) ListField(edu.uci.ics.textdb.api.field.ListField) IField(edu.uci.ics.textdb.api.field.IField) JsonCreator(com.fasterxml.jackson.annotation.JsonCreator) JsonIgnore(com.fasterxml.jackson.annotation.JsonIgnore) edu.uci.ics.textdb.api.tuple(edu.uci.ics.textdb.api.tuple) Span(edu.uci.ics.textdb.api.span.Span) PropertyNameConstants(edu.uci.ics.textdb.exp.common.PropertyNameConstants) IOperator(edu.uci.ics.textdb.api.dataflow.IOperator) NormalizedLevenshtein(info.debatty.java.stringsimilarity.NormalizedLevenshtein) Span(edu.uci.ics.textdb.api.span.Span)

Example 12 with ListField

use of edu.uci.ics.textdb.api.field.ListField in project textdb by TextDB.

the class KeywordMatcher method computePhraseMatchingResult.

/**
 * Computes the spans produced by matching the predicate's query as a phrase
 * against each attribute named by the predicate.
 *
 * For a STRING attribute a single span covering the whole query is emitted
 * when the field value equals the query exactly. For a TEXT attribute the
 * payload spans of that attribute are sorted by token offset and scanned for
 * runs whose values and relative token offsets line up with the query tokens;
 * each run is emitted as one combined span.
 *
 * @param inputTuple tuple whose PAYLOAD field holds the token spans to inspect
 * @return the matching spans; empty when nothing matched
 * @throws DataFlowException if an attribute is neither STRING nor TEXT
 */
private List<Span> computePhraseMatchingResult(Tuple inputTuple) throws DataFlowException {
    ListField<Span> payloadField = inputTuple.getField(SchemaConstants.PAYLOAD);
    List<Span> payload = payloadField.getValue();
    List<Span> relevantSpans = filterRelevantSpans(payload);
    List<Span> matchingResults = new ArrayList<>();
    for (String attributeName : this.predicate.getAttributeNames()) {
        AttributeType attributeType = this.inputSchema.getAttribute(attributeName).getAttributeType();
        String fieldValue = inputTuple.getField(attributeName).getValue().toString();
        // types other than TEXT and STRING: throw Exception for now
        if (attributeType != AttributeType.STRING && attributeType != AttributeType.TEXT) {
            throw new DataFlowException("KeywordMatcher: Fields other than STRING and TEXT are not supported yet");
        }
        // for STRING type, the query should match the fieldValue completely
        if (attributeType == AttributeType.STRING) {
            if (fieldValue.equals(predicate.getQuery())) {
                matchingResults.add(new Span(attributeName, 0, predicate.getQuery().length(), predicate.getQuery(), fieldValue));
            }
        }
        // phrase query
        if (attributeType == AttributeType.TEXT) {
            List<Span> fieldSpanList = relevantSpans.stream().filter(span -> span.getAttributeName().equals(attributeName)).collect(Collectors.toList());
            if (!isAllQueryTokensPresent(fieldSpanList, queryTokenSet)) {
                // skip this attribute: the isAllQueryTokensPresent check on its spans failed
                continue;
            }
            // Sort current field's span list by token offset for later use.
            // Integer.compare is used instead of subtraction so the ordering
            // cannot be corrupted by integer overflow.
            Collections.sort(fieldSpanList, (span1, span2) -> Integer.compare(span1.getTokenOffset(), span2.getTokenOffset()));
            // positions, within the query including stopwords, of the tokens
            // that are also present in queryTokenList
            List<Integer> queryTokenOffset = new ArrayList<>();
            for (int i = 0; i < queryTokensWithStopwords.size(); i++) {
                if (queryTokenList.contains(queryTokensWithStopwords.get(i))) {
                    queryTokenOffset.add(i);
                }
            }
            // iter maintains the position, in fieldSpanList, of the span
            // currently being checked as the start of a phrase match
            int iter = 0;
            while (iter < fieldSpanList.size()) {
                // not enough spans left to hold the whole phrase
                if (iter > fieldSpanList.size() - queryTokenList.size()) {
                    break;
                }
                // Verify that the spans starting at iter correspond to the
                // phrase query: consecutive spans must have the same relative
                // token offsets as the query tokens, and the same values
                // (ignoring case).
                boolean isMismatchInSpan = false;
                // check every adjacent pair of query tokens
                for (int i = 0; i < queryTokenList.size() - 1; i++) {
                    Span first = fieldSpanList.get(iter + i);
                    Span second = fieldSpanList.get(iter + i + 1);
                    if (!(second.getTokenOffset() - first.getTokenOffset() == queryTokenOffset.get(i + 1) - queryTokenOffset.get(i) && first.getValue().equalsIgnoreCase(queryTokenList.get(i)) && second.getValue().equalsIgnoreCase(queryTokenList.get(i + 1)))) {
                        // mismatch: retry with the next span as the start
                        iter++;
                        isMismatchInSpan = true;
                        break;
                    }
                }
                if (isMismatchInSpan) {
                    continue;
                }
                // full phrase found: merge first..last span into one result span
                int combinedSpanStartIndex = fieldSpanList.get(iter).getStart();
                int combinedSpanEndIndex = fieldSpanList.get(iter + queryTokenList.size() - 1).getEnd();
                Span combinedSpan = new Span(attributeName, combinedSpanStartIndex, combinedSpanEndIndex, predicate.getQuery(), fieldValue.substring(combinedSpanStartIndex, combinedSpanEndIndex));
                matchingResults.add(combinedSpan);
                // continue scanning after the spans consumed by this match
                iter = iter + queryTokenList.size();
            }
        }
    }
    return matchingResults;
}
Also used : SchemaConstants(edu.uci.ics.textdb.api.constants.SchemaConstants) Attribute(edu.uci.ics.textdb.api.schema.Attribute) Iterator(java.util.Iterator) ErrorMessages(edu.uci.ics.textdb.api.constants.ErrorMessages) AbstractSingleInputOperator(edu.uci.ics.textdb.exp.common.AbstractSingleInputOperator) DataFlowException(edu.uci.ics.textdb.api.exception.DataFlowException) Set(java.util.Set) Utils(edu.uci.ics.textdb.api.utils.Utils) Collectors(java.util.stream.Collectors) ArrayList(java.util.ArrayList) AttributeType(edu.uci.ics.textdb.api.schema.AttributeType) HashSet(java.util.HashSet) Schema(edu.uci.ics.textdb.api.schema.Schema) List(java.util.List) ListField(edu.uci.ics.textdb.api.field.ListField) Matcher(java.util.regex.Matcher) TextDBException(edu.uci.ics.textdb.api.exception.TextDBException) Pattern(java.util.regex.Pattern) Span(edu.uci.ics.textdb.api.span.Span) Collections(java.util.Collections) DataflowUtils(edu.uci.ics.textdb.exp.utils.DataflowUtils) Tuple(edu.uci.ics.textdb.api.tuple.Tuple) AttributeType(edu.uci.ics.textdb.api.schema.AttributeType) ArrayList(java.util.ArrayList) DataFlowException(edu.uci.ics.textdb.api.exception.DataFlowException) Span(edu.uci.ics.textdb.api.span.Span)

Example 13 with ListField

use of edu.uci.ics.textdb.api.field.ListField in project textdb by TextDB.

the class JoinDistancePredicate method joinTuples.

/**
 * This method is called by the Join operator to perform the join on the
 * tuples passed.
 *
 * Two spans (one per tuple, both on the join attribute) are joined when both
 * their start offsets and their end offsets differ by at most the threshold.
 * Each joined pair yields a new span covering the union of the two ranges.
 *
 * @param innerTuple   tuple from the inner side; its field values are used for the output
 * @param outerTuple   tuple from the outer side
 * @param outputSchema schema of the tuple to produce
 * @return New Tuple containing the result of join operation, or null when the
 *         tuples differ on _ID or on the join attribute, when either tuple
 *         has no usable span list, or when no pair of spans is within the
 *         threshold.
 * @throws Exception declared by the join contract; this body throws nothing explicitly
 */
@Override
public Tuple joinTuples(Tuple innerTuple, Tuple outerTuple, Schema outputSchema) throws Exception {
    List<Span> newJoinSpanList = new ArrayList<>();
    /*
     * We expect the values of all fields to be the same for innerTuple and outerTuple.
     * We only check the _ID field and the field to be joined, since they are crucial to the join operator.
     * For other fields, we use the value from innerTuple.
     */
    // check if the _ID fields are the same
    if (!compareField(innerTuple, outerTuple, SchemaConstants._ID)) {
        return null;
    }
    // check if the fields to be joined are the same
    if (!compareField(innerTuple, outerTuple, this.joinAttributeName)) {
        return null;
    }
    /*
     * If either/both tuples have no span information, return null so the
     * caller can move on to the next tuple. Previously a missing or
     * non-ListField span field left the span list null and was dereferenced
     * below, raising a NullPointerException instead.
     */
    ListField<Span> spanFieldOfInnerTuple = innerTuple.getField(SchemaConstants.SPAN_LIST);
    ListField<Span> spanFieldOfOuterTuple = outerTuple.getField(SchemaConstants.SPAN_LIST);
    if (spanFieldOfInnerTuple == null || spanFieldOfOuterTuple == null) {
        return null;
    }
    if (!spanFieldOfInnerTuple.getClass().equals(ListField.class)
            || !spanFieldOfOuterTuple.getClass().equals(ListField.class)) {
        return null;
    }
    List<Span> innerSpanList = spanFieldOfInnerTuple.getValue();
    List<Span> outerSpanList = spanFieldOfOuterTuple.getValue();
    if (innerSpanList == null || outerSpanList == null) {
        return null;
    }

    for (Span outerSpan : outerSpanList) {
        // only spans on the join attribute take part in the join
        if (!outerSpan.getAttributeName().equals(this.joinAttributeName)) {
            continue;
        }
        for (Span innerSpan : innerSpanList) {
            if (!innerSpan.getAttributeName().equals(this.joinAttributeName)) {
                continue;
            }
            int threshold = this.getThreshold();
            boolean startsClose = Math.abs(outerSpan.getStart() - innerSpan.getStart()) <= threshold;
            boolean endsClose = Math.abs(outerSpan.getEnd() - innerSpan.getEnd()) <= threshold;
            if (startsClose && endsClose) {
                // the joined span covers the union of the two source spans
                int newSpanStartIndex = Math.min(innerSpan.getStart(), outerSpan.getStart());
                int newSpanEndIndex = Math.max(innerSpan.getEnd(), outerSpan.getEnd());
                String attributeName = this.joinAttributeName;
                String fieldValue = (String) innerTuple.getField(attributeName).getValue();
                String newFieldValue = fieldValue.substring(newSpanStartIndex, newSpanEndIndex);
                String spanKey = outerSpan.getKey() + "_" + innerSpan.getKey();
                Span newSpan = new Span(attributeName, newSpanStartIndex, newSpanEndIndex, spanKey, newFieldValue);
                newJoinSpanList.add(newSpan);
            }
        }
    }
    if (newJoinSpanList.isEmpty()) {
        return null;
    }
    // create output fields based on innerTuple's value; the span list is
    // excluded here and replaced by the newly joined spans, appended last
    List<Attribute> outputAttrList = outputSchema.getAttributes();
    List<IField> outputFields = outputAttrList.stream()
            .filter(attr -> !attr.equals(SchemaConstants.SPAN_LIST_ATTRIBUTE))
            .map(attr -> attr.getAttributeName())
            .map(attributeName -> innerTuple.getField(attributeName, IField.class))
            .collect(Collectors.toList());
    outputFields.add(new ListField<>(newJoinSpanList));
    return new Tuple(outputSchema, outputFields.toArray(new IField[0]));
}
Also used : SchemaConstants(edu.uci.ics.textdb.api.constants.SchemaConstants) JsonProperty(com.fasterxml.jackson.annotation.JsonProperty) Attribute(edu.uci.ics.textdb.api.schema.Attribute) Iterator(java.util.Iterator) DataFlowException(edu.uci.ics.textdb.api.exception.DataFlowException) Collectors(java.util.stream.Collectors) ArrayList(java.util.ArrayList) AttributeType(edu.uci.ics.textdb.api.schema.AttributeType) PredicateBase(edu.uci.ics.textdb.exp.common.PredicateBase) Schema(edu.uci.ics.textdb.api.schema.Schema) List(java.util.List) ListField(edu.uci.ics.textdb.api.field.ListField) IField(edu.uci.ics.textdb.api.field.IField) JsonCreator(com.fasterxml.jackson.annotation.JsonCreator) edu.uci.ics.textdb.api.tuple(edu.uci.ics.textdb.api.tuple) Span(edu.uci.ics.textdb.api.span.Span) PropertyNameConstants(edu.uci.ics.textdb.exp.common.PropertyNameConstants) IOperator(edu.uci.ics.textdb.api.dataflow.IOperator) Attribute(edu.uci.ics.textdb.api.schema.Attribute) ArrayList(java.util.ArrayList) IField(edu.uci.ics.textdb.api.field.IField) Span(edu.uci.ics.textdb.api.span.Span)

Example 14 with ListField

use of edu.uci.ics.textdb.api.field.ListField in project textdb by TextDB.

the class SpanTupleTest method testGetters.

/**
 * Builds a tuple from the people attributes plus the span-list attribute and
 * verifies that the span-list field read back from the tuple is a ListField
 * and is the very same instance that was put in.
 */
@Test
public void testGetters() throws ParseException {
    // schema: every people attribute followed by the span-list attribute
    int peopleCount = TestConstants.ATTRIBUTES_PEOPLE.length;
    Attribute[] attributes = new Attribute[peopleCount + 1];
    System.arraycopy(TestConstants.ATTRIBUTES_PEOPLE, 0, attributes, 0, peopleCount);
    attributes[peopleCount] = SchemaConstants.SPAN_LIST_ATTRIBUTE;

    // field values for the people attributes
    List<IField> fields = new ArrayList<IField>(Arrays.<IField>asList(
            new StringField("bruce"),
            new StringField("lee"),
            new IntegerField(46),
            new DoubleField(5.50),
            new DateField(new SimpleDateFormat("MM-dd-yyyy").parse("01-14-1970")),
            new TextField("bruce was born in new york city and was grown up in los angeles")));

    // the span list goes last, matching the schema layout above
    IField expectedSpanField = createSpanListField();
    fields.add(expectedSpanField);

    spanTuple = new Tuple(new Schema(attributes), fields.toArray(new IField[0]));

    IField actualSpanField = spanTuple.getField(SchemaConstants.SPAN_LIST);
    Assert.assertTrue(actualSpanField instanceof ListField);
    Assert.assertSame(expectedSpanField, actualSpanField);
}
Also used : Attribute(edu.uci.ics.textdb.api.schema.Attribute) Schema(edu.uci.ics.textdb.api.schema.Schema) ArrayList(java.util.ArrayList) IntegerField(edu.uci.ics.textdb.api.field.IntegerField) ListField(edu.uci.ics.textdb.api.field.ListField) IField(edu.uci.ics.textdb.api.field.IField) StringField(edu.uci.ics.textdb.api.field.StringField) TextField(edu.uci.ics.textdb.api.field.TextField) DateField(edu.uci.ics.textdb.api.field.DateField) SimpleDateFormat(java.text.SimpleDateFormat) DoubleField(edu.uci.ics.textdb.api.field.DoubleField) Tuple(edu.uci.ics.textdb.api.tuple.Tuple) Test(org.junit.Test)

Example 15 with ListField

use of edu.uci.ics.textdb.api.field.ListField in project textdb by TextDB.

the class SpanTupleTest method createSpanListField.

/**
 * Creates the span-list field used by the tests: two spans on the
 * "description" attribute, both keyed "LOCATION".
 *
 * The key of a span depends on the matcher that produced it:
 * for RegexMatcher it is the regex, e.g. "n.*k";
 * for NamedEntityMatcher it is the entity type, e.g. LOCATION;
 * for DictionaryMatcher it is the dictionary entry, e.g. "new york"
 * (key and value are the same);
 * for KeyWordMatcher it is the query, e.g. "new york"
 * (the value can be "new" or "york").
 *
 * @return a ListField wrapping the two spans
 */
private IField createSpanListField() {
    List<Span> spans = new ArrayList<Span>();
    spans.add(new Span("description", 18, 26, "LOCATION", "new york"));
    spans.add(new Span("description", 52, 63, "LOCATION", "los angeles"));
    return new ListField<Span>(spans);
}
Also used : ArrayList(java.util.ArrayList) ListField(edu.uci.ics.textdb.api.field.ListField) IField(edu.uci.ics.textdb.api.field.IField) Span(edu.uci.ics.textdb.api.span.Span)

Aggregations

ListField (edu.uci.ics.textdb.api.field.ListField)20 Span (edu.uci.ics.textdb.api.span.Span)18 IField (edu.uci.ics.textdb.api.field.IField)17 ArrayList (java.util.ArrayList)17 Attribute (edu.uci.ics.textdb.api.schema.Attribute)16 Schema (edu.uci.ics.textdb.api.schema.Schema)16 Tuple (edu.uci.ics.textdb.api.tuple.Tuple)14 Test (org.junit.Test)11 SchemaConstants (edu.uci.ics.textdb.api.constants.SchemaConstants)5 DataFlowException (edu.uci.ics.textdb.api.exception.DataFlowException)5 AttributeType (edu.uci.ics.textdb.api.schema.AttributeType)5 Collectors (java.util.stream.Collectors)5 Iterator (java.util.Iterator)4 List (java.util.List)4 ErrorMessages (edu.uci.ics.textdb.api.constants.ErrorMessages)3 TextDBException (edu.uci.ics.textdb.api.exception.TextDBException)3 Utils (edu.uci.ics.textdb.api.utils.Utils)3 AbstractSingleInputOperator (edu.uci.ics.textdb.exp.common.AbstractSingleInputOperator)3 DataflowUtils (edu.uci.ics.textdb.exp.utils.DataflowUtils)3 JsonCreator (com.fasterxml.jackson.annotation.JsonCreator)2