Search in sources :

Example 76 with Span

use of edu.uci.ics.textdb.api.span.Span in project textdb by TextDB.

the class KeywordMatcher method computePhraseMatchingResult.

/**
 * Computes the spans in {@code inputTuple} that match the predicate's query as a phrase.
 *
 * <p>For each attribute named by the predicate:
 * <ul>
 *   <li>STRING attributes match only when the whole field value equals the query;</li>
 *   <li>TEXT attributes are matched against the payload spans of that attribute: a match
 *       is a run of consecutive spans (after sorting by token offset) whose values equal
 *       the query tokens and whose token-offset gaps equal the gaps between the
 *       corresponding tokens in the original query (stopwords included).</li>
 * </ul>
 *
 * @param inputTuple tuple carrying the attribute values and a {@code PAYLOAD} span list
 * @return the matching spans across all attributes; empty when nothing matches
 * @throws DataFlowException if an attribute is neither STRING nor TEXT
 */
private List<Span> computePhraseMatchingResult(Tuple inputTuple) throws DataFlowException {
    ListField<Span> payloadField = inputTuple.getField(SchemaConstants.PAYLOAD);
    List<Span> payload = payloadField.getValue();
    // NOTE(review): presumably keeps only the payload spans related to the query tokens -- confirm in filterRelevantSpans.
    List<Span> relevantSpans = filterRelevantSpans(payload);
    List<Span> matchingResults = new ArrayList<>();
    for (String attributeName : this.predicate.getAttributeNames()) {
        AttributeType attributeType = this.inputSchema.getAttribute(attributeName).getAttributeType();
        String fieldValue = inputTuple.getField(attributeName).getValue().toString();
        // types other than TEXT and STRING: throw Exception for now
        if (attributeType != AttributeType.STRING && attributeType != AttributeType.TEXT) {
            throw new DataFlowException("KeywordMatcher: Fields other than STRING and TEXT are not supported yet");
        }
        // for STRING type, the query should match the fieldValue completely
        if (attributeType == AttributeType.STRING) {
            if (fieldValue.equals(predicate.getQuery())) {
                matchingResults.add(new Span(attributeName, 0, predicate.getQuery().length(), predicate.getQuery(), fieldValue));
            }
        }
        // phrase query
        if (attributeType == AttributeType.TEXT) {
            List<Span> fieldSpanList = relevantSpans.stream()
                    .filter(span -> span.getAttributeName().equals(attributeName))
                    .collect(Collectors.toList());
            // Skip this attribute unless every query token is present in its spans.
            if (!isAllQueryTokensPresent(fieldSpanList, queryTokenSet)) {
                continue;
            }
            // Sort current field's span list by token offset for later use.
            // Integer.compare avoids the overflow-prone "a - b" comparator idiom.
            Collections.sort(fieldSpanList,
                    (span1, span2) -> Integer.compare(span1.getTokenOffset(), span2.getTokenOffset()));
            // Positions, within the query including stopwords, of the tokens that are
            // also in queryTokenList; used to check relative token distances.
            List<Integer> queryTokenOffset = new ArrayList<>();
            for (int i = 0; i < queryTokensWithStopwords.size(); i++) {
                if (queryTokenList.contains(queryTokensWithStopwords.get(i))) {
                    queryTokenOffset.add(i);
                }
            }
            // iter is the index in fieldSpanList of the span currently tried as the
            // first token of the phrase.
            int iter = 0;
            while (iter < fieldSpanList.size()) {
                // Not enough spans left to hold the whole phrase.
                if (iter > fieldSpanList.size() - queryTokenList.size()) {
                    break;
                }
                // Verify that the spans starting at iter correspond to the phrase query:
                // each adjacent pair must have the same relative token offset as in the
                // query, and the span values must equal the query tokens (ignoring case).
                boolean matches = true;
                for (int i = 0; i < queryTokenList.size() - 1; i++) {
                    Span first = fieldSpanList.get(iter + i);
                    Span second = fieldSpanList.get(iter + i + 1);
                    boolean sameDistance = second.getTokenOffset() - first.getTokenOffset()
                            == queryTokenOffset.get(i + 1) - queryTokenOffset.get(i);
                    if (!(sameDistance
                            && first.getValue().equalsIgnoreCase(queryTokenList.get(i))
                            && second.getValue().equalsIgnoreCase(queryTokenList.get(i + 1)))) {
                        matches = false;
                        break;
                    }
                }
                if (!matches) {
                    // Try again with the next span as the phrase start.
                    iter++;
                    continue;
                }
                // Combine the first and last matched spans into one span covering the phrase.
                int combinedSpanStartIndex = fieldSpanList.get(iter).getStart();
                int combinedSpanEndIndex = fieldSpanList.get(iter + queryTokenList.size() - 1).getEnd();
                Span combinedSpan = new Span(attributeName, combinedSpanStartIndex, combinedSpanEndIndex, predicate.getQuery(), fieldValue.substring(combinedSpanStartIndex, combinedSpanEndIndex));
                matchingResults.add(combinedSpan);
                // Continue searching after the matched phrase.
                iter = iter + queryTokenList.size();
            }
        }
    }
    return matchingResults;
}
Also used : SchemaConstants(edu.uci.ics.textdb.api.constants.SchemaConstants) Attribute(edu.uci.ics.textdb.api.schema.Attribute) Iterator(java.util.Iterator) ErrorMessages(edu.uci.ics.textdb.api.constants.ErrorMessages) AbstractSingleInputOperator(edu.uci.ics.textdb.exp.common.AbstractSingleInputOperator) DataFlowException(edu.uci.ics.textdb.api.exception.DataFlowException) Set(java.util.Set) Utils(edu.uci.ics.textdb.api.utils.Utils) Collectors(java.util.stream.Collectors) ArrayList(java.util.ArrayList) AttributeType(edu.uci.ics.textdb.api.schema.AttributeType) HashSet(java.util.HashSet) Schema(edu.uci.ics.textdb.api.schema.Schema) List(java.util.List) ListField(edu.uci.ics.textdb.api.field.ListField) Matcher(java.util.regex.Matcher) TextDBException(edu.uci.ics.textdb.api.exception.TextDBException) Pattern(java.util.regex.Pattern) Span(edu.uci.ics.textdb.api.span.Span) Collections(java.util.Collections) DataflowUtils(edu.uci.ics.textdb.exp.utils.DataflowUtils) Tuple(edu.uci.ics.textdb.api.tuple.Tuple) AttributeType(edu.uci.ics.textdb.api.schema.AttributeType) ArrayList(java.util.ArrayList) DataFlowException(edu.uci.ics.textdb.api.exception.DataFlowException) Span(edu.uci.ics.textdb.api.span.Span)

Example 77 with Span

use of edu.uci.ics.textdb.api.span.Span in project textdb by TextDB.

the class JoinDistancePredicate method joinTuples.

/**
     * This method is called by the Join operator to perform the join on the
     * tuples passed.
     *
     * <p>Two spans (one from each tuple) on the join attribute are joined when both
     * their start offsets and their end offsets differ by at most the threshold;
     * the joined span covers the union of the two.
     *
     * @param innerTuple tuple whose field values are used for the output tuple
     * @param outerTuple tuple whose spans are compared against those of innerTuple
     * @param outputSchema schema of the tuple to build
     * @return New Tuple containing the result of join operation, or null when the
     *         _ID or join fields differ, when either tuple has no usable span
     *         list, or when no pair of spans satisfies the threshold.
     */
@Override
public Tuple joinTuples(Tuple innerTuple, Tuple outerTuple, Schema outputSchema) throws Exception {
    List<Span> newJoinSpanList = new ArrayList<>();
    /*
     * We expect the values of all fields to be the same for innerTuple and outerTuple.
     * We only check the _ID field and the field to be joined, since they are crucial
     * to the join operator. For other fields, we use the value from innerTuple.
     */
    if (!compareField(innerTuple, outerTuple, SchemaConstants._ID)) {
        return null;
    }
    // check if the fields to be joined are the same
    if (!compareField(innerTuple, outerTuple, this.joinAttributeName)) {
        return null;
    }
    /*
     * If either/both tuples have no span information, return null so the caller
     * can process the next tuple. Previously the span lists were left null in
     * that case and dereferenced below, causing a NullPointerException.
     */
    ListField<Span> spanFieldOfInnerTuple = innerTuple.getField(SchemaConstants.SPAN_LIST);
    ListField<Span> spanFieldOfOuterTuple = outerTuple.getField(SchemaConstants.SPAN_LIST);
    if (spanFieldOfInnerTuple == null || spanFieldOfOuterTuple == null) {
        return null;
    }
    if (!spanFieldOfInnerTuple.getClass().equals(ListField.class)
            || !spanFieldOfOuterTuple.getClass().equals(ListField.class)) {
        return null;
    }
    List<Span> innerSpanList = spanFieldOfInnerTuple.getValue();
    List<Span> outerSpanList = spanFieldOfOuterTuple.getValue();
    if (innerSpanList == null || outerSpanList == null) {
        return null;
    }
    // The threshold does not depend on the spans, so look it up once.
    Integer threshold = this.getThreshold();
    for (Span outerSpan : outerSpanList) {
        // Only spans on the join attribute take part in the join.
        if (!outerSpan.getAttributeName().equals(this.joinAttributeName)) {
            continue;
        }
        for (Span innerSpan : innerSpanList) {
            if (!innerSpan.getAttributeName().equals(this.joinAttributeName)) {
                continue;
            }
            if (Math.abs(outerSpan.getStart() - innerSpan.getStart()) <= threshold && Math.abs(outerSpan.getEnd() - innerSpan.getEnd()) <= threshold) {
                // The joined span covers both input spans.
                Integer newSpanStartIndex = Math.min(innerSpan.getStart(), outerSpan.getStart());
                Integer newSpanEndIndex = Math.max(innerSpan.getEnd(), outerSpan.getEnd());
                String attributeName = this.joinAttributeName;
                String fieldValue = (String) innerTuple.getField(attributeName).getValue();
                String newFieldValue = fieldValue.substring(newSpanStartIndex, newSpanEndIndex);
                String spanKey = outerSpan.getKey() + "_" + innerSpan.getKey();
                Span newSpan = new Span(attributeName, newSpanStartIndex, newSpanEndIndex, spanKey, newFieldValue);
                newJoinSpanList.add(newSpan);
            }
        }
    }
    if (newJoinSpanList.isEmpty()) {
        return null;
    }
    // create output fields based on innerTuple's value
    List<Attribute> outputAttrList = outputSchema.getAttributes();
    // Collect into an explicit ArrayList: Collectors.toList() gives no mutability
    // guarantee, and the joined span list is appended below.
    List<IField> outputFields = outputAttrList.stream()
            .filter(attr -> !attr.equals(SchemaConstants.SPAN_LIST_ATTRIBUTE))
            .map(attr -> attr.getAttributeName())
            .map(attributeName -> innerTuple.getField(attributeName, IField.class))
            .collect(Collectors.toCollection(ArrayList::new));
    outputFields.add(new ListField<>(newJoinSpanList));
    return new Tuple(outputSchema, outputFields.stream().toArray(IField[]::new));
}
Also used : SchemaConstants(edu.uci.ics.textdb.api.constants.SchemaConstants) JsonProperty(com.fasterxml.jackson.annotation.JsonProperty) Attribute(edu.uci.ics.textdb.api.schema.Attribute) Iterator(java.util.Iterator) DataFlowException(edu.uci.ics.textdb.api.exception.DataFlowException) Collectors(java.util.stream.Collectors) ArrayList(java.util.ArrayList) AttributeType(edu.uci.ics.textdb.api.schema.AttributeType) PredicateBase(edu.uci.ics.textdb.exp.common.PredicateBase) Schema(edu.uci.ics.textdb.api.schema.Schema) List(java.util.List) ListField(edu.uci.ics.textdb.api.field.ListField) IField(edu.uci.ics.textdb.api.field.IField) JsonCreator(com.fasterxml.jackson.annotation.JsonCreator) edu.uci.ics.textdb.api.tuple(edu.uci.ics.textdb.api.tuple) Span(edu.uci.ics.textdb.api.span.Span) PropertyNameConstants(edu.uci.ics.textdb.exp.common.PropertyNameConstants) IOperator(edu.uci.ics.textdb.api.dataflow.IOperator) Attribute(edu.uci.ics.textdb.api.schema.Attribute) ArrayList(java.util.ArrayList) IField(edu.uci.ics.textdb.api.field.IField) Span(edu.uci.ics.textdb.api.span.Span)

Example 78 with Span

use of edu.uci.ics.textdb.api.span.Span in project textdb by TextDB.

the class KeywordMatcher method computeSubstringMatchingResult.

/**
 * Computes the spans in {@code inputTuple} where the predicate's query occurs as a
 * case-insensitive substring.
 *
 * <p>For each attribute named by the predicate:
 * <ul>
 *   <li>STRING attributes match only when the whole field value equals the query;</li>
 *   <li>TEXT attributes yield one span per occurrence of the query in the field value.</li>
 * </ul>
 *
 * <p>The query is matched literally: regex metacharacters in it (for example
 * {@code +} or {@code (}) have no special meaning. Matching runs against the original
 * field value, so the reported offsets always index into that value.
 *
 * @param inputTuple tuple carrying the attribute values
 * @return the matching spans across all attributes; empty when nothing matches
 * @throws DataFlowException if an attribute is neither STRING nor TEXT
 */
private List<Span> computeSubstringMatchingResult(Tuple inputTuple) throws DataFlowException {
    List<Span> matchingResults = new ArrayList<>();
    // Compiled lazily on the first TEXT attribute and reused for the others.
    Pattern literalQueryPattern = null;
    for (String attributeName : this.predicate.getAttributeNames()) {
        AttributeType attributeType = this.inputSchema.getAttribute(attributeName).getAttributeType();
        String fieldValue = inputTuple.getField(attributeName).getValue().toString();
        // types other than TEXT and STRING: throw Exception for now
        if (attributeType != AttributeType.STRING && attributeType != AttributeType.TEXT) {
            throw new DataFlowException("KeywordMatcher: Fields other than STRING and TEXT are not supported yet");
        }
        // for STRING type, the query should match the fieldValue completely
        if (attributeType == AttributeType.STRING) {
            if (fieldValue.equals(predicate.getQuery())) {
                matchingResults.add(new Span(attributeName, 0, predicate.getQuery().length(), predicate.getQuery(), fieldValue));
            }
        }
        if (attributeType == AttributeType.TEXT) {
            if (literalQueryPattern == null) {
                // LITERAL: treat the query as plain text, not as a regular expression.
                // CASE_INSENSITIVE | UNICODE_CASE: ignore case without lower-casing the
                // input, which could change its length and shift the match offsets.
                literalQueryPattern = Pattern.compile(predicate.getQuery(),
                        Pattern.LITERAL | Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
            }
            Matcher matcher = literalQueryPattern.matcher(fieldValue);
            while (matcher.find()) {
                int start = matcher.start();
                int end = matcher.end();
                matchingResults.add(new Span(attributeName, start, end, predicate.getQuery(), fieldValue.substring(start, end)));
            }
        }
    }
    return matchingResults;
}
Also used : Pattern(java.util.regex.Pattern) Matcher(java.util.regex.Matcher) AttributeType(edu.uci.ics.textdb.api.schema.AttributeType) ArrayList(java.util.ArrayList) DataFlowException(edu.uci.ics.textdb.api.exception.DataFlowException) Span(edu.uci.ics.textdb.api.span.Span)

Example 79 with Span

use of edu.uci.ics.textdb.api.span.Span in project textdb by TextDB.

the class NlpEntityOperator method processOneInputTuple.

/**
 * Extracts NLP spans from every attribute named by the predicate and appends them to
 * the tuple's span list.
 *
 * @param inputTuple tuple to process; its span list is modified in place
 * @return the same {@code inputTuple} with the new spans appended, or null when no
 *         span was extracted from any attribute
 * @throws TextDBException propagated from the span extraction
 */
@Override
public Tuple processOneInputTuple(Tuple inputTuple) throws TextDBException {
    final List<Span> extractedSpans = new ArrayList<>();
    for (String name : predicate.getAttributeNames()) {
        final IField attributeField = inputTuple.getField(name);
        final List<Span> spansForAttribute = extractNlpSpans(attributeField, name);
        extractedSpans.addAll(spansForAttribute);
    }

    // Nothing extracted: signal "no result" for this tuple.
    if (extractedSpans.isEmpty()) {
        return null;
    }

    // Append to the span list already held by the tuple rather than replacing it.
    final ListField<Span> existingSpanField = inputTuple.getField(predicate.getSpanListName());
    existingSpanField.getValue().addAll(extractedSpans);
    return inputTuple;
}
Also used : ArrayList(java.util.ArrayList) IField(edu.uci.ics.textdb.api.field.IField) Span(edu.uci.ics.textdb.api.span.Span)

Example 80 with Span

use of edu.uci.ics.textdb.api.span.Span in project textdb by TextDB.

the class RegexMatcher method re2jRegexMatch.

/**
 * Finds every match of the RE2/J pattern in {@code fieldValue}.
 *
 * @param fieldValue text to search
 * @param attributeName attribute name recorded on each produced span
 * @return one span per match, in the order found, keyed by the predicate's regex and
 *         carrying the matched text; empty when there is no match
 */
private List<Span> re2jRegexMatch(String fieldValue, String attributeName) {
    final List<Span> spans = new ArrayList<>();
    final com.google.re2j.Matcher matcher = this.re2jPattern.matcher(fieldValue);
    for (boolean found = matcher.find(); found; found = matcher.find()) {
        final int from = matcher.start();
        final int to = matcher.end();
        final String regexKey = this.predicate.getRegex();
        final String matchedText = fieldValue.substring(from, to);
        spans.add(new Span(attributeName, from, to, regexKey, matchedText));
    }
    return spans;
}
Also used : ArrayList(java.util.ArrayList) Span(edu.uci.ics.textdb.api.span.Span)

Aggregations

Span (edu.uci.ics.textdb.api.span.Span)112 ArrayList (java.util.ArrayList)97 Schema (edu.uci.ics.textdb.api.schema.Schema)88 IField (edu.uci.ics.textdb.api.field.IField)86 Tuple (edu.uci.ics.textdb.api.tuple.Tuple)80 TextField (edu.uci.ics.textdb.api.field.TextField)71 Attribute (edu.uci.ics.textdb.api.schema.Attribute)71 Test (org.junit.Test)71 IntegerField (edu.uci.ics.textdb.api.field.IntegerField)60 StringField (edu.uci.ics.textdb.api.field.StringField)58 DoubleField (edu.uci.ics.textdb.api.field.DoubleField)49 DateField (edu.uci.ics.textdb.api.field.DateField)46 SimpleDateFormat (java.text.SimpleDateFormat)46 Dictionary (edu.uci.ics.textdb.exp.dictionarymatcher.Dictionary)25 ListField (edu.uci.ics.textdb.api.field.ListField)18 KeywordMatcherSourceOperator (edu.uci.ics.textdb.exp.keywordmatcher.KeywordMatcherSourceOperator)10 AttributeType (edu.uci.ics.textdb.api.schema.AttributeType)9 JoinDistancePredicate (edu.uci.ics.textdb.exp.join.JoinDistancePredicate)9 DataFlowException (edu.uci.ics.textdb.api.exception.DataFlowException)7 SchemaConstants (edu.uci.ics.textdb.api.constants.SchemaConstants)5