Search in sources :

Example 6 with AttributeType

use of edu.uci.ics.textdb.api.schema.AttributeType in project textdb by TextDB.

the class NlpSplitOperator method open.

/**
 * Opens this operator and prepares its output schema.
 *
 * Does nothing unless the cursor is currently CLOSED. Otherwise it opens the
 * input operator, verifies that the predicate's input attribute exists in the
 * input schema and has type STRING or TEXT, derives the output schema via
 * transformSchema, and marks the cursor as OPENED.
 *
 * @throws TextDBException
 *             (as a DataFlowException) if no input operator is set, if the
 *             input attribute is absent from the input schema, or if its
 *             type is neither STRING nor TEXT
 */
@Override
public void open() throws TextDBException {
    // nothing to do unless the operator is currently closed
    if (cursor != CLOSED) {
        return;
    }
    if (inputOperator == null) {
        throw new DataFlowException(ErrorMessages.INPUT_OPERATOR_NOT_SPECIFIED);
    }
    inputOperator.open();

    Schema upstreamSchema = inputOperator.getOutputSchema();
    String inputAttributeName = predicate.getInputAttributeName();

    // the attribute to split must exist in the upstream schema
    boolean attributePresent = upstreamSchema.containsField(inputAttributeName);
    if (!attributePresent) {
        throw new DataFlowException(String.format("input attribute %s is not in the input schema %s", inputAttributeName, upstreamSchema.getAttributeNames()));
    }

    // only STRING and TEXT attributes are accepted
    AttributeType actualType = upstreamSchema.getAttribute(inputAttributeName).getAttributeType();
    if (!actualType.equals(AttributeType.STRING) && !actualType.equals(AttributeType.TEXT)) {
        throw new DataFlowException(String.format("input attribute %s must have type String or Text, its actual type is %s", inputAttributeName, actualType));
    }

    // the output schema is derived from the input schema by transformSchema
    // (the original note says it depends on the chosen output format,
    // OneToOne vs. OneToMany)
    outputSchema = transformSchema(inputOperator.getOutputSchema());
    cursor = OPENED;
}
Also used : AttributeType(edu.uci.ics.textdb.api.schema.AttributeType) Schema(edu.uci.ics.textdb.api.schema.Schema) DataFlowException(edu.uci.ics.textdb.api.exception.DataFlowException)

Example 7 with AttributeType

use of edu.uci.ics.textdb.api.schema.AttributeType in project textdb by TextDB.

the class RegexMatcher method processOneInputTuple.

/**
 * Applies the regex to every attribute named in the predicate and records
 * the matches on the input tuple.
 *
 * For example, given tuple ("george watson", "graduate student", 23,
 * "(949)888-8888") and regex "g[^\s]*", the spans
 * [Span(name, 0, 6, "g[^\s]*", "george watson"), Span(position, 0, 8,
 * "g[^\s]*", "graduate student")] are found.
 *
 * All matching spans are appended to the list held in the tuple's span-list
 * field (the field named by {@code predicate.getSpanListName()}), and the
 * same tuple instance is returned.
 *
 * @param inputTuple
 *            document in which the search is performed; may be null
 * @return the input tuple with the matching spans appended to its span
 *         list, or null if the input tuple is null or nothing matched
 * @throws DataFlowException
 *             if one of the predicate's attributes is neither STRING nor
 *             TEXT
 */
@Override
public Tuple processOneInputTuple(Tuple inputTuple) throws DataFlowException {
    if (inputTuple == null) {
        return null;
    }
    List<Span> matchingResults = new ArrayList<>();
    for (String attributeName : predicate.getAttributeNames()) {
        AttributeType attributeType = inputSchema.getAttribute(attributeName).getAttributeType();
        // types other than TEXT and STRING: throw Exception for now.
        // Checked before the field value is read so that an unsupported
        // attribute is always reported with this message.
        if (attributeType != AttributeType.STRING && attributeType != AttributeType.TEXT) {
            throw new DataFlowException("RegexMatcher: Fields other than STRING and TEXT are not supported yet");
        }
        String fieldValue = inputTuple.getField(attributeName).getValue().toString();
        // dispatch to the matcher of the configured regex engine
        switch(regexEngine) {
            case JavaRegex:
                matchingResults.addAll(javaRegexMatch(fieldValue, attributeName));
                break;
            case RE2J:
                matchingResults.addAll(re2jRegexMatch(fieldValue, attributeName));
                break;
        }
    }
    // a tuple without any match is dropped
    if (matchingResults.isEmpty()) {
        return null;
    }
    // append the new spans to the tuple's existing span list in place
    ListField<Span> spanListField = inputTuple.getField(predicate.getSpanListName());
    List<Span> spanList = spanListField.getValue();
    spanList.addAll(matchingResults);
    return inputTuple;
}
Also used : AttributeType(edu.uci.ics.textdb.api.schema.AttributeType) ArrayList(java.util.ArrayList) DataFlowException(edu.uci.ics.textdb.api.exception.DataFlowException) Span(edu.uci.ics.textdb.api.span.Span)

Example 8 with AttributeType

use of edu.uci.ics.textdb.api.schema.AttributeType in project textdb by TextDB.

the class SimilarityJoinPredicate method generateOutputSchema.

/**
 * Builds the output schema of the join from the two input schemas.
 *
 * The result starts with the _ID attribute, followed by every inner
 * attribute (name prefixed with INNER_PREFIX) and every outer attribute
 * (name prefixed with OUTER_PREFIX), skipping the _ID, span-list and payload
 * attributes of both inputs. The span-list attribute is then appended, and
 * the payload attribute is appended when at least one input schema has a
 * payload field.
 *
 * @param innerOperatorSchema
 *            schema of the inner input
 * @param outerOperatorSchema
 *            schema of the outer input
 * @return the combined output schema
 * @throws DataFlowException
 *             declared by the signature; not thrown directly in this method
 */
@Override
public Schema generateOutputSchema(Schema innerOperatorSchema, Schema outerOperatorSchema) throws DataFlowException {
    List<Attribute> combined = new ArrayList<>();

    // _ID always comes first
    combined.add(SchemaConstants._ID_ATTRIBUTE);

    // inner attributes, renamed with the inner prefix
    for (Attribute innerAttribute : innerOperatorSchema.getAttributes()) {
        String name = innerAttribute.getAttributeName();
        boolean isReserved = name.equals(SchemaConstants._ID) || name.equals(SchemaConstants.SPAN_LIST) || name.equals(SchemaConstants.PAYLOAD);
        if (!isReserved) {
            combined.add(new Attribute(INNER_PREFIX + name, innerAttribute.getAttributeType()));
        }
    }

    // outer attributes, renamed with the outer prefix
    for (Attribute outerAttribute : outerOperatorSchema.getAttributes()) {
        String name = outerAttribute.getAttributeName();
        boolean isReserved = name.equals(SchemaConstants._ID) || name.equals(SchemaConstants.SPAN_LIST) || name.equals(SchemaConstants.PAYLOAD);
        if (!isReserved) {
            combined.add(new Attribute(OUTER_PREFIX + name, outerAttribute.getAttributeType()));
        }
    }

    // a single span-list attribute for the joined tuple
    combined.add(SchemaConstants.SPAN_LIST_ATTRIBUTE);

    // payload is carried over only if either side has one
    boolean anyPayload = innerOperatorSchema.containsField(SchemaConstants.PAYLOAD) || outerOperatorSchema.containsField(SchemaConstants.PAYLOAD);
    if (anyPayload) {
        combined.add(SchemaConstants.PAYLOAD_ATTRIBUTE);
    }

    Attribute[] attributeArray = combined.toArray(new Attribute[combined.size()]);
    return new Schema(attributeArray);
}
Also used : Attribute(edu.uci.ics.textdb.api.schema.Attribute) AttributeType(edu.uci.ics.textdb.api.schema.AttributeType) Schema(edu.uci.ics.textdb.api.schema.Schema)

Example 9 with AttributeType

use of edu.uci.ics.textdb.api.schema.AttributeType in project textdb by TextDB.

the class KeywordMatcher method computeConjunctionMatchingResult.

/**
 * Computes the matching spans of the input tuple for a conjunction query.
 *
 * For every attribute named in the predicate:
 * a STRING attribute matches only if its whole value equals the query, and
 * then yields one span covering the query length; a TEXT attribute matches
 * only if isAllQueryTokensPresent accepts the relevant payload spans of that
 * attribute, and then yields all of those spans.
 *
 * @param inputTuple
 *            tuple to match; its payload field supplies the candidate spans
 * @return the matching spans, possibly empty
 * @throws DataFlowException
 *             if an attribute is neither STRING nor TEXT
 */
private List<Span> computeConjunctionMatchingResult(Tuple inputTuple) throws DataFlowException {
    ListField<Span> payloadListField = inputTuple.getField(SchemaConstants.PAYLOAD);
    List<Span> candidateSpans = filterRelevantSpans(payloadListField.getValue());

    List<Span> results = new ArrayList<>();
    for (String attributeName : this.predicate.getAttributeNames()) {
        AttributeType type = this.inputSchema.getAttribute(attributeName).getAttributeType();
        String value = inputTuple.getField(attributeName).getValue().toString();

        // types other than TEXT and STRING: throw Exception for now
        boolean supported = type == AttributeType.STRING || type == AttributeType.TEXT;
        if (!supported) {
            throw new DataFlowException("KeywordMatcher: Fields other than STRING and TEXT are not supported yet");
        }

        if (type == AttributeType.STRING) {
            // for STRING type, the query should match the field value completely
            String query = predicate.getQuery();
            if (value.equals(query)) {
                results.add(new Span(attributeName, 0, query.length(), query, value));
            }
            continue;
        }

        // TEXT type: take the relevant spans that belong to this attribute and
        // keep them only when every query token is present among them
        List<Span> spansOfAttribute = new ArrayList<>();
        for (Span candidate : candidateSpans) {
            if (candidate.getAttributeName().equals(attributeName)) {
                spansOfAttribute.add(candidate);
            }
        }
        if (isAllQueryTokensPresent(spansOfAttribute, queryTokenSet)) {
            results.addAll(spansOfAttribute);
        }
    }
    return results;
}
Also used : SchemaConstants(edu.uci.ics.textdb.api.constants.SchemaConstants) Attribute(edu.uci.ics.textdb.api.schema.Attribute) Iterator(java.util.Iterator) ErrorMessages(edu.uci.ics.textdb.api.constants.ErrorMessages) AbstractSingleInputOperator(edu.uci.ics.textdb.exp.common.AbstractSingleInputOperator) DataFlowException(edu.uci.ics.textdb.api.exception.DataFlowException) Set(java.util.Set) Utils(edu.uci.ics.textdb.api.utils.Utils) Collectors(java.util.stream.Collectors) ArrayList(java.util.ArrayList) AttributeType(edu.uci.ics.textdb.api.schema.AttributeType) HashSet(java.util.HashSet) Schema(edu.uci.ics.textdb.api.schema.Schema) List(java.util.List) ListField(edu.uci.ics.textdb.api.field.ListField) Matcher(java.util.regex.Matcher) TextDBException(edu.uci.ics.textdb.api.exception.TextDBException) Pattern(java.util.regex.Pattern) Span(edu.uci.ics.textdb.api.span.Span) Collections(java.util.Collections) DataflowUtils(edu.uci.ics.textdb.exp.utils.DataflowUtils) Tuple(edu.uci.ics.textdb.api.tuple.Tuple) AttributeType(edu.uci.ics.textdb.api.schema.AttributeType) ArrayList(java.util.ArrayList) DataFlowException(edu.uci.ics.textdb.api.exception.DataFlowException) Span(edu.uci.ics.textdb.api.span.Span)

Example 10 with AttributeType

use of edu.uci.ics.textdb.api.schema.AttributeType in project textdb by TextDB.

the class KeywordMatcher method computePhraseMatchingResult.

/**
 * Computes the matching spans of the input tuple for a phrase query.
 *
 * For every attribute named in the predicate:
 * a STRING attribute matches only if its whole value equals the query; a
 * TEXT attribute is scanned for runs of consecutive payload spans whose
 * values equal the query tokens (ignoring case) and whose token offsets are
 * spaced like the corresponding tokens of the query. Each such run is
 * reported as one combined span from the start of its first span to the end
 * of its last span.
 *
 * @param inputTuple
 *            tuple to match; its payload field supplies the candidate spans
 * @return the matching spans, possibly empty
 * @throws DataFlowException
 *             if an attribute is neither STRING nor TEXT
 */
private List<Span> computePhraseMatchingResult(Tuple inputTuple) throws DataFlowException {
    ListField<Span> payloadField = inputTuple.getField(SchemaConstants.PAYLOAD);
    List<Span> payload = payloadField.getValue();
    List<Span> relevantSpans = filterRelevantSpans(payload);
    List<Span> matchingResults = new ArrayList<>();
    for (String attributeName : this.predicate.getAttributeNames()) {
        AttributeType attributeType = this.inputSchema.getAttribute(attributeName).getAttributeType();
        String fieldValue = inputTuple.getField(attributeName).getValue().toString();
        // types other than TEXT and STRING: throw Exception for now
        if (attributeType != AttributeType.STRING && attributeType != AttributeType.TEXT) {
            throw new DataFlowException("KeywordMatcher: Fields other than STRING and TEXT are not supported yet");
        }
        // for STRING type, the query should match the fieldValue completely
        if (attributeType == AttributeType.STRING) {
            if (fieldValue.equals(predicate.getQuery())) {
                matchingResults.add(new Span(attributeName, 0, predicate.getQuery().length(), predicate.getQuery(), fieldValue));
            }
        }
        // for TEXT type, look for the query tokens as a phrase
        if (attributeType == AttributeType.TEXT) {
            List<Span> fieldSpanList = relevantSpans.stream().filter(span -> span.getAttributeName().equals(attributeName)).collect(Collectors.toList());
            if (!isAllQueryTokensPresent(fieldSpanList, queryTokenSet)) {
                // not every query token occurs in this field's spans, so the
                // phrase cannot match here
                continue;
            }
            // Sort current field's span list by token offset for later use.
            // Integer.compare is used instead of subtraction, which can
            // overflow and produce a wrong ordering.
            Collections.sort(fieldSpanList, (span1, span2) -> Integer.compare(span1.getTokenOffset(), span2.getTokenOffset()));
            // positions, within queryTokensWithStopwords, of the tokens that
            // are also contained in queryTokenList
            // NOTE(review): presumably queryTokenList is the query without
            // stopwords, so these positions keep the gaps the stopwords
            // leave in the phrase; confirm against where the lists are built
            List<Integer> queryTokenOffset = new ArrayList<>();
            for (int i = 0; i < queryTokensWithStopwords.size(); i++) {
                if (queryTokenList.contains(queryTokensWithStopwords.get(i))) {
                    queryTokenOffset.add(i);
                }
            }
            // index in fieldSpanList of the span where the current match
            // attempt starts
            int iter = 0;
            while (iter < fieldSpanList.size()) {
                // stop once fewer spans remain than there are query tokens
                if (iter > fieldSpanList.size() - queryTokenList.size()) {
                    break;
                }
                // Verify that the spans starting at iter correspond to the
                // phrase query: each adjacent pair must have the same
                // token-offset distance as the corresponding pair of query
                // tokens, and the same values (ignoring case).
                boolean isMismatchInSpan = false;
                for (int i = 0; i < queryTokenList.size() - 1; i++) {
                    Span first = fieldSpanList.get(iter + i);
                    Span second = fieldSpanList.get(iter + i + 1);
                    if (!(second.getTokenOffset() - first.getTokenOffset() == queryTokenOffset.get(i + 1) - queryTokenOffset.get(i) && first.getValue().equalsIgnoreCase(queryTokenList.get(i)) && second.getValue().equalsIgnoreCase(queryTokenList.get(i + 1)))) {
                        // mismatch: retry starting from the next span
                        iter++;
                        isMismatchInSpan = true;
                        break;
                    }
                }
                if (isMismatchInSpan) {
                    continue;
                }
                // all pairs matched: merge the run into one span covering the
                // text from the first span's start to the last span's end
                int combinedSpanStartIndex = fieldSpanList.get(iter).getStart();
                int combinedSpanEndIndex = fieldSpanList.get(iter + queryTokenList.size() - 1).getEnd();
                Span combinedSpan = new Span(attributeName, combinedSpanStartIndex, combinedSpanEndIndex, predicate.getQuery(), fieldValue.substring(combinedSpanStartIndex, combinedSpanEndIndex));
                matchingResults.add(combinedSpan);
                // continue after the spans consumed by this match
                iter = iter + queryTokenList.size();
            }
        }
    }
    return matchingResults;
}
Also used : SchemaConstants(edu.uci.ics.textdb.api.constants.SchemaConstants) Attribute(edu.uci.ics.textdb.api.schema.Attribute) Iterator(java.util.Iterator) ErrorMessages(edu.uci.ics.textdb.api.constants.ErrorMessages) AbstractSingleInputOperator(edu.uci.ics.textdb.exp.common.AbstractSingleInputOperator) DataFlowException(edu.uci.ics.textdb.api.exception.DataFlowException) Set(java.util.Set) Utils(edu.uci.ics.textdb.api.utils.Utils) Collectors(java.util.stream.Collectors) ArrayList(java.util.ArrayList) AttributeType(edu.uci.ics.textdb.api.schema.AttributeType) HashSet(java.util.HashSet) Schema(edu.uci.ics.textdb.api.schema.Schema) List(java.util.List) ListField(edu.uci.ics.textdb.api.field.ListField) Matcher(java.util.regex.Matcher) TextDBException(edu.uci.ics.textdb.api.exception.TextDBException) Pattern(java.util.regex.Pattern) Span(edu.uci.ics.textdb.api.span.Span) Collections(java.util.Collections) DataflowUtils(edu.uci.ics.textdb.exp.utils.DataflowUtils) Tuple(edu.uci.ics.textdb.api.tuple.Tuple) AttributeType(edu.uci.ics.textdb.api.schema.AttributeType) ArrayList(java.util.ArrayList) DataFlowException(edu.uci.ics.textdb.api.exception.DataFlowException) Span(edu.uci.ics.textdb.api.span.Span)

Aggregations

AttributeType (edu.uci.ics.textdb.api.schema.AttributeType)17 ArrayList (java.util.ArrayList)11 DataFlowException (edu.uci.ics.textdb.api.exception.DataFlowException)10 Attribute (edu.uci.ics.textdb.api.schema.Attribute)8 Schema (edu.uci.ics.textdb.api.schema.Schema)8 Span (edu.uci.ics.textdb.api.span.Span)8 IField (edu.uci.ics.textdb.api.field.IField)5 SchemaConstants (edu.uci.ics.textdb.api.constants.SchemaConstants)4 ListField (edu.uci.ics.textdb.api.field.ListField)4 Tuple (edu.uci.ics.textdb.api.tuple.Tuple)4 Iterator (java.util.Iterator)4 List (java.util.List)4 Matcher (java.util.regex.Matcher)4 Pattern (java.util.regex.Pattern)4 Collectors (java.util.stream.Collectors)4 ErrorMessages (edu.uci.ics.textdb.api.constants.ErrorMessages)3 TextDBException (edu.uci.ics.textdb.api.exception.TextDBException)3 Utils (edu.uci.ics.textdb.api.utils.Utils)3 AbstractSingleInputOperator (edu.uci.ics.textdb.exp.common.AbstractSingleInputOperator)3 DataflowUtils (edu.uci.ics.textdb.exp.utils.DataflowUtils)3