Search in sources :

Example 41 with Tuple

use of edu.uci.ics.texera.api.tuple.Tuple in project textdb by TextDB.

the class DictionaryMatcher method computeNextMatchingTuple.

/**
 * Pulls tuples from the input operator until one of them yields a result.
 *
 * @return the first non-null value returned by {@code processOneInputTuple},
 *         or {@code null} once the input operator returns {@code null}
 * @throws TexeraException propagated from the input operator or from
 *         {@code processOneInputTuple}
 */
@Override
protected Tuple computeNextMatchingTuple() throws TexeraException {
    for (Tuple candidate = inputOperator.getNextTuple(); candidate != null; candidate = inputOperator.getNextTuple()) {
        Tuple matched = processOneInputTuple(candidate);
        if (matched != null) {
            // stop at the first input tuple that produced a result
            return matched;
        }
    }
    // input exhausted without any match
    return null;
}
Also used : Tuple(edu.uci.ics.texera.api.tuple.Tuple)

Example 42 with Tuple

use of edu.uci.ics.texera.api.tuple.Tuple in project textdb by TextDB.

the class DictionaryMatcher method processOneInputTuple.

/**
 * Matches a single input tuple against the dictionary using the keyword
 * matching type configured on the predicate.
 *
 * <p>If {@code addPayload} is set, a payload attribute is appended to the
 * tuple before matching. If {@code addResultAttribute} is set, the matching
 * spans are appended to the returned tuple as a list field named by
 * {@code predicate.getSpanListName()}.
 *
 * @param inputTuple the tuple to match; may be {@code null}
 * @return the (possibly extended) tuple when at least one span matched, or
 *         {@code null} when {@code inputTuple} is {@code null} or nothing matched
 * @throws DataflowException if an attribute examined by the scan-based or
 *         regex matching is neither STRING nor TEXT, or if the predicate's
 *         keyword matching type is not one of the types handled here
 * @throws TexeraException propagated from the helpers this method calls
 */
@Override
public Tuple processOneInputTuple(Tuple inputTuple) throws TexeraException {
    if (inputTuple == null) {
        return null;
    }
    // add payload if needed before passing it to the matching functions
    if (addPayload) {
        Tuple.Builder tupleBuilderPayload = new Tuple.Builder(inputTuple);
        tupleBuilderPayload.add(SchemaConstants.PAYLOAD_ATTRIBUTE, new ListField<Span>(DataflowUtils.generatePayloadFromTuple(inputTuple, predicate.getAnalyzerString())));
        inputTuple = tupleBuilderPayload.build();
    }
    KeywordMatchingType matchingType = predicate.getKeywordMatchingType();
    List<Span> matchingResults;
    if (matchingType == KeywordMatchingType.CONJUNCTION_INDEXBASED) {
        ArrayList<String> dictionaryEntries = predicate.getDictionary().getDictionaryEntries();
        ArrayList<Set<String>> tokenSetsNoStopwords = predicate.getDictionary().getTokenSetsNoStopwords();
        matchingResults = appendConjunctionMatchingSpans4Dictionary(inputTuple, predicate.getAttributeNames(), tokenSetsNoStopwords, dictionaryEntries);
    } else if (matchingType == KeywordMatchingType.PHRASE_INDEXBASED) {
        ArrayList<String> dictionaryEntries = predicate.getDictionary().getDictionaryEntries();
        ArrayList<List<String>> tokenListsNoStopwords = predicate.getDictionary().getTokenListsNoStopwords();
        ArrayList<List<String>> tokenListsWithStopwords = predicate.getDictionary().getTokenListsWithStopwords();
        ArrayList<Set<String>> tokenSetsNoStopwords = predicate.getDictionary().getTokenSetsNoStopwords();
        matchingResults = appendPhraseMatchingSpans4Dictionary(inputTuple, predicate.getAttributeNames(), tokenListsNoStopwords, tokenSetsNoStopwords, tokenListsWithStopwords, dictionaryEntries);
    } else if (matchingType == KeywordMatchingType.SUBSTRING_SCANBASED) {
        matchingResults = new ArrayList<Span>();
        for (String attributeName : predicate.getAttributeNames()) {
            AttributeType attributeType = inputTuple.getSchema().getAttribute(attributeName).getType();
            // types other than TEXT and STRING: throw Exception for now
            // (validated before the field value is read, so unsupported fields fail fast)
            if (attributeType != AttributeType.STRING && attributeType != AttributeType.TEXT) {
                throw new DataflowException("KeywordMatcher: Fields other than STRING and TEXT are not supported yet");
            }
            String fieldValue = inputTuple.getField(attributeName).getValue().toString();
            // one span per trie emit; the span value is the matched slice of the field
            for (ACTrie.Emit emit : dictionaryTrie.parseText(fieldValue)) {
                matchingResults.add(new Span(attributeName, emit.getStart(), emit.getEnd(), emit.getKeyword(), fieldValue.substring(emit.getStart(), emit.getEnd())));
            }
        }
    } else if (matchingType == KeywordMatchingType.REGEX) {
        ArrayList<Pattern> patternList = predicate.getDictionary().getPatternList();
        ArrayList<String> dictionaryEntries = predicate.getDictionary().getDictionaryEntries();
        matchingResults = new ArrayList<>();
        // entry-major iteration: spans are grouped by dictionary entry, then by attribute
        for (int i = 0; i < dictionaryEntries.size(); i++) {
            for (String attributeName : predicate.getAttributeNames()) {
                AttributeType attributeType = inputTuple.getSchema().getAttribute(attributeName).getType();
                // types other than TEXT and STRING: throw Exception for now
                if (attributeType != AttributeType.STRING && attributeType != AttributeType.TEXT) {
                    throw new DataflowException("KeywordMatcher: Fields other than STRING and TEXT are not supported yet");
                }
                String fieldValue = inputTuple.getField(attributeName).getValue().toString();
                Matcher javaMatcher = patternList.get(i).matcher(fieldValue);
                while (javaMatcher.find()) {
                    int start = javaMatcher.start();
                    int end = javaMatcher.end();
                    matchingResults.add(new Span(attributeName, start, end, dictionaryEntries.get(i), fieldValue.substring(start, end)));
                }
            }
        }
    } else {
        // Previously this case left matchingResults null and failed below with a
        // bare NullPointerException; report the actual problem instead.
        throw new DataflowException("DictionaryMatcher: unsupported keyword matching type: " + matchingType);
    }
    // null-safe: also tolerates a matching helper that returns null for "no matches"
    if (matchingResults == null || matchingResults.isEmpty()) {
        return null;
    }
    Tuple.Builder tupleBuilder = new Tuple.Builder(inputTuple);
    if (addResultAttribute) {
        tupleBuilder.add(predicate.getSpanListName(), AttributeType.LIST, new ListField<Span>(matchingResults));
    }
    return tupleBuilder.build();
}
Also used : Matcher(java.util.regex.Matcher) Span(edu.uci.ics.texera.api.span.Span) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) Tuple(edu.uci.ics.texera.api.tuple.Tuple)

Example 43 with Tuple

use of edu.uci.ics.texera.api.tuple.Tuple in project textdb by TextDB.

the class FuzzyTokenMatcher method processOneInputTuple.

/**
 * Applies the fuzzy-token threshold to a single input tuple.
 *
 * <p>For each attribute named by the predicate, the payload spans for that
 * attribute whose key is one of the predicate's query tokens are counted;
 * they are kept only when their number reaches the predicate's threshold.
 *
 * @param inputTuple the tuple to examine; may be {@code null}
 * @return the (possibly extended) tuple when at least one attribute reached
 *         the threshold, or {@code null} when {@code inputTuple} is
 *         {@code null} or no attribute qualified
 * @throws DataflowException if a predicate attribute is neither TEXT nor STRING
 * @throws TexeraException propagated from the helpers this method calls
 */
@Override
public Tuple processOneInputTuple(Tuple inputTuple) throws TexeraException {
    // same null contract as DictionaryMatcher.processOneInputTuple: no input, no result
    if (inputTuple == null) {
        return null;
    }
    // add payload if needed before passing it to the matching functions
    if (addPayload) {
        Tuple.Builder tupleBuilderPayload = new Tuple.Builder(inputTuple);
        tupleBuilderPayload.add(SchemaConstants.PAYLOAD_ATTRIBUTE, new ListField<Span>(DataflowUtils.generatePayloadFromTuple(inputTuple, predicate.getLuceneAnalyzerStr())));
        inputTuple = tupleBuilderPayload.build();
    }
    ListField<Span> payloadField = inputTuple.getField(SchemaConstants.PAYLOAD);
    List<Span> relevantSpans = filterRelevantSpans(payloadField.getValue());
    List<Span> matchingResults = new ArrayList<>();
    /*
     * The source operator returns spans even for those fields which did not
     * satisfy the threshold criterion. So if two attributes A,B have 10 and
     * 5 matching tokens, and we set threshold to 10, the number of spans
     * returned is 15. So we need to filter those 5 spans for attribute B.
     */
    for (String attributeName : this.predicate.getAttributeNames()) {
        AttributeType attributeType = this.inputSchema.getAttribute(attributeName).getType();
        // types other than TEXT and STRING: throw Exception for now
        if (attributeType != AttributeType.TEXT && attributeType != AttributeType.STRING) {
            throw new DataflowException("FuzzyTokenMatcher: Fields other than TEXT or STRING are not supported");
        }
        // spans of this attribute whose key is one of the query tokens
        List<Span> fieldSpans = relevantSpans.stream()
                .filter(span -> span.getAttributeName().equals(attributeName)
                        && predicate.getQueryTokens().contains(span.getKey()))
                .collect(Collectors.toList());
        if (fieldSpans.size() >= predicate.getThreshold()) {
            matchingResults.addAll(fieldSpans);
        }
    }
    if (matchingResults.isEmpty()) {
        return null;
    }
    Tuple.Builder tupleBuilder = new Tuple.Builder(inputTuple);
    if (addResultAttribute) {
        tupleBuilder.add(predicate.getSpanListName(), AttributeType.LIST, new ListField<Span>(matchingResults));
    }
    return tupleBuilder.build();
}
Also used : ListField(edu.uci.ics.texera.api.field.ListField) Iterator(java.util.Iterator) Tuple(edu.uci.ics.texera.api.tuple.Tuple) TexeraException(edu.uci.ics.texera.api.exception.TexeraException) Collectors(java.util.stream.Collectors) Span(edu.uci.ics.texera.api.span.Span) ArrayList(java.util.ArrayList) List(java.util.List) SchemaConstants(edu.uci.ics.texera.api.constants.SchemaConstants) AbstractSingleInputOperator(edu.uci.ics.texera.dataflow.common.AbstractSingleInputOperator) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) Schema(edu.uci.ics.texera.api.schema.Schema) DataflowUtils(edu.uci.ics.texera.dataflow.utils.DataflowUtils) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) ArrayList(java.util.ArrayList) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) Span(edu.uci.ics.texera.api.span.Span) Tuple(edu.uci.ics.texera.api.tuple.Tuple)

Example 44 with Tuple

use of edu.uci.ics.texera.api.tuple.Tuple in project textdb by TextDB.

the class Join method computeNextMatchingTuple.

/**
 * Called from getNextTuple() to obtain the next joined tuple.
 *
 * Walks the in-memory inner tuple list for the current outer tuple; when the
 * list is exhausted it fetches the next outer tuple and starts over from the
 * beginning of the list. Pairs for which joinTuples returns null are skipped.
 *
 * @return the next non-null result of joinTuples, or null when the inner
 *         list is empty or the outer operator has no more tuples
 */
private Tuple computeNextMatchingTuple() throws Exception {
    if (innerTupleList.isEmpty()) {
        return null;
    }
    while (true) {
        boolean innerExhausted = innerTupleListCursor >= innerTupleList.size();
        if (innerExhausted) {
            // advance to the next outer tuple; stop when there is none
            currentOuterTuple = outerOperator.getNextTuple();
            if (currentOuterTuple == null) {
                return null;
            }
            // restart the scan over the inner tuples for the new outer tuple
            innerTupleListCursor = 0;
        }
        Tuple joined = joinPredicate.joinTuples(innerTupleList.get(innerTupleListCursor), currentOuterTuple, outputSchema);
        // advance only after the join call has returned
        innerTupleListCursor++;
        if (joined != null) {
            return joined;
        }
    }
}
Also used : Tuple(edu.uci.ics.texera.api.tuple.Tuple)

Example 45 with Tuple

use of edu.uci.ics.texera.api.tuple.Tuple in project textdb by TextDB.

the class Join method getNextTuple.

/**
 * Gets the next tuple which is a joint of two tuples which passed the
 * criteria set in the JoinPredicate. <br>
 * Example in JoinPredicate.java
 *
 * <p>On the first call all tuples of the inner operator are loaded into
 * memory. Joined tuples are then produced by
 * {@code computeNextMatchingTuple()}; results before {@code offset} are
 * consumed and skipped, and no more results are produced once the
 * limit/offset bound on {@code resultCursor} is reached.
 *
 * @return the next joined tuple, or {@code null} when the inner input is
 *         empty, the outer input is exhausted, or the limit has been reached
 * @throws DataflowException if the operator has not been opened, or wrapping
 *         any exception raised while computing the next joined tuple
 */
@Override
public Tuple getNextTuple() throws TexeraException {
    if (cursor == CLOSED) {
        throw new DataflowException(ErrorMessages.OPERATOR_NOT_OPENED);
    }
    // load all tuples from inner operator into memory in the first time
    if (innerTupleList == null) {
        innerTupleList = new ArrayList<>();
        Tuple tuple;
        while ((tuple = innerOperator.getNextTuple()) != null) {
            innerTupleList.add(tuple);
        }
    }
    // Load an outer tuple only when none is in progress. Fetching one on every
    // call would discard the outer tuple still being joined (while
    // innerTupleListCursor keeps its position), silently dropping join results.
    // NOTE(review): assumes currentOuterTuple is null when the operator is
    // (re)opened - confirm against open()/close().
    if (currentOuterTuple == null) {
        currentOuterTuple = outerOperator.getNextTuple();
    }
    // all outer tuples have been consumed
    if (innerTupleList.isEmpty() || currentOuterTuple == null) {
        return null;
    }
    // computed in long so that a large limit plus an offset cannot overflow
    if (limit == 0 || resultCursor >= (long) limit + offset - 1) {
        return null;
    }
    try {
        Tuple resultTuple = null;
        while (true) {
            resultTuple = computeNextMatchingTuple();
            if (resultTuple == null) {
                break;
            }
            resultCursor++;
            // results before the offset are consumed but not returned
            if (resultCursor >= offset) {
                break;
            }
        }
        return resultTuple;
    } catch (Exception e) {
        throw new DataflowException(e.getMessage(), e);
    }
}
Also used : DataflowException(edu.uci.ics.texera.api.exception.DataflowException) Tuple(edu.uci.ics.texera.api.tuple.Tuple) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) TexeraException(edu.uci.ics.texera.api.exception.TexeraException)

Aggregations

Tuple (edu.uci.ics.texera.api.tuple.Tuple)280 ArrayList (java.util.ArrayList)167 Test (org.junit.Test)158 Schema (edu.uci.ics.texera.api.schema.Schema)108 IField (edu.uci.ics.texera.api.field.IField)106 Span (edu.uci.ics.texera.api.span.Span)99 TextField (edu.uci.ics.texera.api.field.TextField)90 StringField (edu.uci.ics.texera.api.field.StringField)83 IntegerField (edu.uci.ics.texera.api.field.IntegerField)80 Attribute (edu.uci.ics.texera.api.schema.Attribute)75 DoubleField (edu.uci.ics.texera.api.field.DoubleField)59 DateField (edu.uci.ics.texera.api.field.DateField)55 SimpleDateFormat (java.text.SimpleDateFormat)53 DataWriter (edu.uci.ics.texera.storage.DataWriter)32 Dictionary (edu.uci.ics.texera.dataflow.dictionarymatcher.Dictionary)30 ListField (edu.uci.ics.texera.api.field.ListField)27 ScanBasedSourceOperator (edu.uci.ics.texera.dataflow.source.scan.ScanBasedSourceOperator)21 ScanSourcePredicate (edu.uci.ics.texera.dataflow.source.scan.ScanSourcePredicate)21 KeywordMatcherSourceOperator (edu.uci.ics.texera.dataflow.keywordmatcher.KeywordMatcherSourceOperator)20 RelationManager (edu.uci.ics.texera.storage.RelationManager)19