Search in sources :

Example 96 with Span

use of edu.uci.ics.texera.api.span.Span in project textdb by TextDB.

the class DictionaryMatcher method appendConjunctionMatchingSpans4Dictionary.

/**
 * Collects, for one input tuple, the spans that match dictionary entries
 * under conjunction semantics.
 *
 * For a STRING attribute the whole field value must be one of the entries in
 * {@code queryList}; a single span covering the full value is produced.
 * For a TEXT attribute the payload spans of that attribute are kept for every
 * dictionary entry whose token set is reported as fully present by
 * {@code DataflowUtils.isAllQueryTokensPresent}.
 *
 * @param inputTuple        tuple carrying the attributes and the payload span list
 * @param attributeNames    names of the attributes to inspect
 * @param queryTokenSetList token set of each dictionary entry, addressed by entry index
 * @param queryList         the dictionary entries as complete strings
 * @return all matching spans; empty if nothing matched
 * @throws DataflowException if an attribute is neither STRING nor TEXT
 */
private List<Span> appendConjunctionMatchingSpans4Dictionary(Tuple inputTuple, List<String> attributeNames, List<Set<String>> queryTokenSetList, List<String> queryList) throws DataflowException {
    ListField<Span> payloadField = inputTuple.getField(SchemaConstants.PAYLOAD);
    // Payload spans grouped by the index of the dictionary entry they may belong to.
    Map<Integer, List<Span>> spansByEntry = filterRelevantSpans(payloadField.getValue(), queryTokenSetList);
    List<Span> matches = new ArrayList<>();

    for (String attributeName : attributeNames) {
        AttributeType type = inputTuple.getSchema().getAttribute(attributeName).getType();
        String fieldValue = inputTuple.getField(attributeName).getValue().toString();

        if (type == AttributeType.STRING) {
            // STRING: the dictionary must contain the complete field value.
            if (queryList.contains(fieldValue)) {
                matches.add(new Span(attributeName, 0, fieldValue.length(), fieldValue, fieldValue));
            }
        } else if (type == AttributeType.TEXT) {
            // TEXT: every token of a dictionary entry has to occur among this attribute's spans.
            for (Map.Entry<Integer, List<Span>> entry : spansByEntry.entrySet()) {
                int entryIndex = entry.getKey();
                List<Span> spansInField = new ArrayList<>();
                for (Span candidate : entry.getValue()) {
                    if (candidate.getAttributeName().equals(attributeName)) {
                        spansInField.add(candidate);
                    }
                }
                if (DataflowUtils.isAllQueryTokensPresent(spansInField, queryTokenSetList.get(entryIndex))) {
                    matches.addAll(spansInField);
                }
            }
        } else {
            // Any other attribute type is rejected for now.
            throw new DataflowException("KeywordMatcher: Fields other than STRING and TEXT are not supported yet");
        }
    }
    return matches;
}
Also used : ListField(edu.uci.ics.texera.api.field.ListField) java.util(java.util) Tuple(edu.uci.ics.texera.api.tuple.Tuple) TexeraException(edu.uci.ics.texera.api.exception.TexeraException) Collectors(java.util.stream.Collectors) Span(edu.uci.ics.texera.api.span.Span) Matcher(java.util.regex.Matcher) SchemaConstants(edu.uci.ics.texera.api.constants.SchemaConstants) IOperator(edu.uci.ics.texera.api.dataflow.IOperator) AbstractSingleInputOperator(edu.uci.ics.texera.dataflow.common.AbstractSingleInputOperator) ErrorMessages(edu.uci.ics.texera.api.constants.ErrorMessages) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) Schema(edu.uci.ics.texera.api.schema.Schema) Pattern(java.util.regex.Pattern) DataflowUtils(edu.uci.ics.texera.dataflow.utils.DataflowUtils) KeywordMatchingType(edu.uci.ics.texera.dataflow.keywordmatcher.KeywordMatchingType) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) Span(edu.uci.ics.texera.api.span.Span)

Example 97 with Span

use of edu.uci.ics.texera.api.span.Span in project textdb by TextDB.

the class DictionaryMatcher method filterRelevantSpans.

/**
 * Groups spans by the query token sets their key belongs to.
 *
 * @param spanList      spans to classify
 * @param queryTokenSet token sets, one per query entry; the position in this
 *                      list is the index used as key of the returned map
 * @return map from token-set index to the spans whose key is a token of that
 *         set; indices without any matching span are absent
 */
private Map<Integer, List<Span>> filterRelevantSpans(List<Span> spanList, List<Set<String>> queryTokenSet) {
    // Inverted index: token -> indices of every token set that contains it.
    Map<String, List<Integer>> setIndicesByToken = new HashMap<>();
    int setIndex = 0;
    for (Set<String> tokens : queryTokenSet) {
        for (String token : tokens) {
            setIndicesByToken.computeIfAbsent(token, unused -> new ArrayList<>()).add(setIndex);
        }
        setIndex++;
    }

    Map<Integer, List<Span>> spansBySetIndex = new HashMap<>();
    for (Span span : spanList) {
        List<Integer> owningSets = setIndicesByToken.get(span.getKey());
        if (owningSets == null) {
            // The span's key is not a token of any query entry.
            continue;
        }
        for (Integer owner : owningSets) {
            spansBySetIndex.computeIfAbsent(owner, unused -> new ArrayList<>()).add(span);
        }
    }
    return spansBySetIndex;
}
Also used : Span(edu.uci.ics.texera.api.span.Span)

Example 98 with Span

use of edu.uci.ics.texera.api.span.Span in project textdb by TextDB.

the class DictionaryMatcherSourceOperator method computeMatchingResults.

/**
 * Runs the keyword source for the current dictionary entry and then for each
 * remaining entry, accumulating the results per tuple ID.
 *
 * For every tuple produced, its span list is appended to
 * {@code tupleResultMap} under the tuple's _ID. The first time an ID is seen,
 * a copy of the tuple with the span-list field removed is stored in
 * {@code tupleIDMap}. The method returns once the dictionary has no further
 * entries.
 */
@SuppressWarnings("unchecked")
private void computeMatchingResults() {
    for (;;) {
        // Drain everything the current keyword source can produce.
        for (Tuple tuple = keywordSource.getNextTuple(); tuple != null; tuple = keywordSource.getNextTuple()) {
            String tupleID = tuple.getField(SchemaConstants._ID).getValue().toString();
            ListField<Span> spanListField = tuple.getField(predicate.getSpanListName(), ListField.class);
            List<Span> spans = spanListField.getValue();

            if (!tupleResultMap.containsKey(tupleID)) {
                // First occurrence: remember the tuple without its span list and start a fresh result list.
                tupleIDMap.put(tupleID, new Tuple.Builder(tuple).remove(predicate.getSpanListName()).build());
                tupleResultMap.put(tupleID, new ArrayList<>(spans));
            } else {
                tupleResultMap.get(tupleID).addAll(spans);
            }
        }

        // Advance to the next dictionary entry; stop when the dictionary is exhausted.
        currentDictionaryEntry = predicate.getDictionary().getNextEntry();
        if (currentDictionaryEntry == null) {
            return;
        }

        // Replace the exhausted source with one built for the new entry.
        keywordSource.close();
        KeywordSourcePredicate nextPredicate = new KeywordSourcePredicate(
                currentDictionaryEntry,
                predicate.getAttributeNames(),
                predicate.getAnalyzerString(),
                predicate.getKeywordMatchingType(),
                predicate.getTableName(),
                predicate.getSpanListName());
        keywordSource = new KeywordMatcherSourceOperator(nextPredicate);
        keywordSource.open();
    }
}
Also used : KeywordSourcePredicate(edu.uci.ics.texera.dataflow.keywordmatcher.KeywordSourcePredicate) Span(edu.uci.ics.texera.api.span.Span) Tuple(edu.uci.ics.texera.api.tuple.Tuple) KeywordMatcherSourceOperator(edu.uci.ics.texera.dataflow.keywordmatcher.KeywordMatcherSourceOperator)

Example 99 with Span

use of edu.uci.ics.texera.api.span.Span in project textdb by TextDB.

the class JoinDistancePredicate method joinTuples.

/**
 * This method is called by the Join operator to perform the join on the
 * tuples passed.
 *
 * Two spans (one from each tuple, both on the join attribute) are joined when
 * their start offsets and their end offsets each differ by at most the
 * threshold. The joined span covers the union of both ranges and its key is
 * {@code outerKey + "_" + innerKey}.
 *
 * @param innerTuple   inner tuple; values of all non-span output fields are taken from it
 * @param outerTuple   outer tuple
 * @param outputSchema schema of the tuple to build
 * @return New Tuple containing the result of join operation, or {@code null}
 *         when the _ID or join-attribute fields differ, when either tuple
 *         has no usable span list, or when no pair of spans satisfies the
 *         threshold.
 * @throws Exception declared by the signature; nothing in this body throws a
 *         checked exception directly
 */
@Override
public Tuple joinTuples(Tuple innerTuple, Tuple outerTuple, Schema outputSchema) throws Exception {
    /*
     * We expect the values of all fields to be the same for innerTuple and outerTuple.
     * We only check the _ID field and the field to be joined, since they are crucial
     * to the join operator. For other fields, we use the value from innerTuple.
     */
    if (!compareField(innerTuple, outerTuple, SchemaConstants._ID)) {
        return null;
    }
    if (!compareField(innerTuple, outerTuple, this.joinAttributeName)) {
        return null;
    }

    /*
     * If either/both tuples have no span information, return null so the caller
     * can process the next tuple. Previously a missing span list surfaced as a
     * NullPointerException further down instead of the documented null result.
     */
    ListField<Span> spanFieldOfInnerTuple = innerTuple.getField(SchemaConstants.SPAN_LIST);
    ListField<Span> spanFieldOfOuterTuple = outerTuple.getField(SchemaConstants.SPAN_LIST);
    if (spanFieldOfInnerTuple == null || spanFieldOfOuterTuple == null) {
        return null;
    }
    List<Span> innerSpanList = null;
    List<Span> outerSpanList = null;
    // Only fields whose runtime class is exactly ListField are read (unchanged from before).
    if (spanFieldOfInnerTuple.getClass().equals(ListField.class)) {
        innerSpanList = spanFieldOfInnerTuple.getValue();
    }
    if (spanFieldOfOuterTuple.getClass().equals(ListField.class)) {
        outerSpanList = spanFieldOfOuterTuple.getValue();
    }
    if (innerSpanList == null || outerSpanList == null) {
        return null;
    }

    List<Span> newJoinSpanList = new ArrayList<>();
    for (Span outerSpan : outerSpanList) {
        // Spans on attributes other than the join attribute do not take part in the join.
        if (!outerSpan.getAttributeName().equals(this.joinAttributeName)) {
            continue;
        }
        for (Span innerSpan : innerSpanList) {
            if (!innerSpan.getAttributeName().equals(this.joinAttributeName)) {
                continue;
            }
            Integer threshold = this.getThreshold();
            if (Math.abs(outerSpan.getStart() - innerSpan.getStart()) <= threshold && Math.abs(outerSpan.getEnd() - innerSpan.getEnd()) <= threshold) {
                // The joined span is the smallest range covering both input spans.
                Integer newSpanStartIndex = Math.min(innerSpan.getStart(), outerSpan.getStart());
                Integer newSpanEndIndex = Math.max(innerSpan.getEnd(), outerSpan.getEnd());
                String attributeName = this.joinAttributeName;
                String fieldValue = (String) innerTuple.getField(attributeName).getValue();
                String newFieldValue = fieldValue.substring(newSpanStartIndex, newSpanEndIndex);
                String spanKey = outerSpan.getKey() + "_" + innerSpan.getKey();
                Span newSpan = new Span(attributeName, newSpanStartIndex, newSpanEndIndex, spanKey, newFieldValue);
                newJoinSpanList.add(newSpan);
            }
        }
    }
    if (newJoinSpanList.isEmpty()) {
        return null;
    }

    // Create output fields based on innerTuple's values, leaving out the span list,
    // which is appended afterwards. Collect into an explicit ArrayList because the
    // list is mutated below and Collectors.toList() does not guarantee mutability.
    List<Attribute> outputAttrList = outputSchema.getAttributes();
    List<IField> outputFields = outputAttrList.stream()
            .filter(attr -> !attr.equals(SchemaConstants.SPAN_LIST_ATTRIBUTE))
            .map(attr -> attr.getName())
            .map(attributeName -> innerTuple.getField(attributeName, IField.class))
            .collect(Collectors.toCollection(ArrayList::new));
    outputFields.add(new ListField<>(newJoinSpanList));
    return new Tuple(outputSchema, outputFields.toArray(new IField[0]));
}
Also used : JsonProperty(com.fasterxml.jackson.annotation.JsonProperty) ListField(edu.uci.ics.texera.api.field.ListField) edu.uci.ics.texera.api.tuple(edu.uci.ics.texera.api.tuple) Iterator(java.util.Iterator) PropertyNameConstants(edu.uci.ics.texera.dataflow.common.PropertyNameConstants) TexeraException(edu.uci.ics.texera.api.exception.TexeraException) PredicateBase(edu.uci.ics.texera.dataflow.common.PredicateBase) Collectors(java.util.stream.Collectors) Span(edu.uci.ics.texera.api.span.Span) ArrayList(java.util.ArrayList) List(java.util.List) SchemaConstants(edu.uci.ics.texera.api.constants.SchemaConstants) IOperator(edu.uci.ics.texera.api.dataflow.IOperator) IField(edu.uci.ics.texera.api.field.IField) JsonCreator(com.fasterxml.jackson.annotation.JsonCreator) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) Schema(edu.uci.ics.texera.api.schema.Schema) Attribute(edu.uci.ics.texera.api.schema.Attribute) Attribute(edu.uci.ics.texera.api.schema.Attribute) ArrayList(java.util.ArrayList) IField(edu.uci.ics.texera.api.field.IField) Span(edu.uci.ics.texera.api.span.Span)

Example 100 with Span

use of edu.uci.ics.texera.api.span.Span in project textdb by TextDB.

the class SimilarityJoinPredicate method mergeTuples.

/**
 * Builds the output tuple of a similarity join from an inner and an outer tuple.
 *
 * Fields are produced in the order of {@code outputSchema}'s attribute names:
 * a freshly generated _ID, the supplied span list, the concatenated payloads
 * of both tuples (each span passed through {@code addFieldPrefix} with the
 * inner or outer prefix), and for every other attribute the field of the
 * inner or outer tuple selected by the attribute name's prefix.
 *
 * @param innerTuple    inner tuple of the join
 * @param outerTuple    outer tuple of the join
 * @param outputSchema  schema of the resulting tuple
 * @param mergeSpanList span list to store in the result's SPAN_LIST field
 * @return the merged tuple
 */
private Tuple mergeTuples(Tuple innerTuple, Tuple outerTuple, Schema outputSchema, List<Span> mergeSpanList) {
    List<IField> resultFields = new ArrayList<>();
    for (String attrName : outputSchema.getAttributeNames()) {
        if (attrName.equals(SchemaConstants._ID)) {
            // generate a new _ID field for this tuple
            resultFields.add(new IDField(UUID.randomUUID().toString()));
        } else if (attrName.equals(SchemaConstants.SPAN_LIST)) {
            // use the generated spanList
            resultFields.add(new ListField<Span>(mergeSpanList));
        } else if (attrName.equals(SchemaConstants.PAYLOAD)) {
            // put the payload of two tuples together
            ListField<Span> innerPayloadField = innerTuple.getField(SchemaConstants.PAYLOAD);
            List<Span> innerPayload = innerPayloadField.getValue();
            ListField<Span> outerPayloadField = outerTuple.getField(SchemaConstants.PAYLOAD);
            List<Span> outerPayload = outerPayloadField.getValue();
            List<Span> resultPayload = new ArrayList<>();
            for (Span span : innerPayload) {
                resultPayload.add(addFieldPrefix(span, INNER_PREFIX));
            }
            // Use the shared constant rather than a hard-coded "outer_" literal so the
            // payload prefix always agrees with the prefix used for attribute names below.
            for (Span span : outerPayload) {
                resultPayload.add(addFieldPrefix(span, OUTER_PREFIX));
            }
            resultFields.add(new ListField<Span>(resultPayload));
        } else if (attrName.startsWith(INNER_PREFIX)) {
            // add other fields from the inner tuple, looked up by the un-prefixed name
            resultFields.add(innerTuple.getField(attrName.substring(INNER_PREFIX.length())));
        } else if (attrName.startsWith(OUTER_PREFIX)) {
            // add other fields from the outer tuple, looked up by the un-prefixed name
            resultFields.add(outerTuple.getField(attrName.substring(OUTER_PREFIX.length())));
        }
        // NOTE(review): an attribute with neither prefix contributes no field, so the
        // field list would be shorter than the schema — presumably the schema builder
        // never produces such a name; confirm against the caller.
    }
    return new Tuple(outputSchema, resultFields);
}
Also used : IDField(edu.uci.ics.texera.api.field.IDField) ListField(edu.uci.ics.texera.api.field.ListField) IField(edu.uci.ics.texera.api.field.IField) Span(edu.uci.ics.texera.api.span.Span)

Aggregations

Span (edu.uci.ics.texera.api.span.Span)130 ArrayList (java.util.ArrayList)104 IField (edu.uci.ics.texera.api.field.IField)100 Tuple (edu.uci.ics.texera.api.tuple.Tuple)99 Schema (edu.uci.ics.texera.api.schema.Schema)90 Test (org.junit.Test)84 TextField (edu.uci.ics.texera.api.field.TextField)78 IntegerField (edu.uci.ics.texera.api.field.IntegerField)66 StringField (edu.uci.ics.texera.api.field.StringField)64 DoubleField (edu.uci.ics.texera.api.field.DoubleField)55 DateField (edu.uci.ics.texera.api.field.DateField)52 SimpleDateFormat (java.text.SimpleDateFormat)52 Attribute (edu.uci.ics.texera.api.schema.Attribute)51 ListField (edu.uci.ics.texera.api.field.ListField)40 Dictionary (edu.uci.ics.texera.dataflow.dictionarymatcher.Dictionary)30 DataflowException (edu.uci.ics.texera.api.exception.DataflowException)11 AttributeType (edu.uci.ics.texera.api.schema.AttributeType)11 KeywordMatcherSourceOperator (edu.uci.ics.texera.dataflow.keywordmatcher.KeywordMatcherSourceOperator)11 JoinDistancePredicate (edu.uci.ics.texera.dataflow.join.JoinDistancePredicate)9 Collectors (java.util.stream.Collectors)8