Search in sources :

Example 36 with ListField

use of edu.uci.ics.texera.api.field.ListField in project textdb by TextDB.

the class DictionaryMatcherSourceOperator method getNextTuple.

/**
 * Returns the next result tuple, skipping the first {@code offset} results and
 * stopping once {@code limit} results past the offset have been consumed.
 *
 * <p>For the index-based matching types ({@code PHRASE_INDEXBASED} and
 * {@code CONJUNCTION_INDEXBASED}) all results are computed once through
 * {@code computeMatchingResults()} and then served from {@code tupleIDMap};
 * each returned tuple is the stored tuple extended with a LIST attribute
 * (named by {@code predicate.getSpanListName()}) holding the spans recorded in
 * {@code tupleResultMap}. For every other matching type the tuples are pulled
 * directly from {@code dictionaryMatcher}.
 *
 * @return the next tuple, or {@code null} when the limit has been reached or
 *         no more results are available
 * @throws TexeraException declared by the operator contract
 */
@Override
public Tuple getNextTuple() throws TexeraException {
    // Compare in long arithmetic so that a very large limit plus a positive
    // offset cannot wrap around to a negative number and end the scan early.
    // NOTE(review): assumes cursor, limit and offset are int-typed fields - confirm.
    if (cursor >= (long) limit + offset) {
        return null;
    }
    KeywordMatchingType matchingType = predicate.getKeywordMatchingType();
    if (matchingType == KeywordMatchingType.PHRASE_INDEXBASED || matchingType == KeywordMatchingType.CONJUNCTION_INDEXBASED) {
        // For each dictionary entry, get all results from KeywordMatcher (done once, lazily).
        if (!resultMapPopulated) {
            computeMatchingResults();
            resultIterator = tupleIDMap.keySet().iterator();
            resultMapPopulated = true;
        }
        while (resultIterator.hasNext()) {
            cursor++;
            String tupleID = resultIterator.next();
            if (cursor <= offset) {
                // Still inside the offset window: consume the entry without
                // paying for a tuple that would be thrown away.
                continue;
            }
            return new Tuple.Builder(tupleIDMap.get(tupleID)).add(predicate.getSpanListName(), AttributeType.LIST, new ListField<Span>(tupleResultMap.get(tupleID))).build();
        }
        return null;
    }
    // Substring matching or regex matching (scan based)
    Tuple inputTuple;
    while ((inputTuple = dictionaryMatcher.getNextTuple()) != null) {
        cursor++;
        if (cursor > offset) {
            return inputTuple;
        }
    }
    return null;
}
Also used : ListField(edu.uci.ics.texera.api.field.ListField) Tuple(edu.uci.ics.texera.api.tuple.Tuple)

Example 37 with ListField

use of edu.uci.ics.texera.api.field.ListField in project textdb by TextDB.

the class JoinDistancePredicate method joinTuples.

/**
 * This method is called by the Join operator to perform the join on the
 * tuples passed.
 *
 * <p>Two spans (one from each tuple, both on the join attribute) are joined
 * when their start positions and their end positions each differ by at most
 * the threshold. The joined span covers the union of the two ranges, and its
 * value is the corresponding substring of the inner tuple's join attribute.
 *
 * @param innerTuple   tuple from the inner operator; its field values are used for the output
 * @param outerTuple   tuple from the outer operator
 * @param outputSchema schema of the tuple to produce
 * @return New Tuple containing the result of join operation, or {@code null}
 *         when the tuples differ on the _ID or join attribute, when either
 *         tuple carries no span list, or when no pair of spans satisfies the
 *         threshold
 * @throws Exception declared by the join predicate contract
 */
@Override
public Tuple joinTuples(Tuple innerTuple, Tuple outerTuple, Schema outputSchema) throws Exception {
    /*
     * We expect the values of all fields to be the same for innerTuple and outerTuple.
     * We only check the _ID field and the field to be joined, since they are crucial to the join operator.
     * For other fields, we use the value from innerTuple.
     */
    // check if the _ID fields are the same
    if (!compareField(innerTuple, outerTuple, SchemaConstants._ID)) {
        return null;
    }
    // check if the fields to be joined are the same
    if (!compareField(innerTuple, outerTuple, this.joinAttributeName)) {
        return null;
    }
    /*
     * If either/both tuples have no usable span information, return null
     * so the caller can move on to the next pair of tuples.
     */
    ListField<Span> spanFieldOfInnerTuple = innerTuple.getField(SchemaConstants.SPAN_LIST);
    ListField<Span> spanFieldOfOuterTuple = outerTuple.getField(SchemaConstants.SPAN_LIST);
    if (spanFieldOfInnerTuple == null || spanFieldOfOuterTuple == null) {
        return null;
    }
    // Only accept fields whose runtime class is exactly ListField.
    List<Span> innerSpanList = ListField.class.equals(spanFieldOfInnerTuple.getClass()) ? spanFieldOfInnerTuple.getValue() : null;
    List<Span> outerSpanList = ListField.class.equals(spanFieldOfOuterTuple.getClass()) ? spanFieldOfOuterTuple.getValue() : null;
    if (innerSpanList == null || outerSpanList == null) {
        return null;
    }
    List<Span> newJoinSpanList = new ArrayList<>();
    Integer threshold = this.getThreshold();
    for (Span outerSpan : outerSpanList) {
        // Skip spans that do not belong to the join attribute.
        if (!outerSpan.getAttributeName().equals(this.joinAttributeName)) {
            continue;
        }
        for (Span innerSpan : innerSpanList) {
            if (!innerSpan.getAttributeName().equals(this.joinAttributeName)) {
                continue;
            }
            boolean startsClose = Math.abs(outerSpan.getStart() - innerSpan.getStart()) <= threshold;
            boolean endsClose = Math.abs(outerSpan.getEnd() - innerSpan.getEnd()) <= threshold;
            if (startsClose && endsClose) {
                // The joined span covers both input spans.
                int newSpanStartIndex = Math.min(innerSpan.getStart(), outerSpan.getStart());
                int newSpanEndIndex = Math.max(innerSpan.getEnd(), outerSpan.getEnd());
                String attributeName = this.joinAttributeName;
                String fieldValue = (String) innerTuple.getField(attributeName).getValue();
                String newFieldValue = fieldValue.substring(newSpanStartIndex, newSpanEndIndex);
                String spanKey = outerSpan.getKey() + "_" + innerSpan.getKey();
                Span newSpan = new Span(attributeName, newSpanStartIndex, newSpanEndIndex, spanKey, newFieldValue);
                newJoinSpanList.add(newSpan);
            }
        }
    }
    if (newJoinSpanList.isEmpty()) {
        return null;
    }
    // create output fields based on innerTuple's value; the span list is replaced by the joined spans
    List<Attribute> outputAttrList = outputSchema.getAttributes();
    List<IField> outputFields = outputAttrList.stream().filter(attr -> !attr.equals(SchemaConstants.SPAN_LIST_ATTRIBUTE)).map(attr -> attr.getName()).map(attributeName -> innerTuple.getField(attributeName, IField.class)).collect(Collectors.toList());
    outputFields.add(new ListField<>(newJoinSpanList));
    return new Tuple(outputSchema, outputFields.stream().toArray(IField[]::new));
}
Also used : JsonProperty(com.fasterxml.jackson.annotation.JsonProperty) ListField(edu.uci.ics.texera.api.field.ListField) edu.uci.ics.texera.api.tuple(edu.uci.ics.texera.api.tuple) Iterator(java.util.Iterator) ImmutableMap(com.google.common.collect.ImmutableMap) PropertyNameConstants(edu.uci.ics.texera.dataflow.common.PropertyNameConstants) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) TexeraException(edu.uci.ics.texera.api.exception.TexeraException) PredicateBase(edu.uci.ics.texera.dataflow.common.PredicateBase) Collectors(java.util.stream.Collectors) ObjectNode(com.fasterxml.jackson.databind.node.ObjectNode) Span(edu.uci.ics.texera.api.span.Span) ArrayList(java.util.ArrayList) List(java.util.List) OperatorGroupConstants(edu.uci.ics.texera.dataflow.common.OperatorGroupConstants) SchemaConstants(edu.uci.ics.texera.api.constants.SchemaConstants) IField(edu.uci.ics.texera.api.field.IField) Map(java.util.Map) JsonCreator(com.fasterxml.jackson.annotation.JsonCreator) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) Schema(edu.uci.ics.texera.api.schema.Schema) Attribute(edu.uci.ics.texera.api.schema.Attribute) Attribute(edu.uci.ics.texera.api.schema.Attribute) ArrayList(java.util.ArrayList) IField(edu.uci.ics.texera.api.field.IField) Span(edu.uci.ics.texera.api.span.Span)

Example 38 with ListField

use of edu.uci.ics.texera.api.field.ListField in project textdb by TextDB.

the class DictionaryMatcher method appendConjunctionMatchingSpans4Dictionary.

/**
 * Collects conjunction-matching spans for every dictionary query against the
 * given attributes of one tuple.
 *
 * <p>For a STRING attribute a span covering the whole value is added when the
 * value itself is one of the queries. For a TEXT attribute, the payload spans
 * relevant to each query are restricted to that attribute and added when
 * {@code DataflowUtils.isAllQueryTokensPresent} accepts them for the query's
 * token set.
 *
 * @param inputTuple        tuple to inspect; its PAYLOAD field supplies the candidate spans
 * @param attributeNames    attributes to match against
 * @param queryTokenSetList token set of each query, passed to {@code filterRelevantSpans}
 *                          and looked up by the keys of the map it returns
 * @param queryList         the dictionary queries as whole strings
 * @return all matching spans, possibly empty
 * @throws DataflowException if an attribute is neither STRING nor TEXT
 */
private List<Span> appendConjunctionMatchingSpans4Dictionary(Tuple inputTuple, List<String> attributeNames, List<Set<String>> queryTokenSetList, List<String> queryList) throws DataflowException {
    ListField<Span> payloadField = inputTuple.getField(SchemaConstants.PAYLOAD);
    List<Span> payloadSpans = payloadField.getValue();
    Map<Integer, List<Span>> spansByQueryIndex = filterRelevantSpans(payloadSpans, queryTokenSetList);
    List<Span> collected = new ArrayList<>();
    for (String attributeName : attributeNames) {
        AttributeType type = inputTuple.getSchema().getAttribute(attributeName).getType();
        String text = inputTuple.getField(attributeName).getValue().toString();
        boolean isString = type == AttributeType.STRING;
        boolean isText = type == AttributeType.TEXT;
        // types other than TEXT and STRING: throw Exception for now
        if (!isString && !isText) {
            throw new DataflowException("KeywordMatcher: Fields other than STRING and TEXT are not supported yet");
        }
        if (isString) {
            // STRING: the complete field value must itself be a dictionary entry
            if (queryList.contains(text)) {
                collected.add(new Span(attributeName, 0, text.length(), text, text));
            }
            continue;
        }
        // TEXT: every token of the query must be present among this attribute's spans
        for (Map.Entry<Integer, List<Span>> entry : spansByQueryIndex.entrySet()) {
            List<Span> spansInAttribute = entry.getValue().stream().filter(s -> s.getAttributeName().equals(attributeName)).collect(Collectors.toList());
            if (DataflowUtils.isAllQueryTokensPresent(spansInAttribute, queryTokenSetList.get(entry.getKey()))) {
                collected.addAll(spansInAttribute);
            }
        }
    }
    return collected;
}
Also used : ListField(edu.uci.ics.texera.api.field.ListField) java.util(java.util) Tuple(edu.uci.ics.texera.api.tuple.Tuple) TexeraException(edu.uci.ics.texera.api.exception.TexeraException) Collectors(java.util.stream.Collectors) Span(edu.uci.ics.texera.api.span.Span) Matcher(java.util.regex.Matcher) SchemaConstants(edu.uci.ics.texera.api.constants.SchemaConstants) IOperator(edu.uci.ics.texera.api.dataflow.IOperator) AbstractSingleInputOperator(edu.uci.ics.texera.dataflow.common.AbstractSingleInputOperator) ErrorMessages(edu.uci.ics.texera.api.constants.ErrorMessages) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) Schema(edu.uci.ics.texera.api.schema.Schema) Pattern(java.util.regex.Pattern) DataflowUtils(edu.uci.ics.texera.dataflow.utils.DataflowUtils) KeywordMatchingType(edu.uci.ics.texera.dataflow.keywordmatcher.KeywordMatchingType) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) Span(edu.uci.ics.texera.api.span.Span)

Example 39 with ListField

use of edu.uci.ics.texera.api.field.ListField in project textdb by TextDB.

the class DataReader method constructTuple.

/**
 * Builds the output tuple for the Lucene document with the given id.
 *
 * <p>The document is converted to fields through {@code documentToFields};
 * when {@code payloadAdded} is set, a span-list field built by
 * {@code buildPayloadFromTermVector} is appended as the last field.
 *
 * @param docID id of the document to load from {@code luceneIndexSearcher}
 * @return a tuple over {@code outputSchema} holding the document's fields
 * @throws IOException    declared for the index access performed here
 * @throws ParseException declared for the field conversion performed here
 */
private Tuple constructTuple(int docID) throws IOException, ParseException {
    ArrayList<IField> fields = documentToFields(luceneIndexSearcher.doc(docID));
    if (payloadAdded) {
        // The payload is derived from the fields gathered so far, then appended to them.
        fields.add(new ListField<Span>(buildPayloadFromTermVector(fields, docID)));
    }
    IField[] fieldArray = fields.stream().toArray(IField[]::new);
    return new Tuple(outputSchema, fieldArray);
}
Also used : ListField(edu.uci.ics.texera.api.field.ListField) Document(org.apache.lucene.document.Document) IField(edu.uci.ics.texera.api.field.IField) Span(edu.uci.ics.texera.api.span.Span)

Example 40 with ListField

use of edu.uci.ics.texera.api.field.ListField in project textdb by TextDB.

the class NlpEntityOperator method processOneInputTuple.

/**
 * Extracts NLP spans from every attribute named by the predicate and attaches
 * them to the input tuple.
 *
 * @param inputTuple tuple whose attributes are scanned
 * @return the input tuple extended with a LIST attribute (named by
 *         {@code predicate.getResultAttribute()}) holding all extracted spans,
 *         or {@code null} when no span was extracted
 * @throws TexeraException declared by the operator contract
 */
@Override
public Tuple processOneInputTuple(Tuple inputTuple) throws TexeraException {
    List<Span> nlpSpans = new ArrayList<>();
    for (String name : predicate.getAttributeNames()) {
        IField attributeField = inputTuple.getField(name);
        List<Span> spansOfAttribute = extractNlpSpans(attributeField, name);
        nlpSpans.addAll(spansOfAttribute);
    }
    // A tuple without any extracted span is dropped.
    return nlpSpans.isEmpty()
            ? null
            : new Tuple.Builder(inputTuple).add(predicate.getResultAttribute(), AttributeType.LIST, new ListField<Span>(nlpSpans)).build();
}
Also used : ArrayList(java.util.ArrayList) ListField(edu.uci.ics.texera.api.field.ListField) IField(edu.uci.ics.texera.api.field.IField) Span(edu.uci.ics.texera.api.span.Span)

Aggregations

ListField (edu.uci.ics.texera.api.field.ListField)42 Span (edu.uci.ics.texera.api.span.Span)40 IField (edu.uci.ics.texera.api.field.IField)33 ArrayList (java.util.ArrayList)32 Tuple (edu.uci.ics.texera.api.tuple.Tuple)27 Schema (edu.uci.ics.texera.api.schema.Schema)26 Test (org.junit.Test)19 TextField (edu.uci.ics.texera.api.field.TextField)11 SchemaConstants (edu.uci.ics.texera.api.constants.SchemaConstants)8 DataflowException (edu.uci.ics.texera.api.exception.DataflowException)8 AttributeType (edu.uci.ics.texera.api.schema.AttributeType)8 Collectors (java.util.stream.Collectors)8 TexeraException (edu.uci.ics.texera.api.exception.TexeraException)7 ErrorMessages (edu.uci.ics.texera.api.constants.ErrorMessages)6 DataflowUtils (edu.uci.ics.texera.dataflow.utils.DataflowUtils)6 java.util (java.util)6 AbstractSingleInputOperator (edu.uci.ics.texera.dataflow.common.AbstractSingleInputOperator)5 Attribute (edu.uci.ics.texera.api.schema.Attribute)4 JsonCreator (com.fasterxml.jackson.annotation.JsonCreator)2 JsonProperty (com.fasterxml.jackson.annotation.JsonProperty)2