Search in sources :

Example 6 with AttributeType

use of edu.uci.ics.texera.api.schema.AttributeType in project textdb by TextDB.

the class JoinDistancePredicate method generateIntersectionSchema.

/**
 * Builds the join output schema as the intersection of the two input schemas.
 *
 * An inner attribute is kept only when the outer attribute list contains it
 * (membership is decided by {@code List.contains}; per the original contract this
 * means the attributes match in both name and type).
 *
 * The resulting schema must be non-empty and must contain:
 * the join attribute, the "_ID" attribute and the "spanList" attribute.
 * In addition, the join attribute must be of type TEXT or STRING.
 *
 * @param innerOperatorSchema schema produced by the inner operator
 * @param outerOperatorSchema schema produced by the outer operator
 * @return the schema made of the attributes shared by both inputs
 * @throws DataflowException if any of the requirements above is not met
 */
private Schema generateIntersectionSchema(Schema innerOperatorSchema, Schema outerOperatorSchema) throws DataflowException {
    List<Attribute> innerAttrs = innerOperatorSchema.getAttributes();
    List<Attribute> outerAttrs = outerOperatorSchema.getAttributes();
    List<Attribute> sharedAttrs = innerAttrs.stream()
            .filter(candidate -> outerAttrs.contains(candidate))
            .collect(Collectors.toList());
    Schema sharedSchema = new Schema(sharedAttrs.toArray(new Attribute[0]));

    // Each guard throws, so they can be checked one after another.
    if (sharedSchema.getAttributes().isEmpty()) {
        throw new DataflowException("inner operator and outer operator don't share any common attributes");
    }
    if (!sharedSchema.containsAttribute(this.joinAttributeName)) {
        throw new DataflowException("inner operator or outer operator doesn't contain join attribute");
    }
    if (!sharedSchema.containsAttribute(SchemaConstants._ID)) {
        throw new DataflowException("inner operator or outer operator doesn't contain _ID attribute");
    }
    if (!sharedSchema.containsAttribute(SchemaConstants.SPAN_LIST)) {
        throw new DataflowException("inner operator or outer operator doesn't contain spanList attribute");
    }

    // The join itself only works on textual attributes.
    AttributeType joinType = sharedSchema.getAttribute(this.joinAttributeName).getType();
    boolean textual = joinType == AttributeType.TEXT || joinType == AttributeType.STRING;
    if (!textual) {
        throw new DataflowException(String.format("Join attribute %s must be either TEXT or STRING.", this.joinAttributeName));
    }
    return sharedSchema;
}
Also used : JsonProperty(com.fasterxml.jackson.annotation.JsonProperty) ListField(edu.uci.ics.texera.api.field.ListField) edu.uci.ics.texera.api.tuple(edu.uci.ics.texera.api.tuple) Iterator(java.util.Iterator) ImmutableMap(com.google.common.collect.ImmutableMap) PropertyNameConstants(edu.uci.ics.texera.dataflow.common.PropertyNameConstants) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) TexeraException(edu.uci.ics.texera.api.exception.TexeraException) PredicateBase(edu.uci.ics.texera.dataflow.common.PredicateBase) Collectors(java.util.stream.Collectors) ObjectNode(com.fasterxml.jackson.databind.node.ObjectNode) Span(edu.uci.ics.texera.api.span.Span) ArrayList(java.util.ArrayList) List(java.util.List) OperatorGroupConstants(edu.uci.ics.texera.dataflow.common.OperatorGroupConstants) SchemaConstants(edu.uci.ics.texera.api.constants.SchemaConstants) IField(edu.uci.ics.texera.api.field.IField) Map(java.util.Map) JsonCreator(com.fasterxml.jackson.annotation.JsonCreator) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) Schema(edu.uci.ics.texera.api.schema.Schema) Attribute(edu.uci.ics.texera.api.schema.Attribute) Attribute(edu.uci.ics.texera.api.schema.Attribute) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) Schema(edu.uci.ics.texera.api.schema.Schema) DataflowException(edu.uci.ics.texera.api.exception.DataflowException)

Example 7 with AttributeType

use of edu.uci.ics.texera.api.schema.AttributeType in project textdb by TextDB.

the class DictionaryMatcher method processOneInputTuple.

/**
 * Matches one input tuple against the predicate's dictionary and returns the tuple
 * enriched with the matching spans.
 *
 * The matching strategy is selected by {@code predicate.getKeywordMatchingType()}:
 * conjunction and phrase matching are delegated to the corresponding
 * {@code append...4Dictionary} methods, substring matching runs the dictionary trie
 * over each attribute value, and regex matching runs every dictionary pattern over
 * each attribute value.
 *
 * @param inputTuple the tuple to process; may be {@code null}
 * @return {@code null} when {@code inputTuple} is {@code null} or nothing matched;
 *         otherwise a copy of the (possibly payload-extended) tuple, with the span
 *         list attribute appended when {@code addResultAttribute} is set
 * @throws TexeraException if an attribute to match is neither STRING nor TEXT, or
 *         if the keyword matching type is not one of the supported types
 */
@Override
public Tuple processOneInputTuple(Tuple inputTuple) throws TexeraException {
    if (inputTuple == null) {
        return null;
    }
    // add payload if needed before passing it to the matching functions
    if (addPayload) {
        Tuple.Builder tupleBuilderPayload = new Tuple.Builder(inputTuple);
        tupleBuilderPayload.add(SchemaConstants.PAYLOAD_ATTRIBUTE, new ListField<Span>(DataflowUtils.generatePayloadFromTuple(inputTuple, predicate.getAnalyzerString())));
        inputTuple = tupleBuilderPayload.build();
    }
    KeywordMatchingType matchingType = predicate.getKeywordMatchingType();
    List<Span> matchingResults;
    if (matchingType == KeywordMatchingType.CONJUNCTION_INDEXBASED) {
        ArrayList<String> dictionaryEntries = predicate.getDictionary().getDictionaryEntries();
        ArrayList<Set<String>> tokenSetsNoStopwords = predicate.getDictionary().getTokenSetsNoStopwords();
        matchingResults = appendConjunctionMatchingSpans4Dictionary(inputTuple, predicate.getAttributeNames(), tokenSetsNoStopwords, dictionaryEntries);
    } else if (matchingType == KeywordMatchingType.PHRASE_INDEXBASED) {
        ArrayList<String> dictionaryEntries = predicate.getDictionary().getDictionaryEntries();
        ArrayList<List<String>> tokenListsNoStopwords = predicate.getDictionary().getTokenListsNoStopwords();
        ArrayList<List<String>> tokenListsWithStopwords = predicate.getDictionary().getTokenListsWithStopwords();
        ArrayList<Set<String>> tokenSetsNoStopwords = predicate.getDictionary().getTokenSetsNoStopwords();
        matchingResults = appendPhraseMatchingSpans4Dictionary(inputTuple, predicate.getAttributeNames(), tokenListsNoStopwords, tokenSetsNoStopwords, tokenListsWithStopwords, dictionaryEntries);
    } else if (matchingType == KeywordMatchingType.SUBSTRING_SCANBASED) {
        matchingResults = new ArrayList<Span>();
        for (String attributeName : predicate.getAttributeNames()) {
            String fieldValue = requireTextualFieldValue(inputTuple, attributeName);
            for (ACTrie.Emit emit : dictionaryTrie.parseText(fieldValue)) {
                matchingResults.add(new Span(attributeName, emit.getStart(), emit.getEnd(), emit.getKeyword(), fieldValue.substring(emit.getStart(), emit.getEnd())));
            }
        }
    } else if (matchingType == KeywordMatchingType.REGEX) {
        ArrayList<Pattern> patternList = predicate.getDictionary().getPatternList();
        ArrayList<String> dictionaryEntries = predicate.getDictionary().getDictionaryEntries();
        matchingResults = new ArrayList<>();
        // Results are ordered by dictionary entry first, then by attribute.
        for (int i = 0; i < dictionaryEntries.size(); i++) {
            for (String attributeName : predicate.getAttributeNames()) {
                String fieldValue = requireTextualFieldValue(inputTuple, attributeName);
                Matcher javaMatcher = patternList.get(i).matcher(fieldValue);
                while (javaMatcher.find()) {
                    int start = javaMatcher.start();
                    int end = javaMatcher.end();
                    matchingResults.add(new Span(attributeName, start, end, dictionaryEntries.get(i), fieldValue.substring(start, end)));
                }
            }
        }
    } else {
        // Fail with a descriptive error instead of dereferencing a null result list below.
        throw new DataflowException("DictionaryMatcher: unsupported keyword matching type " + matchingType);
    }
    if (matchingResults.isEmpty()) {
        return null;
    }
    Tuple.Builder tupleBuilder = new Tuple.Builder(inputTuple);
    if (addResultAttribute) {
        tupleBuilder.add(predicate.getSpanListName(), AttributeType.LIST, new ListField<Span>(matchingResults));
    }
    return tupleBuilder.build();
}

/**
 * Returns the string form of the given attribute's value after checking that the
 * attribute is STRING or TEXT. The type is validated before the value is read so
 * that an unsupported field is always reported with the "not supported" error.
 *
 * @throws DataflowException if the attribute type is neither STRING nor TEXT
 */
private static String requireTextualFieldValue(Tuple tuple, String attributeName) throws DataflowException {
    AttributeType attributeType = tuple.getSchema().getAttribute(attributeName).getType();
    // types other than TEXT and STRING: throw Exception for now
    if (attributeType != AttributeType.STRING && attributeType != AttributeType.TEXT) {
        throw new DataflowException("KeywordMatcher: Fields other than STRING and TEXT are not supported yet");
    }
    return tuple.getField(attributeName).getValue().toString();
}
Also used : Matcher(java.util.regex.Matcher) Span(edu.uci.ics.texera.api.span.Span) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) Tuple(edu.uci.ics.texera.api.tuple.Tuple)

Example 8 with AttributeType

use of edu.uci.ics.texera.api.schema.AttributeType in project textdb by TextDB.

the class DictionaryMatcher method appendPhraseMatchingSpans4Dictionary.

/**
 * Collects phrase-matching spans for every requested attribute of the tuple.
 *
 * STRING attributes match only when the whole field value is one of the queries in
 * {@code queryList}; the resulting span covers the entire value. TEXT attributes are
 * matched per query index: the payload spans selected by {@code filterRelevantSpans}
 * are narrowed to the current attribute and, when all query tokens are present,
 * turned into phrase spans by {@code DataflowUtils.constructPhraseMatchingSpans}.
 *
 * @param inputTuple tuple to match; its PAYLOAD field is read as a list of spans
 * @param attributeNames names of the attributes to match against
 * @param queryTokenList per-query token lists (passed to the phrase span constructor)
 * @param queryTokenSetList per-query token sets used for the presence check
 * @param queryTokenListWithStopwords per-query token lists including stopwords
 * @param queryList the dictionary queries, indexed like the token lists
 * @return all spans found, in attribute order
 * @throws DataflowException if an attribute is neither STRING nor TEXT
 */
public List<Span> appendPhraseMatchingSpans4Dictionary(Tuple inputTuple, List<String> attributeNames, List<List<String>> queryTokenList, List<Set<String>> queryTokenSetList, List<List<String>> queryTokenListWithStopwords, List<String> queryList) throws DataflowException {
    List<Span> collected = new ArrayList<>();
    ListField<Span> payloadField = inputTuple.getField(SchemaConstants.PAYLOAD);
    List<Span> payloadSpans = payloadField.getValue();
    Map<Integer, List<Span>> spansByQueryIndex = filterRelevantSpans(payloadSpans, queryTokenSetList);

    for (String attributeName : attributeNames) {
        AttributeType type = inputTuple.getSchema().getAttribute(attributeName).getType();
        String text = inputTuple.getField(attributeName).getValue().toString();
        boolean isString = type == AttributeType.STRING;
        boolean isText = type == AttributeType.TEXT;

        // types other than TEXT and STRING: throw Exception for now
        if (!(isString || isText)) {
            throw new DataflowException("KeywordMatcher: Fields other than STRING and TEXT are not supported yet");
        }

        if (isString) {
            // A STRING field matches only when its full value equals one of the queries.
            if (queryList.contains(text)) {
                collected.add(new Span(attributeName, 0, text.length(), text, text));
            }
            continue;
        }

        // TEXT field: rebuild spans for each query that has relevant payload spans.
        for (Map.Entry<Integer, List<Span>> entry : spansByQueryIndex.entrySet()) {
            int queryIndex = entry.getKey();
            List<Span> spansInField = new ArrayList<>();
            for (Span candidate : entry.getValue()) {
                if (candidate.getAttributeName().equals(attributeName)) {
                    spansInField.add(candidate);
                }
            }
            boolean usable = !spansInField.isEmpty()
                    && DataflowUtils.isAllQueryTokensPresent(spansInField, queryTokenSetList.get(queryIndex));
            if (usable) {
                collected.addAll(DataflowUtils.constructPhraseMatchingSpans(attributeName, text, queryList.get(queryIndex), spansInField, queryTokenListWithStopwords.get(queryIndex), queryTokenList.get(queryIndex)));
            }
        }
    }
    return collected;
}
Also used : ListField(edu.uci.ics.texera.api.field.ListField) java.util(java.util) Tuple(edu.uci.ics.texera.api.tuple.Tuple) TexeraException(edu.uci.ics.texera.api.exception.TexeraException) Collectors(java.util.stream.Collectors) Span(edu.uci.ics.texera.api.span.Span) Matcher(java.util.regex.Matcher) SchemaConstants(edu.uci.ics.texera.api.constants.SchemaConstants) IOperator(edu.uci.ics.texera.api.dataflow.IOperator) AbstractSingleInputOperator(edu.uci.ics.texera.dataflow.common.AbstractSingleInputOperator) ErrorMessages(edu.uci.ics.texera.api.constants.ErrorMessages) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) Schema(edu.uci.ics.texera.api.schema.Schema) Pattern(java.util.regex.Pattern) DataflowUtils(edu.uci.ics.texera.dataflow.utils.DataflowUtils) KeywordMatchingType(edu.uci.ics.texera.dataflow.keywordmatcher.KeywordMatchingType) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) Span(edu.uci.ics.texera.api.span.Span)

Example 9 with AttributeType

use of edu.uci.ics.texera.api.schema.AttributeType in project textdb by TextDB.

the class NltkSentimentOperator method transformToOutputSchema.

/**
 * Validates the single input schema and derives the operator's output schema from it.
 *
 * @param inputSchema the input schemas; exactly one is expected
 * @return the result of {@code transformSchema} applied to the input schema
 * @throws TexeraException if the number of schemas is not one, if the predicate's
 *         input attribute is missing from the schema, or if that attribute is
 *         neither STRING nor TEXT
 */
public Schema transformToOutputSchema(Schema... inputSchema) {
    if (inputSchema.length != 1) {
        throw new TexeraException(String.format(ErrorMessages.NUMBER_OF_ARGUMENTS_DOES_NOT_MATCH, 1, inputSchema.length));
    }
    Schema onlySchema = inputSchema[0];
    String attributeName = predicate.getInputAttributeName();

    // the attribute to analyze has to exist in the input schema
    if (!onlySchema.containsAttribute(attributeName)) {
        throw new TexeraException(String.format("input attribute %s is not in the input schema %s", attributeName, onlySchema.getAttributeNames()));
    }

    // only textual attributes are accepted
    AttributeType actualType = onlySchema.getAttribute(attributeName).getType();
    if (!actualType.equals(AttributeType.STRING) && !actualType.equals(AttributeType.TEXT)) {
        throw new TexeraException(String.format("input attribute %s must have type String or Text, its actual type is %s", attributeName, actualType));
    }
    return transformSchema(onlySchema);
}
Also used : AttributeType(edu.uci.ics.texera.api.schema.AttributeType) TexeraException(edu.uci.ics.texera.api.exception.TexeraException)

Example 10 with AttributeType

use of edu.uci.ics.texera.api.schema.AttributeType in project textdb by TextDB.

the class NltkSentimentOperator method convertToTexeraSchema.

/**
 * Translates an Arrow schema into a Texera schema, field by field, keeping field
 * names and mapping each Arrow type id to a Texera attribute type:
 * Int to INTEGER, FloatingPoint to DOUBLE, Bool to BOOLEAN, Utf8 and Null to TEXT,
 * Date to DATE, Struct to DATETIME, List to LIST.
 *
 * @param arrowSchema the Arrow schema to convert
 * @return a Texera schema with one attribute per Arrow field, in the same order
 * @throws DataflowException if a field has an Arrow type id that is not listed above
 */
private Schema convertToTexeraSchema(org.apache.arrow.vector.types.pojo.Schema arrowSchema) {
    List<Attribute> converted = new ArrayList<>();
    for (Field arrowField : arrowSchema.getFields()) {
        ArrowType arrowType = arrowField.getFieldType().getType();
        AttributeType texeraType;
        switch (arrowType.getTypeID()) {
            case Null:
            case Utf8:
                texeraType = TEXT;
                break;
            case Bool:
                texeraType = BOOLEAN;
                break;
            case Int:
                texeraType = INTEGER;
                break;
            case FloatingPoint:
                texeraType = DOUBLE;
                break;
            case Date:
                texeraType = DATE;
                break;
            case Struct:
                // For now only Struct of DateTime
                texeraType = DATETIME;
                break;
            case List:
                texeraType = LIST;
                break;
            default:
                throw new DataflowException("Unsupported data type " + arrowType.getTypeID() + " when converting back to Texera table.");
        }
        converted.add(new Attribute(arrowField.getName(), texeraType));
    }
    return new Schema(converted);
}
Also used : Field(org.apache.arrow.vector.types.pojo.Field) Attribute(edu.uci.ics.texera.api.schema.Attribute) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) Schema(edu.uci.ics.texera.api.schema.Schema) ArrowType(org.apache.arrow.vector.types.pojo.ArrowType) DataflowException(edu.uci.ics.texera.api.exception.DataflowException)

Aggregations

AttributeType (edu.uci.ics.texera.api.schema.AttributeType)31 DataflowException (edu.uci.ics.texera.api.exception.DataflowException)21 Schema (edu.uci.ics.texera.api.schema.Schema)16 TexeraException (edu.uci.ics.texera.api.exception.TexeraException)14 Attribute (edu.uci.ics.texera.api.schema.Attribute)13 Span (edu.uci.ics.texera.api.span.Span)10 Tuple (edu.uci.ics.texera.api.tuple.Tuple)7 SchemaConstants (edu.uci.ics.texera.api.constants.SchemaConstants)6 ListField (edu.uci.ics.texera.api.field.ListField)6 ArrayList (java.util.ArrayList)6 Collectors (java.util.stream.Collectors)6 ErrorMessages (edu.uci.ics.texera.api.constants.ErrorMessages)5 AbstractSingleInputOperator (edu.uci.ics.texera.dataflow.common.AbstractSingleInputOperator)5 DataflowUtils (edu.uci.ics.texera.dataflow.utils.DataflowUtils)5 IField (edu.uci.ics.texera.api.field.IField)4 java.util (java.util)4 Matcher (java.util.regex.Matcher)4 ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper)2 IOperator (edu.uci.ics.texera.api.dataflow.IOperator)2 KeywordMatchingType (edu.uci.ics.texera.dataflow.keywordmatcher.KeywordMatchingType)2