Search in sources:

Example 11 with AttributeType

use of edu.uci.ics.texera.api.schema.AttributeType in project textdb by TextDB.

Method buildPhraseQuery of the class KeywordMatcherSourceOperator.

/**
 * Builds the Lucene query used for phrase matching over every attribute named in the predicate.
 *
 * <p>One sub-query per attribute is added to a {@link BooleanQuery} as a SHOULD clause:
 * <ul>
 *   <li>STRING attribute: a {@link TermQuery} on the predicate's query string, unchanged.</li>
 *   <li>TEXT attribute with exactly one query token: a {@link TermQuery} on that token,
 *       lower-cased.</li>
 *   <li>TEXT attribute otherwise: a {@link PhraseQuery} holding every token of
 *       {@code queryTokensWithStopwords} that is not in {@code StandardAnalyzer.STOP_WORDS_SET},
 *       lower-cased and placed at its index in that list.</li>
 * </ul>
 *
 * @return the combined boolean query
 * @throws DataflowException if an attribute is neither STRING nor TEXT
 */
private Query buildPhraseQuery() throws DataflowException {
    BooleanQuery.Builder booleanQueryBuilder = new BooleanQuery.Builder();
    for (String attributeName : this.predicate.getAttributeNames()) {
        AttributeType attributeType = this.inputSchema.getAttribute(attributeName).getType();
        if (attributeType == AttributeType.STRING) {
            Query termQuery = new TermQuery(new Term(attributeName, predicate.getQuery()));
            booleanQueryBuilder.add(termQuery, BooleanClause.Occur.SHOULD);
        } else if (attributeType == AttributeType.TEXT) {
            if (queryTokenList.size() == 1) {
                // Look up the single token itself rather than the raw query string: a raw query
                // that still carries stop words or punctuation (e.g. "the cat") can never equal
                // one indexed term.
                // NOTE(review): assumes queryTokenList holds the analyzed query tokens with
                // stop words removed -- confirm against where the field is populated.
                Query termQuery = new TermQuery(new Term(attributeName, queryTokenList.get(0).toLowerCase()));
                booleanQueryBuilder.add(termQuery, BooleanClause.Occur.SHOULD);
            } else {
                PhraseQuery.Builder phraseQueryBuilder = new PhraseQuery.Builder();
                for (int i = 0; i < queryTokensWithStopwords.size(); i++) {
                    String token = queryTokensWithStopwords.get(i);
                    // Stop words are left out of the phrase, but the remaining tokens keep their
                    // original index as the phrase position, leaving a gap where each stop word was.
                    if (!StandardAnalyzer.STOP_WORDS_SET.contains(token)) {
                        phraseQueryBuilder.add(new Term(attributeName, token.toLowerCase()), i);
                    }
                }
                booleanQueryBuilder.add(phraseQueryBuilder.build(), BooleanClause.Occur.SHOULD);
            }
        } else {
            // types other than TEXT and STRING: throw Exception for now
            throw new DataflowException("KeywordPredicate: Fields other than STRING and TEXT are not supported yet");
        }
    }
    return booleanQueryBuilder.build();
}
Also used : BooleanQuery(org.apache.lucene.search.BooleanQuery) TermQuery(org.apache.lucene.search.TermQuery) Query(org.apache.lucene.search.Query) PhraseQuery(org.apache.lucene.search.PhraseQuery) MatchAllDocsQuery(org.apache.lucene.search.MatchAllDocsQuery) TermQuery(org.apache.lucene.search.TermQuery) BooleanQuery(org.apache.lucene.search.BooleanQuery) PhraseQuery(org.apache.lucene.search.PhraseQuery) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) Term(org.apache.lucene.index.Term)

Example 12 with AttributeType

use of edu.uci.ics.texera.api.schema.AttributeType in project textdb by TextDB.

Method appendConjunctionMatchingSpans of the class KeywordMatcher.

/**
 * Collects the spans of a tuple that satisfy a conjunction keyword query.
 *
 * <p>For a STRING attribute the whole field value must equal {@code queryKeyword}; one span
 * covering the entire value is then produced. For a TEXT attribute the tuple's payload spans
 * are narrowed to the relevant spans belonging to that attribute, and they are all added only
 * when {@code DataflowUtils.isAllQueryTokensPresent} accepts them.
 *
 * @param inputTuple     tuple to inspect; its {@code SchemaConstants.PAYLOAD} field is read as a list of spans
 * @param attributeNames names of the attributes to search
 * @param queryTokenSet  tokens of the query
 * @param queryKeyword   the full query string, used for STRING attributes
 * @return the matching spans, possibly empty
 * @throws DataflowException if an attribute is neither STRING nor TEXT
 */
private List<Span> appendConjunctionMatchingSpans(Tuple inputTuple, List<String> attributeNames, Set<String> queryTokenSet, String queryKeyword) throws DataflowException {
    ListField<Span> payloadField = inputTuple.getField(SchemaConstants.PAYLOAD);
    List<Span> payloadSpans = payloadField.getValue();
    List<Span> collected = new ArrayList<>();
    for (String attributeName : attributeNames) {
        AttributeType type = inputTuple.getSchema().getAttribute(attributeName).getType();
        String text = inputTuple.getField(attributeName).getValue().toString();
        boolean isString = type == AttributeType.STRING;
        // Only STRING and TEXT are handled for now; anything else is rejected.
        if (!isString && type != AttributeType.TEXT) {
            throw new DataflowException("KeywordMatcher: Fields other than STRING and TEXT are not supported yet");
        }
        if (isString) {
            // A STRING field matches only when the query equals the complete value.
            if (queryKeyword.equals(text)) {
                collected.add(new Span(attributeName, 0, text.length(), text, text));
            }
            continue;
        }
        // TEXT field: keep the relevant payload spans that belong to this attribute.
        List<Span> spansForAttribute = new ArrayList<>();
        for (Span candidate : filterRelevantSpans(payloadSpans, queryTokenSet)) {
            if (candidate.getAttributeName().equals(attributeName)) {
                spansForAttribute.add(candidate);
            }
        }
        if (DataflowUtils.isAllQueryTokensPresent(spansForAttribute, queryTokenSet)) {
            collected.addAll(spansForAttribute);
        }
    }
    return collected;
}
Also used : ListField(edu.uci.ics.texera.api.field.ListField) java.util(java.util) Tuple(edu.uci.ics.texera.api.tuple.Tuple) TexeraException(edu.uci.ics.texera.api.exception.TexeraException) Collectors(java.util.stream.Collectors) Span(edu.uci.ics.texera.api.span.Span) SchemaConstants(edu.uci.ics.texera.api.constants.SchemaConstants) AbstractSingleInputOperator(edu.uci.ics.texera.dataflow.common.AbstractSingleInputOperator) ErrorMessages(edu.uci.ics.texera.api.constants.ErrorMessages) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) Schema(edu.uci.ics.texera.api.schema.Schema) DataflowUtils(edu.uci.ics.texera.dataflow.utils.DataflowUtils) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) Span(edu.uci.ics.texera.api.span.Span)

Example 13 with AttributeType

use of edu.uci.ics.texera.api.schema.AttributeType in project textdb by TextDB.

Method appendSubstringMatchingSpans of the class KeywordMatcher.

/**
 * Finds the spans where {@code queryKeyword} occurs in the given attributes of a tuple.
 *
 * <p>For a STRING attribute the field value must equal the keyword exactly (case-sensitive);
 * a single span over {@code [0, queryKeyword.length())} is then produced. For a TEXT attribute
 * every case-insensitive occurrence of the keyword is reported, including occurrences that
 * start directly after, or overlap with, the previous one.
 *
 * @param inputTuple     tuple whose fields are searched
 * @param attributeNames names of the attributes to search
 * @param queryKeyword   keyword to look for
 * @return the matching spans, in attribute order and ascending start offset; possibly empty
 * @throws DataflowException if an attribute is neither STRING nor TEXT
 */
private List<Span> appendSubstringMatchingSpans(Tuple inputTuple, List<String> attributeNames, String queryKeyword) throws DataflowException {
    List<Span> matchingResults = new ArrayList<>();
    for (String attributeName : attributeNames) {
        AttributeType attributeType = inputTuple.getSchema().getAttribute(attributeName).getType();
        // types other than TEXT and STRING: throw Exception for now.
        // Checked before the field value is read so an unsupported field is reported as such.
        if (attributeType != AttributeType.STRING && attributeType != AttributeType.TEXT) {
            throw new DataflowException("KeywordMatcher: Fields other than STRING and TEXT are not supported yet");
        }
        String fieldValue = inputTuple.getField(attributeName).getValue().toString();
        // for STRING type, the query should match the fieldValue completely
        if (attributeType == AttributeType.STRING) {
            if (fieldValue.equals(queryKeyword)) {
                matchingResults.add(new Span(attributeName, 0, queryKeyword.length(), queryKeyword, fieldValue));
            }
            continue;
        }
        // TEXT type: case-insensitive scan for every occurrence of the keyword.
        // NOTE(review): offsets found in the lower-cased copy are applied to the original value;
        // this assumes lower-casing does not change the string length -- confirm for non-ASCII data.
        String fieldValueLowerCase = fieldValue.toLowerCase();
        String queryKeywordLowerCase = queryKeyword.toLowerCase();
        for (int searchFrom = 0; searchFrom < fieldValueLowerCase.length(); searchFrom++) {
            int index = fieldValueLowerCase.indexOf(queryKeywordLowerCase, searchFrom);
            if (index == -1) {
                break;
            }
            matchingResults.add(new Span(attributeName, index, index + queryKeyword.length(), queryKeyword, fieldValue.substring(index, index + queryKeyword.length())));
            // Resume from the match start: the loop increment then moves the scan to index + 1.
            // Jumping to index + 1 here would skip one character and miss a match that begins
            // right after this one (e.g. keyword "a" in "aa").
            searchFrom = index;
        }
    }
    return matchingResults;
}
Also used : AttributeType(edu.uci.ics.texera.api.schema.AttributeType) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) Span(edu.uci.ics.texera.api.span.Span)

Example 14 with AttributeType

use of edu.uci.ics.texera.api.schema.AttributeType in project textdb by TextDB.

Method appendPhraseMatchingSpans of the class KeywordMatcher.

/**
 * Collects the spans of a tuple that satisfy a phrase keyword query.
 *
 * <p>For a STRING attribute the whole field value must equal {@code queryKeyword}; one span
 * covering the entire value is then produced. For a TEXT attribute the tuple's payload spans
 * are narrowed to the relevant spans belonging to that attribute; when
 * {@code DataflowUtils.isAllQueryTokensPresent} accepts them, the spans returned by
 * {@code DataflowUtils.constructPhraseMatchingSpans} are added.
 *
 * @param inputTuple                  tuple to inspect; its {@code SchemaConstants.PAYLOAD} field is read as a list of spans
 * @param attributeNames              names of the attributes to search
 * @param queryTokenList              tokens of the query
 * @param queryTokenListWithStopwords tokens of the query, passed on to the phrase-span construction
 * @param queryKeyword                the full query string
 * @return the matching spans, possibly empty
 * @throws DataflowException if an attribute is neither STRING nor TEXT
 */
private List<Span> appendPhraseMatchingSpans(Tuple inputTuple, List<String> attributeNames, List<String> queryTokenList, List<String> queryTokenListWithStopwords, String queryKeyword) throws DataflowException {
    ListField<Span> payloadField = inputTuple.getField(SchemaConstants.PAYLOAD);
    List<Span> payloadSpans = payloadField.getValue();
    List<Span> collected = new ArrayList<>();
    for (String attributeName : attributeNames) {
        AttributeType type = inputTuple.getSchema().getAttribute(attributeName).getType();
        String text = inputTuple.getField(attributeName).getValue().toString();
        boolean isString = type == AttributeType.STRING;
        // Only STRING and TEXT are handled for now; anything else is rejected.
        if (!isString && type != AttributeType.TEXT) {
            throw new DataflowException("KeywordMatcher: Fields other than STRING and TEXT are not supported yet");
        }
        if (isString) {
            // A STRING field matches only when the query equals the complete value.
            if (queryKeyword.equals(text)) {
                collected.add(new Span(attributeName, 0, text.length(), text, text));
            }
            continue;
        }
        // TEXT field: phrase query over the relevant payload spans of this attribute.
        Set<String> queryTokenSet = new HashSet<>(queryTokenList);
        List<Span> spansForAttribute = new ArrayList<>();
        for (Span candidate : filterRelevantSpans(payloadSpans, queryTokenSet)) {
            if (candidate.getAttributeName().equals(attributeName)) {
                spansForAttribute.add(candidate);
            }
        }
        // Phrase spans are only built when the token-presence check on this attribute's spans passes.
        if (DataflowUtils.isAllQueryTokensPresent(spansForAttribute, queryTokenSet)) {
            collected.addAll(DataflowUtils.constructPhraseMatchingSpans(attributeName, text, queryKeyword, spansForAttribute, queryTokenListWithStopwords, queryTokenList));
        }
    }
    return collected;
}
Also used : ListField(edu.uci.ics.texera.api.field.ListField) java.util(java.util) Tuple(edu.uci.ics.texera.api.tuple.Tuple) TexeraException(edu.uci.ics.texera.api.exception.TexeraException) Collectors(java.util.stream.Collectors) Span(edu.uci.ics.texera.api.span.Span) SchemaConstants(edu.uci.ics.texera.api.constants.SchemaConstants) AbstractSingleInputOperator(edu.uci.ics.texera.dataflow.common.AbstractSingleInputOperator) ErrorMessages(edu.uci.ics.texera.api.constants.ErrorMessages) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) Schema(edu.uci.ics.texera.api.schema.Schema) DataflowUtils(edu.uci.ics.texera.dataflow.utils.DataflowUtils) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) Span(edu.uci.ics.texera.api.span.Span)

Example 15 with AttributeType

use of edu.uci.ics.texera.api.schema.AttributeType in project textdb by TextDB.

Method transformToOutputSchema of the class NlpSentimentOperator.

/**
 * Validates the single input schema and derives the output schema from it.
 *
 * @param inputSchema the input schemas; exactly one is expected
 * @return the result of {@code transformSchema} applied to the sole input schema
 * @throws TexeraException if the number of schemas is not one, if the predicate's input
 *         attribute is missing from the schema, or if that attribute is neither STRING nor TEXT
 */
public Schema transformToOutputSchema(Schema... inputSchema) {
    if (inputSchema.length != 1) {
        throw new TexeraException(String.format(ErrorMessages.NUMBER_OF_ARGUMENTS_DOES_NOT_MATCH, 1, inputSchema.length));
    }
    Schema schema = inputSchema[0];
    String inputAttributeName = predicate.getInputAttributeName();
    // The configured input attribute has to exist in the schema.
    if (!schema.containsAttribute(inputAttributeName)) {
        throw new TexeraException(String.format("input attribute %s is not in the input schema %s", inputAttributeName, schema.getAttributeNames()));
    }
    // Only STRING and TEXT attributes are accepted as input.
    AttributeType inputAttributeType = schema.getAttribute(inputAttributeName).getType();
    if (inputAttributeType.equals(AttributeType.STRING) || inputAttributeType.equals(AttributeType.TEXT)) {
        return transformSchema(schema);
    }
    throw new TexeraException(String.format("input attribute %s must have type String or Text, its actual type is %s", inputAttributeName, inputAttributeType));
}
Also used : AttributeType(edu.uci.ics.texera.api.schema.AttributeType) TexeraException(edu.uci.ics.texera.api.exception.TexeraException)

Aggregations

AttributeType (edu.uci.ics.texera.api.schema.AttributeType)31 DataflowException (edu.uci.ics.texera.api.exception.DataflowException)21 Schema (edu.uci.ics.texera.api.schema.Schema)16 TexeraException (edu.uci.ics.texera.api.exception.TexeraException)14 Attribute (edu.uci.ics.texera.api.schema.Attribute)13 Span (edu.uci.ics.texera.api.span.Span)10 Tuple (edu.uci.ics.texera.api.tuple.Tuple)7 SchemaConstants (edu.uci.ics.texera.api.constants.SchemaConstants)6 ListField (edu.uci.ics.texera.api.field.ListField)6 ArrayList (java.util.ArrayList)6 Collectors (java.util.stream.Collectors)6 ErrorMessages (edu.uci.ics.texera.api.constants.ErrorMessages)5 AbstractSingleInputOperator (edu.uci.ics.texera.dataflow.common.AbstractSingleInputOperator)5 DataflowUtils (edu.uci.ics.texera.dataflow.utils.DataflowUtils)5 IField (edu.uci.ics.texera.api.field.IField)4 java.util (java.util)4 Matcher (java.util.regex.Matcher)4 ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper)2 IOperator (edu.uci.ics.texera.api.dataflow.IOperator)2 KeywordMatchingType (edu.uci.ics.texera.dataflow.keywordmatcher.KeywordMatchingType)2