Search in sources:

Example 16 with DataflowException

use of edu.uci.ics.texera.api.exception.DataflowException in project textdb by TextDB.

the class KeywordMatcher method appendSubstringMatchingSpans.

/**
 * Collects a span for every occurrence of {@code queryKeyword} in the given attributes of
 * {@code inputTuple}.
 *
 * <p>A STRING attribute matches only when its whole value equals the keyword (case-sensitive),
 * producing a single span that covers the entire value. A TEXT attribute is searched for the
 * keyword as a substring after lower-casing both sides, and every occurrence produces one span,
 * including occurrences that start inside a previous match.
 *
 * @param inputTuple tuple whose fields are searched
 * @param attributeNames names of the attributes to search
 * @param queryKeyword keyword to look for
 * @return the matching spans, in the order of {@code attributeNames} and, within one attribute,
 *         by ascending start offset; empty when nothing matches
 * @throws DataflowException if a listed attribute is neither STRING nor TEXT
 */
private List<Span> appendSubstringMatchingSpans(Tuple inputTuple, List<String> attributeNames, String queryKeyword) throws DataflowException {
    List<Span> matchingResults = new ArrayList<>();
    for (String attributeName : attributeNames) {
        String fieldValue = inputTuple.getField(attributeName).getValue().toString();
        AttributeType attributeType = inputTuple.getSchema().getAttribute(attributeName).getType();
        // types other than TEXT and STRING: throw Exception for now
        if (attributeType != AttributeType.STRING && attributeType != AttributeType.TEXT) {
            throw new DataflowException("KeywordMatcher: Fields other than STRING and TEXT are not supported yet");
        }
        // for STRING type, the query should match the fieldValue completely
        if (attributeType == AttributeType.STRING) {
            if (fieldValue.equals(queryKeyword)) {
                matchingResults.add(new Span(attributeName, 0, queryKeyword.length(), queryKeyword, fieldValue));
            }
        }
        // for TEXT type, report every case-insensitive occurrence of the keyword
        if (attributeType == AttributeType.TEXT) {
            String fieldValueLowerCase = fieldValue.toLowerCase();
            String queryKeywordLowerCase = queryKeyword.toLowerCase();
            // The length bound also guarantees termination for an empty keyword, for which
            // indexOf never returns -1.
            int searchFrom = 0;
            while (searchFrom < fieldValueLowerCase.length()) {
                int index = fieldValueLowerCase.indexOf(queryKeywordLowerCase, searchFrom);
                if (index == -1) {
                    break;
                }
                matchingResults.add(new Span(attributeName, index, index + queryKeyword.length(), queryKeyword, fieldValue.substring(index, index + queryKeyword.length())));
                // Resume exactly one character after the match start. The previous loop advanced
                // by two (explicit index + 1 plus the loop increment) and therefore skipped an
                // occurrence starting at index + 1, e.g. the second "a" in "aa".
                searchFrom = index + 1;
            }
        }
    }
    return matchingResults;
}
Also used : AttributeType(edu.uci.ics.texera.api.schema.AttributeType) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) Span(edu.uci.ics.texera.api.span.Span)

Example 17 with DataflowException

use of edu.uci.ics.texera.api.exception.DataflowException in project textdb by TextDB.

the class KeywordMatcherSourceOperator method buildPhraseQuery.

/**
 * Builds the Lucene query for the predicate: a boolean query holding one optional (SHOULD)
 * clause per attribute named by the predicate.
 *
 * <ul>
 *   <li>STRING attribute: a term query on the predicate's query string as-is.</li>
 *   <li>TEXT attribute, single query token: a term query on the lower-cased query string.</li>
 *   <li>TEXT attribute, otherwise: a phrase query over the lower-cased tokens, each placed at
 *       its index in {@code queryTokensWithStopwords}; tokens contained in
 *       {@code StandardAnalyzer.STOP_WORDS_SET} are left out but still occupy their position.</li>
 * </ul>
 *
 * @return the combined boolean query
 * @throws DataflowException if an attribute is neither STRING nor TEXT
 */
private Query buildPhraseQuery() throws DataflowException {
    BooleanQuery.Builder disjunction = new BooleanQuery.Builder();

    for (String attributeName : this.predicate.getAttributeNames()) {
        AttributeType attributeType = this.inputSchema.getAttribute(attributeName).getType();

        if (attributeType == AttributeType.STRING) {
            // exact term on the untouched query string
            Term exactTerm = new Term(attributeName, predicate.getQuery());
            disjunction.add(new TermQuery(exactTerm), BooleanClause.Occur.SHOULD);
        } else if (attributeType == AttributeType.TEXT) {
            if (queryTokenList.size() == 1) {
                Term singleTerm = new Term(attributeName, predicate.getQuery().toLowerCase());
                disjunction.add(new TermQuery(singleTerm), BooleanClause.Occur.SHOULD);
            } else {
                PhraseQuery.Builder phrase = new PhraseQuery.Builder();
                for (int position = 0; position < queryTokensWithStopwords.size(); position++) {
                    String token = queryTokensWithStopwords.get(position);
                    if (StandardAnalyzer.STOP_WORDS_SET.contains(token)) {
                        // stop words are skipped, leaving a gap at this position
                        continue;
                    }
                    phrase.add(new Term(attributeName, token.toLowerCase()), position);
                }
                disjunction.add(phrase.build(), BooleanClause.Occur.SHOULD);
            }
        } else {
            // types other than TEXT and STRING: throw Exception for now
            throw new DataflowException("KeywordPredicate: Fields other than STRING and TEXT are not supported yet");
        }
    }

    return disjunction.build();
}
Also used : BooleanQuery(org.apache.lucene.search.BooleanQuery) TermQuery(org.apache.lucene.search.TermQuery) Query(org.apache.lucene.search.Query) PhraseQuery(org.apache.lucene.search.PhraseQuery) MatchAllDocsQuery(org.apache.lucene.search.MatchAllDocsQuery) TermQuery(org.apache.lucene.search.TermQuery) BooleanQuery(org.apache.lucene.search.BooleanQuery) PhraseQuery(org.apache.lucene.search.PhraseQuery) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) Term(org.apache.lucene.index.Term)

Example 18 with DataflowException

use of edu.uci.ics.texera.api.exception.DataflowException in project textdb by TextDB.

the class EmojiSentimentOperator method open.

/**
 * Opens this operator. Does nothing unless the cursor is {@code CLOSED}.
 *
 * <p>Opens the input operator, verifies that the predicate's input attribute exists in the
 * input operator's output schema and has type STRING or TEXT, then derives the output schema
 * with {@code transformSchema} and marks the cursor {@code OPENED}.
 *
 * @throws DataflowException if no input operator has been set
 * @throws TexeraException if the input attribute is missing from the input schema or has an
 *         unsupported type
 */
@Override
public void open() throws TexeraException {
    if (cursor != CLOSED) {
        return;
    }
    if (inputOperator == null) {
        throw new DataflowException(ErrorMessages.INPUT_OPERATOR_NOT_SPECIFIED);
    }
    inputOperator.open();

    Schema upstreamSchema = inputOperator.getOutputSchema();

    // the attribute to analyse has to exist upstream
    boolean attributePresent = upstreamSchema.containsAttribute(predicate.getInputAttributeName());
    if (!attributePresent) {
        String message = String.format("input attribute %s is not in the input schema %s", predicate.getInputAttributeName(), upstreamSchema.getAttributeNames());
        throw new TexeraException(message);
    }

    // ... and it has to be textual
    AttributeType actualType = upstreamSchema.getAttribute(predicate.getInputAttributeName()).getType();
    if (!actualType.equals(AttributeType.STRING) && !actualType.equals(AttributeType.TEXT)) {
        String message = String.format("input attribute %s must have type String or Text, its actual type is %s", predicate.getInputAttributeName(), actualType);
        throw new TexeraException(message);
    }

    // derive the output schema from the input schema
    outputSchema = transformSchema(inputOperator.getOutputSchema());
    cursor = OPENED;
}
Also used : AttributeType(edu.uci.ics.texera.api.schema.AttributeType) Schema(edu.uci.ics.texera.api.schema.Schema) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) TexeraException(edu.uci.ics.texera.api.exception.TexeraException)

Example 19 with DataflowException

use of edu.uci.ics.texera.api.exception.DataflowException in project textdb by TextDB.

the class NltkSentimentOperator method open.

/**
 * Opens this operator; a no-op when the cursor is not {@code CLOSED}.
 *
 * <p>After opening the input operator, the predicate's input attribute is validated against
 * the input operator's output schema: it must be present and be of type STRING or TEXT. On
 * success the output schema is produced by {@code transformSchema} and the cursor becomes
 * {@code OPENED}.
 *
 * @throws DataflowException if the input operator is {@code null}
 * @throws TexeraException if the input attribute is absent from the input schema or is not
 *         STRING/TEXT
 */
@Override
public void open() throws TexeraException {
    if (cursor != CLOSED) {
        return;
    }
    if (inputOperator == null) {
        throw new DataflowException(ErrorMessages.INPUT_OPERATOR_NOT_SPECIFIED);
    }

    inputOperator.open();
    Schema sourceSchema = inputOperator.getOutputSchema();

    // the input attribute must be part of the incoming schema
    if (!sourceSchema.containsAttribute(predicate.getInputAttributeName())) {
        String missingAttributeMessage = String.format("input attribute %s is not in the input schema %s", predicate.getInputAttributeName(), sourceSchema.getAttributeNames());
        throw new TexeraException(missingAttributeMessage);
    }

    // only STRING and TEXT attributes are accepted
    AttributeType sourceType = sourceSchema.getAttribute(predicate.getInputAttributeName()).getType();
    if (!(sourceType.equals(AttributeType.STRING) || sourceType.equals(AttributeType.TEXT))) {
        String wrongTypeMessage = String.format("input attribute %s must have type String or Text, its actual type is %s", predicate.getInputAttributeName(), sourceType);
        throw new TexeraException(wrongTypeMessage);
    }

    // the output schema is a transformation of the input schema
    outputSchema = transformSchema(inputOperator.getOutputSchema());
    cursor = OPENED;
}
Also used : AttributeType(edu.uci.ics.texera.api.schema.AttributeType) Schema(edu.uci.ics.texera.api.schema.Schema) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) TexeraException(edu.uci.ics.texera.api.exception.TexeraException)

Example 20 with DataflowException

use of edu.uci.ics.texera.api.exception.DataflowException in project textdb by TextDB.

the class ProjectionOperator method setUp.

/**
 * Derives the output schema from the input operator's output schema.
 *
 * <p>Keeps, in input-schema order, every attribute whose lower-cased name is contained in the
 * predicate's projection fields. The number of kept attributes must equal the number of
 * projection fields.
 *
 * @throws DataflowException if the number of matching attributes differs from the number of
 *         projection fields
 */
@Override
protected void setUp() throws TexeraException {
    inputSchema = inputOperator.getOutputSchema();

    List<Attribute> projected = inputSchema.getAttributes()
            .stream()
            .filter(attribute -> {
                String lowerCasedName = attribute.getName().toLowerCase();
                return predicate.getProjectionFields().contains(lowerCasedName);
            })
            .collect(Collectors.toList());

    // a size mismatch is reported as a projection field missing from the input schema
    if (projected.size() != predicate.getProjectionFields().size()) {
        throw new DataflowException("input schema doesn't contain one of the attributes to be projected");
    }

    Attribute[] projectedArray = projected.toArray(new Attribute[0]);
    outputSchema = new Schema(projectedArray);
}
Also used : List(java.util.List) IField(edu.uci.ics.texera.api.field.IField) AbstractSingleInputOperator(edu.uci.ics.texera.dataflow.common.AbstractSingleInputOperator) Tuple(edu.uci.ics.texera.api.tuple.Tuple) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) TexeraException(edu.uci.ics.texera.api.exception.TexeraException) Schema(edu.uci.ics.texera.api.schema.Schema) Attribute(edu.uci.ics.texera.api.schema.Attribute) Collectors(java.util.stream.Collectors) Attribute(edu.uci.ics.texera.api.schema.Attribute) Schema(edu.uci.ics.texera.api.schema.Schema) DataflowException(edu.uci.ics.texera.api.exception.DataflowException)

Aggregations

DataflowException (edu.uci.ics.texera.api.exception.DataflowException) 56, TexeraException (edu.uci.ics.texera.api.exception.TexeraException) 23, AttributeType (edu.uci.ics.texera.api.schema.AttributeType) 20, Schema (edu.uci.ics.texera.api.schema.Schema) 20, Tuple (edu.uci.ics.texera.api.tuple.Tuple) 18, IOException (java.io.IOException) 14, Span (edu.uci.ics.texera.api.span.Span) 11, Collectors (java.util.stream.Collectors) 10, SchemaConstants (edu.uci.ics.texera.api.constants.SchemaConstants) 9, ArrayList (java.util.ArrayList) 9, Attribute (edu.uci.ics.texera.api.schema.Attribute) 8, IOperator (edu.uci.ics.texera.api.dataflow.IOperator) 7, IField (edu.uci.ics.texera.api.field.IField) 7, ListField (edu.uci.ics.texera.api.field.ListField) 7, List (java.util.List) 7, AbstractSingleInputOperator (edu.uci.ics.texera.dataflow.common.AbstractSingleInputOperator) 6, ErrorMessages (edu.uci.ics.texera.api.constants.ErrorMessages) 5, StorageException (edu.uci.ics.texera.api.exception.StorageException) 5, IntegerField (edu.uci.ics.texera.api.field.IntegerField) 4, DataflowUtils (edu.uci.ics.texera.dataflow.utils.DataflowUtils) 4