Search in sources :

Example 11 with DataFlowException

use of edu.uci.ics.textdb.api.exception.DataFlowException in project textdb by TextDB.

the class ProjectionOperator method setUp.

/**
 * Derives the output schema from the input operator's schema by keeping only
 * the attributes whose lower-cased name is listed in the predicate's
 * projection fields.
 *
 * @throws TextDBException a DataFlowException is thrown when the number of
 *         attributes kept differs from the number of projection fields
 */
@Override
protected void setUp() throws TextDBException {
    inputSchema = inputOperator.getOutputSchema();

    // Keep the input attributes, in schema order, that the predicate asks for.
    List<Attribute> keptAttributes = inputSchema.getAttributes().stream()
            .filter(candidate -> {
                String lowerCasedName = candidate.getAttributeName().toLowerCase();
                return predicate.getProjectionFields().contains(lowerCasedName);
            })
            .collect(Collectors.toList());

    // A size mismatch means at least one requested field was not found.
    boolean everyFieldFound = keptAttributes.size() == predicate.getProjectionFields().size();
    if (!everyFieldFound) {
        throw new DataFlowException("input schema doesn't contain one of the attributes to be projected");
    }

    outputSchema = new Schema(keptAttributes.toArray(new Attribute[keptAttributes.size()]));
}
Also used : Schema(edu.uci.ics.textdb.api.schema.Schema) List(java.util.List) Attribute(edu.uci.ics.textdb.api.schema.Attribute) TextDBException(edu.uci.ics.textdb.api.exception.TextDBException) AbstractSingleInputOperator(edu.uci.ics.textdb.exp.common.AbstractSingleInputOperator) IField(edu.uci.ics.textdb.api.field.IField) DataFlowException(edu.uci.ics.textdb.api.exception.DataFlowException) Collectors(java.util.stream.Collectors) Tuple(edu.uci.ics.textdb.api.tuple.Tuple) Attribute(edu.uci.ics.textdb.api.schema.Attribute) Schema(edu.uci.ics.textdb.api.schema.Schema) DataFlowException(edu.uci.ics.textdb.api.exception.DataFlowException)

Example 12 with DataFlowException

use of edu.uci.ics.textdb.api.exception.DataFlowException in project textdb by TextDB.

the class RegexMatcher method processOneInputTuple.

/**
 * Runs the regex against every attribute named by the predicate and adds the
 * resulting spans to the input tuple's span list.
 *
 * For example, given tuple ("george watson", "graduate student", 23,
 * "(949)888-8888") and regex "g[^\s]*", the spans collected are
 * [Span(name, 0, 6, "g[^\s]*", "george watson"), Span(position, 0, 8,
 * "g[^\s]*", "graduate student")]
 *
 * @param inputTuple
 *            document in which search is performed; may be null
 * @return the same {@code inputTuple} after the matching spans were added to
 *         the list obtained from its span-list field, or {@code null} when
 *         {@code inputTuple} is null or nothing matched
 * @throws DataFlowException
 *             if one of the attributes to match is neither STRING nor TEXT
 */
@Override
public Tuple processOneInputTuple(Tuple inputTuple) throws DataFlowException {
    if (inputTuple == null) {
        return null;
    }
    List<Span> matchingResults = new ArrayList<>();
    for (String attributeName : predicate.getAttributeNames()) {
        AttributeType attributeType = inputSchema.getAttribute(attributeName).getAttributeType();
        // types other than TEXT and STRING: throw Exception for now.
        // Validate the type BEFORE reading the field value, so an unsupported
        // attribute is reported with this exception rather than failing while
        // its value is being converted to a string.
        if (attributeType != AttributeType.STRING && attributeType != AttributeType.TEXT) {
            throw new DataFlowException("RegexMatcher: Fields other than STRING and TEXT are not supported yet");
        }
        String fieldValue = inputTuple.getField(attributeName).getValue().toString();
        switch (regexEngine) {
            case JavaRegex:
                matchingResults.addAll(javaRegexMatch(fieldValue, attributeName));
                break;
            case RE2J:
                matchingResults.addAll(re2jRegexMatch(fieldValue, attributeName));
                break;
        }
    }
    // No match in any attribute: the tuple is dropped.
    if (matchingResults.isEmpty()) {
        return null;
    }
    // Append the new spans to whatever the span-list field already holds.
    ListField<Span> spanListField = inputTuple.getField(predicate.getSpanListName());
    List<Span> spanList = spanListField.getValue();
    spanList.addAll(matchingResults);
    return inputTuple;
}
Also used : AttributeType(edu.uci.ics.textdb.api.schema.AttributeType) ArrayList(java.util.ArrayList) DataFlowException(edu.uci.ics.textdb.api.exception.DataFlowException) Span(edu.uci.ics.textdb.api.span.Span)

Example 13 with DataFlowException

use of edu.uci.ics.textdb.api.exception.DataFlowException in project textdb by TextDB.

the class DictionaryMatcher method open.

/**
 * Opens this operator: positions the dictionary on its first entry, builds a
 * KeywordMatcher for that entry on top of a DictionaryTupleCacheOperator that
 * wraps the input operator, opens both, and adopts the keyword matcher's
 * output schema. Does nothing when the cursor is not CLOSED.
 *
 * @throws DataFlowException
 *             if no input operator was set, if the dictionary has no entry,
 *             or if any other exception is raised while opening (that
 *             exception is attached as the cause)
 */
@Override
public void open() throws DataFlowException {
    if (cursor != CLOSED) {
        return;
    }
    try {
        if (inputOperator == null) {
            throw new DataFlowException(ErrorMessages.INPUT_OPERATOR_NOT_SPECIFIED);
        }
        predicate.getDictionary().resetCursor();
        currentDictionaryEntry = predicate.getDictionary().getNextEntry();
        if (currentDictionaryEntry == null) {
            throw new DataFlowException("Dictionary is empty");
        }
        keywordPredicate = new KeywordPredicate(currentDictionaryEntry, predicate.getAttributeNames(), predicate.getAnalyzerString(), predicate.getKeywordMatchingType(), predicate.getSpanListName());
        keywordMatcher = new KeywordMatcher(keywordPredicate);
        // The keyword matcher reads from the cache operator, which in turn
        // reads from the real input operator.
        cacheOperator = new DictionaryTupleCacheOperator();
        cacheOperator.setInputOperator(inputOperator);
        keywordMatcher.setInputOperator(cacheOperator);
        cacheOperator.openAll();
        keywordMatcher.open();
        outputSchema = keywordMatcher.getOutputSchema();
    } catch (DataFlowException e) {
        // Already the declared exception type: propagate it unchanged instead
        // of wrapping it in a second DataFlowException with the same message.
        throw e;
    } catch (Exception e) {
        throw new DataFlowException(e.getMessage(), e);
    }
    cursor = OPENED;
}
Also used : DataFlowException(edu.uci.ics.textdb.api.exception.DataFlowException) KeywordMatcher(edu.uci.ics.textdb.exp.keywordmatcher.KeywordMatcher) KeywordPredicate(edu.uci.ics.textdb.exp.keywordmatcher.KeywordPredicate) TextDBException(edu.uci.ics.textdb.api.exception.TextDBException) DataFlowException(edu.uci.ics.textdb.api.exception.DataFlowException)

Example 14 with DataFlowException

use of edu.uci.ics.textdb.api.exception.DataFlowException in project textdb by TextDB.

the class DictionaryMatcher method getNextTuple.

/**
 * Returns the next matching tuple, iterating over the dictionary entries one
 * at a time: the current KeywordMatcher is drained first, then it is closed
 * and replaced by a new KeywordMatcher built for the next dictionary entry.
 * Tuples produced before {@code offset} is reached are counted but skipped.
 *
 * @return the next tuple, or {@code null} when the limit has been reached or
 *         the dictionary has no more entries
 * @throws TextDBException
 *             a DataFlowException if the operator has not been opened; other
 *             failures come from the underlying keyword matcher
 */
@Override
public Tuple getNextTuple() throws TextDBException {
    if (cursor == CLOSED) {
        throw new DataFlowException(ErrorMessages.OPERATOR_NOT_OPENED);
    }
    // Compute the upper bound in long arithmetic: with a very large limit
    // (e.g. Integer.MAX_VALUE) and a positive offset, int addition would wrap
    // to a negative value and make this check succeed immediately.
    // NOTE(review): assumes limit and offset are integral fields - confirm.
    if (resultCursor >= (long) limit + offset - 1) {
        return null;
    }
    Tuple sourceTuple;
    while (true) {
        // If there's result from current keywordMatcher, return it.
        if ((sourceTuple = keywordMatcher.getNextTuple()) != null) {
            resultCursor++;
            if (resultCursor >= offset) {
                return sourceTuple;
            }
            // Still before the offset: skip this tuple and keep going.
            continue;
        }
        // return null if reach the end of dictionary.
        if ((currentDictionaryEntry = predicate.getDictionary().getNextEntry()) == null) {
            return null;
        }
        // Update the KeywordMatcher with the new dictionary entry.
        keywordMatcher.close();
        keywordPredicate = new KeywordPredicate(currentDictionaryEntry, predicate.getAttributeNames(), predicate.getAnalyzerString(), predicate.getKeywordMatchingType(), predicate.getSpanListName());
        keywordMatcher = new KeywordMatcher(keywordPredicate);
        keywordMatcher.setInputOperator(cacheOperator);
        keywordMatcher.open();
    }
}
Also used : DataFlowException(edu.uci.ics.textdb.api.exception.DataFlowException) KeywordMatcher(edu.uci.ics.textdb.exp.keywordmatcher.KeywordMatcher) Tuple(edu.uci.ics.textdb.api.tuple.Tuple) KeywordPredicate(edu.uci.ics.textdb.exp.keywordmatcher.KeywordPredicate)

Example 15 with DataFlowException

use of edu.uci.ics.textdb.api.exception.DataFlowException in project textdb by TextDB.

the class DictionaryMatcherSourceOperator method open.

/**
 * Opens the dictionary matcher source. Must be called before
 * getNextTuple().
 *
 * Reads the first dictionary entry, then creates and opens the underlying
 * source: a scan-based source for SUBSTRING_SCANBASED matching (the output
 * schema is the scan schema plus a span-list attribute), or an index-based
 * keyword matcher source for the other matching types (the output schema is
 * the keyword source's schema).
 *
 * @throws DataFlowException
 *             if the dictionary has no entry, if the scan schema already
 *             contains the span-list attribute name, or if any other
 *             exception is raised while opening (that exception is attached
 *             as the cause)
 */
@Override
public void open() throws DataFlowException {
    try {
        currentDictionaryEntry = predicate.getDictionary().getNextEntry();
        if (currentDictionaryEntry == null) {
            throw new DataFlowException("Dictionary is empty");
        }
        if (predicate.getKeywordMatchingType() == KeywordMatchingType.SUBSTRING_SCANBASED) {
            // For Substring matching, create a scan source operator.
            indexSource = new ScanBasedSourceOperator(new ScanSourcePredicate(predicate.getTableName()));
            indexSource.open();
            // Substring matching's output schema needs to contain the span
            // list, so the name must not already be taken in the input schema.
            inputSchema = indexSource.getOutputSchema();
            if (inputSchema.containsField(predicate.getSpanListName())) {
                throw new DataFlowException(ErrorMessages.DUPLICATE_ATTRIBUTE(predicate.getSpanListName(), inputSchema));
            }
            outputSchema = Utils.addAttributeToSchema(inputSchema, new Attribute(predicate.getSpanListName(), AttributeType.LIST));
        } else {
            // For other keyword matching types (conjunction and phrase),
            // create keyword matcher based on index.
            keywordSource = new KeywordMatcherSourceOperator(new KeywordSourcePredicate(currentDictionaryEntry, predicate.getAttributeNames(), predicate.getAnalyzerString(), predicate.getKeywordMatchingType(), predicate.getTableName(), predicate.getSpanListName()));
            keywordSource.open();
            // Other keyword matching types use a KeywordMatcher, so the
            // output schema is the same as keywordMatcher's schema
            inputSchema = keywordSource.getOutputSchema();
            outputSchema = keywordSource.getOutputSchema();
        }
    } catch (DataFlowException e) {
        // Already the declared exception type: propagate it unchanged instead
        // of wrapping it in a second DataFlowException with the same message.
        throw e;
    } catch (Exception e) {
        throw new DataFlowException(e.getMessage(), e);
    }
}
Also used : KeywordSourcePredicate(edu.uci.ics.textdb.exp.keywordmatcher.KeywordSourcePredicate) Attribute(edu.uci.ics.textdb.api.schema.Attribute) DataFlowException(edu.uci.ics.textdb.api.exception.DataFlowException) ScanBasedSourceOperator(edu.uci.ics.textdb.exp.source.scan.ScanBasedSourceOperator) ScanSourcePredicate(edu.uci.ics.textdb.exp.source.scan.ScanSourcePredicate) DataFlowException(edu.uci.ics.textdb.api.exception.DataFlowException) TextDBException(edu.uci.ics.textdb.api.exception.TextDBException) KeywordMatcherSourceOperator(edu.uci.ics.textdb.exp.keywordmatcher.KeywordMatcherSourceOperator)

Aggregations

DataFlowException (edu.uci.ics.textdb.api.exception.DataFlowException)34 TextDBException (edu.uci.ics.textdb.api.exception.TextDBException)13 AttributeType (edu.uci.ics.textdb.api.schema.AttributeType)12 Schema (edu.uci.ics.textdb.api.schema.Schema)11 Tuple (edu.uci.ics.textdb.api.tuple.Tuple)10 Attribute (edu.uci.ics.textdb.api.schema.Attribute)8 Span (edu.uci.ics.textdb.api.span.Span)7 ArrayList (java.util.ArrayList)7 SchemaConstants (edu.uci.ics.textdb.api.constants.SchemaConstants)6 List (java.util.List)6 Collectors (java.util.stream.Collectors)6 StorageException (edu.uci.ics.textdb.api.exception.StorageException)5 ListField (edu.uci.ics.textdb.api.field.ListField)5 IOException (java.io.IOException)5 IField (edu.uci.ics.textdb.api.field.IField)4 Utils (edu.uci.ics.textdb.api.utils.Utils)4 AbstractSingleInputOperator (edu.uci.ics.textdb.exp.common.AbstractSingleInputOperator)4 Iterator (java.util.Iterator)4 ErrorMessages (edu.uci.ics.textdb.api.constants.ErrorMessages)3 IOperator (edu.uci.ics.textdb.api.dataflow.IOperator)3