use of edu.uci.ics.textdb.api.exception.DataFlowException in project textdb by TextDB.
the class ProjectionOperator method setUp.
/**
 * Reads the input operator's schema and builds the projected output schema.
 *
 * An input attribute is kept when its lower-cased name is contained in the
 * predicate's projection fields. The kept attributes follow the order of the
 * input schema.
 *
 * @throws TextDBException
 *             a DataFlowException is thrown when the number of kept
 *             attributes differs from the number of projection fields
 */
@Override
protected void setUp() throws TextDBException {
    inputSchema = inputOperator.getOutputSchema();

    List<Attribute> projected = inputSchema.getAttributes()
            .stream()
            .filter(candidate -> {
                String normalizedName = candidate.getAttributeName().toLowerCase();
                return predicate.getProjectionFields().contains(normalizedName);
            })
            .collect(Collectors.toList());

    // A size mismatch means at least one requested field was not matched above.
    boolean everyFieldResolved = projected.size() == predicate.getProjectionFields().size();
    if (!everyFieldResolved) {
        throw new DataFlowException("input schema doesn't contain one of the attributes to be projected");
    }

    outputSchema = new Schema(projected.toArray(new Attribute[0]));
}
use of edu.uci.ics.textdb.api.exception.DataFlowException in project textdb by TextDB.
the class RegexMatcher method processOneInputTuple.
/**
 * Runs the configured regex engine over every attribute named in the
 * predicate and adds the resulting spans to the tuple's span list.
 *
 * For example, given tuple ("george watson", "graduate student", 23,
 * "(949)888-8888") and regex "g[^\s]*", the spans collected are
 * [Span(name, 0, 6, "g[^\s]*", "george watson"), Span(position, 0, 8,
 * "g[^\s]*", "graduate student")]
 *
 * @param inputTuple
 *            tuple in which the search is performed; may be null
 * @return the given tuple after the matching spans have been added to the
 *         list held by its span-list field, or null when the input is null
 *         or no attribute produced a match
 * @throws DataFlowException
 *             if one of the predicate's attributes is neither STRING nor
 *             TEXT
 */
@Override
public Tuple processOneInputTuple(Tuple inputTuple) throws DataFlowException {
    if (inputTuple == null) {
        return null;
    }

    List<Span> matchingResults = new ArrayList<>();
    for (String attributeName : predicate.getAttributeNames()) {
        AttributeType attributeType = inputSchema.getAttribute(attributeName).getAttributeType();

        // Types other than TEXT and STRING: throw for now. The check runs
        // before the field value is read so that an unsupported attribute is
        // always reported with this error rather than failing while its
        // value is being converted to a string.
        if (attributeType != AttributeType.STRING && attributeType != AttributeType.TEXT) {
            throw new DataFlowException("RegexMatcher: Fields other than STRING and TEXT are not supported yet");
        }

        String fieldValue = inputTuple.getField(attributeName).getValue().toString();
        switch (regexEngine) {
        case JavaRegex:
            matchingResults.addAll(javaRegexMatch(fieldValue, attributeName));
            break;
        case RE2J:
            matchingResults.addAll(re2jRegexMatch(fieldValue, attributeName));
            break;
        }
    }

    // A tuple without any match is dropped by returning null.
    if (matchingResults.isEmpty()) {
        return null;
    }

    // The spans are added to the list returned by the span-list field, so the
    // input tuple itself is what carries the result.
    ListField<Span> spanListField = inputTuple.getField(predicate.getSpanListName());
    List<Span> spanList = spanListField.getValue();
    spanList.addAll(matchingResults);
    return inputTuple;
}
use of edu.uci.ics.textdb.api.exception.DataFlowException in project textdb by TextDB.
the class DictionaryMatcher method open.
/**
 * Opens this operator. The call does nothing unless the cursor is CLOSED.
 *
 * The dictionary cursor is reset and an entry is read from it; a
 * KeywordMatcher for that entry is then created on top of a
 * DictionaryTupleCacheOperator that wraps the input operator. After both have
 * been opened, the output schema is taken from the keyword matcher and the
 * cursor is set to OPENED.
 *
 * @throws DataFlowException
 *             if no input operator is set, if the dictionary yields no
 *             entry, or if any step of the setup throws; every exception
 *             raised during setup is wrapped in a new DataFlowException
 *             carrying the same message
 */
@Override
public void open() throws DataFlowException {
    if (cursor == CLOSED) {
        try {
            if (inputOperator == null) {
                throw new DataFlowException(ErrorMessages.INPUT_OPERATOR_NOT_SPECIFIED);
            }

            // Reset the dictionary cursor and read an entry from it.
            predicate.getDictionary().resetCursor();
            currentDictionaryEntry = predicate.getDictionary().getNextEntry();
            if (currentDictionaryEntry == null) {
                throw new DataFlowException("Dictionary is empty");
            }

            keywordPredicate = new KeywordPredicate(
                    currentDictionaryEntry,
                    predicate.getAttributeNames(),
                    predicate.getAnalyzerString(),
                    predicate.getKeywordMatchingType(),
                    predicate.getSpanListName());
            keywordMatcher = new KeywordMatcher(keywordPredicate);

            // The keyword matcher reads from the cache operator, which in
            // turn reads from the real input operator.
            cacheOperator = new DictionaryTupleCacheOperator();
            cacheOperator.setInputOperator(inputOperator);
            keywordMatcher.setInputOperator(cacheOperator);

            cacheOperator.openAll();
            keywordMatcher.open();
            outputSchema = keywordMatcher.getOutputSchema();
        } catch (Exception failure) {
            throw new DataFlowException(failure.getMessage(), failure);
        }
        cursor = OPENED;
    }
}
use of edu.uci.ics.textdb.api.exception.DataFlowException in project textdb by TextDB.
the class DictionaryMatcher method getNextTuple.
/**
 * Returns the next matching tuple, or null when no more results are
 * available.
 *
 * Tuples are pulled from the current keyword matcher. When it returns null,
 * the next dictionary entry is fetched, the current keyword matcher is closed
 * and a new KeywordMatcher for that entry is opened on top of the cache
 * operator. Results whose counter value is still below offset are skipped.
 *
 * @return the next tuple produced by a keyword matcher, or null when the
 *         limit has been reached or the dictionary has no further entry
 * @throws TextDBException
 *             a DataFlowException is thrown when the operator has not been
 *             opened; exceptions from the underlying keyword matcher
 *             propagate unchanged
 */
@Override
public Tuple getNextTuple() throws TextDBException {
    if (cursor == CLOSED) {
        throw new DataFlowException(ErrorMessages.OPERATOR_NOT_OPENED);
    }

    // Evaluate the bound in long arithmetic: in int arithmetic
    // limit + offset wraps to a negative value when limit is close to
    // Integer.MAX_VALUE and offset is positive, which made this check
    // succeed immediately and the operator return no results at all.
    // NOTE(review): the "- 1" suggests resultCursor starts at -1 -- confirm
    // against the field initializer.
    long lastResultIndex = (long) limit + offset - 1;
    if (resultCursor >= lastResultIndex) {
        return null;
    }

    Tuple sourceTuple;
    while (true) {
        // If there's a result from the current keywordMatcher, count it and
        // return it unless it still falls inside the skipped offset range.
        if ((sourceTuple = keywordMatcher.getNextTuple()) != null) {
            resultCursor++;
            if (resultCursor >= offset) {
                return sourceTuple;
            }
            continue;
        }

        // Return null if the end of the dictionary is reached.
        if ((currentDictionaryEntry = predicate.getDictionary().getNextEntry()) == null) {
            return null;
        }

        // Replace the KeywordMatcher with one for the new dictionary entry;
        // it reads from the same cache operator as the previous one.
        keywordMatcher.close();
        keywordPredicate = new KeywordPredicate(
                currentDictionaryEntry,
                predicate.getAttributeNames(),
                predicate.getAnalyzerString(),
                predicate.getKeywordMatchingType(),
                predicate.getSpanListName());
        keywordMatcher = new KeywordMatcher(keywordPredicate);
        keywordMatcher.setInputOperator(cacheOperator);
        keywordMatcher.open();
    }
}
use of edu.uci.ics.textdb.api.exception.DataFlowException in project textdb by TextDB.
the class DictionaryMatcherSourceOperator method open.
/**
 * Opens the dictionary matcher. Must be called before getNextTuple().
 *
 * An entry is read from the dictionary first. For the SUBSTRING_SCANBASED
 * matching type a scan source over the predicate's table is opened and the
 * span-list attribute is added to its schema; for every other matching type
 * a KeywordMatcherSourceOperator for the entry is opened and its schema is
 * used as both input and output schema.
 *
 * @throws DataFlowException
 *             if the dictionary yields no entry, if the scan source's schema
 *             already contains the span-list attribute, or if any step of
 *             the setup throws; every exception raised during setup is
 *             wrapped in a new DataFlowException carrying the same message
 */
@Override
public void open() throws DataFlowException {
    try {
        currentDictionaryEntry = predicate.getDictionary().getNextEntry();
        if (currentDictionaryEntry == null) {
            throw new DataFlowException("Dictionary is empty");
        }

        if (predicate.getKeywordMatchingType() != KeywordMatchingType.SUBSTRING_SCANBASED) {
            // Matching types other than scan-based substring (conjunction
            // and phrase) are served by an index-based keyword source.
            KeywordSourcePredicate sourcePredicate = new KeywordSourcePredicate(
                    currentDictionaryEntry,
                    predicate.getAttributeNames(),
                    predicate.getAnalyzerString(),
                    predicate.getKeywordMatchingType(),
                    predicate.getTableName(),
                    predicate.getSpanListName());
            keywordSource = new KeywordMatcherSourceOperator(sourcePredicate);
            keywordSource.open();

            // The keyword source's schema is used for both input and output.
            inputSchema = keywordSource.getOutputSchema();
            outputSchema = keywordSource.getOutputSchema();
        } else {
            // Substring matching reads every tuple through a scan source.
            ScanSourcePredicate scanPredicate = new ScanSourcePredicate(predicate.getTableName());
            indexSource = new ScanBasedSourceOperator(scanPredicate);
            indexSource.open();

            inputSchema = indexSource.getOutputSchema();
            outputSchema = inputSchema;

            // The span list is added by this operator, so the scanned schema
            // must not already contain an attribute of that name.
            if (inputSchema.containsField(predicate.getSpanListName())) {
                throw new DataFlowException(ErrorMessages.DUPLICATE_ATTRIBUTE(predicate.getSpanListName(), inputSchema));
            }

            Attribute spanListAttribute = new Attribute(predicate.getSpanListName(), AttributeType.LIST);
            outputSchema = Utils.addAttributeToSchema(outputSchema, spanListAttribute);
        }
    } catch (Exception failure) {
        throw new DataFlowException(failure.getMessage(), failure);
    }
}
Aggregations