Use of edu.uci.ics.texera.api.exception.DataflowException in project textdb by TextDB.
The class JoinTestHelper, method getRegexMatcher:
public static RegexMatcher getRegexMatcher(String tableName, String query, String attrName) {
    try {
        // scan the source table and feed its tuples into a regex matcher over the given attribute
        ScanBasedSourceOperator scanBasedSourceOperator = new ScanBasedSourceOperator(new ScanSourcePredicate(tableName));
        RegexMatcher regexMatcher = new RegexMatcher(new RegexPredicate(query, Arrays.asList(attrName), SchemaConstants.SPAN_LIST));
        regexMatcher.setInputOperator(scanBasedSourceOperator);
        return regexMatcher;
    } catch (DataflowException e) {
        e.printStackTrace();
        return null;
    }
}
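A minimal sketch of how a test might consume the returned matcher, assuming the standard Texera operator lifecycle of open(), getNextTuple(), and close(); the table name, regex, and attribute name below are hypothetical:

// Hypothetical usage only: the table name, regex, and attribute name are illustrative.
RegexMatcher regexMatcher = JoinTestHelper.getRegexMatcher("testTable", "special", "content");
if (regexMatcher != null) {
    regexMatcher.open();
    Tuple tuple;
    while ((tuple = regexMatcher.getNextTuple()) != null) {
        // matched spans are attached under SchemaConstants.SPAN_LIST, per the predicate above
        System.out.println(tuple);
    }
    regexMatcher.close();
}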
Use of edu.uci.ics.texera.api.exception.DataflowException in project textdb by TextDB.
The class DictionaryMatcher, method appendPhraseMatchingSpans4Dictionary:
public List<Span> appendPhraseMatchingSpans4Dictionary(Tuple inputTuple, List<String> attributeNames, List<List<String>> queryTokenList, List<Set<String>> queryTokenSetList, List<List<String>> queryTokenListWithStopwords, List<String> queryList) throws DataflowException {
    List<Span> matchingResults = new ArrayList<>();
    ListField<Span> payloadField = inputTuple.getField(SchemaConstants.PAYLOAD);
    List<Span> payload = payloadField.getValue();
    Map<Integer, List<Span>> relevantSpansMap = filterRelevantSpans(payload, queryTokenSetList);
    for (String attributeName : attributeNames) {
        AttributeType attributeType = inputTuple.getSchema().getAttribute(attributeName).getType();
        String fieldValue = inputTuple.getField(attributeName).getValue().toString();
        // types other than TEXT and STRING: throw Exception for now
        if (attributeType != AttributeType.STRING && attributeType != AttributeType.TEXT) {
            throw new DataflowException("KeywordMatcher: Fields other than STRING and TEXT are not supported yet");
        }
        // for STRING type, the query should match the fieldValue completely
        if (attributeType == AttributeType.STRING) {
            if (queryList.contains(fieldValue)) {
                Span span = new Span(attributeName, 0, fieldValue.length(), fieldValue, fieldValue);
                matchingResults.add(span);
            }
        }
        // for TEXT type, spans need to be reconstructed according to the phrase query.
        if (attributeType == AttributeType.TEXT) {
            for (int index : relevantSpansMap.keySet()) {
                List<Span> fieldSpanList = relevantSpansMap.get(index).stream()
                        .filter(span -> span.getAttributeName().equals(attributeName))
                        .collect(Collectors.toList());
                if (fieldSpanList.isEmpty() || !DataflowUtils.isAllQueryTokensPresent(fieldSpanList, queryTokenSetList.get(index))) {
                    continue;
                }
                matchingResults.addAll(DataflowUtils.constructPhraseMatchingSpans(attributeName, fieldValue, queryList.get(index), fieldSpanList, queryTokenListWithStopwords.get(index), queryTokenList.get(index)));
            }
        }
    }
    return matchingResults;
}
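The four collection parameters are parallel, one element per dictionary entry. A minimal sketch of how they might line up for a single hypothetical entry (the tokenization shown is an assumption, since it depends on the analyzer; java.util imports are assumed):

// Illustrative only: parallel per-entry lists for the hypothetical entry "new york city".
List<String> queryList = Arrays.asList("new york city");
List<List<String>> queryTokenList = Arrays.asList(Arrays.asList("new", "york", "city"));
List<Set<String>> queryTokenSetList = Arrays.asList(new HashSet<>(Arrays.asList("new", "york", "city")));
List<List<String>> queryTokenListWithStopwords = Arrays.asList(Arrays.asList("new", "york", "city"));
// index i in each list describes the same dictionary entry queryList.get(i)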
Use of edu.uci.ics.texera.api.exception.DataflowException in project textdb by TextDB.
The class DictionaryMatcher, method processOneInputTuple:
@Override
public Tuple processOneInputTuple(Tuple inputTuple) throws TexeraException {
    if (inputTuple == null) {
        return null;
    }
    // add payload if needed before passing it to the matching functions
    if (addPayload) {
        Tuple.Builder tupleBuilderPayload = new Tuple.Builder(inputTuple);
        tupleBuilderPayload.add(SchemaConstants.PAYLOAD_ATTRIBUTE, new ListField<Span>(DataflowUtils.generatePayloadFromTuple(inputTuple, predicate.getAnalyzerString())));
        inputTuple = tupleBuilderPayload.build();
    }
    List<Span> matchingResults = null;
    if (predicate.getKeywordMatchingType() == KeywordMatchingType.CONJUNCTION_INDEXBASED) {
        ArrayList<String> dictionaryEntries = predicate.getDictionary().getDictionaryEntries();
        ArrayList<Set<String>> tokenSetsNoStopwords = predicate.getDictionary().getTokenSetsNoStopwords();
        matchingResults = appendConjunctionMatchingSpans4Dictionary(inputTuple, predicate.getAttributeNames(), tokenSetsNoStopwords, dictionaryEntries);
    } else if (predicate.getKeywordMatchingType() == KeywordMatchingType.PHRASE_INDEXBASED) {
        ArrayList<String> dictionaryEntries = predicate.getDictionary().getDictionaryEntries();
        ArrayList<List<String>> tokenListsNoStopwords = predicate.getDictionary().getTokenListsNoStopwords();
        ArrayList<List<String>> tokenListsWithStopwords = predicate.getDictionary().getTokenListsWithStopwords();
        ArrayList<Set<String>> tokenSetsNoStopwords = predicate.getDictionary().getTokenSetsNoStopwords();
        matchingResults = appendPhraseMatchingSpans4Dictionary(inputTuple, predicate.getAttributeNames(), tokenListsNoStopwords, tokenSetsNoStopwords, tokenListsWithStopwords, dictionaryEntries);
    } else if (predicate.getKeywordMatchingType() == KeywordMatchingType.SUBSTRING_SCANBASED) {
        matchingResults = new ArrayList<Span>();
        for (String attributeName : predicate.getAttributeNames()) {
            AttributeType attributeType = inputTuple.getSchema().getAttribute(attributeName).getType();
            String fieldValue = inputTuple.getField(attributeName).getValue().toString();
            // types other than TEXT and STRING: throw Exception for now
            if (attributeType != AttributeType.STRING && attributeType != AttributeType.TEXT) {
                throw new DataflowException("KeywordMatcher: Fields other than STRING and TEXT are not supported yet");
            }
            List<ACTrie.Emit> matchingEmits = dictionaryTrie.parseText(fieldValue);
            if (!matchingEmits.isEmpty()) {
                for (ACTrie.Emit emit : matchingEmits) {
                    matchingResults.add(new Span(attributeName, emit.getStart(), emit.getEnd(), emit.getKeyword(), fieldValue.substring(emit.getStart(), emit.getEnd())));
                }
            }
        }
    } else if (predicate.getKeywordMatchingType() == KeywordMatchingType.REGEX) {
        ArrayList<Pattern> patternList = predicate.getDictionary().getPatternList();
        ArrayList<String> dictionaryEntries = predicate.getDictionary().getDictionaryEntries();
        matchingResults = new ArrayList<>();
        for (int i = 0; i < dictionaryEntries.size(); i++) {
            for (String attributeName : predicate.getAttributeNames()) {
                AttributeType attributeType = inputTuple.getSchema().getAttribute(attributeName).getType();
                String fieldValue = inputTuple.getField(attributeName).getValue().toString();
                // types other than TEXT and STRING: throw Exception for now
                if (attributeType != AttributeType.STRING && attributeType != AttributeType.TEXT) {
                    throw new DataflowException("KeywordMatcher: Fields other than STRING and TEXT are not supported yet");
                }
                Matcher javaMatcher = patternList.get(i).matcher(fieldValue);
                while (javaMatcher.find()) {
                    int start = javaMatcher.start();
                    int end = javaMatcher.end();
                    matchingResults.add(new Span(attributeName, start, end, dictionaryEntries.get(i), fieldValue.substring(start, end)));
                }
            }
        }
    }
    if (matchingResults.isEmpty()) {
        return null;
    }
    Tuple.Builder tupleBuilder = new Tuple.Builder(inputTuple);
    if (addResultAttribute) {
        tupleBuilder.add(predicate.getSpanListName(), AttributeType.LIST, new ListField<Span>(matchingResults));
    }
    return tupleBuilder.build();
}
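A sketch of how the output tuple produced above might be inspected, assuming addResultAttribute is set so the span list is stored under predicate.getSpanListName(); the variable names are illustrative:

// Hypothetical inspection of the result; dictionaryMatcher, inputTuple, and predicate are assumed to exist.
Tuple outputTuple = dictionaryMatcher.processOneInputTuple(inputTuple);
if (outputTuple != null) {
    ListField<Span> spanListField = outputTuple.getField(predicate.getSpanListName());
    for (Span span : spanListField.getValue()) {
        // each span records the attribute it was found in and the matched dictionary entry
        System.out.println(span.getAttributeName() + " -> " + span.getKey());
    }
}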
Use of edu.uci.ics.texera.api.exception.DataflowException in project textdb by TextDB.
The class FuzzyTokenMatcher, method processOneInputTuple:
@Override
public Tuple processOneInputTuple(Tuple inputTuple) throws TexeraException {
    // add payload if needed before passing it to the matching functions
    if (addPayload) {
        Tuple.Builder tupleBuilderPayload = new Tuple.Builder(inputTuple);
        tupleBuilderPayload.add(SchemaConstants.PAYLOAD_ATTRIBUTE, new ListField<Span>(DataflowUtils.generatePayloadFromTuple(inputTuple, predicate.getLuceneAnalyzerStr())));
        inputTuple = tupleBuilderPayload.build();
    }
    ListField<Span> payloadField = inputTuple.getField(SchemaConstants.PAYLOAD);
    List<Span> relevantSpans = filterRelevantSpans(payloadField.getValue());
    List<Span> matchingResults = new ArrayList<>();
    /*
     * The source operator returns spans even for those fields which did not
     * satisfy the threshold criterion. So if two attributes A and B have 10 and
     * 5 matching tokens, and the threshold is set to 10, the number of spans
     * returned is 15. So we need to filter out those 5 spans for attribute B.
     */
    for (String attributeName : this.predicate.getAttributeNames()) {
        AttributeType attributeType = this.inputSchema.getAttribute(attributeName).getType();
        // types other than TEXT and STRING: throw Exception for now
        if (attributeType != AttributeType.TEXT && attributeType != AttributeType.STRING) {
            throw new DataflowException("FuzzyTokenMatcher: Fields other than TEXT or STRING are not supported");
        }
        List<Span> fieldSpans = relevantSpans.stream()
                .filter(span -> span.getAttributeName().equals(attributeName))
                .filter(span -> predicate.getQueryTokens().contains(span.getKey()))
                .collect(Collectors.toList());
        if (fieldSpans.size() >= predicate.getThreshold()) {
            matchingResults.addAll(fieldSpans);
        }
    }
    if (matchingResults.isEmpty()) {
        return null;
    }
    Tuple.Builder tupleBuilder = new Tuple.Builder(inputTuple);
    if (addResultAttribute) {
        tupleBuilder.add(predicate.getSpanListName(), AttributeType.LIST, new ListField<Span>(matchingResults));
    }
    return tupleBuilder.build();
}
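A worked illustration of the threshold rule described in the comment above, using only hypothetical per-attribute counts (java.util and java.util.stream imports assumed):

// Hypothetical counts: attribute A has 10 matching token spans, B has 5; the threshold is 10.
int threshold = 10;
Map<String, Integer> matchingTokenCounts = new HashMap<>();
matchingTokenCounts.put("A", 10);
matchingTokenCounts.put("B", 5);
List<String> attributesKept = matchingTokenCounts.entrySet().stream()
        .filter(entry -> entry.getValue() >= threshold)
        .map(Map.Entry::getKey)
        .collect(Collectors.toList());
// attributesKept contains only "A": B's 5 spans are filtered out, matching the logic above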
Use of edu.uci.ics.texera.api.exception.DataflowException in project textdb by TextDB.
The class Join, method getNextTuple:
/**
 * Gets the next tuple, which is a join of two tuples that passed the
 * criteria set in the JoinPredicate. <br>
 * Example in JoinPredicate.java
 *
 * @return nextTuple
 */
@Override
public Tuple getNextTuple() throws TexeraException {
    if (cursor == CLOSED) {
        throw new DataflowException(ErrorMessages.OPERATOR_NOT_OPENED);
    }
    // the first time through, load all tuples from the inner operator into memory
    if (innerTupleList == null) {
        innerTupleList = new ArrayList<>();
        Tuple tuple;
        while ((tuple = innerOperator.getNextTuple()) != null) {
            innerTupleList.add(tuple);
        }
    }
    // load the next outer tuple
    currentOuterTuple = outerOperator.getNextTuple();
    // stop if the inner table is empty or all outer tuples have been consumed
    if (innerTupleList.isEmpty() || currentOuterTuple == null) {
        return null;
    }
    if (resultCursor >= limit + offset - 1 || limit == 0) {
        return null;
    }
    try {
        Tuple resultTuple = null;
        while (true) {
            resultTuple = computeNextMatchingTuple();
            if (resultTuple == null) {
                break;
            }
            resultCursor++;
            if (resultCursor >= offset) {
                break;
            }
        }
        return resultTuple;
    } catch (Exception e) {
        throw new DataflowException(e.getMessage(), e);
    }
}
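A sketch of how this operator might be wired up in a test, reusing the JoinTestHelper shown earlier; the Join constructor argument and the two input-setter names are assumptions, since they do not appear in the excerpt above:

// Hypothetical wiring; setInnerInputOperator/setOuterInputOperator are assumed setter names.
Join join = new Join(joinPredicate);            // joinPredicate construction omitted
join.setOuterInputOperator(outerRegexMatcher);  // e.g. from JoinTestHelper.getRegexMatcher(...)
join.setInnerInputOperator(innerRegexMatcher);
join.open();
Tuple joinedTuple;
while ((joinedTuple = join.getNextTuple()) != null) {
    // each result tuple satisfied the criteria in the JoinPredicate
    System.out.println(joinedTuple);
}
join.close();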