use of edu.uci.ics.texera.api.tuple.Tuple in project textdb by TextDB.
the class DictionaryMatcher method computeNextMatchingTuple.
/**
 * Pulls tuples from the input operator until one of them produces a match.
 *
 * @return the first non-null result of {@link #processOneInputTuple(Tuple)},
 *         or null once the input operator has no more tuples
 * @throws TexeraException if the input operator or the matching step fails
 */
@Override
protected Tuple computeNextMatchingTuple() throws TexeraException {
    for (Tuple candidate = inputOperator.getNextTuple();
            candidate != null;
            candidate = inputOperator.getNextTuple()) {
        Tuple matched = processOneInputTuple(candidate);
        if (matched != null) {
            // stop at the first tuple that yields a result
            return matched;
        }
    }
    // input exhausted without a match
    return null;
}
use of edu.uci.ics.texera.api.tuple.Tuple in project textdb by TextDB.
the class DictionaryMatcher method processOneInputTuple.
/**
 * Matches one input tuple against the dictionary of the predicate, using the
 * keyword matching type configured on the predicate.
 *
 * If {@code addPayload} is set, a payload attribute generated from the tuple is
 * appended before matching. If {@code addResultAttribute} is set, the list of
 * matching spans is appended to the returned tuple under the predicate's span
 * list name.
 *
 * @param inputTuple the tuple to match; may be null
 * @return the (possibly extended) tuple if at least one span matched; null if
 *         the input is null, nothing matched, or the matching type is not one
 *         of the types handled here
 * @throws TexeraException (as DataflowException) if a scanned attribute is
 *         neither STRING nor TEXT
 */
@Override
public Tuple processOneInputTuple(Tuple inputTuple) throws TexeraException {
    if (inputTuple == null) {
        return null;
    }
    // add payload if needed before passing it to the matching functions
    if (addPayload) {
        Tuple.Builder tupleBuilderPayload = new Tuple.Builder(inputTuple);
        tupleBuilderPayload.add(SchemaConstants.PAYLOAD_ATTRIBUTE, new ListField<Span>(DataflowUtils.generatePayloadFromTuple(inputTuple, predicate.getAnalyzerString())));
        inputTuple = tupleBuilderPayload.build();
    }
    KeywordMatchingType matchingType = predicate.getKeywordMatchingType();
    // stays null when the matching type is not handled by any branch below
    List<Span> matchingResults = null;
    if (matchingType == KeywordMatchingType.CONJUNCTION_INDEXBASED) {
        ArrayList<String> dictionaryEntries = predicate.getDictionary().getDictionaryEntries();
        ArrayList<Set<String>> tokenSetsNoStopwords = predicate.getDictionary().getTokenSetsNoStopwords();
        matchingResults = appendConjunctionMatchingSpans4Dictionary(inputTuple, predicate.getAttributeNames(), tokenSetsNoStopwords, dictionaryEntries);
    } else if (matchingType == KeywordMatchingType.PHRASE_INDEXBASED) {
        ArrayList<String> dictionaryEntries = predicate.getDictionary().getDictionaryEntries();
        ArrayList<List<String>> tokenListsNoStopwords = predicate.getDictionary().getTokenListsNoStopwords();
        ArrayList<List<String>> tokenListsWithStopwords = predicate.getDictionary().getTokenListsWithStopwords();
        ArrayList<Set<String>> tokenSetsNoStopwords = predicate.getDictionary().getTokenSetsNoStopwords();
        matchingResults = appendPhraseMatchingSpans4Dictionary(inputTuple, predicate.getAttributeNames(), tokenListsNoStopwords, tokenSetsNoStopwords, tokenListsWithStopwords, dictionaryEntries);
    } else if (matchingType == KeywordMatchingType.SUBSTRING_SCANBASED) {
        matchingResults = new ArrayList<Span>();
        for (String attributeName : predicate.getAttributeNames()) {
            AttributeType attributeType = inputTuple.getSchema().getAttribute(attributeName).getType();
            // types other than TEXT and STRING: throw Exception for now.
            // Validate the type before touching the field value so that an
            // unsupported attribute is reported as such.
            if (attributeType != AttributeType.STRING && attributeType != AttributeType.TEXT) {
                throw new DataflowException("KeywordMatcher: Fields other than STRING and TEXT are not supported yet");
            }
            String fieldValue = inputTuple.getField(attributeName).getValue().toString();
            // one span per dictionary keyword occurrence reported by the trie
            for (ACTrie.Emit emit : dictionaryTrie.parseText(fieldValue)) {
                matchingResults.add(new Span(attributeName, emit.getStart(), emit.getEnd(), emit.getKeyword(), fieldValue.substring(emit.getStart(), emit.getEnd())));
            }
        }
    } else if (matchingType == KeywordMatchingType.REGEX) {
        ArrayList<Pattern> patternList = predicate.getDictionary().getPatternList();
        ArrayList<String> dictionaryEntries = predicate.getDictionary().getDictionaryEntries();
        matchingResults = new ArrayList<>();
        // patternList is indexed in parallel with dictionaryEntries
        for (int i = 0; i < dictionaryEntries.size(); i++) {
            for (String attributeName : predicate.getAttributeNames()) {
                AttributeType attributeType = inputTuple.getSchema().getAttribute(attributeName).getType();
                // types other than TEXT and STRING: throw Exception for now
                if (attributeType != AttributeType.STRING && attributeType != AttributeType.TEXT) {
                    throw new DataflowException("KeywordMatcher: Fields other than STRING and TEXT are not supported yet");
                }
                String fieldValue = inputTuple.getField(attributeName).getValue().toString();
                Matcher javaMatcher = patternList.get(i).matcher(fieldValue);
                while (javaMatcher.find()) {
                    int start = javaMatcher.start();
                    int end = javaMatcher.end();
                    matchingResults.add(new Span(attributeName, start, end, dictionaryEntries.get(i), fieldValue.substring(start, end)));
                }
            }
        }
    }
    // null-safe: an unhandled matching type leaves matchingResults unset,
    // which is treated the same as "no match" instead of failing with an NPE
    if (matchingResults == null || matchingResults.isEmpty()) {
        return null;
    }
    Tuple.Builder tupleBuilder = new Tuple.Builder(inputTuple);
    if (addResultAttribute) {
        tupleBuilder.add(predicate.getSpanListName(), AttributeType.LIST, new ListField<Span>(matchingResults));
    }
    return tupleBuilder.build();
}
use of edu.uci.ics.texera.api.tuple.Tuple in project textdb by TextDB.
the class FuzzyTokenMatcher method processOneInputTuple.
/**
 * Checks one input tuple against the fuzzy token predicate.
 *
 * For every attribute named in the predicate, the payload spans that belong to
 * that attribute and whose key is one of the query tokens are collected; the
 * attribute contributes its spans only if their count reaches the predicate's
 * threshold.
 *
 * @param inputTuple the tuple to match; may be null
 * @return the (possibly extended) tuple if at least one attribute met the
 *         threshold; null if the input is null or nothing matched
 * @throws TexeraException (as DataflowException) if a predicate attribute is
 *         neither TEXT nor STRING
 */
@Override
public Tuple processOneInputTuple(Tuple inputTuple) throws TexeraException {
    // nothing to match; mirrors the null handling of the other matchers
    if (inputTuple == null) {
        return null;
    }
    // add payload if needed before passing it to the matching functions
    if (addPayload) {
        Tuple.Builder tupleBuilderPayload = new Tuple.Builder(inputTuple);
        tupleBuilderPayload.add(SchemaConstants.PAYLOAD_ATTRIBUTE, new ListField<Span>(DataflowUtils.generatePayloadFromTuple(inputTuple, predicate.getLuceneAnalyzerStr())));
        inputTuple = tupleBuilderPayload.build();
    }
    ListField<Span> payloadField = inputTuple.getField(SchemaConstants.PAYLOAD);
    List<Span> relevantSpans = filterRelevantSpans(payloadField.getValue());
    List<Span> matchingResults = new ArrayList<>();
    /*
     * The source operator returns spans even for those fields which did not
     * satisfy the threshold criterion. So if two attributes A,B have 10 and
     * 5 matching tokens, and we set threshold to 10, the number of spans
     * returned is 15. So we need to filter those 5 spans for attribute B.
     */
    for (String attributeName : this.predicate.getAttributeNames()) {
        AttributeType attributeType = this.inputSchema.getAttribute(attributeName).getType();
        // types other than TEXT and STRING: throw Exception for now
        if (attributeType != AttributeType.TEXT && attributeType != AttributeType.STRING) {
            throw new DataflowException("FuzzyTokenMatcher: Fields other than TEXT or STRING are not supported");
        }
        // spans of this attribute whose key is one of the query tokens
        List<Span> fieldSpans = relevantSpans.stream()
                .filter(span -> span.getAttributeName().equals(attributeName)
                        && predicate.getQueryTokens().contains(span.getKey()))
                .collect(Collectors.toList());
        // keep the attribute's spans only when it reaches the threshold on its own
        if (fieldSpans.size() >= predicate.getThreshold()) {
            matchingResults.addAll(fieldSpans);
        }
    }
    if (matchingResults.isEmpty()) {
        return null;
    }
    Tuple.Builder tupleBuilder = new Tuple.Builder(inputTuple);
    if (addResultAttribute) {
        tupleBuilder.add(predicate.getSpanListName(), AttributeType.LIST, new ListField<Span>(matchingResults));
    }
    return tupleBuilder.build();
}
use of edu.uci.ics.texera.api.tuple.Tuple in project textdb by TextDB.
the class Join method computeNextMatchingTuple.
/*
 * Produces the next joined tuple for getNextTuple().
 *
 * Walks the buffered inner tuples for the current outer tuple; when the
 * inner list is used up, advances to the next outer tuple and starts over
 * from the beginning of the inner list.
 *
 * Returns null when the inner list is empty or the outer operator has no
 * more tuples.
 */
private Tuple computeNextMatchingTuple() throws Exception {
    if (innerTupleList.isEmpty()) {
        return null;
    }
    for (;;) {
        boolean innerListExhausted = innerTupleListCursor >= innerTupleList.size();
        if (innerListExhausted) {
            // move on to the next outer tuple
            currentOuterTuple = outerOperator.getNextTuple();
            if (currentOuterTuple == null) {
                return null;
            }
            // restart the inner scan for the new outer tuple
            innerTupleListCursor = 0;
        }
        // try to join the current inner/outer pair, then step past it
        Tuple joined = joinPredicate.joinTuples(innerTupleList.get(innerTupleListCursor), currentOuterTuple, outputSchema);
        innerTupleListCursor++;
        if (joined != null) {
            return joined;
        }
    }
}
use of edu.uci.ics.texera.api.tuple.Tuple in project textdb by TextDB.
the class Join method getNextTuple.
/**
 * Gets the next tuple which is a joint of two tuples which passed the
 * criteria set in the JoinPredicate. <br>
 * Example in JoinPredicate.java
 *
 * On the first call all tuples of the inner operator are buffered in memory
 * and the first outer tuple is fetched. Later calls continue from the current
 * outer tuple and inner cursor position, so no inner/outer pair is skipped.
 * Results before {@code offset} are computed but not returned.
 *
 * @return nextTuple, or null if either input is empty, the outer input is
 *         exhausted, or the limit has been reached
 * @throws TexeraException (as DataflowException) if the operator is not
 *         opened or joining fails
 */
@Override
public Tuple getNextTuple() throws TexeraException {
    if (cursor == CLOSED) {
        throw new DataflowException(ErrorMessages.OPERATOR_NOT_OPENED);
    }
    // first call only: load all tuples from the inner operator into memory
    // and position on the first outer tuple
    if (innerTupleList == null) {
        innerTupleList = new ArrayList<>();
        Tuple tuple;
        while ((tuple = innerOperator.getNextTuple()) != null) {
            innerTupleList.add(tuple);
        }
        // Fetch the outer tuple here, not on every call: re-fetching per call
        // would drop the outer tuple that is still being joined while the
        // inner cursor sits mid-list, skipping join pairs.
        currentOuterTuple = outerOperator.getNextTuple();
        innerTupleListCursor = 0;
    }
    // nothing to join, or all outer tuples have been consumed
    if (innerTupleList.isEmpty() || currentOuterTuple == null) {
        return null;
    }
    // Widen to long so that a large limit plus an offset cannot wrap around
    // (assumes limit/offset are int fields -- declarations not visible here).
    if (limit == 0 || resultCursor >= (long) limit + offset - 1) {
        return null;
    }
    try {
        Tuple resultTuple = null;
        while (true) {
            resultTuple = computeNextMatchingTuple();
            if (resultTuple == null) {
                break;
            }
            resultCursor++;
            // results before the offset are skipped
            if (resultCursor >= offset) {
                break;
            }
        }
        return resultTuple;
    } catch (Exception e) {
        throw new DataflowException(e.getMessage(), e);
    }
}
Aggregations