use of edu.uci.ics.textdb.api.exception.DataFlowException in project textdb by TextDB.
the class SimilarityJoinPredicate method joinTuples.
/**
 * Joins one inner tuple with one outer tuple based on the similarity of
 * the values of their spans.
 *
 * Only spans whose attribute name equals the respective join attribute
 * ({@code innerJoinAttrName} / {@code outerJoinAttrName}) take part. Every
 * distinct inner span value is compared against every distinct outer span
 * value with {@code similarityFunc}; values of pairs that reach
 * {@code similarityThreshold} are kept, and all participating spans
 * carrying a kept value are emitted (inner spans first, then outer spans),
 * each passed through {@code addFieldPrefix}.
 *
 * @param innerTuple   tuple from the inner side; must carry a span list field
 * @param outerTuple   tuple from the outer side; must carry a span list field
 * @param outputSchema schema handed to {@code mergeTuples} for the result
 * @return the result of {@code mergeTuples} for the two tuples and the
 *         matching spans, or {@code null} when the threshold is 0 or no
 *         pair of values is similar enough
 * @throws DataFlowException declared by the overridden method
 */
@Override
public Tuple joinTuples(Tuple innerTuple, Tuple outerTuple, Schema outputSchema) throws DataFlowException {
    // a threshold of exactly 0 never produces a joined tuple
    if (similarityThreshold == 0) {
        return null;
    }

    // restrict each side's span list to the spans of its join attribute
    ListField<Span> innerSpanField = innerTuple.getField(SchemaConstants.SPAN_LIST);
    List<Span> innerJoinSpans = innerSpanField.getValue().stream()
            .filter(candidate -> candidate.getAttributeName().equals(innerJoinAttrName))
            .collect(Collectors.toList());
    ListField<Span> outerSpanField = outerTuple.getField(SchemaConstants.SPAN_LIST);
    List<Span> outerJoinSpans = outerSpanField.getValue().stream()
            .filter(candidate -> candidate.getAttributeName().equals(outerJoinAttrName))
            .collect(Collectors.toList());

    // several spans may share one value, so compare distinct values only
    Set<String> innerValues = innerJoinSpans.stream().map(Span::getValue).collect(Collectors.toSet());
    Set<String> outerValues = outerJoinSpans.stream().map(Span::getValue).collect(Collectors.toSet());

    // collect every value that is part of at least one sufficiently similar pair
    Set<String> similarValues = new HashSet<>();
    for (String innerValue : innerValues) {
        for (String outerValue : outerValues) {
            boolean similarEnough =
                    this.similarityFunc.calculateSimilarity(innerValue, outerValue) >= this.similarityThreshold;
            if (!similarEnough) {
                continue;
            }
            similarValues.add(innerValue);
            similarValues.add(outerValue);
        }
    }

    // nothing was similar: the two tuples do not join
    if (similarValues.isEmpty()) {
        return null;
    }

    // emit the matching spans, inner side first, each tagged with its side's prefix
    List<Span> joinedSpans = new ArrayList<>();
    for (Span innerSpan : innerJoinSpans) {
        if (!similarValues.contains(innerSpan.getValue())) {
            continue;
        }
        joinedSpans.add(addFieldPrefix(innerSpan, INNER_PREFIX));
    }
    for (Span outerSpan : outerJoinSpans) {
        if (!similarValues.contains(outerSpan.getValue())) {
            continue;
        }
        joinedSpans.add(addFieldPrefix(outerSpan, OUTER_PREFIX));
    }

    return mergeTuples(innerTuple, outerTuple, outputSchema, joinedSpans);
}
use of edu.uci.ics.textdb.api.exception.DataFlowException in project textdb by TextDB.
the class KeywordMatcher method computePhraseMatchingResult.
/**
 * Computes the spans at which the predicate's query matches, as a phrase,
 * in the attributes named by the predicate.
 *
 * For a STRING attribute the whole field value must equal the query; one
 * span covering the query length is produced. For a TEXT attribute the
 * payload spans of that attribute are sorted by token offset and scanned
 * for runs of consecutive spans whose values equal the query tokens
 * (ignoring case) and whose relative token offsets equal the relative
 * positions of those tokens in the original query (stopwords included).
 * Each such run yields one combined span from the start of its first span
 * to the end of its last span.
 *
 * @param inputTuple tuple carrying the attribute values and the payload span list
 * @return all matching spans found across the predicate's attributes (possibly empty)
 * @throws DataFlowException if an attribute is neither STRING nor TEXT
 */
private List<Span> computePhraseMatchingResult(Tuple inputTuple) throws DataFlowException {
    ListField<Span> payloadField = inputTuple.getField(SchemaConstants.PAYLOAD);
    List<Span> payload = payloadField.getValue();
    // presumably keeps only the payload spans that correspond to query tokens - confirm in filterRelevantSpans
    List<Span> relevantSpans = filterRelevantSpans(payload);
    List<Span> matchingResults = new ArrayList<>();

    for (String attributeName : this.predicate.getAttributeNames()) {
        AttributeType attributeType = this.inputSchema.getAttribute(attributeName).getAttributeType();

        // types other than TEXT and STRING: throw Exception for now.
        // Checked before the field value is read, so an unsupported attribute is
        // reported as such instead of failing while its value is stringified.
        if (attributeType != AttributeType.STRING && attributeType != AttributeType.TEXT) {
            throw new DataFlowException("KeywordMatcher: Fields other than STRING and TEXT are not supported yet");
        }

        String fieldValue = inputTuple.getField(attributeName).getValue().toString();

        // for STRING type, the query should match the fieldValue completely
        if (attributeType == AttributeType.STRING) {
            if (fieldValue.equals(predicate.getQuery())) {
                matchingResults.add(new Span(attributeName, 0, predicate.getQuery().length(), predicate.getQuery(), fieldValue));
            }
            continue;
        }

        // phrase query on a TEXT attribute
        List<Span> fieldSpanList = relevantSpans.stream()
                .filter(span -> span.getAttributeName().equals(attributeName))
                .collect(Collectors.toList());

        // the phrase cannot occur unless every query token is present in this field's spans
        if (!isAllQueryTokensPresent(fieldSpanList, queryTokenSet)) {
            continue;
        }

        // Sort current field's span list by token offset for later use.
        // Integer.compare avoids the overflow that a subtraction-based comparator can hit.
        fieldSpanList.sort((left, right) -> Integer.compare(left.getTokenOffset(), right.getTokenOffset()));

        // positions, within the full query (stopwords included), of the tokens that are in queryTokenList
        List<Integer> queryTokenOffset = new ArrayList<>();
        for (int i = 0; i < queryTokensWithStopwords.size(); i++) {
            if (queryTokenList.contains(queryTokensWithStopwords.get(i))) {
                queryTokenOffset.add(i);
            }
        }

        int phraseLength = queryTokenList.size();
        // index in fieldSpanList of the span currently tried as the start of the phrase
        int iter = 0;
        // stop as soon as fewer than phraseLength spans remain
        while (iter < fieldSpanList.size() && iter <= fieldSpanList.size() - phraseLength) {
            // Verify that the spans starting at iter correspond to the phrase: for every
            // adjacent pair, the token-offset distance must equal the distance of the two
            // tokens in the query, and both values must equal the query tokens.
            // NOTE(review): for a single-token query this loop body never runs, so every
            // span in fieldSpanList is reported; this relies on the earlier filtering.
            boolean isMismatchInSpan = false;
            for (int i = 0; i < phraseLength - 1; i++) {
                Span first = fieldSpanList.get(iter + i);
                Span second = fieldSpanList.get(iter + i + 1);
                boolean pairMatches =
                        second.getTokenOffset() - first.getTokenOffset() == queryTokenOffset.get(i + 1) - queryTokenOffset.get(i)
                        && first.getValue().equalsIgnoreCase(queryTokenList.get(i))
                        && second.getValue().equalsIgnoreCase(queryTokenList.get(i + 1));
                if (!pairMatches) {
                    isMismatchInSpan = true;
                    break;
                }
            }
            if (isMismatchInSpan) {
                // no phrase starts here; try the next span as the start
                iter++;
                continue;
            }

            // the phrase spans from the first matched token's start to the last matched token's end
            int combinedSpanStartIndex = fieldSpanList.get(iter).getStart();
            int combinedSpanEndIndex = fieldSpanList.get(iter + phraseLength - 1).getEnd();
            Span combinedSpan = new Span(attributeName, combinedSpanStartIndex, combinedSpanEndIndex, predicate.getQuery(), fieldValue.substring(combinedSpanStartIndex, combinedSpanEndIndex));
            matchingResults.add(combinedSpan);
            // continue the scan after the spans consumed by this match
            iter = iter + phraseLength;
        }
    }
    return matchingResults;
}
use of edu.uci.ics.textdb.api.exception.DataFlowException in project textdb by TextDB.
the class Join method getNextTuple.
/**
 * Gets the next tuple which is a joint of two tuples which passed the
 * criteria set in the JoinPredicate. <br>
 * Example in JoinPredicate.java
 *
 * On the first call all tuples of the inner operator are loaded into
 * memory. Results are counted in {@code resultCursor}; results before
 * {@code offset} are skipped and no result is produced once
 * {@code limit} results past the offset have been returned.
 *
 * @return nextTuple, or {@code null} if the inner operator produced no
 *         tuples, the outer operator is exhausted, the limit has been
 *         reached, or no further matching tuple exists
 * @throws TextDBException if the operator has not been opened, or if
 *         computing the next matching tuple fails (the cause is wrapped
 *         in a DataFlowException)
 */
@Override
public Tuple getNextTuple() throws TextDBException {
    if (cursor == CLOSED) {
        throw new DataFlowException(ErrorMessages.OPERATOR_NOT_OPENED);
    }
    // load all tuples from inner operator into memory in the first time
    if (innerTupleList == null) {
        innerTupleList = new ArrayList<>();
        Tuple tuple;
        while ((tuple = innerOperator.getNextTuple()) != null) {
            innerTupleList.add(tuple);
        }
    }
    // load an outer tuple
    // NOTE(review): this pulls a new outer tuple on every call, not only on the
    // first one; confirm against computeNextMatchingTuple() that an outer tuple
    // which still has unvisited inner tuples is not dropped here.
    currentOuterTuple = outerOperator.getNextTuple();
    // nothing to join: the inner side is empty or the outer side is exhausted
    if (innerTupleList.isEmpty() || currentOuterTuple == null) {
        return null;
    }
    // The bound is computed in long arithmetic: with a large limit (for example
    // Integer.MAX_VALUE) and a positive offset, an int sum would wrap around to a
    // negative value and make this check reject every call.
    if (resultCursor >= (long) limit + offset - 1 || limit == 0) {
        return null;
    }
    try {
        Tuple resultTuple;
        // keep producing matches until one at or past the offset is found,
        // or until there are no more matches
        do {
            resultTuple = computeNextMatchingTuple();
            if (resultTuple != null) {
                resultCursor++;
            }
        } while (resultTuple != null && resultCursor < offset);
        return resultTuple;
    } catch (Exception e) {
        throw new DataFlowException(e.getMessage(), e);
    }
}
use of edu.uci.ics.textdb.api.exception.DataFlowException in project textdb by TextDB.
the class JoinDistancePredicate method generateIntersectionSchema.
/**
 * Builds the output schema as the intersection of the two operator schemas.
 * An attribute is kept only if the outer schema's attribute list contains
 * an attribute equal to it, so name and type both have to agree.
 *
 * The intersection is accepted only if it:
 *   is not empty,
 *   contains the join attribute,
 *   contains the "_ID" attribute,
 *   contains the "spanList" attribute,
 * and the join attribute is of type TEXT or STRING.
 *
 * @param innerOperatorSchema schema of the inner operator
 * @param outerOperatorSchema schema of the outer operator
 * @return outputSchema, the validated intersection schema
 * @throws DataFlowException if any of the requirements above is violated
 */
private Schema generateIntersectionSchema(Schema innerOperatorSchema, Schema outerOperatorSchema) throws DataFlowException {
    List<Attribute> innerAttrs = innerOperatorSchema.getAttributes();
    List<Attribute> outerAttrs = outerOperatorSchema.getAttributes();

    // keep the inner attributes that also appear on the outer side
    List<Attribute> sharedAttrs = innerAttrs.stream()
            .filter(candidate -> outerAttrs.contains(candidate))
            .collect(Collectors.toList());
    Schema sharedSchema = new Schema(sharedAttrs.toArray(new Attribute[0]));

    // the shared schema must carry everything the join relies on
    if (sharedSchema.getAttributes().isEmpty()) {
        throw new DataFlowException("inner operator and outer operator don't share any common attributes");
    }
    Attribute joinAttribute = sharedSchema.getAttribute(this.joinAttributeName);
    if (joinAttribute == null) {
        throw new DataFlowException("inner operator or outer operator doesn't contain join attribute");
    }
    if (sharedSchema.getAttribute(SchemaConstants._ID) == null) {
        throw new DataFlowException("inner operator or outer operator doesn't contain _ID attribute");
    }
    if (sharedSchema.getAttribute(SchemaConstants.SPAN_LIST) == null) {
        throw new DataFlowException("inner operator or outer operator doesn't contain spanList attribute");
    }

    // only textual attributes can be joined on
    AttributeType joinType = joinAttribute.getAttributeType();
    boolean isTextual = joinType == AttributeType.TEXT || joinType == AttributeType.STRING;
    if (!isTextual) {
        throw new DataFlowException(String.format("Join attribute %s must be either TEXT or STRING.", this.joinAttributeName));
    }
    return sharedSchema;
}
use of edu.uci.ics.textdb.api.exception.DataFlowException in project textdb by TextDB.
the class AbstractSink method processTuples.
/**
 * Drains the input operator: every tuple it produces is handed to
 * {@code processOneTuple}, and {@code cursor} is advanced by one after
 * each processed tuple. Stops when the input operator returns
 * {@code null}.
 *
 * @throws TextDBException if this sink has not been opened, or if reading
 *         or processing a tuple fails
 */
@Override
public void processTuples() throws TextDBException {
    if (cursor == CLOSED) {
        throw new DataFlowException(ErrorMessages.OPERATOR_NOT_OPENED);
    }
    for (Tuple current = inputOperator.getNextTuple(); current != null; current = inputOperator.getNextTuple()) {
        processOneTuple(current);
        cursor++;
    }
}
Aggregations