Use of edu.uci.ics.texera.api.exception.DataflowException in project textdb by TextDB.
Class Join, method open.
/**
 * Opens this join operator.
 *
 * Does nothing when the cursor is in any state other than CLOSED. Otherwise
 * both input operators are opened (inner first, then outer), the output
 * schema is derived from their schemas via the join predicate, and the
 * cursor is moved to OPENED.
 *
 * @throws DataflowException if the inner or the outer input operator is not set
 */
@Override
public void open() throws TexeraException {
    // Only a closed operator needs to be opened.
    if (cursor != CLOSED) {
        return;
    }

    // Both inputs are mandatory; the inner one is validated first.
    if (innerOperator == null) {
        throw new DataflowException("Inner Input Operator is not set.");
    }
    if (outerOperator == null) {
        throw new DataflowException("Outer Input Operator is not set.");
    }

    // Each input is opened before its schema is read.
    innerOperator.open();
    Schema innerSchema = innerOperator.getOutputSchema();

    outerOperator.open();
    Schema outerSchema = outerOperator.getOutputSchema();

    // The predicate decides what the joined output looks like.
    this.outputSchema = joinPredicate.generateOutputSchema(innerSchema, outerSchema);
    cursor = OPENED;
}
Use of edu.uci.ics.texera.api.exception.DataflowException in project textdb by TextDB.
Class JoinDistancePredicate, method generateIntersectionSchema.
/**
 * Builds the output schema as the intersection of the inner and outer
 * operators' schemas. An attribute is kept only when the outer attribute
 * list contains an attribute equal to it.
 *
 * The resulting intersection must:
 *   - be non-empty,
 *   - contain the join attribute,
 *   - contain the "_ID" attribute,
 *   - contain the "spanList" attribute,
 * and the join attribute must be of type TEXT or STRING.
 *
 * @param innerOperatorSchema output schema of the inner operator
 * @param outerOperatorSchema output schema of the outer operator
 * @return the intersection schema
 * @throws DataflowException if any of the requirements above is violated
 */
private Schema generateIntersectionSchema(Schema innerOperatorSchema, Schema outerOperatorSchema) throws DataflowException {
    List<Attribute> innerAttrs = innerOperatorSchema.getAttributes();
    List<Attribute> outerAttrs = outerOperatorSchema.getAttributes();

    // Keep the inner attributes (in inner order) that also appear in the outer schema.
    List<Attribute> sharedAttrs = innerAttrs.stream()
            .filter(outerAttrs::contains)
            .collect(Collectors.toList());
    Schema intersectionSchema = new Schema(sharedAttrs.toArray(new Attribute[0]));

    // Validate that the intersection carries everything the join needs.
    if (intersectionSchema.getAttributes().isEmpty()) {
        throw new DataflowException("inner operator and outer operator don't share any common attributes");
    }
    if (!intersectionSchema.containsAttribute(this.joinAttributeName)) {
        throw new DataflowException("inner operator or outer operator doesn't contain join attribute");
    }
    if (!intersectionSchema.containsAttribute(SchemaConstants._ID)) {
        throw new DataflowException("inner operator or outer operator doesn't contain _ID attribute");
    }
    if (!intersectionSchema.containsAttribute(SchemaConstants.SPAN_LIST)) {
        throw new DataflowException("inner operator or outer operator doesn't contain spanList attribute");
    }

    // The join attribute has to be textual.
    AttributeType joinType = intersectionSchema.getAttribute(this.joinAttributeName).getType();
    boolean isTextual = joinType == AttributeType.TEXT || joinType == AttributeType.STRING;
    if (!isTextual) {
        throw new DataflowException(String.format("Join attribute %s must be either TEXT or STRING.", this.joinAttributeName));
    }

    return intersectionSchema;
}
Use of edu.uci.ics.texera.api.exception.DataflowException in project textdb by TextDB.
Class SimilarityJoinPredicate, method joinTuples.
/**
 * Joins an inner and an outer tuple based on the similarity of their span values.
 *
 * Only spans whose attribute name equals the respective join attribute name
 * are considered. Every (inner value, outer value) pair whose similarity is
 * at least the threshold contributes both values to the result value set;
 * all considered spans carrying one of those values are prefixed and passed
 * on to the merged tuple, inner spans first.
 *
 * @param innerTuple   tuple from the inner operator
 * @param outerTuple   tuple from the outer operator
 * @param outputSchema schema handed to the tuple merge
 * @return the merged tuple, or null when the threshold equals 0 or no pair
 *         of values is similar enough
 */
@Override
public Tuple joinTuples(Tuple innerTuple, Tuple outerTuple, Schema outputSchema) throws DataflowException {
    if (similarityThreshold == 0) {
        return null;
    }

    // Restrict each side's span list to the spans of its join attribute.
    ListField<Span> innerSpanField = innerTuple.getField(SchemaConstants.SPAN_LIST);
    List<Span> innerSpans = innerSpanField.getValue().stream()
            .filter(s -> s.getAttributeName().equals(innerJoinAttrName))
            .collect(Collectors.toList());

    ListField<Span> outerSpanField = outerTuple.getField(SchemaConstants.SPAN_LIST);
    List<Span> outerSpans = outerSpanField.getValue().stream()
            .filter(s -> s.getAttributeName().equals(outerJoinAttrName))
            .collect(Collectors.toList());

    // Several spans may share a value, so compare distinct values only.
    Set<String> innerValues = innerSpans.stream().map(Span::getValue).collect(Collectors.toSet());
    Set<String> outerValues = outerSpans.stream().map(Span::getValue).collect(Collectors.toSet());

    // Collect every value that takes part in at least one sufficiently similar pair.
    Set<String> matchedValues = new HashSet<>();
    for (String innerValue : innerValues) {
        for (String outerValue : outerValues) {
            if (this.similarityFunc.calculateSimilarity(innerValue, outerValue) >= this.similarityThreshold) {
                matchedValues.add(innerValue);
                matchedValues.add(outerValue);
            }
        }
    }

    // No similar pair means no join result.
    if (matchedValues.isEmpty()) {
        return null;
    }

    // Emit the matching spans, inner side first, each tagged with its side's prefix.
    List<Span> joinedSpans = new ArrayList<>();
    for (Span innerSpan : innerSpans) {
        if (matchedValues.contains(innerSpan.getValue())) {
            joinedSpans.add(addFieldPrefix(innerSpan, INNER_PREFIX));
        }
    }
    for (Span outerSpan : outerSpans) {
        if (matchedValues.contains(outerSpan.getValue())) {
            joinedSpans.add(addFieldPrefix(outerSpan, OUTER_PREFIX));
        }
    }

    return mergeTuples(innerTuple, outerTuple, outputSchema, joinedSpans);
}
Use of edu.uci.ics.texera.api.exception.DataflowException in project textdb by TextDB.
Class KeywordMatcher, method appendPhraseMatchingSpans.
/**
 * Collects the phrase-matching spans of a tuple for the given attributes.
 *
 * For a STRING attribute, a single span covering the whole field is produced
 * when the field value equals the query keyword. For a TEXT attribute, the
 * payload spans relevant to the query tokens are narrowed to that attribute;
 * if all query tokens are present among them, the phrase-matching spans
 * built by DataflowUtils are added.
 *
 * @param inputTuple                  tuple to inspect; its PAYLOAD field is read
 * @param attributeNames              names of the attributes to match against
 * @param queryTokenList              tokens of the query
 * @param queryTokenListWithStopwords tokens of the query including stopwords
 * @param queryKeyword                the query keyword
 * @return the matching spans found across all given attributes (possibly empty)
 * @throws DataflowException if an attribute is neither STRING nor TEXT
 */
private List<Span> appendPhraseMatchingSpans(Tuple inputTuple, List<String> attributeNames, List<String> queryTokenList, List<String> queryTokenListWithStopwords, String queryKeyword) throws DataflowException {
    ListField<Span> payloadField = inputTuple.getField(SchemaConstants.PAYLOAD);
    List<Span> payloadSpans = payloadField.getValue();
    List<Span> result = new ArrayList<>();

    for (String attributeName : attributeNames) {
        AttributeType type = inputTuple.getSchema().getAttribute(attributeName).getType();
        String value = inputTuple.getField(attributeName).getValue().toString();

        // Only STRING and TEXT fields are supported for now.
        boolean supported = type == AttributeType.STRING || type == AttributeType.TEXT;
        if (!supported) {
            throw new DataflowException("KeywordMatcher: Fields other than STRING and TEXT are not supported yet");
        }

        if (type == AttributeType.STRING) {
            // A STRING field matches only when it equals the query keyword.
            if (queryKeyword.equals(value)) {
                result.add(new Span(attributeName, 0, value.length(), value, value));
            }
        } else {
            // TEXT field: phrase query over the payload spans of this attribute.
            Set<String> tokenSet = new HashSet<>(queryTokenList);
            List<Span> attributeSpans = filterRelevantSpans(payloadSpans, tokenSet).stream()
                    .filter(s -> s.getAttributeName().equals(attributeName))
                    .collect(Collectors.toList());

            // Skip this field unless every query token is present in its spans.
            if (DataflowUtils.isAllQueryTokensPresent(attributeSpans, tokenSet)) {
                result.addAll(DataflowUtils.constructPhraseMatchingSpans(attributeName, value, queryKeyword, attributeSpans, queryTokenListWithStopwords, queryTokenList));
            }
        }
    }

    return result;
}
Use of edu.uci.ics.texera.api.exception.DataflowException in project textdb by TextDB.
Class KeywordMatcher, method appendConjunctionMatchingSpans.
/**
 * Collects the conjunction-matching spans of a tuple for the given attributes.
 *
 * For a STRING attribute, a single span covering the whole field is produced
 * when the field value equals the query keyword. For a TEXT attribute, the
 * payload spans relevant to the query tokens are narrowed to that attribute
 * and added to the result only if all query tokens are present among them.
 *
 * @param inputTuple     tuple to inspect; its PAYLOAD field is read
 * @param attributeNames names of the attributes to match against
 * @param queryTokenSet  set of query tokens
 * @param queryKeyword   the query keyword
 * @return the matching spans found across all given attributes (possibly empty)
 * @throws DataflowException if an attribute is neither STRING nor TEXT
 */
private List<Span> appendConjunctionMatchingSpans(Tuple inputTuple, List<String> attributeNames, Set<String> queryTokenSet, String queryKeyword) throws DataflowException {
    ListField<Span> payloadField = inputTuple.getField(SchemaConstants.PAYLOAD);
    List<Span> payloadSpans = payloadField.getValue();
    List<Span> result = new ArrayList<>();

    for (String attributeName : attributeNames) {
        AttributeType type = inputTuple.getSchema().getAttribute(attributeName).getType();
        String value = inputTuple.getField(attributeName).getValue().toString();

        // Only STRING and TEXT fields are supported for now.
        boolean supported = type == AttributeType.STRING || type == AttributeType.TEXT;
        if (!supported) {
            throw new DataflowException("KeywordMatcher: Fields other than STRING and TEXT are not supported yet");
        }

        if (type == AttributeType.STRING) {
            // A STRING field matches only when it equals the query keyword.
            if (queryKeyword.equals(value)) {
                result.add(new Span(attributeName, 0, value.length(), value, value));
            }
        } else {
            // TEXT field: take the relevant payload spans belonging to this attribute.
            List<Span> attributeSpans = filterRelevantSpans(payloadSpans, queryTokenSet).stream()
                    .filter(s -> s.getAttributeName().equals(attributeName))
                    .collect(Collectors.toList());

            // Conjunction semantics: keep them only if every query token is present.
            if (DataflowUtils.isAllQueryTokensPresent(attributeSpans, queryTokenSet)) {
                result.addAll(attributeSpans);
            }
        }
    }

    return result;
}
Aggregations