use of edu.uci.ics.texera.api.exception.DataflowException in project textdb by TextDB.
the class DictionaryMatcher method appendConjunctionMatchingSpans4Dictionary.
/**
 * Collects the spans of {@code inputTuple} that satisfy a conjunction-style dictionary match
 * on the given attributes.
 *
 * <ul>
 *   <li>STRING attribute: one span covering the whole value is produced when the value is
 *       itself contained in {@code queryList}.</li>
 *   <li>TEXT attribute: for each dictionary-entry index present in the pre-filtered payload,
 *       the spans belonging to this attribute are kept only if
 *       {@code DataflowUtils.isAllQueryTokensPresent} accepts them for that entry's token set.</li>
 * </ul>
 *
 * @param inputTuple        tuple whose payload and attribute values are inspected
 * @param attributeNames    attributes to match against
 * @param queryTokenSetList token sets, indexed in parallel with the keys of the filtered span map
 * @param queryList         dictionary entries used for the whole-value STRING comparison
 * @return all matching spans, in attribute order
 * @throws DataflowException if an attribute is neither STRING nor TEXT
 */
private List<Span> appendConjunctionMatchingSpans4Dictionary(Tuple inputTuple, List<String> attributeNames, List<Set<String>> queryTokenSetList, List<String> queryList) throws DataflowException {
    ListField<Span> payloadField = inputTuple.getField(SchemaConstants.PAYLOAD);
    // Payload spans grouped by an integer key that is later used as an index into queryTokenSetList.
    Map<Integer, List<Span>> spansByQueryIndex = filterRelevantSpans(payloadField.getValue(), queryTokenSetList);

    List<Span> matches = new ArrayList<>();
    for (String attributeName : attributeNames) {
        AttributeType type = inputTuple.getSchema().getAttribute(attributeName).getType();
        String value = inputTuple.getField(attributeName).getValue().toString();

        boolean isString = type == AttributeType.STRING;
        boolean isText = type == AttributeType.TEXT;
        if (!isString && !isText) {
            // Only STRING and TEXT are handled; anything else is rejected.
            throw new DataflowException("KeywordMatcher: Fields other than STRING and TEXT are not supported yet");
        }

        if (isString) {
            // STRING: the complete field value has to be one of the dictionary entries.
            if (queryList.contains(value)) {
                matches.add(new Span(attributeName, 0, value.length(), value, value));
            }
        } else if (isText) {
            // TEXT: every token of a dictionary entry must be covered by this attribute's spans.
            for (Map.Entry<Integer, List<Span>> entry : spansByQueryIndex.entrySet()) {
                int queryIndex = entry.getKey();
                List<Span> spansInAttribute = new ArrayList<>();
                for (Span candidate : entry.getValue()) {
                    if (candidate.getAttributeName().equals(attributeName)) {
                        spansInAttribute.add(candidate);
                    }
                }
                if (DataflowUtils.isAllQueryTokensPresent(spansInAttribute, queryTokenSetList.get(queryIndex))) {
                    matches.addAll(spansInAttribute);
                }
            }
        }
    }
    return matches;
}
use of edu.uci.ics.texera.api.exception.DataflowException in project textdb by TextDB.
the class FuzzyTokenMatcherSourceOperator method createLuceneQueryObject.
/**
 * Builds the Lucene query for a fuzzy-token predicate.
 *
 * <p>Every query token is parsed against all of the predicate's attributes and added as an
 * optional (SHOULD) clause; the predicate's threshold is installed as the minimum number of
 * those clauses that must match.
 *
 * @param predicate supplies the threshold, attribute names, analyzer string and query tokens
 * @return the assembled boolean query
 * @throws DataflowException if a query token cannot be parsed
 */
public static Query createLuceneQueryObject(FuzzyTokenPredicate predicate) throws DataflowException {
    try {
        /*
         * By default the boolean query takes 1024 clauses as the max limit.
         * Since our input query has no limitation on the number of tokens,
         * the limit has to be large enough for one clause per token. The
         * token count (not only the threshold) drives the limit, because one
         * clause is added per token below.
         */
        int tokenCount = 0;
        for (String ignored : predicate.getQueryTokens()) {
            tokenCount++;
        }
        int requiredClauses = Math.max(predicate.getThreshold(), tokenCount) + 1;
        // The limit is a static, process-wide setting: only ever raise it, never lower it.
        if (requiredClauses > BooleanQuery.getMaxClauseCount()) {
            BooleanQuery.setMaxClauseCount(requiredClauses);
        }

        BooleanQuery.Builder builder = new BooleanQuery.Builder();
        builder.setMinimumNumberShouldMatch(predicate.getThreshold());
        MultiFieldQueryParser qp = new MultiFieldQueryParser(predicate.getAttributeNames().stream().toArray(String[]::new), LuceneAnalyzerConstants.getLuceneAnalyzer(predicate.getLuceneAnalyzerStr()));
        for (String s : predicate.getQueryTokens()) {
            builder.add(qp.parse(s), Occur.SHOULD);
        }
        return builder.build();
    } catch (ParseException e) {
        throw new DataflowException(e);
    }
}
use of edu.uci.ics.texera.api.exception.DataflowException in project textdb by TextDB.
the class Join method close.
/**
 * Closes both child operators and resets this join's state.
 *
 * <p>Closing the outer operator is attempted even when closing the inner operator fails,
 * so that one failure does not leave the other child open. The join's own state is reset
 * in every case; if either close failed, the first failure is rethrown afterwards (a second
 * failure is attached to it as a suppressed exception).
 *
 * @throws TexeraException (as a {@code DataflowException}) if closing either child fails
 */
@Override
public void close() throws TexeraException {
    if (cursor == CLOSED) {
        return;
    }
    Exception failure = null;
    try {
        innerOperator.close();
    } catch (Exception e) {
        failure = e;
    }
    try {
        outerOperator.close();
    } catch (Exception e) {
        if (failure == null) {
            failure = e;
        } else {
            failure.addSuppressed(e);
        }
    }
    // Set the inner tuple list back to null on close.
    innerTupleList = null;
    innerTupleListCursor = 0;
    cursor = CLOSED;
    if (failure != null) {
        throw new DataflowException(failure.getMessage(), failure);
    }
}
use of edu.uci.ics.texera.api.exception.DataflowException in project textdb by TextDB.
the class KeywordMatcherSourceOperator method buildConjunctionQuery.
/**
 * Builds the Lucene query used for conjunction keyword matching.
 *
 * <p>One optional (SHOULD) clause is contributed per attribute of the predicate:
 * <ul>
 *   <li>STRING attribute: a single term query on the predicate's full query string.</li>
 *   <li>TEXT attribute: a nested boolean query in which every token of
 *       {@code queryTokenSet}, lower-cased, is a required (MUST) term.</li>
 * </ul>
 *
 * @return the combined boolean query
 * @throws DataflowException if an attribute is neither STRING nor TEXT
 */
private Query buildConjunctionQuery() throws DataflowException {
    BooleanQuery.Builder perAttributeClauses = new BooleanQuery.Builder();
    for (String field : this.predicate.getAttributeNames()) {
        AttributeType type = this.inputSchema.getAttribute(field).getType();
        boolean isString = type == AttributeType.STRING;
        boolean isText = type == AttributeType.TEXT;

        // Only STRING and TEXT are handled; anything else is rejected.
        if (!(isString || isText)) {
            throw new DataflowException("KeywordPredicate: Fields other than STRING and TEXT are not supported yet");
        }

        if (isString) {
            // STRING: match the whole query as one term.
            Term wholeQuery = new Term(field, predicate.getQuery());
            perAttributeClauses.add(new TermQuery(wholeQuery), BooleanClause.Occur.SHOULD);
        }

        if (isText) {
            // TEXT: all tokens must occur in this attribute.
            BooleanQuery.Builder allTokens = new BooleanQuery.Builder();
            for (String token : queryTokenSet) {
                Term tokenTerm = new Term(field, token.toLowerCase());
                allTokens.add(new TermQuery(tokenTerm), BooleanClause.Occur.MUST);
            }
            perAttributeClauses.add(allTokens.build(), BooleanClause.Occur.SHOULD);
        }
    }
    return perAttributeClauses.build();
}
use of edu.uci.ics.texera.api.exception.DataflowException in project textdb by TextDB.
the class NlpSentimentOperator method open.
/**
 * Opens this operator: opens the input operator, validates the input attribute, derives the
 * output schema and builds the sentiment-analysis pipeline.
 *
 * <p>Calling this when the operator is not in the CLOSED state is a no-op. The operator is
 * marked as opened only after every initialization step has succeeded, so a failure while
 * validating the schema or building the pipeline does not leave it flagged as open with a
 * missing pipeline.
 *
 * @throws TexeraException if no input operator is set, the input attribute is missing from
 *                         the input schema, or the attribute is not of type STRING or TEXT
 */
@Override
public void open() throws TexeraException {
    if (cursor != CLOSED) {
        return;
    }
    if (inputOperator == null) {
        throw new DataflowException(ErrorMessages.INPUT_OPERATOR_NOT_SPECIFIED);
    }
    inputOperator.open();
    Schema inputSchema = inputOperator.getOutputSchema();
    // check if input schema is present
    if (!inputSchema.containsAttribute(predicate.getInputAttributeName())) {
        throw new TexeraException(String.format("input attribute %s is not in the input schema %s", predicate.getInputAttributeName(), inputSchema.getAttributeNames()));
    }
    // check if attribute type is valid
    AttributeType inputAttributeType = inputSchema.getAttribute(predicate.getInputAttributeName()).getType();
    boolean isValidType = inputAttributeType.equals(AttributeType.STRING) || inputAttributeType.equals(AttributeType.TEXT);
    if (!isValidType) {
        throw new TexeraException(String.format("input attribute %s must have type String or Text, its actual type is %s", predicate.getInputAttributeName(), inputAttributeType));
    }
    // generate output schema by transforming the (already fetched) input schema
    outputSchema = transformSchema(inputSchema);
    // setup NLP sentiment analysis pipeline
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit, parse, sentiment");
    sentimentPipeline = new StanfordCoreNLP(props);
    // mark as opened last, once all of the state above is in place
    cursor = OPENED;
}
Aggregations