use of edu.uci.ics.texera.api.schema.AttributeType in project textdb by TextDB.
the class KeywordMatcherSourceOperator method buildPhraseQuery.
/**
 * Builds the Lucene query used for phrase matching.
 *
 * One clause is produced per attribute named by the predicate, and all
 * clauses are combined with {@code SHOULD}:
 * a STRING attribute gets a term query on the unmodified query string;
 * a TEXT attribute gets a term query on the lower-cased query string when
 * there is exactly one query token, and otherwise a phrase query whose
 * terms keep their original positions.
 *
 * @return the combined boolean query
 * @throws DataflowException if an attribute is neither STRING nor TEXT
 */
private Query buildPhraseQuery() throws DataflowException {
    BooleanQuery.Builder disjunctionBuilder = new BooleanQuery.Builder();

    for (String attributeName : this.predicate.getAttributeNames()) {
        AttributeType type = this.inputSchema.getAttribute(attributeName).getType();

        if (type == AttributeType.STRING) {
            // STRING attributes are matched against the query exactly as given.
            disjunctionBuilder.add(
                    new TermQuery(new Term(attributeName, predicate.getQuery())),
                    BooleanClause.Occur.SHOULD);
        } else if (type == AttributeType.TEXT) {
            if (queryTokenList.size() == 1) {
                // NOTE(review): the term is the whole lower-cased query, not the
                // single token itself - confirm the two are always the same here.
                String loweredQuery = predicate.getQuery().toLowerCase();
                disjunctionBuilder.add(
                        new TermQuery(new Term(attributeName, loweredQuery)),
                        BooleanClause.Occur.SHOULD);
            } else {
                PhraseQuery.Builder phraseBuilder = new PhraseQuery.Builder();
                int tokenCount = queryTokensWithStopwords.size();
                for (int position = 0; position < tokenCount; position++) {
                    String token = queryTokensWithStopwords.get(position);
                    if (StandardAnalyzer.STOP_WORDS_SET.contains(token)) {
                        // Stop words are left out of the phrase, but the
                        // positions of the remaining terms are not shifted.
                        continue;
                    }
                    phraseBuilder.add(new Term(attributeName, token.toLowerCase()), position);
                }
                disjunctionBuilder.add(phraseBuilder.build(), BooleanClause.Occur.SHOULD);
            }
        } else {
            // types other than TEXT and STRING: throw Exception for now
            throw new DataflowException("KeywordPredicate: Fields other than STRING and TEXT are not supported yet");
        }
    }

    return disjunctionBuilder.build();
}
use of edu.uci.ics.texera.api.schema.AttributeType in project textdb by TextDB.
the class KeywordMatcher method appendConjunctionMatchingSpans.
/**
 * Collects the spans of a tuple that satisfy a conjunction keyword query.
 *
 * For a STRING attribute a single span covering the whole value is added
 * when the value equals {@code queryKeyword}. For a TEXT attribute the
 * payload spans relevant to the query tokens are narrowed to this attribute
 * and added only if together they contain every query token.
 *
 * @param inputTuple     tuple whose fields and payload are inspected
 * @param attributeNames attributes to search
 * @param queryTokenSet  tokens that must all be present in a TEXT attribute
 * @param queryKeyword   full keyword, compared against STRING attributes
 * @return the matching spans, possibly empty
 * @throws DataflowException if an attribute is neither STRING nor TEXT
 */
private List<Span> appendConjunctionMatchingSpans(Tuple inputTuple, List<String> attributeNames, Set<String> queryTokenSet, String queryKeyword) throws DataflowException {
    ListField<Span> spanListField = inputTuple.getField(SchemaConstants.PAYLOAD);
    List<Span> payloadSpans = spanListField.getValue();
    List<Span> collected = new ArrayList<>();

    for (String attributeName : attributeNames) {
        AttributeType type = inputTuple.getSchema().getAttribute(attributeName).getType();
        String text = inputTuple.getField(attributeName).getValue().toString();

        boolean isString = type == AttributeType.STRING;
        boolean isText = type == AttributeType.TEXT;
        // types other than TEXT and STRING: throw Exception for now
        if (!(isString || isText)) {
            throw new DataflowException("KeywordMatcher: Fields other than STRING and TEXT are not supported yet");
        }

        if (isString) {
            // A STRING attribute matches only when the query equals the whole value.
            if (queryKeyword.equals(text)) {
                collected.add(new Span(attributeName, 0, text.length(), text, text));
            }
            continue;
        }

        // TEXT attribute: keep only the relevant payload spans that belong to it.
        List<Span> spansOfAttribute = new ArrayList<>();
        for (Span candidate : filterRelevantSpans(payloadSpans, queryTokenSet)) {
            if (candidate.getAttributeName().equals(attributeName)) {
                spansOfAttribute.add(candidate);
            }
        }
        if (DataflowUtils.isAllQueryTokensPresent(spansOfAttribute, queryTokenSet)) {
            collected.addAll(spansOfAttribute);
        }
    }

    return collected;
}
use of edu.uci.ics.texera.api.schema.AttributeType in project textdb by TextDB.
the class KeywordMatcher method appendSubstringMatchingSpans.
/**
 * Collects the spans of a tuple that satisfy a substring keyword query.
 *
 * For a STRING attribute a single span is added when the value equals
 * {@code queryKeyword}. For a TEXT attribute every case-insensitive
 * occurrence of the keyword is reported, including occurrences that start
 * directly after, or overlap with, a previous one.
 *
 * @param inputTuple     tuple whose fields are inspected
 * @param attributeNames attributes to search
 * @param queryKeyword   keyword to look for
 * @return the matching spans, possibly empty
 * @throws DataflowException if an attribute is neither STRING nor TEXT
 */
private List<Span> appendSubstringMatchingSpans(Tuple inputTuple, List<String> attributeNames, String queryKeyword) throws DataflowException {
    List<Span> matchingResults = new ArrayList<>();
    for (String attributeName : attributeNames) {
        String fieldValue = inputTuple.getField(attributeName).getValue().toString();
        AttributeType attributeType = inputTuple.getSchema().getAttribute(attributeName).getType();
        // types other than TEXT and STRING: throw Exception for now
        if (attributeType != AttributeType.STRING && attributeType != AttributeType.TEXT) {
            throw new DataflowException("KeywordMatcher: Fields other than STRING and TEXT are not supported yet");
        }
        // for STRING type, the query should match the fieldValue completely
        if (attributeType == AttributeType.STRING) {
            if (fieldValue.equals(queryKeyword)) {
                matchingResults.add(new Span(attributeName, 0, queryKeyword.length(), queryKeyword, fieldValue));
            }
        }
        // for TEXT type, report every case-insensitive occurrence of the keyword
        if (attributeType == AttributeType.TEXT) {
            // NOTE(review): offsets found in the lower-cased copy are applied to the
            // original value; this assumes lower-casing does not change the length.
            String fieldValueLowerCase = fieldValue.toLowerCase();
            String queryKeywordLowerCase = queryKeyword.toLowerCase();
            int keywordLength = queryKeyword.length();

            // The bound on searchFrom keeps the loop finite even when the keyword is empty.
            int searchFrom = 0;
            while (searchFrom < fieldValueLowerCase.length()) {
                int index = fieldValueLowerCase.indexOf(queryKeywordLowerCase, searchFrom);
                if (index == -1) {
                    break;
                }
                matchingResults.add(new Span(attributeName, index, index + keywordLength, queryKeyword,
                        fieldValue.substring(index, index + keywordLength)));
                // Resume right after the start of this match. The previous code
                // advanced two positions here and skipped a match starting at index + 1.
                searchFrom = index + 1;
            }
        }
    }
    return matchingResults;
}
use of edu.uci.ics.texera.api.schema.AttributeType in project textdb by TextDB.
the class KeywordMatcher method appendPhraseMatchingSpans.
/**
 * Collects the spans of a tuple that satisfy a phrase keyword query.
 *
 * For a STRING attribute a single span covering the whole value is added
 * when the value equals {@code queryKeyword}. For a TEXT attribute the
 * payload spans relevant to the query tokens are narrowed to this attribute;
 * if together they contain every query token, the phrase spans built by
 * {@code DataflowUtils.constructPhraseMatchingSpans} are added.
 *
 * @param inputTuple                  tuple whose fields and payload are inspected
 * @param attributeNames              attributes to search
 * @param queryTokenList              query tokens used to select relevant spans
 * @param queryTokenListWithStopwords query tokens passed on for phrase construction
 * @param queryKeyword                full keyword, compared against STRING attributes
 * @return the matching spans, possibly empty
 * @throws DataflowException if an attribute is neither STRING nor TEXT
 */
private List<Span> appendPhraseMatchingSpans(Tuple inputTuple, List<String> attributeNames, List<String> queryTokenList, List<String> queryTokenListWithStopwords, String queryKeyword) throws DataflowException {
    ListField<Span> spanListField = inputTuple.getField(SchemaConstants.PAYLOAD);
    List<Span> payloadSpans = spanListField.getValue();
    List<Span> collected = new ArrayList<>();

    for (String attributeName : attributeNames) {
        AttributeType type = inputTuple.getSchema().getAttribute(attributeName).getType();
        String text = inputTuple.getField(attributeName).getValue().toString();

        boolean isString = type == AttributeType.STRING;
        boolean isText = type == AttributeType.TEXT;
        // types other than TEXT and STRING: throw Exception for now
        if (!isString && !isText) {
            throw new DataflowException("KeywordMatcher: Fields other than STRING and TEXT are not supported yet");
        }

        if (isString) {
            // A STRING attribute matches only when the query equals the whole value.
            if (queryKeyword.equals(text)) {
                collected.add(new Span(attributeName, 0, text.length(), text, text));
            }
        } else {
            // TEXT attribute: phrase query over the spans that belong to it.
            Set<String> tokenSet = new HashSet<>(queryTokenList);
            List<Span> spansOfAttribute = filterRelevantSpans(payloadSpans, tokenSet)
                    .stream()
                    .filter(candidate -> candidate.getAttributeName().equals(attributeName))
                    .collect(Collectors.toList());

            // Phrase spans are built only when every query token occurs in this attribute.
            if (DataflowUtils.isAllQueryTokensPresent(spansOfAttribute, tokenSet)) {
                List<Span> phraseSpans = DataflowUtils.constructPhraseMatchingSpans(attributeName, text, queryKeyword, spansOfAttribute, queryTokenListWithStopwords, queryTokenList);
                collected.addAll(phraseSpans);
            }
        }
    }

    return collected;
}
use of edu.uci.ics.texera.api.schema.AttributeType in project textdb by TextDB.
the class NlpSentimentOperator method transformToOutputSchema.
/**
 * Validates the input schema and derives the output schema from it.
 *
 * Exactly one input schema is accepted. It must contain the predicate's
 * input attribute, and that attribute must be of type STRING or TEXT.
 * The actual transformation is delegated to {@code transformSchema}.
 *
 * @param inputSchema the input schemas; exactly one is expected
 * @return the result of {@code transformSchema} for the single input schema
 * @throws TexeraException if the number of schemas is not one, the input
 *         attribute is missing, or its type is neither STRING nor TEXT
 */
public Schema transformToOutputSchema(Schema... inputSchema) {
    if (inputSchema.length != 1) {
        throw new TexeraException(String.format(ErrorMessages.NUMBER_OF_ARGUMENTS_DOES_NOT_MATCH, 1, inputSchema.length));
    }

    Schema schema = inputSchema[0];
    String inputAttributeName = predicate.getInputAttributeName();

    // check if input schema is present
    if (!schema.containsAttribute(inputAttributeName)) {
        throw new TexeraException(String.format("input attribute %s is not in the input schema %s", inputAttributeName, schema.getAttributeNames()));
    }

    // check if attribute type is valid
    AttributeType inputAttributeType = schema.getAttribute(inputAttributeName).getType();
    if (!inputAttributeType.equals(AttributeType.STRING) && !inputAttributeType.equals(AttributeType.TEXT)) {
        throw new TexeraException(String.format("input attribute %s must have type String or Text, its actual type is %s", inputAttributeName, inputAttributeType));
    }

    return transformSchema(schema);
}
Aggregations