use of edu.uci.ics.textdb.api.span.Span in project textdb by TextDB.
the class KeywordMatcher method computePhraseMatchingResult.
/**
 * Computes the phrase-matching spans for one input tuple.
 *
 * <p>For every attribute named in the predicate:
 * <ul>
 *   <li>STRING attributes match only when the whole field value equals the query;
 *       a single span covering the query length is emitted.</li>
 *   <li>TEXT attributes are matched token by token: the payload spans belonging to
 *       the attribute are sorted by token offset, and every run of consecutive spans
 *       whose values equal the query tokens (ignoring case) and whose token-offset
 *       gaps equal the gaps between the query tokens is merged into one span.</li>
 * </ul>
 *
 * @param inputTuple tuple whose PAYLOAD field holds the token spans to inspect
 * @return the spans that matched; empty when nothing matched
 * @throws DataFlowException if an attribute is neither STRING nor TEXT
 */
private List<Span> computePhraseMatchingResult(Tuple inputTuple) throws DataFlowException {
    ListField<Span> payloadField = inputTuple.getField(SchemaConstants.PAYLOAD);
    List<Span> payload = payloadField.getValue();
    // NOTE(review): filterRelevantSpans is defined elsewhere; presumably it keeps only
    // the payload spans that relate to the query tokens - confirm.
    List<Span> relevantSpans = filterRelevantSpans(payload);
    List<Span> matchingResults = new ArrayList<>();

    for (String attributeName : this.predicate.getAttributeNames()) {
        AttributeType attributeType = this.inputSchema.getAttribute(attributeName).getAttributeType();
        String fieldValue = inputTuple.getField(attributeName).getValue().toString();

        // types other than TEXT and STRING: throw Exception for now
        if (attributeType != AttributeType.STRING && attributeType != AttributeType.TEXT) {
            throw new DataFlowException("KeywordMatcher: Fields other than STRING and TEXT are not supported yet");
        }

        // for STRING type, the query should match the fieldValue completely
        if (attributeType == AttributeType.STRING) {
            if (fieldValue.equals(predicate.getQuery())) {
                matchingResults.add(new Span(attributeName, 0, predicate.getQuery().length(), predicate.getQuery(), fieldValue));
            }
        }

        // phrase query
        if (attributeType == AttributeType.TEXT) {
            // Only the spans that belong to the attribute currently being processed.
            List<Span> fieldSpanList = relevantSpans.stream()
                    .filter(span -> span.getAttributeName().equals(attributeName))
                    .collect(Collectors.toList());

            // Skip this attribute unless isAllQueryTokensPresent accepts its spans
            // (presumably: every query token occurs among them - confirm).
            if (!isAllQueryTokensPresent(fieldSpanList, queryTokenSet)) {
                continue;
            }

            // Sort current field's span list by token offset for later use.
            // Integer.compare is used instead of subtraction, which can overflow and
            // violate the comparator contract.
            Collections.sort(fieldSpanList,
                    (span1, span2) -> Integer.compare(span1.getTokenOffset(), span2.getTokenOffset()));

            // Positions, within the query including stopwords, of the tokens that are
            // also in queryTokenList. The gaps between these positions are the gaps
            // the matched spans must reproduce.
            List<Integer> queryTokenOffset = new ArrayList<>();
            for (int i = 0; i < queryTokensWithStopwords.size(); i++) {
                if (queryTokenList.contains(queryTokensWithStopwords.get(i))) {
                    queryTokenOffset.add(i);
                }
            }

            // Index into fieldSpanList of the span where the current candidate phrase starts.
            int iter = 0;
            while (iter < fieldSpanList.size()) {
                // Not enough spans left to hold the whole phrase.
                if (iter > fieldSpanList.size() - queryTokenList.size()) {
                    break;
                }

                // Verify that the spans starting at iter correspond to the phrase query:
                // the relative token offsets must equal those of the query tokens and
                // the values must be the same (ignoring case).
                boolean isMismatchInSpan = false;
                // Check every adjacent pair of query tokens.
                for (int i = 0; i < queryTokenList.size() - 1; i++) {
                    Span first = fieldSpanList.get(iter + i);
                    Span second = fieldSpanList.get(iter + i + 1);
                    if (!(second.getTokenOffset() - first.getTokenOffset() == queryTokenOffset.get(i + 1) - queryTokenOffset.get(i)
                            && first.getValue().equalsIgnoreCase(queryTokenList.get(i))
                            && second.getValue().equalsIgnoreCase(queryTokenList.get(i + 1)))) {
                        // Mismatch: retry with the next span as the phrase start.
                        iter++;
                        isMismatchInSpan = true;
                        break;
                    }
                }
                if (isMismatchInSpan) {
                    continue;
                }

                // Merge the matched token spans into one span covering the whole phrase.
                int combinedSpanStartIndex = fieldSpanList.get(iter).getStart();
                int combinedSpanEndIndex = fieldSpanList.get(iter + queryTokenList.size() - 1).getEnd();
                Span combinedSpan = new Span(attributeName, combinedSpanStartIndex, combinedSpanEndIndex,
                        predicate.getQuery(), fieldValue.substring(combinedSpanStartIndex, combinedSpanEndIndex));
                matchingResults.add(combinedSpan);

                // Continue after the spans consumed by this match.
                iter = iter + queryTokenList.size();
            }
        }
    }
    return matchingResults;
}
use of edu.uci.ics.textdb.api.span.Span in project textdb by TextDB.
the class JoinDistancePredicate method joinTuples.
/**
 * This method is called by the Join operator to perform the join on the
 * tuples passed.
 *
 * <p>Two spans (one from each tuple, both on the join attribute) are joined when
 * both the distance between their start offsets and the distance between their
 * end offsets are within the threshold. The joined span covers the union of the
 * two ranges and its key is {@code outerKey + "_" + innerKey}.
 *
 * @param innerTuple   tuple from the inner operator; its field values are used for
 *                     every output field except the span list
 * @param outerTuple   tuple from the outer operator
 * @param outputSchema schema of the tuple to build
 * @return New Tuple containing the result of join operation, or {@code null} when
 *         the tuples do not agree on the _ID field or the join attribute, when
 *         either tuple carries no span list, or when no pair of spans joins.
 */
@Override
public Tuple joinTuples(Tuple innerTuple, Tuple outerTuple, Schema outputSchema) throws Exception {
    /*
     * We expect the values of all fields to be the same for innerTuple and outerTuple.
     * We only check the _ID field and the field to be joined, since they are crucial
     * to the join operator. For other fields, we use the value from innerTuple.
     */
    if (!compareField(innerTuple, outerTuple, SchemaConstants._ID)) {
        return null;
    }
    if (!compareField(innerTuple, outerTuple, this.joinAttributeName)) {
        return null;
    }

    /*
     * If either/both tuples have no span information, return null so the caller
     * can move on to the next tuple. Previously a missing span list led to a
     * NullPointerException when the list was iterated.
     */
    ListField<Span> spanFieldOfInnerTuple = innerTuple.getField(SchemaConstants.SPAN_LIST);
    ListField<Span> spanFieldOfOuterTuple = outerTuple.getField(SchemaConstants.SPAN_LIST);
    if (spanFieldOfInnerTuple == null || spanFieldOfOuterTuple == null) {
        return null;
    }

    List<Span> innerSpanList = null;
    List<Span> outerSpanList = null;
    // Only a field whose runtime class is exactly ListField is read as a span list.
    if (spanFieldOfInnerTuple.getClass().equals(ListField.class)) {
        innerSpanList = spanFieldOfInnerTuple.getValue();
    }
    if (spanFieldOfOuterTuple.getClass().equals(ListField.class)) {
        outerSpanList = spanFieldOfOuterTuple.getValue();
    }
    if (innerSpanList == null || outerSpanList == null) {
        return null;
    }

    // The threshold does not depend on the spans, so it is read once.
    final Integer threshold = this.getThreshold();
    List<Span> newJoinSpanList = new ArrayList<>();

    for (Span outerSpan : outerSpanList) {
        // Spans on other attributes do not take part in the join.
        if (!outerSpan.getAttributeName().equals(this.joinAttributeName)) {
            continue;
        }
        for (Span innerSpan : innerSpanList) {
            if (!innerSpan.getAttributeName().equals(this.joinAttributeName)) {
                continue;
            }
            if (Math.abs(outerSpan.getStart() - innerSpan.getStart()) <= threshold
                    && Math.abs(outerSpan.getEnd() - innerSpan.getEnd()) <= threshold) {
                // The joined span covers both input spans.
                int newSpanStartIndex = Math.min(innerSpan.getStart(), outerSpan.getStart());
                int newSpanEndIndex = Math.max(innerSpan.getEnd(), outerSpan.getEnd());
                String attributeName = this.joinAttributeName;
                String fieldValue = (String) innerTuple.getField(attributeName).getValue();
                String newFieldValue = fieldValue.substring(newSpanStartIndex, newSpanEndIndex);
                String spanKey = outerSpan.getKey() + "_" + innerSpan.getKey();
                Span newSpan = new Span(attributeName, newSpanStartIndex, newSpanEndIndex, spanKey, newFieldValue);
                newJoinSpanList.add(newSpan);
            }
        }
    }

    if (newJoinSpanList.isEmpty()) {
        return null;
    }

    // create output fields based on innerTuple's value; the span list attribute is
    // skipped here and the joined span list is appended as the last field instead
    List<Attribute> outputAttrList = outputSchema.getAttributes();
    List<IField> outputFields = outputAttrList.stream()
            .filter(attr -> !attr.equals(SchemaConstants.SPAN_LIST_ATTRIBUTE))
            .map(attr -> attr.getAttributeName())
            .map(attributeName -> innerTuple.getField(attributeName, IField.class))
            .collect(Collectors.toList());
    outputFields.add(new ListField<>(newJoinSpanList));
    return new Tuple(outputSchema, outputFields.stream().toArray(IField[]::new));
}
use of edu.uci.ics.textdb.api.span.Span in project textdb by TextDB.
the class KeywordMatcher method computeSubstringMatchingResult.
/**
 * Computes the substring-matching spans for one input tuple.
 *
 * <p>For every attribute named in the predicate:
 * <ul>
 *   <li>STRING attributes match only when the whole field value equals the query.</li>
 *   <li>TEXT attributes yield one span per case-insensitive occurrence of the query
 *       inside the field value.</li>
 * </ul>
 *
 * <p>The query is matched as literal text, not as a regular expression, so
 * keywords containing characters such as {@code + . ( [ \} are handled correctly.
 * Matching runs on the original field value so the reported offsets always index
 * into the same string the span value is cut from.
 *
 * @param inputTuple tuple whose attributes are searched
 * @return the spans that matched; empty when nothing matched
 * @throws DataFlowException if an attribute is neither STRING nor TEXT
 */
private List<Span> computeSubstringMatchingResult(Tuple inputTuple) throws DataFlowException {
    List<Span> matchingResults = new ArrayList<>();
    // Compiled on first use and shared by all TEXT attributes of this tuple.
    Pattern substringPattern = null;

    for (String attributeName : this.predicate.getAttributeNames()) {
        AttributeType attributeType = this.inputSchema.getAttribute(attributeName).getAttributeType();
        String fieldValue = inputTuple.getField(attributeName).getValue().toString();

        // types other than TEXT and STRING: throw Exception for now
        if (attributeType != AttributeType.STRING && attributeType != AttributeType.TEXT) {
            throw new DataFlowException("KeywordMatcher: Fields other than STRING and TEXT are not supported yet");
        }

        // for STRING type, the query should match the fieldValue completely
        if (attributeType == AttributeType.STRING) {
            if (fieldValue.equals(predicate.getQuery())) {
                matchingResults.add(new Span(attributeName, 0, predicate.getQuery().length(), predicate.getQuery(), fieldValue));
            }
        }

        if (attributeType == AttributeType.TEXT) {
            if (substringPattern == null) {
                // LITERAL: the keyword is plain text, not a regex.
                // CASE_INSENSITIVE | UNICODE_CASE: case-insensitive for non-ASCII
                // letters too, without lower-casing (which can change string length
                // and shift the offsets).
                substringPattern = Pattern.compile(predicate.getQuery(),
                        Pattern.LITERAL | Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
            }
            Matcher matcher = substringPattern.matcher(fieldValue);
            while (matcher.find()) {
                int start = matcher.start();
                int end = matcher.end();
                matchingResults.add(new Span(attributeName, start, end, predicate.getQuery(), fieldValue.substring(start, end)));
            }
        }
    }
    return matchingResults;
}
use of edu.uci.ics.textdb.api.span.Span in project textdb by TextDB.
the class NlpEntityOperator method processOneInputTuple.
/**
 * Runs NLP span extraction over every attribute named in the predicate and
 * appends the extracted spans to the tuple's existing span list.
 *
 * @param inputTuple tuple to process; its span list field is extended in place
 * @return the same tuple instance, or {@code null} when no spans were extracted
 * @throws TextDBException declared by the operator contract
 */
@Override
public Tuple processOneInputTuple(Tuple inputTuple) throws TextDBException {
    final List<Span> extracted = new ArrayList<>();
    for (String name : predicate.getAttributeNames()) {
        IField attributeField = inputTuple.getField(name);
        List<Span> spansOfAttribute = extractNlpSpans(attributeField, name);
        extracted.addAll(spansOfAttribute);
    }

    // Nothing found: signal "no result" for this tuple.
    if (extracted.isEmpty()) {
        return null;
    }

    // Append to the list held by the tuple's span-list field.
    ListField<Span> targetField = inputTuple.getField(predicate.getSpanListName());
    targetField.getValue().addAll(extracted);
    return inputTuple;
}
use of edu.uci.ics.textdb.api.span.Span in project textdb by TextDB.
the class RegexMatcher method re2jRegexMatch.
/**
 * Finds every match of the operator's RE2J pattern in the given field value.
 *
 * @param fieldValue    text to search
 * @param attributeName attribute the text came from; recorded on each span
 * @return one span per match, in the order the matcher reports them; the span key
 *         is the predicate's regex and the span value is the matched text
 */
private List<Span> re2jRegexMatch(String fieldValue, String attributeName) {
    final List<Span> spans = new ArrayList<>();
    final com.google.re2j.Matcher scanner = this.re2jPattern.matcher(fieldValue);
    for (boolean found = scanner.find(); found; found = scanner.find()) {
        final int from = scanner.start();
        final int to = scanner.end();
        spans.add(new Span(attributeName, from, to, this.predicate.getRegex(), fieldValue.substring(from, to)));
    }
    return spans;
}
Aggregations