use of edu.uci.ics.textdb.api.field.ListField in project textdb by TextDB.
the class SimilarityJoinPredicate method joinTuples.
/**
 * Joins two tuples when at least one pair of their span values is similar enough.
 *
 * <p>Only spans whose attribute name equals {@code innerJoinAttrName} (inner side) or
 * {@code outerJoinAttrName} (outer side) take part. Every inner value is compared with
 * every outer value using {@code similarityFunc}; a pair qualifies when its similarity
 * is greater than or equal to {@code similarityThreshold}. All spans carrying a
 * qualifying value are passed through {@code addFieldPrefix} and handed to
 * {@code mergeTuples} together with both tuples.
 *
 * @param innerTuple   tuple from the inner side; its SPAN_LIST field is read
 * @param outerTuple   tuple from the outer side; its SPAN_LIST field is read
 * @param outputSchema schema forwarded to {@code mergeTuples}
 * @return the result of {@code mergeTuples}, or {@code null} when the threshold is zero
 *         or no pair of values reaches the threshold
 * @throws DataFlowException declared by the overridden method
 */
@Override
public Tuple joinTuples(Tuple innerTuple, Tuple outerTuple, Schema outputSchema) throws DataFlowException {
    // With a threshold of exactly zero no join result is ever produced.
    if (similarityThreshold == 0) {
        return null;
    }

    // Inner side: keep the spans on the join attribute.
    ListField<Span> innerField = innerTuple.getField(SchemaConstants.SPAN_LIST);
    List<Span> innerCandidates = new ArrayList<>();
    for (Span candidate : innerField.getValue()) {
        if (candidate.getAttributeName().equals(innerJoinAttrName)) {
            innerCandidates.add(candidate);
        }
    }

    // Outer side: same filtering with the outer join attribute.
    ListField<Span> outerField = outerTuple.getField(SchemaConstants.SPAN_LIST);
    List<Span> outerCandidates = new ArrayList<>();
    for (Span candidate : outerField.getValue()) {
        if (candidate.getAttributeName().equals(outerJoinAttrName)) {
            outerCandidates.add(candidate);
        }
    }

    // Distinct values per side, because several spans may share one value.
    Set<String> innerValues = new HashSet<>();
    for (Span candidate : innerCandidates) {
        innerValues.add(candidate.getValue());
    }
    Set<String> outerValues = new HashSet<>();
    for (Span candidate : outerCandidates) {
        outerValues.add(candidate.getValue());
    }

    // Every value that takes part in at least one sufficiently similar pair.
    Set<String> similarValues = new HashSet<>();
    for (String left : innerValues) {
        for (String right : outerValues) {
            boolean similarEnough =
                    this.similarityFunc.calculateSimilarity(left, right) >= this.similarityThreshold;
            if (similarEnough) {
                similarValues.add(left);
                similarValues.add(right);
            }
        }
    }
    if (similarValues.isEmpty()) {
        // nothing was similar: no join result
        return null;
    }

    // Emit the qualifying spans, inner ones first, each tagged with its side's prefix.
    List<Span> joinedSpans = new ArrayList<>();
    for (Span candidate : innerCandidates) {
        if (!similarValues.contains(candidate.getValue())) {
            continue;
        }
        joinedSpans.add(addFieldPrefix(candidate, INNER_PREFIX));
    }
    for (Span candidate : outerCandidates) {
        if (!similarValues.contains(candidate.getValue())) {
            continue;
        }
        joinedSpans.add(addFieldPrefix(candidate, OUTER_PREFIX));
    }
    return mergeTuples(innerTuple, outerTuple, outputSchema, joinedSpans);
}
use of edu.uci.ics.textdb.api.field.ListField in project textdb by TextDB.
the class KeywordMatcher method computePhraseMatchingResult.
/**
 * Finds the occurrences of the predicate's query, treated as a phrase, in one tuple.
 *
 * <p>For every attribute named by the predicate:
 * <ul>
 * <li>STRING: one span covering the whole query is emitted when the field value equals
 *     the query exactly.</li>
 * <li>TEXT: the relevant spans of that attribute (taken from the tuple's PAYLOAD field)
 *     are sorted by token offset and scanned; a combined span is emitted for each run of
 *     consecutive spans whose values and relative token offsets match the query tokens.</li>
 * </ul>
 *
 * @param inputTuple tuple to match; its PAYLOAD field supplies the candidate spans
 * @return the matching spans, empty when nothing matches
 * @throws DataFlowException if an attribute is neither STRING nor TEXT
 */
private List<Span> computePhraseMatchingResult(Tuple inputTuple) throws DataFlowException {
    ListField<Span> payloadField = inputTuple.getField(SchemaConstants.PAYLOAD);
    List<Span> payload = payloadField.getValue();
    List<Span> relevantSpans = filterRelevantSpans(payload);
    List<Span> matchingResults = new ArrayList<>();
    for (String attributeName : this.predicate.getAttributeNames()) {
        AttributeType attributeType = this.inputSchema.getAttribute(attributeName).getAttributeType();
        String fieldValue = inputTuple.getField(attributeName).getValue().toString();
        // types other than TEXT and STRING: throw Exception for now
        if (attributeType != AttributeType.STRING && attributeType != AttributeType.TEXT) {
            throw new DataFlowException("KeywordMatcher: Fields other than STRING and TEXT are not supported yet");
        }
        // for STRING type, the query should match the fieldValue completely
        if (attributeType == AttributeType.STRING) {
            if (fieldValue.equals(predicate.getQuery())) {
                matchingResults.add(new Span(attributeName, 0, predicate.getQuery().length(), predicate.getQuery(), fieldValue));
            }
        }
        // phrase query
        if (attributeType == AttributeType.TEXT) {
            List<Span> fieldSpanList = relevantSpans.stream()
                    .filter(span -> span.getAttributeName().equals(attributeName))
                    .collect(Collectors.toList());
            // Skip this attribute unless every query token is present in its spans.
            if (!isAllQueryTokensPresent(fieldSpanList, queryTokenSet)) {
                continue;
            }
            // Sort current field's span list by token offset for later use.
            // Integer.compare is used instead of subtraction so the ordering can never
            // be corrupted by integer overflow.
            Collections.sort(fieldSpanList,
                    (span1, span2) -> Integer.compare(span1.getTokenOffset(), span2.getTokenOffset()));
            // Positions, within the query including stopwords, of the tokens that are
            // also in queryTokenList; used to check the relative distance between
            // consecutive matched tokens.
            List<Integer> queryTokenOffset = new ArrayList<>();
            for (int i = 0; i < queryTokensWithStopwords.size(); i++) {
                if (queryTokenList.contains(queryTokensWithStopwords.get(i))) {
                    queryTokenOffset.add(i);
                }
            }
            // Position, in fieldSpanList, of the span where the current match attempt starts.
            int iter = 0;
            while (iter < fieldSpanList.size()) {
                // Not enough spans left to hold the whole phrase.
                if (iter > fieldSpanList.size() - queryTokenList.size()) {
                    break;
                }
                // Verify that the spans starting at iter correspond to the phrase query:
                // the relative token offsets must equal those of the query tokens and
                // the values must be the same (ignoring case).
                boolean isMismatchInSpan = false;
                // Check every adjacent pair of query tokens.
                for (int i = 0; i < queryTokenList.size() - 1; i++) {
                    Span first = fieldSpanList.get(iter + i);
                    Span second = fieldSpanList.get(iter + i + 1);
                    if (!(second.getTokenOffset() - first.getTokenOffset() == queryTokenOffset.get(i + 1) - queryTokenOffset.get(i)
                            && first.getValue().equalsIgnoreCase(queryTokenList.get(i))
                            && second.getValue().equalsIgnoreCase(queryTokenList.get(i + 1)))) {
                        // Mismatch: retry starting from the next span.
                        iter++;
                        isMismatchInSpan = true;
                        break;
                    }
                }
                if (isMismatchInSpan) {
                    continue;
                }
                // Full match: merge the first and last matched spans into one span.
                int combinedSpanStartIndex = fieldSpanList.get(iter).getStart();
                int combinedSpanEndIndex = fieldSpanList.get(iter + queryTokenList.size() - 1).getEnd();
                Span combinedSpan = new Span(attributeName, combinedSpanStartIndex, combinedSpanEndIndex, predicate.getQuery(), fieldValue.substring(combinedSpanStartIndex, combinedSpanEndIndex));
                matchingResults.add(combinedSpan);
                // Continue after the spans consumed by this match.
                iter = iter + queryTokenList.size();
            }
        }
    }
    return matchingResults;
}
use of edu.uci.ics.textdb.api.field.ListField in project textdb by TextDB.
the class JoinDistancePredicate method joinTuples.
/**
 * This method is called by the Join operator to perform the join on the
 * tuples passed.
 *
 * <p>The two tuples are joined only when their _ID fields and their values of the join
 * attribute compare equal (per {@code compareField}). Every pair of inner/outer spans on
 * the join attribute whose start offsets and end offsets each differ by at most the
 * threshold yields one new span covering both. All other output fields are copied from
 * the inner tuple.
 *
 * @param innerTuple   tuple from the inner side; also the source of the output field values
 * @param outerTuple   tuple from the outer side
 * @param outputSchema schema of the returned tuple
 * @return New Tuple containing the result of join operation, or {@code null} when the
 *         tuples do not agree on _ID or the join attribute, when either tuple has no
 *         span information, or when no pair of spans is within the threshold.
 * @throws Exception declared by the overridden method
 */
@Override
public Tuple joinTuples(Tuple innerTuple, Tuple outerTuple, Schema outputSchema) throws Exception {
    /*
     * We expect the values of all fields to be the same for innerTuple and outerTuple.
     * We only check the _ID field and the field to be joined, since they are crucial
     * to the join operator. For other fields, we use the value from innerTuple.
     */
    if (!compareField(innerTuple, outerTuple, SchemaConstants._ID)) {
        return null;
    }
    if (!compareField(innerTuple, outerTuple, this.joinAttributeName)) {
        return null;
    }

    /*
     * If either/both tuples have no span information, return null so the caller can
     * move on to the next tuple. The guards below avoid dereferencing a missing span
     * field or a missing span list.
     */
    ListField<Span> spanFieldOfInnerTuple = innerTuple.getField(SchemaConstants.SPAN_LIST);
    ListField<Span> spanFieldOfOuterTuple = outerTuple.getField(SchemaConstants.SPAN_LIST);
    if (spanFieldOfInnerTuple == null || spanFieldOfOuterTuple == null) {
        return null;
    }
    List<Span> innerSpanList = spanFieldOfInnerTuple.getValue();
    List<Span> outerSpanList = spanFieldOfOuterTuple.getValue();
    if (innerSpanList == null || outerSpanList == null) {
        return null;
    }

    // The threshold does not change while joining, so read it once.
    Integer threshold = this.getThreshold();
    List<Span> newJoinSpanList = new ArrayList<>();
    for (Span outerSpan : outerSpanList) {
        // Only spans on the join attribute take part in the join.
        if (!outerSpan.getAttributeName().equals(this.joinAttributeName)) {
            continue;
        }
        for (Span innerSpan : innerSpanList) {
            if (!innerSpan.getAttributeName().equals(this.joinAttributeName)) {
                continue;
            }
            if (Math.abs(outerSpan.getStart() - innerSpan.getStart()) <= threshold
                    && Math.abs(outerSpan.getEnd() - innerSpan.getEnd()) <= threshold) {
                // The joined span is the smallest range covering both spans.
                int newSpanStartIndex = Math.min(innerSpan.getStart(), outerSpan.getStart());
                int newSpanEndIndex = Math.max(innerSpan.getEnd(), outerSpan.getEnd());
                String attributeName = this.joinAttributeName;
                String fieldValue = (String) innerTuple.getField(attributeName).getValue();
                String newFieldValue = fieldValue.substring(newSpanStartIndex, newSpanEndIndex);
                String spanKey = outerSpan.getKey() + "_" + innerSpan.getKey();
                newJoinSpanList.add(new Span(attributeName, newSpanStartIndex, newSpanEndIndex, spanKey, newFieldValue));
            }
        }
    }
    if (newJoinSpanList.isEmpty()) {
        return null;
    }

    // Create output fields based on innerTuple's values: every attribute except the
    // span list is copied, then the newly built span list is appended.
    List<Attribute> outputAttrList = outputSchema.getAttributes();
    List<IField> outputFields = outputAttrList.stream()
            .filter(attr -> !attr.equals(SchemaConstants.SPAN_LIST_ATTRIBUTE))
            .map(attr -> attr.getAttributeName())
            .map(attributeName -> innerTuple.getField(attributeName, IField.class))
            .collect(Collectors.toList());
    outputFields.add(new ListField<>(newJoinSpanList));
    return new Tuple(outputSchema, outputFields.stream().toArray(IField[]::new));
}
use of edu.uci.ics.textdb.api.field.ListField in project textdb by TextDB.
the class SpanTupleTest method testGetters.
/**
 * Builds a tuple whose last field is a span list and checks that reading the
 * SPAN_LIST field back yields a {@code ListField} that is the very same instance
 * that was stored.
 */
@Test
public void testGetters() throws ParseException {
    // Schema: the people attributes followed by the span-list attribute.
    int peopleAttributeCount = TestConstants.ATTRIBUTES_PEOPLE.length;
    Attribute[] schemaAttributes = new Attribute[peopleAttributeCount + 1];
    System.arraycopy(TestConstants.ATTRIBUTES_PEOPLE, 0, schemaAttributes, 0, peopleAttributeCount);
    schemaAttributes[peopleAttributeCount] = SchemaConstants.SPAN_LIST_ATTRIBUTE;

    // Data fields, in schema order.
    List<IField> tupleFields = new ArrayList<>();
    tupleFields.add(new StringField("bruce"));
    tupleFields.add(new StringField("lee"));
    tupleFields.add(new IntegerField(46));
    tupleFields.add(new DoubleField(5.50));
    tupleFields.add(new DateField(new SimpleDateFormat("MM-dd-yyyy").parse("01-14-1970")));
    tupleFields.add(new TextField("bruce was born in new york city and was grown up in los angeles"));

    // The span list goes last, matching the schema built above.
    IField storedSpanField = createSpanListField();
    tupleFields.add(storedSpanField);

    spanTuple = new Tuple(new Schema(schemaAttributes), tupleFields.toArray(new IField[0]));

    IField retrievedSpanField = spanTuple.getField(SchemaConstants.SPAN_LIST);
    Assert.assertTrue(retrievedSpanField instanceof ListField);
    Assert.assertSame(storedSpanField, retrievedSpanField);
}
use of edu.uci.ics.textdb.api.field.ListField in project textdb by TextDB.
the class SpanTupleTest method createSpanListField.
/**
 * Creates the span-list field used by the tests: two "LOCATION" spans on the
 * "description" attribute, for "new york" and "los angeles".
 *
 * <p>What the span key holds depends on the operator that produced the span:
 * <ul>
 * <li>RegexMatcher: the regex, e.g. "n.*k"</li>
 * <li>NamedEntityMatcher: the entity type, e.g. LOCATION</li>
 * <li>DictionaryMatcher: the dictionary entry, e.g. "new york" (key and value are the same)</li>
 * <li>KeyWordMatcher: the query, e.g. "new york" (the value can be "new" or "york")</li>
 * </ul>
 *
 * @return a {@code ListField} wrapping the two spans
 */
private IField createSpanListField() {
    List<Span> locationSpans = new ArrayList<>();
    locationSpans.add(new Span("description", 18, 26, "LOCATION", "new york"));
    locationSpans.add(new Span("description", 52, 63, "LOCATION", "los angeles"));
    return new ListField<Span>(locationSpans);
}
Aggregations