use of edu.uci.ics.textdb.api.span.Span in project textdb by TextDB.
the class DictionaryMatcherTest method testMultipleWordsQueryUsingPhrase.
/**
 * Scenario S-10: checks the tuples produced by the dictionary matcher for a
 * multi-word dictionary entry when the PHRASE_INDEXBASED matching type is used.
 *
 * The dictionary holds the single entry "george lin lin"; the only expected
 * result is the matching people record, carrying one span on "firstName"
 * that covers characters 0 to 14.
 */
@Test
public void testMultipleWordsQueryUsingPhrase() throws Exception {
    // dictionary with one multi-word entry
    Dictionary dictionary = new Dictionary(new ArrayList<String>(Arrays.asList("george lin lin")));

    // output schema = people attributes followed by the results attribute
    int peopleAttributeCount = TestConstants.ATTRIBUTES_PEOPLE.length;
    Attribute[] resultSchemaAttributes = new Attribute[peopleAttributeCount + 1];
    System.arraycopy(TestConstants.ATTRIBUTES_PEOPLE, 0, resultSchemaAttributes, 0, peopleAttributeCount);
    resultSchemaAttributes[peopleAttributeCount] = RESULTS_ATTRIBUTE;

    // the span the matcher is expected to report
    List<Span> expectedSpans = new ArrayList<Span>();
    expectedSpans.add(new Span("firstName", 0, 14, "george lin lin", "george lin lin"));

    // the expected data tuple
    IField[] expectedFields = new IField[] {
            new StringField("george lin lin"),
            new StringField("lin clooney"),
            new IntegerField(43),
            new DoubleField(6.06),
            new DateField(new SimpleDateFormat("MM-dd-yyyy").parse("01-13-1973")),
            new TextField("Lin Clooney is Short and lin clooney is Angry"),
            new ListField<Span>(expectedSpans)
    };
    List<Tuple> expectedResults = new ArrayList<Tuple>();
    expectedResults.add(new Tuple(new Schema(resultSchemaAttributes), expectedFields));

    // run the query over first name, last name and description
    List<String> attributeNames = Arrays.asList(
            TestConstants.FIRST_NAME, TestConstants.LAST_NAME, TestConstants.DESCRIPTION);
    List<Tuple> returnedResults = DictionaryMatcherTestHelper.getQueryResults(
            PEOPLE_TABLE, dictionary, attributeNames, KeywordMatchingType.PHRASE_INDEXBASED);

    Assert.assertTrue(TestUtils.equals(expectedResults, returnedResults));
}
use of edu.uci.ics.textdb.api.span.Span in project textdb by TextDB.
the class DictionaryMatcherTest method testMatchingWithLimit.
/**
 * Verifies that the dictionary matcher honours a result limit.
 *
 * The dictionary holds the single word "angry", which is searched in first
 * name, last name and description with PHRASE_INDEXBASED matching. Four
 * people records are listed as possible matches; the query is issued with
 * the trailing arguments (3, 0) and must return exactly three tuples, each
 * of which has to be one of the four expected ones.
 * NOTE(review): the trailing (3, 0) arguments are presumably (limit, offset) -
 * confirm against DictionaryMatcherTestHelper.getQueryResults.
 */
@Test
public void testMatchingWithLimit() throws Exception {
    ArrayList<String> word = new ArrayList<String>(Arrays.asList("angry"));
    Dictionary dictionary = new Dictionary(word);

    // output schema = people attributes followed by the results attribute
    Attribute[] schemaAttributes = new Attribute[TestConstants.ATTRIBUTES_PEOPLE.length + 1];
    for (int count = 0; count < schemaAttributes.length - 1; count++) {
        schemaAttributes[count] = TestConstants.ATTRIBUTES_PEOPLE[count];
    }
    schemaAttributes[schemaAttributes.length - 1] = RESULTS_ATTRIBUTE;

    // one expected span per candidate record, all located in "description"
    Span span1 = new Span("description", 5, 10, "angry", "Angry", 1);
    Span span2 = new Span("description", 6, 11, "angry", "Angry", 1);
    Span span3 = new Span("description", 40, 45, "angry", "Angry", 8);
    Span span4 = new Span("description", 6, 11, "angry", "angry", 1);

    List<Span> list1 = new ArrayList<>();
    list1.add(span1);
    IField[] fields1 = { new StringField("bruce"), new StringField("john Lee"), new IntegerField(46),
            new DoubleField(5.50), new DateField(new SimpleDateFormat("MM-dd-yyyy").parse("01-14-1970")),
            new TextField("Tall Angry"), new ListField<>(list1) };

    List<Span> list2 = new ArrayList<>();
    list2.add(span2);
    IField[] fields2 = { new StringField("brad lie angelina"), new StringField("pitt"), new IntegerField(44),
            new DoubleField(6.10), new DateField(new SimpleDateFormat("MM-dd-yyyy").parse("01-12-1972")),
            new TextField("White Angry"), new ListField<>(list2) };

    List<Span> list3 = new ArrayList<>();
    list3.add(span3);
    IField[] fields3 = { new StringField("george lin lin"), new StringField("lin clooney"), new IntegerField(43),
            new DoubleField(6.06), new DateField(new SimpleDateFormat("MM-dd-yyyy").parse("01-13-1973")),
            new TextField("Lin Clooney is Short and lin clooney is Angry"), new ListField<>(list3) };

    List<Span> list4 = new ArrayList<>();
    list4.add(span4);
    IField[] fields4 = { new StringField("Mary brown"), new StringField("Lake Forest"), new IntegerField(42),
            new DoubleField(5.99), new DateField(new SimpleDateFormat("MM-dd-yyyy").parse("01-13-1974")),
            new TextField("Short angry"), new ListField<>(list4) };

    Tuple tuple1 = new Tuple(new Schema(schemaAttributes), fields1);
    Tuple tuple2 = new Tuple(new Schema(schemaAttributes), fields2);
    Tuple tuple3 = new Tuple(new Schema(schemaAttributes), fields3);
    Tuple tuple4 = new Tuple(new Schema(schemaAttributes), fields4);

    List<String> attributeNames = Arrays.asList(
            TestConstants.FIRST_NAME, TestConstants.LAST_NAME, TestConstants.DESCRIPTION);
    List<Tuple> expectedList = new ArrayList<>();
    List<Tuple> resultList = DictionaryMatcherTestHelper.getQueryResults(
            PEOPLE_TABLE, dictionary, attributeNames, KeywordMatchingType.PHRASE_INDEXBASED, 3, 0);

    expectedList.add(tuple1);
    expectedList.add(tuple2);
    expectedList.add(tuple3);
    expectedList.add(tuple4);

    // JUnit's contract is assertEquals(expected, actual); keeping that order
    // makes failure messages report the right values.
    Assert.assertEquals(4, expectedList.size());
    Assert.assertEquals(3, resultList.size());
    // which three of the four candidates come back is not fixed, so only
    // membership in the expected set is checked
    Assert.assertTrue(expectedList.containsAll(resultList));
}
use of edu.uci.ics.textdb.api.span.Span in project textdb by TextDB.
the class DictionaryMatcherTest method testSingleWordQueryInStringFieldUsingPhrase.
/**
 * Scenario: checks the tuples produced by the dictionary matcher when
 * dictionary entries hit String fields and the PHRASE_INDEXBASED matching
 * type is used.
 *
 * The dictionary holds "john Lee" and "bruce". Both entries hit the same
 * people record, so two result tuples are expected: one whose span list
 * points at "lastName" (0 to 8) and one whose span list points at
 * "firstName" (0 to 5).
 */
@Test
public void testSingleWordQueryInStringFieldUsingPhrase() throws Exception {
    Dictionary dictionary = new Dictionary(new ArrayList<String>(Arrays.asList("john Lee", "bruce")));

    // output schema = people attributes followed by the results attribute
    int peopleAttributeCount = TestConstants.ATTRIBUTES_PEOPLE.length;
    Attribute[] resultSchemaAttributes = new Attribute[peopleAttributeCount + 1];
    System.arraycopy(TestConstants.ATTRIBUTES_PEOPLE, 0, resultSchemaAttributes, 0, peopleAttributeCount);
    resultSchemaAttributes[peopleAttributeCount] = RESULTS_ATTRIBUTE;

    // expected spans: one per dictionary entry
    List<Span> lastNameSpans = new ArrayList<Span>();
    lastNameSpans.add(new Span("lastName", 0, 8, "john Lee", "john Lee"));
    List<Span> firstNameSpans = new ArrayList<Span>();
    firstNameSpans.add(new Span("firstName", 0, 5, "bruce", "bruce"));

    // same record twice, differing only in the attached span list
    IField[] lastNameMatchFields = new IField[] {
            new StringField("bruce"),
            new StringField("john Lee"),
            new IntegerField(46),
            new DoubleField(5.50),
            new DateField(new SimpleDateFormat("MM-dd-yyyy").parse("01-14-1970")),
            new TextField("Tall Angry"),
            new ListField<Span>(lastNameSpans)
    };
    IField[] firstNameMatchFields = new IField[] {
            new StringField("bruce"),
            new StringField("john Lee"),
            new IntegerField(46),
            new DoubleField(5.50),
            new DateField(new SimpleDateFormat("MM-dd-yyyy").parse("01-14-1970")),
            new TextField("Tall Angry"),
            new ListField<Span>(firstNameSpans)
    };

    List<Tuple> expectedResults = new ArrayList<Tuple>();
    expectedResults.add(new Tuple(new Schema(resultSchemaAttributes), lastNameMatchFields));
    expectedResults.add(new Tuple(new Schema(resultSchemaAttributes), firstNameMatchFields));

    // run the query over first name, last name and description
    List<String> attributeNames = Arrays.asList(
            TestConstants.FIRST_NAME, TestConstants.LAST_NAME, TestConstants.DESCRIPTION);
    List<Tuple> returnedResults = DictionaryMatcherTestHelper.getQueryResults(
            PEOPLE_TABLE, dictionary, attributeNames, KeywordMatchingType.PHRASE_INDEXBASED);

    Assert.assertTrue(TestUtils.equals(expectedResults, returnedResults));
}
use of edu.uci.ics.textdb.api.span.Span in project textdb by TextDB.
the class SimilarityJoinPredicate method mergeTuples.
/**
 * Builds the joined tuple for one inner/outer pair.
 *
 * The output fields are produced in the order of the attribute names of
 * {@code outputSchema}:
 * <ul>
 * <li>the _ID attribute gets a freshly generated random UUID;</li>
 * <li>the SPAN_LIST attribute gets {@code mergeSpanList};</li>
 * <li>the PAYLOAD attribute gets the inner payload followed by the outer
 * payload, each span passed through {@code addFieldPrefix} with the
 * matching prefix;</li>
 * <li>an attribute starting with INNER_PREFIX / OUTER_PREFIX gets the field
 * of the inner / outer tuple whose name is the attribute name without
 * that prefix.</li>
 * </ul>
 * An attribute that matches none of the above contributes no field.
 *
 * @param innerTuple    tuple from the inner side of the join
 * @param outerTuple    tuple from the outer side of the join
 * @param outputSchema  schema of the joined tuple; drives the field order
 * @param mergeSpanList span list to store in the joined tuple
 * @return the joined tuple
 */
private Tuple mergeTuples(Tuple innerTuple, Tuple outerTuple, Schema outputSchema, List<Span> mergeSpanList) {
    List<IField> resultFields = new ArrayList<>();
    for (String attrName : outputSchema.getAttributeNames()) {
        if (attrName.equals(SchemaConstants._ID)) {
            // generate a new _ID field for this tuple
            resultFields.add(new IDField(UUID.randomUUID().toString()));
        } else if (attrName.equals(SchemaConstants.SPAN_LIST)) {
            // use the generated spanList
            resultFields.add(new ListField<Span>(mergeSpanList));
        } else if (attrName.equals(SchemaConstants.PAYLOAD)) {
            // put the payload of two tuples together
            ListField<Span> innerPayloadField = innerTuple.getField(SchemaConstants.PAYLOAD);
            List<Span> innerPayload = innerPayloadField.getValue();
            ListField<Span> outerPayloadField = outerTuple.getField(SchemaConstants.PAYLOAD);
            List<Span> outerPayload = outerPayloadField.getValue();

            List<Span> resultPayload = new ArrayList<>();
            resultPayload.addAll(innerPayload.stream()
                    .map(span -> addFieldPrefix(span, INNER_PREFIX))
                    .collect(Collectors.toList()));
            // OUTER_PREFIX is the same constant used to strip outer attribute
            // names below, so payload spans stay aligned with the schema names
            resultPayload.addAll(outerPayload.stream()
                    .map(span -> addFieldPrefix(span, OUTER_PREFIX))
                    .collect(Collectors.toList()));
            // the merged payload must be stored; otherwise the field array is
            // one element short of the schema
            resultFields.add(new ListField<Span>(resultPayload));
        } else if (attrName.startsWith(INNER_PREFIX)) {
            // add other fields from the inner tuple
            resultFields.add(innerTuple.getField(attrName.substring(INNER_PREFIX.length())));
        } else if (attrName.startsWith(OUTER_PREFIX)) {
            // add other fields from the outer tuple
            resultFields.add(outerTuple.getField(attrName.substring(OUTER_PREFIX.length())));
        }
    }
    return new Tuple(outputSchema, resultFields.stream().toArray(IField[]::new));
}
use of edu.uci.ics.textdb.api.span.Span in project textdb by TextDB.
the class SimilarityJoinPredicate method joinTuples.
/**
 * Joins one inner and one outer tuple on the similarity of their span values.
 *
 * Only spans whose attribute name equals the configured inner / outer join
 * attribute are considered. Every inner value is compared with every outer
 * value; a pair whose similarity reaches the threshold marks both values as
 * matched. The joined tuple carries all considered spans with a matched
 * value, inner spans first, each passed through {@code addFieldPrefix}.
 *
 * @return the joined tuple, or {@code null} when the threshold is 0 or no
 *         pair of values is similar enough
 */
@Override
public Tuple joinTuples(Tuple innerTuple, Tuple outerTuple, Schema outputSchema) throws DataFlowException {
    if (similarityThreshold == 0) {
        return null;
    }

    // keep only the spans that belong to the join attribute of each side
    ListField<Span> innerField = innerTuple.getField(SchemaConstants.SPAN_LIST);
    List<Span> innerCandidates = new ArrayList<>();
    for (Span candidate : innerField.getValue()) {
        if (candidate.getAttributeName().equals(innerJoinAttrName)) {
            innerCandidates.add(candidate);
        }
    }
    ListField<Span> outerField = outerTuple.getField(SchemaConstants.SPAN_LIST);
    List<Span> outerCandidates = new ArrayList<>();
    for (Span candidate : outerField.getValue()) {
        if (candidate.getAttributeName().equals(outerJoinAttrName)) {
            outerCandidates.add(candidate);
        }
    }

    // distinct values per side (several spans may share one value)
    Set<String> innerValues = new HashSet<>();
    for (Span candidate : innerCandidates) {
        innerValues.add(candidate.getValue());
    }
    Set<String> outerValues = new HashSet<>();
    for (Span candidate : outerCandidates) {
        outerValues.add(candidate.getValue());
    }

    // collect every value that takes part in at least one similar pair
    Set<String> matchedValues = new HashSet<>();
    for (String innerValue : innerValues) {
        for (String outerValue : outerValues) {
            if (this.similarityFunc.calculateSimilarity(innerValue, outerValue) >= this.similarityThreshold) {
                matchedValues.add(innerValue);
                matchedValues.add(outerValue);
            }
        }
    }

    // nothing similar: no join result
    if (matchedValues.isEmpty()) {
        return null;
    }

    // prefixed copies of the matching spans, inner side first
    List<Span> joinedSpans = new ArrayList<>();
    joinedSpans.addAll(innerCandidates.stream()
            .filter(candidate -> matchedValues.contains(candidate.getValue()))
            .map(candidate -> addFieldPrefix(candidate, INNER_PREFIX))
            .collect(Collectors.toList()));
    joinedSpans.addAll(outerCandidates.stream()
            .filter(candidate -> matchedValues.contains(candidate.getValue()))
            .map(candidate -> addFieldPrefix(candidate, OUTER_PREFIX))
            .collect(Collectors.toList()));

    return mergeTuples(innerTuple, outerTuple, outputSchema, joinedSpans);
}
Aggregations