Search in sources :

Example 26 with Tuple

use of edu.uci.ics.texera.api.tuple.Tuple in project textdb by TextDB.

the class FuzzyTokenMatcherTest method TestFuzzyTokenMatcher1.

/**
 * Queries PEOPLE_TABLE for "Twelve Angry Men" on the description attribute with a
 * threshold of 0.5 and asserts, via TestUtils.equals, that the results equal the four
 * tuples built below. Each expected tuple carries a single span for the token "angry".
 */
@Test
public void TestFuzzyTokenMatcher1() throws Exception {
    // Expected tuples use the people schema extended with the results attribute.
    Schema resultSchema = new Schema.Builder().add(TestConstants.SCHEMA_PEOPLE).add(RESULTS_ATTR).build();
    SimpleDateFormat dateFormat = new SimpleDateFormat("MM-dd-yyyy");

    List<Span> bruceSpans = Arrays.asList(new Span(TestConstants.DESCRIPTION, 5, 10, "angry", "Angry", 1));
    Tuple bruce = new Tuple(resultSchema, new IField[] {
            new StringField("bruce"), new StringField("john Lee"), new IntegerField(46), new DoubleField(5.50),
            new DateField(dateFormat.parse("01-14-1970")), new TextField("Tall Angry"),
            new ListField<Span>(bruceSpans) });

    List<Span> bradSpans = Arrays.asList(new Span(TestConstants.DESCRIPTION, 6, 11, "angry", "Angry", 1));
    Tuple brad = new Tuple(resultSchema, new IField[] {
            new StringField("brad lie angelina"), new StringField("pitt"), new IntegerField(44), new DoubleField(6.10),
            new DateField(dateFormat.parse("01-12-1972")), new TextField("White Angry"),
            new ListField<Span>(bradSpans) });

    List<Span> georgeSpans = Arrays.asList(new Span(TestConstants.DESCRIPTION, 40, 45, "angry", "Angry", 8));
    Tuple george = new Tuple(resultSchema, new IField[] {
            new StringField("george lin lin"), new StringField("lin clooney"), new IntegerField(43), new DoubleField(6.06),
            new DateField(dateFormat.parse("01-13-1973")), new TextField("Lin Clooney is Short and lin clooney is Angry"),
            new ListField<Span>(georgeSpans) });

    List<Span> marySpans = Arrays.asList(new Span(TestConstants.DESCRIPTION, 6, 11, "angry", "angry", 1));
    Tuple mary = new Tuple(resultSchema, new IField[] {
            new StringField("Mary brown"), new StringField("Lake Forest"), new IntegerField(42), new DoubleField(5.99),
            new DateField(dateFormat.parse("01-13-1974")), new TextField("Short angry"),
            new ListField<Span>(marySpans) });

    List<Tuple> expectedTuples = new ArrayList<>(Arrays.asList(bruce, brad, george, mary));

    // The ratio of tokens that need to be matched
    double matchRatio = 0.5;
    String searchText = "Twelve Angry Men";
    ArrayList<String> searchAttributes = new ArrayList<>();
    searchAttributes.add(TestConstants.DESCRIPTION);

    List<Tuple> actualTuples = FuzzyTokenMatcherTestHelper.getQueryResults(PEOPLE_TABLE, searchText, matchRatio, searchAttributes);
    Assert.assertTrue(TestUtils.equals(expectedTuples, actualTuples));
}
Also used : Schema(edu.uci.ics.texera.api.schema.Schema) ArrayList(java.util.ArrayList) IntegerField(edu.uci.ics.texera.api.field.IntegerField) IField(edu.uci.ics.texera.api.field.IField) Span(edu.uci.ics.texera.api.span.Span) StringField(edu.uci.ics.texera.api.field.StringField) TextField(edu.uci.ics.texera.api.field.TextField) DateField(edu.uci.ics.texera.api.field.DateField) SimpleDateFormat(java.text.SimpleDateFormat) DoubleField(edu.uci.ics.texera.api.field.DoubleField) Tuple(edu.uci.ics.texera.api.tuple.Tuple) Test(org.junit.Test)

Example 27 with Tuple

use of edu.uci.ics.texera.api.tuple.Tuple in project textdb by TextDB.

the class FuzzyTokenMatcherTest method TestFuzzyTokenMatcherWithThresholdVariation.

/**
 * Queries PEOPLE_TABLE for "Twelve Angry Men Cafe" on the description attribute with a
 * threshold of 0.25 and asserts, via TestUtils.equals, that the results equal the four
 * tuples built below. Each expected tuple carries a single span for the token "angry".
 */
@Test
public void TestFuzzyTokenMatcherWithThresholdVariation() throws Exception {
    String searchText = "Twelve Angry Men Cafe";
    double matchRatio = 0.25;
    ArrayList<String> searchAttributes = new ArrayList<>();
    searchAttributes.add(TestConstants.DESCRIPTION);

    // Expected tuples use the people schema extended with the results attribute.
    Schema resultSchema = new Schema.Builder().add(TestConstants.SCHEMA_PEOPLE).add(RESULTS_ATTR).build();
    SimpleDateFormat dateFormat = new SimpleDateFormat("MM-dd-yyyy");

    List<Span> firstSpans = Arrays.asList(new Span(TestConstants.DESCRIPTION, 5, 10, "angry", "Angry", 1));
    IField[] firstRow = {
            new StringField("bruce"), new StringField("john Lee"), new IntegerField(46), new DoubleField(5.50),
            new DateField(dateFormat.parse("01-14-1970")), new TextField("Tall Angry"),
            new ListField<Span>(firstSpans) };

    List<Span> secondSpans = Arrays.asList(new Span(TestConstants.DESCRIPTION, 6, 11, "angry", "Angry", 1));
    IField[] secondRow = {
            new StringField("brad lie angelina"), new StringField("pitt"), new IntegerField(44), new DoubleField(6.10),
            new DateField(dateFormat.parse("01-12-1972")), new TextField("White Angry"),
            new ListField<Span>(secondSpans) };

    List<Span> thirdSpans = Arrays.asList(new Span(TestConstants.DESCRIPTION, 40, 45, "angry", "Angry", 8));
    IField[] thirdRow = {
            new StringField("george lin lin"), new StringField("lin clooney"), new IntegerField(43), new DoubleField(6.06),
            new DateField(dateFormat.parse("01-13-1973")), new TextField("Lin Clooney is Short and lin clooney is Angry"),
            new ListField<Span>(thirdSpans) };

    List<Span> fourthSpans = Arrays.asList(new Span(TestConstants.DESCRIPTION, 6, 11, "angry", "angry", 1));
    IField[] fourthRow = {
            new StringField("Mary brown"), new StringField("Lake Forest"), new IntegerField(42), new DoubleField(5.99),
            new DateField(dateFormat.parse("01-13-1974")), new TextField("Short angry"),
            new ListField<Span>(fourthSpans) };

    List<Tuple> expectedTuples = new ArrayList<>(Arrays.asList(
            new Tuple(resultSchema, firstRow),
            new Tuple(resultSchema, secondRow),
            new Tuple(resultSchema, thirdRow),
            new Tuple(resultSchema, fourthRow)));

    List<Tuple> actualTuples = FuzzyTokenMatcherTestHelper.getQueryResults(PEOPLE_TABLE, searchText, matchRatio, searchAttributes);
    Assert.assertTrue(TestUtils.equals(expectedTuples, actualTuples));
}
Also used : Schema(edu.uci.ics.texera.api.schema.Schema) ArrayList(java.util.ArrayList) IntegerField(edu.uci.ics.texera.api.field.IntegerField) IField(edu.uci.ics.texera.api.field.IField) Span(edu.uci.ics.texera.api.span.Span) StringField(edu.uci.ics.texera.api.field.StringField) TextField(edu.uci.ics.texera.api.field.TextField) DateField(edu.uci.ics.texera.api.field.DateField) SimpleDateFormat(java.text.SimpleDateFormat) DoubleField(edu.uci.ics.texera.api.field.DoubleField) Tuple(edu.uci.ics.texera.api.tuple.Tuple) Test(org.junit.Test)

Example 28 with Tuple

use of edu.uci.ics.texera.api.tuple.Tuple in project textdb by TextDB.

the class FuzzyTokenMatcherTest method TestFuzzyTokenMatcherWithLargeTokens.

/**
 * Queries PEOPLE_TABLE on the description attribute with a long multi-token query and a
 * threshold of 0.02, then asserts, via TestUtils.equals, that the results equal the four
 * tuples built below. Each expected tuple carries a single span for the token "angry".
 */
@Test
public void TestFuzzyTokenMatcherWithLargeTokens() throws Exception {
    // Whitespace inside the query (double space, trailing space) is kept exactly as written.
    String searchText = "Twelve Angry Men Came Cafe Have Coffee Eat Chocolate Burger Fries SandWidch Cool Food Drinks American drama film elements film noir adapted teleplay same name Reginald Rose Written Rose directed  Sidney Lumet trial film tells story jury made deliberate guilt acquittal defendant basis reasonable doubt United States verdict most criminal ";
    double matchRatio = 0.02;
    ArrayList<String> searchAttributes = new ArrayList<>();
    searchAttributes.add(TestConstants.DESCRIPTION);

    // Expected tuples use the people schema extended with the results attribute.
    Schema resultSchema = new Schema.Builder().add(TestConstants.SCHEMA_PEOPLE).add(RESULTS_ATTR).build();
    SimpleDateFormat dateFormat = new SimpleDateFormat("MM-dd-yyyy");
    List<Tuple> expectedTuples = new ArrayList<>();

    List<Span> spansA = Arrays.asList(new Span(TestConstants.DESCRIPTION, 5, 10, "angry", "Angry", 1));
    IField[] rowA = {
            new StringField("bruce"), new StringField("john Lee"), new IntegerField(46), new DoubleField(5.50),
            new DateField(dateFormat.parse("01-14-1970")), new TextField("Tall Angry"),
            new ListField<Span>(spansA) };

    List<Span> spansB = Arrays.asList(new Span(TestConstants.DESCRIPTION, 6, 11, "angry", "Angry", 1));
    IField[] rowB = {
            new StringField("brad lie angelina"), new StringField("pitt"), new IntegerField(44), new DoubleField(6.10),
            new DateField(dateFormat.parse("01-12-1972")), new TextField("White Angry"),
            new ListField<Span>(spansB) };

    List<Span> spansC = Arrays.asList(new Span(TestConstants.DESCRIPTION, 40, 45, "angry", "Angry", 8));
    IField[] rowC = {
            new StringField("george lin lin"), new StringField("lin clooney"), new IntegerField(43), new DoubleField(6.06),
            new DateField(dateFormat.parse("01-13-1973")), new TextField("Lin Clooney is Short and lin clooney is Angry"),
            new ListField<Span>(spansC) };

    List<Span> spansD = Arrays.asList(new Span(TestConstants.DESCRIPTION, 6, 11, "angry", "angry", 1));
    IField[] rowD = {
            new StringField("Mary brown"), new StringField("Lake Forest"), new IntegerField(42), new DoubleField(5.99),
            new DateField(dateFormat.parse("01-13-1974")), new TextField("Short angry"),
            new ListField<Span>(spansD) };

    expectedTuples.add(new Tuple(resultSchema, rowA));
    expectedTuples.add(new Tuple(resultSchema, rowB));
    expectedTuples.add(new Tuple(resultSchema, rowC));
    expectedTuples.add(new Tuple(resultSchema, rowD));

    List<Tuple> actualTuples = FuzzyTokenMatcherTestHelper.getQueryResults(PEOPLE_TABLE, searchText, matchRatio, searchAttributes);
    boolean sameTuples = TestUtils.equals(expectedTuples, actualTuples);
    Assert.assertTrue(sameTuples);
}
Also used : Schema(edu.uci.ics.texera.api.schema.Schema) ArrayList(java.util.ArrayList) IntegerField(edu.uci.ics.texera.api.field.IntegerField) IField(edu.uci.ics.texera.api.field.IField) Span(edu.uci.ics.texera.api.span.Span) StringField(edu.uci.ics.texera.api.field.StringField) TextField(edu.uci.ics.texera.api.field.TextField) DateField(edu.uci.ics.texera.api.field.DateField) SimpleDateFormat(java.text.SimpleDateFormat) DoubleField(edu.uci.ics.texera.api.field.DoubleField) Tuple(edu.uci.ics.texera.api.tuple.Tuple) Test(org.junit.Test)

Example 29 with Tuple

use of edu.uci.ics.texera.api.tuple.Tuple in project textdb by TextDB.

the class FuzzyTokenMatcherTest method TestFuzzyTokenMatcherWithLimit.

/**
 * Queries PEOPLE_TABLE for "Twelve Angry Men" on the description attribute with a
 * threshold of 0.5 and two extra helper arguments (2, 0) — presumably limit and offset;
 * confirm against FuzzyTokenMatcherTestHelper.getQueryResults.
 *
 * <p>Asserts that exactly two tuples come back and that TestUtils.containsAll accepts them
 * against the four candidate tuples built below.
 */
@Test
public void TestFuzzyTokenMatcherWithLimit() throws Exception {
    String query = "Twelve Angry Men";
    // The ratio of tokens that need to be matched
    double threshold = 0.5;
    ArrayList<String> attributeNames = new ArrayList<>();
    attributeNames.add(TestConstants.DESCRIPTION);
    // Candidate tuples use the people schema extended with the results attribute.
    Schema schema = new Schema.Builder().add(TestConstants.SCHEMA_PEOPLE).add(RESULTS_ATTR).build();
    // A fresh list is created per tuple so no two ListFields share a backing list.
    List<Span> list = new ArrayList<>();
    Span span = new Span(TestConstants.DESCRIPTION, 5, 10, "angry", "Angry", 1);
    list.add(span);
    IField[] fields1 = { new StringField("bruce"), new StringField("john Lee"), new IntegerField(46), new DoubleField(5.50), new DateField(new SimpleDateFormat("MM-dd-yyyy").parse("01-14-1970")), new TextField("Tall Angry"), new ListField<>(list) };
    list = new ArrayList<>();
    span = new Span(TestConstants.DESCRIPTION, 6, 11, "angry", "Angry", 1);
    list.add(span);
    IField[] fields2 = { new StringField("brad lie angelina"), new StringField("pitt"), new IntegerField(44), new DoubleField(6.10), new DateField(new SimpleDateFormat("MM-dd-yyyy").parse("01-12-1972")), new TextField("White Angry"), new ListField<>(list) };
    list = new ArrayList<>();
    span = new Span(TestConstants.DESCRIPTION, 40, 45, "angry", "Angry", 8);
    list.add(span);
    IField[] fields3 = { new StringField("george lin lin"), new StringField("lin clooney"), new IntegerField(43), new DoubleField(6.06), new DateField(new SimpleDateFormat("MM-dd-yyyy").parse("01-13-1973")), new TextField("Lin Clooney is Short and lin clooney is Angry"), new ListField<>(list) };
    list = new ArrayList<>();
    span = new Span(TestConstants.DESCRIPTION, 6, 11, "angry", "angry", 1);
    list.add(span);
    IField[] fields4 = { new StringField("Mary brown"), new StringField("Lake Forest"), new IntegerField(42), new DoubleField(5.99), new DateField(new SimpleDateFormat("MM-dd-yyyy").parse("01-13-1974")), new TextField("Short angry"), new ListField<>(list) };
    Tuple tuple1 = new Tuple(schema, fields1);
    Tuple tuple2 = new Tuple(schema, fields2);
    Tuple tuple3 = new Tuple(schema, fields3);
    Tuple tuple4 = new Tuple(schema, fields4);
    List<Tuple> expectedResultList = new ArrayList<>();
    expectedResultList.add(tuple1);
    expectedResultList.add(tuple2);
    expectedResultList.add(tuple3);
    expectedResultList.add(tuple4);
    List<Tuple> results = FuzzyTokenMatcherTestHelper.getQueryResults(PEOPLE_TABLE, query, threshold, attributeNames, 2, 0);
    // JUnit's contract is assertEquals(expected, actual); keeping that order makes a
    // failure message report the measured size as the "actual" value.
    Assert.assertEquals(4, expectedResultList.size());
    Assert.assertEquals(2, results.size());
    Assert.assertTrue(TestUtils.containsAll(expectedResultList, results));
}
Also used : Schema(edu.uci.ics.texera.api.schema.Schema) ArrayList(java.util.ArrayList) IntegerField(edu.uci.ics.texera.api.field.IntegerField) IField(edu.uci.ics.texera.api.field.IField) Span(edu.uci.ics.texera.api.span.Span) StringField(edu.uci.ics.texera.api.field.StringField) TextField(edu.uci.ics.texera.api.field.TextField) DateField(edu.uci.ics.texera.api.field.DateField) SimpleDateFormat(java.text.SimpleDateFormat) DoubleField(edu.uci.ics.texera.api.field.DoubleField) Tuple(edu.uci.ics.texera.api.tuple.Tuple) Test(org.junit.Test)

Example 30 with Tuple

use of edu.uci.ics.texera.api.tuple.Tuple in project textdb by TextDB.

the class JoinDistanceTest method testSpansOverlapAndWithinThreshold.

/*
 * Covers the case where the two spans to be joined overlap and both
 * |(span 1 spanStartIndex) - (span 2 spanStartIndex)| and
 * |(span 1 spanEndIndex) - (span 2 spanEndIndex)| are within the threshold.
 *
 * e.g.
 * [<75, 97>]
 * [<92, 109>]
 * threshold = 20 (within threshold)
 * result: [<75, 109>]
 *
 * Expected outcome: the result list holds one tuple with all the fields plus a
 * span list containing the joined span. The joined span consists of the field
 * name, the start and end index (<min(span1 spanStartIndex, span2 spanStartIndex),
 * max(span1 spanEndIndex, span2 spanEndIndex)>), a key (combination of the span1
 * and span2 keys) and a value (combination of the span1 and span2 values).
 */
@Test
public void testSpansOverlapAndWithinThreshold() throws Exception {
    JoinTestHelper.insertToTable(BOOK_TABLE, JoinTestConstants.bookGroup1.get(0));

    KeywordMatcherSourceOperator outerSource = JoinTestHelper.getKeywordSource(BOOK_TABLE, "gastrointestinal tract", phrase);
    KeywordMatcherSourceOperator innerSource = JoinTestHelper.getKeywordSource(BOOK_TABLE, "tract interesting", phrase);
    JoinDistancePredicate distancePredicate = new JoinDistancePredicate(JoinTestConstants.REVIEW, 20);
    List<Tuple> actualTuples = JoinTestHelper.getJoinDistanceResults(innerSource, outerSource, distancePredicate, Integer.MAX_VALUE, 0);

    // Expected tuple: the book schema plus the span-list attribute holding the joined span.
    Schema joinedSchema = new Schema.Builder().add(JoinTestConstants.BOOK_SCHEMA).add(SchemaConstants.SPAN_LIST_ATTRIBUTE).build();
    String joinedKey = "gastrointestinal tract_" + "tract interesting";
    String joinedValue = "gastrointestinal " + "tract interesting";
    List<Span> joinedSpans = new ArrayList<>();
    joinedSpans.add(new Span(JoinTestConstants.REVIEW, 75, 109, joinedKey, joinedValue));

    IField[] bookFields = {
            new IntegerField(52),
            new StringField("Mary Roach"),
            new StringField("Grunt: The Curious Science of Humans at War"),
            new IntegerField(288),
            new TextField("It takes a special kind "
                    + "of writer to make topics ranging from death to our "
                    + "gastrointestinal tract interesting (sometimes "
                    + "hilariously so), and pop science writer Mary Roach is "
                    + "always up to the task."),
            new ListField<>(joinedSpans) };

    List<Tuple> expectedTuples = new ArrayList<>();
    expectedTuples.add(new Tuple(joinedSchema, bookFields));

    Assert.assertEquals(1, actualTuples.size());
    Assert.assertTrue(TestUtils.equals(expectedTuples, actualTuples));
}
Also used : Schema(edu.uci.ics.texera.api.schema.Schema) ArrayList(java.util.ArrayList) IntegerField(edu.uci.ics.texera.api.field.IntegerField) IField(edu.uci.ics.texera.api.field.IField) JoinDistancePredicate(edu.uci.ics.texera.dataflow.join.JoinDistancePredicate) Span(edu.uci.ics.texera.api.span.Span) KeywordMatcherSourceOperator(edu.uci.ics.texera.dataflow.keywordmatcher.KeywordMatcherSourceOperator) StringField(edu.uci.ics.texera.api.field.StringField) TextField(edu.uci.ics.texera.api.field.TextField) Tuple(edu.uci.ics.texera.api.tuple.Tuple) Test(org.junit.Test)

Aggregations

Tuple (edu.uci.ics.texera.api.tuple.Tuple): 280, ArrayList (java.util.ArrayList): 167, Test (org.junit.Test): 158, Schema (edu.uci.ics.texera.api.schema.Schema): 108, IField (edu.uci.ics.texera.api.field.IField): 106, Span (edu.uci.ics.texera.api.span.Span): 99, TextField (edu.uci.ics.texera.api.field.TextField): 90, StringField (edu.uci.ics.texera.api.field.StringField): 83, IntegerField (edu.uci.ics.texera.api.field.IntegerField): 80, Attribute (edu.uci.ics.texera.api.schema.Attribute): 75, DoubleField (edu.uci.ics.texera.api.field.DoubleField): 59, DateField (edu.uci.ics.texera.api.field.DateField): 55, SimpleDateFormat (java.text.SimpleDateFormat): 53, DataWriter (edu.uci.ics.texera.storage.DataWriter): 32, Dictionary (edu.uci.ics.texera.dataflow.dictionarymatcher.Dictionary): 30, ListField (edu.uci.ics.texera.api.field.ListField): 27, ScanBasedSourceOperator (edu.uci.ics.texera.dataflow.source.scan.ScanBasedSourceOperator): 21, ScanSourcePredicate (edu.uci.ics.texera.dataflow.source.scan.ScanSourcePredicate): 21, KeywordMatcherSourceOperator (edu.uci.ics.texera.dataflow.keywordmatcher.KeywordMatcherSourceOperator): 20, RelationManager (edu.uci.ics.texera.storage.RelationManager): 19