Search in sources :

Example 36 with StringField

use of edu.uci.ics.texera.api.field.StringField in project textdb by TextDB.

the class DictionaryMatcherTest method testSingleWordQueryInTextFieldUsingPhraseChinese.

/**
 * Scenario: queries the Chinese people table with a one-entry dictionary
 * ("北京大学") through DictionaryMatcher, using the PHRASE_INDEXBASED keyword
 * matching type over the first name, last name and description attributes.
 * The test passes when TestUtils.equals reports that the returned tuples
 * match the two expected tuples, each of which carries one span covering
 * offsets 0 to 4 of the description.
 */
@Test
public void testSingleWordQueryInTextFieldUsingPhraseChinese() throws Exception {
    Dictionary dictionary = new Dictionary(new ArrayList<String>(Arrays.asList("北京大学")));

    // Expected schema: every people attribute followed by the results attribute.
    int peopleAttributeCount = TestConstantsChinese.ATTRIBUTES_PEOPLE.length;
    Attribute[] schemaAttributes = new Attribute[peopleAttributeCount + 1];
    System.arraycopy(TestConstantsChinese.ATTRIBUTES_PEOPLE, 0, schemaAttributes, 0, peopleAttributeCount);
    schemaAttributes[peopleAttributeCount] = RESULTS_ATTRIBUTE;

    // Both expected tuples share the same span list: the dictionary entry is
    // the first four characters of each description.
    List<Span> matchedSpans = new ArrayList<Span>();
    matchedSpans.add(new Span("description", 0, 4, "北京大学", "北京大学"));

    SimpleDateFormat dateFormat = new SimpleDateFormat("MM-dd-yyyy");
    IField[] firstPersonFields = {
            new StringField("无忌"),
            new StringField("长孙"),
            new IntegerField(46),
            new DoubleField(5.50),
            new DateField(dateFormat.parse("01-14-1970")),
            new TextField("北京大学电气工程学院"),
            new ListField<Span>(matchedSpans) };
    IField[] secondPersonFields = {
            new StringField("孔明"),
            new StringField("洛克贝尔"),
            new IntegerField(42),
            new DoubleField(5.99),
            new DateField(dateFormat.parse("01-13-1974")),
            new TextField("北京大学计算机学院"),
            new ListField<Span>(matchedSpans) };

    List<Tuple> expectedResults = new ArrayList<Tuple>();
    expectedResults.add(new Tuple(new Schema(schemaAttributes), firstPersonFields));
    expectedResults.add(new Tuple(new Schema(schemaAttributes), secondPersonFields));

    List<String> attributeNames = Arrays.asList(
            TestConstantsChinese.FIRST_NAME,
            TestConstantsChinese.LAST_NAME,
            TestConstantsChinese.DESCRIPTION);
    List<Tuple> returnedResults = DictionaryMatcherTestHelper.getQueryResults(
            CHINESE_TABLE, dictionary, attributeNames, KeywordMatchingType.PHRASE_INDEXBASED);

    Assert.assertTrue(TestUtils.equals(expectedResults, returnedResults));
}
Also used : Dictionary(edu.uci.ics.texera.dataflow.dictionarymatcher.Dictionary) Attribute(edu.uci.ics.texera.api.schema.Attribute) Schema(edu.uci.ics.texera.api.schema.Schema) ArrayList(java.util.ArrayList) IntegerField(edu.uci.ics.texera.api.field.IntegerField) IField(edu.uci.ics.texera.api.field.IField) Span(edu.uci.ics.texera.api.span.Span) StringField(edu.uci.ics.texera.api.field.StringField) TextField(edu.uci.ics.texera.api.field.TextField) DateField(edu.uci.ics.texera.api.field.DateField) SimpleDateFormat(java.text.SimpleDateFormat) DoubleField(edu.uci.ics.texera.api.field.DoubleField) Tuple(edu.uci.ics.texera.api.tuple.Tuple) Test(org.junit.Test)

Example 37 with StringField

use of edu.uci.ics.texera.api.field.StringField in project textdb by TextDB.

the class FuzzyTokenMatcherTest method TestFuzzyTokenMatcherWithLargeTokens.

/**
 * Runs the fuzzy token matcher over the description attribute of the people
 * table with a long, many-token query and a threshold of 0.02. The test
 * passes when TestUtils.equals reports that the results match the four
 * expected tuples, each carrying a single span for the "angry" token.
 */
@Test
public void TestFuzzyTokenMatcherWithLargeTokens() throws Exception {
    String query = "Twelve Angry Men Came Cafe Have Coffee Eat Chocolate Burger Fries SandWidch Cool Food Drinks American drama film elements film noir adapted teleplay same name Reginald Rose Written Rose directed  Sidney Lumet trial film tells story jury made deliberate guilt acquittal defendant basis reasonable doubt United States verdict most criminal ";
    double threshold = 0.02;

    ArrayList<String> attributeNames = new ArrayList<>();
    attributeNames.add(TestConstants.DESCRIPTION);

    Schema schema = new Schema.Builder().add(TestConstants.SCHEMA_PEOPLE).add(RESULTS_ATTR).build();
    SimpleDateFormat dateFormat = new SimpleDateFormat("MM-dd-yyyy");

    // Expected tuple 1: "Tall Angry"
    List<Span> bruceSpans = Arrays.asList(new Span(TestConstants.DESCRIPTION, 5, 10, "angry", "Angry", 1));
    IField[] bruceFields = {
            new StringField("bruce"),
            new StringField("john Lee"),
            new IntegerField(46),
            new DoubleField(5.50),
            new DateField(dateFormat.parse("01-14-1970")),
            new TextField("Tall Angry"),
            new ListField<Span>(bruceSpans) };

    // Expected tuple 2: "White Angry"
    List<Span> bradSpans = Arrays.asList(new Span(TestConstants.DESCRIPTION, 6, 11, "angry", "Angry", 1));
    IField[] bradFields = {
            new StringField("brad lie angelina"),
            new StringField("pitt"),
            new IntegerField(44),
            new DoubleField(6.10),
            new DateField(dateFormat.parse("01-12-1972")),
            new TextField("White Angry"),
            new ListField<Span>(bradSpans) };

    // Expected tuple 3: the match is the last token of a longer description
    List<Span> georgeSpans = Arrays.asList(new Span(TestConstants.DESCRIPTION, 40, 45, "angry", "Angry", 8));
    IField[] georgeFields = {
            new StringField("george lin lin"),
            new StringField("lin clooney"),
            new IntegerField(43),
            new DoubleField(6.06),
            new DateField(dateFormat.parse("01-13-1973")),
            new TextField("Lin Clooney is Short and lin clooney is Angry"),
            new ListField<Span>(georgeSpans) };

    // Expected tuple 4: "Short angry" (lower-case occurrence)
    List<Span> marySpans = Arrays.asList(new Span(TestConstants.DESCRIPTION, 6, 11, "angry", "angry", 1));
    IField[] maryFields = {
            new StringField("Mary brown"),
            new StringField("Lake Forest"),
            new IntegerField(42),
            new DoubleField(5.99),
            new DateField(dateFormat.parse("01-13-1974")),
            new TextField("Short angry"),
            new ListField<Span>(marySpans) };

    List<Tuple> expectedResultList = new ArrayList<>(Arrays.asList(
            new Tuple(schema, bruceFields),
            new Tuple(schema, bradFields),
            new Tuple(schema, georgeFields),
            new Tuple(schema, maryFields)));

    List<Tuple> results = FuzzyTokenMatcherTestHelper.getQueryResults(PEOPLE_TABLE, query, threshold, attributeNames);
    Assert.assertTrue(TestUtils.equals(expectedResultList, results));
}
Also used : Schema(edu.uci.ics.texera.api.schema.Schema) ArrayList(java.util.ArrayList) IntegerField(edu.uci.ics.texera.api.field.IntegerField) IField(edu.uci.ics.texera.api.field.IField) Span(edu.uci.ics.texera.api.span.Span) StringField(edu.uci.ics.texera.api.field.StringField) TextField(edu.uci.ics.texera.api.field.TextField) DateField(edu.uci.ics.texera.api.field.DateField) SimpleDateFormat(java.text.SimpleDateFormat) DoubleField(edu.uci.ics.texera.api.field.DoubleField) Tuple(edu.uci.ics.texera.api.tuple.Tuple) Test(org.junit.Test)

Example 38 with StringField

use of edu.uci.ics.texera.api.field.StringField in project textdb by TextDB.

the class FuzzyTokenMatcherTest method TestFuzzyTokenMatcherWithLimitOffset.

/**
 * Runs the fuzzy token matcher for "Twelve Angry Men" over the description
 * attribute of the people table, passing the extra arguments 2 and 1 to the
 * test helper (presumably limit = 2 and offset = 1 -- confirm against
 * FuzzyTokenMatcherTestHelper.getQueryResults).
 *
 * The test expects exactly two results, and TestUtils.containsAll must
 * accept them against the four candidate tuples built here.
 */
@Test
public void TestFuzzyTokenMatcherWithLimitOffset() throws Exception {
    String query = "Twelve Angry Men";
    // The ratio of tokens that need to be matched
    double threshold = 0.5;
    ArrayList<String> attributeNames = new ArrayList<>();
    attributeNames.add(TestConstants.DESCRIPTION);
    Schema schema = new Schema.Builder().add(TestConstants.SCHEMA_PEOPLE).add(RESULTS_ATTR).build();
    SimpleDateFormat dateFormat = new SimpleDateFormat("MM-dd-yyyy");

    // Each candidate tuple gets its own span list instead of re-assigning one
    // shared local, so the data for one tuple cannot leak into another.
    List<Span> spanList1 = new ArrayList<>();
    spanList1.add(new Span(TestConstants.DESCRIPTION, 5, 10, "angry", "Angry", 1));
    IField[] fields1 = { new StringField("bruce"), new StringField("john Lee"), new IntegerField(46), new DoubleField(5.50), new DateField(dateFormat.parse("01-14-1970")), new TextField("Tall Angry"), new ListField<>(spanList1) };

    List<Span> spanList2 = new ArrayList<>();
    spanList2.add(new Span(TestConstants.DESCRIPTION, 6, 11, "angry", "Angry", 1));
    IField[] fields2 = { new StringField("brad lie angelina"), new StringField("pitt"), new IntegerField(44), new DoubleField(6.10), new DateField(dateFormat.parse("01-12-1972")), new TextField("White Angry"), new ListField<>(spanList2) };

    List<Span> spanList3 = new ArrayList<>();
    spanList3.add(new Span(TestConstants.DESCRIPTION, 40, 45, "angry", "Angry", 8));
    IField[] fields3 = { new StringField("george lin lin"), new StringField("lin clooney"), new IntegerField(43), new DoubleField(6.06), new DateField(dateFormat.parse("01-13-1973")), new TextField("Lin Clooney is Short and lin clooney is Angry"), new ListField<>(spanList3) };

    List<Span> spanList4 = new ArrayList<>();
    spanList4.add(new Span(TestConstants.DESCRIPTION, 6, 11, "angry", "angry", 1));
    IField[] fields4 = { new StringField("Mary brown"), new StringField("Lake Forest"), new IntegerField(42), new DoubleField(5.99), new DateField(dateFormat.parse("01-13-1974")), new TextField("Short angry"), new ListField<>(spanList4) };

    List<Tuple> expectedResultList = new ArrayList<>();
    expectedResultList.add(new Tuple(schema, fields1));
    expectedResultList.add(new Tuple(schema, fields2));
    expectedResultList.add(new Tuple(schema, fields3));
    expectedResultList.add(new Tuple(schema, fields4));

    List<Tuple> results = FuzzyTokenMatcherTestHelper.getQueryResults(PEOPLE_TABLE, query, threshold, attributeNames, 2, 1);

    // JUnit's assertEquals takes (expected, actual); keeping that order makes
    // a failure message report the right value as "expected".
    Assert.assertEquals(4, expectedResultList.size());
    Assert.assertEquals(2, results.size());
    Assert.assertTrue(TestUtils.containsAll(expectedResultList, results));
}
Also used : Schema(edu.uci.ics.texera.api.schema.Schema) ArrayList(java.util.ArrayList) IntegerField(edu.uci.ics.texera.api.field.IntegerField) IField(edu.uci.ics.texera.api.field.IField) Span(edu.uci.ics.texera.api.span.Span) StringField(edu.uci.ics.texera.api.field.StringField) TextField(edu.uci.ics.texera.api.field.TextField) DateField(edu.uci.ics.texera.api.field.DateField) SimpleDateFormat(java.text.SimpleDateFormat) DoubleField(edu.uci.ics.texera.api.field.DoubleField) Tuple(edu.uci.ics.texera.api.tuple.Tuple) Test(org.junit.Test)

Example 39 with StringField

use of edu.uci.ics.texera.api.field.StringField in project textdb by TextDB.

the class FuzzyTokenMatcherTest method TestFuzzyTokenMatcherWithLimit.

/**
 * Runs the fuzzy token matcher for "Twelve Angry Men" over the description
 * attribute of the people table, passing the extra arguments 2 and 0 to the
 * test helper (presumably limit = 2 and offset = 0 -- confirm against
 * FuzzyTokenMatcherTestHelper.getQueryResults).
 *
 * The test expects exactly two results, and TestUtils.containsAll must
 * accept them against the four candidate tuples built here.
 */
@Test
public void TestFuzzyTokenMatcherWithLimit() throws Exception {
    String query = "Twelve Angry Men";
    // The ratio of tokens that need to be matched
    double threshold = 0.5;
    ArrayList<String> attributeNames = new ArrayList<>();
    attributeNames.add(TestConstants.DESCRIPTION);
    Schema schema = new Schema.Builder().add(TestConstants.SCHEMA_PEOPLE).add(RESULTS_ATTR).build();
    SimpleDateFormat dateFormat = new SimpleDateFormat("MM-dd-yyyy");

    // Each candidate tuple gets its own span list instead of re-assigning one
    // shared local, so the data for one tuple cannot leak into another.
    List<Span> spanList1 = new ArrayList<>();
    spanList1.add(new Span(TestConstants.DESCRIPTION, 5, 10, "angry", "Angry", 1));
    IField[] fields1 = { new StringField("bruce"), new StringField("john Lee"), new IntegerField(46), new DoubleField(5.50), new DateField(dateFormat.parse("01-14-1970")), new TextField("Tall Angry"), new ListField<>(spanList1) };

    List<Span> spanList2 = new ArrayList<>();
    spanList2.add(new Span(TestConstants.DESCRIPTION, 6, 11, "angry", "Angry", 1));
    IField[] fields2 = { new StringField("brad lie angelina"), new StringField("pitt"), new IntegerField(44), new DoubleField(6.10), new DateField(dateFormat.parse("01-12-1972")), new TextField("White Angry"), new ListField<>(spanList2) };

    List<Span> spanList3 = new ArrayList<>();
    spanList3.add(new Span(TestConstants.DESCRIPTION, 40, 45, "angry", "Angry", 8));
    IField[] fields3 = { new StringField("george lin lin"), new StringField("lin clooney"), new IntegerField(43), new DoubleField(6.06), new DateField(dateFormat.parse("01-13-1973")), new TextField("Lin Clooney is Short and lin clooney is Angry"), new ListField<>(spanList3) };

    List<Span> spanList4 = new ArrayList<>();
    spanList4.add(new Span(TestConstants.DESCRIPTION, 6, 11, "angry", "angry", 1));
    IField[] fields4 = { new StringField("Mary brown"), new StringField("Lake Forest"), new IntegerField(42), new DoubleField(5.99), new DateField(dateFormat.parse("01-13-1974")), new TextField("Short angry"), new ListField<>(spanList4) };

    List<Tuple> expectedResultList = new ArrayList<>();
    expectedResultList.add(new Tuple(schema, fields1));
    expectedResultList.add(new Tuple(schema, fields2));
    expectedResultList.add(new Tuple(schema, fields3));
    expectedResultList.add(new Tuple(schema, fields4));

    List<Tuple> results = FuzzyTokenMatcherTestHelper.getQueryResults(PEOPLE_TABLE, query, threshold, attributeNames, 2, 0);

    // JUnit's assertEquals takes (expected, actual); keeping that order makes
    // a failure message report the right value as "expected".
    Assert.assertEquals(4, expectedResultList.size());
    Assert.assertEquals(2, results.size());
    Assert.assertTrue(TestUtils.containsAll(expectedResultList, results));
}
Also used : Schema(edu.uci.ics.texera.api.schema.Schema) ArrayList(java.util.ArrayList) IntegerField(edu.uci.ics.texera.api.field.IntegerField) IField(edu.uci.ics.texera.api.field.IField) Span(edu.uci.ics.texera.api.span.Span) StringField(edu.uci.ics.texera.api.field.StringField) TextField(edu.uci.ics.texera.api.field.TextField) DateField(edu.uci.ics.texera.api.field.DateField) SimpleDateFormat(java.text.SimpleDateFormat) DoubleField(edu.uci.ics.texera.api.field.DoubleField) Tuple(edu.uci.ics.texera.api.tuple.Tuple) Test(org.junit.Test)

Example 40 with StringField

use of edu.uci.ics.texera.api.field.StringField in project textdb by TextDB.

the class FuzzyTokenMatcherTest method TestFuzzyTokenMatcher1.

/**
 * Runs the fuzzy token matcher for "Twelve Angry Men" over the description
 * attribute of the people table with a threshold of 0.5. The test passes
 * when TestUtils.equals reports that the results match the four expected
 * tuples, each carrying a single span for the "angry" token.
 */
@Test
public void TestFuzzyTokenMatcher1() throws Exception {
    String query = "Twelve Angry Men";
    // The ratio of tokens that need to be matched
    double threshold = 0.5;

    ArrayList<String> attributeNames = new ArrayList<>();
    attributeNames.add(TestConstants.DESCRIPTION);

    Schema schema = new Schema.Builder().add(TestConstants.SCHEMA_PEOPLE).add(RESULTS_ATTR).build();
    SimpleDateFormat dateFormat = new SimpleDateFormat("MM-dd-yyyy");

    // Expected tuple 1: "Tall Angry"
    List<Span> bruceSpans = Arrays.asList(new Span(TestConstants.DESCRIPTION, 5, 10, "angry", "Angry", 1));
    IField[] bruceFields = {
            new StringField("bruce"),
            new StringField("john Lee"),
            new IntegerField(46),
            new DoubleField(5.50),
            new DateField(dateFormat.parse("01-14-1970")),
            new TextField("Tall Angry"),
            new ListField<Span>(bruceSpans) };

    // Expected tuple 2: "White Angry"
    List<Span> bradSpans = Arrays.asList(new Span(TestConstants.DESCRIPTION, 6, 11, "angry", "Angry", 1));
    IField[] bradFields = {
            new StringField("brad lie angelina"),
            new StringField("pitt"),
            new IntegerField(44),
            new DoubleField(6.10),
            new DateField(dateFormat.parse("01-12-1972")),
            new TextField("White Angry"),
            new ListField<Span>(bradSpans) };

    // Expected tuple 3: the match is the last token of a longer description
    List<Span> georgeSpans = Arrays.asList(new Span(TestConstants.DESCRIPTION, 40, 45, "angry", "Angry", 8));
    IField[] georgeFields = {
            new StringField("george lin lin"),
            new StringField("lin clooney"),
            new IntegerField(43),
            new DoubleField(6.06),
            new DateField(dateFormat.parse("01-13-1973")),
            new TextField("Lin Clooney is Short and lin clooney is Angry"),
            new ListField<Span>(georgeSpans) };

    // Expected tuple 4: "Short angry" (lower-case occurrence)
    List<Span> marySpans = Arrays.asList(new Span(TestConstants.DESCRIPTION, 6, 11, "angry", "angry", 1));
    IField[] maryFields = {
            new StringField("Mary brown"),
            new StringField("Lake Forest"),
            new IntegerField(42),
            new DoubleField(5.99),
            new DateField(dateFormat.parse("01-13-1974")),
            new TextField("Short angry"),
            new ListField<Span>(marySpans) };

    List<Tuple> expectedResultList = new ArrayList<>(Arrays.asList(
            new Tuple(schema, bruceFields),
            new Tuple(schema, bradFields),
            new Tuple(schema, georgeFields),
            new Tuple(schema, maryFields)));

    List<Tuple> results = FuzzyTokenMatcherTestHelper.getQueryResults(PEOPLE_TABLE, query, threshold, attributeNames);
    Assert.assertTrue(TestUtils.equals(expectedResultList, results));
}
Also used : Schema(edu.uci.ics.texera.api.schema.Schema) ArrayList(java.util.ArrayList) IntegerField(edu.uci.ics.texera.api.field.IntegerField) IField(edu.uci.ics.texera.api.field.IField) Span(edu.uci.ics.texera.api.span.Span) StringField(edu.uci.ics.texera.api.field.StringField) TextField(edu.uci.ics.texera.api.field.TextField) DateField(edu.uci.ics.texera.api.field.DateField) SimpleDateFormat(java.text.SimpleDateFormat) DoubleField(edu.uci.ics.texera.api.field.DoubleField) Tuple(edu.uci.ics.texera.api.tuple.Tuple) Test(org.junit.Test)

Aggregations

StringField (edu.uci.ics.texera.api.field.StringField)103 Tuple (edu.uci.ics.texera.api.tuple.Tuple)94 IField (edu.uci.ics.texera.api.field.IField)87 IntegerField (edu.uci.ics.texera.api.field.IntegerField)87 TextField (edu.uci.ics.texera.api.field.TextField)79 Schema (edu.uci.ics.texera.api.schema.Schema)75 ArrayList (java.util.ArrayList)74 Test (org.junit.Test)70 Span (edu.uci.ics.texera.api.span.Span)64 DoubleField (edu.uci.ics.texera.api.field.DoubleField)63 DateField (edu.uci.ics.texera.api.field.DateField)60 Attribute (edu.uci.ics.texera.api.schema.Attribute)60 SimpleDateFormat (java.text.SimpleDateFormat)58 Dictionary (edu.uci.ics.texera.dataflow.dictionarymatcher.Dictionary)29 JoinDistancePredicate (edu.uci.ics.texera.dataflow.join.JoinDistancePredicate)9 KeywordMatcherSourceOperator (edu.uci.ics.texera.dataflow.keywordmatcher.KeywordMatcherSourceOperator)9 JsonNode (com.fasterxml.jackson.databind.JsonNode)8 ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper)8 IDField (edu.uci.ics.texera.api.field.IDField)5 IOperator (edu.uci.ics.texera.api.dataflow.IOperator)4