Search in sources :

Example 1 with Tuple

use of edu.uci.ics.textdb.api.tuple.Tuple in project textdb by TextDB.

the class Utils method removeFields.

/**
    * Remove one or more fields from a tuple.
    * 
    * @param tuple
    * @param removeFields
    * @return
    */
public static Tuple removeFields(Tuple tuple, String... removeFields) {
    List<String> removeFieldList = Arrays.asList(removeFields);
    List<Integer> removedFeidsIndex = removeFieldList.stream().map(attributeName -> tuple.getSchema().getIndex(attributeName)).collect(Collectors.toList());
    Attribute[] newAttrs = tuple.getSchema().getAttributes().stream().filter(attr -> (!removeFieldList.contains(attr.getAttributeName()))).toArray(Attribute[]::new);
    Schema newSchema = new Schema(newAttrs);
    IField[] newFields = IntStream.range(0, tuple.getSchema().getAttributes().size()).filter(index -> (!removedFeidsIndex.contains(index))).mapToObj(index -> tuple.getField(index)).toArray(IField[]::new);
    return new Tuple(newSchema, newFields);
}
Also used : IntStream(java.util.stream.IntStream) SchemaConstants(edu.uci.ics.textdb.api.constants.SchemaConstants) DataConstants(edu.uci.ics.textdb.api.constants.DataConstants) Arrays(java.util.Arrays) Attribute(edu.uci.ics.textdb.api.schema.Attribute) TextdbProject(edu.uci.ics.textdb.api.constants.DataConstants.TextdbProject) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) ArrayList(java.util.ArrayList) Schema(edu.uci.ics.textdb.api.schema.Schema) List(java.util.List) Paths(java.nio.file.Paths) IField(edu.uci.ics.textdb.api.field.IField) StorageException(edu.uci.ics.textdb.api.exception.StorageException) Tuple(edu.uci.ics.textdb.api.tuple.Tuple) Attribute(edu.uci.ics.textdb.api.schema.Attribute) Schema(edu.uci.ics.textdb.api.schema.Schema) IField(edu.uci.ics.textdb.api.field.IField) Tuple(edu.uci.ics.textdb.api.tuple.Tuple)

Example 2 with Tuple

use of edu.uci.ics.textdb.api.tuple.Tuple in project textdb by TextDB.

the class SimilarityJoinTest method test1.

/*
     * Tests the Similarity Join Predicate on two similar words:
     *   Donald J. Trump
     *   Donald Trump
     * Under the condition of similarity (NormalizedLevenshtein) > 0.8, these two words should match.
     *
     */
@Test
public void test1() throws TextDBException {
    JoinTestHelper.insertToTable(NEWS_TABLE_OUTER, JoinTestConstants.getNewsTuples().get(0));
    JoinTestHelper.insertToTable(NEWS_TABLE_INNER, JoinTestConstants.getNewsTuples().get(1));
    String trumpRegex = "[Dd]onald.{1,5}[Tt]rump";
    RegexMatcher regexMatcherInner = JoinTestHelper.getRegexMatcher(JoinTestHelper.NEWS_TABLE_INNER, trumpRegex, JoinTestConstants.NEWS_BODY);
    RegexMatcher regexMatcherOuter = JoinTestHelper.getRegexMatcher(JoinTestHelper.NEWS_TABLE_OUTER, trumpRegex, JoinTestConstants.NEWS_BODY);
    SimilarityJoinPredicate similarityJoinPredicate = new SimilarityJoinPredicate(JoinTestConstants.NEWS_BODY, 0.8);
    List<Tuple> results = JoinTestHelper.getJoinDistanceResults(regexMatcherInner, regexMatcherOuter, similarityJoinPredicate, Integer.MAX_VALUE, 0);
    Schema joinInputSchema = Utils.addAttributeToSchema(JoinTestConstants.NEWS_SCHEMA, SchemaConstants.SPAN_LIST_ATTRIBUTE);
    Schema resultSchema = similarityJoinPredicate.generateOutputSchema(joinInputSchema, joinInputSchema);
    List<Span> resultSpanList = Arrays.asList(new Span("inner_" + JoinTestConstants.NEWS_BODY, 5, 20, trumpRegex, "Donald J. Trump", -1), new Span("outer_" + JoinTestConstants.NEWS_BODY, 18, 30, trumpRegex, "Donald Trump", -1));
    Tuple resultTuple = new Tuple(resultSchema, new IDField(UUID.randomUUID().toString()), new IntegerField(2), new TextField("Alternative Facts and the Costs of Trump-Branded Reality"), new TextField("When Donald J. Trump swore the presidential oath on Friday, he assumed " + "responsibility not only for the levers of government but also for one of " + "the United States’ most valuable assets, battered though it may be: its credibility. " + "The country’s sentimental reverence for truth and its jealously guarded press freedoms, " + "while never perfect, have been as important to its global standing as the strength of " + "its military and the reliability of its currency. It’s the bedrock of that " + "American exceptionalism we’ve heard so much about for so long."), new IntegerField(1), new TextField("UCI marchers protest as Trump begins his presidency"), new TextField("a few hours after Donald Trump was sworn in Friday as the nation’s 45th president, " + "a line of more than 100 UC Irvine faculty members and students took to the campus " + "in pouring rain to demonstrate their opposition to his policies on immigration and " + "other issues and urge other opponents to keep organizing during Trump’s presidency."), new ListField<>(resultSpanList));
    Assert.assertTrue(TestUtils.equals(Arrays.asList(resultTuple), results));
}
Also used : IDField(edu.uci.ics.textdb.api.field.IDField) SimilarityJoinPredicate(edu.uci.ics.textdb.exp.join.SimilarityJoinPredicate) Schema(edu.uci.ics.textdb.api.schema.Schema) TextField(edu.uci.ics.textdb.api.field.TextField) RegexMatcher(edu.uci.ics.textdb.exp.regexmatcher.RegexMatcher) IntegerField(edu.uci.ics.textdb.api.field.IntegerField) Span(edu.uci.ics.textdb.api.span.Span) Tuple(edu.uci.ics.textdb.api.tuple.Tuple) Test(org.junit.Test)

Example 3 with Tuple

use of edu.uci.ics.textdb.api.tuple.Tuple in project textdb by TextDB.

the class KeywordConjunctionTest method testSingleWordQueryInTextFieldChinese.

/**
     * Verifies GetNextTuple of Keyword Matcher and single word queries in Text
     * Field using Chinese.
     * 
     * @throws Exception
     */
@Test
public void testSingleWordQueryInTextFieldChinese() throws Exception {
    // Prepare the query
    String query = "北京大学";
    ArrayList<String> attributeNames = new ArrayList<>();
    attributeNames.add(TestConstantsChinese.FIRST_NAME);
    attributeNames.add(TestConstantsChinese.LAST_NAME);
    attributeNames.add(TestConstantsChinese.DESCRIPTION);
    // Prepare the expected result list
    List<Span> list = new ArrayList<>();
    Span span = new Span("description", 0, 4, "北京大学", "北京大学", 0);
    list.add(span);
    Attribute[] schemaAttributes = new Attribute[TestConstantsChinese.ATTRIBUTES_PEOPLE.length + 1];
    for (int count = 0; count < schemaAttributes.length - 1; count++) {
        schemaAttributes[count] = TestConstantsChinese.ATTRIBUTES_PEOPLE[count];
    }
    schemaAttributes[schemaAttributes.length - 1] = new Attribute(RESULTS, AttributeType.LIST);
    IField[] fields1 = { new StringField("无忌"), new StringField("长孙"), new IntegerField(46), new DoubleField(5.50), new DateField(new SimpleDateFormat("MM-dd-yyyy").parse("01-14-1970")), new TextField("北京大学电气工程学院"), new ListField<>(list) };
    IField[] fields2 = { new StringField("孔明"), new StringField("洛克贝尔"), new IntegerField(42), new DoubleField(5.99), new DateField(new SimpleDateFormat("MM-dd-yyyy").parse("01-13-1974")), new TextField("北京大学计算机学院"), new ListField<>(list) };
    Tuple tuple1 = new Tuple(new Schema(schemaAttributes), fields1);
    Tuple tuple2 = new Tuple(new Schema(schemaAttributes), fields2);
    List<Tuple> expectedResultList = new ArrayList<>();
    expectedResultList.add(tuple1);
    expectedResultList.add(tuple2);
    // Perform the query
    List<Tuple> resultList = KeywordTestHelper.getQueryResults(CHINESE_TABLE, query, attributeNames, conjunction, Integer.MAX_VALUE, 0);
    // check the results
    boolean contains = TestUtils.equals(expectedResultList, resultList);
    Assert.assertTrue(contains);
}
Also used : Attribute(edu.uci.ics.textdb.api.schema.Attribute) Schema(edu.uci.ics.textdb.api.schema.Schema) ArrayList(java.util.ArrayList) IntegerField(edu.uci.ics.textdb.api.field.IntegerField) IField(edu.uci.ics.textdb.api.field.IField) Span(edu.uci.ics.textdb.api.span.Span) StringField(edu.uci.ics.textdb.api.field.StringField) TextField(edu.uci.ics.textdb.api.field.TextField) DateField(edu.uci.ics.textdb.api.field.DateField) SimpleDateFormat(java.text.SimpleDateFormat) DoubleField(edu.uci.ics.textdb.api.field.DoubleField) Tuple(edu.uci.ics.textdb.api.tuple.Tuple) Test(org.junit.Test)

Example 4 with Tuple

use of edu.uci.ics.textdb.api.tuple.Tuple in project textdb by TextDB.

the class KeywordConjunctionTest method testMultipleWordsQuery.

/**
     * Verifies the List<ITuple> returned by Keyword Matcher on multiple-word
     * queries
     * 
     * @throws Exception
     */
@Test
public void testMultipleWordsQuery() throws Exception {
    // Prepare the query
    String query = "george lin lin";
    ArrayList<String> attributeNames = new ArrayList<>();
    attributeNames.add(TestConstants.FIRST_NAME);
    attributeNames.add(TestConstants.LAST_NAME);
    attributeNames.add(TestConstants.DESCRIPTION);
    // Prepare the expected result list
    List<Span> list = new ArrayList<Span>();
    Span span1 = new Span("firstName", 0, 14, "george lin lin", "george lin lin");
    list.add(span1);
    Attribute[] schemaAttributes = new Attribute[TestConstants.ATTRIBUTES_PEOPLE.length + 1];
    for (int count = 0; count < schemaAttributes.length - 1; count++) {
        schemaAttributes[count] = TestConstants.ATTRIBUTES_PEOPLE[count];
    }
    schemaAttributes[schemaAttributes.length - 1] = new Attribute(RESULTS, AttributeType.LIST);
    IField[] fields1 = { new StringField("george lin lin"), new StringField("lin clooney"), new IntegerField(43), new DoubleField(6.06), new DateField(new SimpleDateFormat("MM-dd-yyyy").parse("01-13-1973")), new TextField("Lin Clooney is Short and lin clooney is Angry"), new ListField<>(list) };
    Tuple tuple1 = new Tuple(new Schema(schemaAttributes), fields1);
    List<Tuple> expectedResultList = new ArrayList<>();
    expectedResultList.add(tuple1);
    // Perform the query
    List<Tuple> resultList = KeywordTestHelper.getQueryResults(PEOPLE_TABLE, query, attributeNames, conjunction);
    // check the results
    boolean contains = TestUtils.equals(expectedResultList, resultList);
    Assert.assertTrue(contains);
}
Also used : Attribute(edu.uci.ics.textdb.api.schema.Attribute) Schema(edu.uci.ics.textdb.api.schema.Schema) ArrayList(java.util.ArrayList) IntegerField(edu.uci.ics.textdb.api.field.IntegerField) IField(edu.uci.ics.textdb.api.field.IField) Span(edu.uci.ics.textdb.api.span.Span) StringField(edu.uci.ics.textdb.api.field.StringField) TextField(edu.uci.ics.textdb.api.field.TextField) DateField(edu.uci.ics.textdb.api.field.DateField) SimpleDateFormat(java.text.SimpleDateFormat) DoubleField(edu.uci.ics.textdb.api.field.DoubleField) Tuple(edu.uci.ics.textdb.api.tuple.Tuple) Test(org.junit.Test)

Example 5 with Tuple

use of edu.uci.ics.textdb.api.tuple.Tuple in project textdb by TextDB.

the class KeywordConjunctionTest method testWordInMultipleFieldsQuery.

/**
     * Verifies: data source has multiple attributes, and an entity can appear
     * in all the fields and multiple times.
     * 
     * @throws Exception
     */
@Test
public void testWordInMultipleFieldsQuery() throws Exception {
    // Prepare the query
    String query = "lin clooney";
    ArrayList<String> attributeNames = new ArrayList<>();
    attributeNames.add(TestConstants.FIRST_NAME);
    attributeNames.add(TestConstants.LAST_NAME);
    attributeNames.add(TestConstants.DESCRIPTION);
    // Prepare the expected result list
    List<Span> list = new ArrayList<>();
    Span span1 = new Span("lastName", 0, 11, "lin clooney", "lin clooney");
    Span span2 = new Span("description", 0, 3, "lin", "Lin", 0);
    Span span3 = new Span("description", 25, 28, "lin", "lin", 5);
    Span span4 = new Span("description", 4, 11, "clooney", "Clooney", 1);
    Span span5 = new Span("description", 29, 36, "clooney", "clooney", 6);
    list.add(span1);
    list.add(span2);
    list.add(span3);
    list.add(span4);
    list.add(span5);
    Attribute[] schemaAttributes = new Attribute[TestConstants.ATTRIBUTES_PEOPLE.length + 1];
    for (int count = 0; count < schemaAttributes.length - 1; count++) {
        schemaAttributes[count] = TestConstants.ATTRIBUTES_PEOPLE[count];
    }
    schemaAttributes[schemaAttributes.length - 1] = new Attribute(RESULTS, AttributeType.LIST);
    IField[] fields1 = { new StringField("george lin lin"), new StringField("lin clooney"), new IntegerField(43), new DoubleField(6.06), new DateField(new SimpleDateFormat("MM-dd-yyyy").parse("01-13-1973")), new TextField("Lin Clooney is Short and lin clooney is Angry"), new ListField<>(list) };
    Tuple tuple1 = new Tuple(new Schema(schemaAttributes), fields1);
    List<Tuple> expectedResultList = new ArrayList<>();
    expectedResultList.add(tuple1);
    // Perform the query
    List<Tuple> resultList = KeywordTestHelper.getQueryResults(PEOPLE_TABLE, query, attributeNames, conjunction);
    // check the results
    boolean contains = TestUtils.equals(expectedResultList, resultList);
    Assert.assertTrue(contains);
}
Also used : Attribute(edu.uci.ics.textdb.api.schema.Attribute) Schema(edu.uci.ics.textdb.api.schema.Schema) ArrayList(java.util.ArrayList) IntegerField(edu.uci.ics.textdb.api.field.IntegerField) IField(edu.uci.ics.textdb.api.field.IField) Span(edu.uci.ics.textdb.api.span.Span) StringField(edu.uci.ics.textdb.api.field.StringField) TextField(edu.uci.ics.textdb.api.field.TextField) DateField(edu.uci.ics.textdb.api.field.DateField) SimpleDateFormat(java.text.SimpleDateFormat) DoubleField(edu.uci.ics.textdb.api.field.DoubleField) Tuple(edu.uci.ics.textdb.api.tuple.Tuple) Test(org.junit.Test)

Aggregations

Tuple (edu.uci.ics.textdb.api.tuple.Tuple)234 ArrayList (java.util.ArrayList)144 Test (org.junit.Test)135 IField (edu.uci.ics.textdb.api.field.IField)102 Schema (edu.uci.ics.textdb.api.schema.Schema)95 TextField (edu.uci.ics.textdb.api.field.TextField)92 StringField (edu.uci.ics.textdb.api.field.StringField)85 Attribute (edu.uci.ics.textdb.api.schema.Attribute)84 IntegerField (edu.uci.ics.textdb.api.field.IntegerField)80 Span (edu.uci.ics.textdb.api.span.Span)80 DoubleField (edu.uci.ics.textdb.api.field.DoubleField)64 DateField (edu.uci.ics.textdb.api.field.DateField)61 SimpleDateFormat (java.text.SimpleDateFormat)60 DataWriter (edu.uci.ics.textdb.storage.DataWriter)31 Dictionary (edu.uci.ics.textdb.exp.dictionarymatcher.Dictionary)25 KeywordMatcherSourceOperator (edu.uci.ics.textdb.exp.keywordmatcher.KeywordMatcherSourceOperator)20 RelationManager (edu.uci.ics.textdb.storage.RelationManager)19 JoinDistancePredicate (edu.uci.ics.textdb.exp.join.JoinDistancePredicate)18 ScanBasedSourceOperator (edu.uci.ics.textdb.exp.source.scan.ScanBasedSourceOperator)17 ScanSourcePredicate (edu.uci.ics.textdb.exp.source.scan.ScanSourcePredicate)17