Search in sources :

Example 16 with KeywordMatcherSourceOperator

use of edu.uci.ics.textdb.exp.keywordmatcher.KeywordMatcherSourceOperator in project textdb by TextDB.

the class JoinTestHelper method getKeywordSource.

/**
 * Builds a {@link KeywordMatcherSourceOperator} that searches a test table
 * for the given keyword query. Most Join test cases use it as their input
 * operator.
 *
 * The predicate searches the AUTHOR, TITLE and REVIEW attributes from
 * {@code JoinTestConstants}, uses the analyzer string registered for the
 * table in the {@code RelationManager}, and names the result span list
 * {@code SchemaConstants.SPAN_LIST}.
 *
 * @param tableName    name of the table to read from
 * @param query        keyword query to match
 * @param matchingType keyword matching strategy passed to the predicate
 * @return a new operator built from the predicate; this method does not
 *         open it
 * @throws TextDBException propagated from the relation manager lookup or
 *         from constructing the predicate/operator
 */
public static KeywordMatcherSourceOperator getKeywordSource(String tableName, String query, KeywordMatchingType matchingType) throws TextDBException {
    KeywordSourcePredicate sourcePredicate = new KeywordSourcePredicate(
            query,
            Arrays.asList(JoinTestConstants.AUTHOR, JoinTestConstants.TITLE, JoinTestConstants.REVIEW),
            RelationManager.getRelationManager().getTableAnalyzerString(tableName),
            matchingType,
            tableName,
            SchemaConstants.SPAN_LIST);
    return new KeywordMatcherSourceOperator(sourcePredicate);
}
Also used : KeywordSourcePredicate(edu.uci.ics.textdb.exp.keywordmatcher.KeywordSourcePredicate) KeywordMatcherSourceOperator(edu.uci.ics.textdb.exp.keywordmatcher.KeywordMatcherSourceOperator)

Example 17 with KeywordMatcherSourceOperator

use of edu.uci.ics.textdb.exp.keywordmatcher.KeywordMatcherSourceOperator in project textdb by TextDB.

the class DictionaryMatcherSourceOperator method getNextTuple.

/**
 * Returns the next tuple that matches the dictionary, or {@code null} when
 * there are no more results or the limit has been reached.
 *
 * <p>The strategy depends on the predicate's keyword matching type:
 *
 * <ul>
 * <li>{@code PHRASE_INDEXBASED} / {@code CONJUNCTION_INDEXBASED}: each
 *     dictionary entry is run as a keyword query through its own
 *     {@link KeywordMatcherSourceOperator}. When the current operator is
 *     exhausted it is closed and a new one is opened for the next
 *     dictionary entry, until the dictionary runs out.</li>
 * <li>Any other matching type (scan-based substring matching): tuples are
 *     pulled from {@code indexSource}, given an empty span list, and
 *     passed to {@code computeMatchingResult}. The data source is the
 *     outer loop because the dictionary is assumed to be smaller than the
 *     data, which reduces disk IO.</li>
 * </ul>
 *
 * <p>{@code resultCursor} is incremented once per match. Matches whose
 * cursor value is below {@code offset} are skipped, and nothing is
 * returned once the cursor reaches {@code limit + offset - 1}.
 * NOTE(review): this arithmetic presumably relies on {@code resultCursor}
 * starting at -1 — confirm against the initialization in {@code open()}.
 *
 * @return the next matching tuple, or {@code null} if none remains
 * @throws TextDBException propagated from the underlying operators
 */
@Override
public Tuple getNextTuple() throws TextDBException {
    // Compute the bound in long arithmetic. limit is often
    // Integer.MAX_VALUE, and an int sum with offset >= 2 wraps negative,
    // which would make this check true on every call.
    if (resultCursor >= (long) limit + offset - 1) {
        return null;
    }
    // Read once; assumed to be a plain getter that returns the same value
    // on every call.
    KeywordMatchingType matchingType = predicate.getKeywordMatchingType();
    if (matchingType == KeywordMatchingType.PHRASE_INDEXBASED || matchingType == KeywordMatchingType.CONJUNCTION_INDEXBASED) {
        while (true) {
            // If the current keyword operator still has a result, use it.
            if ((sourceTuple = keywordSource.getNextTuple()) != null) {
                resultCursor++;
                if (resultCursor >= offset) {
                    return sourceTuple;
                }
                // Still inside the offset window: skip this match.
                continue;
            }
            // Current operator is exhausted; stop at the end of the dictionary.
            if ((currentDictionaryEntry = predicate.getDictionary().getNextEntry()) == null) {
                return null;
            }
            // Replace the exhausted operator with one for the new entry.
            keywordSource.close();
            KeywordSourcePredicate keywordSourcePredicate = new KeywordSourcePredicate(currentDictionaryEntry, predicate.getAttributeNames(), predicate.getAnalyzerString(), matchingType, predicate.getTableName(), predicate.getSpanListName());
            keywordSource = new KeywordMatcherSourceOperator(keywordSourcePredicate);
            keywordSource.open();
        }
    } else {
        // Substring matching (based on scan).
        Tuple scannedTuple;
        while ((scannedTuple = indexSource.getNextTuple()) != null) {
            Tuple spanTuple = DataflowUtils.getSpanTuple(scannedTuple.getFields(), new ArrayList<Span>(), outputSchema);
            Tuple resultTuple = computeMatchingResult(currentDictionaryEntry, spanTuple);
            if (resultTuple == null) {
                continue;
            }
            resultCursor++;
            if (resultCursor >= offset) {
                return resultTuple;
            }
        }
        // Source exhausted. Return null rather than the last match, which
        // may have been one that was skipped for being below the offset.
        return null;
    }
}
Also used : KeywordSourcePredicate(edu.uci.ics.textdb.exp.keywordmatcher.KeywordSourcePredicate) ArrayList(java.util.ArrayList) KeywordMatchingType(edu.uci.ics.textdb.exp.keywordmatcher.KeywordMatchingType) Tuple(edu.uci.ics.textdb.api.tuple.Tuple) KeywordMatcherSourceOperator(edu.uci.ics.textdb.exp.keywordmatcher.KeywordMatcherSourceOperator)

Example 18 with KeywordMatcherSourceOperator

use of edu.uci.ics.textdb.exp.keywordmatcher.KeywordMatcherSourceOperator in project textdb by TextDB.

the class JoinDistanceTest method testSpansOverlapAndWithinThreshold.

/*
 * Joins two spans that partially overlap, where both
 * |span1.start - span2.start| and |span1.end - span2.end| are within the
 * threshold.
 *
 * Example:
 *   [<75, 97>]   "gastrointestinal tract"
 *   [<92, 109>]  "tract interesting"
 *   threshold = 20 (within threshold)
 *   result: [<75, 109>]
 *
 * Expected: exactly one tuple containing all the fields plus a span list
 * with the joined span. The joined span has the field name, the range
 * <min(start1, start2), max(end1, end2)>, a key combining both span keys
 * and a value combining both span values.
 */
@Test
public void testSpansOverlapAndWithinThreshold() throws Exception {
    JoinTestHelper.insertToTable(BOOK_TABLE, JoinTestConstants.bookGroup1.get(0));

    KeywordMatcherSourceOperator outerSource = JoinTestHelper.getKeywordSource(BOOK_TABLE, "gastrointestinal tract", phrase);
    KeywordMatcherSourceOperator innerSource = JoinTestHelper.getKeywordSource(BOOK_TABLE, "tract interesting", phrase);
    JoinDistancePredicate distancePredicate = new JoinDistancePredicate(JoinTestConstants.REVIEW, 20);
    List<Tuple> actualTuples = JoinTestHelper.getJoinDistanceResults(innerSource, outerSource, distancePredicate, Integer.MAX_VALUE, 0);

    Schema spanSchema = Utils.createSpanSchema(JoinTestConstants.BOOK_SCHEMA);
    List<Span> joinedSpans = new ArrayList<>();
    joinedSpans.add(new Span(
            JoinTestConstants.REVIEW,
            75,
            109,
            "gastrointestinal tract_" + "tract interesting",
            "gastrointestinal " + "tract interesting"));
    IField[] expectedFields = new IField[] {
            new IntegerField(52),
            new StringField("Mary Roach"),
            new StringField("Grunt: The Curious Science of Humans at War"),
            new IntegerField(288),
            new TextField("It takes a special kind "
                    + "of writer to make topics ranging from death to our "
                    + "gastrointestinal tract interesting (sometimes "
                    + "hilariously so), and pop science writer Mary Roach is "
                    + "always up to the task."),
            new ListField<>(joinedSpans) };
    List<Tuple> expectedTuples = new ArrayList<>();
    expectedTuples.add(new Tuple(spanSchema, expectedFields));

    Assert.assertEquals(1, actualTuples.size());
    Assert.assertTrue(TestUtils.equals(expectedTuples, actualTuples));
}
Also used : Schema(edu.uci.ics.textdb.api.schema.Schema) ArrayList(java.util.ArrayList) IntegerField(edu.uci.ics.textdb.api.field.IntegerField) IField(edu.uci.ics.textdb.api.field.IField) JoinDistancePredicate(edu.uci.ics.textdb.exp.join.JoinDistancePredicate) Span(edu.uci.ics.textdb.api.span.Span) KeywordMatcherSourceOperator(edu.uci.ics.textdb.exp.keywordmatcher.KeywordMatcherSourceOperator) StringField(edu.uci.ics.textdb.api.field.StringField) TextField(edu.uci.ics.textdb.api.field.TextField) Tuple(edu.uci.ics.textdb.api.tuple.Tuple) Test(org.junit.Test)

Example 19 with KeywordMatcherSourceOperator

use of edu.uci.ics.textdb.exp.keywordmatcher.KeywordMatcherSourceOperator in project textdb by TextDB.

the class JoinDistanceTest method testOneSpanEncompassesOtherAndDifferenceLessThanThreshold.

// Joins two spans where one fully contains the other, and both
// |span1.start - span2.start| and |span1.end - span2.end| are within the
// threshold.
// Example:
//   [<11, 18>]  "special"
//   [<3, 33>]   "takes a special kind of writer"
//   threshold = 20 (within threshold)
// Expected: the bigger span is returned.
//   [<3, 33>]
@Test
public void testOneSpanEncompassesOtherAndDifferenceLessThanThreshold() throws Exception {
    JoinTestHelper.insertToTable(BOOK_TABLE, JoinTestConstants.bookGroup1.get(0));

    KeywordMatcherSourceOperator outerSource = JoinTestHelper.getKeywordSource(BOOK_TABLE, "special", conjunction);
    KeywordMatcherSourceOperator innerSource = JoinTestHelper.getKeywordSource(BOOK_TABLE, "takes a special kind of writer", phrase);
    JoinDistancePredicate distancePredicate = new JoinDistancePredicate(JoinTestConstants.REVIEW, 20);
    List<Tuple> actualTuples = JoinTestHelper.getJoinDistanceResults(innerSource, outerSource, distancePredicate, Integer.MAX_VALUE, 0);

    Schema spanSchema = Utils.createSpanSchema(JoinTestConstants.BOOK_SCHEMA);
    List<Span> joinedSpans = new ArrayList<>();
    joinedSpans.add(new Span(
            JoinTestConstants.REVIEW,
            3,
            33,
            "special_takes a special " + "kind of writer",
            "takes a special " + "kind of writer"));
    IField[] expectedFields = new IField[] {
            new IntegerField(52),
            new StringField("Mary Roach"),
            new StringField("Grunt: The Curious Science of Humans at War"),
            new IntegerField(288),
            new TextField("It takes a special kind "
                    + "of writer to make topics ranging from death to our "
                    + "gastrointestinal tract interesting (sometimes "
                    + "hilariously so), and pop science writer Mary Roach is "
                    + "always up to the task."),
            new ListField<>(joinedSpans) };
    List<Tuple> expectedTuples = new ArrayList<>();
    expectedTuples.add(new Tuple(spanSchema, expectedFields));

    Assert.assertEquals(1, actualTuples.size());
    Assert.assertTrue(TestUtils.equals(expectedTuples, actualTuples));
}
Also used : Schema(edu.uci.ics.textdb.api.schema.Schema) ArrayList(java.util.ArrayList) IntegerField(edu.uci.ics.textdb.api.field.IntegerField) IField(edu.uci.ics.textdb.api.field.IField) JoinDistancePredicate(edu.uci.ics.textdb.exp.join.JoinDistancePredicate) Span(edu.uci.ics.textdb.api.span.Span) KeywordMatcherSourceOperator(edu.uci.ics.textdb.exp.keywordmatcher.KeywordMatcherSourceOperator) StringField(edu.uci.ics.textdb.api.field.StringField) TextField(edu.uci.ics.textdb.api.field.TextField) Tuple(edu.uci.ics.textdb.api.tuple.Tuple) Test(org.junit.Test)

Example 20 with KeywordMatcherSourceOperator

use of edu.uci.ics.textdb.exp.keywordmatcher.KeywordMatcherSourceOperator in project textdb by TextDB.

the class JoinDistanceTest method testIdsMatchFieldsMatchSpanExceedThreshold.

/*
 * Joins two keyword spans from the same tuple and field whose distance is
 * greater than the threshold.
 *
 * Example:
 *   [<11, 18>]  "special"
 *   [<42, 48>]  "topics"
 *   threshold = 20 (beyond threshold)
 *
 * Expected: the join returns an empty list.
 */
@Test
public void testIdsMatchFieldsMatchSpanExceedThreshold() throws Exception {
    JoinTestHelper.insertToTable(BOOK_TABLE, JoinTestConstants.bookGroup1.get(0));

    KeywordMatcherSourceOperator outerSource = JoinTestHelper.getKeywordSource(BOOK_TABLE, "special", conjunction);
    KeywordMatcherSourceOperator innerSource = JoinTestHelper.getKeywordSource(BOOK_TABLE, "topics", conjunction);
    JoinDistancePredicate distancePredicate = new JoinDistancePredicate(JoinTestConstants.REVIEW, 20);

    List<Tuple> joinedTuples = JoinTestHelper.getJoinDistanceResults(innerSource, outerSource, distancePredicate, Integer.MAX_VALUE, 0);

    Assert.assertEquals(0, joinedTuples.size());
}
Also used : JoinDistancePredicate(edu.uci.ics.textdb.exp.join.JoinDistancePredicate) Tuple(edu.uci.ics.textdb.api.tuple.Tuple) KeywordMatcherSourceOperator(edu.uci.ics.textdb.exp.keywordmatcher.KeywordMatcherSourceOperator) Test(org.junit.Test)

Aggregations

KeywordMatcherSourceOperator (edu.uci.ics.textdb.exp.keywordmatcher.KeywordMatcherSourceOperator)26 Test (org.junit.Test)22 Tuple (edu.uci.ics.textdb.api.tuple.Tuple)20 JoinDistancePredicate (edu.uci.ics.textdb.exp.join.JoinDistancePredicate)19 ArrayList (java.util.ArrayList)12 Span (edu.uci.ics.textdb.api.span.Span)10 IField (edu.uci.ics.textdb.api.field.IField)9 IntegerField (edu.uci.ics.textdb.api.field.IntegerField)9 StringField (edu.uci.ics.textdb.api.field.StringField)9 TextField (edu.uci.ics.textdb.api.field.TextField)9 Schema (edu.uci.ics.textdb.api.schema.Schema)9 KeywordSourcePredicate (edu.uci.ics.textdb.exp.keywordmatcher.KeywordSourcePredicate)4 IOperator (edu.uci.ics.textdb.api.dataflow.IOperator)3 ISink (edu.uci.ics.textdb.api.dataflow.ISink)3 Plan (edu.uci.ics.textdb.api.engine.Plan)3 Join (edu.uci.ics.textdb.exp.join.Join)3 RegexMatcher (edu.uci.ics.textdb.exp.regexmatcher.RegexMatcher)3 TupleSink (edu.uci.ics.textdb.exp.sink.tuple.TupleSink)3 Attribute (edu.uci.ics.textdb.api.schema.Attribute)2 OneToNBroadcastConnector (edu.uci.ics.textdb.exp.connector.OneToNBroadcastConnector)2