Search in sources :

Example 66 with Span

use of edu.uci.ics.texera.api.span.Span in project textdb by TextDB.

the class DataflowUtils method constructPhraseMatchingSpans.

/**
 * Generates the span list for the phrase matching type; shared by the dictionary matcher and the keyword matcher.
 *
 * <p>The spans of the field are sorted by token offset and then scanned with a window of
 * {@code queryTokenList.size()} consecutive spans. A window is a match when, for every pair of
 * neighbouring spans, the token-offset distance equals the distance between the corresponding
 * query tokens (positions taken from {@code queryTokenListWithStopwords}) and the span values
 * equal the query tokens ignoring case. Each matching window is merged into one span that covers
 * the text from the start of its first span to the end of its last span.
 *
 * <p>Note: when the query consists of a single token there are no neighbouring pairs to compare,
 * so every span in {@code fieldSpanList} produces a result without its value being checked.
 *
 * @param attributeName attribute name stored on every produced span
 * @param fieldValue text of the field; the matched text is cut out of it with the combined start/end positions
 * @param queryKeyword key stored on every produced span
 * @param fieldSpanList spans of the field to scan; this list is sorted in place by token offset
 * @param queryTokenListWithStopwords query tokens including stopwords; only used to derive the relative
 *        positions of the tokens that also appear in {@code queryTokenList}
 * @param queryTokenList query tokens that have to be matched
 *        (presumably the query with stopwords removed - confirm against callers)
 * @return the combined spans of all matching windows; empty when nothing matches or the query has no tokens
 */
public static List<Span> constructPhraseMatchingSpans(String attributeName, String fieldValue, String queryKeyword, List<Span> fieldSpanList, List<String> queryTokenListWithStopwords, List<String> queryTokenList) {
    List<Span> matchingResults = new ArrayList<>();
    // Sort current field's span list by token offset for later use.
    // Integer.compare is used instead of subtraction, which can overflow.
    Collections.sort(fieldSpanList, (span1, span2) -> Integer.compare(span1.getTokenOffset(), span2.getTokenOffset()));
    // A query without tokens cannot match anything. Without this guard the
    // window logic below would read the span at index -1.
    if (queryTokenList.isEmpty()) {
        return matchingResults;
    }
    // Positions (within the query including stopwords) of the tokens that are part of queryTokenList
    List<Integer> queryTokenOffset = new ArrayList<>();
    for (int i = 0; i < queryTokenListWithStopwords.size(); i++) {
        if (queryTokenList.contains(queryTokenListWithStopwords.get(i))) {
            queryTokenOffset.add(i);
        }
    }
    int phraseLength = queryTokenList.size();
    // Index of the first span of the window currently being checked;
    // stop as soon as fewer than phraseLength spans remain.
    int iter = 0;
    while (iter <= fieldSpanList.size() - phraseLength) {
        // Verify that the spans in the window correspond to the phrase query:
        // relative token offsets and values must both agree.
        boolean isMismatchInSpan = false;
        for (int i = 0; i < phraseLength - 1; i++) {
            Span first = fieldSpanList.get(iter + i);
            Span second = fieldSpanList.get(iter + i + 1);
            boolean sameDistance = second.getTokenOffset() - first.getTokenOffset() == queryTokenOffset.get(i + 1) - queryTokenOffset.get(i);
            if (!sameDistance || !first.getValue().equalsIgnoreCase(queryTokenList.get(i)) || !second.getValue().equalsIgnoreCase(queryTokenList.get(i + 1))) {
                isMismatchInSpan = true;
                break;
            }
        }
        if (isMismatchInSpan) {
            // Slide the window by one span and try again
            iter++;
            continue;
        }
        // All pairs agreed: merge the window into a single span
        int combinedSpanStartIndex = fieldSpanList.get(iter).getStart();
        int combinedSpanEndIndex = fieldSpanList.get(iter + phraseLength - 1).getEnd();
        Span combinedSpan = new Span(attributeName, combinedSpanStartIndex, combinedSpanEndIndex, queryKeyword, fieldValue.substring(combinedSpanStartIndex, combinedSpanEndIndex));
        matchingResults.add(combinedSpan);
        // Matches do not overlap: continue after the last span of this window
        iter = iter + phraseLength;
    }
    return matchingResults;
}
Also used : Span(edu.uci.ics.texera.api.span.Span)

Example 67 with Span

use of edu.uci.ics.texera.api.span.Span in project textdb by TextDB.

the class WordCountOperator method computeWordCount.

/**
 * Drains the input operator and counts, per lower-cased span value, how many payload spans
 * belong to the predicate's attribute. The counts are then stored sorted by descending count
 * and an iterator over the sorted entries is prepared.
 *
 * @throws TexeraException declared by this method; the visible body does not throw it directly
 */
private void computeWordCount() throws TexeraException {
    HashMap<String, Integer> occurrences = new HashMap<>();
    for (Tuple current = this.inputOperator.getNextTuple(); current != null; current = this.inputOperator.getNextTuple()) {
        Tuple withPayload = current;
        if (addPayload) {
            // Attach a freshly generated payload before reading it below
            ListField<Span> generatedPayload = new ListField<Span>(DataflowUtils.generatePayloadFromTuple(current, predicate.getLuceneAnalyzerString()));
            withPayload = new Tuple.Builder(current).add(SchemaConstants.PAYLOAD_ATTRIBUTE, generatedPayload).build();
        }
        ListField<Span> payload = withPayload.getField("payload");
        for (Span payloadSpan : payload.getValue()) {
            if (!payloadSpan.getAttributeName().equals(predicate.getAttribute())) {
                continue;
            }
            // First occurrence stores 1, later occurrences add 1 to the stored count
            occurrences.merge(payloadSpan.getValue().toLowerCase(), 1, Integer::sum);
        }
    }
    // Highest count first
    sortedWordCountMap = occurrences.entrySet().stream()
            .sorted((left, right) -> right.getValue().compareTo(left.getValue()))
            .collect(Collectors.toList());
    wordCountIterator = sortedWordCountMap.iterator();
}
Also used : HashMap(java.util.HashMap) Span(edu.uci.ics.texera.api.span.Span) Tuple(edu.uci.ics.texera.api.tuple.Tuple)

Example 68 with Span

use of edu.uci.ics.texera.api.span.Span in project textdb by TextDB.

the class DictionaryMatcherTest method testMultipleWordsQueryUsingPhrase.

/**
 * Scenario S-10: checks the tuples returned by DictionaryMatcher for a
 * multiple-word dictionary entry when the PHRASE_INDEXBASED matching type is used.
 */
@Test
public void testMultipleWordsQueryUsingPhrase() throws Exception {
    ArrayList<String> dictionaryEntries = new ArrayList<String>(Arrays.asList("george lin lin"));
    Dictionary dictionary = new Dictionary(dictionaryEntries);

    // Span expected in the results field of the matching tuple
    List<Span> expectedSpans = new ArrayList<Span>();
    expectedSpans.add(new Span("firstName", 0, 14, "george lin lin", "george lin lin"));

    // Expected schema: the people attributes followed by the results attribute
    int peopleAttributeCount = TestConstants.ATTRIBUTES_PEOPLE.length;
    Attribute[] schemaAttributes = new Attribute[peopleAttributeCount + 1];
    System.arraycopy(TestConstants.ATTRIBUTES_PEOPLE, 0, schemaAttributes, 0, peopleAttributeCount);
    schemaAttributes[peopleAttributeCount] = RESULTS_ATTRIBUTE;

    IField[] expectedFields = {
        new StringField("george lin lin"),
        new StringField("lin clooney"),
        new IntegerField(43),
        new DoubleField(6.06),
        new DateField(new SimpleDateFormat("MM-dd-yyyy").parse("01-13-1973")),
        new TextField("Lin Clooney is Short and lin clooney is Angry"),
        new ListField<Span>(expectedSpans)
    };
    List<Tuple> expectedResults = new ArrayList<Tuple>();
    expectedResults.add(new Tuple(new Schema(schemaAttributes), expectedFields));

    List<String> attributeNames = Arrays.asList(TestConstants.FIRST_NAME, TestConstants.LAST_NAME, TestConstants.DESCRIPTION);
    List<Tuple> returnedResults = DictionaryMatcherTestHelper.getQueryResults(PEOPLE_TABLE, dictionary, attributeNames, KeywordMatchingType.PHRASE_INDEXBASED);

    Assert.assertTrue(TestUtils.equals(expectedResults, returnedResults));
}
Also used : Dictionary(edu.uci.ics.texera.dataflow.dictionarymatcher.Dictionary) Attribute(edu.uci.ics.texera.api.schema.Attribute) Schema(edu.uci.ics.texera.api.schema.Schema) ArrayList(java.util.ArrayList) IntegerField(edu.uci.ics.texera.api.field.IntegerField) IField(edu.uci.ics.texera.api.field.IField) Span(edu.uci.ics.texera.api.span.Span) StringField(edu.uci.ics.texera.api.field.StringField) TextField(edu.uci.ics.texera.api.field.TextField) DateField(edu.uci.ics.texera.api.field.DateField) SimpleDateFormat(java.text.SimpleDateFormat) DoubleField(edu.uci.ics.texera.api.field.DoubleField) Tuple(edu.uci.ics.texera.api.tuple.Tuple) Test(org.junit.Test)

Example 69 with Span

use of edu.uci.ics.texera.api.span.Span in project textdb by TextDB.

the class DictionaryMatcherTest method testMultipleWordQueryInTextFieldUsingScan2.

/**
 * Checks the tuples returned by DictionaryMatcher for the two-word dictionary
 * entry "tall fair" when the SUBSTRING_SCANBASED matching type is used.
 */
@Test
public void testMultipleWordQueryInTextFieldUsingScan2() throws Exception {
    ArrayList<String> dictionaryEntries = new ArrayList<String>(Arrays.asList("tall fair"));
    Dictionary dictionary = new Dictionary(dictionaryEntries);

    // Span expected in the results field of the matching tuple
    List<Span> expectedSpans = new ArrayList<Span>();
    expectedSpans.add(new Span("description", 0, 9, "tall fair", "Tall Fair"));

    // Expected schema: the people attributes followed by the results attribute
    int peopleAttributeCount = TestConstants.ATTRIBUTES_PEOPLE.length;
    Attribute[] schemaAttributes = new Attribute[peopleAttributeCount + 1];
    System.arraycopy(TestConstants.ATTRIBUTES_PEOPLE, 0, schemaAttributes, 0, peopleAttributeCount);
    schemaAttributes[peopleAttributeCount] = RESULTS_ATTRIBUTE;

    IField[] expectedFields = {
        new StringField("christian john wayne"),
        new StringField("rock bale"),
        new IntegerField(42),
        new DoubleField(5.99),
        new DateField(new SimpleDateFormat("MM-dd-yyyy").parse("01-13-1974")),
        new TextField("Tall Fair"),
        new ListField<Span>(expectedSpans)
    };
    List<Tuple> expectedResults = new ArrayList<Tuple>();
    expectedResults.add(new Tuple(new Schema(schemaAttributes), expectedFields));

    List<String> attributeNames = Arrays.asList(TestConstants.FIRST_NAME, TestConstants.LAST_NAME, TestConstants.DESCRIPTION);
    List<Tuple> returnedResults = DictionaryMatcherTestHelper.getQueryResults(PEOPLE_TABLE, dictionary, attributeNames, KeywordMatchingType.SUBSTRING_SCANBASED);

    Assert.assertTrue(TestUtils.equals(expectedResults, returnedResults));
}
Also used : Dictionary(edu.uci.ics.texera.dataflow.dictionarymatcher.Dictionary) Attribute(edu.uci.ics.texera.api.schema.Attribute) Schema(edu.uci.ics.texera.api.schema.Schema) ArrayList(java.util.ArrayList) IntegerField(edu.uci.ics.texera.api.field.IntegerField) IField(edu.uci.ics.texera.api.field.IField) Span(edu.uci.ics.texera.api.span.Span) StringField(edu.uci.ics.texera.api.field.StringField) TextField(edu.uci.ics.texera.api.field.TextField) DateField(edu.uci.ics.texera.api.field.DateField) SimpleDateFormat(java.text.SimpleDateFormat) DoubleField(edu.uci.ics.texera.api.field.DoubleField) Tuple(edu.uci.ics.texera.api.tuple.Tuple) Test(org.junit.Test)

Example 70 with Span

use of edu.uci.ics.texera.api.span.Span in project textdb by TextDB.

the class DictionaryMatcherTest method testSingleWordQueryInStringFieldUsingScan.

/**
 * Scenario: checks the tuples returned by DictionaryMatcher for the single-word
 * dictionary entry "bruce" when the SUBSTRING_SCANBASED matching type is used.
 */
@Test
public void testSingleWordQueryInStringFieldUsingScan() throws Exception {
    ArrayList<String> dictionaryEntries = new ArrayList<String>(Arrays.asList("bruce"));
    Dictionary dictionary = new Dictionary(dictionaryEntries);

    // Span expected in the results field of the matching tuple
    List<Span> expectedSpans = new ArrayList<Span>();
    expectedSpans.add(new Span("firstName", 0, 5, "bruce", "bruce"));

    // Expected schema: the people attributes followed by the results attribute
    int peopleAttributeCount = TestConstants.ATTRIBUTES_PEOPLE.length;
    Attribute[] schemaAttributes = new Attribute[peopleAttributeCount + 1];
    System.arraycopy(TestConstants.ATTRIBUTES_PEOPLE, 0, schemaAttributes, 0, peopleAttributeCount);
    schemaAttributes[peopleAttributeCount] = RESULTS_ATTRIBUTE;

    IField[] expectedFields = {
        new StringField("bruce"),
        new StringField("john Lee"),
        new IntegerField(46),
        new DoubleField(5.50),
        new DateField(new SimpleDateFormat("MM-dd-yyyy").parse("01-14-1970")),
        new TextField("Tall Angry"),
        new ListField<Span>(expectedSpans)
    };
    List<Tuple> expectedResults = new ArrayList<Tuple>();
    expectedResults.add(new Tuple(new Schema(schemaAttributes), expectedFields));

    List<String> attributeNames = Arrays.asList(TestConstants.FIRST_NAME, TestConstants.LAST_NAME, TestConstants.DESCRIPTION);
    List<Tuple> returnedResults = DictionaryMatcherTestHelper.getQueryResults(PEOPLE_TABLE, dictionary, attributeNames, KeywordMatchingType.SUBSTRING_SCANBASED);

    Assert.assertTrue(TestUtils.equals(expectedResults, returnedResults));
}
Also used : Dictionary(edu.uci.ics.texera.dataflow.dictionarymatcher.Dictionary) Attribute(edu.uci.ics.texera.api.schema.Attribute) Schema(edu.uci.ics.texera.api.schema.Schema) ArrayList(java.util.ArrayList) IntegerField(edu.uci.ics.texera.api.field.IntegerField) IField(edu.uci.ics.texera.api.field.IField) Span(edu.uci.ics.texera.api.span.Span) StringField(edu.uci.ics.texera.api.field.StringField) TextField(edu.uci.ics.texera.api.field.TextField) DateField(edu.uci.ics.texera.api.field.DateField) SimpleDateFormat(java.text.SimpleDateFormat) DoubleField(edu.uci.ics.texera.api.field.DoubleField) Tuple(edu.uci.ics.texera.api.tuple.Tuple) Test(org.junit.Test)

Aggregations

Span (edu.uci.ics.texera.api.span.Span)130 ArrayList (java.util.ArrayList)104 IField (edu.uci.ics.texera.api.field.IField)100 Tuple (edu.uci.ics.texera.api.tuple.Tuple)99 Schema (edu.uci.ics.texera.api.schema.Schema)90 Test (org.junit.Test)84 TextField (edu.uci.ics.texera.api.field.TextField)78 IntegerField (edu.uci.ics.texera.api.field.IntegerField)66 StringField (edu.uci.ics.texera.api.field.StringField)64 DoubleField (edu.uci.ics.texera.api.field.DoubleField)55 DateField (edu.uci.ics.texera.api.field.DateField)52 SimpleDateFormat (java.text.SimpleDateFormat)52 Attribute (edu.uci.ics.texera.api.schema.Attribute)51 ListField (edu.uci.ics.texera.api.field.ListField)40 Dictionary (edu.uci.ics.texera.dataflow.dictionarymatcher.Dictionary)30 DataflowException (edu.uci.ics.texera.api.exception.DataflowException)11 AttributeType (edu.uci.ics.texera.api.schema.AttributeType)11 KeywordMatcherSourceOperator (edu.uci.ics.texera.dataflow.keywordmatcher.KeywordMatcherSourceOperator)11 JoinDistancePredicate (edu.uci.ics.texera.dataflow.join.JoinDistancePredicate)9 Collectors (java.util.stream.Collectors)8