Search in sources :

Example 61 with Span

use of edu.uci.ics.textdb.api.span.Span in project textdb by TextDB.

the class FuzzyTokenMatcherPerformanceTest method match.

/**
 * Runs one fuzzy-token match per query and records timing and result counts.
 *
 * <p>Each query is matched against the attribute named by
 * {@code MedlineIndexWriter.ABSTRACT}. For every query, the elapsed wall-clock
 * time in seconds (kept to four decimal places) is appended to
 * {@code timeResults}, and the number of spans found in the returned tuples is
 * added to {@code totalResultCount}.
 *
 * @param queryList queries to run; a fresh source operator is built for each
 * @param threshold threshold handed to the fuzzy-token predicate
 * @param luceneAnalyzerStr analyzer identifier handed to the predicate
 * @param tableName table name handed to the predicate
 * @param bool not read by this method
 * @throws TextDBException if the operator fails while opening, producing tuples or closing
 * @throws IOException declared for callers; not thrown directly by the visible code
 */
public static void match(ArrayList<String> queryList, double threshold, String luceneAnalyzerStr, String tableName, boolean bool) throws TextDBException, IOException {
    List<String> attributeNames = Arrays.asList(MedlineIndexWriter.ABSTRACT);
    for (String query : queryList) {
        FuzzyTokenSourcePredicate predicate = new FuzzyTokenSourcePredicate(query, attributeNames, luceneAnalyzerStr, threshold, tableName, null);
        FuzzyTokenMatcherSourceOperator fuzzyTokenSource = new FuzzyTokenMatcherSourceOperator(predicate);
        long startMatchTime = System.currentTimeMillis();
        int counter = 0;
        fuzzyTokenSource.open();
        try {
            Tuple nextTuple;
            while ((nextTuple = fuzzyTokenSource.getNextTuple()) != null) {
                ListField<Span> spanListField = nextTuple.getField(SchemaConstants.SPAN_LIST);
                List<Span> spanList = spanListField.getValue();
                counter += spanList.size();
            }
        } finally {
            // Release the operator even when a tuple fetch fails part-way through.
            fuzzyTokenSource.close();
        }
        long endMatchTime = System.currentTimeMillis();
        double matchTime = (endMatchTime - startMatchTime) / 1000.0;
        // Format with a fixed locale: the default locale may use ',' as the decimal
        // separator, which Double.parseDouble rejects with a NumberFormatException.
        timeResults.add(Double.parseDouble(String.format(java.util.Locale.ROOT, "%.4f", matchTime)));
        totalResultCount += counter;
    }
}
Also used : FuzzyTokenMatcherSourceOperator(edu.uci.ics.textdb.exp.fuzzytokenmatcher.FuzzyTokenMatcherSourceOperator) FuzzyTokenSourcePredicate(edu.uci.ics.textdb.exp.fuzzytokenmatcher.FuzzyTokenSourcePredicate) Span(edu.uci.ics.textdb.api.span.Span) Tuple(edu.uci.ics.textdb.api.tuple.Tuple)

Example 62 with Span

use of edu.uci.ics.textdb.api.span.Span in project textdb by TextDB.

the class DictionaryMatcherTest method testSingleWordQueryInTextFieldUsingScan.

/**
 * Scenario S-5: runs the dictionary matcher with the single-word dictionary
 * entry "tall" in SUBSTRING_SCANBASED mode over the first-name, last-name and
 * description attributes, and expects two people tuples, each carrying one
 * span for "Tall" at offsets 0-4 of the description.
 */
@Test
public void testSingleWordQueryInTextFieldUsingScan() throws Exception {
    Dictionary dictionary = new Dictionary(new ArrayList<String>(Arrays.asList("tall")));

    // Both expected tuples carry the same single span.
    List<Span> expectedSpans = new ArrayList<>();
    expectedSpans.add(new Span("description", 0, 4, "tall", "Tall"));

    // Expected schema: the people attributes followed by the results attribute.
    int peopleAttributeCount = TestConstants.ATTRIBUTES_PEOPLE.length;
    Attribute[] schemaAttributes = new Attribute[peopleAttributeCount + 1];
    System.arraycopy(TestConstants.ATTRIBUTES_PEOPLE, 0, schemaAttributes, 0, peopleAttributeCount);
    schemaAttributes[peopleAttributeCount] = RESULTS_ATTRIBUTE;

    SimpleDateFormat dateFormat = new SimpleDateFormat("MM-dd-yyyy");
    IField[] bruceFields = {
        new StringField("bruce"),
        new StringField("john Lee"),
        new IntegerField(46),
        new DoubleField(5.50),
        new DateField(dateFormat.parse("01-14-1970")),
        new TextField("Tall Angry"),
        new ListField<Span>(expectedSpans)
    };
    IField[] christianFields = {
        new StringField("christian john wayne"),
        new StringField("rock bale"),
        new IntegerField(42),
        new DoubleField(5.99),
        new DateField(dateFormat.parse("01-13-1974")),
        new TextField("Tall Fair"),
        new ListField<Span>(expectedSpans)
    };

    List<Tuple> expectedResults = new ArrayList<>();
    expectedResults.add(new Tuple(new Schema(schemaAttributes), bruceFields));
    expectedResults.add(new Tuple(new Schema(schemaAttributes), christianFields));

    List<String> searchedAttributes = Arrays.asList(TestConstants.FIRST_NAME, TestConstants.LAST_NAME, TestConstants.DESCRIPTION);
    List<Tuple> returnedResults = DictionaryMatcherTestHelper.getQueryResults(PEOPLE_TABLE, dictionary, searchedAttributes, KeywordMatchingType.SUBSTRING_SCANBASED);

    Assert.assertTrue(TestUtils.equals(expectedResults, returnedResults));
}
Also used : Dictionary(edu.uci.ics.textdb.exp.dictionarymatcher.Dictionary) Attribute(edu.uci.ics.textdb.api.schema.Attribute) Schema(edu.uci.ics.textdb.api.schema.Schema) ArrayList(java.util.ArrayList) IntegerField(edu.uci.ics.textdb.api.field.IntegerField) IField(edu.uci.ics.textdb.api.field.IField) Span(edu.uci.ics.textdb.api.span.Span) StringField(edu.uci.ics.textdb.api.field.StringField) TextField(edu.uci.ics.textdb.api.field.TextField) DateField(edu.uci.ics.textdb.api.field.DateField) SimpleDateFormat(java.text.SimpleDateFormat) DoubleField(edu.uci.ics.textdb.api.field.DoubleField) Tuple(edu.uci.ics.textdb.api.tuple.Tuple) Test(org.junit.Test)

Example 63 with Span

use of edu.uci.ics.textdb.api.span.Span in project textdb by TextDB.

the class DictionaryMatcherTest method testWordInMultipleFieldsQueryUsingScan.

/**
 * Scenario S-11: the data source has several attributes and the dictionary
 * entry "lin clooney" occurs in more than one of them, and more than once in
 * the description. Runs the dictionary matcher in SUBSTRING_SCANBASED mode and
 * expects a single tuple carrying three spans: one in lastName and two in
 * description.
 */
@Test
public void testWordInMultipleFieldsQueryUsingScan() throws Exception {
    Dictionary dictionary = new Dictionary(new ArrayList<String>(Arrays.asList("lin clooney")));

    // Expected spans, in the order lastName, first description hit, second description hit.
    List<Span> expectedSpans = new ArrayList<>();
    expectedSpans.add(new Span("lastName", 0, 11, "lin clooney", "lin clooney"));
    expectedSpans.add(new Span("description", 0, 11, "lin clooney", "Lin Clooney"));
    expectedSpans.add(new Span("description", 25, 36, "lin clooney", "lin clooney"));

    // Expected schema: the people attributes followed by the results attribute.
    int peopleAttributeCount = TestConstants.ATTRIBUTES_PEOPLE.length;
    Attribute[] schemaAttributes = new Attribute[peopleAttributeCount + 1];
    System.arraycopy(TestConstants.ATTRIBUTES_PEOPLE, 0, schemaAttributes, 0, peopleAttributeCount);
    schemaAttributes[peopleAttributeCount] = RESULTS_ATTRIBUTE;

    IField[] georgeFields = {
        new StringField("george lin lin"),
        new StringField("lin clooney"),
        new IntegerField(43),
        new DoubleField(6.06),
        new DateField(new SimpleDateFormat("MM-dd-yyyy").parse("01-13-1973")),
        new TextField("Lin Clooney is Short and lin clooney is Angry"),
        new ListField<Span>(expectedSpans)
    };

    List<Tuple> expectedResults = new ArrayList<>();
    expectedResults.add(new Tuple(new Schema(schemaAttributes), georgeFields));

    List<String> searchedAttributes = Arrays.asList(TestConstants.FIRST_NAME, TestConstants.LAST_NAME, TestConstants.DESCRIPTION);
    List<Tuple> returnedResults = DictionaryMatcherTestHelper.getQueryResults(PEOPLE_TABLE, dictionary, searchedAttributes, KeywordMatchingType.SUBSTRING_SCANBASED);

    Assert.assertTrue(TestUtils.equals(expectedResults, returnedResults));
}
Also used : Dictionary(edu.uci.ics.textdb.exp.dictionarymatcher.Dictionary) Attribute(edu.uci.ics.textdb.api.schema.Attribute) Schema(edu.uci.ics.textdb.api.schema.Schema) ArrayList(java.util.ArrayList) IntegerField(edu.uci.ics.textdb.api.field.IntegerField) IField(edu.uci.ics.textdb.api.field.IField) Span(edu.uci.ics.textdb.api.span.Span) StringField(edu.uci.ics.textdb.api.field.StringField) TextField(edu.uci.ics.textdb.api.field.TextField) DateField(edu.uci.ics.textdb.api.field.DateField) SimpleDateFormat(java.text.SimpleDateFormat) DoubleField(edu.uci.ics.textdb.api.field.DoubleField) Tuple(edu.uci.ics.textdb.api.tuple.Tuple) Test(org.junit.Test)

Example 64 with Span

use of edu.uci.ics.textdb.api.span.Span in project textdb by TextDB.

the class WordCountOperator method computeWordCount.

/**
 * Drains the input operator and builds the word-count result.
 *
 * <p>For every input tuple, the spans in its "payload" field are scanned;
 * spans whose attribute name equals the predicate's attribute contribute one
 * occurrence of their lower-cased value. The counts are then sorted by
 * descending count into {@code sortedWordCountMap}, and
 * {@code wordCountIterator} is positioned at the start of that list.
 *
 * @throws TextDBException if the input operator fails while producing tuples
 */
private void computeWordCount() throws TextDBException {
    HashMap<String, Integer> wordCountMap = new HashMap<>();
    Tuple tuple;
    while ((tuple = this.inputOperator.getNextTuple()) != null) {
        ListField<Span> payloadField = tuple.getField("payload");
        for (Span span : payloadField.getValue()) {
            if (span.getAttributeName().equals(predicate.getAttribute())) {
                // Lower-case with a fixed locale so keys do not depend on the JVM's
                // default locale (e.g. "I" becomes a dotless "ı" under a Turkish locale).
                String key = span.getValue().toLowerCase(java.util.Locale.ROOT);
                wordCountMap.merge(key, 1, Integer::sum);
            }
        }
    }
    // Descending by count; the relative order of equal counts follows the map's iteration order.
    sortedWordCountMap = wordCountMap.entrySet().stream().sorted((e1, e2) -> e2.getValue().compareTo(e1.getValue())).collect(Collectors.toList());
    wordCountIterator = sortedWordCountMap.iterator();
}
Also used : HashMap(java.util.HashMap) Span(edu.uci.ics.textdb.api.span.Span) Tuple(edu.uci.ics.textdb.api.tuple.Tuple)

Example 65 with Span

use of edu.uci.ics.textdb.api.span.Span in project textdb by TextDB.

the class DataflowUtils method generatePayload.

/**
 * Tokenizes {@code fieldValue} with the given analyzer and returns one span per token.
 *
 * <p>Each span records the attribute name, the token's start and end character
 * offsets, the analyzed term, the original substring of {@code fieldValue}
 * between those offsets, and the token position. The position starts at -1 and
 * is advanced by each token's position increment.
 *
 * @param attributeName attribute name stored in every generated span
 * @param fieldValue text to tokenize
 * @param luceneAnalyzer analyzer that produces the token stream
 * @return the spans in token order, or an empty list if tokenization fails with an {@code IOException}
 */
public static List<Span> generatePayload(String attributeName, String fieldValue, Analyzer luceneAnalyzer) {
    List<Span> payload = new ArrayList<>();
    // try-with-resources closes the stream even when reset() or incrementToken() throws.
    try (TokenStream tokenStream = luceneAnalyzer.tokenStream(null, new StringReader(fieldValue))) {
        OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute positionIncrementAttribute = tokenStream.addAttribute(PositionIncrementAttribute.class);
        int tokenPosition = -1;
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            tokenPosition += positionIncrementAttribute.getPositionIncrement();
            int charStart = offsetAttribute.startOffset();
            int charEnd = offsetAttribute.endOffset();
            String analyzedTermStr = charTermAttribute.toString();
            String originalTermStr = fieldValue.substring(charStart, charEnd);
            payload.add(new Span(attributeName, charStart, charEnd, analyzedTermStr, originalTermStr, tokenPosition));
        }
        // The TokenStream consumer workflow calls end() after the last token and before close().
        tokenStream.end();
    } catch (IOException ignored) {
        // Best effort: a tokenization failure yields an empty payload rather than a partial one.
        payload.clear();
    }
    return payload;
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) ArrayList(java.util.ArrayList) StringReader(java.io.StringReader) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) IOException(java.io.IOException) Span(edu.uci.ics.textdb.api.span.Span) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)

Aggregations

Span (edu.uci.ics.textdb.api.span.Span)112 ArrayList (java.util.ArrayList)97 Schema (edu.uci.ics.textdb.api.schema.Schema)88 IField (edu.uci.ics.textdb.api.field.IField)86 Tuple (edu.uci.ics.textdb.api.tuple.Tuple)80 TextField (edu.uci.ics.textdb.api.field.TextField)71 Attribute (edu.uci.ics.textdb.api.schema.Attribute)71 Test (org.junit.Test)71 IntegerField (edu.uci.ics.textdb.api.field.IntegerField)60 StringField (edu.uci.ics.textdb.api.field.StringField)58 DoubleField (edu.uci.ics.textdb.api.field.DoubleField)49 DateField (edu.uci.ics.textdb.api.field.DateField)46 SimpleDateFormat (java.text.SimpleDateFormat)46 Dictionary (edu.uci.ics.textdb.exp.dictionarymatcher.Dictionary)25 ListField (edu.uci.ics.textdb.api.field.ListField)18 KeywordMatcherSourceOperator (edu.uci.ics.textdb.exp.keywordmatcher.KeywordMatcherSourceOperator)10 AttributeType (edu.uci.ics.textdb.api.schema.AttributeType)9 JoinDistancePredicate (edu.uci.ics.textdb.exp.join.JoinDistancePredicate)9 DataFlowException (edu.uci.ics.textdb.api.exception.DataFlowException)7 SchemaConstants (edu.uci.ics.textdb.api.constants.SchemaConstants)5