Search in sources :

Example 1 with Span

use of edu.uci.ics.texera.api.span.Span in project textdb by TextDB.

the class DataReader method buildPayloadFromTermVector.

/**
 * Builds the payload span list of one document from its term vectors.
 *
 * <p>For every TEXT attribute of {@code inputSchema}, the attribute's term vector for
 * {@code docID} is read from {@code luceneIndexReader}, and one {@link Span} is emitted per
 * term occurrence. Each span carries the attribute name, the character start/end offsets,
 * the analyzed term, the original text sliced from the field value at those offsets, and the
 * token position. Non-TEXT attributes and attributes without a term vector are skipped.
 *
 * @param fields the document's field values; the value of an attribute is looked up at
 *               {@code inputSchema.getIndex(attributeName)}
 * @param docID  the document id passed to {@code luceneIndexReader.getTermVector}
 * @return the spans of all term occurrences; empty when no TEXT attribute has a term vector
 * @throws IOException if reading the term vector or its postings fails
 */
private ArrayList<Span> buildPayloadFromTermVector(List<IField> fields, int docID) throws IOException {
    ArrayList<Span> payloadSpanList = new ArrayList<>();
    for (Attribute attr : inputSchema.getAttributes()) {
        String attributeName = attr.getName();
        // Only TEXT attributes contribute spans to the payload.
        if (attr.getType() != AttributeType.TEXT) {
            continue;
        }
        String fieldValue = fields.get(inputSchema.getIndex(attributeName)).getValue().toString();
        Terms termVector = luceneIndexReader.getTermVector(docID, attributeName);
        if (termVector == null) {
            continue;
        }
        TermsEnum termsEnum = termVector.iterator();
        PostingsEnum termPostings = null;
        // go through document terms
        while (termsEnum.next() != null) {
            termPostings = termsEnum.postings(termPostings, PostingsEnum.ALL);
            if (termPostings.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
                continue;
            }
            // The term text and its frequency are fixed for the current term, so they are
            // read once here instead of once per position.
            String analyzedTermStr = termsEnum.term().utf8ToString();
            int occurrenceCount = termPostings.freq();
            // for each term, go through its postings
            for (int i = 0; i < occurrenceCount; i++) {
                // nextPosition needs to be called first: the offsets below refer to the
                // position it advanced to
                int tokenPosition = termPostings.nextPosition();
                int charStart = termPostings.startOffset();
                int charEnd = termPostings.endOffset();
                String originalTermStr = fieldValue.substring(charStart, charEnd);
                payloadSpanList.add(new Span(attributeName, charStart, charEnd, analyzedTermStr, originalTermStr, tokenPosition));
            }
        }
    }
    return payloadSpanList;
}
Also used : Attribute(edu.uci.ics.texera.api.schema.Attribute) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) ArrayList(java.util.ArrayList) Terms(org.apache.lucene.index.Terms) PostingsEnum(org.apache.lucene.index.PostingsEnum) Span(edu.uci.ics.texera.api.span.Span) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 2 with Span

use of edu.uci.ics.texera.api.span.Span in project textdb by TextDB.

the class SpanTupleTest method createSpanListField.

/**
 * Creates a list field holding two sample spans on the "description" attribute, both keyed
 * "LOCATION": "new york" at offsets 18-26 and "los angeles" at offsets 52-63.
 *
 * @return a {@code ListField<Span>} wrapping the two spans, in that order
 */
private IField createSpanListField() {
    // What the span key would be, depending on the matcher:
    //   RegexMatcher       : "n.*k"
    //   NamedEntityMatcher : LOCATION
    //   DictionaryMatcher  : "new york" - key and value are the same
    //   KeyWordMatcher     : "new york" - the value can be "new" or "york"
    List<Span> spans = new ArrayList<Span>();
    spans.add(new Span("description", 18, 26, "LOCATION", "new york"));
    spans.add(new Span("description", 52, 63, "LOCATION", "los angeles"));
    return new ListField<Span>(spans);
}
Also used : ArrayList(java.util.ArrayList) ListField(edu.uci.ics.texera.api.field.ListField) IField(edu.uci.ics.texera.api.field.IField) Span(edu.uci.ics.texera.api.span.Span)

Example 3 with Span

use of edu.uci.ics.texera.api.span.Span in project textdb by TextDB.

the class SpanTest method testGetters.

/**
 * Checks that every getter of a Span built with the five-argument constructor returns the
 * value that was passed in.
 */
@Test
public void testGetters() {
    final String expectedAttribute = "description";
    final int expectedStart = 10;
    final int expectedEnd = 20;
    final String expectedKey = "location";
    final String expectedValue = "new york";

    span = new Span(expectedAttribute, expectedStart, expectedEnd, expectedKey, expectedValue);

    Assert.assertEquals(expectedStart, span.getStart());
    Assert.assertEquals(expectedEnd, span.getEnd());
    Assert.assertEquals(expectedKey, span.getKey());
    Assert.assertEquals(expectedValue, span.getValue());
    Assert.assertEquals(expectedAttribute, span.getAttributeName());
}
Also used : Span(edu.uci.ics.texera.api.span.Span) Test(org.junit.Test)

Example 4 with Span

use of edu.uci.ics.texera.api.span.Span in project textdb by TextDB.

the class JsonSerializationTest method testTupleWithSpanlist.

/**
 * Passes a tuple with a TEXT attribute ("attr1") and a LIST attribute ("spanList") holding
 * one span to {@code TestUtils.testJsonSerialization}.
 */
@Test
public void testTupleWithSpanlist() {
    TextField textField = new TextField("test");
    // A single span over "attr1", offsets 0-4, with key and value both "test".
    ListField<Span> spanListField = new ListField<Span>(Arrays.asList(new Span("attr1", 0, 4, "test", "test")));

    Tuple tuple = new Tuple.Builder()
            .add("attr1", AttributeType.TEXT, textField)
            .add("spanList", AttributeType.LIST, spanListField)
            .build();

    TestUtils.testJsonSerialization(tuple);
}
Also used : TextField(edu.uci.ics.texera.api.field.TextField) ListField(edu.uci.ics.texera.api.field.ListField) Span(edu.uci.ics.texera.api.span.Span) Tuple(edu.uci.ics.texera.api.tuple.Tuple) Test(org.junit.Test)

Example 5 with Span

use of edu.uci.ics.texera.api.span.Span in project textdb by TextDB.

the class FuzzyTokenMatcherPerformanceTest method match.

/**
 * Runs a fuzzy-token match for every query in {@code queryList} and records the timing and
 * the number of matched spans.
 *
 * <p>For each query a {@code FuzzyTokenMatcherSourceOperator} is opened over the
 * {@code MedlineIndexWriter.ABSTRACT} attribute, drained tuple by tuple, and closed. The
 * elapsed wall-clock time in seconds (open through close, rounded to four decimals) is
 * appended to {@code timeResults}; the total size of the tuples' span lists is added to
 * {@code totalResultCount}.
 *
 * @param queryList         the queries to run, one operator per query
 * @param threshold         the threshold passed to the fuzzy-token predicate
 * @param luceneAnalyzerStr the analyzer string passed to the predicate
 * @param tableName         the table name passed to the predicate
 * @param bool              not read by this method
 * @throws TexeraException declared by this method; not thrown directly by the code shown here
 * @throws IOException     declared by this method; not thrown directly by the code shown here
 */
public static void match(ArrayList<String> queryList, double threshold, String luceneAnalyzerStr, String tableName, boolean bool) throws TexeraException, IOException {
    List<String> attributeNames = Arrays.asList(MedlineIndexWriter.ABSTRACT);
    for (String query : queryList) {
        FuzzyTokenSourcePredicate predicate = new FuzzyTokenSourcePredicate(query, attributeNames, luceneAnalyzerStr, threshold, tableName, SchemaConstants.SPAN_LIST);
        FuzzyTokenMatcherSourceOperator fuzzyTokenSource = new FuzzyTokenMatcherSourceOperator(predicate);
        long startMatchTime = System.currentTimeMillis();
        int counter = 0;
        fuzzyTokenSource.open();
        try {
            Tuple nextTuple;
            while ((nextTuple = fuzzyTokenSource.getNextTuple()) != null) {
                ListField<Span> spanListField = nextTuple.getField(SchemaConstants.SPAN_LIST);
                List<Span> spanList = spanListField.getValue();
                counter += spanList.size();
            }
        } finally {
            // Close the operator even when reading a tuple fails, so it is not left open.
            fuzzyTokenSource.close();
        }
        long endMatchTime = System.currentTimeMillis();
        double matchTime = (endMatchTime - startMatchTime) / 1000.0;
        // Format with Locale.ROOT: under a default locale that uses a decimal comma the
        // formatted text (e.g. "0,1230") would make Double.parseDouble throw.
        timeResults.add(Double.parseDouble(String.format(java.util.Locale.ROOT, "%.4f", matchTime)));
        totalResultCount += counter;
    }
}
Also used : FuzzyTokenMatcherSourceOperator(edu.uci.ics.texera.dataflow.fuzzytokenmatcher.FuzzyTokenMatcherSourceOperator) FuzzyTokenSourcePredicate(edu.uci.ics.texera.dataflow.fuzzytokenmatcher.FuzzyTokenSourcePredicate) Span(edu.uci.ics.texera.api.span.Span) Tuple(edu.uci.ics.texera.api.tuple.Tuple)

Aggregations

Span (edu.uci.ics.texera.api.span.Span): 135, ArrayList (java.util.ArrayList): 104, IField (edu.uci.ics.texera.api.field.IField): 100, Tuple (edu.uci.ics.texera.api.tuple.Tuple): 100, Schema (edu.uci.ics.texera.api.schema.Schema): 91, Test (org.junit.Test): 84, TextField (edu.uci.ics.texera.api.field.TextField): 79, IntegerField (edu.uci.ics.texera.api.field.IntegerField): 67, StringField (edu.uci.ics.texera.api.field.StringField): 65, DoubleField (edu.uci.ics.texera.api.field.DoubleField): 55, Attribute (edu.uci.ics.texera.api.schema.Attribute): 53, DateField (edu.uci.ics.texera.api.field.DateField): 52, SimpleDateFormat (java.text.SimpleDateFormat): 52, ListField (edu.uci.ics.texera.api.field.ListField): 41, Dictionary (edu.uci.ics.texera.dataflow.dictionarymatcher.Dictionary): 30, DataflowException (edu.uci.ics.texera.api.exception.DataflowException): 13, AttributeType (edu.uci.ics.texera.api.schema.AttributeType): 12, KeywordMatcherSourceOperator (edu.uci.ics.texera.dataflow.keywordmatcher.KeywordMatcherSourceOperator): 11, JoinDistancePredicate (edu.uci.ics.texera.dataflow.join.JoinDistancePredicate): 9, Collectors (java.util.stream.Collectors): 9