Search in sources :

Example 61 with Span

use of edu.uci.ics.texera.api.span.Span in project textdb by TextDB.

the class LabeledRegexMatcherTest method testQueryWithoutQualifiersLabeledRegex2.

/**
 * Runs the labeled regex {@code "<lab2> is <lab1>"} against the people table and checks
 * that the RESULTS attribute of the output equals a single expected span
 * ("Clooney is Short", characters 4 to 20 of the description attribute) attached to
 * the sample people tuple at index 3.
 */
@Test
public void testQueryWithoutQualifiersLabeledRegex2() throws Exception {
    String query = "<lab2> is <lab1>";
    List<Tuple> actualTuples = RegexMatcherTestHelper.getQueryResults(PEOPLE_TABLE, query, "short", Arrays.asList(TestConstants.DESCRIPTION), "lab1", false, Integer.MAX_VALUE, 0, "Clooney", "lab2");

    // Schema of the expected tuple: the people schema plus one list-typed RESULTS attribute.
    List<Tuple> samplePeople = TestConstants.getSamplePeopleTuples();
    Schema schemaWithResults = new Schema.Builder().add(TestConstants.SCHEMA_PEOPLE).add(RESULTS, AttributeType.LIST).build();

    // The only expected span, wrapped in a list field.
    List<Span> expectedSpans = new ArrayList<>();
    expectedSpans.add(new Span(TestConstants.DESCRIPTION, 4, 20, query, "Clooney is Short"));
    IField resultsField = new ListField<>(new ArrayList<>(expectedSpans));

    // Expected tuple = fields of sample tuple #3 followed by the results field.
    List<IField> expectedFields = new ArrayList<>(samplePeople.get(3).getFields());
    expectedFields.add(resultsField);
    IField[] expectedFieldArray = expectedFields.toArray(new IField[expectedFields.size()]);

    List<Tuple> expectedTuples = new ArrayList<>();
    expectedTuples.add(new Tuple(schemaWithResults, expectedFieldArray));

    // Compare only the RESULTS attribute of expected vs. actual tuples.
    List<String> comparedAttributes = new ArrayList<>();
    comparedAttributes.add(RESULTS);
    boolean resultsMatch = TestUtils.attributeEquals(expectedTuples, actualTuples, comparedAttributes);
    Assert.assertTrue(resultsMatch);
}
Also used : Schema(edu.uci.ics.texera.api.schema.Schema) ArrayList(java.util.ArrayList) ListField(edu.uci.ics.texera.api.field.ListField) IField(edu.uci.ics.texera.api.field.IField) Span(edu.uci.ics.texera.api.span.Span) Tuple(edu.uci.ics.texera.api.tuple.Tuple) Test(org.junit.Test)

Example 62 with Span

use of edu.uci.ics.texera.api.span.Span in project textdb by TextDB.

the class MysqlSinkTest method test2TupleInsertion.

/**
 * Creates two people tuples, each carrying the same two-element span list, and feeds
 * them to {@code mysqlSink} through a mocked input operator.
 *
 * The mocked operator returns the schema on the first {@code getOutputSchema()} call and
 * {@code null} afterwards; {@code getNextTuple()} yields the two tuples and then
 * {@code null}. The sink is opened, asked to process the tuples, and closed. This method
 * contains no assertions of its own.
 *
 * @throws Exception declared by the signature; date parsing and the sink calls made here
 *         may throw
 */
public void test2TupleInsertion() throws Exception {
    // Span list shared by both tuples
    List<Span> list = new ArrayList<>();
    Span span1 = new Span("firstName", 0, 5, "bruce", "bruce");
    // NOTE(review): "lastnName" looks like a typo for the last-name attribute; kept
    // unchanged because the intended value cannot be confirmed from here.
    Span span2 = new Span("lastnName", 0, 5, "jacki", "jacki");
    list.add(span1);
    list.add(span2);

    // Schema = all people attributes followed by the span-list attribute
    int peopleAttributeCount = TestConstants.ATTRIBUTES_PEOPLE.length;
    Attribute[] schemaAttributes = new Attribute[peopleAttributeCount + 1];
    System.arraycopy(TestConstants.ATTRIBUTES_PEOPLE, 0, schemaAttributes, 0, peopleAttributeCount);
    schemaAttributes[peopleAttributeCount] = SchemaConstants.SPAN_LIST_ATTRIBUTE;

    IField[] fields1 = { new StringField("bruce"), new StringField("john Lee"), new IntegerField(46), new DoubleField(5.50), new DateField(new SimpleDateFormat("MM-dd-yyyy").parse("01-14-1970")), new TextField("Tall Angry"), new ListField<>(list) };
    IField[] fields2 = { new StringField("test"), new StringField("jackie chan"), new IntegerField(0), new DoubleField(6.0), new DateField(new SimpleDateFormat("MM-dd-yyyy").parse("09-18-1994")), new TextField("Angry Bird"), new ListField<>(list) };
    Tuple tuple1 = new Tuple(new Schema(schemaAttributes), fields1);
    Tuple tuple2 = new Tuple(new Schema(schemaAttributes), fields2);

    // Mocked upstream operator: schema once, then the two tuples, then end-of-input (null)
    IOperator localInputOperator = Mockito.mock(IOperator.class);
    Mockito.when(localInputOperator.getOutputSchema()).thenReturn(new Schema(schemaAttributes)).thenReturn(null);
    Mockito.when(localInputOperator.getNextTuple()).thenReturn(tuple1).thenReturn(tuple2).thenReturn(null);

    mysqlSink.setInputOperator(localInputOperator);
    mysqlSink.open();
    mysqlSink.processTuples();
    mysqlSink.close();
}
Also used : Attribute(edu.uci.ics.texera.api.schema.Attribute) IOperator(edu.uci.ics.texera.api.dataflow.IOperator) Schema(edu.uci.ics.texera.api.schema.Schema) ArrayList(java.util.ArrayList) IntegerField(edu.uci.ics.texera.api.field.IntegerField) IField(edu.uci.ics.texera.api.field.IField) Span(edu.uci.ics.texera.api.span.Span) StringField(edu.uci.ics.texera.api.field.StringField) TextField(edu.uci.ics.texera.api.field.TextField) DateField(edu.uci.ics.texera.api.field.DateField) SimpleDateFormat(java.text.SimpleDateFormat) DoubleField(edu.uci.ics.texera.api.field.DoubleField) Tuple(edu.uci.ics.texera.api.tuple.Tuple)

Example 63 with Span

use of edu.uci.ics.texera.api.span.Span in project textdb by TextDB.

the class FuzzyTokenMatcherPerformanceTest method match.

/**
 * Runs a fuzzy-token match for every query in {@code queryList} and records, per query,
 * the elapsed time in seconds (rounded to four decimals) in {@code timeResults} and the
 * number of matched spans in {@code totalResultCount}.
 *
 * @param queryList         queries to run, one source operator per query
 * @param threshold         threshold passed to the fuzzy-token predicate
 * @param luceneAnalyzerStr analyzer name passed to the predicate
 * @param tableName         table the predicate reads from
 * @param bool              not used by this method
 * @throws TexeraException declared by the signature; may come from the operator calls
 * @throws IOException     declared by the signature
 */
public static void match(ArrayList<String> queryList, double threshold, String luceneAnalyzerStr, String tableName, boolean bool) throws TexeraException, IOException {
    List<String> attributeNames = Arrays.asList(MedlineIndexWriter.ABSTRACT);
    for (String query : queryList) {
        FuzzyTokenSourcePredicate predicate = new FuzzyTokenSourcePredicate(query, attributeNames, luceneAnalyzerStr, threshold, tableName, SchemaConstants.SPAN_LIST);
        FuzzyTokenMatcherSourceOperator fuzzyTokenSource = new FuzzyTokenMatcherSourceOperator(predicate);
        int counter = 0;
        long startMatchTime = System.currentTimeMillis();
        fuzzyTokenSource.open();
        try {
            Tuple nextTuple;
            while ((nextTuple = fuzzyTokenSource.getNextTuple()) != null) {
                ListField<Span> spanListField = nextTuple.getField(SchemaConstants.SPAN_LIST);
                List<Span> spanList = spanListField.getValue();
                counter += spanList.size();
            }
        } finally {
            // Close the operator even when reading a tuple fails.
            fuzzyTokenSource.close();
        }
        long endMatchTime = System.currentTimeMillis();
        double matchTime = (endMatchTime - startMatchTime) / 1000.0;
        // Format with Locale.ROOT: a default locale that uses ',' as the decimal separator
        // would produce text that Double.parseDouble rejects.
        timeResults.add(Double.parseDouble(String.format(java.util.Locale.ROOT, "%.4f", matchTime)));
        totalResultCount += counter;
    }
}
Also used : FuzzyTokenMatcherSourceOperator(edu.uci.ics.texera.dataflow.fuzzytokenmatcher.FuzzyTokenMatcherSourceOperator) FuzzyTokenSourcePredicate(edu.uci.ics.texera.dataflow.fuzzytokenmatcher.FuzzyTokenSourcePredicate) Span(edu.uci.ics.texera.api.span.Span) Tuple(edu.uci.ics.texera.api.tuple.Tuple)

Example 64 with Span

use of edu.uci.ics.texera.api.span.Span in project textdb by TextDB.

the class KeywordMatcherPerformanceTest method match.

/**
 * Runs a keyword match for every query in {@code queryList} and records, per query,
 * the elapsed time in seconds (rounded to four decimals) in {@code timeResults} and the
 * number of matched spans in {@code totalResultCount}.
 *
 * @param queryList         queries to run, one source operator per query
 * @param opType            keyword matching type passed to the predicate
 * @param luceneAnalyzerStr analyzer name passed to the predicate
 * @param tableName         table the predicate reads from
 * @throws TexeraException declared by the signature; may come from the operator calls
 * @throws IOException     declared by the signature
 */
public static void match(ArrayList<String> queryList, KeywordMatchingType opType, String luceneAnalyzerStr, String tableName) throws TexeraException, IOException {
    String[] attributeNames = new String[] { MedlineIndexWriter.ABSTRACT };
    for (String query : queryList) {
        KeywordSourcePredicate predicate = new KeywordSourcePredicate(query, Arrays.asList(attributeNames), luceneAnalyzerStr, opType, tableName, SchemaConstants.SPAN_LIST);
        KeywordMatcherSourceOperator keywordSource = new KeywordMatcherSourceOperator(predicate);
        int counter = 0;
        long startMatchTime = System.currentTimeMillis();
        keywordSource.open();
        try {
            Tuple nextTuple;
            while ((nextTuple = keywordSource.getNextTuple()) != null) {
                ListField<Span> spanListField = nextTuple.getField(SchemaConstants.SPAN_LIST);
                List<Span> spanList = spanListField.getValue();
                counter += spanList.size();
            }
        } finally {
            // Close the operator even when reading a tuple fails.
            keywordSource.close();
        }
        long endMatchTime = System.currentTimeMillis();
        double matchTime = (endMatchTime - startMatchTime) / 1000.0;
        // Format with Locale.ROOT: a default locale that uses ',' as the decimal separator
        // would produce text that Double.parseDouble rejects.
        timeResults.add(Double.parseDouble(String.format(java.util.Locale.ROOT, "%.4f", matchTime)));
        totalResultCount += counter;
    }
}
Also used : KeywordSourcePredicate(edu.uci.ics.texera.dataflow.keywordmatcher.KeywordSourcePredicate) Span(edu.uci.ics.texera.api.span.Span) Tuple(edu.uci.ics.texera.api.tuple.Tuple) KeywordMatcherSourceOperator(edu.uci.ics.texera.dataflow.keywordmatcher.KeywordMatcherSourceOperator)

Example 65 with Span

use of edu.uci.ics.texera.api.span.Span in project textdb by TextDB.

the class DataReader method buildPayloadFromTermVector.

/**
 * Builds the span list for one document from its term vectors: for every TEXT
 * attribute of the input schema that has a term vector, one {@link Span} is created
 * per term occurrence, carrying the character offsets, the analyzed term, the
 * corresponding substring of the field value, and the token position.
 *
 * @param fields field values of the document, indexed by the input schema
 * @param docID  document id used to look up term vectors
 * @return the collected spans; empty when no TEXT attribute has a term vector
 * @throws IOException declared by the signature; may come from the index reader calls
 */
private ArrayList<Span> buildPayloadFromTermVector(List<IField> fields, int docID) throws IOException {
    ArrayList<Span> collectedSpans = new ArrayList<>();
    for (Attribute attribute : inputSchema.getAttributes()) {
        String name = attribute.getName();
        AttributeType type = attribute.getType();
        // Only TEXT attributes contribute spans to the payload.
        if (type != AttributeType.TEXT) {
            continue;
        }
        int fieldIndex = inputSchema.getIndex(name);
        String text = fields.get(fieldIndex).getValue().toString();
        Terms terms = luceneIndexReader.getTermVector(docID, name);
        if (terms == null) {
            // No term vector available for this attribute.
            continue;
        }
        TermsEnum termCursor = terms.iterator();
        PostingsEnum postings = null;
        // Walk every term of the document
        while (termCursor.next() != null) {
            postings = termCursor.postings(postings, PostingsEnum.ALL);
            if (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                // Visit each occurrence of the current term
                int occurrence = 0;
                while (occurrence < postings.freq()) {
                    // nextPosition must be called before the offsets are read
                    int position = postings.nextPosition();
                    int start = postings.startOffset();
                    int end = postings.endOffset();
                    String analyzed = termCursor.term().utf8ToString();
                    String original = text.substring(start, end);
                    collectedSpans.add(new Span(name, start, end, analyzed, original, position));
                    occurrence++;
                }
            }
        }
    }
    return collectedSpans;
}
Also used : Attribute(edu.uci.ics.texera.api.schema.Attribute) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) ArrayList(java.util.ArrayList) Terms(org.apache.lucene.index.Terms) PostingsEnum(org.apache.lucene.index.PostingsEnum) Span(edu.uci.ics.texera.api.span.Span) TermsEnum(org.apache.lucene.index.TermsEnum)

Aggregations

Span (edu.uci.ics.texera.api.span.Span)130 ArrayList (java.util.ArrayList)104 IField (edu.uci.ics.texera.api.field.IField)100 Tuple (edu.uci.ics.texera.api.tuple.Tuple)99 Schema (edu.uci.ics.texera.api.schema.Schema)90 Test (org.junit.Test)84 TextField (edu.uci.ics.texera.api.field.TextField)78 IntegerField (edu.uci.ics.texera.api.field.IntegerField)66 StringField (edu.uci.ics.texera.api.field.StringField)64 DoubleField (edu.uci.ics.texera.api.field.DoubleField)55 DateField (edu.uci.ics.texera.api.field.DateField)52 SimpleDateFormat (java.text.SimpleDateFormat)52 Attribute (edu.uci.ics.texera.api.schema.Attribute)51 ListField (edu.uci.ics.texera.api.field.ListField)40 Dictionary (edu.uci.ics.texera.dataflow.dictionarymatcher.Dictionary)30 DataflowException (edu.uci.ics.texera.api.exception.DataflowException)11 AttributeType (edu.uci.ics.texera.api.schema.AttributeType)11 KeywordMatcherSourceOperator (edu.uci.ics.texera.dataflow.keywordmatcher.KeywordMatcherSourceOperator)11 JoinDistancePredicate (edu.uci.ics.texera.dataflow.join.JoinDistancePredicate)9 Collectors (java.util.stream.Collectors)8