Search in sources :

Example 26 with ListField

use of edu.uci.ics.texera.api.field.ListField in project textdb by TextDB.

the class KeywordMatcher method appendConjunctionMatchingSpans.

/**
 * Gathers the spans of {@code inputTuple} that match a conjunction keyword query
 * over the given attributes.
 *
 * <p>A STRING attribute matches only when its value equals {@code queryKeyword}
 * exactly; one span running from 0 to the value's length is then produced. For a
 * TEXT attribute, the payload spans kept by {@code filterRelevantSpans} are
 * narrowed to that attribute and are added only if
 * {@code DataflowUtils.isAllQueryTokensPresent} accepts them.
 *
 * @param inputTuple     tuple whose {@code SchemaConstants.PAYLOAD} field holds the candidate spans
 * @param attributeNames attributes to inspect, processed in iteration order
 * @param queryTokenSet  query tokens that must all be present for a TEXT attribute to match
 * @param queryKeyword   full query keyword, compared against STRING attribute values
 * @return the matching spans, possibly empty
 * @throws DataflowException if an attribute is neither STRING nor TEXT
 */
private List<Span> appendConjunctionMatchingSpans(Tuple inputTuple, List<String> attributeNames, Set<String> queryTokenSet, String queryKeyword) throws DataflowException {
    ListField<Span> payloadField = inputTuple.getField(SchemaConstants.PAYLOAD);
    List<Span> payloadSpans = payloadField.getValue();
    List<Span> collected = new ArrayList<>();

    for (String attributeName : attributeNames) {
        AttributeType type = inputTuple.getSchema().getAttribute(attributeName).getType();
        // The value is read before the type is dispatched on, for every attribute.
        String value = inputTuple.getField(attributeName).getValue().toString();

        if (type == AttributeType.STRING) {
            // STRING attributes require the whole value to equal the keyword.
            if (queryKeyword.equals(value)) {
                collected.add(new Span(attributeName, 0, value.length(), value, value));
            }
        } else if (type == AttributeType.TEXT) {
            // TEXT attributes: keep the relevant payload spans that belong to this attribute.
            List<Span> spansOfAttribute = filterRelevantSpans(payloadSpans, queryTokenSet)
                    .stream()
                    .filter(candidate -> candidate.getAttributeName().equals(attributeName))
                    .collect(Collectors.toList());
            if (DataflowUtils.isAllQueryTokensPresent(spansOfAttribute, queryTokenSet)) {
                collected.addAll(spansOfAttribute);
            }
        } else {
            // Any other attribute type is rejected for now.
            throw new DataflowException("KeywordMatcher: Fields other than STRING and TEXT are not supported yet");
        }
    }
    return collected;
}
Also used : ListField(edu.uci.ics.texera.api.field.ListField) java.util(java.util) Tuple(edu.uci.ics.texera.api.tuple.Tuple) TexeraException(edu.uci.ics.texera.api.exception.TexeraException) Collectors(java.util.stream.Collectors) Span(edu.uci.ics.texera.api.span.Span) SchemaConstants(edu.uci.ics.texera.api.constants.SchemaConstants) AbstractSingleInputOperator(edu.uci.ics.texera.dataflow.common.AbstractSingleInputOperator) ErrorMessages(edu.uci.ics.texera.api.constants.ErrorMessages) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) Schema(edu.uci.ics.texera.api.schema.Schema) DataflowUtils(edu.uci.ics.texera.dataflow.utils.DataflowUtils) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) Span(edu.uci.ics.texera.api.span.Span)

Example 27 with ListField

use of edu.uci.ics.texera.api.field.ListField in project textdb by TextDB.

the class KeywordMatcher method appendPhraseMatchingSpans.

/**
 * Gathers the spans of {@code inputTuple} that match a phrase keyword query over
 * the given attributes.
 *
 * <p>A STRING attribute matches only when its value equals {@code queryKeyword}
 * exactly; one span running from 0 to the value's length is then produced. For a
 * TEXT attribute, the payload spans kept by {@code filterRelevantSpans} are
 * narrowed to that attribute; when {@code DataflowUtils.isAllQueryTokensPresent}
 * accepts them, the spans built by
 * {@code DataflowUtils.constructPhraseMatchingSpans} are added to the result.
 *
 * @param inputTuple                  tuple whose {@code SchemaConstants.PAYLOAD} field holds the candidate spans
 * @param attributeNames              attributes to inspect, processed in iteration order
 * @param queryTokenList              query tokens; also turned into a set for the presence check
 * @param queryTokenListWithStopwords token list forwarded to {@code constructPhraseMatchingSpans}
 * @param queryKeyword                full query keyword, compared against STRING attribute values
 * @return the matching spans, possibly empty
 * @throws DataflowException if an attribute is neither STRING nor TEXT
 */
private List<Span> appendPhraseMatchingSpans(Tuple inputTuple, List<String> attributeNames, List<String> queryTokenList, List<String> queryTokenListWithStopwords, String queryKeyword) throws DataflowException {
    ListField<Span> payloadField = inputTuple.getField(SchemaConstants.PAYLOAD);
    List<Span> payloadSpans = payloadField.getValue();
    List<Span> collected = new ArrayList<>();

    for (String attributeName : attributeNames) {
        AttributeType type = inputTuple.getSchema().getAttribute(attributeName).getType();
        // The value is read before the type is dispatched on, for every attribute.
        String value = inputTuple.getField(attributeName).getValue().toString();

        if (type == AttributeType.STRING) {
            // STRING attributes require the whole value to equal the keyword.
            if (queryKeyword.equals(value)) {
                collected.add(new Span(attributeName, 0, value.length(), value, value));
            }
        } else if (type == AttributeType.TEXT) {
            // TEXT attributes: phrase matching over the payload spans of this attribute.
            Set<String> tokenSet = new HashSet<>(queryTokenList);
            List<Span> spansOfAttribute = filterRelevantSpans(payloadSpans, tokenSet)
                    .stream()
                    .filter(candidate -> candidate.getAttributeName().equals(attributeName))
                    .collect(Collectors.toList());
            // Phrase spans are only built when every query token shows up among the spans.
            if (DataflowUtils.isAllQueryTokensPresent(spansOfAttribute, tokenSet)) {
                collected.addAll(DataflowUtils.constructPhraseMatchingSpans(attributeName, value, queryKeyword, spansOfAttribute, queryTokenListWithStopwords, queryTokenList));
            }
        } else {
            // Any other attribute type is rejected for now.
            throw new DataflowException("KeywordMatcher: Fields other than STRING and TEXT are not supported yet");
        }
    }
    return collected;
}
Also used : ListField(edu.uci.ics.texera.api.field.ListField) java.util(java.util) Tuple(edu.uci.ics.texera.api.tuple.Tuple) TexeraException(edu.uci.ics.texera.api.exception.TexeraException) Collectors(java.util.stream.Collectors) Span(edu.uci.ics.texera.api.span.Span) SchemaConstants(edu.uci.ics.texera.api.constants.SchemaConstants) AbstractSingleInputOperator(edu.uci.ics.texera.dataflow.common.AbstractSingleInputOperator) ErrorMessages(edu.uci.ics.texera.api.constants.ErrorMessages) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) Schema(edu.uci.ics.texera.api.schema.Schema) DataflowUtils(edu.uci.ics.texera.dataflow.utils.DataflowUtils) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) Span(edu.uci.ics.texera.api.span.Span)

Example 28 with ListField

use of edu.uci.ics.texera.api.field.ListField in project textdb by TextDB.

the class NlpEntityTestConstants method getTest1ResultTuples.

/**
 * Builds the expected result for test 1: the single-sentence tuple
 * "Microsoft is an organization." extended with a span list containing one
 * ORGANIZATION span for "Microsoft" at offsets 0 to 9.
 *
 * @return a one-element list holding the expected tuple
 */
public static List<Tuple> getTest1ResultTuples() {
    // Source tuple: one TEXT field under the single-sentence schema.
    IField[] sentenceFields = { new TextField("Microsoft is an organization.") };
    Tuple sentenceTuple = new Tuple(SCHEMA_ONE_SENTENCE, sentenceFields);

    // The only entity expected in the sentence.
    List<Span> expectedSpans = new ArrayList<Span>();
    expectedSpans.add(new Span("sentence_one", 0, 9, NlpEntityType.ORGANIZATION.toString(), "Microsoft"));

    Tuple expectedTuple = new Tuple.Builder(sentenceTuple)
            .add(REULST_ATTRIBUTE, new ListField<Span>(expectedSpans))
            .build();
    return Arrays.asList(expectedTuple);
}
Also used : ArrayList(java.util.ArrayList) TextField(edu.uci.ics.texera.api.field.TextField) ListField(edu.uci.ics.texera.api.field.ListField) IField(edu.uci.ics.texera.api.field.IField) Span(edu.uci.ics.texera.api.span.Span)

Example 29 with ListField

use of edu.uci.ics.texera.api.field.ListField in project textdb by TextDB.

the class NlpEntityTestConstants method getTest3ResultTuples.

/**
 * Builds the expected result for test 3: a single-sentence tuple extended with a
 * span list holding three ORGANIZATION spans (Microsoft, Google, Facebook) and two
 * PERSON spans (Donald Trump, Barack Obama), in order of appearance in the sentence.
 *
 * @return a one-element list holding the expected tuple
 */
public static List<Tuple> getTest3ResultTuples() {
    String organization = NlpEntityType.ORGANIZATION.toString();
    String person = NlpEntityType.PERSON.toString();

    // Offsets below index into the sentence literal used for the tuple further down.
    List<Span> expectedSpans = new ArrayList<Span>();
    expectedSpans.add(new Span("sentence_one", 0, 9, organization, "Microsoft"));
    expectedSpans.add(new Span("sentence_one", 11, 17, organization, "Google"));
    expectedSpans.add(new Span("sentence_one", 22, 30, organization, "Facebook"));
    expectedSpans.add(new Span("sentence_one", 53, 65, person, "Donald Trump"));
    expectedSpans.add(new Span("sentence_one", 70, 82, person, "Barack Obama"));

    IField[] sentenceFields = { new TextField("Microsoft, Google and Facebook are organizations and Donald Trump and Barack Obama are persons.") };
    Tuple sentenceTuple = new Tuple(SCHEMA_ONE_SENTENCE, sentenceFields);

    Tuple expectedTuple = new Tuple.Builder(sentenceTuple)
            .add(REULST_ATTRIBUTE, new ListField<Span>(expectedSpans))
            .build();
    return Arrays.asList(expectedTuple);
}
Also used : ArrayList(java.util.ArrayList) TextField(edu.uci.ics.texera.api.field.TextField) ListField(edu.uci.ics.texera.api.field.ListField) IField(edu.uci.ics.texera.api.field.IField) Span(edu.uci.ics.texera.api.span.Span)

Example 30 with ListField

use of edu.uci.ics.texera.api.field.ListField in project textdb by TextDB.

the class RegexMatcherTest method testGetNextTupleCorpURL.

/**
 * Runs a URL regex against the URL attribute of the corp table and checks that
 * exactly two sample tuples match: the one containing "http://weibo.com" and the
 * one containing "https://www.microsoft.com/en-us/". Each expected tuple is the
 * sample tuple with a span list appended as an extra field.
 */
@Test
public void testGetNextTupleCorpURL() throws Exception {
    String query = "^(https?:\\/\\/)?([\\da-z\\.-]+)\\.([a-z\\.]{2,6})([\\/\\w \\.-]*)*\\/?$";
    List<Tuple> actualTuples = RegexMatcherTestHelper.getQueryResults(CORP_TABLE, query, Arrays.asList(RegexTestConstantsCorp.URL));

    List<Tuple> sampleTuples = RegexTestConstantsCorp.getSampleCorpTuples();
    // Result schema: the corp schema plus one LIST attribute for the spans.
    Schema resultSchema = new Schema.Builder().add(RegexTestConstantsCorp.SCHEMA_CORP).add(RESULTS, AttributeType.LIST).build();
    List<Tuple> expectedTuples = new ArrayList<Tuple>();

    // Sample tuple 1 is expected to match "http://weibo.com".
    List<Span> weiboSpans = new ArrayList<Span>();
    weiboSpans.add(new Span(RegexTestConstantsCorp.URL, 0, 16, query, "http://weibo.com"));
    List<IField> weiboFields = new ArrayList<IField>(sampleTuples.get(1).getFields());
    weiboFields.add(new ListField<Span>(weiboSpans));
    expectedTuples.add(new Tuple(resultSchema, weiboFields.toArray(new IField[weiboFields.size()])));

    // Sample tuple 2 is expected to match "https://www.microsoft.com/en-us/".
    List<Span> microsoftSpans = new ArrayList<Span>();
    microsoftSpans.add(new Span(RegexTestConstantsCorp.URL, 0, 32, query, "https://www.microsoft.com/en-us/"));
    List<IField> microsoftFields = new ArrayList<IField>(sampleTuples.get(2).getFields());
    microsoftFields.add(new ListField<Span>(microsoftSpans));
    expectedTuples.add(new Tuple(resultSchema, microsoftFields.toArray(new IField[microsoftFields.size()])));

    Assert.assertTrue(TestUtils.equals(expectedTuples, actualTuples));
}
Also used : Schema(edu.uci.ics.texera.api.schema.Schema) ArrayList(java.util.ArrayList) ListField(edu.uci.ics.texera.api.field.ListField) IField(edu.uci.ics.texera.api.field.IField) Span(edu.uci.ics.texera.api.span.Span) Tuple(edu.uci.ics.texera.api.tuple.Tuple) Test(org.junit.Test)

Aggregations

ListField (edu.uci.ics.texera.api.field.ListField)42 Span (edu.uci.ics.texera.api.span.Span)40 IField (edu.uci.ics.texera.api.field.IField)33 ArrayList (java.util.ArrayList)32 Tuple (edu.uci.ics.texera.api.tuple.Tuple)27 Schema (edu.uci.ics.texera.api.schema.Schema)26 Test (org.junit.Test)19 TextField (edu.uci.ics.texera.api.field.TextField)11 SchemaConstants (edu.uci.ics.texera.api.constants.SchemaConstants)8 DataflowException (edu.uci.ics.texera.api.exception.DataflowException)8 AttributeType (edu.uci.ics.texera.api.schema.AttributeType)8 Collectors (java.util.stream.Collectors)8 TexeraException (edu.uci.ics.texera.api.exception.TexeraException)7 ErrorMessages (edu.uci.ics.texera.api.constants.ErrorMessages)6 DataflowUtils (edu.uci.ics.texera.dataflow.utils.DataflowUtils)6 java.util (java.util)6 AbstractSingleInputOperator (edu.uci.ics.texera.dataflow.common.AbstractSingleInputOperator)5 Attribute (edu.uci.ics.texera.api.schema.Attribute)4 JsonCreator (com.fasterxml.jackson.annotation.JsonCreator)2 JsonProperty (com.fasterxml.jackson.annotation.JsonProperty)2