Search in sources :

Example 21 with ListField

use of edu.uci.ics.texera.api.field.ListField in project textdb by TextDB.

From the class NlpEntityTestConstants, method getTest4ResultTuples.

/**
 * Builds the expected result for test case 4: one tuple holding two sentences,
 * extended with a span list that marks three ORGANIZATION entities in
 * "sentence_one" and two PERSON entities in "sentence_two".
 *
 * @return a fixed-size list containing the single expected tuple
 */
public static List<Tuple> getTest4ResultTuples() {
    final String organizationType = NlpEntityType.ORGANIZATION.toString();
    final String personType = NlpEntityType.PERSON.toString();

    // Spans are listed in the order they occur in the two sentences.
    List<Span> expectedSpans = new ArrayList<Span>(Arrays.asList(
            new Span("sentence_one", 0, 9, organizationType, "Microsoft"),
            new Span("sentence_one", 11, 17, organizationType, "Google"),
            new Span("sentence_one", 22, 30, organizationType, "Facebook"),
            new Span("sentence_two", 0, 12, personType, "Donald Trump"),
            new Span("sentence_two", 17, 29, personType, "Barack Obama")));

    IField[] sentenceFields = {
            new TextField("Microsoft, Google and Facebook are organizations."),
            new TextField("Donald Trump and Barack Obama are persons")
    };
    Tuple sentenceTuple = new Tuple(SCHEMA_TWO_SENTENCE, sentenceFields);

    // Attach the expected spans as the result attribute of the source tuple.
    Tuple.Builder expectedBuilder = new Tuple.Builder(sentenceTuple);
    expectedBuilder.add(REULST_ATTRIBUTE, new ListField<Span>(expectedSpans));
    return Arrays.asList(expectedBuilder.build());
}
Also used : ArrayList(java.util.ArrayList) TextField(edu.uci.ics.texera.api.field.TextField) ListField(edu.uci.ics.texera.api.field.ListField) IField(edu.uci.ics.texera.api.field.IField) Span(edu.uci.ics.texera.api.span.Span)

Example 22 with ListField

use of edu.uci.ics.texera.api.field.ListField in project textdb by TextDB.

From the class NlpEntityTestConstants, method getTest7ResultTuples.

/**
 * Builds the expected result for test case 7: one single-sentence tuple
 * extended with a span list containing one ADJECTIVE span for the word
 * "warm" (characters 12 to 16 of "sentence_one").
 *
 * @return a fixed-size list containing the single expected tuple
 */
public static List<Tuple> getTest7ResultTuples() {
    List<Span> expectedSpans = new ArrayList<Span>();
    expectedSpans.add(new Span("sentence_one", 12, 16, NlpEntityType.ADJECTIVE.toString(), "warm"));

    IField[] sentenceFields = {
            new TextField("Feeling the warm sun rays beaming steadily down, the girl decided there was no need to wear a coat.")
    };
    Tuple sentenceTuple = new Tuple(SCHEMA_ONE_SENTENCE, sentenceFields);

    // Attach the expected span as the result attribute of the source tuple.
    Tuple.Builder expectedBuilder = new Tuple.Builder(sentenceTuple);
    expectedBuilder.add(REULST_ATTRIBUTE, new ListField<Span>(expectedSpans));
    return Arrays.asList(expectedBuilder.build());
}
Also used : ArrayList(java.util.ArrayList) TextField(edu.uci.ics.texera.api.field.TextField) ListField(edu.uci.ics.texera.api.field.ListField) IField(edu.uci.ics.texera.api.field.IField) Span(edu.uci.ics.texera.api.span.Span)

Example 23 with ListField

use of edu.uci.ics.texera.api.field.ListField in project textdb by TextDB.

From the class SimilarityJoinPredicate, method joinTuples.

/**
 * Joins two tuples on the similarity of their span values.
 *
 * Only spans whose attribute name equals the inner (respectively outer) join
 * attribute name are considered. Every distinct inner span value is compared
 * with every distinct outer span value; a pair counts as a match when the
 * similarity function scores it at or above the similarity threshold.
 *
 * @param innerTuple   tuple from the inner side of the join
 * @param outerTuple   tuple from the outer side of the join
 * @param outputSchema schema passed through to the merge step
 * @return the merged tuple carrying the prefixed matching spans, or
 *         {@code null} when the threshold is 0 or no pair of values matches
 * @throws DataflowException declared by the overridden join contract
 */
@Override
public Tuple joinTuples(Tuple innerTuple, Tuple outerTuple, Schema outputSchema) throws DataflowException {
    if (similarityThreshold == 0) {
        return null;
    }

    // Inner side: keep the spans on the join attribute and collect their
    // distinct values (several spans may share the same value).
    ListField<Span> innerField = innerTuple.getField(SchemaConstants.SPAN_LIST);
    List<Span> innerCandidates = new ArrayList<>();
    Set<String> innerValues = new HashSet<>();
    for (Span candidate : innerField.getValue()) {
        if (candidate.getAttributeName().equals(innerJoinAttrName)) {
            innerCandidates.add(candidate);
            innerValues.add(candidate.getValue());
        }
    }

    // Outer side: same selection against the outer join attribute.
    ListField<Span> outerField = outerTuple.getField(SchemaConstants.SPAN_LIST);
    List<Span> outerCandidates = new ArrayList<>();
    Set<String> outerValues = new HashSet<>();
    for (Span candidate : outerField.getValue()) {
        if (candidate.getAttributeName().equals(outerJoinAttrName)) {
            outerCandidates.add(candidate);
            outerValues.add(candidate.getValue());
        }
    }

    // Record both values of every pair that reaches the threshold.
    Set<String> matchedValues = new HashSet<>();
    for (String innerValue : innerValues) {
        for (String outerValue : outerValues) {
            double score = this.similarityFunc.calculateSimilarity(innerValue, outerValue);
            if (score >= this.similarityThreshold) {
                matchedValues.add(innerValue);
                matchedValues.add(outerValue);
            }
        }
    }

    // Nothing was similar enough: the tuples do not join.
    if (matchedValues.isEmpty()) {
        return null;
    }

    // Emit the matching spans, inner side first, each tagged with its side's prefix.
    List<Span> joinedSpans = new ArrayList<>();
    for (Span innerSpan : innerCandidates) {
        if (matchedValues.contains(innerSpan.getValue())) {
            joinedSpans.add(addFieldPrefix(innerSpan, INNER_PREFIX));
        }
    }
    for (Span outerSpan : outerCandidates) {
        if (matchedValues.contains(outerSpan.getValue())) {
            joinedSpans.add(addFieldPrefix(outerSpan, OUTER_PREFIX));
        }
    }
    return mergeTuples(innerTuple, outerTuple, outputSchema, joinedSpans);
}
Also used : JsonProperty(com.fasterxml.jackson.annotation.JsonProperty) ListField(edu.uci.ics.texera.api.field.ListField) java.util(java.util) edu.uci.ics.texera.api.tuple(edu.uci.ics.texera.api.tuple) ImmutableMap(com.google.common.collect.ImmutableMap) PropertyNameConstants(edu.uci.ics.texera.dataflow.common.PropertyNameConstants) PredicateBase(edu.uci.ics.texera.dataflow.common.PredicateBase) Collectors(java.util.stream.Collectors) Span(edu.uci.ics.texera.api.span.Span) OperatorGroupConstants(edu.uci.ics.texera.dataflow.common.OperatorGroupConstants) SchemaConstants(edu.uci.ics.texera.api.constants.SchemaConstants) IField(edu.uci.ics.texera.api.field.IField) JsonCreator(com.fasterxml.jackson.annotation.JsonCreator) JsonIgnore(com.fasterxml.jackson.annotation.JsonIgnore) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) Schema(edu.uci.ics.texera.api.schema.Schema) Attribute(edu.uci.ics.texera.api.schema.Attribute) IDField(edu.uci.ics.texera.api.field.IDField) NormalizedLevenshtein(info.debatty.java.stringsimilarity.NormalizedLevenshtein) Span(edu.uci.ics.texera.api.span.Span)

Example 24 with ListField

use of edu.uci.ics.texera.api.field.ListField in project textdb by TextDB.

From the class FuzzyTokenMatcher, method processOneInputTuple.

/**
 * Applies the fuzzy-token predicate to a single tuple.
 *
 * For each attribute named by the predicate, the payload spans on that
 * attribute whose key is one of the query tokens are counted; the attribute
 * contributes its spans to the result only when that count reaches the
 * predicate's threshold.
 *
 * @param inputTuple the tuple to match
 * @return the tuple (with the payload and/or result span list added when the
 *         corresponding flags are set), or {@code null} when no attribute
 *         reaches the threshold
 * @throws TexeraException a {@link DataflowException} is thrown when a
 *         predicate attribute is neither TEXT nor STRING
 */
@Override
public Tuple processOneInputTuple(Tuple inputTuple) throws TexeraException {
    // Generate and attach the payload first when the tuple does not carry one yet.
    Tuple workingTuple = inputTuple;
    if (addPayload) {
        workingTuple = new Tuple.Builder(inputTuple)
                .add(SchemaConstants.PAYLOAD_ATTRIBUTE, new ListField<Span>(DataflowUtils.generatePayloadFromTuple(inputTuple, predicate.getLuceneAnalyzerStr())))
                .build();
    }

    ListField<Span> payloadField = workingTuple.getField(SchemaConstants.PAYLOAD);
    List<Span> candidateSpans = filterRelevantSpans(payloadField.getValue());
    List<Span> acceptedSpans = new ArrayList<>();

    /*
     * The source operator returns spans even for those fields which did not
     * satisfy the threshold criterion. So if two attributes A,B have 10 and
     * 5 matching tokens, and we set threshold to 10, the number of spans
     * returned is 15. So we need to filter those 5 spans for attribute B.
     */
    for (String attributeName : this.predicate.getAttributeNames()) {
        AttributeType attributeType = this.inputSchema.getAttribute(attributeName).getType();
        // Only TEXT and STRING attributes are supported for now.
        boolean supportedType = attributeType == AttributeType.TEXT || attributeType == AttributeType.STRING;
        if (!supportedType) {
            throw new DataflowException("FuzzyTokenMatcher: Fields other than TEXT or STRING are not supported");
        }

        // Spans on this attribute whose key is one of the query tokens.
        List<Span> attributeSpans = new ArrayList<>();
        for (Span candidate : candidateSpans) {
            if (candidate.getAttributeName().equals(attributeName)
                    && predicate.getQueryTokens().contains(candidate.getKey())) {
                attributeSpans.add(candidate);
            }
        }

        if (attributeSpans.size() >= predicate.getThreshold()) {
            acceptedSpans.addAll(attributeSpans);
        }
    }

    if (acceptedSpans.isEmpty()) {
        return null;
    }

    Tuple.Builder resultBuilder = new Tuple.Builder(workingTuple);
    if (addResultAttribute) {
        resultBuilder.add(predicate.getSpanListName(), AttributeType.LIST, new ListField<Span>(acceptedSpans));
    }
    return resultBuilder.build();
}
Also used : ListField(edu.uci.ics.texera.api.field.ListField) Iterator(java.util.Iterator) Tuple(edu.uci.ics.texera.api.tuple.Tuple) TexeraException(edu.uci.ics.texera.api.exception.TexeraException) Collectors(java.util.stream.Collectors) Span(edu.uci.ics.texera.api.span.Span) ArrayList(java.util.ArrayList) List(java.util.List) SchemaConstants(edu.uci.ics.texera.api.constants.SchemaConstants) AbstractSingleInputOperator(edu.uci.ics.texera.dataflow.common.AbstractSingleInputOperator) ErrorMessages(edu.uci.ics.texera.api.constants.ErrorMessages) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) Schema(edu.uci.ics.texera.api.schema.Schema) DataflowUtils(edu.uci.ics.texera.dataflow.utils.DataflowUtils) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) ArrayList(java.util.ArrayList) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) Span(edu.uci.ics.texera.api.span.Span) Tuple(edu.uci.ics.texera.api.tuple.Tuple)

Example 25 with ListField

use of edu.uci.ics.texera.api.field.ListField in project textdb by TextDB.

From the class DictionaryMatcher, method appendPhraseMatchingSpans4Dictionary.

/**
 * Collects the phrase-matching spans of one tuple against a dictionary of
 * queries.
 *
 * The payload spans are first grouped per query index by
 * {@code filterRelevantSpans}. Then, for every requested attribute:
 * a STRING attribute matches when its whole value is contained in
 * {@code queryList}; a TEXT attribute has its spans rebuilt per query via
 * {@code DataflowUtils.constructPhraseMatchingSpans}, for each query whose
 * tokens are all present among the spans of that attribute.
 *
 * @param inputTuple                  tuple to match; its payload field is read
 * @param attributeNames              attributes of the tuple to examine
 * @param queryTokenList              per-query token lists, indexed by query
 * @param queryTokenSetList           per-query token sets, indexed by query
 * @param queryTokenListWithStopwords per-query token lists including stopwords, indexed by query
 * @param queryList                   the dictionary queries, indexed by query
 * @return all matching spans found; empty when nothing matches
 * @throws DataflowException when an attribute is neither STRING nor TEXT
 */
public List<Span> appendPhraseMatchingSpans4Dictionary(Tuple inputTuple, List<String> attributeNames, List<List<String>> queryTokenList, List<Set<String>> queryTokenSetList, List<List<String>> queryTokenListWithStopwords, List<String> queryList) throws DataflowException {
    List<Span> collectedSpans = new ArrayList<>();
    ListField<Span> payloadField = inputTuple.getField(SchemaConstants.PAYLOAD);
    Map<Integer, List<Span>> spansByQueryIndex = filterRelevantSpans(payloadField.getValue(), queryTokenSetList);

    for (String attributeName : attributeNames) {
        AttributeType attributeType = inputTuple.getSchema().getAttribute(attributeName).getType();
        String fieldValue = inputTuple.getField(attributeName).getValue().toString();

        // Only STRING and TEXT attributes are supported for now.
        boolean isString = attributeType == AttributeType.STRING;
        boolean isText = attributeType == AttributeType.TEXT;
        if (!(isString || isText)) {
            throw new DataflowException("KeywordMatcher: Fields other than STRING and TEXT are not supported yet");
        }

        if (isString) {
            // A STRING attribute matches only when a query equals the whole field value.
            if (queryList.contains(fieldValue)) {
                collectedSpans.add(new Span(attributeName, 0, fieldValue.length(), fieldValue, fieldValue));
            }
        } else {
            // A TEXT attribute: rebuild the spans according to each phrase query.
            for (Map.Entry<Integer, List<Span>> entry : spansByQueryIndex.entrySet()) {
                int queryIndex = entry.getKey();

                List<Span> attributeSpans = new ArrayList<>();
                for (Span candidate : entry.getValue()) {
                    if (candidate.getAttributeName().equals(attributeName)) {
                        attributeSpans.add(candidate);
                    }
                }

                // Only build phrase spans when every token of this query occurs in the attribute.
                if (!attributeSpans.isEmpty() && DataflowUtils.isAllQueryTokensPresent(attributeSpans, queryTokenSetList.get(queryIndex))) {
                    collectedSpans.addAll(DataflowUtils.constructPhraseMatchingSpans(attributeName, fieldValue, queryList.get(queryIndex), attributeSpans, queryTokenListWithStopwords.get(queryIndex), queryTokenList.get(queryIndex)));
                }
            }
        }
    }
    return collectedSpans;
}
Also used : ListField(edu.uci.ics.texera.api.field.ListField) java.util(java.util) Tuple(edu.uci.ics.texera.api.tuple.Tuple) TexeraException(edu.uci.ics.texera.api.exception.TexeraException) Collectors(java.util.stream.Collectors) Span(edu.uci.ics.texera.api.span.Span) Matcher(java.util.regex.Matcher) SchemaConstants(edu.uci.ics.texera.api.constants.SchemaConstants) IOperator(edu.uci.ics.texera.api.dataflow.IOperator) AbstractSingleInputOperator(edu.uci.ics.texera.dataflow.common.AbstractSingleInputOperator) ErrorMessages(edu.uci.ics.texera.api.constants.ErrorMessages) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) Schema(edu.uci.ics.texera.api.schema.Schema) Pattern(java.util.regex.Pattern) DataflowUtils(edu.uci.ics.texera.dataflow.utils.DataflowUtils) KeywordMatchingType(edu.uci.ics.texera.dataflow.keywordmatcher.KeywordMatchingType) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) Span(edu.uci.ics.texera.api.span.Span)

Aggregations

ListField (edu.uci.ics.texera.api.field.ListField)42 Span (edu.uci.ics.texera.api.span.Span)40 IField (edu.uci.ics.texera.api.field.IField)33 ArrayList (java.util.ArrayList)32 Tuple (edu.uci.ics.texera.api.tuple.Tuple)27 Schema (edu.uci.ics.texera.api.schema.Schema)26 Test (org.junit.Test)19 TextField (edu.uci.ics.texera.api.field.TextField)11 SchemaConstants (edu.uci.ics.texera.api.constants.SchemaConstants)8 DataflowException (edu.uci.ics.texera.api.exception.DataflowException)8 AttributeType (edu.uci.ics.texera.api.schema.AttributeType)8 Collectors (java.util.stream.Collectors)8 TexeraException (edu.uci.ics.texera.api.exception.TexeraException)7 ErrorMessages (edu.uci.ics.texera.api.constants.ErrorMessages)6 DataflowUtils (edu.uci.ics.texera.dataflow.utils.DataflowUtils)6 java.util (java.util)6 AbstractSingleInputOperator (edu.uci.ics.texera.dataflow.common.AbstractSingleInputOperator)5 Attribute (edu.uci.ics.texera.api.schema.Attribute)4 JsonCreator (com.fasterxml.jackson.annotation.JsonCreator)2 JsonProperty (com.fasterxml.jackson.annotation.JsonProperty)2