Search in sources :

Example 41 with ListField

use of edu.uci.ics.texera.api.field.ListField in project textdb by TextDB.

The method mergeTuples of the class SimilarityJoinPredicate.

/**
 * Builds the joined output tuple for one inner/outer tuple pair.
 *
 * <p>The output fields are produced in the order of the attribute names of
 * {@code outputSchema}:
 * <ul>
 *   <li>{@code SchemaConstants._ID}: a new {@link IDField} holding a random UUID string;</li>
 *   <li>{@code SchemaConstants.SPAN_LIST}: a {@link ListField} wrapping {@code mergeSpanList};</li>
 *   <li>{@code SchemaConstants.PAYLOAD}: the inner payload spans followed by the outer payload
 *       spans, each passed through {@code addFieldPrefix} with the matching prefix;</li>
 *   <li>any other name starting with {@code INNER_PREFIX} / {@code OUTER_PREFIX}: the field of
 *       the inner / outer tuple whose name is the attribute name with that prefix removed.</li>
 * </ul>
 *
 * @param innerTuple    tuple from the inner side of the join
 * @param outerTuple    tuple from the outer side of the join
 * @param outputSchema  schema of the tuple to build; its attribute names drive the field order
 * @param mergeSpanList span list to store under {@code SchemaConstants.SPAN_LIST}
 * @return a new tuple constructed from {@code outputSchema} and the collected fields
 */
private Tuple mergeTuples(Tuple innerTuple, Tuple outerTuple, Schema outputSchema, List<Span> mergeSpanList) {
    List<IField> resultFields = new ArrayList<>();
    for (String attrName : outputSchema.getAttributeNames()) {
        if (attrName.equals(SchemaConstants._ID)) {
            // every joined tuple gets a freshly generated _ID
            resultFields.add(new IDField(UUID.randomUUID().toString()));
        } else if (attrName.equals(SchemaConstants.SPAN_LIST)) {
            // use the span list supplied by the caller
            resultFields.add(new ListField<Span>(mergeSpanList));
        } else if (attrName.equals(SchemaConstants.PAYLOAD)) {
            // put the payload of the two tuples together, inner spans first
            ListField<Span> innerPayloadField = innerTuple.getField(SchemaConstants.PAYLOAD);
            List<Span> innerPayload = innerPayloadField.getValue();
            ListField<Span> outerPayloadField = outerTuple.getField(SchemaConstants.PAYLOAD);
            List<Span> outerPayload = outerPayloadField.getValue();

            List<Span> resultPayload = new ArrayList<>();
            for (Span span : innerPayload) {
                resultPayload.add(addFieldPrefix(span, INNER_PREFIX));
            }
            // Use the shared constant (not a string literal) so the payload prefix always
            // agrees with the prefix used for the attribute lookup below.
            for (Span span : outerPayload) {
                resultPayload.add(addFieldPrefix(span, OUTER_PREFIX));
            }
            resultFields.add(new ListField<Span>(resultPayload));
        } else if (attrName.startsWith(INNER_PREFIX)) {
            // remaining attributes are copied from the inner tuple under their unprefixed name
            resultFields.add(innerTuple.getField(attrName.substring(INNER_PREFIX.length())));
        } else if (attrName.startsWith(OUTER_PREFIX)) {
            // ... or from the outer tuple
            resultFields.add(outerTuple.getField(attrName.substring(OUTER_PREFIX.length())));
        }
        // NOTE(review): an attribute name matching none of the cases above contributes no
        // field, so resultFields can end up shorter than the schema - confirm the output
        // schema can never contain such a name.
    }
    return new Tuple(outputSchema, resultFields);
}
Also used : IDField(edu.uci.ics.texera.api.field.IDField) ListField(edu.uci.ics.texera.api.field.ListField) IField(edu.uci.ics.texera.api.field.IField) Span(edu.uci.ics.texera.api.span.Span)

Example 42 with ListField

use of edu.uci.ics.texera.api.field.ListField in project textdb by TextDB.

The method wordCount of the class WordCloudSink.

/**
 * Drains the input operator and counts how often each word occurs in the payload spans
 * of the attribute selected by the predicate.
 *
 * <p>For every input tuple the payload span list is read (after first adding a generated
 * payload to the tuple when {@code addPayload} is set). Each span whose attribute name
 * equals {@code predicate.getAttribute()} contributes its lower-cased value as a word,
 * unless that word is contained in {@code StopAnalyzer.ENGLISH_STOP_WORDS_SET}.
 *
 * @return the (word, count) entries sorted by count in descending order
 */
public List<Map.Entry<String, Integer>> wordCount() {
    Map<String, Integer> wordCountMap = new HashMap<>();
    Tuple tuple;
    while ((tuple = inputOperator.getNextTuple()) != null) {
        if (addPayload) {
            tuple = new Tuple.Builder(tuple).add(SchemaConstants.PAYLOAD_ATTRIBUTE, new ListField<Span>(DataflowUtils.generatePayloadFromTuple(tuple, predicate.getLuceneAnalyzerString()))).build();
        }
        // Look the payload up through the shared constant rather than a string literal.
        ListField<Span> payloadField = tuple.getField(SchemaConstants.PAYLOAD);
        List<Span> payloadSpanList = payloadField.getValue();
        for (Span span : payloadSpanList) {
            if (!span.getAttributeName().equals(predicate.getAttribute())) {
                continue;
            }
            // Locale.ROOT keeps case folding independent of the JVM default locale
            // (e.g. Turkish dotless i), so the English stop-word lookup stays stable.
            String key = span.getValue().toLowerCase(java.util.Locale.ROOT);
            if (!StopAnalyzer.ENGLISH_STOP_WORDS_SET.contains(key)) {
                // single lookup: insert 1 or add 1 to the existing count
                wordCountMap.merge(key, 1, Integer::sum);
            }
        }
    }
    // highest count first
    return wordCountMap.entrySet().stream()
            .sorted((left, right) -> right.getValue().compareTo(left.getValue()))
            .collect(Collectors.toList());
}
Also used : ListField(edu.uci.ics.texera.api.field.ListField) java.util(java.util) Tuple(edu.uci.ics.texera.api.tuple.Tuple) VisualizationConstants(edu.uci.ics.texera.dataflow.sink.VisualizationConstants) TexeraException(edu.uci.ics.texera.api.exception.TexeraException) VisualizationOperator(edu.uci.ics.texera.dataflow.sink.VisualizationOperator) Collectors(java.util.stream.Collectors) Span(edu.uci.ics.texera.api.span.Span) SchemaConstants(edu.uci.ics.texera.api.constants.SchemaConstants) StringField(edu.uci.ics.texera.api.field.StringField) ErrorMessages(edu.uci.ics.texera.api.constants.ErrorMessages) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) Schema(edu.uci.ics.texera.api.schema.Schema) Attribute(edu.uci.ics.texera.api.schema.Attribute) StopAnalyzer(org.apache.lucene.analysis.core.StopAnalyzer) DataflowUtils(edu.uci.ics.texera.dataflow.utils.DataflowUtils) IntegerField(edu.uci.ics.texera.api.field.IntegerField) Span(edu.uci.ics.texera.api.span.Span) Tuple(edu.uci.ics.texera.api.tuple.Tuple)

Aggregations

ListField (edu.uci.ics.texera.api.field.ListField): 42; Span (edu.uci.ics.texera.api.span.Span): 40; IField (edu.uci.ics.texera.api.field.IField): 33; ArrayList (java.util.ArrayList): 32; Tuple (edu.uci.ics.texera.api.tuple.Tuple): 27; Schema (edu.uci.ics.texera.api.schema.Schema): 26; Test (org.junit.Test): 19; TextField (edu.uci.ics.texera.api.field.TextField): 11; SchemaConstants (edu.uci.ics.texera.api.constants.SchemaConstants): 8; DataflowException (edu.uci.ics.texera.api.exception.DataflowException): 8; AttributeType (edu.uci.ics.texera.api.schema.AttributeType): 8; Collectors (java.util.stream.Collectors): 8; TexeraException (edu.uci.ics.texera.api.exception.TexeraException): 7; ErrorMessages (edu.uci.ics.texera.api.constants.ErrorMessages): 6; DataflowUtils (edu.uci.ics.texera.dataflow.utils.DataflowUtils): 6; java.util (java.util): 6; AbstractSingleInputOperator (edu.uci.ics.texera.dataflow.common.AbstractSingleInputOperator): 5; Attribute (edu.uci.ics.texera.api.schema.Attribute): 4; JsonCreator (com.fasterxml.jackson.annotation.JsonCreator): 2; JsonProperty (com.fasterxml.jackson.annotation.JsonProperty): 2