use of datawave.query.function.Aggregation in project datawave by NationalSecurityAgency.
the class QueryIterator method seek.
@Override
public void seek(Range range, Collection<ByteSequence> columnFamilies, boolean inclusive) throws IOException {
    // preserve the original range for use with the FinalDocumentTracking iterator because it is placed after the ResultCountingIterator,
    // so the FinalDocumentTracking iterator needs the start key with the count already appended
    originalRange = range;
    getActiveQueryLog().get(getQueryId()).beginCall(this.originalRange, ActiveQuery.CallType.SEEK);
    Span span = Trace.start("QueryIterator.seek");
    if (!this.isIncludeGroupingContext()
            && (this.query.contains("grouping:") || this.query.contains("matchesInGroup") || this.query.contains("MatchesInGroup") || this.query.contains("atomValuesMatch"))) {
        this.setIncludeGroupingContext(true);
        this.groupingContextAddedByMe = true;
    } else {
        this.groupingContextAddedByMe = false;
    }
    try {
        if (log.isDebugEnabled()) {
            log.debug("Seek range: " + range + " " + query);
        }
        this.range = range;
        // determine whether this is a teardown/rebuild range
        long resultCount = 0;
        if (!range.isStartKeyInclusive()) {
            // see if we can fail fast. If we were rebuilt with the FinalDocument key, then we are already completely done
            if (collectTimingDetails && FinalDocumentTrackingIterator.isFinalDocumentKey(range.getStartKey())) {
                this.seekKeySource = new EmptyTreeIterable();
                this.serializedDocuments = EmptyIterator.emptyIterator();
                prepareKeyValue(span);
                return;
            }
            // see if we have a count in the cf
            Key startKey = range.getStartKey();
            String[] parts = StringUtils.split(startKey.getColumnFamily().toString(), '\0');
            if (parts.length == 3) {
                resultCount = NumericalEncoder.decode(parts[0]).longValue();
                // remove the count from the range
                startKey = new Key(startKey.getRow(), new Text(parts[1] + '\0' + parts[2]), startKey.getColumnQualifier(), startKey.getColumnVisibility(),
                        startKey.getTimestamp());
                this.range = range = new Range(startKey, range.isStartKeyInclusive(), range.getEndKey(), range.isEndKeyInclusive());
            }
        }
        // determine whether this is a document specific range
        Range documentRange = isDocumentSpecificRange(range) ? range : null;
        // if we have a document specific range, but the start key is not inclusive, then we have already returned the document and this scan
        // is done
        if (documentRange != null && !documentRange.isStartKeyInclusive()) {
            if (log.isTraceEnabled()) {
                log.trace("Received non-inclusive event specific range: " + documentRange);
            }
            if (gatherTimingDetails()) {
                this.seekKeySource = new EvaluationTrackingNestedIterator(QuerySpan.Stage.EmptyTree, trackingSpan, new EmptyTreeIterable(), myEnvironment);
            } else {
                this.seekKeySource = new EmptyTreeIterable();
            }
        } else if (documentRange != null && (!this.isContainsIndexOnlyTerms() && this.getTermFrequencyFields().isEmpty() && !super.mustUseFieldIndex)) {
            // the Range is for a single document and the query doesn't reference any index-only or tokenized fields
            if (log.isTraceEnabled()) {
                log.trace("Received event specific range: " + documentRange);
            }
            // We can take a shortcut directly to the event
            Map.Entry<Key, Document> documentKey = Maps.immutableEntry(super.getDocumentKey.apply(documentRange), new Document());
            if (log.isTraceEnabled()) {
                log.trace("Transformed document key: " + documentKey);
            }
            if (gatherTimingDetails()) {
                this.seekKeySource = new EvaluationTrackingNestedIterator(QuerySpan.Stage.DocumentSpecificTree, trackingSpan,
                        new DocumentSpecificNestedIterator(documentKey), myEnvironment);
            } else {
                this.seekKeySource = new DocumentSpecificNestedIterator(documentKey);
            }
        } else {
            this.seekKeySource = buildDocumentIterator(documentRange, range, columnFamilies, inclusive);
        }
        // Create the pipeline iterator for document aggregation and
        // evaluation within a thread pool
        PipelineIterator pipelineIter = PipelineFactory.createIterator(this.seekKeySource, getMaxEvaluationPipelines(), getMaxPipelineCachedResults(),
                getSerialPipelineRequest(), querySpanCollector, trackingSpan, this, sourceForDeepCopies.deepCopy(myEnvironment), myEnvironment, yield,
                yieldThresholdMs, columnFamilies, inclusive);
        pipelineIter.setCollectTimingDetails(collectTimingDetails);
        // TODO pipelineIter.setStatsdHostAndPort(statsdHostAndPort);
        pipelineIter.startPipeline();
        // gather Key,Document Entries from the pipelines
        Iterator<Entry<Key, Document>> pipelineDocuments = pipelineIter;
        if (log.isTraceEnabled()) {
            pipelineDocuments = Iterators.filter(pipelineDocuments, keyDocumentEntry -> {
                log.trace("after pipeline, keyDocumentEntry:" + keyDocumentEntry);
                return true;
            });
        }
        // now apply the unique transform if requested
        UniqueTransform uniquify = getUniqueTransform();
        if (uniquify != null) {
            pipelineDocuments = Iterators.filter(pipelineDocuments, uniquify.getUniquePredicate());
        }
        // apply the grouping transform if requested and if the batch size is greater than zero;
        // if the batch size is 0, then grouping is computed only on the web server
        GroupingTransform groupify = getGroupingTransform();
        if (groupify != null && this.groupFieldsBatchSize > 0) {
            pipelineDocuments = groupingTransform.getGroupingIterator(pipelineDocuments, this.groupFieldsBatchSize, this.yield);
            if (log.isTraceEnabled()) {
                pipelineDocuments = Iterators.filter(pipelineDocuments, keyDocumentEntry -> {
                    log.trace("after grouping, keyDocumentEntry:" + keyDocumentEntry);
                    return true;
                });
            }
        }
        pipelineDocuments = Iterators.filter(pipelineDocuments, keyDocumentEntry -> {
            // last chance before the documents are serialized
            getActiveQueryLog().get(getQueryId()).recordStats(keyDocumentEntry.getValue(), querySpanCollector.getCombinedQuerySpan(null));
            // Always return true since we just want to record data in the ActiveQueryLog
            return true;
        });
        if (this.getReturnType() == ReturnType.kryo) {
            // Serialize the Document using Kryo
            this.serializedDocuments = Iterators.transform(pipelineDocuments, new KryoDocumentSerializer(isReducedResponse(), isCompressResults()));
        } else if (this.getReturnType() == ReturnType.writable) {
            // Use the Writable interface to serialize the Document
            this.serializedDocuments = Iterators.transform(pipelineDocuments, new WritableDocumentSerializer(isReducedResponse()));
        } else if (this.getReturnType() == ReturnType.tostring) {
            // Just return a toString() representation of the document
            this.serializedDocuments = Iterators.transform(pipelineDocuments, new ToStringDocumentSerializer(isReducedResponse()));
        } else {
            throw new IllegalArgumentException("Unknown return type of: " + this.getReturnType());
        }
        if (log.isTraceEnabled()) {
            KryoDocumentDeserializer dser = new KryoDocumentDeserializer();
            this.serializedDocuments = Iterators.filter(this.serializedDocuments, keyValueEntry -> {
                log.trace("after serializing, keyValueEntry:" + dser.apply(keyValueEntry));
                return true;
            });
        }
        // Cannot do this on document specific ranges as the count would place the keys outside the initial range
        if (!sortedUIDs && documentRange == null) {
            this.serializedDocuments = new ResultCountingIterator(serializedDocuments, resultCount, yield);
        } else if (this.sortedUIDs) {
            // we have sorted UIDs, so we can mask out the cq
            this.serializedDocuments = new KeyAdjudicator<>(serializedDocuments, yield);
        }
        // only add the final document tracking iterator, which sends stats back to the client, if collectTimingDetails is true
        if (collectTimingDetails) {
            // if there is no document to return, then add an empty document
            // to store the timing metadata
            this.serializedDocuments = new FinalDocumentTrackingIterator(querySpanCollector, trackingSpan, originalRange, this.serializedDocuments,
                    this.getReturnType(), this.isReducedResponse(), this.isCompressResults(), this.yield);
        }
        if (log.isTraceEnabled()) {
            KryoDocumentDeserializer dser = new KryoDocumentDeserializer();
            this.serializedDocuments = Iterators.filter(this.serializedDocuments, keyValueEntry -> {
                log.trace("finally, considering:" + dser.apply(keyValueEntry));
                return true;
            });
        }
        // Determine if we have items to return
        prepareKeyValue(span);
    } catch (Exception e) {
        handleException(e);
    } finally {
        if (gatherTimingDetails() && trackingSpan != null && querySpanCollector != null) {
            querySpanCollector.addQuerySpan(trackingSpan);
        }
        if (null != span) {
            span.stop();
        }
        QueryStatsDClient client = getStatsdClient();
        if (client != null) {
            client.flush();
        }
        getActiveQueryLog().get(getQueryId()).endCall(this.originalRange, ActiveQuery.CallType.SEEK);
        if (this.key == null && this.value == null) {
            // no entries to return
            getActiveQueryLog().remove(getQueryId(), this.originalRange);
        }
    }
}
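The teardown/rebuild handling above is easiest to see in isolation: when a scan is rebuilt, the non-inclusive start key may still carry the running result count that the ResultCountingIterator packed into the front of the column family, and seek() strips it back out. Below is a minimal, self-contained sketch of that round trip; the encode/decode helpers are hypothetical stand-ins for DataWave's NumericalEncoder (a zero-padded decimal that keeps the prefix lexicographically sortable), not the real encoding.

import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Range;
import org.apache.hadoop.io.Text;

public class CountPrefixRoundTrip {

    // hypothetical stand-in for NumericalEncoder: zero-padded decimal
    static String encode(long count) {
        return String.format("%020d", count);
    }

    static long decode(String encoded) {
        return Long.parseLong(encoded);
    }

    public static void main(String[] args) {
        // a rebuilt start key as the ResultCountingIterator would leave it:
        // count \0 dataType \0 uid packed into the column family
        Key rebuilt = new Key(new Text("20200101_0"), new Text(encode(42L) + '\0' + "myType" + '\0' + "uid0"), new Text(""));

        // the same unpacking step seek() performs: pull the count off the
        // front of the cf and restore the original dataType\0uid cf
        String[] parts = rebuilt.getColumnFamily().toString().split("\0");
        long resultCount = decode(parts[0]);
        Key restored = new Key(rebuilt.getRow(), new Text(parts[1] + '\0' + parts[2]), rebuilt.getColumnQualifier(), rebuilt.getColumnVisibility(),
                rebuilt.getTimestamp());

        // non-inclusive, exactly as a teardown/rebuild range arrives
        Range range = new Range(restored, false, null, false);
        System.out.println(resultCount + " " + range);
    }
}

The decoded count then seeds the replacement ResultCountingIterator, so result numbering continues where the torn-down scan left off.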
use of datawave.query.function.Aggregation in project datawave by NationalSecurityAgency.
the class QueryIterator method createDocumentPipeline.
/**
 * Create the pipeline. It is very important that this pipeline can handle resetting the bottom iterator with a new value. This means that hasNext() needs
 * to call the next iterator. The only state that can be maintained is the next value ready after hasNext() has been called. Once next() returns the value,
 * the next hasNext() call must call the next iterator again. So, for example, Iterators.filter() cannot be used, as it uses a Google Guava
 * AbstractIterator that maintains an iterator state (failed, ready, done); use statelessFilter instead (see the sketch after this method).
 *
 * @param deepSourceCopy
 *            a deep copy of the source iterator
 * @param documentSpecificSource
 *            the document specific source of keys
 * @return iterator of keys and values
 */
public Iterator<Entry<Key, Document>> createDocumentPipeline(SortedKeyValueIterator<Key, Value> deepSourceCopy,
        final NestedQueryIterator<Key> documentSpecificSource, Collection<ByteSequence> columnFamilies, boolean inclusive,
        QuerySpanCollector querySpanCollector) {
    QuerySpan trackingSpan = null;
    if (gatherTimingDetails()) {
        trackingSpan = new QuerySpan(getStatsdClient());
    }
    if (log.isTraceEnabled()) {
        log.trace("createDocumentPipeline");
    }
    final Function<Entry<Key, Document>, Entry<DocumentData, Document>> docMapper;
    if (isFieldIndexSatisfyingQuery()) {
        if (log.isTraceEnabled()) {
            log.trace("isFieldIndexSatisfyingQuery");
        }
        docMapper = new Function<Entry<Key, Document>, Entry<DocumentData, Document>>() {
            @Nullable
            @Override
            public Entry<DocumentData, Document> apply(@Nullable Entry<Key, Document> input) {
                Entry<DocumentData, Document> entry = null;
                if (input != null) {
                    entry = Maps.immutableEntry(new DocumentData(input.getKey(), Collections.singleton(input.getKey()), Collections.emptyList(), true),
                            input.getValue());
                }
                return entry;
            }
        };
    } else {
        docMapper = new KeyToDocumentData(deepSourceCopy, myEnvironment, documentOptions, super.equality, getEvaluationFilter(),
                this.includeHierarchyFields, this.includeHierarchyFields);
    }
    Iterator<Entry<DocumentData, Document>> sourceIterator = Iterators.transform(documentSpecificSource, from -> {
        Entry<Key, Document> entry = Maps.immutableEntry(from, documentSpecificSource.document());
        return docMapper.apply(entry);
    });
    // Take the document Keys and transform them into Entry<Key,Document>,
    // removing Attributes for this Document
    // which do not fall within the expected time range
    Iterator<Entry<Key, Document>> documents = null;
    Aggregation a = new Aggregation(this.getTimeFilter(), this.typeMetadataWithNonIndexed, compositeMetadata, this.isIncludeGroupingContext(),
            this.includeRecordId, this.disableIndexOnlyDocuments(), getEvaluationFilter(), isTrackSizes());
    if (gatherTimingDetails()) {
        documents = Iterators.transform(sourceIterator, new EvaluationTrackingFunction<>(QuerySpan.Stage.Aggregation, trackingSpan, a));
    } else {
        documents = Iterators.transform(sourceIterator, a);
    }
    // Inject the data type as a field if the user requested it
    if (this.includeDatatype) {
        if (gatherTimingDetails()) {
            documents = Iterators.transform(documents,
                    new EvaluationTrackingFunction<>(QuerySpan.Stage.DataTypeAsField, trackingSpan, new DataTypeAsField(this.datatypeKey)));
        } else {
            documents = Iterators.transform(documents, new DataTypeAsField(this.datatypeKey));
        }
    }
    // Inject the document permutations if required
    if (!this.getDocumentPermutations().isEmpty()) {
        if (gatherTimingDetails()) {
            documents = Iterators.transform(documents, new EvaluationTrackingFunction<>(QuerySpan.Stage.DocumentPermutation, trackingSpan,
                    new DocumentPermutation.DocumentPermutationAggregation(this.getDocumentPermutations())));
        } else {
            documents = Iterators.transform(documents, new DocumentPermutation.DocumentPermutationAggregation(this.getDocumentPermutations()));
        }
    }
    if (gatherTimingDetails()) {
        documents = new EvaluationTrackingIterator(QuerySpan.Stage.DocumentEvaluation, trackingSpan,
                getEvaluation(documentSpecificSource, deepSourceCopy, documents, compositeMetadata, typeMetadataWithNonIndexed, columnFamilies, inclusive));
    } else {
        documents = getEvaluation(documentSpecificSource, deepSourceCopy, documents, compositeMetadata, typeMetadataWithNonIndexed, columnFamilies,
                inclusive);
    }
    // a hook to allow mapping the document, such as with the TLD or Parent query logics,
    // or if the document was not aggregated in the first place because the
    // field index fields completely satisfied the query
    documents = mapDocument(deepSourceCopy, documents, compositeMetadata);
    // apply any configured post processing
    documents = getPostProcessingChain(documents);
    if (gatherTimingDetails()) {
        documents = new EvaluationTrackingIterator(QuerySpan.Stage.PostProcessing, trackingSpan, documents);
    }
    // Filter out masked values if requested
    if (this.filterMaskedValues) {
        MaskedValueFilterInterface mvfi = MaskedValueFilterFactory.get(this.isIncludeGroupingContext(), this.isReducedResponse());
        if (gatherTimingDetails()) {
            documents = Iterators.transform(documents, new EvaluationTrackingFunction<>(QuerySpan.Stage.MaskedValueFilter, trackingSpan, mvfi));
        } else {
            documents = Iterators.transform(documents, mvfi);
        }
    }
    // now filter the attributes to those with the keep flag set true
    if (gatherTimingDetails()) {
        documents = Iterators.transform(documents,
                new EvaluationTrackingFunction<>(QuerySpan.Stage.AttributeKeepFilter, trackingSpan, new AttributeKeepFilter<>()));
    } else {
        documents = Iterators.transform(documents, new AttributeKeepFilter<>());
    }
    // Project fields using a whitelist or a blacklist before serialization
    if (this.projectResults) {
        if (gatherTimingDetails()) {
            documents = Iterators.transform(documents, new EvaluationTrackingFunction<>(QuerySpan.Stage.DocumentProjection, trackingSpan, getProjection()));
        } else {
            documents = Iterators.transform(documents, getProjection());
        }
    }
    // remove the composite entries
    documents = Iterators.transform(documents, this.getCompositeProjection());
    // Filter out any Documents which are empty (e.g. due to attribute
    // projection or visibility filtering)
    if (gatherTimingDetails()) {
        documents = statelessFilter(documents, new EvaluationTrackingPredicate<>(QuerySpan.Stage.EmptyDocumentFilter, trackingSpan,
                new EmptyDocumentFilter()));
        documents = Iterators.transform(documents, new EvaluationTrackingFunction<>(QuerySpan.Stage.DocumentMetadata, trackingSpan,
                new DocumentMetadata()));
    } else {
        documents = statelessFilter(documents, new EmptyDocumentFilter());
        documents = Iterators.transform(documents, new DocumentMetadata());
    }
    if (!this.limitFieldsMap.isEmpty()) {
        if (gatherTimingDetails()) {
            documents = Iterators.transform(documents,
                    new EvaluationTrackingFunction<>(QuerySpan.Stage.LimitFields, trackingSpan, new LimitFields(this.getLimitFieldsMap())));
        } else {
            documents = Iterators.transform(documents, new LimitFields(this.getLimitFieldsMap()));
        }
    }
    // do I need to remove the grouping context I added above?
    if (groupingContextAddedByMe) {
        if (gatherTimingDetails()) {
            documents = Iterators.transform(documents,
                    new EvaluationTrackingFunction<>(QuerySpan.Stage.RemoveGroupingContext, trackingSpan, new RemoveGroupingContext()));
        } else {
            documents = Iterators.transform(documents, new RemoveGroupingContext());
        }
    }
    // only add the pipeline query span collection iterator, which will cache metrics with each document, if collectTimingDetails is true
    if (collectTimingDetails) {
        // if there is not a result, then add the trackingSpan to the
        // QuerySpanCollector;
        // if there was a result, then the metrics from the trackingSpan
        // will be added here
        documents = new PipelineQuerySpanCollectionIterator(querySpanCollector, trackingSpan, documents);
    }
    return documents;
}
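The Javadoc's constraint is the reason statelessFilter exists: Guava's Iterators.filter is built on AbstractIterator, whose failed/ready/done state machine caches the outcome of hasNext() and never re-consults a source that has been reset underneath it. The real statelessFilter is defined elsewhere in QueryIterator; the following is only a sketch of the shape such a filter plausibly takes, assuming the only state kept is the single element found by the most recent hasNext().

import java.util.Iterator;
import java.util.NoSuchElementException;
import com.google.common.base.Predicate;

public class StatelessFilterSketch {

    public static <T> Iterator<T> statelessFilter(final Iterator<T> source, final Predicate<T> keep) {
        return new Iterator<T>() {
            // the only cached state: the element found by hasNext(), cleared
            // as soon as next() hands it out, so every later hasNext() call
            // goes back to the (possibly reset) source iterator
            private T next = null;

            @Override
            public boolean hasNext() {
                while (next == null && source.hasNext()) {
                    T candidate = source.next();
                    if (keep.apply(candidate)) {
                        next = candidate;
                    }
                }
                return next != null;
            }

            @Override
            public T next() {
                if (!hasNext()) {
                    throw new NoSuchElementException();
                }
                T result = next;
                next = null;
                return result;
            }
        };
    }
}

Because next() clears the cached element immediately, every subsequent hasNext() goes back to the bottom iterator, which is what allows the pipeline to swap in a new value beneath it.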
use of datawave.query.function.Aggregation in project datawave by NationalSecurityAgency.
the class IndexOnlyFunctionIterator method initializeFetch.
/*
 * Trigger the fetch by creating a stack of iterators based on a specialized, index-only, KeyToDocumentData implementation.
 *
 * @param fieldName The field to be fetched
 *
 * @param keyToDocumentData The specialized, index-only KeyToDocumentData implementation used to fetch the relevant name/value pairs
 *
 * @return an iterator of Key/Document pairs
 */
private <E> Iterator<Entry<Key, Document>> initializeFetch(final String fieldName, final IndexOnlyKeyToDocumentData keyToDocumentData) {
    Collection<Entry<Key, Document>> collection = Collections.emptySet();
    Iterator<Entry<Key, Document>> documents = collection.iterator();
    try {
        // Create a range to load a document with index-only information
        final Range parent = this.parentRange;
        final Key startKey = parent.getStartKey();
        final Text tfRow = startKey.getRow();
        final Text tfCf = new Text(TF_COLUMN_FAMILY);
        Text tfPartialCq = startKey.getColumnFamily();
        if ((tfPartialCq.getLength() == 0) && (null != this.documentKey)) {
            tfPartialCq = this.documentKey.getColumnFamily();
        }
        final ColumnVisibility cv = new ColumnVisibility(startKey.getColumnVisibility());
        long timeStamp = startKey.getTimestamp();
        final Key start = new Key(tfRow, tfCf, tfPartialCq, cv, timeStamp);
        final Key stop = new Key(tfRow, tfCf, tfPartialCq, cv, timeStamp);
        final Range indexOnlyRange = new Range(start, stop);
        // Take the document Keys and transform them into Entry<Key,Document>, which will remove attributes for this document
        // not falling within the expected time range
        final TypeMetadata typeMetadata = this.contextCreator.getTypeMetadata();
        final CompositeMetadata compositeMetadata = this.contextCreator.getCompositeMetadata();
        boolean includeGroupingContext = this.contextCreator.isIncludeGroupingContext();
        final TimeFilter timeFilter = this.contextCreator.getTimeFilter();
        boolean includeRecordId = this.contextCreator.isIncludeRecordId();
        final Aggregation aggregation = new Aggregation(timeFilter, typeMetadata, compositeMetadata, includeGroupingContext, includeRecordId, false, null);
        // Construct an iterator to build the document. Although the DocumentData will be retrieved from the tf section
        // of the shard table, the IndexOnlyKeyToDocumentData will reformat the entries to "look" like records from standard
        // columns.
        final Key documentKey = this.contextCreator.getGetDocumentKey().apply(indexOnlyRange);
        final DocumentSpecificTreeIterable source = new DocumentSpecificTreeIterable(documentKey, keyToDocumentData);
        // Initialize the seek
        source.iterator();
        // Initialize the fetch
        documents = Iterators.transform(keyToDocumentData, aggregation);
    } catch (final Exception e) {
        final String message = "Could not perform function on index-only field '" + fieldName + "' for range " + this.parentRange;
        LOG.error(message, e);
    }
    return documents;
}
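The range built above pins the row, the term-frequency column family, and the dataType\0uid column qualifier, so its start and stop keys are identical and the scan covers exactly one document's tf entries. A minimal sketch of that construction, assuming TF_COLUMN_FAMILY carries the shard table's "tf" family name:

import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Range;
import org.apache.accumulo.core.security.ColumnVisibility;
import org.apache.hadoop.io.Text;

public class IndexOnlyRangeSketch {

    // assumed value of TF_COLUMN_FAMILY
    private static final Text TF = new Text("tf");

    // build a single-document range over the tf section of the shard table,
    // mirroring the start/stop construction in initializeFetch above
    static Range tfRangeFor(Key startKey, Text documentCf) {
        ColumnVisibility cv = new ColumnVisibility(startKey.getColumnVisibility());
        long timeStamp = startKey.getTimestamp();
        Key start = new Key(startKey.getRow(), TF, documentCf, cv, timeStamp);
        Key stop = new Key(startKey.getRow(), TF, documentCf, cv, timeStamp);
        return new Range(start, stop);
    }

    public static void main(String[] args) {
        Key parentStart = new Key(new Text("20200101_0"), new Text("myType\0uid0"), new Text(""));
        System.out.println(tfRangeFor(parentStart, parentStart.getColumnFamily()));
    }
}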
use of datawave.query.function.Aggregation in project datawave by NationalSecurityAgency.
the class DynamicFacetIterator method getDocumentIterator.
@SuppressWarnings({ "unchecked", "rawtypes" })
@Override
public Iterator<Entry<Key, Document>> getDocumentIterator(Range range, Collection<ByteSequence> columnFamilies, boolean inclusive)
        throws IOException, ConfigException, InstantiationException, IllegalAccessException {
    // Otherwise, we have to use the field index
    // Seek() the boolean logic stuff
    createAndSeekIndexIterator(range, columnFamilies, inclusive);
    Function<Entry<Key, Document>, Entry<DocumentData, Document>> keyToDoc = null;
    // TODO consider using the new EventDataQueryExpressionFilter
    EventDataQueryFieldFilter projection = null;
    Iterator<Entry<Key, Document>> documents = null;
    if (!configuration.getFacetedFields().isEmpty()) {
        projection = new EventDataQueryFieldFilter();
        projection.initializeWhitelist(configuration.getFacetedFields());
    }
    if (!configuration.hasFieldLimits() || projection != null) {
        keyToDoc = new KeyToDocumentData(source.deepCopy(myEnvironment), super.equality, projection, this.includeHierarchyFields,
                this.includeHierarchyFields);
    }
    AccumuloTreeIterable<Key, DocumentData> doc = null;
    if (null != keyToDoc) {
        doc = new AccumuloTreeIterable<>(fieldIndexResults.tree, keyToDoc);
    } else {
        if (log.isTraceEnabled()) {
            log.trace("Skipping document lookup, because we don't need it");
        }
        doc = new AccumuloTreeIterable<>(fieldIndexResults.tree, new Function<Entry<Key, Document>, Entry<DocumentData, Document>>() {
            @Override
            @Nullable
            public Entry<DocumentData, Document> apply(@Nullable Entry<Key, Document> input) {
                Set<Key> docKeys = Sets.newHashSet();
                List<Entry<Key, Value>> attrs = Lists.newArrayList();
                return Maps.immutableEntry(new DocumentData(input.getKey(), docKeys, attrs, true), input.getValue());
            }
        });
    }
    doc.seek(range, columnFamilies, inclusive);
    TypeMetadata typeMetadata = this.getTypeMetadata();
    documents = Iterators.transform(doc.iterator(),
            new Aggregation(this.getTimeFilter(), typeMetadata, compositeMetadata, this.isIncludeGroupingContext(), this.includeRecordId, false, null));
    switch (configuration.getType()) {
        case SHARD_COUNT:
        case DAY_COUNT:
            SortedKeyValueIterator<Key, Value> sourceDeepCopy = source.deepCopy(myEnvironment);
            documents = getEvaluation(sourceDeepCopy, documents, compositeMetadata, typeMetadata, columnFamilies, inclusive);
            // Take the document Keys and transform it into Entry<Key,Document>, removing Attributes for this Document
            // which do not fall within the expected time range
            documents = Iterators.transform(documents, new DocumentCountCardinality(configuration.getType(), !merge));
        default:
            break;
    }
    return documents;
}
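As in the QueryIterator pipelines earlier, this method grows its result by re-binding the documents iterator through successive Iterators.transform calls. Every stage is lazy; nothing is aggregated, evaluated, or counted until the consumer pulls an entry. A toy illustration of that pattern, with hypothetical stand-in stages rather than DataWave transforms:

import java.util.AbstractMap.SimpleImmutableEntry;
import java.util.Iterator;
import java.util.Map.Entry;
import com.google.common.collect.Iterators;

public class LazyStageChain {

    public static void main(String[] args) {
        Iterator<Entry<String, Integer>> documents = Iterators.singletonIterator(new SimpleImmutableEntry<>("uid0", 1));

        // stage 1: a stand-in for the Aggregation transform
        documents = Iterators.transform(documents, e -> new SimpleImmutableEntry<>(e.getKey(), e.getValue() + 1));

        // stage 2: a stand-in for a count/cardinality transform
        documents = Iterators.transform(documents, e -> new SimpleImmutableEntry<>(e.getKey().toUpperCase(), e.getValue()));

        // no stage runs until the chain is consumed here
        documents.forEachRemaining(System.out::println); // prints UID0=2
    }
}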
use of datawave.query.function.Aggregation in project datawave by NationalSecurityAgency.
the class QueryIterator method mapDocument.
protected Iterator<Entry<Key, Document>> mapDocument(SortedKeyValueIterator<Key, Value> deepSourceCopy, Iterator<Entry<Key, Document>> documents,
        CompositeMetadata compositeMetadata) {
    // now let's pull the data if we need to
    if (log.isTraceEnabled()) {
        log.trace("mapDocument " + fieldIndexSatisfiesQuery);
    }
    if (fieldIndexSatisfiesQuery) {
        final KeyToDocumentData docMapper = new KeyToDocumentData(deepSourceCopy, this.myEnvironment, this.documentOptions, super.equality,
                getEvaluationFilter(), this.includeHierarchyFields, this.includeHierarchyFields);
        Iterator<Tuple2<Key, Document>> mappedDocuments = Iterators.transform(documents,
                new GetDocument(docMapper, new Aggregation(this.getTimeFilter(), typeMetadataWithNonIndexed, compositeMetadata,
                        this.isIncludeGroupingContext(), this.includeRecordId, this.disableIndexOnlyDocuments(), getEvaluationFilter(), isTrackSizes())));
        Iterator<Entry<Key, Document>> retDocuments = Iterators.transform(mappedDocuments, new TupleToEntry<>());
        // Inject the document permutations if required
        if (!this.getDocumentPermutations().isEmpty()) {
            if (gatherTimingDetails()) {
                retDocuments = Iterators.transform(retDocuments, new EvaluationTrackingFunction<>(QuerySpan.Stage.DocumentPermutation, trackingSpan,
                        new DocumentPermutation.DocumentPermutationAggregation(this.getDocumentPermutations())));
            } else {
                retDocuments = Iterators.transform(retDocuments, new DocumentPermutation.DocumentPermutationAggregation(this.getDocumentPermutations()));
            }
        }
        return retDocuments;
    }
    return documents;
}
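mapDocument pairs each key with its re-aggregated document as a Tuple2 (via GetDocument) and then flattens the pair back into a Map.Entry with TupleToEntry. A minimal sketch of that adapter, assuming a two-field Tuple2 shape like DataWave's:

import java.util.AbstractMap.SimpleImmutableEntry;
import java.util.Map.Entry;
import com.google.common.base.Function;

public class TupleToEntrySketch {

    // assumed shape of datawave's Tuple2
    static final class Tuple2<A, B> {
        private final A first;
        private final B second;

        Tuple2(A first, B second) {
            this.first = first;
            this.second = second;
        }

        A first() {
            return first;
        }

        B second() {
            return second;
        }
    }

    // the adapter TupleToEntry plausibly amounts to: repackage the pair as
    // an immutable Map.Entry so downstream stages see Entry<Key,Document>
    static <A, B> Function<Tuple2<A, B>, Entry<A, B>> tupleToEntry() {
        return tuple -> new SimpleImmutableEntry<>(tuple.first(), tuple.second());
    }

    public static void main(String[] args) {
        Entry<String, Integer> entry = TupleToEntrySketch.<String, Integer>tupleToEntry().apply(new Tuple2<>("uid0", 1));
        System.out.println(entry); // uid0=1
    }
}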