use of datawave.query.function.Aggregation in project datawave by NationalSecurityAgency.
the class QueryIterator method seek.
@Override
public void seek(Range range, Collection<ByteSequence> columnFamilies, boolean inclusive) throws IOException {
    // preserve the original range for use with the FinalDocumentTracking iterator because it is placed after the ResultCountingIterator,
    // so the FinalDocumentTracking iterator needs the start key with the count already appended
    originalRange = range;
    getActiveQueryLog().get(getQueryId()).beginCall(this.originalRange, ActiveQuery.CallType.SEEK);
    Span span = Trace.start("QueryIterator.seek");
    if (!this.isIncludeGroupingContext()
            && (this.query.contains("grouping:") || this.query.contains("matchesInGroup") || this.query.contains("MatchesInGroup") || this.query.contains("atomValuesMatch"))) {
        this.setIncludeGroupingContext(true);
        this.groupingContextAddedByMe = true;
    } else {
        this.groupingContextAddedByMe = false;
    }
    try {
        if (log.isDebugEnabled()) {
            log.debug("Seek range: " + range + " " + query);
        }
        this.range = range;
        // determine whether this is a teardown/rebuild range
        long resultCount = 0;
        if (!range.isStartKeyInclusive()) {
            // see if we can fail fast. If we were rebuilt with the FinalDocument key, then we are already completely done
            if (collectTimingDetails && FinalDocumentTrackingIterator.isFinalDocumentKey(range.getStartKey())) {
                this.seekKeySource = new EmptyTreeIterable();
                this.serializedDocuments = EmptyIterator.emptyIterator();
                prepareKeyValue(span);
                return;
            }
            // see if we have a count in the cf
            Key startKey = range.getStartKey();
            String[] parts = StringUtils.split(startKey.getColumnFamily().toString(), '\0');
            if (parts.length == 3) {
                resultCount = NumericalEncoder.decode(parts[0]).longValue();
                // remove the count from the range
                startKey = new Key(startKey.getRow(), new Text(parts[1] + '\0' + parts[2]), startKey.getColumnQualifier(), startKey.getColumnVisibility(),
                        startKey.getTimestamp());
                this.range = range = new Range(startKey, range.isStartKeyInclusive(), range.getEndKey(), range.isEndKeyInclusive());
            }
        }
        // determine whether this is a document specific range
        Range documentRange = isDocumentSpecificRange(range) ? range : null;
        // if we have a document specific range, but the start key is not inclusive, then we have already returned the document and this scan
        // is done
        if (documentRange != null && !documentRange.isStartKeyInclusive()) {
            if (log.isTraceEnabled()) {
                log.trace("Received non-inclusive event specific range: " + documentRange);
            }
            if (gatherTimingDetails()) {
                this.seekKeySource = new EvaluationTrackingNestedIterator(QuerySpan.Stage.EmptyTree, trackingSpan, new EmptyTreeIterable(), myEnvironment);
            } else {
                this.seekKeySource = new EmptyTreeIterable();
            }
        } else if (documentRange != null && (!this.isContainsIndexOnlyTerms() && this.getTermFrequencyFields().isEmpty() && !super.mustUseFieldIndex)) {
            // the Range is for a single document and the query doesn't reference any index-only or tokenized fields
            if (log.isTraceEnabled()) {
                log.trace("Received event specific range: " + documentRange);
            }
            // We can take a shortcut directly to the event
            Map.Entry<Key, Document> documentKey = Maps.immutableEntry(super.getDocumentKey.apply(documentRange), new Document());
            if (log.isTraceEnabled()) {
                log.trace("Transformed document key: " + documentKey);
            }
            if (gatherTimingDetails()) {
                this.seekKeySource = new EvaluationTrackingNestedIterator(QuerySpan.Stage.DocumentSpecificTree, trackingSpan,
                        new DocumentSpecificNestedIterator(documentKey), myEnvironment);
            } else {
                this.seekKeySource = new DocumentSpecificNestedIterator(documentKey);
            }
        } else {
            this.seekKeySource = buildDocumentIterator(documentRange, range, columnFamilies, inclusive);
        }
        // Create the pipeline iterator for document aggregation and
        // evaluation within a thread pool
        PipelineIterator pipelineIter = PipelineFactory.createIterator(this.seekKeySource, getMaxEvaluationPipelines(), getMaxPipelineCachedResults(),
                getSerialPipelineRequest(), querySpanCollector, trackingSpan, this, sourceForDeepCopies.deepCopy(myEnvironment), myEnvironment, yield,
                yieldThresholdMs, columnFamilies, inclusive);
        pipelineIter.setCollectTimingDetails(collectTimingDetails);
        // TODO pipelineIter.setStatsdHostAndPort(statsdHostAndPort);
        pipelineIter.startPipeline();
        // gather Key,Document Entries from the pipelines
        Iterator<Entry<Key, Document>> pipelineDocuments = pipelineIter;
        if (log.isTraceEnabled()) {
            pipelineDocuments = Iterators.filter(pipelineDocuments, keyDocumentEntry -> {
                log.trace("after pipeline, keyDocumentEntry:" + keyDocumentEntry);
                return true;
            });
        }
        // now apply the unique transform if requested
        UniqueTransform uniquify = getUniqueTransform();
        if (uniquify != null) {
            pipelineDocuments = Iterators.filter(pipelineDocuments, uniquify.getUniquePredicate());
        }
        // apply the grouping transform if requested and if the batch size is greater than zero;
        // if the batch size is 0, then grouping is computed only on the web server
        GroupingTransform groupify = getGroupingTransform();
        if (groupify != null && this.groupFieldsBatchSize > 0) {
            pipelineDocuments = groupingTransform.getGroupingIterator(pipelineDocuments, this.groupFieldsBatchSize, this.yield);
            if (log.isTraceEnabled()) {
                pipelineDocuments = Iterators.filter(pipelineDocuments, keyDocumentEntry -> {
                    log.trace("after grouping, keyDocumentEntry:" + keyDocumentEntry);
                    return true;
                });
            }
        }
        pipelineDocuments = Iterators.filter(pipelineDocuments, keyDocumentEntry -> {
            // last chance before the documents are serialized
            getActiveQueryLog().get(getQueryId()).recordStats(keyDocumentEntry.getValue(), querySpanCollector.getCombinedQuerySpan(null));
            // Always return true since we just want to record data in the ActiveQueryLog
            return true;
        });
        if (this.getReturnType() == ReturnType.kryo) {
            // Serialize the Document using Kryo
            this.serializedDocuments = Iterators.transform(pipelineDocuments, new KryoDocumentSerializer(isReducedResponse(), isCompressResults()));
        } else if (this.getReturnType() == ReturnType.writable) {
            // Use the Writable interface to serialize the Document
            this.serializedDocuments = Iterators.transform(pipelineDocuments, new WritableDocumentSerializer(isReducedResponse()));
        } else if (this.getReturnType() == ReturnType.tostring) {
            // Just return a toString() representation of the document
            this.serializedDocuments = Iterators.transform(pipelineDocuments, new ToStringDocumentSerializer(isReducedResponse()));
        } else {
            throw new IllegalArgumentException("Unknown return type of: " + this.getReturnType());
        }
        if (log.isTraceEnabled()) {
            KryoDocumentDeserializer dser = new KryoDocumentDeserializer();
            this.serializedDocuments = Iterators.filter(this.serializedDocuments, keyValueEntry -> {
                log.trace("after serializing, keyValueEntry:" + dser.apply(keyValueEntry));
                return true;
            });
        }
        // Cannot do this on document specific ranges as the count would place the keys outside the initial range
        if (!sortedUIDs && documentRange == null) {
            this.serializedDocuments = new ResultCountingIterator(serializedDocuments, resultCount, yield);
        } else if (this.sortedUIDs) {
            // we have sorted UIDs, so we can mask out the cq
            this.serializedDocuments = new KeyAdjudicator<>(serializedDocuments, yield);
        }
        // only add the final document tracking iterator, which sends stats back to the client, if collectTimingDetails is true
        if (collectTimingDetails) {
            // if there is no document to return, then add an empty document
            // to store the timing metadata
            this.serializedDocuments = new FinalDocumentTrackingIterator(querySpanCollector, trackingSpan, originalRange, this.serializedDocuments,
                    this.getReturnType(), this.isReducedResponse(), this.isCompressResults(), this.yield);
        }
        if (log.isTraceEnabled()) {
            KryoDocumentDeserializer dser = new KryoDocumentDeserializer();
            this.serializedDocuments = Iterators.filter(this.serializedDocuments, keyValueEntry -> {
                log.trace("finally, considering:" + dser.apply(keyValueEntry));
                return true;
            });
        }
        // Determine if we have items to return
        prepareKeyValue(span);
    } catch (Exception e) {
        handleException(e);
    } finally {
        if (gatherTimingDetails() && trackingSpan != null && querySpanCollector != null) {
            querySpanCollector.addQuerySpan(trackingSpan);
        }
        if (null != span) {
            span.stop();
        }
        QueryStatsDClient client = getStatsdClient();
        if (client != null) {
            client.flush();
        }
        getActiveQueryLog().get(getQueryId()).endCall(this.originalRange, ActiveQuery.CallType.SEEK);
        if (this.key == null && this.value == null) {
            // no entries to return
            getActiveQueryLog().remove(getQueryId(), this.originalRange);
        }
    }
}
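The teardown/rebuild handling above is easiest to see in isolation: when a scan is rebuilt, the non-inclusive start key may still carry the running result count that the ResultCountingIterator packed into the front of the column family, and seek() strips it back out. Below is a minimal, self-contained sketch of that round trip; the encode/decode helpers are hypothetical stand-ins for DataWave's NumericalEncoder (a zero-padded decimal that keeps the prefix lexicographically sortable), not the real encoding.

import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Range;
import org.apache.hadoop.io.Text;

public class CountPrefixRoundTrip {

    // hypothetical stand-in for NumericalEncoder: zero-padded decimal
    static String encode(long count) {
        return String.format("%020d", count);
    }

    static long decode(String encoded) {
        return Long.parseLong(encoded);
    }

    public static void main(String[] args) {
        // a rebuilt start key as the ResultCountingIterator would leave it:
        // count \0 dataType \0 uid packed into the column family
        Key rebuilt = new Key(new Text("20200101_0"), new Text(encode(42L) + '\0' + "myType" + '\0' + "uid0"), new Text(""));

        // the same unpacking step seek() performs: pull the count off the
        // front of the cf and restore the original dataType\0uid cf
        String[] parts = rebuilt.getColumnFamily().toString().split("\0");
        long resultCount = decode(parts[0]);
        Key restored = new Key(rebuilt.getRow(), new Text(parts[1] + '\0' + parts[2]), rebuilt.getColumnQualifier(), rebuilt.getColumnVisibility(),
                rebuilt.getTimestamp());

        // non-inclusive, exactly as a teardown/rebuild range arrives
        Range range = new Range(restored, false, null, false);
        System.out.println(resultCount + " " + range);
    }
}

The decoded count then seeds the replacement ResultCountingIterator, so result numbering continues where the torn-down scan left off.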
use of datawave.query.function.Aggregation in project datawave by NationalSecurityAgency.
the class QueryIterator method createDocumentPipeline.
/**
 * Create the pipeline. It is very important that this pipeline can handle resetting the bottom iterator with a new value. This means that hasNext() needs
 * to call the next iterator. The only state that can be maintained is the next value ready after hasNext() has been called. Once next() returns the value,
 * the next hasNext() call must call the next iterator again. So, for example, Iterators.filter() cannot be used, as it uses a Google Guava
 * AbstractIterator that maintains an iterator state (failed, ready, done); use statelessFilter instead (see the sketch after this method).
 *
 * @param deepSourceCopy
 *            a deep copy of the source iterator
 * @param documentSpecificSource
 *            the document specific source of keys
 * @return iterator of keys and values
 */
public Iterator<Entry<Key, Document>> createDocumentPipeline(SortedKeyValueIterator<Key, Value> deepSourceCopy,
        final NestedQueryIterator<Key> documentSpecificSource, Collection<ByteSequence> columnFamilies, boolean inclusive,
        QuerySpanCollector querySpanCollector) {
    QuerySpan trackingSpan = null;
    if (gatherTimingDetails()) {
        trackingSpan = new QuerySpan(getStatsdClient());
    }
    if (log.isTraceEnabled()) {
        log.trace("createDocumentPipeline");
    }
    final Function<Entry<Key, Document>, Entry<DocumentData, Document>> docMapper;
    if (isFieldIndexSatisfyingQuery()) {
        if (log.isTraceEnabled()) {
            log.trace("isFieldIndexSatisfyingQuery");
        }
        docMapper = new Function<Entry<Key, Document>, Entry<DocumentData, Document>>() {
            @Nullable
            @Override
            public Entry<DocumentData, Document> apply(@Nullable Entry<Key, Document> input) {
                Entry<DocumentData, Document> entry = null;
                if (input != null) {
                    entry = Maps.immutableEntry(new DocumentData(input.getKey(), Collections.singleton(input.getKey()), Collections.emptyList(), true),
                            input.getValue());
                }
                return entry;
            }
        };
    } else {
        docMapper = new KeyToDocumentData(deepSourceCopy, myEnvironment, documentOptions, super.equality, getEvaluationFilter(),
                this.includeHierarchyFields, this.includeHierarchyFields);
    }
    Iterator<Entry<DocumentData, Document>> sourceIterator = Iterators.transform(documentSpecificSource, from -> {
        Entry<Key, Document> entry = Maps.immutableEntry(from, documentSpecificSource.document());
        return docMapper.apply(entry);
    });
    // Take the document Keys and transform them into Entry<Key,Document>,
    // removing Attributes for this Document
    // which do not fall within the expected time range
    Iterator<Entry<Key, Document>> documents = null;
    Aggregation a = new Aggregation(this.getTimeFilter(), this.typeMetadataWithNonIndexed, compositeMetadata, this.isIncludeGroupingContext(),
            this.includeRecordId, this.disableIndexOnlyDocuments(), getEvaluationFilter(), isTrackSizes());
    if (gatherTimingDetails()) {
        documents = Iterators.transform(sourceIterator, new EvaluationTrackingFunction<>(QuerySpan.Stage.Aggregation, trackingSpan, a));
    } else {
        documents = Iterators.transform(sourceIterator, a);
    }
    // Inject the data type as a field if the user requested it
    if (this.includeDatatype) {
        if (gatherTimingDetails()) {
            documents = Iterators.transform(documents,
                    new EvaluationTrackingFunction<>(QuerySpan.Stage.DataTypeAsField, trackingSpan, new DataTypeAsField(this.datatypeKey)));
        } else {
            documents = Iterators.transform(documents, new DataTypeAsField(this.datatypeKey));
        }
    }
    // Inject the document permutations if required
    if (!this.getDocumentPermutations().isEmpty()) {
        if (gatherTimingDetails()) {
            documents = Iterators.transform(documents, new EvaluationTrackingFunction<>(QuerySpan.Stage.DocumentPermutation, trackingSpan,
                    new DocumentPermutation.DocumentPermutationAggregation(this.getDocumentPermutations())));
        } else {
            documents = Iterators.transform(documents, new DocumentPermutation.DocumentPermutationAggregation(this.getDocumentPermutations()));
        }
    }
    if (gatherTimingDetails()) {
        documents = new EvaluationTrackingIterator(QuerySpan.Stage.DocumentEvaluation, trackingSpan,
                getEvaluation(documentSpecificSource, deepSourceCopy, documents, compositeMetadata, typeMetadataWithNonIndexed, columnFamilies, inclusive));
    } else {
        documents = getEvaluation(documentSpecificSource, deepSourceCopy, documents, compositeMetadata, typeMetadataWithNonIndexed, columnFamilies,
                inclusive);
    }
    // a hook to allow mapping the document, such as with the TLD or Parent query logics,
    // or if the document was not aggregated in the first place because the
    // field index fields completely satisfied the query
    documents = mapDocument(deepSourceCopy, documents, compositeMetadata);
    // apply any configured post processing
    documents = getPostProcessingChain(documents);
    if (gatherTimingDetails()) {
        documents = new EvaluationTrackingIterator(QuerySpan.Stage.PostProcessing, trackingSpan, documents);
    }
    // Filter out masked values if requested
    if (this.filterMaskedValues) {
        MaskedValueFilterInterface mvfi = MaskedValueFilterFactory.get(this.isIncludeGroupingContext(), this.isReducedResponse());
        if (gatherTimingDetails()) {
            documents = Iterators.transform(documents, new EvaluationTrackingFunction<>(QuerySpan.Stage.MaskedValueFilter, trackingSpan, mvfi));
        } else {
            documents = Iterators.transform(documents, mvfi);
        }
    }
    // now filter the attributes to those with the keep flag set true
    if (gatherTimingDetails()) {
        documents = Iterators.transform(documents,
                new EvaluationTrackingFunction<>(QuerySpan.Stage.AttributeKeepFilter, trackingSpan, new AttributeKeepFilter<>()));
    } else {
        documents = Iterators.transform(documents, new AttributeKeepFilter<>());
    }
    // Project fields using a whitelist or a blacklist before serialization
    if (this.projectResults) {
        if (gatherTimingDetails()) {
            documents = Iterators.transform(documents, new EvaluationTrackingFunction<>(QuerySpan.Stage.DocumentProjection, trackingSpan, getProjection()));
        } else {
            documents = Iterators.transform(documents, getProjection());
        }
    }
    // remove the composite entries
    documents = Iterators.transform(documents, this.getCompositeProjection());
    // Filter out any Documents which are empty (e.g. due to attribute
    // projection or visibility filtering)
    if (gatherTimingDetails()) {
        documents = statelessFilter(documents, new EvaluationTrackingPredicate<>(QuerySpan.Stage.EmptyDocumentFilter, trackingSpan,
                new EmptyDocumentFilter()));
        documents = Iterators.transform(documents, new EvaluationTrackingFunction<>(QuerySpan.Stage.DocumentMetadata, trackingSpan,
                new DocumentMetadata()));
    } else {
        documents = statelessFilter(documents, new EmptyDocumentFilter());
        documents = Iterators.transform(documents, new DocumentMetadata());
    }
    if (!this.limitFieldsMap.isEmpty()) {
        if (gatherTimingDetails()) {
            documents = Iterators.transform(documents,
                    new EvaluationTrackingFunction<>(QuerySpan.Stage.LimitFields, trackingSpan, new LimitFields(this.getLimitFieldsMap())));
        } else {
            documents = Iterators.transform(documents, new LimitFields(this.getLimitFieldsMap()));
        }
    }
    // do I need to remove the grouping context I added above?
    if (groupingContextAddedByMe) {
        if (gatherTimingDetails()) {
            documents = Iterators.transform(documents,
                    new EvaluationTrackingFunction<>(QuerySpan.Stage.RemoveGroupingContext, trackingSpan, new RemoveGroupingContext()));
        } else {
            documents = Iterators.transform(documents, new RemoveGroupingContext());
        }
    }
    // only add the pipeline query span collection iterator, which will cache metrics with each document, if collectTimingDetails is true
    if (collectTimingDetails) {
        // if there is not a result, then add the trackingSpan to the
        // QuerySpanCollector;
        // if there was a result, then the metrics from the trackingSpan
        // will be added here
        documents = new PipelineQuerySpanCollectionIterator(querySpanCollector, trackingSpan, documents);
    }
    return documents;
}
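The Javadoc's constraint is the reason statelessFilter exists: Guava's Iterators.filter is built on AbstractIterator, whose failed/ready/done state machine caches the outcome of hasNext() and never re-consults a source that has been reset underneath it. The real statelessFilter is defined elsewhere in QueryIterator; the following is only a sketch of the shape such a filter plausibly takes, assuming the only state kept is the single element found by the most recent hasNext().

import java.util.Iterator;
import java.util.NoSuchElementException;
import com.google.common.base.Predicate;

public class StatelessFilterSketch {

    public static <T> Iterator<T> statelessFilter(final Iterator<T> source, final Predicate<T> keep) {
        return new Iterator<T>() {
            // the only cached state: the element found by hasNext(), cleared
            // as soon as next() hands it out, so every later hasNext() call
            // goes back to the (possibly reset) source iterator
            private T next = null;

            @Override
            public boolean hasNext() {
                while (next == null && source.hasNext()) {
                    T candidate = source.next();
                    if (keep.apply(candidate)) {
                        next = candidate;
                    }
                }
                return next != null;
            }

            @Override
            public T next() {
                if (!hasNext()) {
                    throw new NoSuchElementException();
                }
                T result = next;
                next = null;
                return result;
            }
        };
    }
}

Because next() clears the cached element immediately, every subsequent hasNext() goes back to the bottom iterator, which is what allows the pipeline to swap in a new value beneath it.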
use of datawave.query.function.Aggregation in project datawave by NationalSecurityAgency.
the class IndexOnlyFunctionIterator method initializeFetch.
/*
 * Trigger the fetch by creating a stack of iterators based on a specialized, index-only, KeyToDocumentData implementation.
 *
 * @param fieldName The field to be fetched
 *
 * @param keyToDocumentData The specialized, index-only KeyToDocumentData implementation used to fetch the relevant name/value pairs
 *
 * @return an iterator of Key/Document pairs
 */
private <E> Iterator<Entry<Key, Document>> initializeFetch(final String fieldName, final IndexOnlyKeyToDocumentData keyToDocumentData) {
    Collection<Entry<Key, Document>> collection = Collections.emptySet();
    Iterator<Entry<Key, Document>> documents = collection.iterator();
    try {
        // Create a range to load a document with index-only information
        final Range parent = this.parentRange;
        final Key startKey = parent.getStartKey();
        final Text tfRow = startKey.getRow();
        final Text tfCf = new Text(TF_COLUMN_FAMILY);
        Text tfPartialCq = startKey.getColumnFamily();
        if ((tfPartialCq.getLength() == 0) && (null != this.documentKey)) {
            tfPartialCq = this.documentKey.getColumnFamily();
        }
        final ColumnVisibility cv = new ColumnVisibility(startKey.getColumnVisibility());
        long timeStamp = startKey.getTimestamp();
        final Key start = new Key(tfRow, tfCf, tfPartialCq, cv, timeStamp);
        final Key stop = new Key(tfRow, tfCf, tfPartialCq, cv, timeStamp);
        final Range indexOnlyRange = new Range(start, stop);
        // Take the document Keys and transform them into Entry<Key,Document>, which will remove attributes for this document
        // not falling within the expected time range
        final TypeMetadata typeMetadata = this.contextCreator.getTypeMetadata();
        final CompositeMetadata compositeMetadata = this.contextCreator.getCompositeMetadata();
        boolean includeGroupingContext = this.contextCreator.isIncludeGroupingContext();
        final TimeFilter timeFilter = this.contextCreator.getTimeFilter();
        boolean includeRecordId = this.contextCreator.isIncludeRecordId();
        final Aggregation aggregation = new Aggregation(timeFilter, typeMetadata, compositeMetadata, includeGroupingContext, includeRecordId, false, null);
        // Construct an iterator to build the document. Although the DocumentData will be retrieved from the tf section
        // of the shard table, the IndexOnlyKeyToDocumentData will reformat the entries to "look" like records from standard
        // columns.
        final Key documentKey = this.contextCreator.getGetDocumentKey().apply(indexOnlyRange);
        final DocumentSpecificTreeIterable source = new DocumentSpecificTreeIterable(documentKey, keyToDocumentData);
        // Initialize the seek
        source.iterator();
        // Initialize the fetch
        documents = Iterators.transform(keyToDocumentData, aggregation);
    } catch (final Exception e) {
        final String message = "Could not perform function on index-only field '" + fieldName + "' for range " + this.parentRange;
        LOG.error(message, e);
    }
    return documents;
}
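The range built above pins the row, the term-frequency column family, and the dataType\0uid column qualifier, so its start and stop keys are identical and the scan covers exactly one document's tf entries. A minimal sketch of that construction, assuming TF_COLUMN_FAMILY carries the shard table's "tf" family name:

import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Range;
import org.apache.accumulo.core.security.ColumnVisibility;
import org.apache.hadoop.io.Text;

public class IndexOnlyRangeSketch {

    // assumed value of TF_COLUMN_FAMILY
    private static final Text TF = new Text("tf");

    // build a single-document range over the tf section of the shard table,
    // mirroring the start/stop construction in initializeFetch above
    static Range tfRangeFor(Key startKey, Text documentCf) {
        ColumnVisibility cv = new ColumnVisibility(startKey.getColumnVisibility());
        long timeStamp = startKey.getTimestamp();
        Key start = new Key(startKey.getRow(), TF, documentCf, cv, timeStamp);
        Key stop = new Key(startKey.getRow(), TF, documentCf, cv, timeStamp);
        return new Range(start, stop);
    }

    public static void main(String[] args) {
        Key parentStart = new Key(new Text("20200101_0"), new Text("myType\0uid0"), new Text(""));
        System.out.println(tfRangeFor(parentStart, parentStart.getColumnFamily()));
    }
}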
use of datawave.query.function.Aggregation in project datawave by NationalSecurityAgency.
the class DynamicFacetIterator method getDocumentIterator.
@SuppressWarnings({ "unchecked", "rawtypes" })
@Override
public Iterator<Entry<Key, Document>> getDocumentIterator(Range range, Collection<ByteSequence> columnFamilies, boolean inclusive)
        throws IOException, ConfigException, InstantiationException, IllegalAccessException {
    // Otherwise, we have to use the field index
    // Seek() the boolean logic stuff
    createAndSeekIndexIterator(range, columnFamilies, inclusive);
    Function<Entry<Key, Document>, Entry<DocumentData, Document>> keyToDoc = null;
    // TODO consider using the new EventDataQueryExpressionFilter
    EventDataQueryFieldFilter projection = null;
    Iterator<Entry<Key, Document>> documents = null;
    if (!configuration.getFacetedFields().isEmpty()) {
        projection = new EventDataQueryFieldFilter();
        projection.initializeWhitelist(configuration.getFacetedFields());
    }
    if (!configuration.hasFieldLimits() || projection != null) {
        keyToDoc = new KeyToDocumentData(source.deepCopy(myEnvironment), super.equality, projection, this.includeHierarchyFields,
                this.includeHierarchyFields);
    }
    AccumuloTreeIterable<Key, DocumentData> doc = null;
    if (null != keyToDoc) {
        doc = new AccumuloTreeIterable<>(fieldIndexResults.tree, keyToDoc);
    } else {
        if (log.isTraceEnabled()) {
            log.trace("Skipping document lookup, because we don't need it");
        }
        doc = new AccumuloTreeIterable<>(fieldIndexResults.tree, new Function<Entry<Key, Document>, Entry<DocumentData, Document>>() {
            @Override
            @Nullable
            public Entry<DocumentData, Document> apply(@Nullable Entry<Key, Document> input) {
                Set<Key> docKeys = Sets.newHashSet();
                List<Entry<Key, Value>> attrs = Lists.newArrayList();
                return Maps.immutableEntry(new DocumentData(input.getKey(), docKeys, attrs, true), input.getValue());
            }
        });
    }
    doc.seek(range, columnFamilies, inclusive);
    TypeMetadata typeMetadata = this.getTypeMetadata();
    documents = Iterators.transform(doc.iterator(),
            new Aggregation(this.getTimeFilter(), typeMetadata, compositeMetadata, this.isIncludeGroupingContext(), this.includeRecordId, false, null));
    switch (configuration.getType()) {
        case SHARD_COUNT:
        case DAY_COUNT:
            SortedKeyValueIterator<Key, Value> sourceDeepCopy = source.deepCopy(myEnvironment);
            documents = getEvaluation(sourceDeepCopy, documents, compositeMetadata, typeMetadata, columnFamilies, inclusive);
            // Take the document Keys and transform it into Entry<Key,Document>, removing Attributes for this Document
            // which do not fall within the expected time range
            documents = Iterators.transform(documents, new DocumentCountCardinality(configuration.getType(), !merge));
        default:
            break;
    }
    return documents;
}
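As in the QueryIterator pipelines earlier, this method grows its result by re-binding the documents iterator through successive Iterators.transform calls. Every stage is lazy; nothing is aggregated, evaluated, or counted until the consumer pulls an entry. A toy illustration of that pattern, with hypothetical stand-in stages rather than DataWave transforms:

import java.util.AbstractMap.SimpleImmutableEntry;
import java.util.Iterator;
import java.util.Map.Entry;
import com.google.common.collect.Iterators;

public class LazyStageChain {

    public static void main(String[] args) {
        Iterator<Entry<String, Integer>> documents = Iterators.singletonIterator(new SimpleImmutableEntry<>("uid0", 1));

        // stage 1: a stand-in for the Aggregation transform
        documents = Iterators.transform(documents, e -> new SimpleImmutableEntry<>(e.getKey(), e.getValue() + 1));

        // stage 2: a stand-in for a count/cardinality transform
        documents = Iterators.transform(documents, e -> new SimpleImmutableEntry<>(e.getKey().toUpperCase(), e.getValue()));

        // no stage runs until the chain is consumed here
        documents.forEachRemaining(System.out::println); // prints UID0=2
    }
}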
use of datawave.query.function.Aggregation in project datawave by NationalSecurityAgency.
the class QueryIterator method mapDocument.
protected Iterator<Entry<Key, Document>> mapDocument(SortedKeyValueIterator<Key, Value> deepSourceCopy, Iterator<Entry<Key, Document>> documents,
        CompositeMetadata compositeMetadata) {
    // now let's pull the data if we need to
    if (log.isTraceEnabled()) {
        log.trace("mapDocument " + fieldIndexSatisfiesQuery);
    }
    if (fieldIndexSatisfiesQuery) {
        final KeyToDocumentData docMapper = new KeyToDocumentData(deepSourceCopy, this.myEnvironment, this.documentOptions, super.equality,
                getEvaluationFilter(), this.includeHierarchyFields, this.includeHierarchyFields);
        Iterator<Tuple2<Key, Document>> mappedDocuments = Iterators.transform(documents,
                new GetDocument(docMapper, new Aggregation(this.getTimeFilter(), typeMetadataWithNonIndexed, compositeMetadata,
                        this.isIncludeGroupingContext(), this.includeRecordId, this.disableIndexOnlyDocuments(), getEvaluationFilter(), isTrackSizes())));
        Iterator<Entry<Key, Document>> retDocuments = Iterators.transform(mappedDocuments, new TupleToEntry<>());
        // Inject the document permutations if required
        if (!this.getDocumentPermutations().isEmpty()) {
            if (gatherTimingDetails()) {
                retDocuments = Iterators.transform(retDocuments, new EvaluationTrackingFunction<>(QuerySpan.Stage.DocumentPermutation, trackingSpan,
                        new DocumentPermutation.DocumentPermutationAggregation(this.getDocumentPermutations())));
            } else {
                retDocuments = Iterators.transform(retDocuments, new DocumentPermutation.DocumentPermutationAggregation(this.getDocumentPermutations()));
            }
        }
        return retDocuments;
    }
    return documents;
}
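mapDocument pairs each key with its re-aggregated document as a Tuple2 (via GetDocument) and then flattens the pair back into a Map.Entry with TupleToEntry. A minimal sketch of that adapter, assuming a two-field Tuple2 shape like DataWave's:

import java.util.AbstractMap.SimpleImmutableEntry;
import java.util.Map.Entry;
import com.google.common.base.Function;

public class TupleToEntrySketch {

    // assumed shape of datawave's Tuple2
    static final class Tuple2<A, B> {
        private final A first;
        private final B second;

        Tuple2(A first, B second) {
            this.first = first;
            this.second = second;
        }

        A first() {
            return first;
        }

        B second() {
            return second;
        }
    }

    // the adapter TupleToEntry plausibly amounts to: repackage the pair as
    // an immutable Map.Entry so downstream stages see Entry<Key,Document>
    static <A, B> Function<Tuple2<A, B>, Entry<A, B>> tupleToEntry() {
        return tuple -> new SimpleImmutableEntry<>(tuple.first(), tuple.second());
    }

    public static void main(String[] args) {
        Entry<String, Integer> entry = TupleToEntrySketch.<String, Integer>tupleToEntry().apply(new Tuple2<>("uid0", 1));
        System.out.println(entry); // uid0=1
    }
}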