use of datawave.query.attributes.Document in project datawave by NationalSecurityAgency.
the class QueryIterator method createDocumentPipeline.
/**
 * Create the document pipeline. It is very important that this pipeline can handle resetting the bottom iterator with a new value. This means that
 * hasNext() needs to consult the next iterator. The only state that may be maintained is the next value made ready by hasNext(); once next() returns that
 * value, the following hasNext() call must consult the underlying iterator again. For example, Iterators.filter() cannot be used because it relies on a
 * Google Guava AbstractIterator that maintains iterator state (failed, ready, done); use the statelessFilter method above instead (a sketch of the idea
 * follows this method).
 *
 * @param deepSourceCopy
 *            a deep copy of the source iterator, used to fetch the document data
 * @param documentSpecificSource
 *            the nested iterator of document keys that drives the pipeline
 * @param columnFamilies
 *            the column families passed through to document evaluation
 * @param inclusive
 *            whether the column families are inclusive
 * @param querySpanCollector
 *            collector for query span (timing) metrics
 * @return an iterator of document keys and documents
 */
public Iterator<Entry<Key, Document>> createDocumentPipeline(SortedKeyValueIterator<Key, Value> deepSourceCopy, final NestedQueryIterator<Key> documentSpecificSource, Collection<ByteSequence> columnFamilies, boolean inclusive, QuerySpanCollector querySpanCollector) {
QuerySpan trackingSpan = null;
if (gatherTimingDetails()) {
trackingSpan = new QuerySpan(getStatsdClient());
}
if (log.isTraceEnabled()) {
log.trace("createDocumentPipeline");
}
final Function<Entry<Key, Document>, Entry<DocumentData, Document>> docMapper;
if (isFieldIndexSatisfyingQuery()) {
if (log.isTraceEnabled()) {
log.trace("isFieldIndexSatisfyingQuery");
}
docMapper = new Function<Entry<Key, Document>, Entry<DocumentData, Document>>() {
@Nullable
@Override
public Entry<DocumentData, Document> apply(@Nullable Entry<Key, Document> input) {
Entry<DocumentData, Document> entry = null;
if (input != null) {
entry = Maps.immutableEntry(new DocumentData(input.getKey(), Collections.singleton(input.getKey()), Collections.EMPTY_LIST, true), input.getValue());
}
return entry;
}
};
} else {
docMapper = new KeyToDocumentData(deepSourceCopy, myEnvironment, documentOptions, super.equality, getEvaluationFilter(), this.includeHierarchyFields, this.includeHierarchyFields);
}
Iterator<Entry<DocumentData, Document>> sourceIterator = Iterators.transform(documentSpecificSource, from -> {
Entry<Key, Document> entry = Maps.immutableEntry(from, documentSpecificSource.document());
return docMapper.apply(entry);
});
// Take the document keys and transform them into Entry<Key,Document>,
// removing attributes from the document which do not fall within the expected time range
Iterator<Entry<Key, Document>> documents = null;
Aggregation a = new Aggregation(this.getTimeFilter(), this.typeMetadataWithNonIndexed, compositeMetadata, this.isIncludeGroupingContext(), this.includeRecordId, this.disableIndexOnlyDocuments(), getEvaluationFilter(), isTrackSizes());
if (gatherTimingDetails()) {
documents = Iterators.transform(sourceIterator, new EvaluationTrackingFunction<>(QuerySpan.Stage.Aggregation, trackingSpan, a));
} else {
documents = Iterators.transform(sourceIterator, a);
}
// Inject the data type as a field if the user requested it
if (this.includeDatatype) {
if (gatherTimingDetails()) {
documents = Iterators.transform(documents, new EvaluationTrackingFunction<>(QuerySpan.Stage.DataTypeAsField, trackingSpan, new DataTypeAsField(this.datatypeKey)));
} else {
documents = Iterators.transform(documents, new DataTypeAsField(this.datatypeKey));
}
}
// Inject the document permutations if required
if (!this.getDocumentPermutations().isEmpty()) {
if (gatherTimingDetails()) {
documents = Iterators.transform(documents, new EvaluationTrackingFunction<>(QuerySpan.Stage.DocumentPermutation, trackingSpan, new DocumentPermutation.DocumentPermutationAggregation(this.getDocumentPermutations())));
} else {
documents = Iterators.transform(documents, new DocumentPermutation.DocumentPermutationAggregation(this.getDocumentPermutations()));
}
}
if (gatherTimingDetails()) {
documents = new EvaluationTrackingIterator(QuerySpan.Stage.DocumentEvaluation, trackingSpan, getEvaluation(documentSpecificSource, deepSourceCopy, documents, compositeMetadata, typeMetadataWithNonIndexed, columnFamilies, inclusive));
} else {
documents = getEvaluation(documentSpecificSource, deepSourceCopy, documents, compositeMetadata, typeMetadataWithNonIndexed, columnFamilies, inclusive);
}
// a hook to allow mapping the document such as with the TLD or Parent
// query logics
// or if the document was not aggregated in the first place because the
// field index fields completely satisfied the query
documents = mapDocument(deepSourceCopy, documents, compositeMetadata);
// apply any configured post processing
documents = getPostProcessingChain(documents);
if (gatherTimingDetails()) {
documents = new EvaluationTrackingIterator(QuerySpan.Stage.PostProcessing, trackingSpan, documents);
}
// Filter out masked values if requested
if (this.filterMaskedValues) {
MaskedValueFilterInterface mvfi = MaskedValueFilterFactory.get(this.isIncludeGroupingContext(), this.isReducedResponse());
if (gatherTimingDetails()) {
documents = Iterators.transform(documents, new EvaluationTrackingFunction<>(QuerySpan.Stage.MaskedValueFilter, trackingSpan, mvfi));
} else {
documents = Iterators.transform(documents, mvfi);
}
}
// now filter the attributes to those with the keep flag set true
if (gatherTimingDetails()) {
documents = Iterators.transform(documents, new EvaluationTrackingFunction<>(QuerySpan.Stage.AttributeKeepFilter, trackingSpan, new AttributeKeepFilter<>()));
} else {
documents = Iterators.transform(documents, new AttributeKeepFilter<>());
}
// Project fields using a whitelist or a blacklist before serialization
if (this.projectResults) {
if (gatherTimingDetails()) {
documents = Iterators.transform(documents, new EvaluationTrackingFunction<>(QuerySpan.Stage.DocumentProjection, trackingSpan, getProjection()));
} else {
documents = Iterators.transform(documents, getProjection());
}
}
// remove the composite entries
documents = Iterators.transform(documents, this.getCompositeProjection());
// Filter out any documents which are now empty (e.g. due to attribute projection or visibility filtering)
if (gatherTimingDetails()) {
documents = statelessFilter(documents, new EvaluationTrackingPredicate<>(QuerySpan.Stage.EmptyDocumentFilter, trackingSpan, new EmptyDocumentFilter()));
documents = Iterators.transform(documents, new EvaluationTrackingFunction<>(QuerySpan.Stage.DocumentMetadata, trackingSpan, new DocumentMetadata()));
} else {
documents = statelessFilter(documents, new EmptyDocumentFilter());
documents = Iterators.transform(documents, new DocumentMetadata());
}
if (!this.limitFieldsMap.isEmpty()) {
if (gatherTimingDetails()) {
documents = Iterators.transform(documents, new EvaluationTrackingFunction<>(QuerySpan.Stage.LimitFields, trackingSpan, new LimitFields(this.getLimitFieldsMap())));
} else {
documents = Iterators.transform(documents, new LimitFields(this.getLimitFieldsMap()));
}
}
// do I need to remove the grouping context I added above?
if (groupingContextAddedByMe) {
if (gatherTimingDetails()) {
documents = Iterators.transform(documents, new EvaluationTrackingFunction<>(QuerySpan.Stage.RemoveGroupingContext, trackingSpan, new RemoveGroupingContext()));
} else {
documents = Iterators.transform(documents, new RemoveGroupingContext());
}
}
// only add the pipeline query span collection iterator which will cache metrics with each document if collectTimingDetails is true
if (collectTimingDetails) {
// if there is not a result, then add the trackingSpan to the
// QuerySpanCollector
// if there was a result, then the metrics from the trackingSpan
// will be added here
documents = new PipelineQuerySpanCollectionIterator(querySpanCollector, trackingSpan, documents);
}
return documents;
}
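
The javadoc above forbids Guava's Iterators.filter() because AbstractIterator caches a failed/ready/done state. Below is a minimal sketch of the stateless alternative it describes; this is an illustration only, not the DATAWAVE statelessFilter implementation, and it assumes a plain java.util.function.Predicate rather than the project's actual signature.

import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.function.Predicate;

public class StatelessFilterSketch<T> implements Iterator<T> {
    private final Iterator<T> delegate;
    private final Predicate<T> predicate;
    private T next; // the only cached state: the next matching element, if any

    public StatelessFilterSketch(Iterator<T> delegate, Predicate<T> predicate) {
        this.delegate = delegate;
        this.predicate = predicate;
    }

    @Override
    public boolean hasNext() {
        // if nothing is cached, always consult the delegate again, unlike Guava's
        // AbstractIterator which remembers a terminal DONE/FAILED state
        while (next == null && delegate.hasNext()) {
            T candidate = delegate.next();
            if (predicate.test(candidate)) {
                next = candidate;
            }
        }
        return next != null;
    }

    @Override
    public T next() {
        if (!hasNext()) {
            throw new NoSuchElementException();
        }
        T result = next;
        next = null; // clear the cached state so the following hasNext() pulls from the delegate again
        return result;
    }
}

Because the only cached state is the single ready element, re-seeding the bottom iterator is observed on the very next hasNext() call, which is what the pipeline reset requirement demands.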
use of datawave.query.attributes.Document in project datawave by NationalSecurityAgency.
the class FacetedTransformer method _transform.
private FacetsBase _transform(Entry<Key, Document> documentEntry) throws EmptyObjectException {
if (documentEntry == null) {
// buildResponse will return a null object if there was only metadata in the document
throw new EmptyObjectException();
}
Key documentKey = correctKey(documentEntry.getKey());
Document document = documentEntry.getValue();
if (null == documentKey || null == document)
throw new IllegalArgumentException("Null key or value. Key:" + documentKey + ", Value: " + documentEntry.getValue());
extractMetrics(document, documentKey);
document.debugDocumentSize(documentKey);
String row = documentKey.getRow().toString();
String colf = documentKey.getColumnFamily().toString();
int index = colf.indexOf("\0");
Preconditions.checkArgument(-1 != index);
String dataType = colf.substring(0, index);
String uid = colf.substring(index + 1);
// We don't have to consult the Document to rebuild the Visibility, the key
// should have the correct top-level visibility
ColumnVisibility eventCV = new ColumnVisibility(documentKey.getColumnVisibility());
FacetsBase output = null;
try {
// build response method here
output = buildResponse(document, documentKey, eventCV, colf, row, this.markingFunctions);
} catch (Exception ex) {
log.error("Error building response document", ex);
throw new RuntimeException(ex);
}
if (output == null) {
// buildResponse will return a null object if there was only metadata in the document
throw new EmptyObjectException();
}
if (cardinalityConfiguration != null) {
collectCardinalities(document, documentKey, uid, dataType);
}
return output;
}
use of datawave.query.attributes.Document in project datawave by NationalSecurityAgency.
the class ContentTransform method apply.
@Nullable
@Override
public Map.Entry<Key, Document> apply(@Nullable Map.Entry<Key, Document> keyDocumentEntry) {
if (keyDocumentEntry != null) {
Document document = keyDocumentEntry.getValue();
Key documentKey = DocumentTransformer.correctKey(keyDocumentEntry.getKey());
String colf = documentKey.getColumnFamily().toString();
int index = colf.indexOf("\0");
String uid = colf.substring(index + 1);
for (String contentFieldName : this.contentFieldNames) {
if (document.containsKey(contentFieldName)) {
Attribute<?> contentField = document.remove(contentFieldName);
if (contentField.getData().toString().equalsIgnoreCase("true")) {
Content c = new Content(uid, contentField.getMetadata(), document.isToKeep());
document.put(contentFieldName, c, false, this.reducedResponse);
}
}
}
}
return keyDocumentEntry;
}
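
Both _transform and apply above split the event key's column family into a datatype and a uid on the first null byte. A small standalone illustration of that split follows; the helper name is hypothetical and only restates the key layout assumed by the code above.

import java.util.AbstractMap;
import java.util.Map;

// Hypothetical helper illustrating the datatype\0uid column family layout assumed above.
public static Map.Entry<String, String> splitDataTypeAndUid(String colf) {
    int index = colf.indexOf('\0');
    if (index == -1) {
        // the methods above either assert this with Preconditions or assume it holds
        throw new IllegalArgumentException("Column family is missing the datatype\\0uid separator: " + colf);
    }
    String dataType = colf.substring(0, index);
    String uid = colf.substring(index + 1);
    return new AbstractMap.SimpleImmutableEntry<>(dataType, uid);
}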
use of datawave.query.attributes.Document in project datawave by NationalSecurityAgency.
the class FacetedFunction method apply.
/*
* (non-Javadoc)
*
* @see rx.functions.Action1#call(java.lang.Object)
*/
@Override
public Entry<Key, Value> apply(Entry<Key, Value> entry) {
Entry<Key, Document> doc = deserializer.apply(entry);
if (null == summarizer) {
summarizer = new MergeSummarization(doc.getKey(), doc.getValue());
}
Iterator<Entry<Key, Document>> finalIter = Iterators.singletonIterator(summarizer.apply(doc));
for (Function<Entry<Key, Document>, Entry<Key, Document>> func : transforms) {
finalIter = Iterators.transform(finalIter, func);
}
return serializer.apply(finalIter.next());
}
use of datawave.query.attributes.Document in project datawave by NationalSecurityAgency.
the class TermOffsetPopulator method getContextMap.
/**
 * Build the term offset map for use in JEXL evaluation (a usage sketch follows this method).
 *
 * @param docKey
 *            key that maps to a document
 * @param keys
 *            set of keys that map to hits on term frequency (tf) fields
 * @param fields
 *            set of fields to remove from the search space
 * @return a map containing the term offset map under Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, or null if a term weight could not be deserialized
 */
public Map<String, Object> getContextMap(Key docKey, Set<Key> keys, Set<String> fields) {
document = new Document();
TermFrequencyIterator tfSource;
// Do not prune if no fields exist or if the tf fields would prune to nothing. TODO skip tf entirely if this would prune to zero
if (fields == null || fields.isEmpty() || fields.size() == termFrequencyFieldValues.keySet().size()) {
tfSource = new TermFrequencyIterator(termFrequencyFieldValues, keys);
} else {
// There are fields to remove, reduce the search space and continue
Multimap<String, String> tfFVs = HashMultimap.create(termFrequencyFieldValues);
fields.forEach(tfFVs::removeAll);
tfSource = new TermFrequencyIterator(tfFVs, keys);
if (tfFVs.size() == 0) {
log.error("Created a TFIter with no field values. Orig fields: " + termFrequencyFieldValues.keySet() + " fields to remove: " + fields);
}
}
Range range = getRange(keys);
try {
tfSource.init(source, null, null);
tfSource.seek(getRange(keys), null, false);
} catch (IOException e) {
log.error("Seek to the range failed: " + range, e);
}
// set the document context on the filter
if (evaluationFilter != null) {
evaluationFilter.startNewDocument(docKey);
}
Map<String, TermFrequencyList> termOffsetMap = Maps.newHashMap();
while (tfSource.hasTop()) {
Key key = tfSource.getTopKey();
FieldValue fv = FieldValue.getFieldValue(key);
// add the zone and term to our internal document
Content attr = new Content(fv.getValue(), source.getTopKey(), evaluationFilter == null || evaluationFilter.keep(key));
// no need to apply the evaluation filter here as the TermFrequencyIterator above is already doing more filtering than we can do here.
// So this filter is simply extraneous. However, if an EventDataQueryFilter implementation gets smarter somehow, then it can be added back in
// here.
// For example the AncestorQueryLogic may require this....
// if (evaluationFilter == null || evaluationFilter.apply(Maps.immutableEntry(key, StringUtils.EMPTY_STRING))) {
this.document.put(fv.getField(), attr);
TreeMultimap<TermFrequencyList.Zone, TermWeightPosition> offsets = TreeMultimap.create();
try {
TermWeight.Info twInfo = TermWeight.Info.parseFrom(tfSource.getTopValue().get());
// if no content expansion fields then assume every field is permitted for unfielded content functions
TermFrequencyList.Zone twZone = new TermFrequencyList.Zone(fv.getField(), (contentExpansionFields == null || contentExpansionFields.isEmpty() || contentExpansionFields.contains(fv.getField())), TermFrequencyList.getEventId(key));
TermWeightPosition.Builder position = new TermWeightPosition.Builder();
for (int i = 0; i < twInfo.getTermOffsetCount(); i++) {
position.setTermWeightOffsetInfo(twInfo, i);
offsets.put(twZone, position.build());
position.reset();
}
} catch (InvalidProtocolBufferException e) {
log.error("Could not deserialize TermWeight protocol buffer for: " + source.getTopKey());
return null;
}
// First time looking up this term in a field
TermFrequencyList tfl = termOffsetMap.get(fv.getValue());
if (null == tfl) {
termOffsetMap.put(fv.getValue(), new TermFrequencyList(offsets));
} else {
// Merge in the offsets for the current field+term with all previous
// offsets from other fields in the same term
tfl.addOffsets(offsets);
}
try {
tfSource.next();
} catch (IOException ioe) {
log.error("Next failed: " + range, ioe);
break;
}
}
// Load the actual map into map that will be put into the JexlContext
Map<String, Object> map = new HashMap<>();
map.put(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffsetMap);
return map;
}
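
A hedged usage sketch follows, showing one way the returned map could be pushed into a JEXL context so content functions can resolve term offsets during evaluation. It assumes Apache Commons JEXL3's MapContext; DATAWAVE's own JexlEvaluation wiring may use a different context class.

import java.util.Map;
import org.apache.commons.jexl3.JexlContext;
import org.apache.commons.jexl3.MapContext;

// Hypothetical helper: copy the map returned by getContextMap into a JEXL context.
// The term offset map then sits under Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME,
// where phrase/adjacency content functions can find it during evaluation.
public static JexlContext toJexlContext(Map<String, Object> contextMap) {
    JexlContext context = new MapContext();
    for (Map.Entry<String, Object> entry : contextMap.entrySet()) {
        context.set(entry.getKey(), entry.getValue());
    }
    return context;
}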