use of datawave.query.attributes.Document in project datawave by NationalSecurityAgency.
the class QueryIterator method createDocumentPipeline.
/**
 * Create the document pipeline. It is very important that this pipeline can handle resetting the bottom iterator with a new value. This means that
 * hasNext() needs to consult the next iterator. The only state that may be maintained is the next value made ready by hasNext(); once next() returns that
 * value, the following hasNext() call must consult the underlying iterator again. For example, Iterators.filter() cannot be used because it relies on a
 * Google Guava AbstractIterator that maintains iterator state (failed, ready, done); use the statelessFilter method above instead (a sketch of the idea
 * follows this method).
 *
 * @param deepSourceCopy
 *            a deep copy of the source iterator, used to fetch the document data
 * @param documentSpecificSource
 *            the nested iterator of document keys that drives the pipeline
 * @param columnFamilies
 *            the column families passed through to document evaluation
 * @param inclusive
 *            whether the column families are inclusive
 * @param querySpanCollector
 *            collector for query span (timing) metrics
 * @return an iterator of document keys and documents
 */
public Iterator<Entry<Key, Document>> createDocumentPipeline(SortedKeyValueIterator<Key, Value> deepSourceCopy, final NestedQueryIterator<Key> documentSpecificSource, Collection<ByteSequence> columnFamilies, boolean inclusive, QuerySpanCollector querySpanCollector) {
QuerySpan trackingSpan = null;
if (gatherTimingDetails()) {
trackingSpan = new QuerySpan(getStatsdClient());
}
if (log.isTraceEnabled()) {
log.trace("createDocumentPipeline");
}
final Function<Entry<Key, Document>, Entry<DocumentData, Document>> docMapper;
if (isFieldIndexSatisfyingQuery()) {
if (log.isTraceEnabled()) {
log.trace("isFieldIndexSatisfyingQuery");
}
docMapper = new Function<Entry<Key, Document>, Entry<DocumentData, Document>>() {
@Nullable
@Override
public Entry<DocumentData, Document> apply(@Nullable Entry<Key, Document> input) {
Entry<DocumentData, Document> entry = null;
if (input != null) {
entry = Maps.immutableEntry(new DocumentData(input.getKey(), Collections.singleton(input.getKey()), Collections.EMPTY_LIST, true), input.getValue());
}
return entry;
}
};
} else {
docMapper = new KeyToDocumentData(deepSourceCopy, myEnvironment, documentOptions, super.equality, getEvaluationFilter(), this.includeHierarchyFields, this.includeHierarchyFields);
}
Iterator<Entry<DocumentData, Document>> sourceIterator = Iterators.transform(documentSpecificSource, from -> {
Entry<Key, Document> entry = Maps.immutableEntry(from, documentSpecificSource.document());
return docMapper.apply(entry);
});
// Take the document keys and transform them into Entry<Key,Document>,
// removing attributes from the document which do not fall within the expected time range
Iterator<Entry<Key, Document>> documents = null;
Aggregation a = new Aggregation(this.getTimeFilter(), this.typeMetadataWithNonIndexed, compositeMetadata, this.isIncludeGroupingContext(), this.includeRecordId, this.disableIndexOnlyDocuments(), getEvaluationFilter(), isTrackSizes());
if (gatherTimingDetails()) {
documents = Iterators.transform(sourceIterator, new EvaluationTrackingFunction<>(QuerySpan.Stage.Aggregation, trackingSpan, a));
} else {
documents = Iterators.transform(sourceIterator, a);
}
// Inject the data type as a field if the user requested it
if (this.includeDatatype) {
if (gatherTimingDetails()) {
documents = Iterators.transform(documents, new EvaluationTrackingFunction<>(QuerySpan.Stage.DataTypeAsField, trackingSpan, new DataTypeAsField(this.datatypeKey)));
} else {
documents = Iterators.transform(documents, new DataTypeAsField(this.datatypeKey));
}
}
// Inject the document permutations if required
if (!this.getDocumentPermutations().isEmpty()) {
if (gatherTimingDetails()) {
documents = Iterators.transform(documents, new EvaluationTrackingFunction<>(QuerySpan.Stage.DocumentPermutation, trackingSpan, new DocumentPermutation.DocumentPermutationAggregation(this.getDocumentPermutations())));
} else {
documents = Iterators.transform(documents, new DocumentPermutation.DocumentPermutationAggregation(this.getDocumentPermutations()));
}
}
if (gatherTimingDetails()) {
documents = new EvaluationTrackingIterator(QuerySpan.Stage.DocumentEvaluation, trackingSpan, getEvaluation(documentSpecificSource, deepSourceCopy, documents, compositeMetadata, typeMetadataWithNonIndexed, columnFamilies, inclusive));
} else {
documents = getEvaluation(documentSpecificSource, deepSourceCopy, documents, compositeMetadata, typeMetadataWithNonIndexed, columnFamilies, inclusive);
}
// a hook to allow mapping the document such as with the TLD or Parent
// query logics
// or if the document was not aggregated in the first place because the
// field index fields completely satisfied the query
documents = mapDocument(deepSourceCopy, documents, compositeMetadata);
// apply any configured post processing
documents = getPostProcessingChain(documents);
if (gatherTimingDetails()) {
documents = new EvaluationTrackingIterator(QuerySpan.Stage.PostProcessing, trackingSpan, documents);
}
// Filter out masked values if requested
if (this.filterMaskedValues) {
MaskedValueFilterInterface mvfi = MaskedValueFilterFactory.get(this.isIncludeGroupingContext(), this.isReducedResponse());
if (gatherTimingDetails()) {
documents = Iterators.transform(documents, new EvaluationTrackingFunction<>(QuerySpan.Stage.MaskedValueFilter, trackingSpan, mvfi));
} else {
documents = Iterators.transform(documents, mvfi);
}
}
// now filter the attributes to those with the keep flag set true
if (gatherTimingDetails()) {
documents = Iterators.transform(documents, new EvaluationTrackingFunction<>(QuerySpan.Stage.AttributeKeepFilter, trackingSpan, new AttributeKeepFilter<>()));
} else {
documents = Iterators.transform(documents, new AttributeKeepFilter<>());
}
// Project fields using a whitelist or a blacklist before serialization
if (this.projectResults) {
if (gatherTimingDetails()) {
documents = Iterators.transform(documents, new EvaluationTrackingFunction<>(QuerySpan.Stage.DocumentProjection, trackingSpan, getProjection()));
} else {
documents = Iterators.transform(documents, getProjection());
}
}
// remove the composite entries
documents = Iterators.transform(documents, this.getCompositeProjection());
// Filter out any documents which are now empty (e.g. due to attribute projection or visibility filtering)
if (gatherTimingDetails()) {
documents = statelessFilter(documents, new EvaluationTrackingPredicate<>(QuerySpan.Stage.EmptyDocumentFilter, trackingSpan, new EmptyDocumentFilter()));
documents = Iterators.transform(documents, new EvaluationTrackingFunction<>(QuerySpan.Stage.DocumentMetadata, trackingSpan, new DocumentMetadata()));
} else {
documents = statelessFilter(documents, new EmptyDocumentFilter());
documents = Iterators.transform(documents, new DocumentMetadata());
}
if (!this.limitFieldsMap.isEmpty()) {
if (gatherTimingDetails()) {
documents = Iterators.transform(documents, new EvaluationTrackingFunction<>(QuerySpan.Stage.LimitFields, trackingSpan, new LimitFields(this.getLimitFieldsMap())));
} else {
documents = Iterators.transform(documents, new LimitFields(this.getLimitFieldsMap()));
}
}
// do I need to remove the grouping context I added above?
if (groupingContextAddedByMe) {
if (gatherTimingDetails()) {
documents = Iterators.transform(documents, new EvaluationTrackingFunction<>(QuerySpan.Stage.RemoveGroupingContext, trackingSpan, new RemoveGroupingContext()));
} else {
documents = Iterators.transform(documents, new RemoveGroupingContext());
}
}
// only add the pipeline query span collection iterator which will cache metrics with each document if collectTimingDetails is true
if (collectTimingDetails) {
// if there is not a result, then add the trackingSpan to the
// QuerySpanCollector
// if there was a result, then the metrics from the trackingSpan
// will be added here
documents = new PipelineQuerySpanCollectionIterator(querySpanCollector, trackingSpan, documents);
}
return documents;
}
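
The javadoc above forbids Guava's Iterators.filter() because AbstractIterator caches a failed/ready/done state. Below is a minimal sketch of the stateless alternative it describes; this is an illustration only, not the DATAWAVE statelessFilter implementation, and it assumes a plain java.util.function.Predicate rather than the project's actual signature.

import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.function.Predicate;

public class StatelessFilterSketch<T> implements Iterator<T> {
    private final Iterator<T> delegate;
    private final Predicate<T> predicate;
    private T next; // the only cached state: the next matching element, if any

    public StatelessFilterSketch(Iterator<T> delegate, Predicate<T> predicate) {
        this.delegate = delegate;
        this.predicate = predicate;
    }

    @Override
    public boolean hasNext() {
        // if nothing is cached, always consult the delegate again, unlike Guava's
        // AbstractIterator which remembers a terminal DONE/FAILED state
        while (next == null && delegate.hasNext()) {
            T candidate = delegate.next();
            if (predicate.test(candidate)) {
                next = candidate;
            }
        }
        return next != null;
    }

    @Override
    public T next() {
        if (!hasNext()) {
            throw new NoSuchElementException();
        }
        T result = next;
        next = null; // clear the cached state so the following hasNext() pulls from the delegate again
        return result;
    }
}

Because the only cached state is the single ready element, re-seeding the bottom iterator is observed on the very next hasNext() call, which is what the pipeline reset requirement demands.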
use of datawave.query.attributes.Document in project datawave by NationalSecurityAgency.
the class FacetedTransformer method _transform.
private FacetsBase _transform(Entry<Key, Document> documentEntry) throws EmptyObjectException {
if (documentEntry == null) {
// buildResponse will return a null object if there was only metadata in the document
throw new EmptyObjectException();
}
Key documentKey = correctKey(documentEntry.getKey());
Document document = documentEntry.getValue();
if (null == documentKey || null == document)
throw new IllegalArgumentException("Null key or value. Key:" + documentKey + ", Value: " + documentEntry.getValue());
extractMetrics(document, documentKey);
document.debugDocumentSize(documentKey);
String row = documentKey.getRow().toString();
String colf = documentKey.getColumnFamily().toString();
int index = colf.indexOf("\0");
Preconditions.checkArgument(-1 != index);
String dataType = colf.substring(0, index);
String uid = colf.substring(index + 1);
// We don't have to consult the Document to rebuild the Visibility, the key
// should have the correct top-level visibility
ColumnVisibility eventCV = new ColumnVisibility(documentKey.getColumnVisibility());
FacetsBase output = null;
try {
// build response method here
output = buildResponse(document, documentKey, eventCV, colf, row, this.markingFunctions);
} catch (Exception ex) {
log.error("Error building response document", ex);
throw new RuntimeException(ex);
}
if (output == null) {
// buildResponse will return a null object if there was only metadata in the document
throw new EmptyObjectException();
}
if (cardinalityConfiguration != null) {
collectCardinalities(document, documentKey, uid, dataType);
}
return output;
}
use of datawave.query.attributes.Document in project datawave by NationalSecurityAgency.
the class ContentTransform method apply.
@Nullable
@Override
public Map.Entry<Key, Document> apply(@Nullable Map.Entry<Key, Document> keyDocumentEntry) {
if (keyDocumentEntry != null) {
Document document = keyDocumentEntry.getValue();
Key documentKey = DocumentTransformer.correctKey(keyDocumentEntry.getKey());
String colf = documentKey.getColumnFamily().toString();
int index = colf.indexOf("\0");
String uid = colf.substring(index + 1);
for (String contentFieldName : this.contentFieldNames) {
if (document.containsKey(contentFieldName)) {
Attribute<?> contentField = document.remove(contentFieldName);
if (contentField.getData().toString().equalsIgnoreCase("true")) {
Content c = new Content(uid, contentField.getMetadata(), document.isToKeep());
document.put(contentFieldName, c, false, this.reducedResponse);
}
}
}
}
return keyDocumentEntry;
}
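
Both _transform and apply above split the event key's column family into a datatype and a uid on the first null byte. A small standalone illustration of that split follows; the helper name is hypothetical and only restates the key layout assumed by the code above.

import java.util.AbstractMap;
import java.util.Map;

// Hypothetical helper illustrating the datatype\0uid column family layout assumed above.
public static Map.Entry<String, String> splitDataTypeAndUid(String colf) {
    int index = colf.indexOf('\0');
    if (index == -1) {
        // the methods above either assert this with Preconditions or assume it holds
        throw new IllegalArgumentException("Column family is missing the datatype\\0uid separator: " + colf);
    }
    String dataType = colf.substring(0, index);
    String uid = colf.substring(index + 1);
    return new AbstractMap.SimpleImmutableEntry<>(dataType, uid);
}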
use of datawave.query.attributes.Document in project datawave by NationalSecurityAgency.
the class FacetedFunction method apply.
/*
* (non-Javadoc)
*
* @see rx.functions.Action1#call(java.lang.Object)
*/
@Override
public Entry<Key, Value> apply(Entry<Key, Value> entry) {
Entry<Key, Document> doc = deserializer.apply(entry);
if (null == summarizer) {
summarizer = new MergeSummarization(doc.getKey(), doc.getValue());
}
Iterator<Entry<Key, Document>> finalIter = Iterators.singletonIterator(summarizer.apply(doc));
for (Function<Entry<Key, Document>, Entry<Key, Document>> func : transforms) {
finalIter = Iterators.transform(finalIter, func);
}
return serializer.apply(finalIter.next());
}
use of datawave.query.attributes.Document in project datawave by NationalSecurityAgency.
the class TermOffsetPopulator method getContextMap.
/**
 * Build the term offset map for use in JEXL evaluation (a usage sketch follows this method).
 *
 * @param docKey
 *            key that maps to a document
 * @param keys
 *            set of keys that map to hits on term frequency (tf) fields
 * @param fields
 *            set of fields to remove from the search space
 * @return a map containing the term offset map under Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, or null if a term weight could not be deserialized
 */
public Map<String, Object> getContextMap(Key docKey, Set<Key> keys, Set<String> fields) {
document = new Document();
TermFrequencyIterator tfSource;
// Do not prune if no fields exist or if the tf fields would prune to nothing. TODO skip tf entirely if this would prune to zero
if (fields == null || fields.isEmpty() || fields.size() == termFrequencyFieldValues.keySet().size()) {
tfSource = new TermFrequencyIterator(termFrequencyFieldValues, keys);
} else {
// There are fields to remove, reduce the search space and continue
Multimap<String, String> tfFVs = HashMultimap.create(termFrequencyFieldValues);
fields.forEach(tfFVs::removeAll);
tfSource = new TermFrequencyIterator(tfFVs, keys);
if (tfFVs.size() == 0) {
log.error("Created a TFIter with no field values. Orig fields: " + termFrequencyFieldValues.keySet() + " fields to remove: " + fields);
}
}
Range range = getRange(keys);
try {
tfSource.init(source, null, null);
tfSource.seek(getRange(keys), null, false);
} catch (IOException e) {
log.error("Seek to the range failed: " + range, e);
}
// set the document context on the filter
if (evaluationFilter != null) {
evaluationFilter.startNewDocument(docKey);
}
Map<String, TermFrequencyList> termOffsetMap = Maps.newHashMap();
while (tfSource.hasTop()) {
Key key = tfSource.getTopKey();
FieldValue fv = FieldValue.getFieldValue(key);
// add the zone and term to our internal document
Content attr = new Content(fv.getValue(), source.getTopKey(), evaluationFilter == null || evaluationFilter.keep(key));
// no need to apply the evaluation filter here as the TermFrequencyIterator above is already doing more filtering than we can do here.
// So this filter is simply extraneous. However, if an EventDataQueryFilter implementation gets smarter somehow, then it can be added back in
// here.
// For example the AncestorQueryLogic may require this....
// if (evaluationFilter == null || evaluationFilter.apply(Maps.immutableEntry(key, StringUtils.EMPTY_STRING))) {
this.document.put(fv.getField(), attr);
TreeMultimap<TermFrequencyList.Zone, TermWeightPosition> offsets = TreeMultimap.create();
try {
TermWeight.Info twInfo = TermWeight.Info.parseFrom(tfSource.getTopValue().get());
// if no content expansion fields then assume every field is permitted for unfielded content functions
TermFrequencyList.Zone twZone = new TermFrequencyList.Zone(fv.getField(), (contentExpansionFields == null || contentExpansionFields.isEmpty() || contentExpansionFields.contains(fv.getField())), TermFrequencyList.getEventId(key));
TermWeightPosition.Builder position = new TermWeightPosition.Builder();
for (int i = 0; i < twInfo.getTermOffsetCount(); i++) {
position.setTermWeightOffsetInfo(twInfo, i);
offsets.put(twZone, position.build());
position.reset();
}
} catch (InvalidProtocolBufferException e) {
log.error("Could not deserialize TermWeight protocol buffer for: " + source.getTopKey());
return null;
}
// First time looking up this term in a field
TermFrequencyList tfl = termOffsetMap.get(fv.getValue());
if (null == tfl) {
termOffsetMap.put(fv.getValue(), new TermFrequencyList(offsets));
} else {
// Merge in the offsets for the current field+term with all previous
// offsets from other fields in the same term
tfl.addOffsets(offsets);
}
try {
tfSource.next();
} catch (IOException ioe) {
log.error("Next failed: " + range, ioe);
break;
}
}
// Load the actual map into map that will be put into the JexlContext
Map<String, Object> map = new HashMap<>();
map.put(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffsetMap);
return map;
}
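
A hedged usage sketch follows, showing one way the returned map could be pushed into a JEXL context so content functions can resolve term offsets during evaluation. It assumes Apache Commons JEXL3's MapContext; DATAWAVE's own JexlEvaluation wiring may use a different context class.

import java.util.Map;
import org.apache.commons.jexl3.JexlContext;
import org.apache.commons.jexl3.MapContext;

// Hypothetical helper: copy the map returned by getContextMap into a JEXL context.
// The term offset map then sits under Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME,
// where phrase/adjacency content functions can find it during evaluation.
public static JexlContext toJexlContext(Map<String, Object> contextMap) {
    JexlContext context = new MapContext();
    for (Map.Entry<String, Object> entry : contextMap.entrySet()) {
        context.set(entry.getKey(), entry.getValue());
    }
    return context;
}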