Search in sources :

Example 1 with TermWeightPosition

use of datawave.ingest.protobuf.TermWeightPosition in project datawave by NationalSecurityAgency.

the class TermOffsetPopulator method getContextMap.

/**
 * Build TermOffset map for use in JexlEvaluation.
 * <p>
 * As a side effect this resets {@code document} to a fresh {@link Document} and adds one {@link Content} attribute to it for every term frequency key
 * that is visited.
 *
 * @param docKey
 *            key that maps to a document
 * @param keys
 *            set of keys that map to hits on tf fields
 * @param fields
 *            set of fields to remove from the search space; may be null or empty, in which case nothing is pruned
 * @return a map with a single entry, keyed by {@code Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME}, whose value is the term to TermFrequencyList map; or
 *         {@code null} if a term frequency value could not be deserialized
 */
public Map<String, Object> getContextMap(Key docKey, Set<Key> keys, Set<String> fields) {
    document = new Document();
    TermFrequencyIterator tfSource;
    // Do not prune if no fields exist or if the tf fields would prune to nothing. TODO skip tf entirely if this would prune to zero
    if (fields == null || fields.isEmpty() || fields.size() == termFrequencyFieldValues.keySet().size()) {
        tfSource = new TermFrequencyIterator(termFrequencyFieldValues, keys);
    } else {
        // There are fields to remove, reduce the search space and continue
        Multimap<String, String> tfFVs = HashMultimap.create(termFrequencyFieldValues);
        fields.forEach(tfFVs::removeAll);
        tfSource = new TermFrequencyIterator(tfFVs, keys);
        if (tfFVs.size() == 0) {
            log.error("Created a TFIter with no field values. Orig fields: " + termFrequencyFieldValues.keySet() + " fields to remove: " + fields);
        }
    }
    // Compute the range once so the range that is sought is exactly the range reported on failure
    Range range = getRange(keys);
    try {
        tfSource.init(source, null, null);
        tfSource.seek(range, null, false);
    } catch (IOException e) {
        // A failed seek is only logged; the loop below then processes whatever the iterator exposes
        log.error("Seek to the range failed: " + range, e);
    }
    // set the document context on the filter
    if (evaluationFilter != null) {
        evaluationFilter.startNewDocument(docKey);
    }
    Map<String, TermFrequencyList> termOffsetMap = Maps.newHashMap();
    while (tfSource.hasTop()) {
        Key key = tfSource.getTopKey();
        FieldValue fv = FieldValue.getFieldValue(key);
        // add the zone and term to our internal document
        Content attr = new Content(fv.getValue(), source.getTopKey(), evaluationFilter == null || evaluationFilter.keep(key));
        // no need to apply the evaluation filter here as the TermFrequencyIterator above is already doing more filtering than we can do here.
        // So this filter is simply extraneous. However if an EventDataQueryFilter implementation gets smarter somehow, then it can be added back in
        // here.
        // For example the AncestorQueryLogic may require this....
        // if (evaluationFilter == null || evaluationFilter.apply(Maps.immutableEntry(key, StringUtils.EMPTY_STRING))) {
        this.document.put(fv.getField(), attr);
        TreeMultimap<TermFrequencyList.Zone, TermWeightPosition> offsets = TreeMultimap.create();
        try {
            TermWeight.Info twInfo = TermWeight.Info.parseFrom(tfSource.getTopValue().get());
            // if no content expansion fields then assume every field is permitted for unfielded content functions
            boolean contentExpansionField = contentExpansionFields == null || contentExpansionFields.isEmpty() || contentExpansionFields.contains(fv.getField());
            TermFrequencyList.Zone twZone = new TermFrequencyList.Zone(fv.getField(), contentExpansionField, TermFrequencyList.getEventId(key));
            // One builder is reused for every offset; it is reset after each build
            TermWeightPosition.Builder position = new TermWeightPosition.Builder();
            for (int i = 0; i < twInfo.getTermOffsetCount(); i++) {
                position.setTermWeightOffsetInfo(twInfo, i);
                offsets.put(twZone, position.build());
                position.reset();
            }
        } catch (InvalidProtocolBufferException e) {
            // Keep the cause in the log so the malformed value can be diagnosed
            log.error("Could not deserialize TermWeight protocol buffer for: " + source.getTopKey(), e);
            return null;
        }
        // First time looking up this term in a field
        TermFrequencyList tfl = termOffsetMap.get(fv.getValue());
        if (null == tfl) {
            termOffsetMap.put(fv.getValue(), new TermFrequencyList(offsets));
        } else {
            // Merge in the offsets for the current field+term with all previous
            // offsets from other fields in the same term
            tfl.addOffsets(offsets);
        }
        try {
            tfSource.next();
        } catch (IOException ioe) {
            log.error("Next failed: " + range, ioe);
            break;
        }
    }
    // Load the actual map into map that will be put into the JexlContext
    Map<String, Object> map = new HashMap<>();
    map.put(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffsetMap);
    return map;
}
Also used : HashMap(java.util.HashMap) TermFrequencyIterator(datawave.core.iterators.TermFrequencyIterator) Document(datawave.query.attributes.Document) TermWeightPosition(datawave.ingest.protobuf.TermWeightPosition) TermFrequencyList(datawave.query.jexl.functions.TermFrequencyList) InvalidProtocolBufferException(com.google.protobuf.InvalidProtocolBufferException) IOException(java.io.IOException) Range(org.apache.accumulo.core.data.Range) Content(datawave.query.attributes.Content) TermWeight(datawave.ingest.protobuf.TermWeight) Key(org.apache.accumulo.core.data.Key)

Example 2 with TermWeightPosition

use of datawave.ingest.protobuf.TermWeightPosition in project datawave by NationalSecurityAgency.

the class ContentFunctionsTest method testEvaluation1.

/**
 * Evaluates the content "within" function (first argument "1") for the terms 'dog' and 'cat' against offsets registered in the CONTENT zone, and expects
 * the evaluation to report a match.
 */
@Test
public void testEvaluation1() {
    final String withinQuery = buildFunction(ContentFunctions.CONTENT_WITHIN_FUNCTION_NAME, "1", Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, "'dog'",
                    "'cat'");
    final Expression withinExpression = engine.createExpression(withinQuery);

    final List<TermWeightPosition> dogPositions = asList(Arrays.asList(1, 2, 3), Arrays.asList(0, 0, 0));
    // match (6-2) should match (3+1)
    final List<TermWeightPosition> catPositions = asList(Arrays.asList(5, 6, 7), Arrays.asList(0, 2, 0));

    // Register both terms under the same zone name and event id
    final Zone dogZone = new Zone("CONTENT", true, eventId);
    termOffSetMap.put("dog", new TermFrequencyList(Maps.immutableEntry(dogZone, dogPositions)));
    final Zone catZone = new Zone("CONTENT", true, eventId);
    termOffSetMap.put("cat", new TermFrequencyList(Maps.immutableEntry(catZone, catPositions)));

    context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap);
    final Object result = withinExpression.evaluate(context);
    Assert.assertTrue(expect(result, true));
}
Also used : Expression(org.apache.commons.jexl2.Expression) Zone(datawave.query.jexl.functions.TermFrequencyList.Zone) TermWeightPosition(datawave.ingest.protobuf.TermWeightPosition) Test(org.junit.Test)

Example 3 with TermWeightPosition

use of datawave.ingest.protobuf.TermWeightPosition in project datawave by NationalSecurityAgency.

the class ContentFunctionsTest method testEvaluationPhraseTermOverlapFail.

/**
 * Evaluates the phrase function for 'cat', 'rat', 'dog' where 'cat' and 'rat' share offset 2 and 'dog' sits at offset 1, and expects the evaluation not
 * to report a match.
 */
@Test
public void testEvaluationPhraseTermOverlapFail() {
    final String phraseQuery = buildFunction(phraseFunction, Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, "'cat'", "'rat'", "'dog'");
    final Expression phraseExpression = engine.createExpression(phraseQuery);

    final List<TermWeightPosition> catPositions = asList(2);
    final List<TermWeightPosition> ratPositions = asList(2);
    final List<TermWeightPosition> dogPositions = asList(1);

    // Every term is registered in its own CONTENT zone instance for the same event id
    final Zone catZone = new Zone("CONTENT", true, eventId);
    termOffSetMap.put("cat", new TermFrequencyList(Maps.immutableEntry(catZone, catPositions)));
    final Zone ratZone = new Zone("CONTENT", true, eventId);
    termOffSetMap.put("rat", new TermFrequencyList(Maps.immutableEntry(ratZone, ratPositions)));
    final Zone dogZone = new Zone("CONTENT", true, eventId);
    termOffSetMap.put("dog", new TermFrequencyList(Maps.immutableEntry(dogZone, dogPositions)));

    context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap);
    final Object result = phraseExpression.evaluate(context);
    Assert.assertTrue(expect(result, false));
}
Also used : Expression(org.apache.commons.jexl2.Expression) Zone(datawave.query.jexl.functions.TermFrequencyList.Zone) TermWeightPosition(datawave.ingest.protobuf.TermWeightPosition) Test(org.junit.Test)

Example 4 with TermWeightPosition

use of datawave.ingest.protobuf.TermWeightPosition in project datawave by NationalSecurityAgency.

the class ContentFunctionsTest method testEvaluationPhraseBasic2WithSkips.

/**
 * Evaluates the content phrase function for 'dog' followed by 'cat', with positions built from an offset list plus a second per-offset list (presumably
 * the skip values the test name refers to), and expects the evaluation to report a match.
 */
@Test
public void testEvaluationPhraseBasic2WithSkips() {
    final String phraseQuery = buildFunction(ContentFunctions.CONTENT_PHRASE_FUNCTION_NAME, Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, "'dog'", "'cat'");
    final Expression phraseExpression = engine.createExpression(phraseQuery);

    final List<TermWeightPosition> dogPositions = asList(Arrays.asList(1, 2, 3), Arrays.asList(0, 1, 0));
    final List<TermWeightPosition> catPositions = asList(Arrays.asList(5, 6, 7), Arrays.asList(1, 3, 1));

    // Register both terms under the same zone name and event id
    final Zone dogZone = new Zone("CONTENT", true, eventId);
    termOffSetMap.put("dog", new TermFrequencyList(Maps.immutableEntry(dogZone, dogPositions)));
    final Zone catZone = new Zone("CONTENT", true, eventId);
    termOffSetMap.put("cat", new TermFrequencyList(Maps.immutableEntry(catZone, catPositions)));

    context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap);
    final Object result = phraseExpression.evaluate(context);
    Assert.assertTrue(expect(result, true));
}
Also used : Expression(org.apache.commons.jexl2.Expression) Zone(datawave.query.jexl.functions.TermFrequencyList.Zone) TermWeightPosition(datawave.ingest.protobuf.TermWeightPosition) Test(org.junit.Test)

Example 5 with TermWeightPosition

use of datawave.ingest.protobuf.TermWeightPosition in project datawave by NationalSecurityAgency.

the class ContentFunctionsTest method testEvaluationPhraseThreeTermPass.

/**
 * Evaluates the phrase function for 'cat', 'rat', 'dog' against three offset lists that all contain offset 4, and expects the evaluation to report a
 * match.
 * <p>
 * The locals are named after the term each list is actually registered under in the term offset map, so the labels cannot drift from the {@code put}
 * calls.
 */
@Test
public void testEvaluationPhraseThreeTermPass() {
    String query = buildFunction(phraseFunction, Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, "'cat'", "'rat'", "'dog'");
    Expression expr = engine.createExpression(query);
    // offsets registered under "dog"
    List<TermWeightPosition> dogOffsets = asList(1, 2, 4);
    // offsets registered under "cat"
    List<TermWeightPosition> catOffsets = asList(4, 7, 8, 10);
    // offsets registered under "rat"
    List<TermWeightPosition> ratOffsets = asList(4, 6);
    termOffSetMap.put("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), dogOffsets)));
    termOffSetMap.put("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), catOffsets)));
    termOffSetMap.put("rat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), ratOffsets)));
    context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap);
    Object o = expr.evaluate(context);
    Assert.assertTrue(expect(o, true));
}
Also used : Expression(org.apache.commons.jexl2.Expression) Zone(datawave.query.jexl.functions.TermFrequencyList.Zone) TermWeightPosition(datawave.ingest.protobuf.TermWeightPosition) Test(org.junit.Test)

Aggregations

TermWeightPosition (datawave.ingest.protobuf.TermWeightPosition)69 Zone (datawave.query.jexl.functions.TermFrequencyList.Zone)67 Test (org.junit.Test)67 Expression (org.apache.commons.jexl2.Expression)66 InvalidProtocolBufferException (com.google.protobuf.InvalidProtocolBufferException)1 TermFrequencyIterator (datawave.core.iterators.TermFrequencyIterator)1 TermWeight (datawave.ingest.protobuf.TermWeight)1 Content (datawave.query.attributes.Content)1 Document (datawave.query.attributes.Document)1 TermFrequencyList (datawave.query.jexl.functions.TermFrequencyList)1 IOException (java.io.IOException)1 ArrayList (java.util.ArrayList)1 HashMap (java.util.HashMap)1 NavigableSet (java.util.NavigableSet)1 Key (org.apache.accumulo.core.data.Key)1 Range (org.apache.accumulo.core.data.Range)1