Use of datawave.ingest.protobuf.TermWeightPosition in the datawave project by NationalSecurityAgency.
From the class TermOffsetPopulator, the method getContextMap:
/**
 * Build the TermOffset map for use in JexlEvaluation.
 *
 * Seeks a {@link TermFrequencyIterator} over the supplied keys, parses each TermWeight
 * protobuf value into per-zone {@link TermWeightPosition} offsets, populates this
 * populator's internal {@code document} with the matched terms, and returns a single-entry
 * context map keyed by {@code Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME}.
 *
 * @param docKey
 *            key that maps to a document
 * @param keys
 *            set of keys that map to hits on tf fields
 * @param fields
 *            set of fields to remove from the search space
 * @return a map holding the term offset map under {@code Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME},
 *         or {@code null} if a TermWeight protobuf could not be deserialized
 */
public Map<String, Object> getContextMap(Key docKey, Set<Key> keys, Set<String> fields) {
    document = new Document();

    TermFrequencyIterator tfSource;
    // Do not prune if no fields exist or if the tf fields would prune to nothing. TODO skip tf entirely if this would prune to zero
    if (fields == null || fields.isEmpty() || fields.size() == termFrequencyFieldValues.keySet().size()) {
        tfSource = new TermFrequencyIterator(termFrequencyFieldValues, keys);
    } else {
        // There are fields to remove, reduce the search space and continue
        Multimap<String, String> tfFVs = HashMultimap.create(termFrequencyFieldValues);
        fields.forEach(tfFVs::removeAll);
        // Log the degenerate case before building the iterator so the complaint precedes any use of it
        if (tfFVs.isEmpty()) {
            log.error("Created a TFIter with no field values. Orig fields: " + termFrequencyFieldValues.keySet() + " fields to remove: " + fields);
        }
        tfSource = new TermFrequencyIterator(tfFVs, keys);
    }

    Range range = getRange(keys);
    try {
        tfSource.init(source, null, null);
        // reuse the range computed above rather than rebuilding it
        tfSource.seek(range, null, false);
    } catch (IOException e) {
        log.error("Seek to the range failed: " + range, e);
    }

    // set the document context on the filter
    if (evaluationFilter != null) {
        evaluationFilter.startNewDocument(docKey);
    }

    Map<String, TermFrequencyList> termOffsetMap = Maps.newHashMap();
    while (tfSource.hasTop()) {
        Key key = tfSource.getTopKey();
        FieldValue fv = FieldValue.getFieldValue(key);

        // add the zone and term to our internal document
        // NOTE(review): the attribute is built from source.getTopKey() while the filter keep()
        // check uses the tfSource top key — confirm this asymmetry is intended
        Content attr = new Content(fv.getValue(), source.getTopKey(), evaluationFilter == null || evaluationFilter.keep(key));

        // no need to apply the evaluation filter here as the TermFrequencyIterator above is already doing more filtering than we can do here.
        // So this filter is simply extraneous. However if the an EventDataQueryFilter implementation gets smarter somehow, then it can be added back in
        // here.
        // For example the AncestorQueryLogic may require this....
        // if (evaluationFilter == null || evaluationFilter.apply(Maps.immutableEntry(key, StringUtils.EMPTY_STRING))) {
        this.document.put(fv.getField(), attr);

        TreeMultimap<TermFrequencyList.Zone, TermWeightPosition> offsets = TreeMultimap.create();
        try {
            TermWeight.Info twInfo = TermWeight.Info.parseFrom(tfSource.getTopValue().get());

            // if no content expansion fields then assume every field is permitted for unfielded content functions
            TermFrequencyList.Zone twZone = new TermFrequencyList.Zone(fv.getField(),
                            (contentExpansionFields == null || contentExpansionFields.isEmpty() || contentExpansionFields.contains(fv.getField())),
                            TermFrequencyList.getEventId(key));

            TermWeightPosition.Builder position = new TermWeightPosition.Builder();
            for (int i = 0; i < twInfo.getTermOffsetCount(); i++) {
                position.setTermWeightOffsetInfo(twInfo, i);
                offsets.put(twZone, position.build());
                position.reset();
            }
        } catch (InvalidProtocolBufferException e) {
            log.error("Could not deserialize TermWeight protocol buffer for: " + source.getTopKey());
            return null;
        }

        // First time looking up this term in a field
        TermFrequencyList tfl = termOffsetMap.get(fv.getValue());
        if (null == tfl) {
            termOffsetMap.put(fv.getValue(), new TermFrequencyList(offsets));
        } else {
            // Merge in the offsets for the current field+term with all previous
            // offsets from other fields in the same term
            tfl.addOffsets(offsets);
        }

        try {
            tfSource.next();
        } catch (IOException ioe) {
            log.error("Next failed: " + range, ioe);
            break;
        }
    }

    // Load the actual map into map that will be put into the JexlContext
    Map<String, Object> map = new HashMap<>();
    map.put(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffsetMap);
    return map;
}
Use of datawave.ingest.protobuf.TermWeightPosition in the datawave project by NationalSecurityAgency.
From the class ContentFunctionsTest, the method testEvaluation1:
/**
 * within:1(dog, cat) should evaluate to true: the skip on the second cat offset allows
 * (6 - 2) to match (3 + 1).
 */
@Test
public void testEvaluation1() {
    Expression expr = engine.createExpression(
                    buildFunction(ContentFunctions.CONTENT_WITHIN_FUNCTION_NAME, "1", Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, "'dog'", "'cat'"));

    // match (6-2) should match (3+1)
    List<TermWeightPosition> dogOffsets = asList(Arrays.asList(1, 2, 3), Arrays.asList(0, 0, 0));
    List<TermWeightPosition> catOffsets = asList(Arrays.asList(5, 6, 7), Arrays.asList(0, 2, 0));

    termOffSetMap.put("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), dogOffsets)));
    termOffSetMap.put("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), catOffsets)));
    context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap);

    Assert.assertTrue(expect(expr.evaluate(context), true));
}
Use of datawave.ingest.protobuf.TermWeightPosition in the datawave project by NationalSecurityAgency.
From the class ContentFunctionsTest, the method testEvaluationPhraseTermOverlapFail:
/**
 * phrase(cat, rat, dog) should evaluate to false: cat and rat overlap at the same
 * offset, so no valid ordered phrase exists.
 */
@Test
public void testEvaluationPhraseTermOverlapFail() {
    Expression expr = engine.createExpression(
                    buildFunction(phraseFunction, Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, "'cat'", "'rat'", "'dog'"));

    List<TermWeightPosition> catOffsets = asList(2);
    List<TermWeightPosition> ratOffsets = asList(2);
    List<TermWeightPosition> dogOffsets = asList(1);

    termOffSetMap.put("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), catOffsets)));
    termOffSetMap.put("rat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), ratOffsets)));
    termOffSetMap.put("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), dogOffsets)));
    context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap);

    Assert.assertTrue(expect(expr.evaluate(context), false));
}
Use of datawave.ingest.protobuf.TermWeightPosition in the datawave project by NationalSecurityAgency.
From the class ContentFunctionsTest, the method testEvaluationPhraseBasic2WithSkips:
/**
 * phrase(dog, cat) should evaluate to true when skip values bridge the gap between
 * the dog and cat offset lists.
 */
@Test
public void testEvaluationPhraseBasic2WithSkips() {
    Expression expr = engine.createExpression(
                    buildFunction(ContentFunctions.CONTENT_PHRASE_FUNCTION_NAME, Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, "'dog'", "'cat'"));

    List<TermWeightPosition> dogOffsets = asList(Arrays.asList(1, 2, 3), Arrays.asList(0, 1, 0));
    List<TermWeightPosition> catOffsets = asList(Arrays.asList(5, 6, 7), Arrays.asList(1, 3, 1));

    termOffSetMap.put("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), dogOffsets)));
    termOffSetMap.put("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), catOffsets)));
    context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap);

    Assert.assertTrue(expect(expr.evaluate(context), true));
}
Use of datawave.ingest.protobuf.TermWeightPosition in the datawave project by NationalSecurityAgency.
From the class ContentFunctionsTest, the method testEvaluationPhraseThreeTermPass:
/**
 * phrase(cat, rat, dog) should evaluate to true for the offsets below.
 */
@Test
public void testEvaluationPhraseThreeTermPass() {
String query = buildFunction(phraseFunction, Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, "'cat'", "'rat'", "'dog'");
Expression expr = engine.createExpression(query);
List<TermWeightPosition> list1, list2, list3;
// mapped to "dog" below — NOTE(review): the original comments labeled list1/list2/list3 as
// cat/rat/dog, contradicting the put() calls; the labels here follow the code. Confirm the
// intended term-to-offset pairing against the upstream test.
list1 = asList(1, 2, 4);
// mapped to "cat" below
list2 = asList(4, 7, 8, 10);
// mapped to "rat" below
list3 = asList(4, 6);
termOffSetMap.put("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1)));
termOffSetMap.put("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2)));
termOffSetMap.put("rat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list3)));
context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap);
Object o = expr.evaluate(context);
Assert.assertTrue(expect(o, true));
}
Aggregations