use of nl.inl.blacklab.searches.SearchHits in project BlackLab by INL.
the class HitGroupsTokenFrequencies method get.
/**
* Compute token-frequency groups for the documents selected by a search, without storing any hits.
*
* <p>Documents are gathered with the source's filter query (or a match-all query when there is none). Every token
* in those documents is counted as one hit and assigned to a group determined by the requested grouping property.
* Two code paths exist: when only document properties are grouped on, each document's token count is added to its
* group in one step; otherwise the annotation values are read from the forward index and grouped token by token,
* stopping once maxHitsToProcess (from the search settings) is reached.
*
* <p>NOTE(review): only queryInfo, filter query and search settings of the source are used here; the token pattern
* of the source is never consulted. Presumably callers only take this path for "all tokens" queries - confirm.
*
* @param source search supplying the query info, filter query and search settings
* @param requestedGroupingProperty property to group on; may bundle several properties (unpacked through props())
* @return the groups (as HitGroupWithoutResults) with their hit and document counts
*/
public static HitGroups get(SearchHits source, HitProperty requestedGroupingProperty) {
QueryInfo queryInfo = source.queryInfo();
Query filterQuery = source.getFilterQuery();
SearchSettings searchSettings = source.searchSettings();
try {
/**
* This is where we store our groups while we're computing/gathering them. Maps from group Id to number of hits (left) and number of docs (right)
*/
final ConcurrentHashMap<GroupIdHash, MutablePair<Integer, Integer>> occurances = new ConcurrentHashMap<>();
final BlackLabIndex index = queryInfo.index();
/**
* Document properties that are used in the grouping. (e.g. for query "all tokens, grouped by lemma + document year", will contain DocProperty("document year")
* This is not necessarily limited to just metadata, can also contain any other DocProperties such as document ID, document length, etc.
*/
final List<DocProperty> docProperties = new ArrayList<>();
/**
* Token properties that need to be grouped on, with sensitivity (case-sensitive grouping or not) and Terms
*/
final List<Triple<AnnotationForwardIndex, MatchSensitivity, Terms>> hitProperties = new ArrayList<>();
/**
* Stores the original index every (doc|hit)property has in the original interleaved/intertwined list.
* The requestedGroupingProperty sometimes represents more than one property (in the form of HitPropertyMultiple) such as 3 properties: [token text, document year, token lemma]
* The groups always get an id that is (roughly) the concatenation of the properties (in the example case [token text, document year, token lemma]),
* and it's important this id contains the respective values in the same order.
* We need to keep this list because otherwise we'd potentially change the order.
*
* Integer contains index in the source list (docProperties or hitProperties, from just above)
* Boolean is true when origin list was docProperties, false for hitProperties.
*/
final List<Pair<Integer, Boolean>> originalOrderOfUnpackedProperties = new ArrayList<>();
// Unpack the requestedGroupingProperty into its constituents and sort those into the appropriate categories: hit and doc properties.
{
// A property without sub-properties (props() == null) is treated as a list containing just itself.
List<HitProperty> props = requestedGroupingProperty.props() != null ? requestedGroupingProperty.props() : Arrays.asList(requestedGroupingProperty);
for (HitProperty p : props) {
final DocProperty asDocPropIfApplicable = p.docPropsOnly();
if (asDocPropIfApplicable != null) {
// property can be converted to docProperty (applies to the document instead of the token/hit)
if (DEBUG && asDocPropIfApplicable.props() != null) {
throw new RuntimeException("Nested PropertyMultiples detected, should never happen (when this code was originally written)");
}
final int positionInUnpackedList = docProperties.size();
docProperties.add(asDocPropIfApplicable);
originalOrderOfUnpackedProperties.add(Pair.of(positionInUnpackedList, true));
} else {
// Property couldn't be converted to DocProperty (is null). The current property is an actual HitProperty (applies to annotation/token/hit value)
List<Annotation> annot = p.needsContext();
// This sanity check only runs when DEBUG is set; without it, a null or empty list fails at annot.get(0) below.
if (DEBUG && (annot == null || annot.size() != 1)) {
throw new RuntimeException("Grouping property does not apply to singular annotation (nested propertymultiple? non-annotation grouping?) should never happen.");
}
final int positionInUnpackedList = hitProperties.size();
final AnnotationForwardIndex annotationFI = index.annotationForwardIndex(annot.get(0));
hitProperties.add(Triple.of(annotationFI, p.getSensitivities().get(0), annotationFI.terms()));
originalOrderOfUnpackedProperties.add(Pair.of(positionInUnpackedList, false));
}
}
}
final int numAnnotations = hitProperties.size();
long numberOfDocsProcessed;
// Shared between the worker threads of the parallel streams below, hence the atomic types.
final AtomicInteger numberOfHitsProcessed = new AtomicInteger();
final AtomicBoolean hitMaxHitsToProcess = new AtomicBoolean(false);
try (final BlockTimer c = BlockTimer.create("Top Level")) {
final List<Integer> docIds = new ArrayList<>();
try (BlockTimer d = c.child("Gathering documents")) {
// Collect the ids of all documents matching the filter (or of all documents when there is no filter).
queryInfo.index().searcher().search(filterQuery == null ? new MatchAllDocsQuery() : filterQuery, new SimpleCollector() {
private int docBase;
@Override
protected void doSetNextReader(LeafReaderContext context) throws IOException {
docBase = context.docBase;
super.doSetNextReader(context);
}
@Override
public void collect(int docId) throws IOException {
// docId is relative to the current leaf reader; add the leaf's docBase to get the index-wide id
int globalDocId = docId + docBase;
docIds.add(globalDocId);
}
@Override
public boolean needsScores() {
return false;
}
});
}
// Initial value; the token-level path below overwrites this with the number of documents that actually contributed tokens.
numberOfDocsProcessed = docIds.size();
final IndexReader reader = queryInfo.index().reader();
// Passed as both start and end to retrievePartsInt below; presumably -1 means "whole document" - TODO confirm
final int[] minusOne = new int[] { -1 };
// Metadata-only path: there are no token-level properties to group on, so instead of visiting every token,
// for every document just retrieve how many tokens it contains (from its metadata), and add that count to the appropriate group
if (hitProperties.isEmpty()) {
try (BlockTimer f = c.child("Grouping documents (metadata only path)")) {
String fieldName = index.mainAnnotatedField().name();
DocPropertyAnnotatedFieldLength propTokens = new DocPropertyAnnotatedFieldLength(index, fieldName);
final int[] emptyTokenValuesArray = new int[0];
docIds.parallelStream().forEach(docId -> {
// ignore "extra closing token" (subtractClosingToken is defined outside this method)
final int docLength = (int) propTokens.get(docId) - subtractClosingToken;
final DocResult synthesizedDocResult = DocResult.fromDoc(queryInfo, new PropertyValueDoc(new DocImpl(queryInfo.index(), docId)), 0, docLength);
final PropertyValue[] metadataValuesForGroup = new PropertyValue[docProperties.size()];
for (int i = 0; i < docProperties.size(); ++i) {
metadataValuesForGroup[i] = docProperties.get(i).get(synthesizedDocResult);
}
// precompute, it's the same for all hits in document
final int metadataValuesHash = Arrays.hashCode(metadataValuesForGroup);
// Note: this path adds whole documents at once and does not check maxHitsToProcess.
numberOfHitsProcessed.addAndGet(docLength);
// Add all tokens in document to the group.
final GroupIdHash groupId = new GroupIdHash(emptyTokenValuesArray, emptyTokenValuesArray, metadataValuesForGroup, metadataValuesHash);
// compute() updates the pair for this key atomically, so concurrent documents landing in the same group don't lose updates
occurances.compute(groupId, (__, groupSizes) -> {
if (groupSizes != null) {
groupSizes.left += docLength;
groupSizes.right += 1;
return groupSizes;
} else {
return MutablePair.of(docLength, 1);
}
});
});
}
} else {
// Token-level path: at least one annotation is grouped on, so every token has to be looked at.
// A non-positive maxHitsToProcess setting is treated as "no limit".
final int maxHitsToProcess = searchSettings.maxHitsToProcess() > 0 ? searchSettings.maxHitsToProcess() : Integer.MAX_VALUE;
// Increments the counter unless it already reached the maximum, so the counter never exceeds maxHitsToProcess
final IntUnaryOperator incrementUntilMax = (v) -> v < maxHitsToProcess ? v + 1 : v;
final String fieldName = index.mainAnnotatedField().name();
final String lengthTokensFieldName = AnnotatedFieldNameUtil.lengthTokensField(fieldName);
// The filter lambda does the actual grouping as a side effect; its boolean result only decides
// whether the document is included in the count of processed documents.
numberOfDocsProcessed = docIds.parallelStream().filter(docId -> {
try {
// Step 1: read all values for the to-be-grouped annotations for this document
// This will create one int[] for every annotation, containing ids that map to the values for this document for this annotation
final Document doc = reader.document(docId);
final List<int[]> tokenValuesPerAnnotation = new ArrayList<>();
try (BlockTimer e = c.child("Read annotations from forward index")) {
for (Triple<AnnotationForwardIndex, MatchSensitivity, Terms> annot : hitProperties) {
final String annotationFIName = annot.getLeft().annotation().forwardIndexIdField();
final int fiid = doc.getField(annotationFIName).numericValue().intValue();
// presumably yields exactly one int[] per call: the code below indexes tokenValuesPerAnnotation by annotation index - TODO confirm
final List<int[]> tokenValues = annot.getLeft().retrievePartsInt(fiid, minusOne, minusOne);
tokenValuesPerAnnotation.addAll(tokenValues);
}
}
// Step 2: retrieve the to-be-grouped metadata for this document
// ignore "extra closing token"
int docLength = Integer.parseInt(doc.get(lengthTokensFieldName)) - subtractClosingToken;
final DocResult synthesizedDocResult = DocResult.fromDoc(queryInfo, new PropertyValueDoc(new DocImpl(queryInfo.index(), docId)), 0, docLength);
// null when no document properties take part in the grouping; the loop below then doesn't execute
final PropertyValue[] metadataValuesForGroup = !docProperties.isEmpty() ? new PropertyValue[docProperties.size()] : null;
for (int i = 0; i < docProperties.size(); ++i) {
metadataValuesForGroup[i] = docProperties.get(i).get(synthesizedDocResult);
}
// precompute, it's the same for all hits in document (Arrays.hashCode yields 0 for a null array)
final int metadataValuesHash = Arrays.hashCode(metadataValuesForGroup);
// now we have all values for all relevant annotations for this document
// iterate again and pair up the nth entries for all annotations, then store that as a group.
/**
* Bookkeeping: track which groups we've already seen in this document,
* so we only count this document once per group
*/
HashSet<GroupIdHash> groupsInThisDocument = new HashSet<>();
try (BlockTimer f = c.child("Group tokens")) {
for (int tokenIndex = 0; tokenIndex < docLength; ++tokenIndex) {
// getAndUpdate returns the value before the update: once that has reached the maximum, stop processing this document
if (numberOfHitsProcessed.getAndUpdate(incrementUntilMax) >= maxHitsToProcess) {
hitMaxHitsToProcess.set(true);
// true if any token of this document made the cut, false if we escaped immediately
return tokenIndex > 0;
}
// Unfortunate fact: token ids are case-sensitive, and in order to group on a token's values case and diacritics insensitively,
// we need to actually group by their "sort positions" - which is just the index the term would have if all terms would have been sorted
// so in essence it's also an "id", but a case-insensitive one.
// we could further optimize to not do this step when grouping sensitively by making a specialized instance of the GroupIdHash class
// that hashes the token ids instead of the sortpositions in that case.
int[] annotationValuesForThisToken = new int[numAnnotations];
int[] sortPositions = new int[annotationValuesForThisToken.length];
for (int annotationIndex = 0; annotationIndex < numAnnotations; ++annotationIndex) {
int[] tokenValuesThisAnnotation = tokenValuesPerAnnotation.get(annotationIndex);
final int termId = annotationValuesForThisToken[annotationIndex] = tokenValuesThisAnnotation[tokenIndex];
Triple<AnnotationForwardIndex, MatchSensitivity, Terms> currentHitProp = hitProperties.get(annotationIndex);
MatchSensitivity matchSensitivity = currentHitProp.getMiddle();
Terms terms = currentHitProp.getRight();
sortPositions[annotationIndex] = terms.idToSortPosition(termId, matchSensitivity);
}
final GroupIdHash groupId = new GroupIdHash(annotationValuesForThisToken, sortPositions, metadataValuesForGroup, metadataValuesHash);
occurances.compute(groupId, (__, groupSize) -> {
if (groupSize != null) {
groupSize.left += 1;
// add() returns true only the first time this group is seen in this document,
// so the document count of the group goes up at most once per document
groupSize.right += groupsInThisDocument.add(groupId) ? 1 : 0;
return groupSize;
} else {
// should always return true, but we need to add this group anyway!
return MutablePair.of(1, groupsInThisDocument.add(groupId) ? 1 : 0);
}
});
}
}
} catch (IOException e) {
throw BlackLabRuntimeException.wrap(e);
}
return true;
}).count();
logger.trace("Number of processed docs: " + numberOfDocsProcessed);
}
}
// Only allocated in DEBUG mode: used below to verify that no two map entries resolve to the same group identity.
Set<PropertyValue> duplicateGroupsDebug = DEBUG ? new HashSet<PropertyValue>() : null;
List<HitGroup> groups;
try (final BlockTimer c = BlockTimer.create("Resolve string values for tokens")) {
final int numMetadataValues = docProperties.size();
// Turn every (GroupIdHash -> counts) entry into a HitGroup carrying a proper PropertyValue identity.
groups = occurances.entrySet().parallelStream().map(e -> {
final int groupSizeHits = e.getValue().getLeft();
final int groupSizeDocs = e.getValue().getRight();
final int[] annotationValues = e.getKey().tokenIds;
final PropertyValue[] metadataValues = e.getKey().metadataValues;
// allocate new - is not copied when moving into propertyvaluemultiple
final PropertyValue[] groupIdAsList = new PropertyValue[numAnnotations + numMetadataValues];
// Convert all raw values (integers) into their appropriate PropertyValues
// Taking care to preserve the order of the resultant PropertyValues with the order of the input HitProperties
int indexInOutput = 0;
for (Pair<Integer, Boolean> p : originalOrderOfUnpackedProperties) {
final int indexInInput = p.getLeft();
if (p.getRight()) {
// is docprop, add PropertyValue as-is
groupIdAsList[indexInOutput++] = metadataValues[indexInInput];
} else {
// is hitprop, convert value to PropertyValue.
Annotation annot = hitProperties.get(indexInInput).getLeft().annotation();
MatchSensitivity sens = hitProperties.get(indexInInput).getMiddle();
groupIdAsList[indexInOutput++] = new PropertyValueContextWords(index, annot, sens, new int[] { annotationValues[indexInInput] }, false);
}
}
// A single grouping property yields its bare value; several are wrapped in a PropertyValueMultiple.
PropertyValue groupId = groupIdAsList.length > 1 ? new PropertyValueMultiple(groupIdAsList) : groupIdAsList[0];
if (DEBUG) {
// The set is a plain HashSet shared by the parallel stream, so access is synchronized.
synchronized (duplicateGroupsDebug) {
if (!duplicateGroupsDebug.add(groupId)) {
throw new RuntimeException("Identical groups - should never happen");
}
}
}
return new HitGroupWithoutResults(queryInfo, groupId, groupSizeHits, groupSizeDocs, false, false);
}).collect(Collectors.toList());
}
logger.debug("fast path used for grouping");
// The same processed count is passed for both count arguments; both MaxStats flags reflect whether maxHitsToProcess was reached.
ResultsStats hitsStats = new ResultsStatsStatic(numberOfHitsProcessed.get(), numberOfHitsProcessed.get(), new MaxStats(hitMaxHitsToProcess.get(), hitMaxHitsToProcess.get()));
ResultsStats docsStats = new ResultsStatsStatic((int) numberOfDocsProcessed, (int) numberOfDocsProcessed, new MaxStats(hitMaxHitsToProcess.get(), hitMaxHitsToProcess.get()));
return HitGroups.fromList(queryInfo, groups, requestedGroupingProperty, null, null, hitsStats, docsStats);
} catch (IOException e) {
throw BlackLabRuntimeException.wrap(e);
}
}
use of nl.inl.blacklab.searches.SearchHits in project BlackLab by INL.
the class RequestHandlerHits method getQueryForHitsInSpecificGroupOnly.
/**
 * Narrow the regular hits query from the search parameters so that it yields only the hits
 * belonging to the group identified by the given value.
 *
 * <p>Token-level group criteria are merged into the text pattern; document-level criteria are
 * added as filter clauses. When any criterion cannot be merged this way, null is returned so the
 * caller can fall back to another strategy.
 *
 * @param viewGroupVal identity of the requested group (a single value or a PropertyValueMultiple)
 * @param groupByProp property the hits were grouped by (a single property or a HitPropertyMultiple)
 * @param hitsGrouped the grouped results; sampled results are never narrowed
 *
 * @return the SearchHits that will yield the hits, or null if the search could not be reconstructed.
 * @throws BlsException
 * @throws InvalidQuery
 */
private SearchHits getQueryForHitsInSpecificGroupOnly(PropertyValue viewGroupVal, HitProperty groupByProp, HitGroups hitsGrouped) throws BlsException, InvalidQuery {
    // Sampled results can't be reproduced by narrowing the query.
    if (hitsGrouped.isSample()) {
        return null;
    }

    // Only queries matching single tokens can be narrowed (multi-token queries such as ngrams are not supported yet).
    TextPattern narrowedPattern = searchParam.getPattern();
    boolean matchesSingleTokens = narrowedPattern.toQuery(QueryInfo.create(blIndex())).producesSingleTokens();
    if (!matchesSingleTokens) {
        return null;
    }

    // Metadata criteria go into this filter; token criteria are AND-ed onto the pattern above.
    BooleanQuery.Builder filterBuilder = new BooleanQuery.Builder();
    boolean filterHasClauses = false;
    if (searchParam.getFilterQuery() != null) {
        filterBuilder.add(searchParam.getFilterQuery(), Occur.FILTER);
        filterHasClauses = true;
    }

    // Unpack the group identity and the grouping property into parallel lists.
    List<PropertyValue> groupValues;
    if (viewGroupVal instanceof PropertyValueMultiple) {
        groupValues = ((PropertyValueMultiple) viewGroupVal).values();
    } else {
        groupValues = Arrays.asList(viewGroupVal);
    }
    List<HitProperty> groupProps;
    if (groupByProp instanceof HitPropertyMultiple) {
        groupProps = ((HitPropertyMultiple) groupByProp).props();
    } else {
        groupProps = Arrays.asList(groupByProp);
    }

    for (int position = 0; position < groupProps.size(); ++position) {
        HitProperty groupProp = groupProps.get(position);
        if (groupProp instanceof HitPropertyHitText) {
            // Token criterion: require the annotation to have the group's value.
            String requiredTerm = groupValues.get(position).toString();
            HitPropertyHitText hitTextProp = (HitPropertyHitText) groupProp;
            Annotation annotation = hitTextProp.needsContext().get(0);
            MatchSensitivity sensitivity = hitTextProp.getSensitivities().get(0);
            String annotationName = annotation.name();
            TextPattern termClause = new TextPatternAnnotation(annotationName,
                    new TextPatternSensitive(sensitivity, new TextPatternTerm(requiredTerm)));
            narrowedPattern = new TextPatternAnd(narrowedPattern, termClause);
        } else if (groupProp instanceof HitPropertyDoc || groupProp instanceof HitPropertyDocumentId) {
            // Document criterion: restrict to one specific document.
            Object rawValue = groupValues.get(position).value();
            int luceneDocId;
            if (rawValue instanceof Doc) {
                luceneDocId = ((Doc) rawValue).id();
            } else {
                luceneDocId = BlsUtils.getDocIdFromPid(blIndex(), (String) rawValue);
            }
            filterBuilder.add(new SingleDocIdFilter(luceneDocId), Occur.FILTER);
            filterHasClauses = true;
        } else if (groupProp instanceof HitPropertyDocumentStoredField) {
            // Metadata criterion: restrict to documents with the group's field value.
            String storedFieldName = ((HitPropertyDocumentStoredField) groupProp).fieldName();
            String storedFieldValue = (String) groupValues.get(position).value();
            filterBuilder.add(new DocValuesTermsQuery(storedFieldName, storedFieldValue), Occur.FILTER);
            filterHasClauses = true;
        } else {
            logger.debug("Cannot merge group specifier into query: {} with value {}", groupProp, groupValues.get(position));
            return null;
        }
    }

    // Every criterion was merged: build the search that retrieves the hits.
    SearchEmpty emptySearch = blIndex().search(blIndex().mainAnnotatedField(), searchParam.getUseCache(), searchLogger);
    QueryInfo queryInfo = QueryInfo.create(blIndex(), blIndex().mainAnnotatedField());
    BLSpanQuery spanQuery;
    if (filterHasClauses) {
        spanQuery = narrowedPattern.toQuery(queryInfo, filterBuilder.build());
    } else {
        spanQuery = narrowedPattern.toQuery(queryInfo);
    }

    SearchHits result = emptySearch.find(spanQuery, searchParam.getSearchSettings());
    if (searchParam.hitsSortSettings() != null) {
        result = result.sort(searchParam.hitsSortSettings().sortBy());
    }
    if (searchParam.getSampleSettings() != null) {
        result = result.sample(searchParam.getSampleSettings());
    }
    return result;
}
use of nl.inl.blacklab.searches.SearchHits in project BlackLab by INL.
the class QueryTool method parseAndExecuteQuery.
/**
 * Parse and execute a query in the current query format.
 *
 * <p>Before parsing, every {@code @@name} placeholder in the query is replaced by a randomly
 * chosen entry of the word list called {@code name}. The chosen word is inserted literally.
 * If a referenced word list is missing or empty, an error is printed and nothing is executed.
 *
 * <p>On success the resulting hits replace the current results and the first page is shown.
 * Invalid or unsupported queries are reported on the error output.
 *
 * @param query the query
 */
private void parseAndExecuteQuery(String query) {
    Timer t = new Timer();
    try {
        // See if we want to choose any random words
        if (query.contains("@@")) {
            StringBuffer resultString = new StringBuffer();
            Pattern regex = Pattern.compile("@@[A-Za-z0-9_\\-]+");
            Matcher regexMatcher = regex.matcher(query);
            while (regexMatcher.find()) {
                // Strip the leading "@@" to get the name of the word list
                String wordListName = regexMatcher.group().substring(2);
                List<String> list = wordLists.get(wordListName);
                if (list == null) {
                    errprintln("Word list '" + wordListName + "' not found!");
                    return;
                }
                if (list.isEmpty()) {
                    // Nothing to pick from; list.get(0) would throw
                    errprintln("Word list '" + wordListName + "' is empty!");
                    return;
                }
                int randomIndex = (int) (Math.random() * list.size());
                // Quote the word: appendReplacement treats '$' and '\' in the replacement as
                // group references / escapes, which would corrupt the query or throw.
                regexMatcher.appendReplacement(resultString, Matcher.quoteReplacement(list.get(randomIndex)));
            }
            regexMatcher.appendTail(resultString);
            query = resultString.toString();
        }
        Parser parser = parsers.get(currentParserIndex);
        TextPattern pattern = parser.parse(query);
        if (pattern == null) {
            errprintln("No query to execute.");
            return;
        }
        if (verbose)
            outprintln("TextPattern: " + pattern.toString());
        // If the query included filter clauses, use those. Otherwise use the global filter, if any.
        Query filterForThisQuery = parser.getIncludedFilterQuery();
        if (filterForThisQuery == null)
            filterForThisQuery = filterQuery;
        // Execute search
        BLSpanQuery spanQuery = pattern.toQuery(QueryInfo.create(index, contentsField), filterForThisQuery);
        if (verbose)
            outprintln("SpanQuery: " + spanQuery.toString(contentsField.name()));
        SearchHits search = index.search().find(spanQuery);
        hits = search.execute();
        // Reset everything derived from the previous query
        docs = null;
        groups = null;
        sortedHits = null;
        collocations = null;
        showWhichGroup = -1;
        showSetting = ShowSetting.HITS;
        firstResult = 0;
        showResultsPage();
        reportTime(t.elapsed());
        if (determineTotalNumberOfHits)
            statInfo = Integer.toString(hits.size());
        else
            statInfo = "?";
        commandWasQuery = true;
    } catch (InvalidQuery e) {
        // Parse error
        errprintln(e.getMessage());
        errprintln("(Type 'help' for examples or see accompanying documents)");
    } catch (UnsupportedOperationException e) {
        // Unimplemented part of query language used
        // DEBUG createWeight bug
        e.printStackTrace();
        errprintln("Cannot execute query; " + e.getMessage());
        errprintln("(Type 'help' for examples or see accompanying documents)");
    }
}
use of nl.inl.blacklab.searches.SearchHits in project BlackLab by INL.
the class RequestHandlerHits method getHitsFromGroup.
/**
 * Retrieve the hits belonging to one group of the grouped hits for the current request.
 *
 * <p>If the group has stored hits, those are used. Otherwise a narrowed search for just this
 * group is attempted; if that is not possible, all hits are grouped again with their hits stored
 * and the requested group is looked up in that result.
 *
 * @param groupBy serialized grouping property
 * @param viewGroup serialized identity of the group to view
 * @return the cache entry of the search that was run (the count of the narrowed search, or the
 *         grouping search) together with the hits of the group
 * @throws BadRequest (a BlsException) if the group value cannot be deserialized or no such group exists
 */
private Pair<BlsCacheEntry<?>, Hits> getHitsFromGroup(String groupBy, String viewGroup) throws InterruptedException, ExecutionException, InvalidQuery, BlsException {
    PropertyValue viewGroupVal = PropertyValue.deserialize(blIndex(), blIndex().mainAnnotatedField(), viewGroup);
    if (viewGroupVal == null)
        throw new BadRequest("ERROR_IN_GROUP_VALUE", "Cannot deserialize group value: " + viewGroup);
    BlsCacheEntry<HitGroups> jobHitGroups = (BlsCacheEntry<HitGroups>) searchParam.hitsGroupedStats().executeAsync();
    HitGroups hitGroups = jobHitGroups.get();
    HitGroup group = hitGroups.get(viewGroupVal);
    if (group == null)
        throw new BadRequest("GROUP_NOT_FOUND", "Group not found: " + viewGroup);
    Hits hits;
    // Only launch a separate search when there are ZERO hits stored in the group
    Hits storedHits = group.storedResults();
    if (storedHits.size() > 0) {
        // Some hits available: return those.
        hits = storedHits;
    } else {
        // No results were actually stored. Fire a separate query to retrieve them.
        HitProperty groupByProp = HitProperty.deserialize(blIndex(), blIndex().mainAnnotatedField(), groupBy);
        SearchHits findHitsFromOnlyRequestedGroup = getQueryForHitsInSpecificGroupOnly(viewGroupVal, groupByProp, hitGroups);
        if (findHitsFromOnlyRequestedGroup != null) {
            // place the group-contents query in the cache and return the results.
            // (sorting, if requested, is already part of the narrowed search)
            BlsCacheEntry<ResultCount> cacheEntry = (BlsCacheEntry<ResultCount>) findHitsFromOnlyRequestedGroup.count().executeAsync();
            hits = ((BlsCacheEntry<Hits>) findHitsFromOnlyRequestedGroup.executeAsync()).get();
            return Pair.of(cacheEntry, hits);
        }
        // This is a special case:
        // Since the group we got from the cached results didn't contain the hits, we need to get the hits from their original query
        // and then group them here (using a different code path, since the normal code path doesn't always store the hits due to performance).
        // And, since retrieving just the hits for one group couldn't be done (findHitsFromOnlyRequestedGroup == null), we need to unfortunately get all hits.
        SearchHitGroupsFromHits searchGroups = (SearchHitGroupsFromHits) searchParam.hitsSample().groupWithStoredHits(groupByProp, Results.NO_LIMIT);
        // now run the separate grouping search, this time storing the hits in the groups.
        // Sorting of the resultant groups is not applied, but is also not required because the groups aren't shown, only their contents.
        // If a later query requests the groups in a sorted order, the cache will ensure these results become the input to that query anyway, so worst case we just deferred the work.
        // place groups with hits in search cache
        jobHitGroups = (BlsCacheEntry<HitGroups>) searchGroups.executeAsync();
        HitGroup groupWithHits = jobHitGroups.get().get(viewGroupVal);
        // The regrouped results may not contain the group; report that the same way as above instead of failing with a NullPointerException.
        if (groupWithHits == null)
            throw new BadRequest("GROUP_NOT_FOUND", "Group not found: " + viewGroup);
        hits = groupWithHits.storedResults();
    }
    // NOTE: sortBy is automatically applied to regular results, but not to results within groups
    // See ResultsGrouper::init (uses hits.getByOriginalOrder(i)) and DocResults::constructor
    // Also see SearchParams (hitsSortSettings, docSortSettings, hitGroupsSortSettings, docGroupsSortSettings)
    // There is probably no reason why we can't just sort/use the sort of the input results, but we need some more testing to see if everything is correct if we change this
    String sortBy = searchParam.getString("sort");
    HitProperty sortProp = (sortBy != null && !sortBy.isEmpty()) ? HitProperty.deserialize(hits, sortBy) : null;
    if (sortProp != null)
        hits = hits.sort(sortProp);
    return Pair.of(jobHitGroups, hits);
}
Aggregations