Example 1 with BlockTimer

Use of nl.inl.util.BlockTimer in project BlackLab by INL.

In class HitGroupsTokenFrequencies, method get.

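/**
 * Get the token frequencies for the given query and hit property.
 *
 * @param source query to find token frequencies for
 * @param requestedGroupingProperty the property (possibly a combination of several hit and document properties) to group tokens by
 * @return token frequencies
 */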
// Builds HitGroups by counting tokens directly, document by document, for every document that
// matches the filter query of the source search (or all documents when there is no filter).
// Groups are keyed on the annotation values and/or document properties unpacked from
// requestedGroupingProperty; each group tracks a hit (token) count and a document count.
// NOTE(review): this listing has visibly lost lines (closing braces, comment delimiters and
// some statements). The notes below mark the places where that can be seen; confirm against
// the real source file before relying on them.
public static HitGroups get(SearchHits source, HitProperty requestedGroupingProperty) {
    QueryInfo queryInfo = source.queryInfo();
    Query filterQuery = source.getFilterQuery();
    SearchSettings searchSettings = source.searchSettings();
    try {
         * This is where we store our groups while we're computing/gathering them. Maps from group Id to number of hits (left) and number of docs (right)
        final ConcurrentHashMap<GroupIdHash, MutablePair<Integer, Integer>> occurances = new ConcurrentHashMap<>();
        final BlackLabIndex index = queryInfo.index();
         * Document properties that are used in the grouping. (e.g. for query "all tokens, grouped by lemma + document year", will contain DocProperty("document year")
         * This is not necessarily limited to just metadata, can also contain any other DocProperties such as document ID, document length, etc.
        final List<DocProperty> docProperties = new ArrayList<>();
         * Token properties that need to be grouped on, with sensitivity (case-sensitive grouping or not) and Terms
        final List<Triple<AnnotationForwardIndex, MatchSensitivity, Terms>> hitProperties = new ArrayList<>();
         * Stores the original index every (doc|hit)property has in the original interleaved/intertwined list.
         * The requestedGroupingProperty sometimes represents more than one property (in the form of HitPropertyMultiple) such as 3 properties: [token text, document year, token lemma]
         * The groups always get an id that is (roughly) the concatenation of the properties (in the example case [token text, document year, token lemma]),
         * and it's important this id contains the respective values in the same order.
         * We need to keep this list because otherwise we'd potentially change the order.
         * Integer contains index in the source list (docProperties or hitProperties, from just above)
         * Boolean is true when origin list was docProperties, false for hitProperties.
        final List<Pair<Integer, Boolean>> originalOrderOfUnpackedProperties = new ArrayList<>();
        // Unpack the requestedGroupingProperty into its constituents and sort those into the appropriate categories: hit and doc properties.
            // A property without sub-properties (props() returns null) is treated as a one-element list.
            List<HitProperty> props = requestedGroupingProperty.props() != null ? requestedGroupingProperty.props() : Arrays.asList(requestedGroupingProperty);
            for (HitProperty p : props) {
                final DocProperty asDocPropIfApplicable = p.docPropsOnly();
                if (asDocPropIfApplicable != null) {
                    // property can be converted to docProperty (applies to the document instead of the token/hit)
                    if (DEBUG && asDocPropIfApplicable.props() != null) {
                        throw new RuntimeException("Nested PropertyMultiples detected, should never happen (when this code was originally written)");
                    // NOTE(review): the position is recorded here, but the visible code never adds
                    // asDocPropIfApplicable to docProperties; presumably that statement was lost.
                    final int positionInUnpackedList = docProperties.size();
                    originalOrderOfUnpackedProperties.add(Pair.of(positionInUnpackedList, true));
                } else {
                    // Property couldn't be converted to DocProperty (is null). The current property is an actual HitProperty (applies to annotation/token/hit value)
                    List<Annotation> annot = p.needsContext();
                    if (DEBUG && (annot == null || annot.size() != 1)) {
                        throw new RuntimeException("Grouping property does not apply to singular annotation (nested propertymultiple? non-annotation grouping?) should never happen.");
                    final int positionInUnpackedList = hitProperties.size();
                    // Only the first annotation and the first sensitivity of the property are used.
                    final AnnotationForwardIndex annotationFI = index.annotationForwardIndex(annot.get(0));
                    hitProperties.add(Triple.of(annotationFI, p.getSensitivities().get(0), annotationFI.terms()));
                    originalOrderOfUnpackedProperties.add(Pair.of(positionInUnpackedList, false));
        final int numAnnotations = hitProperties.size();
        long numberOfDocsProcessed;
        // Shared across the parallel stream below, hence atomic.
        final AtomicInteger numberOfHitsProcessed = new AtomicInteger();
        // NOTE(review): never set to true in the visible code; it is only read when building the stats at the end.
        final AtomicBoolean hitMaxHitsToProcess = new AtomicBoolean(false);
        try (final BlockTimer c = BlockTimer.create("Top Level")) {
            final List<Integer> docIds = new ArrayList<>();
            try (BlockTimer d = c.child("Gathering documents")) {
                // Run the filter query (or match-all when there is none) with a collector that does not need scores.
                queryInfo.index().searcher().search(filterQuery == null ? new MatchAllDocsQuery() : filterQuery, new SimpleCollector() {

                    // Doc id offset of the segment currently being collected.
                    private int docBase;

                    protected void doSetNextReader(LeafReaderContext context) throws IOException {
                        docBase = context.docBase;

                    public void collect(int docId) throws IOException {
                        // Convert the segment-relative id into an index-wide id.
                        // NOTE(review): the visible code never stores globalDocId into docIds; presumably that line was lost.
                        int globalDocId = docId + docBase;

                    public boolean needsScores() {
                        return false;
            numberOfDocsProcessed = docIds.size();
            final IndexReader reader = queryInfo.index().reader();
            // Passed below as both the start and the end argument of retrievePartsInt.
            // Presumably -1 means the whole document; confirm against AnnotationForwardIndex.
            final int[] minusOne = new int[] { -1 };
            // What we do instead is for every document just retrieve how many tokens it contains (from its metadata), and add that count to the appropriate group
            if (hitProperties.isEmpty()) {
                // Grouping on document properties only: no per-token values are needed.
                try (BlockTimer f = c.child("Grouping documents (metadata only path)")) {
                    String fieldName = index.mainAnnotatedField().name();
                    DocPropertyAnnotatedFieldLength propTokens = new DocPropertyAnnotatedFieldLength(index, fieldName);
                    final int[] emptyTokenValuesArray = new int[0];
                    docIds.parallelStream().forEach(docId -> {
                        // ignore "extra closing token"
                        // NOTE(review): subtractClosingToken is not declared anywhere in the visible code.
                        final int docLength = (int) propTokens.get(docId) - subtractClosingToken;
                        final DocResult synthesizedDocResult = DocResult.fromDoc(queryInfo, new PropertyValueDoc(new DocImpl(queryInfo.index(), docId)), 0, docLength);
                        final PropertyValue[] metadataValuesForGroup = new PropertyValue[docProperties.size()];
                        for (int i = 0; i < docProperties.size(); ++i) {
                            metadataValuesForGroup[i] = docProperties.get(i).get(synthesizedDocResult);
                        // precompute, it's the same for all hits in document
                        final int metadataValuesHash = Arrays.hashCode(metadataValuesForGroup);
                        // Add all tokens in document to the group.
                        final GroupIdHash groupId = new GroupIdHash(emptyTokenValuesArray, emptyTokenValuesArray, metadataValuesForGroup, metadataValuesHash);
                        // ConcurrentHashMap.compute applies the function atomically per key, so the
                        // in-place update of the pair is safe against other threads hitting the same group.
                        occurances.compute(groupId, (__, groupSizes) -> {
                            if (groupSizes != null) {
                                groupSizes.left += docLength;
                                groupSizes.right += 1;
                                return groupSizes;
                            } else {
                                return MutablePair.of(docLength, 1);
            } else {
                // A non-positive setting is treated as unlimited.
                final int maxHitsToProcess = searchSettings.maxHitsToProcess() > 0 ? searchSettings.maxHitsToProcess() : Integer.MAX_VALUE;
                // Saturating increment: the counter stops growing once it reaches the maximum.
                final IntUnaryOperator incrementUntilMax = (v) -> v < maxHitsToProcess ? v + 1 : v;
                final String fieldName = index.mainAnnotatedField().name();
                final String lengthTokensFieldName = AnnotatedFieldNameUtil.lengthTokensField(fieldName);
                // The filter lambda does the grouping as a side effect; its boolean result says
                // whether the document contributed at least one token.
                // NOTE(review): the terminal operation that turns this stream into a number is not visible here.
                numberOfDocsProcessed = docIds.parallelStream().filter(docId -> {
                    try {
                        // Step 1: read all values for the to-be-grouped annotations for this document
                        // This will create one int[] for every annotation, containing ids that map to the values for this document for this annotation
                        final Document doc = reader.document(docId);
                        final List<int[]> tokenValuesPerAnnotation = new ArrayList<>();
                        try (BlockTimer e = c.child("Read annotations from forward index")) {
                            for (Triple<AnnotationForwardIndex, MatchSensitivity, Terms> annot : hitProperties) {
                                final String annotationFIName = annot.getLeft().annotation().forwardIndexIdField();
                                final int fiid = doc.getField(annotationFIName).numericValue().intValue();
                                // NOTE(review): tokenValues is never added to tokenValuesPerAnnotation in the visible code; presumably lost.
                                final List<int[]> tokenValues = annot.getLeft().retrievePartsInt(fiid, minusOne, minusOne);
                        // Step 2: retrieve the to-be-grouped metadata for this document
                        // ignore "extra closing token"
                        int docLength = Integer.parseInt(doc.get(lengthTokensFieldName)) - subtractClosingToken;
                        final DocResult synthesizedDocResult = DocResult.fromDoc(queryInfo, new PropertyValueDoc(new DocImpl(queryInfo.index(), docId)), 0, docLength);
                        // null when there are no document properties; the loop below then does not execute
                        // and Arrays.hashCode returns 0 for a null array.
                        final PropertyValue[] metadataValuesForGroup = !docProperties.isEmpty() ? new PropertyValue[docProperties.size()] : null;
                        for (int i = 0; i < docProperties.size(); ++i) {
                            metadataValuesForGroup[i] = docProperties.get(i).get(synthesizedDocResult);
                        // precompute, it's the same for all hits in document
                        final int metadataValuesHash = Arrays.hashCode(metadataValuesForGroup);
                        // now we have all values for all relevant annotations for this document
                        // iterate again and pair up the nth entries for all annotations, then store that as a group.
                         * Bookkeeping: track which groups we've already seen in this document,
                         * so we only count this document once per group
                        HashSet<GroupIdHash> groupsInThisDocument = new HashSet<>();
                        try (BlockTimer f = c.child("Group tokens")) {
                            for (int tokenIndex = 0; tokenIndex < docLength; ++tokenIndex) {
                                // getAndUpdate returns the value before the increment, so this fires
                                // once the shared counter has already reached the maximum.
                                if (numberOfHitsProcessed.getAndUpdate(incrementUntilMax) >= maxHitsToProcess) {
                                    // true if any token of this document made the cut, false if we escaped immediately
                                    return tokenIndex > 0;
                                // Unfortunate fact: token ids are case-sensitive, and in order to group on a token's values case and diacritics insensitively,
                                // we need to actually group by their "sort positions" - which is just the index the term would have if all terms would have been sorted
                                // so in essence it's also an "id", but a case-insensitive one.
                                // we could further optimize to not do this step when grouping sensitively by making a specialized instance of the GroupIdHash class
                                // that hashes the token ids instead of the sortpositions in that case.
                                int[] annotationValuesForThisToken = new int[numAnnotations];
                                int[] sortPositions = new int[annotationValuesForThisToken.length];
                                for (int annotationIndex = 0; annotationIndex < numAnnotations; ++annotationIndex) {
                                    int[] tokenValuesThisAnnotation = tokenValuesPerAnnotation.get(annotationIndex);
                                    // Keep the raw term id (used later to build the group identity) and
                                    // look up its sort position under the requested sensitivity.
                                    final int termId = annotationValuesForThisToken[annotationIndex] = tokenValuesThisAnnotation[tokenIndex];
                                    Triple<AnnotationForwardIndex, MatchSensitivity, Terms> currentHitProp = hitProperties.get(annotationIndex);
                                    MatchSensitivity matchSensitivity = currentHitProp.getMiddle();
                                    Terms terms = currentHitProp.getRight();
                                    sortPositions[annotationIndex] = terms.idToSortPosition(termId, matchSensitivity);
                                final GroupIdHash groupId = new GroupIdHash(annotationValuesForThisToken, sortPositions, metadataValuesForGroup, metadataValuesHash);
                                occurances.compute(groupId, (__, groupSize) -> {
                                    if (groupSize != null) {
                                        groupSize.left += 1;
                                        // the document count only goes up the first time this group is seen in this
                                        // document; a second (or later) occurrence of these token values adds 0
                                        groupSize.right += groupsInThisDocument.add(groupId) ? 1 : 0;
                                        return groupSize;
                                    } else {
                                        // should always return true, but we need to add this group anyway!
                                        return MutablePair.of(1, groupsInThisDocument.add(groupId) ? 1 : 0);
                    } catch (IOException e) {
                        throw BlackLabRuntimeException.wrap(e);
                    return true;
                logger.trace("Number of processed docs: " + numberOfDocsProcessed);
        // Only allocated in DEBUG mode; used below to detect two map entries resolving to the same group identity.
        Set<PropertyValue> duplicateGroupsDebug = DEBUG ? new HashSet<PropertyValue>() : null;
        List<HitGroup> groups;
        try (final BlockTimer c = BlockTimer.create("Resolve string values for tokens")) {
            final int numMetadataValues = docProperties.size();
            // Turn every (group key, counts) entry into a HitGroup.
            // NOTE(review): the end of this stream pipeline (collecting into a list) is not visible here.
            groups = occurances.entrySet().parallelStream().map(e -> {
                final int groupSizeHits = e.getValue().getLeft();
                final int groupSizeDocs = e.getValue().getRight();
                final int[] annotationValues = e.getKey().tokenIds;
                final PropertyValue[] metadataValues = e.getKey().metadataValues;
                // allocate new - is not copied when moving into propertyvaluemultiple
                final PropertyValue[] groupIdAsList = new PropertyValue[numAnnotations + numMetadataValues];
                // Convert all raw values (integers) into their appropriate PropertyValues
                // Taking care to preserve the order of the resultant PropertyValues with the order of the input HitProperties
                int indexInOutput = 0;
                for (Pair<Integer, Boolean> p : originalOrderOfUnpackedProperties) {
                    final int indexInInput = p.getLeft();
                    if (p.getRight()) {
                        // is docprop, add PropertyValue as-is
                        groupIdAsList[indexInOutput++] = metadataValues[indexInInput];
                    } else {
                        // is hitprop, convert value to PropertyValue.
                        Annotation annot = hitProperties.get(indexInInput).getLeft().annotation();
                        MatchSensitivity sens = hitProperties.get(indexInInput).getMiddle();
                        groupIdAsList[indexInOutput++] = new PropertyValueContextWords(index, annot, sens, new int[] { annotationValues[indexInInput] }, false);
                // A single grouping property is used directly instead of being wrapped in a PropertyValueMultiple.
                PropertyValue groupId = groupIdAsList.length > 1 ? new PropertyValueMultiple(groupIdAsList) : groupIdAsList[0];
                if (DEBUG) {
                    // HashSet is not thread-safe and this lambda runs in a parallel stream, hence the lock.
                    synchronized (duplicateGroupsDebug) {
                        if (!duplicateGroupsDebug.add(groupId)) {
                            throw new RuntimeException("Identical groups - should never happen");
                return new HitGroupWithoutResults(queryInfo, groupId, groupSizeHits, groupSizeDocs, false, false);
        logger.debug("fast path used for grouping");
        // The same number is passed for both count arguments of each stats object, and the same
        // flag for both MaxStats arguments. The document count is narrowed from long to int.
        ResultsStats hitsStats = new ResultsStatsStatic(numberOfHitsProcessed.get(), numberOfHitsProcessed.get(), new MaxStats(hitMaxHitsToProcess.get(), hitMaxHitsToProcess.get()));
        ResultsStats docsStats = new ResultsStatsStatic((int) numberOfDocsProcessed, (int) numberOfDocsProcessed, new MaxStats(hitMaxHitsToProcess.get(), hitMaxHitsToProcess.get()));
        return HitGroups.fromList(queryInfo, groups, requestedGroupingProperty, null, null, hitsStats, docsStats);
    } catch (IOException e) {
        // Surface I/O failures as the unchecked BlackLab exception.
        throw BlackLabRuntimeException.wrap(e);
Also used : Query( java.util(java.util) BlackLabIndex( IntUnaryOperator(java.util.function.IntUnaryOperator) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) SearchHits(nl.inl.blacklab.searches.SearchHits) nl.inl.blacklab.resultproperty(nl.inl.blacklab.resultproperty) Document(org.apache.lucene.document.Document) AnnotatedFieldNameUtil( MutablePair(org.apache.commons.lang3.tuple.MutablePair) Pair(org.apache.commons.lang3.tuple.Pair) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) Terms(nl.inl.blacklab.forwardindex.Terms) AnnotationForwardIndex(nl.inl.blacklab.forwardindex.AnnotationForwardIndex) LeafReaderContext(org.apache.lucene.index.LeafReaderContext) Triple(org.apache.commons.lang3.tuple.Triple) BlackLabRuntimeException(nl.inl.blacklab.exceptions.BlackLabRuntimeException) Annotation( SimpleCollector( ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) IOException( MatchSensitivity( MatchAllDocsQuery( Collectors( Logger(org.apache.logging.log4j.Logger) DocImpl( LogManager(org.apache.logging.log4j.LogManager) IndexReader(org.apache.lucene.index.IndexReader) BlockTimer(nl.inl.util.BlockTimer) Query( MatchAllDocsQuery( IntUnaryOperator(java.util.function.IntUnaryOperator) Document(org.apache.lucene.document.Document) SimpleCollector( LeafReaderContext(org.apache.lucene.index.LeafReaderContext) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) Terms(nl.inl.blacklab.forwardindex.Terms) BlackLabIndex( Triple(org.apache.commons.lang3.tuple.Triple) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) BlockTimer(nl.inl.util.BlockTimer) MutablePair(org.apache.commons.lang3.tuple.MutablePair) BlackLabRuntimeException(nl.inl.blacklab.exceptions.BlackLabRuntimeException) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) MatchSensitivity( MutablePair(org.apache.commons.lang3.tuple.MutablePair) Pair(org.apache.commons.lang3.tuple.Pair) 
AnnotationForwardIndex(nl.inl.blacklab.forwardindex.AnnotationForwardIndex) IOException( MatchAllDocsQuery( Annotation( AtomicInteger(java.util.concurrent.atomic.AtomicInteger) IndexReader(org.apache.lucene.index.IndexReader) DocImpl(

Example 2 with BlockTimer

Use of nl.inl.util.BlockTimer in project BlackLab by INL.

In class TermsReader, method read.

// Initializes the terms reader from the given channel of the terms file: loads the file into an
// IntBuffer, builds the term id -> sort position arrays, groups term ids by insensitive sort
// position and hands everything to fillTermDataGroups. The whole run is timed and logged.
// NOTE(review): this listing has visibly lost lines (closing braces and several statements);
// the notes below mark where. Confirm against the real source file.
private void read(FileChannel fc) throws IOException {
    try (BlockTimer t = BlockTimer.create("Initializing terms " + this.termsFile)) {
        logger.debug("Initializing termsreader " + termsFile);
        // Start of the manual timing that is reported in the final debug message.
        final long start = System.nanoTime();
        long fileLength = termsFile.length();
        IntBuffer ib = readFromFileChannel(fc, fileLength);
        // NOTE(review): numberOfTerms is not assigned in the visible code, and these two arrays are
        // never filled from ib here; presumably the statements reading them were lost.
        int[] termId2SensitivePosition = new int[numberOfTerms];
        int[] termId2InsensitivePosition = new int[numberOfTerms];
        // Read the sort order arrays
        // Advance past unused sortPos -> id array (left in there for file compatibility)
        ib.position(ib.position() + numberOfTerms);
        // Advance past unused sortPos -> id array (left in there for file compatibility)
        ib.position(ib.position() + numberOfTerms);
        // garbage collect option
        ib = null;
        // Invert the mapping of term id-> insensitive into insensitive -> term ids
        int numGroupsThatAreNotSizeOne = 0;
        TIntObjectHashMap<IntArrayList> insensitivePosition2TermIds = new TIntObjectHashMap<>(terms.length);
        for (int termId = 0; termId < termId2InsensitivePosition.length; ++termId) {
            int insensitivePosition = termId2InsensitivePosition[termId];
            // Start a fresh single-capacity list for this position; put returns the list that was
            // previously stored under the same position, if any.
            IntArrayList v = new IntArrayList(1);
            IntArrayList prev = insensitivePosition2TermIds.put(insensitivePosition, v);
            if (prev != null) {
                // NOTE(review): the rest of the loop body is missing from this listing. Presumably it
                // merges prev into v, adds termId and counts groups that grow beyond one entry.
                if (prev.size() == 1)
        fillTermDataGroups(termId2SensitivePosition, termId2InsensitivePosition, insensitivePosition2TermIds, numGroupsThatAreNotSizeOne);
        // Drop the reference to the terms array; presumably it is no longer needed once the groups are filled.
        this.terms = null;
        logger.debug("finishing initializing termsreader " + termsFile + " - " + TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start) + "ms to process " + numberOfTerms + " terms");
Also used : IntBuffer(java.nio.IntBuffer) TIntObjectHashMap( BlockTimer(nl.inl.util.BlockTimer) IntArrayList(it.unimi.dsi.fastutil.ints.IntArrayList)

Example 3 with BlockTimer

Use of nl.inl.util.BlockTimer in project BlackLab by INL.

In class RequestHandlerHitsGrouped, method handle.

// Handles a grouped-hits request: runs the grouped search, works out the requested window of
// groups, writes the summary fields and then one entry per group in the window to the data
// stream. Returns HTTP_OK.
// NOTE(review): this listing has visibly lost lines (closing braces, an if-header and the
// data-stream start/end calls); the notes below mark where. Confirm against the real source.
public int handle(DataStream ds) throws BlsException, InvalidQuery {
    HitGroups groups;
    BlsCacheEntry<HitGroups> search;
    try (BlockTimer ignored = BlockTimer.create("Searching hit groups")) {
        // Get the window we're interested in
        search = (BlsCacheEntry<HitGroups>) searchParam.hitsGroupedStats().executeAsync();
        // Search is done; construct the results object
        groups = search.get();
    } catch (InterruptedException | ExecutionException e) {
        throw RequestHandler.translateSearchException(e);
    WindowSettings windowSettings = searchParam.getWindowSettings();
    // A negative first index is clamped to 0.
    final int first = Math.max(windowSettings.first(), 0);
    DefaultMax pageSize = searchMan.config().getParameters().getPageSize();
    // A negative size, or one above the configured maximum, falls back to the default page size.
    final int requestedWindowSize = windowSettings.size() < 0 || windowSettings.size() > pageSize.getMax() ? pageSize.getDefaultValue() : windowSettings.size();
    int totalResults = groups.size();
    // NOTE(review): when first is greater than totalResults this evaluates to a negative number.
    final int actualWindowSize = first + requestedWindowSize > totalResults ? totalResults - first : requestedWindowSize;
    // The first argument is true when there are more groups after this window.
    WindowStats ourWindow = new WindowStats(first + requestedWindowSize < totalResults, first, requestedWindowSize, actualWindowSize);
    addSummaryCommonFields(ds, searchParam, search.timeUserWaitedMs(), 0, groups, ourWindow);
    ResultsStats hitsStats = groups.hitsStats();
    ResultsStats docsStats = groups.docsStats();
    // Fall back to a separate docs count search when the groups carry no document stats.
    if (docsStats == null)
        docsStats = searchParam.docsCount().execute();
    // The list of groups found
    DocProperty metadataGroupProperties = null;
    DocResults subcorpus = null;
    CorpusSize subcorpusSize = null;
        // NOTE(review): the extra indentation suggests these three assignments sit inside a
        // conditional whose header was lost (presumably a check on INCLUDE_RELATIVE_FREQ).
        metadataGroupProperties = groups.groupCriteria().docPropsOnly();
        subcorpus = searchParam.subcorpus().execute();
        subcorpusSize = subcorpus.subcorpusSize();
    addNumberOfResultsSummaryTotalHits(ds, hitsStats, docsStats, false, subcorpusSize);
    /* Gather group values per property:
         * In the case we're grouping by multiple values, the DocPropertyMultiple and PropertyValueMultiple will
         * contain the sub properties and values in the same order.
    boolean isMultiValueGroup = groups.groupCriteria() instanceof HitPropertyMultiple;
    // For a single grouping property, wrap it in a one-element list so the loops below are uniform.
    List<HitProperty> prop = isMultiValueGroup ? ((HitPropertyMultiple) groups.groupCriteria()).props() : Collections.singletonList(groups.groupCriteria());
    Map<Integer, String> pids = new HashMap<>();
    // Exclusive end index of the window, capped at the number of groups.
    int last = Math.min(first + requestedWindowSize, groups.size());
    try (BlockTimer ignored = BlockTimer.create("Serializing groups to JSON")) {
        for (int i = first; i < last; ++i) {
            HitGroup group = groups.get(i);
            PropertyValue id = group.identity();
            List<PropertyValue> valuesForGroup = isMultiValueGroup ? id.values() : Collections.singletonList(id);
            if (INCLUDE_RELATIVE_FREQ && metadataGroupProperties != null) {
                // Find size of corresponding subcorpus group
                PropertyValue docPropValues = groups.groupCriteria().docPropValues(id);
                subcorpusSize = findSubcorpusSize(searchParam, subcorpus.query(), metadataGroupProperties, docPropValues, true);
            // logger.debug("## tokens in subcorpus group: " + subcorpusSize.getTokens());
            int numberOfDocsInGroup = group.storedResults().docsStats().countedTotal();
            ds.entry("identity", id.serialize()).entry("identityDisplay", id.toString()).entry("size", group.size());
            // Emit one name/value pair per grouping property; prop and valuesForGroup are indexed in parallel.
            for (int j = 0; j < prop.size(); ++j) {
                final HitProperty hp = prop.get(j);
                final PropertyValue pv = valuesForGroup.get(j);
                ds.entry("name", hp.serialize());
                ds.entry("value", pv.toString());
            if (INCLUDE_RELATIVE_FREQ) {
                ds.entry("numberOfDocs", numberOfDocsInGroup);
                if (metadataGroupProperties != null) {
                    addSubcorpusSize(ds, subcorpusSize);
            // Optionally include the hits stored in the group; pids is passed on to writeHits and writeDocInfos.
            if (searchParam.includeGroupContents()) {
                Hits hitsInGroup = group.storedResults();
                writeHits(ds, hitsInGroup, pids, searchParam.getContextSettings());
    if (searchParam.includeGroupContents()) {
        writeDocInfos(ds, groups, pids, first, requestedWindowSize);
    return HTTP_OK;
Also used : DefaultMax(nl.inl.blacklab.server.config.DefaultMax) ExecutionException(java.util.concurrent.ExecutionException) WindowSettings( PropertyValue(nl.inl.blacklab.resultproperty.PropertyValue) HitProperty(nl.inl.blacklab.resultproperty.HitProperty) BlockTimer(nl.inl.util.BlockTimer) HitPropertyMultiple(nl.inl.blacklab.resultproperty.HitPropertyMultiple) DocProperty(nl.inl.blacklab.resultproperty.DocProperty)


