
Example 1 with AnnotationForwardIndex

use of nl.inl.blacklab.forwardindex.AnnotationForwardIndex in project BlackLab by INL.

the class IndexerImpl method addToForwardIndex.

/**
 * Add a list of tokens to an annotation forward index
 *
 * @param prop the annotation writer to get values and position increments from
 * @return the id assigned to the content
 * @deprecated add a whole field at a time using {@link #addToForwardIndex(AnnotatedFieldWriter, Document)}
 */
@Override
@Deprecated
public int addToForwardIndex(AnnotationWriter prop) {
    Annotation annotation = indexWriter.getOrCreateAnnotation(prop.field(), prop.name());
    AnnotationForwardIndex forwardIndex = indexWriter.annotationForwardIndex(annotation);
    if (forwardIndex == null)
        throw new IllegalArgumentException("No forward index for field " + AnnotatedFieldNameUtil.annotationField(prop.field().name(), prop.name()));
    return forwardIndex.addDocument(prop.values(), prop.positionIncrements());
}
Also used : AnnotationForwardIndex(nl.inl.blacklab.forwardindex.AnnotationForwardIndex) Annotation(nl.inl.blacklab.search.indexmetadata.Annotation)
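For context, a minimal sketch of how an indexing component might have used this (now deprecated) entry point, one call per annotation of a field. The names indexer, wordWriter and lemmaWriter are hypothetical; the call itself is the method shown above.

// Hypothetical indexing context: one AnnotationWriter per annotation (names assumed).
// Each call appends the document's token ids to that annotation's forward index and
// returns the forward index id (fiid) under which the content was stored.
int wordFiid = indexer.addToForwardIndex(wordWriter);
int lemmaFiid = indexer.addToForwardIndex(lemmaWriter);
// The fiids are typically stored in the Lucene document so the tokens can be retrieved
// later via AnnotationForwardIndex.retrievePartsInt() (see examples 2 and 3 below).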

Example 2 with AnnotationForwardIndex

use of nl.inl.blacklab.forwardindex.AnnotationForwardIndex in project BlackLab by INL.

the class HitGroupsTokenFrequencies method get.

/**
 * Get the token frequencies for the given query and hit property.
 *
 * @param source query to find token frequencies for
 * @param requestedGroupingProperty the property (or combination of properties) to group the hits on
 * @return token frequencies
 */
public static HitGroups get(SearchHits source, HitProperty requestedGroupingProperty) {
    QueryInfo queryInfo = source.queryInfo();
    Query filterQuery = source.getFilterQuery();
    SearchSettings searchSettings = source.searchSettings();
    try {
        /**
         * This is where we store our groups while we're computing/gathering them. Maps from group Id to number of hits (left) and number of docs (right)
         */
        final ConcurrentHashMap<GroupIdHash, MutablePair<Integer, Integer>> occurances = new ConcurrentHashMap<>();
        final BlackLabIndex index = queryInfo.index();
        /**
         * Document properties that are used in the grouping (e.g. for the query "all tokens, grouped by lemma + document year", this will contain DocProperty("document year")).
         * This is not necessarily limited to metadata; it can also contain any other DocProperty such as document ID, document length, etc.
         */
        final List<DocProperty> docProperties = new ArrayList<>();
        /**
         * Token properties that need to be grouped on, with sensitivity (case-sensitive grouping or not) and Terms
         */
        final List<Triple<AnnotationForwardIndex, MatchSensitivity, Terms>> hitProperties = new ArrayList<>();
        /**
         * Stores the original index every (doc|hit)property has in the original interleaved/intertwined list.
         * The requestedGroupingProperty sometimes represents more than one property (in the form of a HitPropertyMultiple), e.g. three properties: [token text, document year, token lemma].
         * The groups always get an id that is (roughly) the concatenation of the property values (in the example case [token text, document year, token lemma]),
         * and it's important that this id contains the respective values in the same order.
         * We need to keep this list because otherwise we might change that order.
         *
         * Integer contains index in the source list (docProperties or hitProperties, from just above)
         * Boolean is true when origin list was docProperties, false for hitProperties.
         */
        final List<Pair<Integer, Boolean>> originalOrderOfUnpackedProperties = new ArrayList<>();
        // Unpack the requestedGroupingProperty into its constituents and sort those into the appropriate categories: hit and doc properties.
        {
            List<HitProperty> props = requestedGroupingProperty.props() != null ? requestedGroupingProperty.props() : Arrays.asList(requestedGroupingProperty);
            for (HitProperty p : props) {
                final DocProperty asDocPropIfApplicable = p.docPropsOnly();
                if (asDocPropIfApplicable != null) {
                    // property can be converted to docProperty (applies to the document instead of the token/hit)
                    if (DEBUG && asDocPropIfApplicable.props() != null) {
                        throw new RuntimeException("Nested PropertyMultiples detected, should never happen (when this code was originally written)");
                    }
                    final int positionInUnpackedList = docProperties.size();
                    docProperties.add(asDocPropIfApplicable);
                    originalOrderOfUnpackedProperties.add(Pair.of(positionInUnpackedList, true));
                } else {
                    // Property couldn't be converted to DocProperty (is null). The current property is an actual HitProperty (applies to annotation/token/hit value)
                    List<Annotation> annot = p.needsContext();
                    if (DEBUG && (annot == null || annot.size() != 1)) {
                        throw new RuntimeException("Grouping property does not apply to singular annotation (nested propertymultiple? non-annotation grouping?) should never happen.");
                    }
                    final int positionInUnpackedList = hitProperties.size();
                    final AnnotationForwardIndex annotationFI = index.annotationForwardIndex(annot.get(0));
                    hitProperties.add(Triple.of(annotationFI, p.getSensitivities().get(0), annotationFI.terms()));
                    originalOrderOfUnpackedProperties.add(Pair.of(positionInUnpackedList, false));
                }
            }
        }
        final int numAnnotations = hitProperties.size();
        long numberOfDocsProcessed;
        final AtomicInteger numberOfHitsProcessed = new AtomicInteger();
        final AtomicBoolean hitMaxHitsToProcess = new AtomicBoolean(false);
        try (final BlockTimer c = BlockTimer.create("Top Level")) {
            final List<Integer> docIds = new ArrayList<>();
            try (BlockTimer d = c.child("Gathering documents")) {
                queryInfo.index().searcher().search(filterQuery == null ? new MatchAllDocsQuery() : filterQuery, new SimpleCollector() {

                    private int docBase;

                    @Override
                    protected void doSetNextReader(LeafReaderContext context) throws IOException {
                        docBase = context.docBase;
                        super.doSetNextReader(context);
                    }

                    @Override
                    public void collect(int docId) throws IOException {
                        int globalDocId = docId + docBase;
                        docIds.add(globalDocId);
                    }

                    @Override
                    public boolean needsScores() {
                        return false;
                    }
                });
            }
            numberOfDocsProcessed = docIds.size();
            final IndexReader reader = queryInfo.index().reader();
            final int[] minusOne = new int[] { -1 };
            // If there are no hit (token) properties to group on, we don't need the forward index at all:
            // for every document we just retrieve how many tokens it contains (from its metadata) and add that count to the appropriate group
            if (hitProperties.isEmpty()) {
                try (BlockTimer f = c.child("Grouping documents (metadata only path)")) {
                    String fieldName = index.mainAnnotatedField().name();
                    DocPropertyAnnotatedFieldLength propTokens = new DocPropertyAnnotatedFieldLength(index, fieldName);
                    final int[] emptyTokenValuesArray = new int[0];
                    docIds.parallelStream().forEach(docId -> {
                        // ignore "extra closing token"
                        final int docLength = (int) propTokens.get(docId) - subtractClosingToken;
                        final DocResult synthesizedDocResult = DocResult.fromDoc(queryInfo, new PropertyValueDoc(new DocImpl(queryInfo.index(), docId)), 0, docLength);
                        final PropertyValue[] metadataValuesForGroup = new PropertyValue[docProperties.size()];
                        for (int i = 0; i < docProperties.size(); ++i) {
                            metadataValuesForGroup[i] = docProperties.get(i).get(synthesizedDocResult);
                        }
                        // precompute, it's the same for all hits in document
                        final int metadataValuesHash = Arrays.hashCode(metadataValuesForGroup);
                        numberOfHitsProcessed.addAndGet(docLength);
                        // Add all tokens in document to the group.
                        final GroupIdHash groupId = new GroupIdHash(emptyTokenValuesArray, emptyTokenValuesArray, metadataValuesForGroup, metadataValuesHash);
                        occurances.compute(groupId, (__, groupSizes) -> {
                            if (groupSizes != null) {
                                groupSizes.left += docLength;
                                groupSizes.right += 1;
                                return groupSizes;
                            } else {
                                return MutablePair.of(docLength, 1);
                            }
                        });
                    });
                }
            } else {
                final int maxHitsToProcess = searchSettings.maxHitsToProcess() > 0 ? searchSettings.maxHitsToProcess() : Integer.MAX_VALUE;
                final IntUnaryOperator incrementUntilMax = (v) -> v < maxHitsToProcess ? v + 1 : v;
                final String fieldName = index.mainAnnotatedField().name();
                final String lengthTokensFieldName = AnnotatedFieldNameUtil.lengthTokensField(fieldName);
                numberOfDocsProcessed = docIds.parallelStream().filter(docId -> {
                    try {
                        // Step 1: read all values for the to-be-grouped annotations for this document
                        // This will create one int[] for every annotation, containing ids that map to the values for this document for this annotation
                        final Document doc = reader.document(docId);
                        final List<int[]> tokenValuesPerAnnotation = new ArrayList<>();
                        try (BlockTimer e = c.child("Read annotations from forward index")) {
                            for (Triple<AnnotationForwardIndex, MatchSensitivity, Terms> annot : hitProperties) {
                                final String annotationFIName = annot.getLeft().annotation().forwardIndexIdField();
                                final int fiid = doc.getField(annotationFIName).numericValue().intValue();
                                final List<int[]> tokenValues = annot.getLeft().retrievePartsInt(fiid, minusOne, minusOne);
                                tokenValuesPerAnnotation.addAll(tokenValues);
                            }
                        }
                        // Step 2: retrieve the to-be-grouped metadata for this document
                        // ignore "extra closing token"
                        int docLength = Integer.parseInt(doc.get(lengthTokensFieldName)) - subtractClosingToken;
                        final DocResult synthesizedDocResult = DocResult.fromDoc(queryInfo, new PropertyValueDoc(new DocImpl(queryInfo.index(), docId)), 0, docLength);
                        final PropertyValue[] metadataValuesForGroup = !docProperties.isEmpty() ? new PropertyValue[docProperties.size()] : null;
                        for (int i = 0; i < docProperties.size(); ++i) {
                            metadataValuesForGroup[i] = docProperties.get(i).get(synthesizedDocResult);
                        }
                        // precompute, it's the same for all hits in document
                        final int metadataValuesHash = Arrays.hashCode(metadataValuesForGroup);
                        // now we have all values for all relevant annotations for this document
                        // iterate again and pair up the nth entries for all annotations, then store that as a group.
                        /**
                         * Bookkeeping: track which groups we've already seen in this document,
                         * so we only count this document once per group
                         */
                        HashSet<GroupIdHash> groupsInThisDocument = new HashSet<>();
                        try (BlockTimer f = c.child("Group tokens")) {
                            for (int tokenIndex = 0; tokenIndex < docLength; ++tokenIndex) {
                                if (numberOfHitsProcessed.getAndUpdate(incrementUntilMax) >= maxHitsToProcess) {
                                    hitMaxHitsToProcess.set(true);
                                    // true if any token of this document made the cut, false if we escaped immediately
                                    return tokenIndex > 0;
                                }
                                // Unfortunate fact: token ids are case-sensitive. In order to group on a token's values case- and diacritics-insensitively,
                                // we need to group by their "sort positions" - the index the term would have if all terms were sorted -
                                // so in essence it's also an "id", but a case-insensitive one.
                                // We could further optimize by not doing this step when grouping sensitively, using a specialized GroupIdHash
                                // that hashes the token ids instead of the sort positions in that case.
                                int[] annotationValuesForThisToken = new int[numAnnotations];
                                int[] sortPositions = new int[annotationValuesForThisToken.length];
                                for (int annotationIndex = 0; annotationIndex < numAnnotations; ++annotationIndex) {
                                    int[] tokenValuesThisAnnotation = tokenValuesPerAnnotation.get(annotationIndex);
                                    final int termId = annotationValuesForThisToken[annotationIndex] = tokenValuesThisAnnotation[tokenIndex];
                                    Triple<AnnotationForwardIndex, MatchSensitivity, Terms> currentHitProp = hitProperties.get(annotationIndex);
                                    MatchSensitivity matchSensitivity = currentHitProp.getMiddle();
                                    Terms terms = currentHitProp.getRight();
                                    sortPositions[annotationIndex] = terms.idToSortPosition(termId, matchSensitivity);
                                }
                                final GroupIdHash groupId = new GroupIdHash(annotationValuesForThisToken, sortPositions, metadataValuesForGroup, metadataValuesHash);
                                occurances.compute(groupId, (__, groupSize) -> {
                                    if (groupSize != null) {
                                        groupSize.left += 1;
                                        // second (or more) occurrence of these token values in this document
                                        groupSize.right += groupsInThisDocument.add(groupId) ? 1 : 0;
                                        return groupSize;
                                    } else {
                                        // should always return true, but we need to add this group anyway!
                                        return MutablePair.of(1, groupsInThisDocument.add(groupId) ? 1 : 0);
                                    }
                                });
                            }
                        }
                    } catch (IOException e) {
                        throw BlackLabRuntimeException.wrap(e);
                    }
                    return true;
                }).count();
                logger.trace("Number of processed docs: " + numberOfDocsProcessed);
            }
        }
        Set<PropertyValue> duplicateGroupsDebug = DEBUG ? new HashSet<PropertyValue>() : null;
        List<HitGroup> groups;
        try (final BlockTimer c = BlockTimer.create("Resolve string values for tokens")) {
            final int numMetadataValues = docProperties.size();
            groups = occurances.entrySet().parallelStream().map(e -> {
                final int groupSizeHits = e.getValue().getLeft();
                final int groupSizeDocs = e.getValue().getRight();
                final int[] annotationValues = e.getKey().tokenIds;
                final PropertyValue[] metadataValues = e.getKey().metadataValues;
                // Allocate a new array; it is not copied when wrapped in a PropertyValueMultiple below
                final PropertyValue[] groupIdAsList = new PropertyValue[numAnnotations + numMetadataValues];
                // Convert all raw values (integers) into their appropriate PropertyValues
                // taking care to keep the resulting PropertyValues in the same order as the input HitProperties
                int indexInOutput = 0;
                for (Pair<Integer, Boolean> p : originalOrderOfUnpackedProperties) {
                    final int indexInInput = p.getLeft();
                    if (p.getRight()) {
                        // is docprop, add PropertyValue as-is
                        groupIdAsList[indexInOutput++] = metadataValues[indexInInput];
                    } else {
                        // is hitprop, convert value to PropertyValue.
                        Annotation annot = hitProperties.get(indexInInput).getLeft().annotation();
                        MatchSensitivity sens = hitProperties.get(indexInInput).getMiddle();
                        groupIdAsList[indexInOutput++] = new PropertyValueContextWords(index, annot, sens, new int[] { annotationValues[indexInInput] }, false);
                    }
                }
                PropertyValue groupId = groupIdAsList.length > 1 ? new PropertyValueMultiple(groupIdAsList) : groupIdAsList[0];
                if (DEBUG) {
                    synchronized (duplicateGroupsDebug) {
                        if (!duplicateGroupsDebug.add(groupId)) {
                            throw new RuntimeException("Identical groups - should never happen");
                        }
                    }
                }
                return new HitGroupWithoutResults(queryInfo, groupId, groupSizeHits, groupSizeDocs, false, false);
            }).collect(Collectors.toList());
        }
        logger.debug("fast path used for grouping");
        ResultsStats hitsStats = new ResultsStatsStatic(numberOfHitsProcessed.get(), numberOfHitsProcessed.get(), new MaxStats(hitMaxHitsToProcess.get(), hitMaxHitsToProcess.get()));
        ResultsStats docsStats = new ResultsStatsStatic((int) numberOfDocsProcessed, (int) numberOfDocsProcessed, new MaxStats(hitMaxHitsToProcess.get(), hitMaxHitsToProcess.get()));
        return HitGroups.fromList(queryInfo, groups, requestedGroupingProperty, null, null, hitsStats, docsStats);
    } catch (IOException e) {
        throw BlackLabRuntimeException.wrap(e);
    }
}
Also used : Query(org.apache.lucene.search.Query) java.util(java.util) BlackLabIndex(nl.inl.blacklab.search.BlackLabIndex) IntUnaryOperator(java.util.function.IntUnaryOperator) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) SearchHits(nl.inl.blacklab.searches.SearchHits) nl.inl.blacklab.resultproperty(nl.inl.blacklab.resultproperty) Document(org.apache.lucene.document.Document) AnnotatedFieldNameUtil(nl.inl.blacklab.search.indexmetadata.AnnotatedFieldNameUtil) MutablePair(org.apache.commons.lang3.tuple.MutablePair) Pair(org.apache.commons.lang3.tuple.Pair) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) Terms(nl.inl.blacklab.forwardindex.Terms) AnnotationForwardIndex(nl.inl.blacklab.forwardindex.AnnotationForwardIndex) LeafReaderContext(org.apache.lucene.index.LeafReaderContext) Triple(org.apache.commons.lang3.tuple.Triple) BlackLabRuntimeException(nl.inl.blacklab.exceptions.BlackLabRuntimeException) Annotation(nl.inl.blacklab.search.indexmetadata.Annotation) SimpleCollector(org.apache.lucene.search.SimpleCollector) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) IOException(java.io.IOException) MatchSensitivity(nl.inl.blacklab.search.indexmetadata.MatchSensitivity) MatchAllDocsQuery(org.apache.lucene.search.MatchAllDocsQuery) Collectors(java.util.stream.Collectors) Logger(org.apache.logging.log4j.Logger) DocImpl(nl.inl.blacklab.search.DocImpl) LogManager(org.apache.logging.log4j.LogManager) IndexReader(org.apache.lucene.index.IndexReader) BlockTimer(nl.inl.util.BlockTimer)
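As a side note, the core counting pattern above - a single ConcurrentHashMap.compute() call that atomically updates a (hits, docs) pair per group - can be illustrated with a small self-contained sketch. This uses toy string group ids rather than BlackLab's GroupIdHash; only the MutablePair/compute pattern is taken from the code above.

import java.util.concurrent.ConcurrentHashMap;
import org.apache.commons.lang3.tuple.MutablePair;

class GroupCountSketch {
    public static void main(String[] args) {
        ConcurrentHashMap<String, MutablePair<Integer, Integer>> counts = new ConcurrentHashMap<>();
        // Pretend two documents contribute matching tokens to the group "lemma=walk / year=1920".
        addDoc(counts, "walk/1920", 12); // document 1 contains 12 matching tokens
        addDoc(counts, "walk/1920", 5);  // document 2 contains 5 matching tokens
        System.out.println(counts);      // {walk/1920=(17,2)}: 17 hits across 2 documents
    }

    static void addDoc(ConcurrentHashMap<String, MutablePair<Integer, Integer>> counts, String groupId, int tokensInDoc) {
        counts.compute(groupId, (key, sizes) -> {
            if (sizes == null)
                return MutablePair.of(tokensInDoc, 1); // first document seen for this group
            sizes.left += tokensInDoc; // add this document's token count to the hit total
            sizes.right += 1;          // one more document contributes to this group
            return sizes;
        });
    }
}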

Example 3 with AnnotationForwardIndex

use of nl.inl.blacklab.forwardindex.AnnotationForwardIndex in project BlackLab by INL.

the class Contexts method getContextWordsSingleDocument.

/**
 * Get context words from the forward index.
 *
 * @param hits the hits
 * @param start inclusive
 * @param end exclusive
 * @param contextSize how many words of context we want
 * @param contextSources forward indices to get context from
 * @param fiidLookups how to find the forward index ids of documents
 * @return for each hit in [start, end), an array with the context words and bookkeeping ints
 */
private static int[][] getContextWordsSingleDocument(HitsArrays hits, int start, int end, ContextSize contextSize, List<AnnotationForwardIndex> contextSources, List<FiidLookup> fiidLookups) {
    final int n = end - start;
    if (n == 0)
        return new int[0][];
    int[] startsOfSnippets = new int[n];
    int[] endsOfSnippets = new int[n];
    EphemeralHit hit = new EphemeralHit();
    for (int i = start; i < end; ++i) {
        hits.getEphemeral(i, hit);
        startsOfSnippets[i - start] = Math.max(0, hit.start - contextSize.left());
        endsOfSnippets[i - start] = hit.end + contextSize.right();
    }
    int fiNumber = 0;
    int doc = hits.doc(start);
    int[][] contexts = new int[n][];
    for (AnnotationForwardIndex forwardIndex : contextSources) {
        FiidLookup fiidLookup = fiidLookups.get(fiNumber);
        // Get all the words from the forward index
        List<int[]> words;
        if (forwardIndex != null) {
            // We have a forward index for this field. Use it.
            int fiid = fiidLookup.get(doc);
            words = forwardIndex.retrievePartsInt(fiid, startsOfSnippets, endsOfSnippets);
        } else {
            throw new BlackLabRuntimeException("Cannot get context without a forward index");
        }
        for (int i = 0; i < n; ++i) {
            int hitIndex = start + i;
            int[] theseWords = words.get(i);
            hits.getEphemeral(hitIndex, hit);
            int firstWordIndex = startsOfSnippets[i];
            if (fiNumber == 0) {
                // Allocate the context array and store hit start, right-context start and context length
                contexts[i] = new int[NUMBER_OF_BOOKKEEPING_INTS + theseWords.length * contextSources.size()];
                contexts[i][HIT_START_INDEX] = hit.start - firstWordIndex;
                contexts[i][RIGHT_START_INDEX] = hit.end - firstWordIndex;
                contexts[i][LENGTH_INDEX] = theseWords.length;
            }
            // Copy the context we just retrieved into the context array
            int copyStart = fiNumber * theseWords.length + NUMBER_OF_BOOKKEEPING_INTS;
            System.arraycopy(theseWords, 0, contexts[i], copyStart, theseWords.length);
        }
        fiNumber++;
    }
    return contexts;
}
Also used : BlackLabRuntimeException(nl.inl.blacklab.exceptions.BlackLabRuntimeException) AnnotationForwardIndex(nl.inl.blacklab.forwardindex.AnnotationForwardIndex) FiidLookup(nl.inl.blacklab.forwardindex.FiidLookup) EphemeralHit(nl.inl.blacklab.search.results.Hits.EphemeralHit)
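To make the layout concrete: each contexts[i] starts with the bookkeeping ints, followed by one block of "length" token ids per forward index, in the order of contextSources. A minimal sketch of reading one word id back out, assuming access to the Contexts constants used above; the helper itself is hypothetical.

// Hypothetical helper, mirroring the layout built above: bookkeeping ints first,
// then contextSources.size() consecutive blocks of token ids, each "length" long.
static int contextWordId(int[] context, int forwardIndexNumber, int positionInSnippet) {
    int length = context[Contexts.LENGTH_INDEX];
    return context[Contexts.NUMBER_OF_BOOKKEEPING_INTS + forwardIndexNumber * length + positionInSnippet];
}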

Example 4 with AnnotationForwardIndex

use of nl.inl.blacklab.forwardindex.AnnotationForwardIndex in project BlackLab by INL.

the class Contexts method makeKwicsSingleDocForwardIndex.

/**
 * Retrieves the KWIC information (KeyWord In Context: left, hit and right
 * context) for a number of hits in the same document from the forward index.
 *
 * NOTE: this destroys any existing contexts!
 *
 * @param hits the hits to build KWICs for (all in the same document)
 * @param forwardIndex Forward index for the words
 * @param punctForwardIndex Forward index for the punctuation
 * @param attrForwardIndices Forward indices for the attributes, or null if none
 * @param fiidLookups how to find the forward index ids of documents
 * @param wordsAroundHit number of words left and right of hit to fetch
 * @param theKwics where to add the KWICs
 */
static void makeKwicsSingleDocForwardIndex(Hits hits, AnnotationForwardIndex forwardIndex, AnnotationForwardIndex punctForwardIndex, Map<Annotation, AnnotationForwardIndex> attrForwardIndices, Map<Annotation, FiidLookup> fiidLookups, ContextSize wordsAroundHit, Map<Hit, Kwic> theKwics) {
    if (hits.size() == 0)
        return;
    // TODO: more efficient to get all contexts with one getContextWords() call!
    // Get punctuation context
    int[][] punctContext = null;
    if (punctForwardIndex != null) {
        punctContext = getContextWordsSingleDocument(hits.hitsArrays, 0, hits.size(), wordsAroundHit, Arrays.asList(punctForwardIndex), Arrays.asList(fiidLookups.get(punctForwardIndex.annotation())));
    }
    Terms punctTerms = punctForwardIndex == null ? null : punctForwardIndex.terms();
    // Get attributes context
    Annotation[] attrName = null;
    Terms[] attrTerms = null;
    int[][][] attrContext = null;
    if (attrForwardIndices != null) {
        int n = attrForwardIndices.size();
        attrName = new Annotation[n];
        AnnotationForwardIndex[] attrFI = new AnnotationForwardIndex[n];
        attrTerms = new Terms[n];
        attrContext = new int[n][][];
        int i = 0;
        for (Map.Entry<Annotation, AnnotationForwardIndex> e : attrForwardIndices.entrySet()) {
            attrName[i] = e.getKey();
            attrFI[i] = e.getValue();
            attrTerms[i] = attrFI[i].terms();
            attrContext[i] = getContextWordsSingleDocument(hits.hitsArrays, 0, hits.size(), wordsAroundHit, Arrays.asList(attrFI[i]), Arrays.asList(fiidLookups.get(attrName[i])));
            i++;
        }
    }
    // Get word context
    int[][] wordContext = getContextWordsSingleDocument(hits.hitsArrays, 0, hits.size(), wordsAroundHit, Arrays.asList(forwardIndex), Arrays.asList(fiidLookups.get(forwardIndex.annotation())));
    Terms terms = forwardIndex.terms();
    // Make the concordances from the context
    AnnotatedField field = forwardIndex.annotation().field();
    Annotation concPunctFI = field.annotation(Kwic.DEFAULT_CONC_PUNCT_PROP);
    Annotation concWordFI = field.annotation(Kwic.DEFAULT_CONC_WORD_PROP);
    for (int i = 0; i < hits.size(); i++) {
        Hit h = hits.get(i);
        List<String> tokens = new ArrayList<>();
        int[] context = wordContext[i];
        int contextLength = context[Contexts.LENGTH_INDEX];
        int contextRightStart = context[Contexts.RIGHT_START_INDEX];
        int contextHitStart = context[Contexts.HIT_START_INDEX];
        int indexInContext = Contexts.NUMBER_OF_BOOKKEEPING_INTS;
        for (int j = 0; j < contextLength; j++, indexInContext++) {
            // (Applications may choose to ignore punctuation before the first word)
            if (punctTerms == null) {
                // There is no punctuation forward index. Just put a space
                // between every word.
                tokens.add(" ");
            } else
                tokens.add(punctTerms.get(punctContext[i][indexInContext]));
            // Add extra attributes (e.g. lemma, pos)
            if (attrContext != null) {
                for (int k = 0; k < attrContext.length; k++) {
                    tokens.add(attrTerms[k].get(attrContext[k][i][indexInContext]));
                }
            }
            // Add word
            if (terms != null)
                tokens.add(terms.get(context[indexInContext]));
            else
                // weird, but make sure the numbers add up at the end
                tokens.add("");
        }
        List<Annotation> annotations = new ArrayList<>();
        annotations.add(concPunctFI);
        if (attrContext != null) {
            annotations.addAll(Arrays.asList(attrName));
        }
        annotations.add(concWordFI);
        Kwic kwic = new Kwic(annotations, tokens, contextHitStart, contextRightStart);
        theKwics.put(h, kwic);
    }
}
Also used : Kwic(nl.inl.blacklab.search.Kwic) Terms(nl.inl.blacklab.forwardindex.Terms) AnnotationForwardIndex(nl.inl.blacklab.forwardindex.AnnotationForwardIndex) ArrayList(java.util.ArrayList) Annotation(nl.inl.blacklab.search.indexmetadata.Annotation) EphemeralHit(nl.inl.blacklab.search.results.Hits.EphemeralHit) AnnotatedField(nl.inl.blacklab.search.indexmetadata.AnnotatedField) Map(java.util.Map)
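The tokens list built in the loop above holds, for each context position: the punctuation, then each attribute value (if any), then the word itself. A hypothetical helper (not part of BlackLab) that flattens such an interleaved list back into plain text could look like this; annotationsPerToken is the block size per position (punctuation + attributes + word).

// Hypothetical sketch: per position the list holds [punctuation, attributes..., word],
// so the word is the last entry of each block of annotationsPerToken strings.
static String plainText(java.util.List<String> tokens, int annotationsPerToken) {
    StringBuilder sb = new StringBuilder();
    for (int pos = 0; pos < tokens.size(); pos += annotationsPerToken) {
        sb.append(tokens.get(pos));                            // punctuation before this word
        sb.append(tokens.get(pos + annotationsPerToken - 1));  // the word annotation
    }
    return sb.toString();
}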

Example 5 with AnnotationForwardIndex

use of nl.inl.blacklab.forwardindex.AnnotationForwardIndex in project BlackLab by INL.

the class Kwics method retrieveKwics.

/**
 * Retrieve KWICs for a (sub)list of hits.
 *
 * KWICs are the hit words 'centered' with a certain number of context words
 * around them.
 *
 * @param hits the hits to retrieve KWICs for
 * @param contextSize how many words around the hit to retrieve
 * @param field annotated field to use for building KWICs
 *
 * @return the KWICs
 */
private static Map<Hit, Kwic> retrieveKwics(Hits hits, ContextSize contextSize, AnnotatedField field) {
    // Group hits per document
    MutableIntObjectMap<List<Hit>> hitsPerDocument = IntObjectMaps.mutable.empty();
    for (Hit key : hits) {
        List<Hit> hitsInDoc = hitsPerDocument.get(key.doc());
        if (hitsInDoc == null) {
            hitsInDoc = new ArrayList<>();
            hitsPerDocument.put(key.doc(), hitsInDoc);
        }
        hitsInDoc.add(key);
    }
    // All FIs except word and punct are attributes
    Map<Annotation, AnnotationForwardIndex> attrForwardIndices = new HashMap<>();
    BlackLabIndex index = hits.index();
    for (Annotation annotation : field.annotations()) {
        if (annotation.hasForwardIndex() && !annotation.name().equals(Kwic.DEFAULT_CONC_WORD_PROP) && !annotation.name().equals(Kwic.DEFAULT_CONC_PUNCT_PROP)) {
            attrForwardIndices.put(annotation, index.annotationForwardIndex(annotation));
        }
    }
    Annotation wordAnnot = field.annotation(Kwic.DEFAULT_CONC_WORD_PROP);
    AnnotationForwardIndex wordForwardIndex = index.annotationForwardIndex(wordAnnot);
    Annotation punctAnnot = field.annotation(Kwic.DEFAULT_CONC_PUNCT_PROP);
    AnnotationForwardIndex punctForwardIndex = index.annotationForwardIndex(punctAnnot);
    // Get FiidLookups for all required forward indexes
    IndexReader reader = hits.queryInfo().index().reader();
    Map<Annotation, FiidLookup> fiidLookups = new HashMap<>();
    fiidLookups.put(wordAnnot, new FiidLookup(reader, wordAnnot));
    fiidLookups.put(punctAnnot, new FiidLookup(reader, punctAnnot));
    for (Map.Entry<Annotation, AnnotationForwardIndex> e : attrForwardIndices.entrySet()) {
        fiidLookups.put(e.getKey(), new FiidLookup(reader, e.getKey()));
    }
    Map<Hit, Kwic> conc1 = new HashMap<>();
    /*
     * Walk the hits in order; whenever the document id changes, build KWICs for the
     * section of hits belonging to the previous document and remember where the new
     * document's section starts. After the loop, process the last section.
     */
    int lastDocId = -1;
    int firstIndexWithCurrentDocId = 0;
    for (int i = 1; i < hits.size(); ++i) {
        int curDocId = hits.hitsArrays().doc(i);
        if (curDocId != lastDocId) {
            if (firstIndexWithCurrentDocId != i) {
                Contexts.makeKwicsSingleDocForwardIndex(hits.window(firstIndexWithCurrentDocId, i - firstIndexWithCurrentDocId), wordForwardIndex, punctForwardIndex, attrForwardIndices, fiidLookups, contextSize, conc1);
            }
            firstIndexWithCurrentDocId = i;
            lastDocId = curDocId;
        }
    }
    // last part
    Contexts.makeKwicsSingleDocForwardIndex(hits.window(firstIndexWithCurrentDocId, hits.size() - firstIndexWithCurrentDocId), wordForwardIndex, punctForwardIndex, attrForwardIndices, fiidLookups, contextSize, conc1);
    return conc1;
}
Also used : HashMap(java.util.HashMap) Kwic(nl.inl.blacklab.search.Kwic) AnnotationForwardIndex(nl.inl.blacklab.forwardindex.AnnotationForwardIndex) FiidLookup(nl.inl.blacklab.forwardindex.FiidLookup) Annotation(nl.inl.blacklab.search.indexmetadata.Annotation) BlackLabIndex(nl.inl.blacklab.search.BlackLabIndex) IndexReader(org.apache.lucene.index.IndexReader) ArrayList(java.util.ArrayList) List(java.util.List) HashMap(java.util.HashMap) MutableIntObjectMap(org.eclipse.collections.api.map.primitive.MutableIntObjectMap) Map(java.util.Map)
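A simplified, self-contained restatement of the per-document windowing done above: walk a list of hits that is grouped by document id and process each contiguous same-document run as one window. Toy doc ids are assumed, and process() stands in for the makeKwicsSingleDocForwardIndex call.

import java.util.Arrays;
import java.util.List;

class DocWindowSketch {
    public static void main(String[] args) {
        // Document id of each hit, in hit order (hits of the same document are adjacent).
        List<Integer> docIdPerHit = Arrays.asList(3, 3, 3, 7, 7, 9);
        int lastDocId = -1;
        int firstIndexWithCurrentDocId = 0;
        for (int i = 0; i < docIdPerHit.size(); ++i) {
            int curDocId = docIdPerHit.get(i);
            if (curDocId != lastDocId) {
                if (i > firstIndexWithCurrentDocId)
                    process(firstIndexWithCurrentDocId, i); // close the previous document's window
                firstIndexWithCurrentDocId = i;
                lastDocId = curDocId;
            }
        }
        process(firstIndexWithCurrentDocId, docIdPerHit.size()); // window for the last document
    }

    static void process(int from, int toExclusive) {
        System.out.println("KWICs would be built for hits [" + from + ", " + toExclusive + ")");
    }
}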

Aggregations

AnnotationForwardIndex (nl.inl.blacklab.forwardindex.AnnotationForwardIndex)7 Annotation (nl.inl.blacklab.search.indexmetadata.Annotation)4 BlackLabRuntimeException (nl.inl.blacklab.exceptions.BlackLabRuntimeException)3 ArrayList (java.util.ArrayList)2 Map (java.util.Map)2 FiidLookup (nl.inl.blacklab.forwardindex.FiidLookup)2 Terms (nl.inl.blacklab.forwardindex.Terms)2 BlackLabIndex (nl.inl.blacklab.search.BlackLabIndex)2 Kwic (nl.inl.blacklab.search.Kwic)2 EphemeralHit (nl.inl.blacklab.search.results.Hits.EphemeralHit)2 IndexReader (org.apache.lucene.index.IndexReader)2 File (java.io.File)1 IOException (java.io.IOException)1 java.util (java.util)1 HashMap (java.util.HashMap)1 List (java.util.List)1 ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap)1 AtomicBoolean (java.util.concurrent.atomic.AtomicBoolean)1 AtomicInteger (java.util.concurrent.atomic.AtomicInteger)1 IntUnaryOperator (java.util.function.IntUnaryOperator)1