Search in sources :

Example 1 with BlackLabIndex

use of nl.inl.blacklab.search.BlackLabIndex in project BlackLab by INL.

The class HitGroupsTokenFrequencies, method get.

/**
 * Get the token frequencies for the given query and hit property.
 *
 * @param source query to find token frequencies for
 * @param requestedGroupingProperty the (possibly composite) hit property to group the results on
 * @return token frequencies
 */
public static HitGroups get(SearchHits source, HitProperty requestedGroupingProperty) {
    QueryInfo queryInfo = source.queryInfo();
    Query filterQuery = source.getFilterQuery();
    SearchSettings searchSettings = source.searchSettings();
    try {
        /**
         * This is where we store our groups while we're computing/gathering them. Maps from group Id to number of hits (left) and number of docs (right)
         */
        // NOTE(review): "occurances" is a typo for "occurrences"; left unchanged to keep this a comment-only edit.
        final ConcurrentHashMap<GroupIdHash, MutablePair<Integer, Integer>> occurances = new ConcurrentHashMap<>();
        final BlackLabIndex index = queryInfo.index();
        /**
         * Document properties that are used in the grouping. (e.g. for query "all tokens, grouped by lemma + document year", will contain DocProperty("document year")
         * This is not necessarily limited to just metadata, can also contain any other DocProperties such as document ID, document length, etc.
         */
        final List<DocProperty> docProperties = new ArrayList<>();
        /**
         * Token properties that need to be grouped on, with sensitivity (case-sensitive grouping or not) and Terms
         */
        final List<Triple<AnnotationForwardIndex, MatchSensitivity, Terms>> hitProperties = new ArrayList<>();
        /**
         * Stores the original index every (doc|hit)property has in the original interleaved/intertwined list.
         * The requestedGroupingProperty sometimes represents more than one property (in the form of HitPropertyMultiple) such as 3 properties: [token text, document year, token lemma]
         * The groups always get an id that is (roughly) the concatenation of the properties (in the example case [token text, document year, token lemma]),
         * and it's important this id contains the respective values in the same order.
         * We need to keep this list because otherwise we'd potentially change the order.
         *
         * Integer contains index in the source list (docProperties or hitProperties, from just above)
         * Boolean is true when origin list was docProperties, false for hitProperties.
         */
        final List<Pair<Integer, Boolean>> originalOrderOfUnpackedProperties = new ArrayList<>();
        // Unpack the requestedGroupingProperty into its constituents and sort those into the appropriate categories: hit and doc properties.
        {
            // props() is null for a singular (non-multiple) property, hence the one-element fallback list.
            List<HitProperty> props = requestedGroupingProperty.props() != null ? requestedGroupingProperty.props() : Arrays.asList(requestedGroupingProperty);
            for (HitProperty p : props) {
                final DocProperty asDocPropIfApplicable = p.docPropsOnly();
                if (asDocPropIfApplicable != null) {
                    // property can be converted to docProperty (applies to the document instead of the token/hit)
                    if (DEBUG && asDocPropIfApplicable.props() != null) {
                        throw new RuntimeException("Nested PropertyMultiples detected, should never happen (when this code was originally written)");
                    }
                    final int positionInUnpackedList = docProperties.size();
                    docProperties.add(asDocPropIfApplicable);
                    originalOrderOfUnpackedProperties.add(Pair.of(positionInUnpackedList, true));
                } else {
                    // Property couldn't be converted to DocProperty (is null). The current property is an actual HitProperty (applies to annotation/token/hit value)
                    List<Annotation> annot = p.needsContext();
                    if (DEBUG && (annot == null || annot.size() != 1)) {
                        throw new RuntimeException("Grouping property does not apply to singular annotation (nested propertymultiple? non-annotation grouping?) should never happen.");
                    }
                    final int positionInUnpackedList = hitProperties.size();
                    // Resolve the forward index for the (single) annotation this property needs, so we can
                    // read token values for whole documents at once below.
                    final AnnotationForwardIndex annotationFI = index.annotationForwardIndex(annot.get(0));
                    hitProperties.add(Triple.of(annotationFI, p.getSensitivities().get(0), annotationFI.terms()));
                    originalOrderOfUnpackedProperties.add(Pair.of(positionInUnpackedList, false));
                }
            }
        }
        final int numAnnotations = hitProperties.size();
        long numberOfDocsProcessed;
        // NOTE(review): hit count is tracked in an int; may overflow for extremely large corpora — TODO confirm acceptable.
        final AtomicInteger numberOfHitsProcessed = new AtomicInteger();
        final AtomicBoolean hitMaxHitsToProcess = new AtomicBoolean(false);
        try (final BlockTimer c = BlockTimer.create("Top Level")) {
            // Step 0: collect the ids of all documents matching the filter query (all docs if no filter).
            final List<Integer> docIds = new ArrayList<>();
            try (BlockTimer d = c.child("Gathering documents")) {
                queryInfo.index().searcher().search(filterQuery == null ? new MatchAllDocsQuery() : filterQuery, new SimpleCollector() {

                    private int docBase;

                    @Override
                    protected void doSetNextReader(LeafReaderContext context) throws IOException {
                        // Remember the segment's doc base so we can convert per-segment ids to global ids.
                        docBase = context.docBase;
                        super.doSetNextReader(context);
                    }

                    @Override
                    public void collect(int docId) throws IOException {
                        int globalDocId = docId + docBase;
                        docIds.add(globalDocId);
                    }

                    @Override
                    public boolean needsScores() {
                        // We only need document ids, not relevance scores.
                        return false;
                    }
                });
            }
            numberOfDocsProcessed = docIds.size(); // may be recomputed below when grouping on hit properties
            final IndexReader reader = queryInfo.index().reader();
            // Sentinel start/end positions passed to retrievePartsInt; presumably { -1 } means
            // "the whole document" — TODO confirm against AnnotationForwardIndex.retrievePartsInt docs.
            final int[] minusOne = new int[] { -1 };
            // What we do instead is for every document just retrieve how many tokens it contains (from its metadata), and add that count to the appropriate group
            if (hitProperties.isEmpty()) {
                // Metadata-only path: no token values are needed, so we never touch the forward index.
                try (BlockTimer f = c.child("Grouping documents (metadata only path)")) {
                    String fieldName = index.mainAnnotatedField().name();
                    DocPropertyAnnotatedFieldLength propTokens = new DocPropertyAnnotatedFieldLength(index, fieldName);
                    final int[] emptyTokenValuesArray = new int[0];
                    // Per-document work is independent and occurances is a concurrent map, so parallel iteration is safe.
                    docIds.parallelStream().forEach(docId -> {
                        // ignore "extra closing token"
                        final int docLength = (int) propTokens.get(docId) - subtractClosingToken;
                        final DocResult synthesizedDocResult = DocResult.fromDoc(queryInfo, new PropertyValueDoc(new DocImpl(queryInfo.index(), docId)), 0, docLength);
                        final PropertyValue[] metadataValuesForGroup = new PropertyValue[docProperties.size()];
                        for (int i = 0; i < docProperties.size(); ++i) {
                            metadataValuesForGroup[i] = docProperties.get(i).get(synthesizedDocResult);
                        }
                        // precompute, it's the same for all hits in document
                        final int metadataValuesHash = Arrays.hashCode(metadataValuesForGroup);
                        // Every token in the document counts as one "hit".
                        numberOfHitsProcessed.addAndGet(docLength);
                        // Add all tokens in document to the group.
                        final GroupIdHash groupId = new GroupIdHash(emptyTokenValuesArray, emptyTokenValuesArray, metadataValuesForGroup, metadataValuesHash);
                        occurances.compute(groupId, (__, groupSizes) -> {
                            if (groupSizes != null) {
                                groupSizes.left += docLength;
                                groupSizes.right += 1;
                                return groupSizes;
                            } else {
                                return MutablePair.of(docLength, 1);
                            }
                        });
                    });
                }
            } else {
                // Token path: read the to-be-grouped annotation values for every token of every document.
                final int maxHitsToProcess = searchSettings.maxHitsToProcess() > 0 ? searchSettings.maxHitsToProcess() : Integer.MAX_VALUE;
                // Increments the counter but saturates at maxHitsToProcess instead of growing past it.
                final IntUnaryOperator incrementUntilMax = (v) -> v < maxHitsToProcess ? v + 1 : v;
                final String fieldName = index.mainAnnotatedField().name();
                final String lengthTokensFieldName = AnnotatedFieldNameUtil.lengthTokensField(fieldName);
                // The filter predicate does the real work; it returns whether the document contributed at
                // least one token, and count() yields the number of such docs (overwriting the estimate above).
                numberOfDocsProcessed = docIds.parallelStream().filter(docId -> {
                    try {
                        // Step 1: read all values for the to-be-grouped annotations for this document
                        // This will create one int[] for every annotation, containing ids that map to the values for this document for this annotation
                        final Document doc = reader.document(docId);
                        final List<int[]> tokenValuesPerAnnotation = new ArrayList<>();
                        try (BlockTimer e = c.child("Read annotations from forward index")) {
                            for (Triple<AnnotationForwardIndex, MatchSensitivity, Terms> annot : hitProperties) {
                                final String annotationFIName = annot.getLeft().annotation().forwardIndexIdField();
                                final int fiid = doc.getField(annotationFIName).numericValue().intValue();
                                final List<int[]> tokenValues = annot.getLeft().retrievePartsInt(fiid, minusOne, minusOne);
                                tokenValuesPerAnnotation.addAll(tokenValues);
                            }
                        }
                        // Step 2: retrieve the to-be-grouped metadata for this document
                        // ignore "extra closing token"
                        int docLength = Integer.parseInt(doc.get(lengthTokensFieldName)) - subtractClosingToken;
                        final DocResult synthesizedDocResult = DocResult.fromDoc(queryInfo, new PropertyValueDoc(new DocImpl(queryInfo.index(), docId)), 0, docLength);
                        final PropertyValue[] metadataValuesForGroup = !docProperties.isEmpty() ? new PropertyValue[docProperties.size()] : null;
                        for (int i = 0; i < docProperties.size(); ++i) {
                            metadataValuesForGroup[i] = docProperties.get(i).get(synthesizedDocResult);
                        }
                        // precompute, it's the same for all hits in document
                        // (Arrays.hashCode(null) is 0, so a missing metadata array is fine here.)
                        final int metadataValuesHash = Arrays.hashCode(metadataValuesForGroup);
                        // now we have all values for all relevant annotations for this document
                        // iterate again and pair up the nth entries for all annotations, then store that as a group.
                        /**
                         * Bookkeeping: track which groups we've already seen in this document,
                         * so we only count this document once per group
                         */
                        HashSet<GroupIdHash> groupsInThisDocument = new HashSet<>();
                        try (BlockTimer f = c.child("Group tokens")) {
                            for (int tokenIndex = 0; tokenIndex < docLength; ++tokenIndex) {
                                // Atomically claim one more hit; if we were already at the cap, stop processing this doc.
                                if (numberOfHitsProcessed.getAndUpdate(incrementUntilMax) >= maxHitsToProcess) {
                                    hitMaxHitsToProcess.set(true);
                                    // true if any token of this document made the cut, false if we escaped immediately
                                    return tokenIndex > 0;
                                }
                                // Unfortunate fact: token ids are case-sensitive, and in order to group on a token's values case and diacritics insensitively,
                                // we need to actually group by their "sort positions" - which is just the index the term would have if all terms would have been sorted
                                // so in essence it's also an "id", but a case-insensitive one.
                                // we could further optimize to not do this step when grouping sensitively by making a specialized instance of the GroupIdHash class
                                // that hashes the token ids instead of the sortpositions in that case.
                                int[] annotationValuesForThisToken = new int[numAnnotations];
                                int[] sortPositions = new int[annotationValuesForThisToken.length];
                                for (int annotationIndex = 0; annotationIndex < numAnnotations; ++annotationIndex) {
                                    int[] tokenValuesThisAnnotation = tokenValuesPerAnnotation.get(annotationIndex);
                                    final int termId = annotationValuesForThisToken[annotationIndex] = tokenValuesThisAnnotation[tokenIndex];
                                    Triple<AnnotationForwardIndex, MatchSensitivity, Terms> currentHitProp = hitProperties.get(annotationIndex);
                                    MatchSensitivity matchSensitivity = currentHitProp.getMiddle();
                                    Terms terms = currentHitProp.getRight();
                                    sortPositions[annotationIndex] = terms.idToSortPosition(termId, matchSensitivity);
                                }
                                final GroupIdHash groupId = new GroupIdHash(annotationValuesForThisToken, sortPositions, metadataValuesForGroup, metadataValuesHash);
                                // compute() runs atomically per key; groupsInThisDocument is confined to this
                                // document's thread, so mutating it inside the lambda needs no extra locking.
                                occurances.compute(groupId, (__, groupSize) -> {
                                    if (groupSize != null) {
                                        groupSize.left += 1;
                                        // second (or more) occurance of these token values in this document
                                        groupSize.right += groupsInThisDocument.add(groupId) ? 1 : 0;
                                        return groupSize;
                                    } else {
                                        // should always return true, but we need to add this group anyway!
                                        return MutablePair.of(1, groupsInThisDocument.add(groupId) ? 1 : 0);
                                    }
                                });
                            }
                        }
                    } catch (IOException e) {
                        throw BlackLabRuntimeException.wrap(e);
                    }
                    return true;
                }).count();
                logger.trace("Number of processed docs: " + numberOfDocsProcessed);
            }
        }
        // Phase 2: convert the raw (term id / metadata) group keys into user-facing PropertyValues.
        Set<PropertyValue> duplicateGroupsDebug = DEBUG ? new HashSet<PropertyValue>() : null;
        List<HitGroup> groups;
        try (final BlockTimer c = BlockTimer.create("Resolve string values for tokens")) {
            final int numMetadataValues = docProperties.size();
            groups = occurances.entrySet().parallelStream().map(e -> {
                final int groupSizeHits = e.getValue().getLeft();
                final int groupSizeDocs = e.getValue().getRight();
                final int[] annotationValues = e.getKey().tokenIds;
                final PropertyValue[] metadataValues = e.getKey().metadataValues;
                // allocate new - is not copied when moving into propertyvaluemultiple
                final PropertyValue[] groupIdAsList = new PropertyValue[numAnnotations + numMetadataValues];
                // Convert all raw values (integers) into their appropriate PropertyValues
                // Taking care to preserve the order of the resultant PropertyValues with the order of the input HitProperties
                int indexInOutput = 0;
                for (Pair<Integer, Boolean> p : originalOrderOfUnpackedProperties) {
                    final int indexInInput = p.getLeft();
                    if (p.getRight()) {
                        // is docprop, add PropertyValue as-is
                        groupIdAsList[indexInOutput++] = metadataValues[indexInInput];
                    } else {
                        // is hitprop, convert value to PropertyValue.
                        Annotation annot = hitProperties.get(indexInInput).getLeft().annotation();
                        MatchSensitivity sens = hitProperties.get(indexInInput).getMiddle();
                        groupIdAsList[indexInOutput++] = new PropertyValueContextWords(index, annot, sens, new int[] { annotationValues[indexInInput] }, false);
                    }
                }
                // A single-property group id stays a plain PropertyValue; multiple properties get wrapped.
                PropertyValue groupId = groupIdAsList.length > 1 ? new PropertyValueMultiple(groupIdAsList) : groupIdAsList[0];
                if (DEBUG) {
                    synchronized (duplicateGroupsDebug) {
                        if (!duplicateGroupsDebug.add(groupId)) {
                            throw new RuntimeException("Identical groups - should never happen");
                        }
                    }
                }
                return new HitGroupWithoutResults(queryInfo, groupId, groupSizeHits, groupSizeDocs, false, false);
            }).collect(Collectors.toList());
        }
        logger.debug("fast path used for grouping");
        // NOTE(review): doc count is truncated to int here; fine as long as it fits in an int — TODO confirm.
        ResultsStats hitsStats = new ResultsStatsStatic(numberOfHitsProcessed.get(), numberOfHitsProcessed.get(), new MaxStats(hitMaxHitsToProcess.get(), hitMaxHitsToProcess.get()));
        ResultsStats docsStats = new ResultsStatsStatic((int) numberOfDocsProcessed, (int) numberOfDocsProcessed, new MaxStats(hitMaxHitsToProcess.get(), hitMaxHitsToProcess.get()));
        return HitGroups.fromList(queryInfo, groups, requestedGroupingProperty, null, null, hitsStats, docsStats);
    } catch (IOException e) {
        throw BlackLabRuntimeException.wrap(e);
    }
}
Also used : Query(org.apache.lucene.search.Query) java.util(java.util) BlackLabIndex(nl.inl.blacklab.search.BlackLabIndex) IntUnaryOperator(java.util.function.IntUnaryOperator) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) SearchHits(nl.inl.blacklab.searches.SearchHits) nl.inl.blacklab.resultproperty(nl.inl.blacklab.resultproperty) Document(org.apache.lucene.document.Document) AnnotatedFieldNameUtil(nl.inl.blacklab.search.indexmetadata.AnnotatedFieldNameUtil) MutablePair(org.apache.commons.lang3.tuple.MutablePair) Pair(org.apache.commons.lang3.tuple.Pair) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) Terms(nl.inl.blacklab.forwardindex.Terms) AnnotationForwardIndex(nl.inl.blacklab.forwardindex.AnnotationForwardIndex) LeafReaderContext(org.apache.lucene.index.LeafReaderContext) Triple(org.apache.commons.lang3.tuple.Triple) BlackLabRuntimeException(nl.inl.blacklab.exceptions.BlackLabRuntimeException) Annotation(nl.inl.blacklab.search.indexmetadata.Annotation) SimpleCollector(org.apache.lucene.search.SimpleCollector) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) IOException(java.io.IOException) MatchSensitivity(nl.inl.blacklab.search.indexmetadata.MatchSensitivity) MatchAllDocsQuery(org.apache.lucene.search.MatchAllDocsQuery) Collectors(java.util.stream.Collectors) Logger(org.apache.logging.log4j.Logger) DocImpl(nl.inl.blacklab.search.DocImpl) LogManager(org.apache.logging.log4j.LogManager) IndexReader(org.apache.lucene.index.IndexReader) BlockTimer(nl.inl.util.BlockTimer) Query(org.apache.lucene.search.Query) MatchAllDocsQuery(org.apache.lucene.search.MatchAllDocsQuery) IntUnaryOperator(java.util.function.IntUnaryOperator) Document(org.apache.lucene.document.Document) SimpleCollector(org.apache.lucene.search.SimpleCollector) LeafReaderContext(org.apache.lucene.index.LeafReaderContext) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) Terms(nl.inl.blacklab.forwardindex.Terms) BlackLabIndex(nl.inl.blacklab.search.BlackLabIndex) 
Triple(org.apache.commons.lang3.tuple.Triple) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) BlockTimer(nl.inl.util.BlockTimer) MutablePair(org.apache.commons.lang3.tuple.MutablePair) BlackLabRuntimeException(nl.inl.blacklab.exceptions.BlackLabRuntimeException) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) MatchSensitivity(nl.inl.blacklab.search.indexmetadata.MatchSensitivity) MutablePair(org.apache.commons.lang3.tuple.MutablePair) Pair(org.apache.commons.lang3.tuple.Pair) AnnotationForwardIndex(nl.inl.blacklab.forwardindex.AnnotationForwardIndex) IOException(java.io.IOException) MatchAllDocsQuery(org.apache.lucene.search.MatchAllDocsQuery) Annotation(nl.inl.blacklab.search.indexmetadata.Annotation) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) IndexReader(org.apache.lucene.index.IndexReader) DocImpl(nl.inl.blacklab.search.DocImpl)

Example 2 with BlackLabIndex

use of nl.inl.blacklab.search.BlackLabIndex in project BlackLab by INL.

the class RequestHandlerDocSnippet method handle.

@Override
public int handle(DataStream ds) throws BlsException {
    // The document pid is everything in the URL path up to the first slash (if any).
    final int slashPos = urlPathInfo.indexOf('/');
    final String docId;
    if (slashPos >= 0) {
        docId = urlPathInfo.substring(0, slashPos);
    } else {
        docId = urlPathInfo;
    }
    if (docId.length() == 0)
        throw new BadRequest("NO_DOC_ID", "Specify document pid.");

    // Resolve the pid to a Lucene document.
    final BlackLabIndex index = blIndex();
    final int luceneDocId = BlsUtils.getDocIdFromPid(index, docId);
    if (luceneDocId < 0)
        throw new NotFound("DOC_NOT_FOUND", "Document with pid '" + docId + "' not found.");
    final Document document = index.doc(luceneDocId).luceneDoc();
    if (document == null)
        throw new InternalServerError("Couldn't fetch document with pid '" + docId + "'.", "INTERR_FETCHING_DOCUMENT_SNIPPET");

    // Two addressing modes: a hit range plus surrounding context, or an absolute word range.
    final boolean isHit;
    final int start;
    final int end;
    final ContextSize wordsAroundHit;
    if (searchParam.containsKey("hitstart")) {
        isHit = true;
        start = searchParam.getInteger("hitstart");
        end = searchParam.getInteger("hitend");
        wordsAroundHit = ContextSize.get(searchParam.getInteger("wordsaroundhit"));
    } else {
        isHit = false;
        start = searchParam.getInteger("wordstart");
        end = searchParam.getInteger("wordend");
        wordsAroundHit = ContextSize.hitOnly();
    }
    if (start < 0 || end < 0 || wordsAroundHit.left() < 0 || wordsAroundHit.right() < 0 || start > end) {
        throw new BadRequest("ILLEGAL_BOUNDARIES", "Illegal word boundaries specified. Please check parameters.");
    }

    // Clamp the snippet to the configured maximum context size, shrinking the context
    // symmetrically around the hit if necessary.
    // NOTE(review): snippetStart/snippetEnd are not used below — looks like getHitOrFragmentInfo
    // applies its own context; confirm whether this clamping is still intended to have an effect.
    int snippetStart = Math.max(0, start - wordsAroundHit.left());
    int snippetEnd = end + wordsAroundHit.right();
    final int maxContextSize = searchMan.config().getParameters().getContextSize().getMax();
    if (snippetEnd - snippetStart > maxContextSize) {
        final int halfWindow = Math.max(0, (maxContextSize - (end - start)) / 2);
        snippetStart = Math.max(0, start - halfWindow);
        snippetEnd = end + halfWindow;
    }

    // Build a single-hit Hits object for the requested range and stream the snippet info.
    final HitsArrays singleHit = new HitsArrays();
    singleHit.add(luceneDocId, start, end);
    final boolean origContent = searchParam.getString("usecontent").equals("orig");
    final Hits hits = Hits.fromList(QueryInfo.create(index), singleHit, null);
    getHitOrFragmentInfo(ds, hits, singleHit.get(0), wordsAroundHit, origContent, !isHit, null,
            new HashSet<>(this.getAnnotationsToWrite()));
    return HTTP_OK;
}
Also used : BadRequest(nl.inl.blacklab.server.exceptions.BadRequest) Hits(nl.inl.blacklab.search.results.Hits) ContextSize(nl.inl.blacklab.search.results.ContextSize) Document(org.apache.lucene.document.Document) InternalServerError(nl.inl.blacklab.server.exceptions.InternalServerError) BlackLabIndex(nl.inl.blacklab.search.BlackLabIndex) NotFound(nl.inl.blacklab.server.exceptions.NotFound) HitsArrays(nl.inl.blacklab.search.results.Hits.HitsArrays)

Example 3 with BlackLabIndex

use of nl.inl.blacklab.search.BlackLabIndex in project BlackLab by INL.

the class RequestHandlerDocs method doResponse.

/**
 * Stream the response for a docs request: a summary block, per-document info
 * (pid, hit count, metadata and up to 5 snippets), and optional facets.
 *
 * NOTE(review): relies on instance state initialized elsewhere (totalDocResults,
 * originalHitsSearch, window, docResults, search, totalTime, searchLogger);
 * descriptions below are inferred from this method alone.
 *
 * @param ds output stream to write the response to
 * @param isViewGroup whether we're viewing the contents of a single group
 * @param annotationsTolist annotations to include in the KWIC context lists
 * @param metadataFieldsToList metadata fields to include in each doc's info
 * @return HTTP status code (HTTP_OK)
 */
private int doResponse(DataStream ds, boolean isViewGroup, Set<Annotation> annotationsTolist, Set<MetadataField> metadataFieldsToList) throws BlsException, InvalidQuery {
    BlackLabIndex blIndex = blIndex();
    boolean includeTokenCount = searchParam.getBoolean("includetokencount");
    long totalTokens = -1;
    if (includeTokenCount) {
        // Determine total number of tokens in result set
        totalTokens = totalDocResults.subcorpusSize().getTokens();
    }
    // Search is done; construct the results object
    ds.startMap();
    // The summary
    ds.startEntry("summary").startMap();
    ResultCount totalHits;
    try {
        // originalHitsSearch is null when these docs didn't originate from a hits search.
        totalHits = originalHitsSearch == null ? null : originalHitsSearch.get();
    } catch (InterruptedException | ExecutionException e) {
        throw RequestHandler.translateSearchException(e);
    }
    ResultCount docsStats = searchParam.docsCount().execute();
    addSummaryCommonFields(ds, searchParam, search.timeUserWaitedMs(), totalTime, null, window.windowStats());
    // A negative total time presumably signals that the count failed or was aborted — TODO confirm.
    boolean countFailed = totalTime < 0;
    if (totalHits == null)
        addNumberOfResultsSummaryDocResults(ds, isViewGroup, docResults, countFailed, null);
    else
        addNumberOfResultsSummaryTotalHits(ds, totalHits, docsStats, countFailed, null);
    if (includeTokenCount)
        ds.entry("tokensInMatchingDocuments", totalTokens);
    ds.startEntry("docFields");
    RequestHandler.dataStreamDocFields(ds, blIndex.metadata());
    ds.endEntry();
    ds.startEntry("metadataFieldDisplayNames");
    RequestHandler.dataStreamMetadataFieldDisplayNames(ds, blIndex.metadata());
    ds.endEntry();
    ds.endMap().endEntry();
    searchLogger.setResultsFound(docsStats.processedSoFar());
    // The hits and document info
    ds.startEntry("docs").startList();
    for (DocResult result : window) {
        ds.startItem("doc").startMap();
        // Find pid
        Document document = result.identity().luceneDoc();
        String pid = getDocumentPid(blIndex, result.identity().id(), document);
        // Combine all
        ds.entry("docPid", pid);
        int numHits = result.size();
        if (numHits > 0)
            ds.entry("numberOfHits", numHits);
        // Doc info (metadata, etc.)
        ds.startEntry("docInfo");
        dataStreamDocumentInfo(ds, blIndex, document, metadataFieldsToList);
        ds.endEntry();
        // Snippets
        // TODO: make num. snippets configurable
        Hits hits2 = result.storedResults().window(0, 5);
        if (hits2.hitsStats().processedAtLeast(1)) {
            ds.startEntry("snippets").startList();
            ContextSettings contextSettings = searchParam.getContextSettings();
            Concordances concordances = null;
            Kwics kwics = null;
            // Either serve concordances from the original content store, or KWICs from the forward index.
            if (contextSettings.concType() == ConcordanceType.CONTENT_STORE)
                concordances = hits2.concordances(contextSettings.size(), ConcordanceType.CONTENT_STORE);
            else
                kwics = hits2.kwics(blIndex.defaultContextSize());
            for (Hit hit : hits2) {
                // TODO: use RequestHandlerDocSnippet.getHitOrFragmentInfo()
                ds.startItem("snippet").startMap();
                if (contextSettings.concType() == ConcordanceType.CONTENT_STORE) {
                    // Add concordance from original XML
                    Concordance c = concordances.get(hit);
                    ds.startEntry("left").plain(c.left()).endEntry().startEntry("match").plain(c.match()).endEntry().startEntry("right").plain(c.right()).endEntry();
                } else {
                    // Add KWIC info
                    Kwic c = kwics.get(hit);
                    ds.startEntry("left").contextList(c.annotations(), annotationsTolist, c.left()).endEntry().startEntry("match").contextList(c.annotations(), annotationsTolist, c.match()).endEntry().startEntry("right").contextList(c.annotations(), annotationsTolist, c.right()).endEntry();
                }
                ds.endMap().endItem();
            }
            ds.endList().endEntry();
        }
        ds.endMap().endItem();
    }
    ds.endList().endEntry();
    if (searchParam.hasFacets()) {
        // Now, group the docs according to the requested facets.
        ds.startEntry("facets");
        dataStreamFacets(ds, totalDocResults, searchParam.facets());
        ds.endEntry();
    }
    ds.endMap();
    return HTTP_OK;
}
Also used : Concordances(nl.inl.blacklab.search.results.Concordances) Hits(nl.inl.blacklab.search.results.Hits) Kwic(nl.inl.blacklab.search.Kwic) ResultCount(nl.inl.blacklab.search.results.ResultCount) Document(org.apache.lucene.document.Document) BlackLabIndex(nl.inl.blacklab.search.BlackLabIndex) Hit(nl.inl.blacklab.search.results.Hit) Kwics(nl.inl.blacklab.search.results.Kwics) Concordance(nl.inl.blacklab.search.Concordance) ContextSettings(nl.inl.blacklab.server.jobs.ContextSettings) ExecutionException(java.util.concurrent.ExecutionException) DocResult(nl.inl.blacklab.search.results.DocResult)

Example 4 with BlackLabIndex

use of nl.inl.blacklab.search.BlackLabIndex in project BlackLab by INL.

the class RequestHandlerExplain method handle.

@Override
public int handle(DataStream ds) throws BlsException {
    final BlackLabIndex index = blIndex();
    final String pattern = searchParam.getString("patt");
    try {
        // Parse the CorpusQL pattern and rewrite it into an executable span query.
        TextPattern parsed = CorpusQueryLanguageParser.parse(pattern);
        BLSpanQuery query = parsed.toQuery(QueryInfo.create(index));
        QueryExplanation explanation = index.explain(query);
        // Stream the response: the pattern plus the query before and after rewriting.
        ds.startMap()
                .entry("textPattern", pattern)
                .entry("originalQuery", explanation.originalQuery())
                .entry("rewrittenQuery", explanation.rewrittenQuery());
        ds.endMap();
        return HTTP_OK;
    } catch (TooManyClauses e) {
        return Response.badRequest(ds, "QUERY_TOO_BROAD", "Query too broad, too many matching terms. Please be more specific.");
    } catch (InvalidQuery e) {
        return Response.badRequest(ds, "PATT_SYNTAX_ERROR", "Syntax error in gapped CorpusQL pattern: " + e.getMessage());
    }
}
Also used : BLSpanQuery(nl.inl.blacklab.search.lucene.BLSpanQuery) TextPattern(nl.inl.blacklab.search.textpattern.TextPattern) InvalidQuery(nl.inl.blacklab.exceptions.InvalidQuery) TooManyClauses(org.apache.lucene.search.BooleanQuery.TooManyClauses) BlackLabIndex(nl.inl.blacklab.search.BlackLabIndex) QueryExplanation(nl.inl.blacklab.search.QueryExplanation)

Example 5 with BlackLabIndex

use of nl.inl.blacklab.search.BlackLabIndex in project BlackLab by INL.

the class RequestHandlerFieldInfo method describeAnnotatedField.

/**
 * Write a description of an annotated field (its annotations, optionally
 * their values, and any subannotations) to the data stream.
 *
 * @param ds output stream to write to
 * @param indexName name of the index, or null to omit it from the output
 * @param fieldDesc the annotated field to describe
 * @param index index the field belongs to
 * @param showValuesFor annotation names for which to list the indexed values
 * @param showSubpropsFor annotation names for which to enumerate subannotations
 *        (only relevant for older indexes; see below)
 */
public static void describeAnnotatedField(DataStream ds, String indexName, AnnotatedField fieldDesc, BlackLabIndex index, Set<String> showValuesFor, Set<String> showSubpropsFor) {
    if (fieldDesc.isDummyFieldToStoreLinkedDocuments())
        // skip this, not really an annotated field, just exists to store linked (metadata) document.
        return;
    ds.startMap();
    if (indexName != null)
        ds.entry("indexName", indexName);
    Annotations annotations = fieldDesc.annotations();
    ds.entry("fieldName", fieldDesc.name()).entry("isAnnotatedField", true).entry("displayName", fieldDesc.displayName()).entry("description", fieldDesc.description()).entry("hasContentStore", fieldDesc.hasContentStore()).entry("hasXmlTags", fieldDesc.hasXmlTags()).entry("hasLengthTokens", fieldDesc.hasLengthTokens());
    ds.entry("mainAnnotation", annotations.main().name());
    ds.startEntry("displayOrder").startList();
    annotations.stream().map(Annotation::name).forEach(id -> ds.item("fieldName", id));
    ds.endList().endEntry();
    ds.startEntry("annotations").startMap();
    for (Annotation annotation : annotations) {
        ds.startAttrEntry("annotation", "name", annotation.name()).startMap();
        AnnotationSensitivity offsetsSensitivity = annotation.offsetsSensitivity();
        String offsetsAlternative = offsetsSensitivity == null ? "" : offsetsSensitivity.sensitivity().luceneFieldSuffix();
        ds.entry("displayName", annotation.displayName()).entry("description", annotation.description()).entry("uiType", annotation.uiType()).entry("hasForwardIndex", annotation.hasForwardIndex()).entry("sensitivity", sensitivitySettingDesc(annotation)).entry("offsetsAlternative", offsetsAlternative).entry("isInternal", annotation.isInternal());
        // Prefer the insensitive variant of the field for enumerating terms, if available.
        AnnotationSensitivity as = annotation.sensitivity(annotation.hasSensitivity(MatchSensitivity.INSENSITIVE) ? MatchSensitivity.INSENSITIVE : MatchSensitivity.SENSITIVE);
        String luceneField = as.luceneField();
        if (annotationMatches(annotation.name(), showValuesFor)) {
            boolean isInlineTagAnnotation = annotation.name().equals(AnnotatedFieldNameUtil.TAGS_ANNOT_NAME);
            ds.startEntry("values").startList();
            final Set<String> terms = new TreeSet<>();
            boolean valueListComplete = collectAnnotationValues(index, luceneField, isInlineTagAnnotation, terms);
            for (String term : terms) {
                ds.item("value", term);
            }
            ds.endList().endEntry();
            ds.entry("valueListComplete", valueListComplete);
        }
        boolean subannotationsStoredWithParent = index.metadata().subannotationsStoredWithParent();
        if (!subannotationsStoredWithParent || showSubpropsFor.contains(annotation.name())) {
            if (subannotationsStoredWithParent) {
                // Older index, where the subannotations are stored in the same Lucene field as their parent annotation.
                // Detecting these requires enumerating all terms, so only do it when asked.
                Map<String, Set<String>> subprops = LuceneUtil.getOldSingleFieldSubprops(index.reader(), luceneField);
                ds.startEntry("subannotations").startMap();
                for (Map.Entry<String, Set<String>> subprop : subprops.entrySet()) {
                    String name = subprop.getKey();
                    Set<String> values = subprop.getValue();
                    ds.startAttrEntry("subannotation", "name", name).startList();
                    for (String value : values) {
                        ds.item("value", value);
                    }
                    ds.endList().endAttrEntry();
                }
                ds.endMap().endEntry();
            } else if (!annotation.subannotationNames().isEmpty()) {
                // Newer index, where the subannotations are stored in their own Lucene fields.
                // Always show these.
                ds.startEntry("subannotations").startList();
                for (String name : annotation.subannotationNames()) {
                    ds.item("subannotation", name);
                }
                ds.endList().endEntry();
            }
        }
        if (annotation.isSubannotation()) {
            ds.entry("parentAnnotation", annotation.parentAnnotation().name());
        }
        ds.endMap().endAttrEntry();
    }
    ds.endMap().endEntry();
    ds.endMap();
}

/**
 * Collect up to MAX_FIELD_VALUES distinct terms for an annotation's Lucene field.
 *
 * Replaces two previously duplicated term-collection lambdas that differed
 * only in which terms they excluded: for the inline-tag annotation, terms
 * starting with "@" (attribute terms); for all other annotations, terms
 * containing the subannotation separator.
 *
 * @param index index to read terms from
 * @param luceneField Lucene field to enumerate
 * @param isInlineTagAnnotation true if this is the special inline-tags annotation
 * @param terms (out) set the accepted terms are added to
 * @return true if all terms were collected, false if truncated at MAX_FIELD_VALUES
 */
private static boolean collectAnnotationValues(BlackLabIndex index, String luceneField, boolean isInlineTagAnnotation, final Set<String> terms) {
    // Array because we have to mutate it from the closure.
    boolean[] valueListComplete = { true };
    LuceneUtil.getFieldTerms(index.reader(), luceneField, null, term -> {
        boolean excluded = isInlineTagAnnotation
                ? term.startsWith("@")
                : term.contains(AnnotatedFieldNameUtil.SUBANNOTATION_SEPARATOR);
        if (!excluded && !terms.contains(term)) {
            if (terms.size() >= MAX_FIELD_VALUES) {
                valueListComplete[0] = false;
                return false; // stop enumerating
            }
            terms.add(term);
        }
        return true; // keep enumerating
    });
    return valueListComplete[0];
}
Also used : BlackLabIndex(nl.inl.blacklab.search.BlackLabIndex) BlackLabServer(nl.inl.blacklab.server.BlackLabServer) AnnotationSensitivity(nl.inl.blacklab.search.indexmetadata.AnnotationSensitivity) TreeSet(java.util.TreeSet) ArrayList(java.util.ArrayList) IndexMetadata(nl.inl.blacklab.search.indexmetadata.IndexMetadata) HashSet(java.util.HashSet) Annotations(nl.inl.blacklab.search.indexmetadata.Annotations) HttpServletRequest(javax.servlet.http.HttpServletRequest) BadRequest(nl.inl.blacklab.server.exceptions.BadRequest) AnnotatedFieldNameUtil(nl.inl.blacklab.search.indexmetadata.AnnotatedFieldNameUtil) BlsException(nl.inl.blacklab.server.exceptions.BlsException) Map(java.util.Map) DataStream(nl.inl.blacklab.server.datastream.DataStream) ParseException(java.text.ParseException) Collator(java.text.Collator) MetadataField(nl.inl.blacklab.search.indexmetadata.MetadataField) User(nl.inl.blacklab.server.jobs.User) Annotation(nl.inl.blacklab.search.indexmetadata.Annotation) RuleBasedCollator(java.text.RuleBasedCollator) LuceneUtil(nl.inl.util.LuceneUtil) Set(java.util.Set) MatchSensitivity(nl.inl.blacklab.search.indexmetadata.MatchSensitivity) ValueListComplete(nl.inl.blacklab.search.indexmetadata.ValueListComplete) List(java.util.List) AnnotatedField(nl.inl.blacklab.search.indexmetadata.AnnotatedField) Comparator(java.util.Comparator) BlackLabIndexImpl(nl.inl.blacklab.search.BlackLabIndexImpl) TreeSet(java.util.TreeSet) HashSet(java.util.HashSet) Set(java.util.Set) Annotation(nl.inl.blacklab.search.indexmetadata.Annotation) Annotations(nl.inl.blacklab.search.indexmetadata.Annotations) TreeSet(java.util.TreeSet) AnnotationSensitivity(nl.inl.blacklab.search.indexmetadata.AnnotationSensitivity) Map(java.util.Map)

Aggregations

BlackLabIndex (nl.inl.blacklab.search.BlackLabIndex)19 Document (org.apache.lucene.document.Document)8 Annotation (nl.inl.blacklab.search.indexmetadata.Annotation)6 BadRequest (nl.inl.blacklab.server.exceptions.BadRequest)6 ArrayList (java.util.ArrayList)5 MetadataField (nl.inl.blacklab.search.indexmetadata.MetadataField)5 Hits (nl.inl.blacklab.search.results.Hits)5 File (java.io.File)4 Map (java.util.Map)4 AnnotatedField (nl.inl.blacklab.search.indexmetadata.AnnotatedField)4 IndexMetadata (nl.inl.blacklab.search.indexmetadata.IndexMetadata)4 HashSet (java.util.HashSet)3 List (java.util.List)3 Set (java.util.Set)3 Kwic (nl.inl.blacklab.search.Kwic)3 MatchSensitivity (nl.inl.blacklab.search.indexmetadata.MatchSensitivity)3 Query (org.apache.lucene.search.Query)3 HashMap (java.util.HashMap)2 TreeSet (java.util.TreeSet)2 HttpServletRequest (javax.servlet.http.HttpServletRequest)2