Example 1 with OpenBitSet

Use of org.apache.lucene.util.OpenBitSet in project stanbol by apache.

In the class FstLinkingEngine, method tag.

/**
 * Uses the {@link Corpus} to tag the {@link AnalysedText} and adds
 * tagging results to the passed tag map.
 * @param content the content to link
 * @param at the AnalyzedText; not required for {@link LinkingModeEnum#PLAIN}
 * @param session the tagging session of the text
 * @param corpus the corpus of the session to tag the content with
 * @param tags the Tags map used to store the tagging results
 * @return the time in milliseconds spent in the tag callback.
 * @throws IOException on any error while accessing the {@link SolrCore}
 */
private int tag(final String content, final AnalysedText at, final TaggingSession session, final Corpus corpus, final Map<int[], Tag> tags) throws IOException {
    final OpenBitSet matchDocIdsBS = new OpenBitSet(session.getSearcher().maxDoc());
    TokenStream baseTokenStream = corpus.getTaggingAnalyzer().tokenStream("", new CharSequenceReader(content));
    final TokenStream tokenStream;
    final TagClusterReducer reducer;
    log.debug(" ... set up TokenStream and TagClusterReducer for linking mode {}", linkingMode);
    switch(linkingMode) {
        case PLAIN:
            // will link all tokens and search longest dominant right
            tokenStream = baseTokenStream;
            reducer = TagClusterReducer.LONGEST_DOMINANT_RIGHT;
            break;
        case NER:
            // this uses the NamedEntityTokenFilter as tokenStream and a
            // combination with the longest dominant right as reducer
            NamedEntityTokenFilter neTokenFilter = new NamedEntityTokenFilter(baseTokenStream, at, session.getLanguage(), neTypeMappings.keySet(), session.entityMentionTypes);
            tokenStream = neTokenFilter;
            reducer = new ChainedTagClusterReducer(neTokenFilter, TagClusterReducer.LONGEST_DOMINANT_RIGHT);
            break;
        case LINKABLE_TOKEN:
            // this uses the LinkableTokenFilter as tokenStream
            LinkableTokenFilter linkableTokenFilter = new LinkableTokenFilter(baseTokenStream, at, session.getLanguage(), tpConfig.getConfiguration(session.getLanguage()), elConfig.getMinChunkMatchScore(), elConfig.getMinFoundTokens());
            // NOTE that the LinkableTokenFilter implements longest dominant right
            // based on the matchable span of tags (instead of the whole span).
            reducer = new ChainedTagClusterReducer(linkableTokenFilter, TagClusterReducer.ALL);
            tokenStream = linkableTokenFilter;
            break;
        default:
            throw new IllegalStateException("Unrecognized LinkingMode '" + linkingMode + "'! Please adapt implementation to changed Enumeration!");
    }
    log.debug(" - tokenStream: {}", tokenStream);
    log.debug(" - reducer: {} (class: {})", reducer, reducer.getClass().getName());
    // Now process the document
    final long[] time = new long[] { 0 };
    new Tagger(corpus.getFst(), tokenStream, reducer, session.isSkipAltTokens()) {

        @Override
        protected void tagCallback(int startOffset, int endOffset, long docIdsKey) {
            long start = System.nanoTime();
            if (log.isTraceEnabled()) {
                log.trace(" > tagCallback for {}", content.subSequence(startOffset, endOffset));
            }
            int[] span = new int[] { startOffset, endOffset };
            Tag tag = tags.get(span);
            if (tag == null) {
                tag = new Tag(span);
                tags.put(span, tag);
            }
            // createMatches(..) builds the Match set and also flags the matched doc ids in matchDocIdsBS
            Set<Match> matches = createMatches(docIdsKey);
            if (log.isTraceEnabled()) {
                log.trace("  - {} matches", matches.size());
            }
            tag.addIds(matches);
            long dif = System.nanoTime() - start;
            time[0] = time[0] + dif;
        }

        // NOTE: We cannot use a cache, because we need to create different
        // Match instances even for the same 'docIdsKey'. This is because
        // the same result list might get generated for different
        // surface forms in the text (e.g. if the SolrIndex is case
        // insensitive, but the linking does consider the case when
        // calculating the score). If we used such a cache, the same Match
        // instances would be reused for several occurrences in the text
        // and Match#getScore() values would get overridden when
        // processing those multiple occurrences.
        // Map<Long,Set<Match>> docIdsListCache = new HashMap<Long,Set<Match>>(1024);
        private Set<Match> createMatches(long docIdsKey) {
            IntsRef docIds = lookupDocIds(docIdsKey);
            Set<Match> matches = new HashSet<Match>(docIds.length);
            for (int i = docIds.offset; i < docIds.offset + docIds.length; i++) {
                int docId = docIds.ints[i];
                // also flag the docId in the matchDocIdsBS bitset
                matchDocIdsBS.set(docId);
                // translate the Lucene docId into a Match for this session
                matches.add(session.createMatch(docId));
            }
            return matches;
        }
    }.process();
    return (int) (time[0] / 1000000);
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) OpenBitSet(org.apache.lucene.util.OpenBitSet) Set(java.util.Set) HashSet(java.util.HashSet) Tagger(org.opensextant.solrtexttagger.Tagger) TagClusterReducer(org.opensextant.solrtexttagger.TagClusterReducer) CharSequenceReader(org.apache.commons.io.input.CharSequenceReader) NerTag(org.apache.stanbol.enhancer.nlp.ner.NerTag) IntsRef(org.apache.lucene.util.IntsRef)
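
For orientation, a minimal sketch (not part of the Stanbol sources) of the OpenBitSet pattern used above: a bit set sized to the searcher's maxDoc() in which matching Lucene document ids are flagged. The maxDoc value and the document ids below are illustrative assumptions.

import org.apache.lucene.util.OpenBitSet;

public class OpenBitSetSketch {

    public static void main(String[] args) {
        // assume the index holds 1000 documents (illustrative stand-in for searcher.maxDoc())
        long maxDoc = 1000;
        OpenBitSet matchDocIds = new OpenBitSet(maxDoc);

        // flag a few hypothetical matching document ids, as createMatches(..) does above
        matchDocIds.set(7);
        matchDocIds.set(42);
        matchDocIds.set(999);

        // cardinality() reports how many distinct documents were flagged
        System.out.println("matched docs: " + matchDocIds.cardinality());

        // iterate all flagged ids via nextSetBit(..); -1 signals that no further bit is set
        for (long docId = matchDocIds.nextSetBit(0L); docId >= 0; docId = matchDocIds.nextSetBit(docId + 1)) {
            System.out.println("doc id: " + docId);
        }
    }
}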

Example 2 with OpenBitSet

Use of org.apache.lucene.util.OpenBitSet in project zm-mailbox by Zimbra.

In the class TermsFilter, method getDocIdSet.

/**
 * (non-Javadoc)
 * @see org.apache.lucene.search.Filter#getDocIdSet(org.apache.lucene.index.IndexReader)
 */
@Override
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
    OpenBitSet result = new OpenBitSet(reader.maxDoc());
    try (TermDocs td = reader.termDocs()) {
        for (Iterator<Term> iter = terms.iterator(); iter.hasNext(); ) {
            Term term = iter.next();
            td.seek(term);
            while (td.next()) {
                result.set(td.doc());
            }
        }
    }
    return result;
}
Also used : OpenBitSet(org.apache.lucene.util.OpenBitSet) TermDocs(org.apache.lucene.index.TermDocs) Term(org.apache.lucene.index.Term)
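
As a usage note, the sketch below (not taken from zm-mailbox) shows one way the DocIdSet returned by this filter could be consumed through Lucene's DocIdSetIterator; the already-opened IndexReader and the populated TermsFilter instance are assumptions here.

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;

public class TermsFilterUsageSketch {

    // TermsFilter is the class shown above; it is assumed to be in scope
    static void printMatches(TermsFilter filter, IndexReader reader) throws IOException {
        // the OpenBitSet built in getDocIdSet(..) is returned as a DocIdSet
        DocIdSet docIdSet = filter.getDocIdSet(reader);
        DocIdSetIterator it = docIdSet.iterator();
        int docId;
        // nextDoc() walks the flagged document ids in increasing order
        while ((docId = it.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
            System.out.println("matching doc: " + docId);
        }
    }
}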

Aggregations

OpenBitSet (org.apache.lucene.util.OpenBitSet): 2 uses
HashSet (java.util.HashSet): 1 use
Set (java.util.Set): 1 use
CharSequenceReader (org.apache.commons.io.input.CharSequenceReader): 1 use
TokenStream (org.apache.lucene.analysis.TokenStream): 1 use
Term (org.apache.lucene.index.Term): 1 use
TermDocs (org.apache.lucene.index.TermDocs): 1 use
IntsRef (org.apache.lucene.util.IntsRef): 1 use
NerTag (org.apache.stanbol.enhancer.nlp.ner.NerTag): 1 use
TagClusterReducer (org.opensextant.solrtexttagger.TagClusterReducer): 1 use
Tagger (org.opensextant.solrtexttagger.Tagger): 1 use