Use of org.opensextant.solrtexttagger.TagClusterReducer in project Stanbol by Apache.
From the class FstLinkingEngine, method tag.
/**
 * Uses the {@link Corpus} to tag the {@link AnalysedText} and adds the
 * tagging results to the passed tag map.
* @param content the content to link
 * @param at the AnalysedText; not required if the linking mode is {@link LinkingModeEnum#PLAIN}
* @param session the tagging session of the text
 * @param corpus the corpus of the session to tag the content with
* @param tags the Tags map used to store the tagging results
* @return the time in milliseconds spent in the tag callback.
* @throws IOException on any error while accessing the {@link SolrCore}
*/
private int tag(final String content, final AnalysedText at, final TaggingSession session,
        final Corpus corpus, final Map<int[], Tag> tags) throws IOException {
    final OpenBitSet matchDocIdsBS = new OpenBitSet(session.getSearcher().maxDoc());
    TokenStream baseTokenStream = corpus.getTaggingAnalyzer().tokenStream("",
        new CharSequenceReader(content));
    final TokenStream tokenStream;
    final TagClusterReducer reducer;
    log.debug(" ... set up TokenStream and TagClusterReducer for linking mode {}", linkingMode);
    switch (linkingMode) {
        case PLAIN:
            // will link all tokens and search longest dominant right
            tokenStream = baseTokenStream;
            reducer = TagClusterReducer.LONGEST_DOMINANT_RIGHT;
            break;
        case NER:
            // this uses the NamedEntityTokenFilter as tokenStream and a
            // combination with the longest dominant right as reducer
            NamedEntityTokenFilter neTokenFilter = new NamedEntityTokenFilter(baseTokenStream,
                at, session.getLanguage(), neTypeMappings.keySet(), session.entityMentionTypes);
            tokenStream = neTokenFilter;
            reducer = new ChainedTagClusterReducer(neTokenFilter,
                TagClusterReducer.LONGEST_DOMINANT_RIGHT);
            break;
        case LINKABLE_TOKEN:
            // this uses the LinkableTokenFilter as tokenStream
            LinkableTokenFilter linkableTokenFilter = new LinkableTokenFilter(baseTokenStream,
                at, session.getLanguage(), tpConfig.getConfiguration(session.getLanguage()),
                elConfig.getMinChunkMatchScore(), elConfig.getMinFoundTokens());
            // NOTE that the LinkableTokenFilter implements longest dominant right
            // based on the matchable span of tags (instead of the whole span).
            reducer = new ChainedTagClusterReducer(linkableTokenFilter, TagClusterReducer.ALL);
            tokenStream = linkableTokenFilter;
            break;
        default:
            throw new IllegalStateException("Unrecognized LinkingMode '" + linkingMode
                + "'! Please adapt implementation to changed Enumeration!");
    }
log.debug(" - tokenStream: {}", tokenStream);
log.debug(" - reducer: {} (class: {})", reducer, reducer.getClass().getName());
// Now process the document
final long[] time = new long[] { 0 };
new Tagger(corpus.getFst(), tokenStream, reducer, session.isSkipAltTokens()) {
@Override
protected void tagCallback(int startOffset, int endOffset, long docIdsKey) {
long start = System.nanoTime();
if (log.isTraceEnabled()) {
log.trace(" > tagCallback for {}", content.subSequence(startOffset, endOffset));
}
int[] span = new int[] { startOffset, endOffset };
Tag tag = tags.get(span);
if (tag == null) {
tag = new Tag(span);
tags.put(span, tag);
}
// below caches, and also flags matchDocIdsBS
Set<Match> matches = createMatches(docIdsKey);
if (log.isTraceEnabled()) {
log.trace(" - {} matches", matches.size());
}
tag.addIds(matches);
long dif = System.nanoTime() - start;
time[0] = time[0] + dif;
}
// NOTE: We can not use a cache, because we need to create different
// Match instances even for the same 'docIdsKey'. This is because
// the same result list might get generated for different
// surface forms in the text (e.g. if the SolrIndex is case
// insensitive, but the linking does consider the case when
// calculating the score). If we would use this cache Match
// instances would be used for several occurrences in the text
// and Match#getScore() values would get overridden when
// processing those multiple occurrences.
// Map<Long,Set<Match>> docIdsListCache = new HashMap<Long,Set<Match>>(1024);
private Set<Match> createMatches(long docIdsKey) {
IntsRef docIds = lookupDocIds(docIdsKey);
Set<Match> matches = new HashSet<Match>(docIds.length);
for (int i = docIds.offset; i < docIds.offset + docIds.length; i++) {
int docId = docIds.ints[i];
// also, flip docid in bitset
matchDocIdsBS.set(docId);
// translates here
matches.add(session.createMatch(docId));
}
return matches;
}
}.process();
return (int) (time[0] / 1000000);
}
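
Since int[] does not provide content-based equals()/hashCode(), the tags map passed into this method presumably needs to compare spans by content (for example a TreeMap ordered by a span comparator); with a plain HashMap, the tags.get(span) lookup in tagCallback would never find a Tag created earlier for the same offsets. A minimal caller sketch under that assumption follows; the comparator and the content, at, session and corpus variables (as well as the reuse of the class logger) are hypothetical illustrations, not taken from the Stanbol source:

// requires java.util.Comparator, java.util.Map, java.util.TreeMap
Map<int[], Tag> tags = new TreeMap<int[], Tag>(new Comparator<int[]>() {
    @Override
    public int compare(int[] a, int[] b) {
        // order spans by start offset, then end offset, so that two int[]
        // arrays with equal content resolve to the same map entry
        int c = Integer.compare(a[0], b[0]);
        return c != 0 ? c : Integer.compare(a[1], b[1]);
    }
});
int callbackMillis = tag(content, at, session, corpus, tags);
log.debug("spent {} ms in tagCallback for {} tagged spans", callbackMillis, tags.size());

In the engine itself the map is created by the calling code and may be reused across several tag() invocations (one per corpus), which is why tagCallback merges new matches into an existing Tag instead of always creating a fresh one.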