Use of org.apache.lucene.util.IntsRef in project stanbol by apache.
Class FstLinkingEngine, method tag:
/**
* Uses the {@link Corpus} to tag the {@link AnalysedText} and adds
* tagging results to the parsed tag map.
* @param content the content to link
* @param at the AnalysedText; not required in {@link LinkingModeEnum#PLAIN} mode
* @param session the tagging session of the text
* @param corpus the corpus of the session to tag the content with
* @param tags the Tags map used to store the tagging results
* @return the time in milliseconds spent in the tag callback.
* @throws IOException on any error while accessing the {@link SolrCore}
*/
private int tag(final String content, final AnalysedText at, final TaggingSession session, final Corpus corpus, final Map<int[], Tag> tags) throws IOException {
final OpenBitSet matchDocIdsBS = new OpenBitSet(session.getSearcher().maxDoc());
TokenStream baseTokenStream = corpus.getTaggingAnalyzer().tokenStream("", new CharSequenceReader(content));
final TokenStream tokenStream;
final TagClusterReducer reducer;
log.debug(" ... set up TokenStream and TagClusterReducer for linking mode {}", linkingMode);
switch(linkingMode) {
case PLAIN:
//will link all tokens and search longest dominant right
tokenStream = baseTokenStream;
reducer = TagClusterReducer.LONGEST_DOMINANT_RIGHT;
break;
case NER:
//this uses the NamedEntityTokenFilter as tokenStream and a
//combination with the longest dominant right as reducer
NamedEntityTokenFilter neTokenFilter = new NamedEntityTokenFilter(baseTokenStream, at, session.getLanguage(), neTypeMappings.keySet(), session.entityMentionTypes);
tokenStream = neTokenFilter;
reducer = new ChainedTagClusterReducer(neTokenFilter, TagClusterReducer.LONGEST_DOMINANT_RIGHT);
break;
case LINKABLE_TOKEN:
//this uses the LinkableTokenFilter as tokenStream
LinkableTokenFilter linkableTokenFilter = new LinkableTokenFilter(baseTokenStream, at, session.getLanguage(), tpConfig.getConfiguration(session.getLanguage()), elConfig.getMinChunkMatchScore(), elConfig.getMinFoundTokens());
//NOTE that the LinkableTokenFilter implements longest dominant right
// based on the matchable span of tags (instead of the whole span).
reducer = new ChainedTagClusterReducer(linkableTokenFilter, TagClusterReducer.ALL);
tokenStream = linkableTokenFilter;
break;
default:
throw new IllegalStateException("Unrecognized LinkingMode '" + linkingMode + "'! Please adapt implementation to changed Enumeration!");
}
log.debug(" - tokenStream: {}", tokenStream);
log.debug(" - reducer: {} (class: {})", reducer, reducer.getClass().getName());
//Now process the document
final long[] time = new long[] { 0 };
new Tagger(corpus.getFst(), tokenStream, reducer, session.isSkipAltTokens()) {
@Override
protected void tagCallback(int startOffset, int endOffset, long docIdsKey) {
long start = System.nanoTime();
if (log.isTraceEnabled()) {
log.trace(" > tagCallback for {}", content.subSequence(startOffset, endOffset));
}
int[] span = new int[] { startOffset, endOffset };
Tag tag = tags.get(span);
if (tag == null) {
tag = new Tag(span);
tags.put(span, tag);
}
// below caches, and also flags matchDocIdsBS
Set<Match> matches = createMatches(docIdsKey);
if (log.isTraceEnabled()) {
log.trace(" - {} matches", matches.size());
}
tag.addIds(matches);
long dif = System.nanoTime() - start;
time[0] = time[0] + dif;
}
//NOTE: We can not use a cache, because we need to create different
// Match instances even for the same 'docIdsKey'. This is because
// the same result list might get generated for different
// surface forms in the text (e.g. if the SolrIndex is case
// insensitive, but the linking does consider the case when
// calculating the score). If we would use this cache Match
// instances would be used for several occurrences in the text
// and Match#getScore() values would get overridden when
// processing those multiple occurrences.
//Map<Long,Set<Match>> docIdsListCache = new HashMap<Long,Set<Match>>(1024);
private Set<Match> createMatches(long docIdsKey) {
IntsRef docIds = lookupDocIds(docIdsKey);
Set<Match> matches = new HashSet<Match>(docIds.length);
for (int i = docIds.offset; i < docIds.offset + docIds.length; i++) {
int docId = docIds.ints[i];
// also, flip docid in bitset
matchDocIdsBS.set(docId);
// translate the Lucene docId to a Match via the session
matches.add(session.createMatch(docId));
}
return matches;
}
}.process();
return (int) (time[0] / 1000000);
}
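The createMatches(..) helper above uses the canonical IntsRef access pattern: an IntsRef is only a view (ints, offset, length) onto a shared int[], so loops have to start at offset and stop at offset + length. A minimal, self-contained sketch of that pattern with made-up doc ids:
import org.apache.lucene.util.IntsRef;

// Sketch: iterating an IntsRef slice the same way createMatches(..) does.
// The values below are invented; only the offset/length handling matters.
public class IntsRefIterationSketch {
    public static void main(String[] args) {
        int[] backing = new int[] { 7, 11, 42, 99 };
        IntsRef docIds = new IntsRef(backing, 1, 2); // view on {11, 42}
        for (int i = docIds.offset; i < docIds.offset + docIds.length; i++) {
            int docId = docIds.ints[i];
            System.out.println("docId = " + docId); // prints 11, then 42
        }
    }
}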
Use of org.apache.lucene.util.IntsRef in project elasticsearch by elastic.
Class XAnalyzingSuggester, method lookup:
@Override
public List<LookupResult> lookup(final CharSequence key, Set<BytesRef> contexts, boolean onlyMorePopular, int num) {
assert num > 0;
if (onlyMorePopular) {
throw new IllegalArgumentException("this suggester only works with onlyMorePopular=false");
}
if (fst == null) {
return Collections.emptyList();
}
//System.out.println("lookup key=" + key + " num=" + num);
for (int i = 0; i < key.length(); i++) {
if (key.charAt(i) == holeCharacter) {
throw new IllegalArgumentException("lookup key cannot contain HOLE character U+001E; this character is reserved");
}
if (key.charAt(i) == sepLabel) {
throw new IllegalArgumentException("lookup key cannot contain unit separator character U+001F; this character is reserved");
}
}
final BytesRef utf8Key = new BytesRef(key);
try {
Automaton lookupAutomaton = toLookupAutomaton(key);
final CharsRefBuilder spare = new CharsRefBuilder();
//System.out.println(" now intersect exactFirst=" + exactFirst);
// Intersect automaton w/ suggest wFST and get all
// prefix starting nodes & their outputs:
//final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst);
//System.out.println(" prefixPaths: " + prefixPaths.size());
BytesReader bytesReader = fst.getBytesReader();
FST.Arc<Pair<Long, BytesRef>> scratchArc = new FST.Arc<>();
final List<LookupResult> results = new ArrayList<>();
List<FSTUtil.Path<Pair<Long, BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(convertAutomaton(lookupAutomaton), fst);
if (exactFirst) {
int count = 0;
for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
if (fst.findTargetArc(endByte, path.fstNode, scratchArc, bytesReader) != null) {
// This node has END_BYTE arc leaving, meaning it's an
// "exact" match:
count++;
}
}
// Searcher just to find the single exact only
// match, if present:
Util.TopNSearcher<Pair<Long, BytesRef>> searcher;
searcher = new Util.TopNSearcher<>(fst, count * maxSurfaceFormsPerAnalyzedForm, count * maxSurfaceFormsPerAnalyzedForm, weightComparator);
// add start paths for each exact-match node found above:
for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
if (fst.findTargetArc(endByte, path.fstNode, scratchArc, bytesReader) != null) {
// This node has END_BYTE arc leaving, meaning it's an
// "exact" match:
searcher.addStartPaths(scratchArc, fst.outputs.add(path.output, scratchArc.output), false, path.input);
}
}
Util.TopResults<Pair<Long, BytesRef>> completions = searcher.search();
// linear scan over the exact completions (bounded by count * maxSurfaceFormsPerAnalyzedForm) to find the one with the input's surface form:
for (Result<Pair<Long, BytesRef>> completion : completions) {
BytesRef output2 = completion.output.output2;
if (sameSurfaceForm(utf8Key, output2)) {
results.add(getLookupResult(completion.output.output1, output2, spare));
break;
}
}
if (results.size() == num) {
// That was quick:
return results;
}
}
Util.TopNSearcher<Pair<Long, BytesRef>> searcher;
searcher = new Util.TopNSearcher<Pair<Long, BytesRef>>(fst, num - results.size(), num * maxAnalyzedPathsForOneInput, weightComparator) {
private final Set<BytesRef> seen = new HashSet<>();
@Override
protected boolean acceptResult(IntsRef input, Pair<Long, BytesRef> output) {
// dedup: when the input analyzes to a graph we can get duplicate surface forms:
if (seen.contains(output.output2)) {
return false;
}
seen.add(output.output2);
if (!exactFirst) {
return true;
} else {
// in exactFirst mode, don't accept the exact surface form again as that would create duplicate results:
if (sameSurfaceForm(utf8Key, output.output2)) {
// we have already found it in the first search:
assert results.size() == 1;
return false;
} else {
return true;
}
}
}
};
prefixPaths = getFullPrefixPaths(prefixPaths, lookupAutomaton, fst);
for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
searcher.addStartPaths(path.fstNode, path.output, true, path.input);
}
TopResults<Pair<Long, BytesRef>> completions = searcher.search();
for (Result<Pair<Long, BytesRef>> completion : completions) {
LookupResult result = getLookupResult(completion.output.output1, completion.output.output2, spare);
// TODO: for fuzzy case would be nice to return
// how many edits were required
//System.out.println(" result=" + result);
results.add(result);
if (results.size() == num) {
// in the exactFirst=true case the search may produce one extra path; stop once we have num results
break;
}
}
return results;
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
}
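In the anonymous TopNSearcher above, acceptResult(IntsRef input, ...) receives the candidate's FST path as an IntsRef of unsigned byte values. A small hedged sketch of how such a path can be turned back into a BytesRef with Util.toBytesRef; the sample path is invented and the class is not part of XAnalyzingSuggester:
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.fst.Util;

// Sketch: pack an IntsRef of unsigned byte values into a BytesRef.
// The sample path below is the UTF-8 bytes of "foo".
public class FstPathSketch {
    public static void main(String[] args) {
        IntsRef input = new IntsRef(new int[] { 0x66, 0x6f, 0x6f }, 0, 3);
        BytesRefBuilder scratch = new BytesRefBuilder();
        BytesRef asBytes = Util.toBytesRef(input, scratch);
        System.out.println(asBytes.utf8ToString()); // prints "foo"
    }
}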
Use of org.apache.lucene.util.IntsRef in project elasticsearch by elastic.
Class XFuzzySuggester, method toLevenshteinAutomata:
Automaton toLevenshteinAutomata(Automaton automaton) {
List<Automaton> subs = new ArrayList<>();
FiniteStringsIterator finiteStrings = new FiniteStringsIterator(automaton);
for (IntsRef string; (string = finiteStrings.next()) != null; ) {
if (string.length <= nonFuzzyPrefix || string.length < minFuzzyLength) {
subs.add(Automata.makeString(string.ints, string.offset, string.length));
} else {
int[] ints = new int[string.length - nonFuzzyPrefix];
System.arraycopy(string.ints, string.offset + nonFuzzyPrefix, ints, 0, ints.length);
// TODO: maybe add alphaMin to LevenshteinAutomata,
// and pass 1 instead of 0? We probably don't want
// to allow the trailing dedup bytes to be
// edited... but then 0 byte is "in general" allowed
// on input (but not in UTF8).
LevenshteinAutomata lev = new LevenshteinAutomata(ints, unicodeAware ? Character.MAX_CODE_POINT : 255, transpositions);
subs.add(lev.toAutomaton(maxEdits, UnicodeUtil.newString(string.ints, string.offset, nonFuzzyPrefix)));
}
}
if (subs.isEmpty()) {
// matches nothing
return Automata.makeEmpty();
} else if (subs.size() == 1) {
// no synonyms or anything: just a single path through the tokenstream
return subs.get(0);
} else {
// multiple paths: this is really scary! is it slow?
// maybe we should not do this and throw UOE?
Automaton a = Operations.union(subs);
// this only happens if you have multiple paths anyway (e.g. synonyms)
return Operations.determinize(a, DEFAULT_MAX_DETERMINIZED_STATES);
}
}
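toLevenshteinAutomata(..) relies on FiniteStringsIterator handing back every accepted path of the automaton as an IntsRef of code points (the iterator reuses its buffer between calls). A standalone sketch of that enumeration, assuming the same Lucene generation as the snippet above; the example strings are made up:
import java.util.Arrays;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.FiniteStringsIterator;
import org.apache.lucene.util.automaton.Operations;

// Sketch: enumerate the finite strings of a small automaton, the same pattern
// toLevenshteinAutomata(..) applies to the analyzed lookup automaton.
public class FiniteStringsSketch {
    public static void main(String[] args) {
        Automaton a = Operations.determinize(
                Operations.union(Arrays.asList(
                        Automata.makeString("wifi"),
                        Automata.makeString("wi fi"))),
                Operations.DEFAULT_MAX_DETERMINIZED_STATES);
        FiniteStringsIterator finiteStrings = new FiniteStringsIterator(a);
        for (IntsRef string; (string = finiteStrings.next()) != null; ) {
            // string.ints holds code points; convert back to a String for display
            System.out.println(UnicodeUtil.newString(string.ints, string.offset, string.length));
        }
    }
}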
Use of org.apache.lucene.util.IntsRef in project lucene-solr by apache.
Class MemoryDocValuesProducer, method getSortedNonIterator:
private LegacySortedDocValues getSortedNonIterator(FieldInfo field) throws IOException {
final FSTEntry entry = fsts.get(field.name);
if (entry.numOrds == 0) {
return DocValues.emptyLegacySorted();
}
FST<Long> instance;
synchronized (this) {
instance = fstInstances.get(field.name);
if (instance == null) {
IndexInput data = this.data.clone();
data.seek(entry.offset);
instance = new FST<>(data, PositiveIntOutputs.getSingleton());
if (!merging) {
ramBytesUsed.addAndGet(instance.ramBytesUsed());
fstInstances.put(field.name, instance);
}
}
}
final LegacyNumericDocValues docToOrd = getNumericNonIterator(field);
final FST<Long> fst = instance;
// per-thread resources
final BytesReader in = fst.getBytesReader();
final Arc<Long> firstArc = new Arc<>();
final Arc<Long> scratchArc = new Arc<>();
final IntsRefBuilder scratchInts = new IntsRefBuilder();
final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<>(fst);
return new LegacySortedDocValues() {
final BytesRefBuilder term = new BytesRefBuilder();
@Override
public int getOrd(int docID) {
return (int) docToOrd.get(docID);
}
@Override
public BytesRef lookupOrd(int ord) {
try {
in.setPosition(0);
fst.getFirstArc(firstArc);
IntsRef output = Util.getByOutput(fst, ord, in, firstArc, scratchArc, scratchInts);
return Util.toBytesRef(output, term);
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
}
@Override
public int lookupTerm(BytesRef key) {
try {
InputOutput<Long> o = fstEnum.seekCeil(key);
if (o == null) {
return -getValueCount() - 1;
} else if (o.input.equals(key)) {
return o.output.intValue();
} else {
return (int) -o.output - 1;
}
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
}
@Override
public int getValueCount() {
return (int) entry.numOrds;
}
@Override
public TermsEnum termsEnum() {
return new FSTTermsEnum(fst);
}
};
}
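lookupOrd(..) above works because Util.getByOutput walks the FST by output value and returns the matching input path as an IntsRef, which Util.toBytesRef converts back into the term bytes. A condensed sketch of that ord-to-term round trip, assuming the same Lucene generation as the snippet (Builder was replaced by FSTCompiler in later releases); the terms and ordinals are invented:
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;

// Sketch: build an FST<Long> from terms to ordinals, then look a term up by ordinal.
// This mirrors what lookupOrd(..) does against the per-field FST.
public class OrdLookupSketch {
    public static void main(String[] args) throws Exception {
        PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
        Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
        IntsRefBuilder scratchInts = new IntsRefBuilder();
        String[] terms = { "apple", "banana", "cherry" }; // must be added in sorted order
        for (int ord = 0; ord < terms.length; ord++) {
            builder.add(Util.toIntsRef(new BytesRef(terms[ord]), scratchInts), (long) ord);
        }
        FST<Long> fst = builder.finish();
        // reverse lookup works because the outputs grow monotonically with the keys
        IntsRef path = Util.getByOutput(fst, 1);
        BytesRef term = Util.toBytesRef(path, new BytesRefBuilder());
        System.out.println(term.utf8ToString()); // prints "banana"
    }
}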
Use of org.apache.lucene.util.IntsRef in project lucene-solr by apache.
Class CompletionTokenStream, method incrementToken:
@Override
public boolean incrementToken() throws IOException {
clearAttributes();
if (finiteStrings == null) {
Automaton automaton = toAutomaton();
finiteStrings = new LimitedFiniteStringsIterator(automaton, maxGraphExpansions);
}
IntsRef string = finiteStrings.next();
if (string == null) {
return false;
}
// now we have UTF-8
Util.toBytesRef(string, bytesAtt.builder());
if (charTermAttribute != null) {
charTermAttribute.setLength(0);
charTermAttribute.append(bytesAtt.toUTF16());
}
if (payload != null) {
payloadAttr.setPayload(this.payload);
}
return true;
}
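The IntsRef returned by LimitedFiniteStringsIterator here holds unsigned byte values from a byte-level automaton, which is why Util.toBytesRef can copy it straight into the bytes attribute. A standalone sketch of that conversion; the input term and the expansion limit are hypothetical:
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.LimitedFiniteStringsIterator;
import org.apache.lucene.util.fst.Util;

// Sketch: enumerate the byte-level paths of an automaton and pack each IntsRef of
// unsigned byte values back into a BytesRef, as incrementToken() does above.
public class CompletionPathSketch {
    public static void main(String[] args) {
        Automaton automaton = Automata.makeBinary(new BytesRef("suggestion"));
        LimitedFiniteStringsIterator finiteStrings =
                new LimitedFiniteStringsIterator(automaton, /* maxGraphExpansions */ 10);
        BytesRefBuilder scratch = new BytesRefBuilder();
        for (IntsRef string; (string = finiteStrings.next()) != null; ) {
            System.out.println(Util.toBytesRef(string, scratch).utf8ToString());
        }
    }
}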