Use of org.apache.lucene.util.CharsRef in project lucene-solr by apache, from the class Stemmer, method uniqueStems:
/**
 * Find the unique stem(s) of the provided word
 *
 * @param word Word to find the stems for
 * @return List of stems for the word
 */
public List<CharsRef> uniqueStems(char[] word, int length) {
  List<CharsRef> stems = stem(word, length);
  if (stems.size() < 2) {
    return stems;
  }
  CharArraySet terms = new CharArraySet(8, dictionary.ignoreCase);
  List<CharsRef> deduped = new ArrayList<>();
  for (CharsRef s : stems) {
    if (!terms.contains(s)) {
      deduped.add(s);
      terms.add(s);
    }
  }
  return deduped;
}
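For context, a minimal usage sketch, not the project's own code: affStream/dictStream are assumed InputStreams over a Hunspell .aff/.dic pair, and newDirectory() is the LuceneTestCase helper used in the test below; the Dictionary and Stemmer constructors match the ones shown in the Test64kAffixes example that follows.

// Hypothetical wiring around uniqueStems(); names marked above are assumptions.
try (Directory scratch = newDirectory()) {
  Dictionary dictionary = new Dictionary(scratch, "hunspell", affStream, dictStream);
  Stemmer stemmer = new Stemmer(dictionary);
  char[] word = "apples".toCharArray();
  for (CharsRef stem : stemmer.uniqueStems(word, word.length)) {
    System.out.println(stem); // each distinct stem printed once, e.g. "apple"
  }
}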
Use of org.apache.lucene.util.CharsRef in project lucene-solr by apache, from the class Test64kAffixes, method test:
public void test() throws Exception {
  Path tempDir = createTempDir("64kaffixes");
  Path affix = tempDir.resolve("64kaffixes.aff");
  Path dict = tempDir.resolve("64kaffixes.dic");
  BufferedWriter affixWriter = Files.newBufferedWriter(affix, StandardCharsets.UTF_8);
  // 65k affixes with flag 1, then an affix with flag 2
  affixWriter.write("SET UTF-8\nFLAG num\nSFX 1 Y 65536\n");
  for (int i = 0; i < 65536; i++) {
    affixWriter.write("SFX 1 0 " + Integer.toHexString(i) + " .\n");
  }
  affixWriter.write("SFX 2 Y 1\nSFX 2 0 s\n");
  affixWriter.close();
  BufferedWriter dictWriter = Files.newBufferedWriter(dict, StandardCharsets.UTF_8);
  // drink signed with affix 2 (takes -s)
  dictWriter.write("1\ndrink/2\n");
  dictWriter.close();
  try (InputStream affStream = Files.newInputStream(affix);
      InputStream dictStream = Files.newInputStream(dict);
      Directory tempDir2 = newDirectory()) {
    Dictionary dictionary = new Dictionary(tempDir2, "dictionary", affStream, dictStream);
    Stemmer stemmer = new Stemmer(dictionary);
    // drinks should still stem to drink
    List<CharsRef> stems = stemmer.stem("drinks");
    assertEquals(1, stems.size());
    assertEquals("drink", stems.get(0).toString());
  }
}
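For reference, the affix file this test writes starts and ends as follows (the 65,536 numeric-flag rules in the middle are elided here); the point of the test is that the dictionary entry drink/2 must still pick up its -s suffix even though flag 2 is defined after more than 64k other rules:

SET UTF-8
FLAG num
SFX 1 Y 65536
SFX 1 0 0 .
SFX 1 0 1 .
...
SFX 1 0 ffff .
SFX 2 Y 1
SFX 2 0 s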
Use of org.apache.lucene.util.CharsRef in project lucene-solr by apache, from the class TestDictionary, method testReplacements:
public void testReplacements() throws Exception {
  Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
  Builder<CharsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs);
  IntsRefBuilder scratchInts = new IntsRefBuilder();
  // a -> b
  Util.toUTF16("a", scratchInts);
  builder.add(scratchInts.get(), new CharsRef("b"));
  // ab -> c
  Util.toUTF16("ab", scratchInts);
  builder.add(scratchInts.get(), new CharsRef("c"));
  // c -> de
  Util.toUTF16("c", scratchInts);
  builder.add(scratchInts.get(), new CharsRef("de"));
  // def -> gh
  Util.toUTF16("def", scratchInts);
  builder.add(scratchInts.get(), new CharsRef("gh"));
  FST<CharsRef> fst = builder.finish();
  StringBuilder sb = new StringBuilder("atestanother");
  Dictionary.applyMappings(fst, sb);
  assertEquals("btestbnother", sb.toString());
  sb = new StringBuilder("abtestanother");
  Dictionary.applyMappings(fst, sb);
  assertEquals("ctestbnother", sb.toString());
  sb = new StringBuilder("atestabnother");
  Dictionary.applyMappings(fst, sb);
  assertEquals("btestcnother", sb.toString());
  sb = new StringBuilder("abtestabnother");
  Dictionary.applyMappings(fst, sb);
  assertEquals("ctestcnother", sb.toString());
  sb = new StringBuilder("abtestabcnother");
  Dictionary.applyMappings(fst, sb);
  assertEquals("ctestcdenother", sb.toString());
  sb = new StringBuilder("defdefdefc");
  Dictionary.applyMappings(fst, sb);
  assertEquals("ghghghde", sb.toString());
}
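Note that the "abtestabcnother" case shows the longest matching input winning ("ab" -> "c" is applied rather than "a" -> "b"). A minimal sketch of the same API for a single conversion is below; it assumes the caller sits in the org.apache.lucene.analysis.hunspell package like the test above, since Dictionary.applyMappings is not part of the public API, and the "ph" -> "f" mapping is purely illustrative.

Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
Builder<CharsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs);
IntsRefBuilder scratch = new IntsRefBuilder();
// Single mapping: replace "ph" with "f" (illustrative only).
Util.toUTF16("ph", scratch);
builder.add(scratch.get(), new CharsRef("f"));
FST<CharsRef> fst = builder.finish();
StringBuilder sb = new StringBuilder("photograph");
Dictionary.applyMappings(fst, sb);
// sb now reads "fotograf"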
Use of org.apache.lucene.util.CharsRef in project elasticsearch by elastic, from the class DirectCandidateGenerator, method analyze:
public static int analyze(Analyzer analyzer, BytesRef toAnalyze, String field, TokenConsumer consumer,
    CharsRefBuilder spare) throws IOException {
  spare.copyUTF8Bytes(toAnalyze);
  CharsRef charsRef = spare.get();
  try (TokenStream ts = analyzer.tokenStream(
      field, new FastCharArrayReader(charsRef.chars, charsRef.offset, charsRef.length))) {
    return analyze(ts, consumer);
  }
}
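A hypothetical call site is sketched below; the anonymous TokenConsumer and its charTermAttr field reflect the usual shape of DirectCandidateGenerator.TokenConsumer, but treat the exact contract as an assumption, as are the analyzer, field name, and input text.

CharsRefBuilder spare = new CharsRefBuilder();
int tokenCount = DirectCandidateGenerator.analyze(analyzer, new BytesRef("quick brwn fox"), "body",
    new DirectCandidateGenerator.TokenConsumer() {
      @Override
      public void nextToken() {
        // invoked once per emitted token; charTermAttr holds the current term text (assumption)
        System.out.println(charTermAttr.toString());
      }
    }, spare);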
Use of org.apache.lucene.util.CharsRef in project lucene-solr by apache, from the class SynonymFilter, method incrementToken:
@Override
public boolean incrementToken() throws IOException {
  while (true) {
    // First play back any buffered future inputs/outputs
    // w/o running parsing again:
    while (inputSkipCount != 0) {
      // At each position, we first output the original
      // token
      // TODO: maybe just a PendingState class, holding
      // both input & outputs?
      final PendingInput input = futureInputs[nextRead];
      final PendingOutputs outputs = futureOutputs[nextRead];
      if (!input.consumed && (input.keepOrig || !input.matched)) {
        if (input.state != null) {
          // Return a previously saved token (because we
          // had to lookahead):
          restoreState(input.state);
        } else {
          // Pass-through case: return token we just pulled
          // but didn't capture:
          assert inputSkipCount == 1 : "inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead;
        }
        input.reset();
        if (outputs.count > 0) {
          outputs.posIncr = 0;
        } else {
          nextRead = rollIncr(nextRead);
          inputSkipCount--;
        }
        //System.out.println("  return token=" + termAtt.toString());
        return true;
      } else if (outputs.upto < outputs.count) {
        // Still have pending outputs to replay at this
        // position
        input.reset();
        final int posIncr = outputs.posIncr;
        final CharsRef output = outputs.pullNext();
        clearAttributes();
        termAtt.copyBuffer(output.chars, output.offset, output.length);
        typeAtt.setType(TYPE_SYNONYM);
        int endOffset = outputs.getLastEndOffset();
        if (endOffset == -1) {
          endOffset = input.endOffset;
        }
        offsetAtt.setOffset(input.startOffset, endOffset);
        posIncrAtt.setPositionIncrement(posIncr);
        posLenAtt.setPositionLength(outputs.getLastPosLength());
        if (outputs.count == 0) {
          // Done with the buffered input and all outputs at
          // this position
          nextRead = rollIncr(nextRead);
          inputSkipCount--;
        }
        //System.out.println("  return token=" + termAtt.toString());
        return true;
      } else {
        // Done with the buffered input and all outputs at
        // this position
        input.reset();
        nextRead = rollIncr(nextRead);
        inputSkipCount--;
      }
    }
    if (finished && nextRead == nextWrite) {
      // End case: if any output syns went beyond end of
      // input stream, enumerate them now:
      final PendingOutputs outputs = futureOutputs[nextRead];
      if (outputs.upto < outputs.count) {
        final int posIncr = outputs.posIncr;
        final CharsRef output = outputs.pullNext();
        futureInputs[nextRead].reset();
        if (outputs.count == 0) {
          nextWrite = nextRead = rollIncr(nextRead);
        }
        clearAttributes();
        // Keep offset from last input token:
        offsetAtt.setOffset(lastStartOffset, lastEndOffset);
        termAtt.copyBuffer(output.chars, output.offset, output.length);
        typeAtt.setType(TYPE_SYNONYM);
        //System.out.println("  set posIncr=" + outputs.posIncr + " outputs=" + outputs);
        posIncrAtt.setPositionIncrement(posIncr);
        //System.out.println("  return token=" + termAtt.toString());
        return true;
      } else {
        return false;
      }
    }
    // Find new synonym matches:
    parse();
  }
}
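For orientation, a minimal sketch of how this filter is typically wired into an analysis chain follows; the single-word synonym rule and the WhitespaceTokenizer choice are illustrative assumptions, not part of the snippet above.

SynonymMap.Builder mapBuilder = new SynonymMap.Builder(true); // true = dedup identical rules
mapBuilder.add(new CharsRef("teh"), new CharsRef("the"), true); // keepOrig = true
SynonymMap synonyms = mapBuilder.build();

Analyzer analyzer = new Analyzer() {
  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer tokenizer = new WhitespaceTokenizer();
    // Each matched input token is rewritten/expanded per the map; matches get type TYPE_SYNONYM.
    TokenStream stream = new SynonymFilter(tokenizer, synonyms, true); // ignoreCase = true
    return new TokenStreamComponents(tokenizer, stream);
  }
};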