Search in sources :

Example 6 with CharsRef

use of org.apache.lucene.util.CharsRef in project lucene-solr by apache.

the class Stemmer method uniqueStems.

/**
 * Stems the given word and returns the resulting stems with duplicates removed.
 *
 * <p>Stems are produced by {@code stem(word, length)}. When fewer than two stems
 * come back, that list is returned as-is. Otherwise each stem is kept the first
 * time it is seen, in the order it was produced; the "seen" set is created with
 * {@code dictionary.ignoreCase} as its case-insensitivity flag.
 *
 * @param word buffer holding the word to find the stems for
 * @param length value passed through to {@code stem} together with {@code word}
 *        (presumably the number of valid chars in {@code word} - confirm against {@code stem})
 * @return list of distinct stems for the word
 */
public List<CharsRef> uniqueStems(char[] word, int length) {
    final List<CharsRef> candidates = stem(word, length);
    // Zero or one stem cannot contain a duplicate.
    if (candidates.size() <= 1) {
        return candidates;
    }
    final CharArraySet seen = new CharArraySet(8, dictionary.ignoreCase);
    final List<CharsRef> unique = new ArrayList<>();
    for (CharsRef candidate : candidates) {
        // add() reports whether the entry was new, so one call both tests and records it.
        if (seen.add(candidate)) {
            unique.add(candidate);
        }
    }
    return unique;
}
Also used : CharArraySet(org.apache.lucene.analysis.CharArraySet) ArrayList(java.util.ArrayList) CharsRef(org.apache.lucene.util.CharsRef)

Example 7 with CharsRef

use of org.apache.lucene.util.CharsRef in project lucene-solr by apache.

the class Test64kAffixes method test.

/**
 * Writes an affix file declaring 65536 suffix rules under flag 1 plus a single
 * "-s" suffix rule under flag 2, and a one-word dictionary ("drink" with flag 2),
 * then checks that "drinks" stems to exactly one stem, "drink".
 *
 * @throws Exception if the temporary files cannot be written or read, or the
 *         dictionary cannot be built
 */
public void test() throws Exception {
    Path tempDir = createTempDir("64kaffixes");
    Path affix = tempDir.resolve("64kaffixes.aff");
    Path dict = tempDir.resolve("64kaffixes.dic");
    // try-with-resources: the writer is closed even if one of the writes throws
    try (BufferedWriter affixWriter = Files.newBufferedWriter(affix, StandardCharsets.UTF_8)) {
        // 65536 affixes with flag 1, then an affix with flag 2
        affixWriter.write("SET UTF-8\nFLAG num\nSFX 1 Y 65536\n");
        for (int i = 0; i < 65536; i++) {
            affixWriter.write("SFX 1 0 " + Integer.toHexString(i) + " .\n");
        }
        affixWriter.write("SFX 2 Y 1\nSFX 2 0 s\n");
    }
    try (BufferedWriter dictWriter = Files.newBufferedWriter(dict, StandardCharsets.UTF_8)) {
        // drink signed with affix 2 (takes -s)
        dictWriter.write("1\ndrink/2\n");
    }
    try (InputStream affStream = Files.newInputStream(affix);
        InputStream dictStream = Files.newInputStream(dict);
        Directory tempDir2 = newDirectory()) {
        Dictionary dictionary = new Dictionary(tempDir2, "dictionary", affStream, dictStream);
        Stemmer stemmer = new Stemmer(dictionary);
        // drinks should still stem to drink
        List<CharsRef> stems = stemmer.stem("drinks");
        assertEquals(1, stems.size());
        assertEquals("drink", stems.get(0).toString());
    }
}
Also used : Path(java.nio.file.Path) InputStream(java.io.InputStream) CharsRef(org.apache.lucene.util.CharsRef) BufferedWriter(java.io.BufferedWriter) Directory(org.apache.lucene.store.Directory)

Example 8 with CharsRef

use of org.apache.lucene.util.CharsRef in project lucene-solr by apache.

the class TestDictionary method testReplacements.

/**
 * Builds an FST holding four replacement rules (a -> b, ab -> c, c -> de,
 * def -> gh) and checks the result of {@code Dictionary.applyMappings} on a
 * series of inputs.
 */
public void testReplacements() throws Exception {
    // Each row is {input, replacement}; rows are added to the builder in this order.
    final String[][] rules = {
        { "a", "b" },
        { "ab", "c" },
        { "c", "de" },
        { "def", "gh" }
    };
    // Each row is {text before mapping, expected text after mapping}.
    final String[][] cases = {
        { "atestanother", "btestbnother" },
        { "abtestanother", "ctestbnother" },
        { "atestabnother", "btestcnother" },
        { "abtestabnother", "ctestcnother" },
        { "abtestabcnother", "ctestcdenother" },
        { "defdefdefc", "ghghghde" }
    };

    Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
    Builder<CharsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs);
    IntsRefBuilder scratchInts = new IntsRefBuilder();
    for (String[] rule : rules) {
        Util.toUTF16(rule[0], scratchInts);
        builder.add(scratchInts.get(), new CharsRef(rule[1]));
    }
    FST<CharsRef> fst = builder.finish();

    for (String[] testCase : cases) {
        // applyMappings rewrites the StringBuilder in place.
        StringBuilder sb = new StringBuilder(testCase[0]);
        Dictionary.applyMappings(fst, sb);
        assertEquals(testCase[1], sb.toString());
    }
}
Also used : IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) Builder(org.apache.lucene.util.fst.Builder) CharsRef(org.apache.lucene.util.CharsRef)

Example 9 with CharsRef

use of org.apache.lucene.util.CharsRef in project elasticsearch by elastic.

the class DirectCandidateGenerator method analyze.

/**
 * Decodes the UTF-8 bytes of {@code toAnalyze} into {@code spare}, opens a token
 * stream over the decoded chars for {@code field}, and delegates to
 * {@code analyze(TokenStream, TokenConsumer)}.
 *
 * @param analyzer analyzer used to create the token stream
 * @param toAnalyze UTF-8 encoded text to analyze
 * @param field field name handed to {@code analyzer.tokenStream}
 * @param consumer consumer handed to the delegate {@code analyze} call
 * @param spare scratch builder that is overwritten with the decoded chars
 * @return whatever the delegate {@code analyze} call returns
 * @throws IOException propagated from the token stream or the delegate call
 */
public static int analyze(Analyzer analyzer, BytesRef toAnalyze, String field, TokenConsumer consumer, CharsRefBuilder spare) throws IOException {
    spare.copyUTF8Bytes(toAnalyze);
    final CharsRef decoded = spare.get();
    // Read straight from the builder's backing array.
    final FastCharArrayReader reader = new FastCharArrayReader(decoded.chars, decoded.offset, decoded.length);
    try (TokenStream stream = analyzer.tokenStream(field, reader)) {
        return analyze(stream, consumer);
    }
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) FastCharArrayReader(org.elasticsearch.common.io.FastCharArrayReader) CharsRef(org.apache.lucene.util.CharsRef)

Example 10 with CharsRef

use of org.apache.lucene.util.CharsRef in project lucene-solr by apache.

the class SynonymFilter method incrementToken.

/**
 * Produces the next token: either an original input token or a synonym output.
 *
 * <p>Each pass of the outer loop does three things in order:
 * <ol>
 *   <li>while {@code inputSkipCount != 0}, replays the buffered entry at
 *       {@code nextRead} - first the original token (if it is to be kept or was
 *       not matched), then any pending outputs for that position;</li>
 *   <li>once {@code finished} is set and {@code nextRead == nextWrite}, emits any
 *       outputs still pending at that slot, or returns {@code false};</li>
 *   <li>otherwise calls {@code parse()} to look for new matches and loops again.</li>
 * </ol>
 *
 * @return {@code true} if a token was produced, {@code false} when the end case
 *         finds no pending outputs
 * @throws IOException declared by the overridden method
 */
@Override
public boolean incrementToken() throws IOException {
    while (true) {
        // First play back any buffered future inputs/outputs
        // w/o running parsing again:
        while (inputSkipCount != 0) {
            // At each position, we first output the original
            // token
            // TODO: maybe just a PendingState class, holding
            // both input & outputs?
            final PendingInput input = futureInputs[nextRead];
            final PendingOutputs outputs = futureOutputs[nextRead];
            if (!input.consumed && (input.keepOrig || !input.matched)) {
                if (input.state != null) {
                    // Return a previously saved token (because we
                    // had to lookahead):
                    restoreState(input.state);
                } else {
                    // Pass-through case: no state was saved for this input,
                    // so return the token we just pulled
                    // but didn't capture:
                    assert inputSkipCount == 1 : "inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead;
                }
                input.reset();
                if (outputs.count > 0) {
                    // Outputs remain for this position; they are emitted on later
                    // calls with a position increment of 0.
                    outputs.posIncr = 0;
                } else {
                    // Nothing else buffered at this position: move to the next slot.
                    nextRead = rollIncr(nextRead);
                    inputSkipCount--;
                }
                //System.out.println("  return token=" + termAtt.toString());
                return true;
            } else if (outputs.upto < outputs.count) {
                // Still have pending outputs to replay at this
                // position
                input.reset();
                // Read posIncr before pullNext() (presumably pullNext may update it - confirm).
                final int posIncr = outputs.posIncr;
                final CharsRef output = outputs.pullNext();
                clearAttributes();
                termAtt.copyBuffer(output.chars, output.offset, output.length);
                typeAtt.setType(TYPE_SYNONYM);
                int endOffset = outputs.getLastEndOffset();
                // -1 means no end offset was recorded for this output; use the input's.
                if (endOffset == -1) {
                    endOffset = input.endOffset;
                }
                offsetAtt.setOffset(input.startOffset, endOffset);
                posIncrAtt.setPositionIncrement(posIncr);
                posLenAtt.setPositionLength(outputs.getLastPosLength());
                if (outputs.count == 0) {
                    // Done with the buffered input and all outputs at
                    // this position
                    nextRead = rollIncr(nextRead);
                    inputSkipCount--;
                }
                //System.out.println("  return token=" + termAtt.toString());
                return true;
            } else {
                // Done with the buffered input and all outputs at
                // this position
                input.reset();
                nextRead = rollIncr(nextRead);
                inputSkipCount--;
            }
        }
        if (finished && nextRead == nextWrite) {
            // End case: if any output syns went beyond end of
            // input stream, enumerate them now:
            final PendingOutputs outputs = futureOutputs[nextRead];
            if (outputs.upto < outputs.count) {
                final int posIncr = outputs.posIncr;
                final CharsRef output = outputs.pullNext();
                futureInputs[nextRead].reset();
                if (outputs.count == 0) {
                    // No outputs left at this slot: advance read and write together.
                    nextWrite = nextRead = rollIncr(nextRead);
                }
                clearAttributes();
                // Keep offset from last input token:
                offsetAtt.setOffset(lastStartOffset, lastEndOffset);
                termAtt.copyBuffer(output.chars, output.offset, output.length);
                typeAtt.setType(TYPE_SYNONYM);
                //System.out.println("  set posIncr=" + outputs.posIncr + " outputs=" + outputs);
                posIncrAtt.setPositionIncrement(posIncr);
                //System.out.println("  return token=" + termAtt.toString());
                return true;
            } else {
                // Nothing pending at the end of input.
                return false;
            }
        }
        // Find new synonym matches:
        parse();
    }
}
Also used : CharsRef(org.apache.lucene.util.CharsRef)

Aggregations

CharsRef (org.apache.lucene.util.CharsRef): 27, BytesRef (org.apache.lucene.util.BytesRef): 8, ArrayList (java.util.ArrayList): 6, CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder): 6, IntsRefBuilder (org.apache.lucene.util.IntsRefBuilder): 6, StringReader (java.io.StringReader): 4, MockTokenizer (org.apache.lucene.analysis.MockTokenizer): 4, SynonymFilter (org.apache.lucene.analysis.synonym.SynonymFilter): 4, SynonymMap (org.apache.lucene.analysis.synonym.SynonymMap): 4, HashMap (java.util.HashMap): 3, TokenStream (org.apache.lucene.analysis.TokenStream): 3, IntsRef (org.apache.lucene.util.IntsRef): 3, Test (org.junit.Test): 3, ParseException (java.text.ParseException): 2, HashSet (java.util.HashSet): 2, Map (java.util.Map): 2, ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap): 2, Analyzer (org.apache.lucene.analysis.Analyzer): 2, MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer): 2, Tokenizer (org.apache.lucene.analysis.Tokenizer): 2