Search in sources :

Example 11 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in the project ansj_seg by NLPchina.

From the class AppTest, method parse:

private static void parse(Analysis an, List<StopRecognition> filters) throws IOException {
    /*
     * Tokenizes using the given analysis mode and stop-word filters, printing
     * each emitted term to stdout.
     *
     * Fixes over the original:
     *  - try-with-resources closes the tokenizer (it was leaked before);
     *  - reset() is called before consuming, and end() after, as required by
     *    the Lucene TokenStream contract (modern Lucene throws an
     *    IllegalStateException if incrementToken() is called without reset());
     *  - the attribute is fetched once up front — attribute instances are
     *    stable across increments, so per-iteration getAttribute() was wasted work.
     */
    try (Tokenizer tokenizer = new AnsjTokenizer(an, filters, null)) {
        // addAttribute (not getAttribute) creates the attribute if absent
        // instead of throwing.
        CharTermAttribute attribute = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            // The attribute reflects the current token on each increment.
            System.out.println(attribute);
        }
        tokenizer.end();
    }
}
Also used : CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) AnsjTokenizer(org.ansj.lucene.util.AnsjTokenizer) Tokenizer(org.apache.lucene.analysis.Tokenizer) AnsjTokenizer(org.ansj.lucene.util.AnsjTokenizer)

Example 12 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in the project OpenGrok by OpenGrok.

From the class Summarizer, method getTokens:

private SToken[] getTokens(String text) throws IOException {
    // Tokenizes the given text with the "full" field analyzer and wraps each
    // emitted term (with its character offsets) in an SToken.
    // FIXME: somehow integrate the cycle below into getSummary to save the
    // cloning and memory; also creating Tokens is suboptimal with 3.0.0 —
    // this whole class could be replaced by a highlighter.
    List<SToken> tokens = new ArrayList<>();
    try (TokenStream stream = analyzer.tokenStream("full", text)) {
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            tokens.add(new SToken(termAtt.buffer(), 0, termAtt.length(),
                    offsetAtt.startOffset(), offsetAtt.endOffset()));
        }
        stream.end();
    }
    return tokens.toArray(new SToken[0]);
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) ArrayList(java.util.ArrayList) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute)

Example 13 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in the project OpenGrok by OpenGrok.

From the class JavaSymbolTokenizerTest, method getTermsFor:

private String[] getTermsFor(Reader r) {
    // Runs the symbol tokenizer over the given source and collects every term
    // it emits, in order.
    List<String> terms = new LinkedList<>();
    JFlexTokenizer tokenizer = (JFlexTokenizer) this.analyzer.tokenStream("refs", r);
    tokenizer.setReader(r);
    tokenizer.yyreset(r);
    CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
    try {
        while (tokenizer.yylex()) {
            terms.add(termAtt.toString());
        }
    } catch (IOException ex) {
        // Test helper: surface I/O failures as unchecked so callers need no throws clause.
        throw new RuntimeException(ex);
    }
    return terms.toArray(new String[0]);
}
Also used : JFlexTokenizer(org.opensolaris.opengrok.analysis.JFlexTokenizer) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) IOException(java.io.IOException) LinkedList(java.util.LinkedList)

Example 14 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in the project OpenGrok by OpenGrok.

From the class PhpSymbolTokenizerTest, method getTermsFor:

private String[] getTermsFor(Reader r) {
    // Drives the JFlex-generated lexer directly (yyreset/yylex) and records
    // the text of each token it produces.
    List<String> collected = new LinkedList<>();
    JFlexTokenizer lexer = (JFlexTokenizer) this.analyzer.tokenStream("refs", r);
    lexer.setReader(r);
    lexer.yyreset(r);
    CharTermAttribute term = lexer.addAttribute(CharTermAttribute.class);
    try {
        while (lexer.yylex()) {
            collected.add(term.toString());
        }
    } catch (IOException ex) {
        // Rethrow unchecked so this helper can be used without a throws clause.
        throw new RuntimeException(ex);
    }
    return collected.toArray(new String[0]);
}
Also used : JFlexTokenizer(org.opensolaris.opengrok.analysis.JFlexTokenizer) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) IOException(java.io.IOException) LinkedList(java.util.LinkedList)

Example 15 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in the project OpenGrok by OpenGrok.

From the class JFlexTokenizerTest, method truncatedUuencodedFile:

/**
 * Truncated uuencoded files used to cause infinite loops. Verify that they
 * work now.
 *
 * @throws java.io.IOException
 */
@Test
public void truncatedUuencodedFile() throws IOException {
    UuencodeFullTokenizer tokenizer = new UuencodeFullTokenizer(new StringReader("begin 644 test\n"));
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    // The truncated header should yield exactly these three tokens...
    for (String expected : new String[] { "begin", "644", "test" }) {
        assertTrue(tokenizer.incrementToken());
        assertEquals(expected, term.toString());
    }
    // ...and then the stream must terminate; this call used to hang forever.
    assertFalse(tokenizer.incrementToken());
}
Also used : CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) UuencodeFullTokenizer(org.opensolaris.opengrok.analysis.uue.UuencodeFullTokenizer) StringReader(java.io.StringReader) Test(org.junit.Test)

Aggregations

CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute)144 TokenStream (org.apache.lucene.analysis.TokenStream)88 StringReader (java.io.StringReader)42 OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute)33 PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)33 ArrayList (java.util.ArrayList)26 Tokenizer (org.apache.lucene.analysis.Tokenizer)25 IOException (java.io.IOException)22 Analyzer (org.apache.lucene.analysis.Analyzer)18 PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute)16 BytesRef (org.apache.lucene.util.BytesRef)15 TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute)13 LinkedList (java.util.LinkedList)11 FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute)10 Term (org.apache.lucene.index.Term)10 HashMap (java.util.HashMap)8 Token (org.apache.lucene.analysis.Token)8 Document (org.apache.lucene.document.Document)8 List (java.util.List)7 HashSet (java.util.HashSet)6