use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project ansj_seg by NLPchina.
Example from the class AppTest, method parse().
/**
 * Tokenizes the given analysis with the supplied stop-word filters and
 * prints each emitted term to standard output.
 *
 * @param an      the ansj analysis backing the tokenizer
 * @param filters stop recognitions used to drop unwanted terms
 * @throws IOException if the underlying tokenizer fails while reading
 */
private static void parse(Analysis an, List<StopRecognition> filters) throws IOException {
    // try-with-resources closes the tokenizer and releases its reader.
    try (Tokenizer tokenizer = new AnsjTokenizer(an, filters, null)) {
        // Fetch the attribute once up front; Lucene reuses the same
        // attribute instance for every token of the stream.
        CharTermAttribute attribute = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset(); // Lucene contract: reset() must precede incrementToken()
        while (tokenizer.incrementToken()) {
            System.out.println(attribute);
        }
        tokenizer.end(); // contract: end() after the last incrementToken()
    }
}
use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project OpenGrok by OpenGrok.
Example from the class Summarizer, method getTokens().
/**
 * Runs the analyzer over {@code text} and collects every emitted token,
 * together with its character offsets, as an array of {@code SToken}s.
 *
 * @param text the raw text to tokenize with the "full" analysis chain
 * @return the tokens produced, in stream order
 * @throws IOException if the token stream cannot be read
 */
private SToken[] getTokens(String text) throws IOException {
    //FIXME somehow integrate below cycle to getSummary to save the cloning and memory,
    //also creating Tokens is suboptimal with 3.0.0 , this whole class could be replaced by highlighter
    ArrayList<SToken> tokens = new ArrayList<>();
    try (TokenStream stream = analyzer.tokenStream("full", text)) {
        // Lucene hands back one shared attribute instance per stream;
        // read it afresh after each incrementToken().
        CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttr = stream.addAttribute(OffsetAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            tokens.add(new SToken(termAttr.buffer(), 0, termAttr.length(),
                    offsetAttr.startOffset(), offsetAttr.endOffset()));
        }
        stream.end();
    }
    return tokens.toArray(new SToken[tokens.size()]);
}
use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project OpenGrok by OpenGrok.
Example from the class JavaSymbolTokenizerTest, method getTermsFor().
/**
 * Tokenizes everything readable from {@code r} with the "refs" analysis
 * chain and returns the resulting terms in order.
 *
 * @param r the character source to tokenize
 * @return all terms produced by the symbol tokenizer
 * @throws RuntimeException wrapping any IOException from the tokenizer
 */
private String[] getTermsFor(Reader r) {
    List<String> l = new LinkedList<>();
    // TokenStream is Closeable; try-with-resources prevents leaking the
    // stream (the original never closed it).
    try (JFlexTokenizer ts = (JFlexTokenizer) this.analyzer.tokenStream("refs", r)) {
        ts.setReader(r);
        ts.yyreset(r);
        // One shared attribute instance is reused for every token.
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        while (ts.yylex()) {
            l.add(term.toString());
        }
    } catch (IOException ex) {
        throw new RuntimeException(ex);
    }
    // Zero-length array is the idiomatic (and JIT-friendly) toArray form.
    return l.toArray(new String[0]);
}
use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project OpenGrok by OpenGrok.
Example from the class PhpSymbolTokenizerTest, method getTermsFor().
/**
 * Tokenizes everything readable from {@code r} with the "refs" analysis
 * chain and returns the resulting terms in order.
 *
 * @param r the character source to tokenize
 * @return all terms produced by the symbol tokenizer
 * @throws RuntimeException wrapping any IOException from the tokenizer
 */
private String[] getTermsFor(Reader r) {
    List<String> l = new LinkedList<>();
    // TokenStream is Closeable; try-with-resources prevents leaking the
    // stream (the original never closed it).
    try (JFlexTokenizer ts = (JFlexTokenizer) this.analyzer.tokenStream("refs", r)) {
        ts.setReader(r);
        ts.yyreset(r);
        // One shared attribute instance is reused for every token.
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        while (ts.yylex()) {
            l.add(term.toString());
        }
    } catch (IOException ex) {
        throw new RuntimeException(ex);
    }
    // Zero-length array is the idiomatic (and JIT-friendly) toArray form.
    return l.toArray(new String[0]);
}
use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project OpenGrok by OpenGrok.
Example from the class JFlexTokenizerTest, method truncatedUuencodedFile().
/**
 * Truncated uuencoded files used to cause infinite loops. Verify that they
 * work now.
 *
 * @throws java.io.IOException
 */
@Test
public void truncatedUuencodedFile() throws IOException {
    UuencodeFullTokenizer tokenizer =
            new UuencodeFullTokenizer(new StringReader("begin 644 test\n"));
    // One shared attribute instance is reused for every token.
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    // The truncated header should yield exactly these three tokens, in order.
    for (String expected : new String[] {"begin", "644", "test"}) {
        assertTrue(tokenizer.incrementToken());
        assertEquals(expected, term.toString());
    }
    // This call used to hang forever.
    assertFalse(tokenizer.incrementToken());
}
Aggregations