use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project textdb by TextDB.
the class DataflowUtils method tokenizeQueryWithStopwords.
public static ArrayList<String> tokenizeQueryWithStopwords(String query) {
    ArrayList<String> result = new ArrayList<String>();
    // An empty stop-word set keeps stopwords in the token stream.
    CharArraySet emptyStopwords = new CharArraySet(1, true);
    Analyzer luceneAnalyzer = new StandardAnalyzer(emptyStopwords);
    TokenStream tokenStream = luceneAnalyzer.tokenStream(null, new StringReader(query));
    CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class);
    try {
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String token = term.toString();
            int tokenIndex = query.toLowerCase().indexOf(token);
            // Since tokens are converted to lower case,
            // get the exact (original-case) token from the query string.
            String actualQueryToken = query.substring(tokenIndex, tokenIndex + token.length());
            result.add(actualQueryToken);
        }
        tokenStream.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
    luceneAnalyzer.close();
    return result;
}
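A minimal usage sketch (hedged: the query string and the expected output are illustrative, and DataflowUtils is assumed to be on the classpath):
public static void main(String[] args) {
    // Assumes the textdb DataflowUtils class is importable; the query text is made up for illustration.
    ArrayList<String> tokens = DataflowUtils.tokenizeQueryWithStopwords("The New York Times");
    // StandardAnalyzer lower-cases each token; the substring lookup restores the
    // original casing, so this is expected to print [The, New, York, Times].
    System.out.println(tokens);
}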
use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project textdb by TextDB.
the class DataflowUtils method generatePayload.
public static List<Span> generatePayload(String attributeName, String fieldValue, Analyzer luceneAnalyzer) {
    List<Span> payload = new ArrayList<>();
    try {
        TokenStream tokenStream = luceneAnalyzer.tokenStream(null, new StringReader(fieldValue));
        OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute positionIncrementAttribute = tokenStream.addAttribute(PositionIncrementAttribute.class);
        int tokenPositionCounter = -1;
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            // Position increments account for tokens the analyzer removed (e.g. stopwords).
            tokenPositionCounter += positionIncrementAttribute.getPositionIncrement();
            int tokenPosition = tokenPositionCounter;
            int charStart = offsetAttribute.startOffset();
            int charEnd = offsetAttribute.endOffset();
            String analyzedTermStr = charTermAttribute.toString();
            String originalTermStr = fieldValue.substring(charStart, charEnd);
            payload.add(new Span(attributeName, charStart, charEnd, analyzedTermStr, originalTermStr, tokenPosition));
        }
        tokenStream.close();
    } catch (IOException e) {
        // return empty payload
        payload.clear();
    }
    return payload;
}
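Span is textdb-specific, so the standalone sketch below reproduces the same attribute-driven loop with stock Lucene classes only and prints the values generatePayload would store in each Span; the method name and sample text are assumptions for illustration.
public static void printPayload(String fieldValue) throws IOException {
    // Requires the same Lucene imports as generatePayload, plus StandardAnalyzer.
    try (Analyzer analyzer = new StandardAnalyzer();
         TokenStream ts = analyzer.tokenStream(null, new StringReader(fieldValue))) {
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        PositionIncrementAttribute posInc = ts.addAttribute(PositionIncrementAttribute.class);
        int tokenPosition = -1;
        ts.reset();
        while (ts.incrementToken()) {
            tokenPosition += posInc.getPositionIncrement();
            // The original (unanalyzed) text is recovered from the field value via the offsets,
            // exactly as generatePayload does before constructing a Span.
            String original = fieldValue.substring(offset.startOffset(), offset.endOffset());
            System.out.println(term.toString() + " (" + original + ") ["
                    + offset.startOffset() + "," + offset.endOffset() + ") pos=" + tokenPosition);
        }
        ts.end();
    }
}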
use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project zm-mailbox by Zimbra.
the class ZimbraAnalyzerTest method toTokens.
public static List<String> toTokens(TokenStream stream) throws IOException {
    List<String> result = new ArrayList<String>();
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        result.add(termAttr.toString());
    }
    stream.end();
    return result;
}
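A hedged usage sketch (a plain StandardAnalyzer stands in for the Zimbra analyzers, and the input text is made up): the helper drains any TokenStream into a list of term strings; it calls end() but not close(), so the caller closes the stream.
public static void main(String[] args) throws IOException {
    try (Analyzer analyzer = new StandardAnalyzer();
         TokenStream stream = analyzer.tokenStream(null, new StringReader("Hello Zimbra world"))) {
        // toTokens() resets and drains the stream and calls end();
        // close() is left to the caller and happens via try-with-resources here.
        List<String> tokens = ZimbraAnalyzerTest.toTokens(stream);
        System.out.println(tokens);  // e.g. [hello, zimbra, world]
    }
}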
use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project zm-mailbox by Zimbra.
the class UniversalAnalyzerTest method testCJK.
private void testCJK(String src) throws IOException {
    TokenStream cjk = cjkAnalyzer.tokenStream(null, new StringReader(src));
    CharTermAttribute cjkTermAttr = cjk.addAttribute(CharTermAttribute.class);
    OffsetAttribute cjkOffsetAttr = cjk.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute cjkPosIncAttr = cjk.addAttribute(PositionIncrementAttribute.class);
    TokenStream uni = universalAnalyzer.tokenStream(null, new StringReader(src));
    CharTermAttribute uniTermAttr = uni.addAttribute(CharTermAttribute.class);
    OffsetAttribute uniOffsetAttr = uni.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute uniPosIncAttr = uni.addAttribute(PositionIncrementAttribute.class);
    while (true) {
        boolean result = cjk.incrementToken();
        Assert.assertEquals(result, uni.incrementToken());
        if (!result) {
            break;
        }
        String term = cjkTermAttr.toString();
        Assert.assertEquals(cjkTermAttr, uniTermAttr);
        if (assertOffset) {
            Assert.assertEquals(term, cjkOffsetAttr, uniOffsetAttr);
        }
        Assert.assertEquals(term, cjkPosIncAttr, uniPosIncAttr);
    }
}
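UniversalAnalyzer and the fixture fields (cjkAnalyzer, universalAnalyzer, assertOffset) are Zimbra-specific, so as a hedged, standalone variant of the same technique, the helper below walks two arbitrary TokenStreams in lockstep and asserts that they emit identical terms.
static void assertSameTerms(TokenStream expected, TokenStream actual) throws IOException {
    CharTermAttribute expTerm = expected.addAttribute(CharTermAttribute.class);
    CharTermAttribute actTerm = actual.addAttribute(CharTermAttribute.class);
    // Unlike the test above, reset() is called explicitly, which newer Lucene versions require.
    expected.reset();
    actual.reset();
    while (true) {
        boolean hasNext = expected.incrementToken();
        Assert.assertEquals(hasNext, actual.incrementToken());
        if (!hasNext) {
            break;
        }
        // Compare term text; attribute instances could also be compared directly
        // via equals(), as the Zimbra test does for offsets and position increments.
        Assert.assertEquals(expTerm.toString(), actTerm.toString());
    }
    expected.end();
    actual.end();
}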
use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project sukija by ahomansikka.
the class KeepFilterTester method test.
public static void test(Reader reader, Writer writer, Voikko voikko, CharArraySet wordSet, String from, String to, Suggestion[] suggestion, boolean stopOnSuccess) throws IOException {
    Set<String> set = new TreeSet<String>();
    TokenStream t = new HVTokenizer();
    ((Tokenizer) t).setReader(reader);
    t = new KeepFilter(t, voikko, wordSet, from, to, suggestion);
    CharTermAttribute termAtt = t.addAttribute(CharTermAttribute.class);
    BaseFormAttribute baseFormAtt = t.addAttribute(BaseFormAttribute.class);
    FlagsAttribute flagsAtt = t.addAttribute(FlagsAttribute.class);
    OriginalWordAttribute originalWordAtt = t.addAttribute(OriginalWordAttribute.class);
    try {
        t.reset();
        while (t.incrementToken()) {
            // "Sana" is Finnish for "word": print the original word, the analyzed term,
            // its flags, and its base forms on one line.
            writer.write("Sana: " + originalWordAtt.getOriginalWord() + " "
                    + termAtt.toString() + " "
                    + Constants.toString(flagsAtt) + " "
                    + baseFormAtt.getBaseForms().toString() + "\n");
            writer.flush();
        }
        t.end();
    } finally {
        t.close();
    }
}
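A hedged invocation sketch: Voikko, KeepFilter, and Suggestion belong to the sukija/libvoikko stack, and every argument below (the "fi" dictionary, the word set, the empty from/to strings, the empty suggestion array) is an assumption for illustration rather than the project's own test setup.
public static void main(String[] args) throws IOException {
    // Assumed setup: new Voikko("fi") opens the locally installed Finnish dictionary.
    Voikko voikko = new Voikko("fi");
    CharArraySet wordSet = new CharArraySet(Arrays.asList("kissa", "koira"), true);
    Writer out = new OutputStreamWriter(System.out, StandardCharsets.UTF_8);
    // "Kissa istuu puussa." is Finnish for "The cat is sitting in a tree."
    KeepFilterTester.test(new StringReader("Kissa istuu puussa."), out, voikko, wordSet,
            "", "", new Suggestion[0], false);
    voikko.terminate();  // release the native libvoikko resources
}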