Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project languagetool by languagetool-org.
In class LanguageToolFilterTest, the method displayTokensWithFullDetails:
private static void displayTokensWithFullDetails(TokenStream stream) throws IOException {
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute type = stream.addAttribute(TypeAttribute.class);
    int position = 0;
    while (stream.incrementToken()) {
        int increment = posIncr.getPositionIncrement();
        if (increment > 0) {
            position = position + increment;
            System.out.println();
            System.out.print(position + ": ");
        }
        System.out.print("[" + term + ":" + offset.startOffset() + "->" + offset.endOffset() + ":" + type.type() + "] ");
    }
    System.out.println();
}
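The helper above assumes its caller has already reset the stream. A minimal, self-contained sketch of the full consume lifecycle (reset, incrementToken, end, close) is shown below; it is not part of the LanguageTool sources, and it assumes a recent Lucene release where StandardAnalyzer has a no-arg constructor and reset() is mandatory before the first incrementToken().
import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class TokenDumpDemo {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();   // assumes a Lucene version with a no-arg constructor
        try (TokenStream stream = analyzer.tokenStream("field", new StringReader("The quick brown fox"))) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
            stream.reset();                           // required before the first incrementToken()
            while (stream.incrementToken()) {
                System.out.println(term + " " + offset.startOffset() + "-" + offset.endOffset());
            }
            stream.end();                             // records the final offset of the input
        }                                             // try-with-resources closes the stream
        analyzer.close();
    }
}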
Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project textdb by TextDB.
In class DataflowUtils, the method tokenizeQuery:
/**
 * Tokenizes the query string using the given analyzer.
 *
 * @param luceneAnalyzer the Lucene analyzer used to split the query
 * @param query the query string to tokenize
 * @return ArrayList<String> list of tokens
 */
public static ArrayList<String> tokenizeQuery(Analyzer luceneAnalyzer, String query) {
    ArrayList<String> result = new ArrayList<String>();
    TokenStream tokenStream = luceneAnalyzer.tokenStream(null, new StringReader(query));
    CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class);
    try {
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            result.add(term.toString());
        }
        tokenStream.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
    return result;
}
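A hypothetical call site, not taken from the TextDB sources: the analyzer choice and the printed token list are illustrative only and assume a Lucene version where StandardAnalyzer has a no-arg constructor.
Analyzer analyzer = new StandardAnalyzer();
ArrayList<String> tokens = DataflowUtils.tokenizeQuery(analyzer, "New York City");
System.out.println(tokens);   // e.g. [new, york, city] after StandardAnalyzer's lowercasing
analyzer.close();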
Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project zm-mailbox by Zimbra.
In class UniversalAnalyzerTest, the method testSTD:
private void testSTD(String src) throws IOException {
    TokenStream std = standardAnalyzer.tokenStream(null, new StringReader(src));
    CharTermAttribute stdTermAttr = std.addAttribute(CharTermAttribute.class);
    OffsetAttribute stdOffsetAttr = std.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute stdPosIncAttr = std.addAttribute(PositionIncrementAttribute.class);

    TokenStream uni = universalAnalyzer.tokenStream(null, new StringReader(src));
    CharTermAttribute uniTermAttr = uni.addAttribute(CharTermAttribute.class);
    OffsetAttribute uniOffsetAttr = uni.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute uniPosIncAttr = uni.addAttribute(PositionIncrementAttribute.class);

    while (true) {
        boolean result = std.incrementToken();
        Assert.assertEquals(result, uni.incrementToken());
        if (!result) {
            break;
        }
        String term = stdTermAttr.toString();
        Assert.assertEquals(stdTermAttr, uniTermAttr);
        if (assertOffset) {
            Assert.assertEquals(term, stdOffsetAttr, uniOffsetAttr);
        }
        Assert.assertEquals(term, stdPosIncAttr, uniPosIncAttr);
    }
}
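The test walks both token streams in lock-step, asserting that the standard and universal analyzers agree token by token. A stripped-down sketch of the same lock-step pattern for two arbitrary analyzers follows; the method name assertSameTokens and the use of plain AssertionError are assumptions, reset()/end() are added for recent Lucene versions that require them, and the imports match those used in the snippet above plus java.io.StringReader.
static void assertSameTokens(Analyzer a, Analyzer b, String text) throws IOException {
    try (TokenStream sa = a.tokenStream("f", new StringReader(text));
         TokenStream sb = b.tokenStream("f", new StringReader(text))) {
        CharTermAttribute ta = sa.addAttribute(CharTermAttribute.class);
        CharTermAttribute tb = sb.addAttribute(CharTermAttribute.class);
        sa.reset();
        sb.reset();
        while (true) {
            boolean hasA = sa.incrementToken();
            boolean hasB = sb.incrementToken();
            if (hasA != hasB) {
                throw new AssertionError("token streams differ in length");
            }
            if (!hasA) {
                break;                                    // both streams exhausted
            }
            if (!ta.toString().equals(tb.toString())) {
                throw new AssertionError(ta + " != " + tb);
            }
        }
        sa.end();
        sb.end();
    }                                                     // try-with-resources closes both streams
}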
Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project zm-mailbox by Zimbra.
In class TermInfo, the method updateMapWithDetailsForField:
/**
 * Update {@code term2info} with information from {@code field}.
 *
 * If the field from the Lucene document is indexed and tokenized, for each token:
 *   a) construct a key based on the field name and info about the token
 *   b) if {@code term2info} has an entry for that key, get it, otherwise create an entry
 *   c) update the entry with position information for this token
 *
 * @param pos the current position
 * @return new value for {@code pos}
 */
public static int updateMapWithDetailsForField(Analyzer analyzer, Fieldable field,
        Map<String, TermInfo> term2info, int pos) throws IOException {
    if (!field.isIndexed()) {
        return pos;
    }
    Character prefix = LuceneFields.FIELD2PREFIX.get(field.name());
    if (prefix == null) {
        ZimbraLog.index.info("TermInfo.updateMapWithDetailsForField - skipping indexed field "
                + field.name() + " isTokenized=" + field.isTokenized());
        return pos;
    }
    if (field.isTokenized()) {
        TokenStream stream = field.tokenStreamValue();
        if (stream == null) {
            stream = analyzer.tokenStream(field.name(), new StringReader(field.stringValue()));
        }
        CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posAttr = stream.addAttribute(PositionIncrementAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            if (termAttr.length() == 0) {
                continue;
            }
            String term = prefix + termAttr.toString();
            TermInfo info = term2info.get(term);
            if (info == null) {
                info = new TermInfo();
                term2info.put(term, info);
            }
            pos += posAttr.getPositionIncrement();
            info.addPosition(pos);
        }
    } else {
        // The whole field is the only "token". Info is potentially stored twice - here as well as
        // where the field is stored.
        String term = prefix + field.stringValue();
        TermInfo info = term2info.get(term);
        if (info == null) {
            info = new TermInfo();
            term2info.put(term, info);
        }
    }
    return pos;
}
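The tokenized branch follows a get-or-create pattern keyed on prefix + token text, advancing a running position by each token's position increment. A simplified sketch of the same bookkeeping with plain collections is shown below; TermInfo, Fieldable and LuceneFields.FIELD2PREFIX are Zimbra-specific, so a Map<String, List<Integer>> and the method name addPositions stand in here as assumptions, java.util imports are presumed, and stream.end() is added for recent Lucene versions.
static int addPositions(Analyzer analyzer, char prefix, String fieldName, String text,
        Map<String, List<Integer>> term2positions, int pos) throws IOException {
    try (TokenStream stream = analyzer.tokenStream(fieldName, new StringReader(text))) {
        CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posAttr = stream.addAttribute(PositionIncrementAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            if (termAttr.length() == 0) {
                continue;                                  // skip empty tokens, as in the method above
            }
            String term = prefix + termAttr.toString();    // key = field prefix + token text
            pos += posAttr.getPositionIncrement();         // advance the running position
            term2positions.computeIfAbsent(term, k -> new ArrayList<>()).add(pos);
        }
        stream.end();
    }
    return pos;                                            // new value of pos for the next field
}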
Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project zm-mailbox by Zimbra.
In class ZimbraAnalyzer, the method getAllTokensConcatenated:
public static String getAllTokensConcatenated(String fieldName, Reader reader) {
    StringBuilder toReturn = new StringBuilder();
    TokenStream stream = SINGLETON.tokenStream(fieldName, reader);
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    try {
        stream.reset();
        while (stream.incrementToken()) {
            toReturn.append(term);
            toReturn.append(' ');
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        // otherwise eat it
        e.printStackTrace();
    }
    return toReturn.toString();
}