Use of org.apache.lucene.analysis.TokenStream in project lucene-solr by apache.
From the class SpellCheckComponent, method getTokens.
private Collection<Token> getTokens(String q, Analyzer analyzer) throws IOException {
  Collection<Token> result = new ArrayList<>();
  assert analyzer != null;
  try (TokenStream ts = analyzer.tokenStream("", q)) {
    ts.reset();
    // TODO: support custom attributes
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
    FlagsAttribute flagsAtt = ts.addAttribute(FlagsAttribute.class);
    PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
    PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
    while (ts.incrementToken()) {
      // copy every standard attribute of the current position into a new Token
      Token token = new Token();
      token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
      token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
      token.setType(typeAtt.type());
      token.setFlags(flagsAtt.getFlags());
      token.setPayload(payloadAtt.getPayload());
      token.setPositionIncrement(posIncAtt.getPositionIncrement());
      result.add(token);
    }
    ts.end();
    return result;
  }
}
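The method follows the standard TokenStream consumption contract: reset(), a loop of incrementToken(), then end(). For reference, a minimal standalone sketch of that contract; the StandardAnalyzer and the sample text are illustrative choices, not part of the SpellCheckComponent code:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamContractDemo {
  public static void main(String[] args) throws IOException {
    try (Analyzer analyzer = new StandardAnalyzer()) {
      // try-with-resources closes the stream even if consumption throws
      try (TokenStream ts = analyzer.tokenStream("", "check spelling of this query")) {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();                 // mandatory before the first incrementToken()
        while (ts.incrementToken()) {
          System.out.println(termAtt.toString());
        }
        ts.end();                   // records end-of-stream state (e.g. final offset)
      }
    }
  }
}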
Use of org.apache.lucene.analysis.TokenStream in project lucene-solr by apache.
From the class QueryElevationComponent, method getAnalyzedQuery.
String getAnalyzedQuery(String query) throws IOException {
  if (analyzer == null) {
    return query;
  }
  StringBuilder norm = new StringBuilder();
  try (TokenStream tokens = analyzer.tokenStream("", query)) {
    tokens.reset();
    CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
    // concatenate all analyzed terms with no separator
    while (tokens.incrementToken()) {
      norm.append(termAtt.buffer(), 0, termAtt.length());
    }
    tokens.end();
    return norm.toString();
  }
}
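The effect is to strip analysis-level differences (case, token boundaries) from the query before it is matched against elevation rules; note that the terms are joined with no separator. A hedged standalone illustration, assuming a StandardAnalyzer (the component's actual analyzer comes from the configured field type):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class NormalizeDemo {
  // Same concatenation logic as getAnalyzedQuery, shown standalone.
  static String normalize(Analyzer analyzer, String query) throws IOException {
    StringBuilder norm = new StringBuilder();
    try (TokenStream tokens = analyzer.tokenStream("", query)) {
      CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
      tokens.reset();
      while (tokens.incrementToken()) {
        norm.append(termAtt.buffer(), 0, termAtt.length());
      }
      tokens.end();
    }
    return norm.toString();
  }

  public static void main(String[] args) throws IOException {
    try (Analyzer a = new StandardAnalyzer()) {
      // StandardAnalyzer lowercases and splits on word boundaries,
      // so this prints "ipoddockingstation"
      System.out.println(normalize(a, "iPod Docking Station"));
    }
  }
}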
Use of org.apache.lucene.analysis.TokenStream in project lucene-solr by apache.
From the class SimplePreAnalyzedParser, method toFormattedString.
@Override
public String toFormattedString(Field f) throws IOException {
  StringBuilder sb = new StringBuilder();
  sb.append(VERSION + " ");
  if (f.fieldType().stored()) {
    String s = f.stringValue();
    if (s != null) {
      // encode the equals sign: four backslashes in the Java literal yield a
      // single backslash in the replacement, so "=" becomes "\=" in the output
      s = s.replaceAll("=", "\\\\=");
      sb.append('=');
      sb.append(s);
      sb.append('=');
    }
  }
  TokenStream ts = f.tokenStreamValue();
  if (ts != null) {
    StringBuilder tok = new StringBuilder();
    boolean next = false;
    while (ts.incrementToken()) {
      if (next) {
        sb.append(' ');
      } else {
        next = true;
      }
      tok.setLength(0);
      Iterator<Class<? extends Attribute>> it = ts.getAttributeClassesIterator();
      String cTerm = null;
      String tTerm = null;
      while (it.hasNext()) {
        Class<? extends Attribute> cl = it.next();
        Attribute att = ts.getAttribute(cl);
        if (att == null) {
          continue;
        }
        if (cl.isAssignableFrom(CharTermAttribute.class)) {
          CharTermAttribute catt = (CharTermAttribute) att;
          cTerm = escape(catt.buffer(), catt.length());
        } else if (cl.isAssignableFrom(TermToBytesRefAttribute.class)) {
          TermToBytesRefAttribute tatt = (TermToBytesRefAttribute) att;
          char[] tTermChars = tatt.getBytesRef().utf8ToString().toCharArray();
          tTerm = escape(tTermChars, tTermChars.length);
        } else {
          if (tok.length() > 0) {
            tok.append(',');
          }
          if (cl.isAssignableFrom(FlagsAttribute.class)) {
            tok.append("f=" + Integer.toHexString(((FlagsAttribute) att).getFlags()));
          } else if (cl.isAssignableFrom(OffsetAttribute.class)) {
            tok.append("s=" + ((OffsetAttribute) att).startOffset() + ",e=" + ((OffsetAttribute) att).endOffset());
          } else if (cl.isAssignableFrom(PayloadAttribute.class)) {
            BytesRef p = ((PayloadAttribute) att).getPayload();
            if (p != null && p.length > 0) {
              tok.append("p=" + bytesToHex(p.bytes, p.offset, p.length));
            } else if (tok.length() > 0) {
              // remove the last comma
              tok.setLength(tok.length() - 1);
            }
          } else if (cl.isAssignableFrom(PositionIncrementAttribute.class)) {
            tok.append("i=" + ((PositionIncrementAttribute) att).getPositionIncrement());
          } else if (cl.isAssignableFrom(TypeAttribute.class)) {
            tok.append("y=" + escape(((TypeAttribute) att).type()));
          } else {
            tok.append(cl.getName() + "=" + escape(att.toString()));
          }
        }
      }
      String term = null;
      if (cTerm != null) {
        term = cTerm;
      } else {
        term = tTerm;
      }
      if (term != null && term.length() > 0) {
        if (tok.length() > 0) {
          tok.insert(0, term + ",");
        } else {
          tok.insert(0, term);
        }
      }
      sb.append(tok);
    }
  }
  return sb.toString();
}
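One subtlety in the stored-value encoding above is that String.replaceAll treats backslashes specially in both the pattern and the replacement, so the Java literal needs four backslashes to emit a single one. A minimal sketch of the difference:

public class EscapeDemo {
  public static void main(String[] args) {
    String s = "a=b";
    // "\\\\=" is the regex-replacement spelling of the two characters \=
    System.out.println(s.replaceAll("=", "\\\\="));  // prints a\=b
    // The common transcription error "\\=" collapses to plain "=" and is a no-op:
    System.out.println(s.replaceAll("=", "\\="));    // prints a=b
  }
}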
Use of org.apache.lucene.analysis.TokenStream in project lucene-solr by apache.
From the class TextField, method analyzeMultiTerm.
public static BytesRef analyzeMultiTerm(String field, String part, Analyzer analyzerIn) {
  if (part == null || analyzerIn == null) {
    return null;
  }
  try (TokenStream source = analyzerIn.tokenStream(field, part)) {
    source.reset();
    TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
    if (!source.incrementToken()) {
      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "analyzer returned no terms for multiTerm term: " + part);
    }
    BytesRef bytes = BytesRef.deepCopyOf(termAtt.getBytesRef());
    if (source.incrementToken()) {
      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "analyzer returned too many terms for multiTerm term: " + part);
    }
    source.end();
    return bytes;
  } catch (IOException e) {
    throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "error analyzing range part: " + part, e);
  }
}
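A hedged usage sketch of analyzeMultiTerm; the field name, the input text, and the choice of KeywordAnalyzer (which always emits exactly one token, so the single-term check passes) are illustrative, and a Solr classpath is assumed:

import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.schema.TextField;

public class MultiTermDemo {
  public static void main(String[] args) {
    // KeywordAnalyzer produces a single token for the whole input,
    // so analyzeMultiTerm's one-token invariant holds.
    BytesRef bytes = TextField.analyzeMultiTerm("title", "Foo*", new KeywordAnalyzer());
    System.out.println(bytes.utf8ToString()); // Foo*
  }
}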
Use of org.apache.lucene.analysis.TokenStream in project sukija by ahomansikka.
From the class AppTest, method test.
private boolean test(String input, String expectedOutput) throws IOException {
  Reader r = new StringReader(input);
  TokenStream t = new HVTokenizer();
  ((Tokenizer) t).setReader(r);
  t = new VoikkoFilter(t, voikko);
  t.reset();
  VoikkoAttribute sukijaAtt = t.addAttribute(VoikkoAttribute.class);
  CharTermAttribute termAtt = t.addAttribute(CharTermAttribute.class);
  while (t.incrementToken()) {
    System.out.println("AppTest " + termAtt.toString());
    for (int i = 0; i < sukijaAtt.getAnalysis().size(); i++) {
      // print the base form of each Voikko analysis
      System.out.println(sukijaAtt.getAnalysis(i).get("BASEFORM"));
      // VoikkoUtils.printAnalysisResult (sukijaAtt.getAnalysis(i), System.out);
    }
    System.out.println("");
  }
  t.end();   // finish and release the stream per the TokenStream contract
  t.close();
  // note: expectedOutput is not compared here; the method only prints analyses
  return true;
}
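The same tokenizer-plus-filter chaining can be reproduced with stock Lucene classes; a minimal sketch in which WhitespaceTokenizer and LowerCaseFilter stand in for the project-specific HVTokenizer and VoikkoFilter (import locations as in Lucene 5.x/6.x; they moved in later versions):

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ChainDemo {
  public static void main(String[] args) throws IOException {
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("Tämä On Testi"));
    // the filter wraps the tokenizer, just as VoikkoFilter wraps HVTokenizer above
    TokenStream t = new LowerCaseFilter(tokenizer);
    CharTermAttribute termAtt = t.addAttribute(CharTermAttribute.class);
    t.reset();
    while (t.incrementToken()) {
      System.out.println(termAtt.toString()); // tämä, on, testi
    }
    t.end();
    t.close();
  }
}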