use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project elasticsearch by elastic.
the class TruncateTokenFilterTests method testSimple.
public void testSimple() throws IOException {
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(t, new TruncateTokenFilter(t, 3));
}
};
TokenStream test = analyzer.tokenStream("test", "a bb ccc dddd eeeee");
test.reset();
CharTermAttribute termAttribute = test.addAttribute(CharTermAttribute.class);
assertThat(test.incrementToken(), equalTo(true));
assertThat(termAttribute.toString(), equalTo("a"));
assertThat(test.incrementToken(), equalTo(true));
assertThat(termAttribute.toString(), equalTo("bb"));
assertThat(test.incrementToken(), equalTo(true));
assertThat(termAttribute.toString(), equalTo("ccc"));
assertThat(test.incrementToken(), equalTo(true));
assertThat(termAttribute.toString(), equalTo("ddd"));
assertThat(test.incrementToken(), equalTo(true));
assertThat(termAttribute.toString(), equalTo("eee"));
assertThat(test.incrementToken(), equalTo(false));
}
use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project elasticsearch by elastic.
the class UniqueTokenFilterTests method testSimple.
public void testSimple() throws IOException {
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(t, new UniqueTokenFilter(t));
}
};
TokenStream test = analyzer.tokenStream("test", "this test with test");
test.reset();
CharTermAttribute termAttribute = test.addAttribute(CharTermAttribute.class);
assertThat(test.incrementToken(), equalTo(true));
assertThat(termAttribute.toString(), equalTo("this"));
assertThat(test.incrementToken(), equalTo(true));
assertThat(termAttribute.toString(), equalTo("test"));
assertThat(test.incrementToken(), equalTo(true));
assertThat(termAttribute.toString(), equalTo("with"));
assertThat(test.incrementToken(), equalTo(false));
}
use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-skos by behas.
the class SKOSQueryNodeProcessor method postProcessNode.
@Override
protected QueryNode postProcessNode(QueryNode node) throws QueryNodeException {
if (node instanceof TextableQueryNode && !(node instanceof WildcardQueryNode) && !(node instanceof FuzzyQueryNode) && !(node instanceof RegexpQueryNode) && !(node.getParent() instanceof RangeQueryNode)) {
FieldQueryNode fieldNode = ((FieldQueryNode) node);
String text = fieldNode.getTextAsString();
String field = fieldNode.getFieldAsString();
CachingTokenFilter buffer = null;
PositionIncrementAttribute posIncrAtt = null;
int numTokens = 0;
int positionCount = 0;
boolean severalTokensAtSamePosition = false;
try {
try (TokenStream source = this.analyzer.tokenStream(field, text)) {
buffer = new CachingTokenFilter(source);
buffer.reset();
if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
}
try {
while (buffer.incrementToken()) {
numTokens++;
int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
if (positionIncrement != 0) {
positionCount += positionIncrement;
} else {
severalTokensAtSamePosition = true;
}
}
} catch (IOException e) {
// ignore
}
// rewind the buffer stream
//will never through on subsequent reset calls
buffer.reset();
} catch (IOException e) {
throw new RuntimeException(e);
}
if (!buffer.hasAttribute(CharTermAttribute.class)) {
return new NoTokenFoundQueryNode();
}
CharTermAttribute termAtt = buffer.getAttribute(CharTermAttribute.class);
if (numTokens == 0) {
return new NoTokenFoundQueryNode();
} else if (numTokens == 1) {
String term = null;
try {
boolean hasNext;
hasNext = buffer.incrementToken();
assert hasNext == true;
term = termAtt.toString();
} catch (IOException e) {
// safe to ignore, because we know the number of tokens
}
fieldNode.setText(term);
return fieldNode;
} else if (severalTokensAtSamePosition || !(node instanceof QuotedFieldQueryNode)) {
if (positionCount == 1 || !(node instanceof QuotedFieldQueryNode)) {
if (positionCount == 1) {
// simple case: only one position, with synonyms
LinkedList<QueryNode> children = new LinkedList<>();
for (int i = 0; i < numTokens; i++) {
String term = null;
try {
boolean hasNext = buffer.incrementToken();
assert hasNext == true;
term = termAtt.toString();
} catch (IOException e) {
// safe to ignore, because we know the number of tokens
}
if (buffer.hasAttribute(SKOSTypeAttribute.class) && boosts != null) {
SKOSTypeAttribute skosAttr = buffer.getAttribute(SKOSTypeAttribute.class);
children.add(new BoostQueryNode(new FieldQueryNode(field, term, -1, -1), getBoost(skosAttr.getSkosType())));
} else {
children.add(new FieldQueryNode(field, term, -1, -1));
}
}
return new GroupQueryNode(new StandardBooleanQueryNode(children, positionCount == 1));
} else {
// multiple positions
QueryNode q = new StandardBooleanQueryNode(Collections.<QueryNode>emptyList(), false);
QueryNode currentQuery = null;
for (int i = 0; i < numTokens; i++) {
String term = null;
try {
boolean hasNext = buffer.incrementToken();
assert hasNext == true;
term = termAtt.toString();
} catch (IOException e) {
// safe to ignore, because we know the number of tokens
}
if (posIncrAtt != null && posIncrAtt.getPositionIncrement() == 0) {
if (!(currentQuery instanceof BooleanQueryNode)) {
QueryNode t = currentQuery;
currentQuery = new StandardBooleanQueryNode(Collections.<QueryNode>emptyList(), true);
((BooleanQueryNode) currentQuery).add(t);
}
((BooleanQueryNode) currentQuery).add(new FieldQueryNode(field, term, -1, -1));
} else {
if (currentQuery != null) {
if (this.defaultOperator == Operator.OR) {
q.add(currentQuery);
} else {
q.add(new ModifierQueryNode(currentQuery, Modifier.MOD_REQ));
}
}
currentQuery = new FieldQueryNode(field, term, -1, -1);
}
}
if (this.defaultOperator == Operator.OR) {
q.add(currentQuery);
} else {
q.add(new ModifierQueryNode(currentQuery, Modifier.MOD_REQ));
}
if (q instanceof BooleanQueryNode) {
q = new GroupQueryNode(q);
}
return q;
}
} else {
// phrase query:
MultiPhraseQueryNode mpq = new MultiPhraseQueryNode();
List<FieldQueryNode> multiTerms = new ArrayList<>();
int position = -1;
int i = 0;
int termGroupCount = 0;
for (; i < numTokens; i++) {
String term = null;
int positionIncrement = 1;
try {
boolean hasNext = buffer.incrementToken();
assert hasNext == true;
term = termAtt.toString();
if (posIncrAtt != null) {
positionIncrement = posIncrAtt.getPositionIncrement();
}
} catch (IOException e) {
// safe to ignore, because we know the number of tokens
}
if (positionIncrement > 0 && multiTerms.size() > 0) {
for (FieldQueryNode termNode : multiTerms) {
if (this.positionIncrementsEnabled) {
termNode.setPositionIncrement(position);
} else {
termNode.setPositionIncrement(termGroupCount);
}
mpq.add(termNode);
}
// Only increment once for each "group" of
// terms that were in the same position:
termGroupCount++;
multiTerms.clear();
}
position += positionIncrement;
multiTerms.add(new FieldQueryNode(field, term, -1, -1));
}
for (FieldQueryNode termNode : multiTerms) {
if (this.positionIncrementsEnabled) {
termNode.setPositionIncrement(position);
} else {
termNode.setPositionIncrement(termGroupCount);
}
mpq.add(termNode);
}
return mpq;
}
} else {
TokenizedPhraseQueryNode pq = new TokenizedPhraseQueryNode();
int position = -1;
for (int i = 0; i < numTokens; i++) {
String term = null;
int positionIncrement = 1;
try {
boolean hasNext = buffer.incrementToken();
assert hasNext == true;
term = termAtt.toString();
if (posIncrAtt != null) {
positionIncrement = posIncrAtt.getPositionIncrement();
}
} catch (IOException e) {
// safe to ignore, because we know the number of tokens
}
FieldQueryNode newFieldNode = new FieldQueryNode(field, term, -1, -1);
if (this.positionIncrementsEnabled) {
position += positionIncrement;
newFieldNode.setPositionIncrement(position);
} else {
newFieldNode.setPositionIncrement(i);
}
pq.add(newFieldNode);
}
return pq;
}
} finally {
if (buffer != null) {
try {
buffer.close();
} catch (IOException e) {
// safe to ignore
}
}
}
}
return node;
}
use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-skos by behas.
the class AbstractSKOSFilter method analyze.
public static CharsRef analyze(Analyzer analyzer, String text, CharsRefBuilder buffer) throws IOException {
TokenStream ts = analyzer.tokenStream("", new StringReader(text));
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
ts.reset();
while (ts.incrementToken()) {
int length = termAtt.length();
if (length == 0) {
throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
}
if (buffer.length() > 0) {
buffer.append(' ');
}
buffer.append(termAtt.buffer(), 0, length);
}
ts.end();
ts.close();
if (buffer.length() == 0) {
throw new IllegalArgumentException("term: " + text + " was completely eliminated by analyzer");
}
return buffer.get();
}
use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr-analysis-turkish by iorixxx.
the class Zemberek2DeASCIIfyFilterFactory method main.
public static void main(String[] args) throws IOException {
StringReader reader = new StringReader("kus asisi ortaklar çekişme masali");
Map<String, String> map = new HashMap<>();
Zemberek2DeASCIIfyFilterFactory factory = new Zemberek2DeASCIIfyFilterFactory(map);
WhitespaceTokenizer whitespaceTokenizer = new WhitespaceTokenizer();
whitespaceTokenizer.setReader(reader);
TokenStream stream = factory.create(whitespaceTokenizer);
CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);
stream.reset();
while (stream.incrementToken()) {
String term = termAttribute.toString();
System.out.println(term);
}
stream.end();
reader.close();
}
Aggregations