use of org.apache.lucene.analysis.CachingTokenFilter in project lucene-skos by behas.
the class SKOSQueryNodeProcessor method postProcessNode.
@Override
protected QueryNode postProcessNode(QueryNode node) throws QueryNodeException {
if (node instanceof TextableQueryNode && !(node instanceof WildcardQueryNode) && !(node instanceof FuzzyQueryNode) && !(node instanceof RegexpQueryNode) && !(node.getParent() instanceof RangeQueryNode)) {
FieldQueryNode fieldNode = ((FieldQueryNode) node);
String text = fieldNode.getTextAsString();
String field = fieldNode.getFieldAsString();
CachingTokenFilter buffer = null;
PositionIncrementAttribute posIncrAtt = null;
int numTokens = 0;
int positionCount = 0;
boolean severalTokensAtSamePosition = false;
try {
try (TokenStream source = this.analyzer.tokenStream(field, text)) {
buffer = new CachingTokenFilter(source);
buffer.reset();
if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
}
try {
while (buffer.incrementToken()) {
numTokens++;
int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
if (positionIncrement != 0) {
positionCount += positionIncrement;
} else {
severalTokensAtSamePosition = true;
}
}
} catch (IOException e) {
// ignore
}
// rewind the buffer stream
// will never through on subsequent reset calls
buffer.reset();
} catch (IOException e) {
throw new RuntimeException(e);
}
if (!buffer.hasAttribute(CharTermAttribute.class)) {
return new NoTokenFoundQueryNode();
}
CharTermAttribute termAtt = buffer.getAttribute(CharTermAttribute.class);
if (numTokens == 0) {
return new NoTokenFoundQueryNode();
} else if (numTokens == 1) {
String term = null;
try {
boolean hasNext;
hasNext = buffer.incrementToken();
assert hasNext == true;
term = termAtt.toString();
} catch (IOException e) {
// safe to ignore, because we know the number of tokens
}
fieldNode.setText(term);
return fieldNode;
} else if (severalTokensAtSamePosition || !(node instanceof QuotedFieldQueryNode)) {
if (positionCount == 1 || !(node instanceof QuotedFieldQueryNode)) {
if (positionCount == 1) {
// simple case: only one position, with synonyms
LinkedList<QueryNode> children = new LinkedList<>();
for (int i = 0; i < numTokens; i++) {
String term = null;
try {
boolean hasNext = buffer.incrementToken();
assert hasNext == true;
term = termAtt.toString();
} catch (IOException e) {
// safe to ignore, because we know the number of tokens
}
if (buffer.hasAttribute(SKOSTypeAttribute.class) && boosts != null) {
SKOSTypeAttribute skosAttr = buffer.getAttribute(SKOSTypeAttribute.class);
children.add(new BoostQueryNode(new FieldQueryNode(field, term, -1, -1), getBoost(skosAttr.getSkosType())));
} else {
children.add(new FieldQueryNode(field, term, -1, -1));
}
}
return new GroupQueryNode(new StandardBooleanQueryNode(children, positionCount == 1));
} else {
// multiple positions
QueryNode q = new StandardBooleanQueryNode(Collections.<QueryNode>emptyList(), false);
QueryNode currentQuery = null;
for (int i = 0; i < numTokens; i++) {
String term = null;
try {
boolean hasNext = buffer.incrementToken();
assert hasNext == true;
term = termAtt.toString();
} catch (IOException e) {
// safe to ignore, because we know the number of tokens
}
if (posIncrAtt != null && posIncrAtt.getPositionIncrement() == 0) {
if (!(currentQuery instanceof BooleanQueryNode)) {
QueryNode t = currentQuery;
currentQuery = new StandardBooleanQueryNode(Collections.<QueryNode>emptyList(), true);
((BooleanQueryNode) currentQuery).add(t);
}
((BooleanQueryNode) currentQuery).add(new FieldQueryNode(field, term, -1, -1));
} else {
if (currentQuery != null) {
if (this.defaultOperator == Operator.OR) {
q.add(currentQuery);
} else {
q.add(new ModifierQueryNode(currentQuery, Modifier.MOD_REQ));
}
}
currentQuery = new FieldQueryNode(field, term, -1, -1);
}
}
if (this.defaultOperator == Operator.OR) {
q.add(currentQuery);
} else {
q.add(new ModifierQueryNode(currentQuery, Modifier.MOD_REQ));
}
if (q instanceof BooleanQueryNode) {
q = new GroupQueryNode(q);
}
return q;
}
} else {
// phrase query:
MultiPhraseQueryNode mpq = new MultiPhraseQueryNode();
List<FieldQueryNode> multiTerms = new ArrayList<>();
int position = -1;
int i = 0;
int termGroupCount = 0;
for (; i < numTokens; i++) {
String term = null;
int positionIncrement = 1;
try {
boolean hasNext = buffer.incrementToken();
assert hasNext == true;
term = termAtt.toString();
if (posIncrAtt != null) {
positionIncrement = posIncrAtt.getPositionIncrement();
}
} catch (IOException e) {
// safe to ignore, because we know the number of tokens
}
if (positionIncrement > 0 && multiTerms.size() > 0) {
for (FieldQueryNode termNode : multiTerms) {
if (this.positionIncrementsEnabled) {
termNode.setPositionIncrement(position);
} else {
termNode.setPositionIncrement(termGroupCount);
}
mpq.add(termNode);
}
// Only increment once for each "group" of
// terms that were in the same position:
termGroupCount++;
multiTerms.clear();
}
position += positionIncrement;
multiTerms.add(new FieldQueryNode(field, term, -1, -1));
}
for (FieldQueryNode termNode : multiTerms) {
if (this.positionIncrementsEnabled) {
termNode.setPositionIncrement(position);
} else {
termNode.setPositionIncrement(termGroupCount);
}
mpq.add(termNode);
}
return mpq;
}
} else {
TokenizedPhraseQueryNode pq = new TokenizedPhraseQueryNode();
int position = -1;
for (int i = 0; i < numTokens; i++) {
String term = null;
int positionIncrement = 1;
try {
boolean hasNext = buffer.incrementToken();
assert hasNext == true;
term = termAtt.toString();
if (posIncrAtt != null) {
positionIncrement = posIncrAtt.getPositionIncrement();
}
} catch (IOException e) {
// safe to ignore, because we know the number of tokens
}
FieldQueryNode newFieldNode = new FieldQueryNode(field, term, -1, -1);
if (this.positionIncrementsEnabled) {
position += positionIncrement;
newFieldNode.setPositionIncrement(position);
} else {
newFieldNode.setPositionIncrement(i);
}
pq.add(newFieldNode);
}
return pq;
}
} finally {
if (buffer != null) {
try {
buffer.close();
} catch (IOException e) {
// safe to ignore
}
}
}
}
return node;
}
use of org.apache.lucene.analysis.CachingTokenFilter in project janusgraph by JanusGraph.
the class SolrIndex method customTokenize.
@SuppressWarnings("unchecked")
private List<String> customTokenize(String tokenizerClass, String value) {
CachingTokenFilter stream = null;
try {
final List<String> terms = new ArrayList<>();
final Tokenizer tokenizer = ((Constructor<Tokenizer>) ClassLoader.getSystemClassLoader().loadClass(tokenizerClass).getConstructor()).newInstance();
tokenizer.setReader(new StringReader(value));
stream = new CachingTokenFilter(tokenizer);
final TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
stream.reset();
while (stream.incrementToken()) {
terms.add(termAtt.getBytesRef().utf8ToString());
}
return terms;
} catch (ReflectiveOperationException | IOException e) {
throw new IllegalArgumentException(e.getMessage(), e);
} finally {
IOUtils.closeQuietly(stream);
}
}
Aggregations