
Example 91 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project elasticsearch by elastic.

Class TruncateTokenFilterTests, method testSimple.

public void testSimple() throws IOException {
    Analyzer analyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(t, new TruncateTokenFilter(t, 3));
        }
    };
    TokenStream test = analyzer.tokenStream("test", "a bb ccc dddd eeeee");
    test.reset();
    CharTermAttribute termAttribute = test.addAttribute(CharTermAttribute.class);
    assertThat(test.incrementToken(), equalTo(true));
    assertThat(termAttribute.toString(), equalTo("a"));
    assertThat(test.incrementToken(), equalTo(true));
    assertThat(termAttribute.toString(), equalTo("bb"));
    assertThat(test.incrementToken(), equalTo(true));
    assertThat(termAttribute.toString(), equalTo("ccc"));
    assertThat(test.incrementToken(), equalTo(true));
    assertThat(termAttribute.toString(), equalTo("ddd"));
    assertThat(test.incrementToken(), equalTo(true));
    assertThat(termAttribute.toString(), equalTo("eee"));
    assertThat(test.incrementToken(), equalTo(false));
}
Also used: MockTokenizer(org.apache.lucene.analysis.MockTokenizer) TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) Analyzer(org.apache.lucene.analysis.Analyzer) Tokenizer(org.apache.lucene.analysis.Tokenizer)
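
A filter of this kind can be written compactly against Lucene's TokenFilter API: consume the wrapped stream, then shorten the CharTermAttribute in place. The sketch below only illustrates that pattern under the assumption of a plain Lucene dependency; it is not the Elasticsearch TruncateTokenFilter source, and the class name is a placeholder.

import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Placeholder class, not the Elasticsearch implementation: trims each term to at most maxLength chars.
final class SimpleTruncateFilter extends TokenFilter {

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final int maxLength;

    SimpleTruncateFilter(TokenStream input, int maxLength) {
        super(input);
        this.maxLength = maxLength;
    }

    @Override
    public boolean incrementToken() throws IOException {
        if (!input.incrementToken()) {
            return false;
        }
        if (termAtt.length() > maxLength) {
            // CharTermAttribute is mutable; shrinking its length truncates the emitted term.
            termAtt.setLength(maxLength);
        }
        return true;
    }
}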

Example 92 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project elasticsearch by elastic.

Class UniqueTokenFilterTests, method testSimple.

public void testSimple() throws IOException {
    Analyzer analyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(t, new UniqueTokenFilter(t));
        }
    };
    TokenStream test = analyzer.tokenStream("test", "this test with test");
    test.reset();
    CharTermAttribute termAttribute = test.addAttribute(CharTermAttribute.class);
    assertThat(test.incrementToken(), equalTo(true));
    assertThat(termAttribute.toString(), equalTo("this"));
    assertThat(test.incrementToken(), equalTo(true));
    assertThat(termAttribute.toString(), equalTo("test"));
    assertThat(test.incrementToken(), equalTo(true));
    assertThat(termAttribute.toString(), equalTo("with"));
    assertThat(test.incrementToken(), equalTo(false));
}
Also used: MockTokenizer(org.apache.lucene.analysis.MockTokenizer) TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) Analyzer(org.apache.lucene.analysis.Analyzer) Tokenizer(org.apache.lucene.analysis.Tokenizer)
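
Both of these tests follow the same consumption pattern: obtain a CharTermAttribute, reset() the stream, loop over incrementToken(), and read the attribute after each call. A small helper such as the hypothetical collectTokens below captures that pattern; the method name and signature are ours, not part of Lucene or Elasticsearch.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Hypothetical helper: runs the analyzer over text and returns the emitted terms in order.
static List<String> collectTokens(Analyzer analyzer, String field, String text) throws IOException {
    List<String> terms = new ArrayList<>();
    try (TokenStream stream = analyzer.tokenStream(field, text)) {
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        // reset() is mandatory before the first incrementToken() call.
        stream.reset();
        while (stream.incrementToken()) {
            // toString() copies the current contents of the shared term buffer.
            terms.add(termAtt.toString());
        }
        // end() records end-of-stream state; close() is handled by try-with-resources.
        stream.end();
    }
    return terms;
}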

Example 93 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-skos by behas.

Class SKOSQueryNodeProcessor, method postProcessNode.

@Override
protected QueryNode postProcessNode(QueryNode node) throws QueryNodeException {
    if (node instanceof TextableQueryNode && !(node instanceof WildcardQueryNode) && !(node instanceof FuzzyQueryNode) && !(node instanceof RegexpQueryNode) && !(node.getParent() instanceof RangeQueryNode)) {
        FieldQueryNode fieldNode = ((FieldQueryNode) node);
        String text = fieldNode.getTextAsString();
        String field = fieldNode.getFieldAsString();
        CachingTokenFilter buffer = null;
        PositionIncrementAttribute posIncrAtt = null;
        int numTokens = 0;
        int positionCount = 0;
        boolean severalTokensAtSamePosition = false;
        try {
            try (TokenStream source = this.analyzer.tokenStream(field, text)) {
                buffer = new CachingTokenFilter(source);
                buffer.reset();
                if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
                    posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
                }
                try {
                    while (buffer.incrementToken()) {
                        numTokens++;
                        int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
                        if (positionIncrement != 0) {
                            positionCount += positionIncrement;
                        } else {
                            severalTokensAtSamePosition = true;
                        }
                    }
                } catch (IOException e) {
                // ignore
                }
                // rewind the buffer stream; reset() will never throw on subsequent calls
                buffer.reset();
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
            if (!buffer.hasAttribute(CharTermAttribute.class)) {
                return new NoTokenFoundQueryNode();
            }
            CharTermAttribute termAtt = buffer.getAttribute(CharTermAttribute.class);
            if (numTokens == 0) {
                return new NoTokenFoundQueryNode();
            } else if (numTokens == 1) {
                String term = null;
                try {
                    boolean hasNext;
                    hasNext = buffer.incrementToken();
                    assert hasNext == true;
                    term = termAtt.toString();
                } catch (IOException e) {
                // safe to ignore, because we know the number of tokens
                }
                fieldNode.setText(term);
                return fieldNode;
            } else if (severalTokensAtSamePosition || !(node instanceof QuotedFieldQueryNode)) {
                if (positionCount == 1 || !(node instanceof QuotedFieldQueryNode)) {
                    if (positionCount == 1) {
                        // simple case: only one position, with synonyms
                        LinkedList<QueryNode> children = new LinkedList<>();
                        for (int i = 0; i < numTokens; i++) {
                            String term = null;
                            try {
                                boolean hasNext = buffer.incrementToken();
                                assert hasNext == true;
                                term = termAtt.toString();
                            } catch (IOException e) {
                            // safe to ignore, because we know the number of tokens
                            }
                            if (buffer.hasAttribute(SKOSTypeAttribute.class) && boosts != null) {
                                SKOSTypeAttribute skosAttr = buffer.getAttribute(SKOSTypeAttribute.class);
                                children.add(new BoostQueryNode(new FieldQueryNode(field, term, -1, -1), getBoost(skosAttr.getSkosType())));
                            } else {
                                children.add(new FieldQueryNode(field, term, -1, -1));
                            }
                        }
                        return new GroupQueryNode(new StandardBooleanQueryNode(children, positionCount == 1));
                    } else {
                        // multiple positions
                        QueryNode q = new StandardBooleanQueryNode(Collections.<QueryNode>emptyList(), false);
                        QueryNode currentQuery = null;
                        for (int i = 0; i < numTokens; i++) {
                            String term = null;
                            try {
                                boolean hasNext = buffer.incrementToken();
                                assert hasNext == true;
                                term = termAtt.toString();
                            } catch (IOException e) {
                            // safe to ignore, because we know the number of tokens
                            }
                            if (posIncrAtt != null && posIncrAtt.getPositionIncrement() == 0) {
                                if (!(currentQuery instanceof BooleanQueryNode)) {
                                    QueryNode t = currentQuery;
                                    currentQuery = new StandardBooleanQueryNode(Collections.<QueryNode>emptyList(), true);
                                    ((BooleanQueryNode) currentQuery).add(t);
                                }
                                ((BooleanQueryNode) currentQuery).add(new FieldQueryNode(field, term, -1, -1));
                            } else {
                                if (currentQuery != null) {
                                    if (this.defaultOperator == Operator.OR) {
                                        q.add(currentQuery);
                                    } else {
                                        q.add(new ModifierQueryNode(currentQuery, Modifier.MOD_REQ));
                                    }
                                }
                                currentQuery = new FieldQueryNode(field, term, -1, -1);
                            }
                        }
                        if (this.defaultOperator == Operator.OR) {
                            q.add(currentQuery);
                        } else {
                            q.add(new ModifierQueryNode(currentQuery, Modifier.MOD_REQ));
                        }
                        if (q instanceof BooleanQueryNode) {
                            q = new GroupQueryNode(q);
                        }
                        return q;
                    }
                } else {
                    // phrase query:
                    MultiPhraseQueryNode mpq = new MultiPhraseQueryNode();
                    List<FieldQueryNode> multiTerms = new ArrayList<>();
                    int position = -1;
                    int i = 0;
                    int termGroupCount = 0;
                    for (; i < numTokens; i++) {
                        String term = null;
                        int positionIncrement = 1;
                        try {
                            boolean hasNext = buffer.incrementToken();
                            assert hasNext == true;
                            term = termAtt.toString();
                            if (posIncrAtt != null) {
                                positionIncrement = posIncrAtt.getPositionIncrement();
                            }
                        } catch (IOException e) {
                        // safe to ignore, because we know the number of tokens
                        }
                        if (positionIncrement > 0 && multiTerms.size() > 0) {
                            for (FieldQueryNode termNode : multiTerms) {
                                if (this.positionIncrementsEnabled) {
                                    termNode.setPositionIncrement(position);
                                } else {
                                    termNode.setPositionIncrement(termGroupCount);
                                }
                                mpq.add(termNode);
                            }
                            // Only increment once for each "group" of
                            // terms that were in the same position:
                            termGroupCount++;
                            multiTerms.clear();
                        }
                        position += positionIncrement;
                        multiTerms.add(new FieldQueryNode(field, term, -1, -1));
                    }
                    for (FieldQueryNode termNode : multiTerms) {
                        if (this.positionIncrementsEnabled) {
                            termNode.setPositionIncrement(position);
                        } else {
                            termNode.setPositionIncrement(termGroupCount);
                        }
                        mpq.add(termNode);
                    }
                    return mpq;
                }
            } else {
                TokenizedPhraseQueryNode pq = new TokenizedPhraseQueryNode();
                int position = -1;
                for (int i = 0; i < numTokens; i++) {
                    String term = null;
                    int positionIncrement = 1;
                    try {
                        boolean hasNext = buffer.incrementToken();
                        assert hasNext == true;
                        term = termAtt.toString();
                        if (posIncrAtt != null) {
                            positionIncrement = posIncrAtt.getPositionIncrement();
                        }
                    } catch (IOException e) {
                    // safe to ignore, because we know the number of tokens
                    }
                    FieldQueryNode newFieldNode = new FieldQueryNode(field, term, -1, -1);
                    if (this.positionIncrementsEnabled) {
                        position += positionIncrement;
                        newFieldNode.setPositionIncrement(position);
                    } else {
                        newFieldNode.setPositionIncrement(i);
                    }
                    pq.add(newFieldNode);
                }
                return pq;
            }
        } finally {
            if (buffer != null) {
                try {
                    buffer.close();
                } catch (IOException e) {
                // safe to ignore
                }
            }
        }
    }
    return node;
}
Also used: FuzzyQueryNode(org.apache.lucene.queryparser.flexible.core.nodes.FuzzyQueryNode) TokenStream(org.apache.lucene.analysis.TokenStream) SKOSTypeAttribute(at.ac.univie.mminf.luceneSKOS.analysis.SKOSTypeAttribute) QuotedFieldQueryNode(org.apache.lucene.queryparser.flexible.core.nodes.QuotedFieldQueryNode) ArrayList(java.util.ArrayList) GroupQueryNode(org.apache.lucene.queryparser.flexible.core.nodes.GroupQueryNode) WildcardQueryNode(org.apache.lucene.queryparser.flexible.standard.nodes.WildcardQueryNode) FieldQueryNode(org.apache.lucene.queryparser.flexible.core.nodes.FieldQueryNode) NoTokenFoundQueryNode(org.apache.lucene.queryparser.flexible.core.nodes.NoTokenFoundQueryNode) BoostQueryNode(org.apache.lucene.queryparser.flexible.core.nodes.BoostQueryNode) RegexpQueryNode(org.apache.lucene.queryparser.flexible.standard.nodes.RegexpQueryNode) IOException(java.io.IOException) LinkedList(java.util.LinkedList) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute) StandardBooleanQueryNode(org.apache.lucene.queryparser.flexible.standard.nodes.StandardBooleanQueryNode) TokenizedPhraseQueryNode(org.apache.lucene.queryparser.flexible.core.nodes.TokenizedPhraseQueryNode) RangeQueryNode(org.apache.lucene.queryparser.flexible.core.nodes.RangeQueryNode) ModifierQueryNode(org.apache.lucene.queryparser.flexible.core.nodes.ModifierQueryNode) MultiPhraseQueryNode(org.apache.lucene.queryparser.flexible.standard.nodes.MultiPhraseQueryNode) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) CachingTokenFilter(org.apache.lucene.analysis.CachingTokenFilter) BooleanQueryNode(org.apache.lucene.queryparser.flexible.core.nodes.BooleanQueryNode) QueryNode(org.apache.lucene.queryparser.flexible.core.nodes.QueryNode) TextableQueryNode(org.apache.lucene.queryparser.flexible.core.nodes.TextableQueryNode)
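
The core of this processor is its first pass over a CachingTokenFilter: it counts tokens and distinct positions (and flags stacked tokens such as synonyms) before the cached stream is rewound and replayed to build the query nodes. A stripped-down sketch of that counting pass is shown below; the TokenStats holder and the countTokens method are placeholders of ours, not part of lucene-skos.

import java.io.IOException;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

// Placeholder result holder for the counting pass.
final class TokenStats {
    int numTokens;
    int positionCount;
    boolean severalTokensAtSamePosition;
}

// First pass over the cached stream: count tokens and positions, then rewind for replay.
static TokenStats countTokens(CachingTokenFilter buffer) throws IOException {
    TokenStats stats = new TokenStats();
    // The first reset() resets the wrapped stream; tokens are cached as they are consumed.
    buffer.reset();
    PositionIncrementAttribute posIncrAtt =
            buffer.hasAttribute(PositionIncrementAttribute.class)
                    ? buffer.getAttribute(PositionIncrementAttribute.class)
                    : null;
    while (buffer.incrementToken()) {
        stats.numTokens++;
        int increment = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
        if (increment != 0) {
            stats.positionCount += increment;          // token starts a new position
        } else {
            stats.severalTokensAtSamePosition = true;  // stacked token, e.g. a synonym
        }
    }
    // Subsequent reset() calls rewind the cache so the caller can replay the same tokens.
    buffer.reset();
    return stats;
}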

Example 94 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-skos by behas.

Class AbstractSKOSFilter, method analyze.

public static CharsRef analyze(Analyzer analyzer, String text, CharsRefBuilder buffer) throws IOException {
    TokenStream ts = analyzer.tokenStream("", new StringReader(text));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        int length = termAtt.length();
        if (length == 0) {
            throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
        }
        if (buffer.length() > 0) {
            buffer.append(' ');
        }
        buffer.append(termAtt.buffer(), 0, length);
    }
    ts.end();
    ts.close();
    if (buffer.length() == 0) {
        throw new IllegalArgumentException("term: " + text + " was completely eliminated by analyzer");
    }
    return buffer.get();
}
Also used: TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) StringReader(java.io.StringReader)
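
As a usage illustration only, the helper above could be invoked as in the sketch below; the analyzer choice and the input string are assumptions made for this example and do not come from lucene-skos.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CharsRefBuilder;

public static void main(String[] args) throws IOException {
    // Analyze a phrase and join the resulting tokens with single spaces.
    try (Analyzer analyzer = new StandardAnalyzer()) {
        CharsRefBuilder builder = new CharsRefBuilder();
        CharsRef analyzed = AbstractSKOSFilter.analyze(analyzer, "Semantic Web vocabularies", builder);
        // With a lowercasing analyzer this prints: semantic web vocabularies
        System.out.println(analyzed);
    }
}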

Example 95 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr-analysis-turkish by iorixxx.

Class Zemberek2DeASCIIfyFilterFactory, method main.

public static void main(String[] args) throws IOException {
    StringReader reader = new StringReader("kus asisi ortaklar çekişme masali");
    Map<String, String> map = new HashMap<>();
    Zemberek2DeASCIIfyFilterFactory factory = new Zemberek2DeASCIIfyFilterFactory(map);
    WhitespaceTokenizer whitespaceTokenizer = new WhitespaceTokenizer();
    whitespaceTokenizer.setReader(reader);
    TokenStream stream = factory.create(whitespaceTokenizer);
    CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        String term = termAttribute.toString();
        System.out.println(term);
    }
    stream.end();
    stream.close();
    reader.close();
}
Also used: WhitespaceTokenizer(org.apache.lucene.analysis.core.WhitespaceTokenizer) TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) HashMap(java.util.HashMap) StringReader(java.io.StringReader)

Aggregations

CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 151
TokenStream (org.apache.lucene.analysis.TokenStream): 95
StringReader (java.io.StringReader): 46
OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute): 35
PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute): 34
IOException (java.io.IOException): 27
ArrayList (java.util.ArrayList): 27
Tokenizer (org.apache.lucene.analysis.Tokenizer): 25
Analyzer (org.apache.lucene.analysis.Analyzer): 20
PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute): 16
BytesRef (org.apache.lucene.util.BytesRef): 15
TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute): 13
LinkedList (java.util.LinkedList): 11
FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute): 10
Term (org.apache.lucene.index.Term): 10
HashMap (java.util.HashMap): 9
Token (org.apache.lucene.analysis.Token): 8
Document (org.apache.lucene.document.Document): 8
List (java.util.List): 7
HashSet (java.util.HashSet): 6