Example 6 with OffsetAttribute

Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project jackrabbit by apache.

The class AbstractExcerpt, method createTermPositionVector.

/**
     * @param text the text to build the term position vector for.
     * @return a <code>TermPositionVector</code> for the given text.
     */
private TermPositionVector createTermPositionVector(String text) {
    // term -> TermVectorOffsetInfo[]
    final SortedMap<String, TermVectorOffsetInfo[]> termMap = new TreeMap<String, TermVectorOffsetInfo[]>();
    Reader r = new StringReader(text);
    TokenStream ts = index.getTextAnalyzer().tokenStream("", r);
    try {
        ts.reset();
        while (ts.incrementToken()) {
            OffsetAttribute offset = ts.getAttribute(OffsetAttribute.class);
            TermAttribute term = ts.getAttribute(TermAttribute.class);
            String termText = term.term();
            // grow the per-term offset array by one and append this occurrence
            TermVectorOffsetInfo[] info = termMap.get(termText);
            if (info == null) {
                info = new TermVectorOffsetInfo[1];
            } else {
                TermVectorOffsetInfo[] tmp = info;
                info = new TermVectorOffsetInfo[tmp.length + 1];
                System.arraycopy(tmp, 0, info, 0, tmp.length);
            }
            info[info.length - 1] = new TermVectorOffsetInfo(offset.startOffset(), offset.endOffset());
            termMap.put(termText, info);
        }
        ts.end();
    } catch (IOException e) {
        // should never happen, we are reading from a string
    } finally {
        try {
            // close in finally so the stream is released even when
            // incrementToken() throws
            ts.close();
        } catch (IOException e) {
            // ignore
        }
    }
    return new TermPositionVector() {

        private String[] terms = termMap.keySet().toArray(new String[termMap.size()]);

        public int[] getTermPositions(int index) {
            // token positions are not tracked for excerpts, only offsets
            return null;
        }

        public TermVectorOffsetInfo[] getOffsets(int index) {
            TermVectorOffsetInfo[] info = TermVectorOffsetInfo.EMPTY_OFFSET_INFO;
            if (index >= 0 && index < terms.length) {
                info = termMap.get(terms[index]);
            }
            return info;
        }

        public String getField() {
            return "";
        }

        public int size() {
            return terms.length;
        }

        public String[] getTerms() {
            return terms;
        }

        public int[] getTermFrequencies() {
            int[] freqs = new int[terms.length];
            for (int i = 0; i < terms.length; i++) {
                freqs[i] = termMap.get(terms[i]).length;
            }
            return freqs;
        }

        public int indexOf(String term) {
            int res = Arrays.binarySearch(terms, term);
            return res >= 0 ? res : -1;
        }

        public int[] indexesOf(String[] terms, int start, int len) {
            int[] res = new int[len];
            for (int i = 0; i < len; i++) {
                res[i] = indexOf(terms[i]);
            }
            return res;
        }
    };
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) Reader(java.io.Reader) StringReader(java.io.StringReader) IndexReader(org.apache.lucene.index.IndexReader) IOException(java.io.IOException) TreeMap(java.util.TreeMap) TermVectorOffsetInfo(org.apache.lucene.index.TermVectorOffsetInfo) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) TermAttribute(org.apache.lucene.analysis.tokenattributes.TermAttribute) TermPositionVector(org.apache.lucene.index.TermPositionVector)
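
The vector built above is what the excerpt code walks to locate highlight offsets. A minimal consumption sketch, assuming a Lucene 3.x classpath; the call site and input string are hypothetical:

TermPositionVector tpv = createTermPositionVector("the quick brown fox");
String[] terms = tpv.getTerms();
int[] freqs = tpv.getTermFrequencies();
for (int i = 0; i < terms.length; i++) {
    // each term maps to one TermVectorOffsetInfo per occurrence in the text
    for (TermVectorOffsetInfo info : tpv.getOffsets(i)) {
        System.out.println(terms[i] + " x" + freqs[i]
                + " [" + info.getStartOffset() + ", " + info.getEndOffset() + ")");
    }
}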

Example 7 with OffsetAttribute

Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project jackrabbit-oak by apache.

The class LuceneIndex, method tokenize.

/**
     * Tries to merge back tokens that are split on relevant fulltext query
     * wildcards ('*' or '?').
     *
     * @param text the query text to tokenize
     * @param analyzer the analyzer used to tokenize the text
     * @return the tokens, with wildcard-split parts merged back together, or
     *         <code>null</code> if tokenization fails
     */
static List<String> tokenize(String text, Analyzer analyzer) {
    List<String> tokens = new ArrayList<String>();
    TokenStream stream = null;
    try {
        stream = analyzer.tokenStream(FieldNames.FULLTEXT, new StringReader(text));
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
        // TypeAttribute type = stream.addAttribute(TypeAttribute.class);
        stream.reset();
        int poz = 0;
        boolean hasFulltextToken = false;
        StringBuilder token = new StringBuilder();
        while (stream.incrementToken()) {
            String term = termAtt.toString();
            int start = offsetAtt.startOffset();
            int end = offsetAtt.endOffset();
            if (start > poz) {
                for (int i = poz; i < start; i++) {
                    for (char c : fulltextTokens) {
                        if (c == text.charAt(i)) {
                            token.append(c);
                            hasFulltextToken = true;
                        }
                    }
                }
            }
            poz = end;
            if (hasFulltextToken) {
                token.append(term);
                hasFulltextToken = false;
            } else {
                if (token.length() > 0) {
                    tokens.add(token.toString());
                }
                token = new StringBuilder();
                token.append(term);
            }
        }
        // consume to the end of the string
        if (poz < text.length()) {
            for (int i = poz; i < text.length(); i++) {
                for (char c : fulltextTokens) {
                    if (c == text.charAt(i)) {
                        token.append(c);
                    }
                }
            }
        }
        if (token.length() > 0) {
            tokens.add(token.toString());
        }
        stream.end();
    } catch (IOException e) {
        LOG.error("Building fulltext query failed", e.getMessage());
        return null;
    } finally {
        try {
            if (stream != null) {
                stream.close();
            }
        } catch (IOException e) {
        // ignore
        }
    }
    return tokens;
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) ArrayList(java.util.ArrayList) StringReader(java.io.StringReader) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) IOException(java.io.IOException)
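
As a concrete illustration: an analyzer built on StandardTokenizer discards '*' and '?' and splits the surrounding text, and tokenize() stitches the pieces back together from the offset gaps. A hypothetical run; the analyzer choice and Lucene version are assumptions, since Oak configures its analyzer elsewhere:

// "wor?d" is analyzed to "wor" (offsets 0-3) and "d" (4-5); the gap at
// index 3 contains '?', so the two tokens are merged back into "wor?d".
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);
List<String> a = LuceneIndex.tokenize("hello*", analyzer);  // ["hello*"]
List<String> b = LuceneIndex.tokenize("wor?d", analyzer);   // ["wor?d"]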

Example 8 with OffsetAttribute

Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project lucene-solr by apache.

The class ShingleFilter, method getNextToken.

/**
   * <p>Get the next token from the input stream.
   * <p>If the next token has <code>positionIncrement &gt; 1</code>,
   * <code>positionIncrement - 1</code> {@link #fillerToken}s are
   * inserted first.
   * @param target Where to put the new token; if null, a new instance is created.
   * @return On success, the populated token; null otherwise
   * @throws IOException if the input stream has a problem
   */
private InputWindowToken getNextToken(InputWindowToken target) throws IOException {
    InputWindowToken newTarget = target;
    if (numFillerTokensToInsert > 0) {
        if (null == target) {
            newTarget = new InputWindowToken(nextInputStreamToken.cloneAttributes());
        } else {
            nextInputStreamToken.copyTo(target.attSource);
        }
        // A filler token occupies no space
        newTarget.offsetAtt.setOffset(newTarget.offsetAtt.startOffset(), newTarget.offsetAtt.startOffset());
        newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.length);
        newTarget.isFiller = true;
        --numFillerTokensToInsert;
    } else if (isNextInputStreamToken) {
        if (null == target) {
            newTarget = new InputWindowToken(nextInputStreamToken.cloneAttributes());
        } else {
            nextInputStreamToken.copyTo(target.attSource);
        }
        isNextInputStreamToken = false;
        newTarget.isFiller = false;
    } else if (!exhausted) {
        if (input.incrementToken()) {
            if (null == target) {
                newTarget = new InputWindowToken(cloneAttributes());
            } else {
                this.copyTo(target.attSource);
            }
            if (posIncrAtt.getPositionIncrement() > 1) {
                // Each output shingle must contain at least one input token, 
                // so no more than (maxShingleSize - 1) filler tokens will be inserted.
                numFillerTokensToInsert = Math.min(posIncrAtt.getPositionIncrement() - 1, maxShingleSize - 1);
                // Save the current token as the next input stream token
                if (null == nextInputStreamToken) {
                    nextInputStreamToken = cloneAttributes();
                } else {
                    this.copyTo(nextInputStreamToken);
                }
                isNextInputStreamToken = true;
                // A filler token occupies no space
                newTarget.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
                newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.length);
                newTarget.isFiller = true;
                --numFillerTokensToInsert;
            } else {
                newTarget.isFiller = false;
            }
        } else {
            exhausted = true;
            input.end();
            endState = captureState();
            numFillerTokensToInsert = Math.min(posIncrAtt.getPositionIncrement(), maxShingleSize - 1);
            if (numFillerTokensToInsert > 0) {
                nextInputStreamToken = new AttributeSource(getAttributeFactory());
                nextInputStreamToken.addAttribute(CharTermAttribute.class);
                OffsetAttribute newOffsetAtt = nextInputStreamToken.addAttribute(OffsetAttribute.class);
                newOffsetAtt.setOffset(offsetAtt.endOffset(), offsetAtt.endOffset());
                // Recurse/loop just once:
                return getNextToken(target);
            } else {
                newTarget = null;
            }
        }
    } else {
        newTarget = null;
    }
    return newTarget;
}
Also used : AttributeSource(org.apache.lucene.util.AttributeSource) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute)
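
Filler tokens are what make this offset handling observable from the outside: a stop filter leaves position gaps, and ShingleFilter plugs each gap with the filler token ("_" by default) at a zero-width offset. A minimal sketch; the stop set and exact Lucene version are assumptions:

Tokenizer source = new WhitespaceTokenizer();
source.setReader(new StringReader("please divide this sentence"));
// removing "divide" leaves a position increment of 2 before "this"
TokenStream stopped = new StopFilter(source, StopFilter.makeStopSet("divide"));
TokenStream shingled = new ShingleFilter(stopped, 2, 2);
CharTermAttribute term = shingled.addAttribute(CharTermAttribute.class);
shingled.reset();
while (shingled.incrementToken()) {
    // unigrams plus bigrams with fillers: "please _", "_ this", "this sentence"
    System.out.println(term);
}
shingled.end();
shingled.close();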

Example 9 with OffsetAttribute

Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project lucene-solr by apache.

The class EdgeNGramTokenFilterTest, method testSupplementaryCharacters.

public void testSupplementaryCharacters() throws IOException {
    final String s = TestUtil.randomUnicodeString(random(), 10);
    final int codePointCount = s.codePointCount(0, s.length());
    final int minGram = TestUtil.nextInt(random(), 1, 3);
    final int maxGram = TestUtil.nextInt(random(), minGram, 10);
    TokenStream tk = new KeywordTokenizer();
    ((Tokenizer) tk).setReader(new StringReader(s));
    tk = new EdgeNGramTokenFilter(tk, minGram, maxGram);
    final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
    final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
    tk.reset();
    for (int i = minGram; i <= Math.min(codePointCount, maxGram); ++i) {
        assertTrue(tk.incrementToken());
        assertEquals(0, offsetAtt.startOffset());
        assertEquals(s.length(), offsetAtt.endOffset());
        final int end = Character.offsetByCodePoints(s, 0, i);
        assertEquals(s.substring(0, end), termAtt.toString());
    }
    assertFalse(tk.incrementToken());
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) StringReader(java.io.StringReader) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer) WhitespaceTokenizer(org.apache.lucene.analysis.core.WhitespaceTokenizer) Tokenizer(org.apache.lucene.analysis.Tokenizer) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) LetterTokenizer(org.apache.lucene.analysis.core.LetterTokenizer)
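
The test iterates by code points rather than chars because a supplementary character occupies two Java chars: endOffset() must equal s.length() (a char index) while the gram sizes are counted in code points. A small standalone illustration; the emoji input is made up:

String s = "a\uD83D\uDE00b";  // "a😀b": U+1F600 is a surrogate pair
System.out.println(s.length());                             // 4 chars
System.out.println(s.codePointCount(0, s.length()));        // 3 code points
// char index just past the first two code points ("a" plus the emoji):
System.out.println(Character.offsetByCodePoints(s, 0, 2));  // 3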

Example 10 with OffsetAttribute

Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project lucene-solr by apache.

The class NGramTokenFilterTest, method testSupplementaryCharacters.

public void testSupplementaryCharacters() throws IOException {
    final String s = TestUtil.randomUnicodeString(random(), 10);
    final int codePointCount = s.codePointCount(0, s.length());
    final int minGram = TestUtil.nextInt(random(), 1, 3);
    final int maxGram = TestUtil.nextInt(random(), minGram, 10);
    TokenStream tk = new KeywordTokenizer();
    ((Tokenizer) tk).setReader(new StringReader(s));
    tk = new NGramTokenFilter(tk, minGram, maxGram);
    final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
    final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
    tk.reset();
    for (int start = 0; start < codePointCount; ++start) {
        for (int end = start + minGram; end <= Math.min(codePointCount, start + maxGram); ++end) {
            assertTrue(tk.incrementToken());
            assertEquals(0, offsetAtt.startOffset());
            assertEquals(s.length(), offsetAtt.endOffset());
            final int startIndex = Character.offsetByCodePoints(s, 0, start);
            final int endIndex = Character.offsetByCodePoints(s, 0, end);
            assertEquals(s.substring(startIndex, endIndex), termAtt.toString());
        }
    }
    assertFalse(tk.incrementToken());
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) StringReader(java.io.StringReader) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer) WhitespaceTokenizer(org.apache.lucene.analysis.core.WhitespaceTokenizer) Tokenizer(org.apache.lucene.analysis.Tokenizer) MockTokenizer(org.apache.lucene.analysis.MockTokenizer)
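
The nested loops mirror NGramTokenFilter's emission order in this Lucene version: grams are grouped by start position, then by increasing length. A minimal concrete run; the input string is made up, and the constructor matches the one used in the test above:

TokenStream tk = new KeywordTokenizer();
((Tokenizer) tk).setReader(new StringReader("abc"));
tk = new NGramTokenFilter(tk, 1, 2);
CharTermAttribute term = tk.addAttribute(CharTermAttribute.class);
tk.reset();
while (tk.incrementToken()) {
    System.out.print(term + " ");  // prints: a ab b bc c
}
tk.end();
tk.close();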

Aggregations

OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute): 53
TokenStream (org.apache.lucene.analysis.TokenStream): 35
CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 33
PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute): 25
StringReader (java.io.StringReader): 20
IOException (java.io.IOException): 15
ArrayList (java.util.ArrayList): 14
BytesRef (org.apache.lucene.util.BytesRef): 14
PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute): 12
TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute): 10
Tokenizer (org.apache.lucene.analysis.Tokenizer): 9
Token (org.apache.lucene.analysis.Token): 7
FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute): 7
TermToBytesRefAttribute (org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute): 6
List (java.util.List): 5
Analyzer (org.apache.lucene.analysis.Analyzer): 5
PositionLengthAttribute (org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute): 5
IndexReader (org.apache.lucene.index.IndexReader): 5
CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream): 4
Document (org.apache.lucene.document.Document): 4