Example 61 with OffsetAttribute

Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project lucene-solr by apache.

From the class TestCharTokenizers, method testCrossPlaneNormalization.

// LUCENE-3642: normalize SMP->BMP and check that offsets are correct
public void testCrossPlaneNormalization() throws IOException {
    Analyzer analyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new LetterTokenizer(newAttributeFactory()) {

                @Override
                protected int normalize(int c) {
                    // Fold every supplementary (non-BMP) code point to a
                    // single BMP letter; offsets must still match the input.
                    if (c > 0xffff) {
                        return 'δ';
                    } else {
                        return c;
                    }
                }
            };
            return new TokenStreamComponents(tokenizer, tokenizer);
        }
    };
    int num = 1000 * RANDOM_MULTIPLIER;
    for (int i = 0; i < num; i++) {
        String s = TestUtil.randomUnicodeString(random());
        try (TokenStream ts = analyzer.tokenStream("foo", s)) {
            ts.reset();
            OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
            while (ts.incrementToken()) {
                String highlightedText = s.substring(offsetAtt.startOffset(), offsetAtt.endOffset());
                // Walk the highlighted slice by code point: every one must be
                // a letter, proving the offsets index the original SMP input.
                for (int j = 0, cp = 0; j < highlightedText.length(); j += Character.charCount(cp)) {
                    cp = highlightedText.codePointAt(j);
                    assertTrue("non-letter:" + Integer.toHexString(cp), Character.isLetter(cp));
                }
            }
            ts.end();
        }
    }
    // just for fun
    checkRandomData(random(), analyzer, num);
    analyzer.close();
}
Also used: TokenStream (org.apache.lucene.analysis.TokenStream), OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute), LetterTokenizer (org.apache.lucene.analysis.core.LetterTokenizer), Analyzer (org.apache.lucene.analysis.Analyzer), WhitespaceTokenizer (org.apache.lucene.analysis.core.WhitespaceTokenizer), Tokenizer (org.apache.lucene.analysis.Tokenizer), KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer), LowerCaseTokenizer (org.apache.lucene.analysis.core.LowerCaseTokenizer)
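
OffsetAttribute reports offsets in Java char units (UTF-16 code units), which is why the test iterates the highlighted slice by code point rather than by char. A minimal illustration of the distinction (the code point chosen is an arbitrary example):

// A supplementary code point (above U+FFFF) occupies two Java chars,
// so offset arithmetic must advance by Character.charCount(cp).
String s = new String(Character.toChars(0x1F600)); // one code point, U+1F600
System.out.println(s.length());                      // prints 2 (UTF-16 code units)
System.out.println(s.codePointCount(0, s.length())); // prints 1 (code point)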

Example 62 with OffsetAttribute

Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project textdb by TextDB.

From the class DataflowUtils, method generatePayload.

public static List<Span> generatePayload(String attributeName, String fieldValue, Analyzer luceneAnalyzer) {
    List<Span> payload = new ArrayList<>();
    // try-with-resources ensures the stream is closed even if tokenization throws.
    try (TokenStream tokenStream = luceneAnalyzer.tokenStream(null, new StringReader(fieldValue))) {
        OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute positionIncrementAttribute = tokenStream.addAttribute(PositionIncrementAttribute.class);
        int tokenPositionCounter = -1;
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            // Position increments accumulate into an absolute token position.
            tokenPositionCounter += positionIncrementAttribute.getPositionIncrement();
            int tokenPosition = tokenPositionCounter;
            int charStart = offsetAttribute.startOffset();
            int charEnd = offsetAttribute.endOffset();
            String analyzedTermStr = charTermAttribute.toString();
            // The offsets index into the original field value, so the raw
            // (pre-analysis) form of the token can be cut back out of it.
            String originalTermStr = fieldValue.substring(charStart, charEnd);
            payload.add(new Span(attributeName, charStart, charEnd, analyzedTermStr, originalTermStr, tokenPosition));
        }
        // end() finalizes offset state; close() is handled by try-with-resources.
        tokenStream.end();
    } catch (IOException e) {
        // Tokenization failed: return an empty payload.
        payload.clear();
    }
    return payload;
}
Also used: TokenStream (org.apache.lucene.analysis.TokenStream), CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), ArrayList (java.util.ArrayList), StringReader (java.io.StringReader), OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute), IOException (java.io.IOException), Span (edu.uci.ics.textdb.api.span.Span), PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)
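
A hypothetical call site for the method above, assuming Lucene's StandardAnalyzer (the attribute name and field value here are made up for illustration):

// Tokenize one field value and print the resulting spans; each Span carries
// the char offsets, the analyzed term, the original slice, and the position.
try (Analyzer analyzer = new StandardAnalyzer()) {
    List<Span> payload = DataflowUtils.generatePayload("content", "a penny saved is a penny earned", analyzer);
    for (Span span : payload) {
        System.out.println(span);
    }
}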

Example 63 with OffsetAttribute

Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project lucene-solr by apache.

From the class AnalyzingInfixSuggesterTest, method testHighlightAsObject.

@SuppressWarnings("unchecked")
public void testHighlightAsObject() throws Exception {
    Input[] keys = new Input[] { new Input("a penny saved is a penny earned", 10, new BytesRef("foobaz")) };
    Analyzer a = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
    AnalyzingInfixSuggester suggester = new AnalyzingInfixSuggester(newDirectory(), a, a, 3, false) {

        @Override
        protected Object highlight(String text, Set<String> matchedTokens, String prefixToken) throws IOException {
            try (TokenStream ts = queryAnalyzer.tokenStream("text", new StringReader(text))) {
                CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
                OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
                ts.reset();
                List<LookupHighlightFragment> fragments = new ArrayList<>();
                int upto = 0;
                while (ts.incrementToken()) {
                    String token = termAtt.toString();
                    int startOffset = offsetAtt.startOffset();
                    int endOffset = offsetAtt.endOffset();
                    if (upto < startOffset) {
                        fragments.add(new LookupHighlightFragment(text.substring(upto, startOffset), false));
                        upto = startOffset;
                    } else if (upto > startOffset) {
                        continue;
                    }
                    if (matchedTokens.contains(token)) {
                        // Token matches.
                        fragments.add(new LookupHighlightFragment(text.substring(startOffset, endOffset), true));
                        upto = endOffset;
                    } else if (prefixToken != null && token.startsWith(prefixToken)) {
                        fragments.add(new LookupHighlightFragment(text.substring(startOffset, startOffset + prefixToken.length()), true));
                        if (prefixToken.length() < token.length()) {
                            fragments.add(new LookupHighlightFragment(text.substring(startOffset + prefixToken.length(), startOffset + token.length()), false));
                        }
                        upto = endOffset;
                    }
                }
                ts.end();
                // After end(), the offset attribute holds the final offset,
                // i.e. the end of the input text.
                int endOffset = offsetAtt.endOffset();
                if (upto < endOffset) {
                    fragments.add(new LookupHighlightFragment(text.substring(upto), false));
                }
                return fragments;
            }
        }
    };
    suggester.build(new InputArrayIterator(keys));
    List<LookupResult> results = suggester.lookup(TestUtil.stringToCharSequence("ear", random()), 10, true, true);
    assertEquals(1, results.size());
    assertEquals("a penny saved is a penny <b>ear</b>ned", toString((List<LookupHighlightFragment>) results.get(0).highlightKey));
    assertEquals(10, results.get(0).value);
    assertEquals(new BytesRef("foobaz"), results.get(0).payload);
    suggester.close();
    a.close();
}
Also used: TokenStream (org.apache.lucene.analysis.TokenStream), CharArraySet (org.apache.lucene.analysis.CharArraySet), HashSet (java.util.HashSet), Set (java.util.Set), ArrayList (java.util.ArrayList), Analyzer (org.apache.lucene.analysis.Analyzer), MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer), Input (org.apache.lucene.search.suggest.Input), CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), InputArrayIterator (org.apache.lucene.search.suggest.InputArrayIterator), StringReader (java.io.StringReader), OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute), LookupResult (org.apache.lucene.search.suggest.Lookup.LookupResult), List (java.util.List), BytesRef (org.apache.lucene.util.BytesRef)

Example 64 with OffsetAttribute

Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project OpenGrok by OpenGrok.

From the class DefinitionsTokenStreamTest, method testDefinitionsVsContent.

private void testDefinitionsVsContent(boolean expandTabs, String sourceResource, String tagsResource, int expectedCount, boolean doSupplement, Map<Integer, SimpleEntry<String, String>> overrides) throws IOException {
    StreamSource src = getSourceFromResource(sourceResource);
    // Deserialize the ctags.
    int tabSize = expandTabs ? 8 : 0;
    String suppResource = doSupplement ? sourceResource : null;
    Definitions defs = StreamUtils.readTagsFromResource(tagsResource, suppResource, tabSize);
    // Read the whole input.
    StringBuilder bld = new StringBuilder();
    String source;
    try (Reader rdr = ExpandTabsReader.wrap(IOUtils.createBOMStrippedReader(src.getStream(), StandardCharsets.UTF_8.name()), tabSize)) {
        int c;
        while ((c = rdr.read()) != -1) {
            bld.append((char) c);
        }
        source = bld.toString();
    }
    // Deserialize the token stream.
    DefinitionsTokenStream tokstream = new DefinitionsTokenStream();
    tokstream.initialize(defs, src, in -> ExpandTabsReader.wrap(in, tabSize));
    // Iterate through stream.
    CharTermAttribute term = tokstream.getAttribute(CharTermAttribute.class);
    assertNotNull("CharTermAttribute", term);
    OffsetAttribute offs = tokstream.getAttribute(OffsetAttribute.class);
    assertNotNull("OffsetAttribute", offs);
    int count = 0;
    while (tokstream.incrementToken()) {
        ++count;
        String termValue = term.toString();
        String cutValue = source.substring(offs.startOffset(), offs.endOffset());
        // If an override exists, test it specially.
        if (overrides != null && overrides.containsKey(count)) {
            SimpleEntry<String, String> overkv = overrides.get(count);
            assertEquals("cut term override" + count, overkv.getKey(), cutValue);
            assertEquals("cut term w.r.t. term override" + count, overkv.getValue(), termValue);
            continue;
        }
        // The text cut by the offsets should end with the reported term.
        boolean cutEndsWithTerm = cutValue.endsWith(termValue);
        assertTrue("cut term" + count + " at " + offs.startOffset() + "-" + offs.endOffset() + "[" + cutValue + "] vs [" + termValue + "]", cutEndsWithTerm);
    }
    assertEquals("token count", expectedCount, count);
}
Also used: CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), StreamSource (org.opensolaris.opengrok.analysis.StreamSource), Definitions (org.opensolaris.opengrok.analysis.Definitions), OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute), ExpandTabsReader (org.opensolaris.opengrok.analysis.ExpandTabsReader), Reader (java.io.Reader)

Example 65 with OffsetAttribute

Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project OpenGrok by OpenGrok.

From the class PathTokenizerTest, method testIncrementToken.

/**
 * Test of incrementToken method, of class PathTokenizer.
 */
@Test
public void testIncrementToken() throws Exception {
    String inputText = "alpha/beta/gamma/delta.ext";
    String[] expectedTokens = inputText.split("[/.]");
    PathTokenizer tokenizer = new PathTokenizer();
    tokenizer.setReader(new StringReader(inputText));
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    OffsetAttribute offset = tokenizer.addAttribute(OffsetAttribute.class);
    int count = 0;
    int dots = 0;
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
        if (term.toString().equals(".")) {
            dots++;
            break;
        }
        assertTrue("too many tokens", count < expectedTokens.length);
        String expected = expectedTokens[count];
        assertEquals("term", expected, term.toString());
        assertEquals("start", inputText.indexOf(expected), offset.startOffset());
        assertEquals("end", inputText.indexOf(expected) + expected.length(), offset.endOffset());
        count++;
    }
    tokenizer.end();
    tokenizer.close();
    assertEquals("wrong number of tokens", expectedTokens.length, count + dots);
}
Also used: CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), StringReader (java.io.StringReader), OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute), Test (org.junit.Test)

Aggregations

OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute): 82 uses
CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 59
TokenStream (org.apache.lucene.analysis.TokenStream): 47
StringReader (java.io.StringReader): 36
PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute): 33
IOException (java.io.IOException): 25
ArrayList (java.util.ArrayList): 23
TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute): 17
BytesRef (org.apache.lucene.util.BytesRef): 14
PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute): 12
Tokenizer (org.apache.lucene.analysis.Tokenizer): 10
Reader (java.io.Reader): 9
FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute): 8
Analyzer (org.apache.lucene.analysis.Analyzer): 7
Token (org.apache.lucene.analysis.Token): 7
TermToBytesRefAttribute (org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute): 7
List (java.util.List): 6
PackedTokenAttributeImpl (org.apache.lucene.analysis.tokenattributes.PackedTokenAttributeImpl): 5
PositionLengthAttribute (org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute): 5
IndexReader (org.apache.lucene.index.IndexReader): 5
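
All of the examples above follow the same TokenStream consumption contract: reset(), a loop over incrementToken(), end(), then close(). A minimal self-contained sketch of that lifecycle, assuming Lucene's StandardAnalyzer (the field name and input text are illustrative):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class OffsetDemo {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        try (TokenStream ts = analyzer.tokenStream("body", "a penny saved is a penny earned")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            OffsetAttribute offsets = ts.addAttribute(OffsetAttribute.class);
            ts.reset();                        // 1. mandatory before consuming
            while (ts.incrementToken()) {      // 2. advance token by token
                System.out.printf("%s [%d,%d)%n",
                        term, offsets.startOffset(), offsets.endOffset());
            }
            ts.end();                          // 3. record the final offset state
        }                                      // 4. close() via try-with-resources
        analyzer.close();
    }
}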