Search in sources :

Example 1 with JFlexTokenizer

use of org.opensolaris.opengrok.analysis.JFlexTokenizer in project OpenGrok by OpenGrok.

The class JavaSymbolTokenizerTest, method getTermsFor.

/**
 * Collects all terms produced by this test's analyzer for the "refs"
 * field over the specified input.
 * <p>
 * Fix: the original never called {@code end()} or closed the token
 * stream, violating the Lucene TokenStream workflow (reset &rarr;
 * incrementToken &rarr; end &rarr; close) and leaking the analyzer's
 * reusable stream. try-with-resources now guarantees release.
 * @param r the source to tokenize
 * @return the terms, in stream order
 * @throws RuntimeException wrapping any {@link IOException} from tokenizing
 */
private String[] getTermsFor(Reader r) {
    List<String> l = new LinkedList<>();
    try (JFlexTokenizer ts = (JFlexTokenizer) this.analyzer.tokenStream("refs", r)) {
        ts.setReader(r);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            l.add(term.toString());
        }
        // Complete the stream contract so end-of-stream attributes are set.
        ts.end();
    } catch (IOException ex) {
        throw new RuntimeException(ex);
    }
    // Zero-length array is the preferred toArray idiom (JDK optimizes it).
    return l.toArray(new String[0]);
}
Also used : JFlexTokenizer(org.opensolaris.opengrok.analysis.JFlexTokenizer) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) IOException(java.io.IOException) LinkedList(java.util.LinkedList)

Example 2 with JFlexTokenizer

use of org.opensolaris.opengrok.analysis.JFlexTokenizer in project OpenGrok by OpenGrok.

The class PerlSymbolTokenizerTest, method testOffsetAttribute.

/**
 * Helper method for {@link #testOffsetAttribute()} that runs the test on
 * one single implementation class with the specified input text and
 * expected tokens.
 */
/**
 * Helper method for {@link #testOffsetAttribute()} that runs the test on
 * one single implementation class with the specified input text and
 * expected tokens.
 * @param klass the matcher implementation under test
 * @param inputText the text to tokenize
 * @param expectedTokens every token the stream must yield, in order
 * @throws Exception if the matcher cannot be constructed or the stream fails
 */
private void testOffsetAttribute(Class<? extends JFlexSymbolMatcher> klass, String inputText, String[] expectedTokens) throws Exception {
    JFlexSymbolMatcher matcher = klass.getConstructor(Reader.class).newInstance(new StringReader(inputText));
    JFlexTokenizer tokenizer = new JFlexTokenizer(matcher);
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    OffsetAttribute offset = tokenizer.addAttribute(OffsetAttribute.class);
    int i = 0;
    for (; tokenizer.incrementToken(); i++) {
        assertTrue("too many tokens", i < expectedTokens.length);
        String want = expectedTokens[i];
        // 0-based offset to accord with String[]
        assertEquals("term" + i, want, term.toString());
        // Offsets must bracket the first occurrence of the token in the input.
        int at = inputText.indexOf(want);
        assertEquals("start" + i, at, offset.startOffset());
        assertEquals("end" + i, at + want.length(), offset.endOffset());
    }
    assertEquals("wrong number of tokens", expectedTokens.length, i);
}
Also used : JFlexTokenizer(org.opensolaris.opengrok.analysis.JFlexTokenizer) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) JFlexSymbolMatcher(org.opensolaris.opengrok.analysis.JFlexSymbolMatcher) StringReader(java.io.StringReader) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) Reader(java.io.Reader) InputStreamReader(java.io.InputStreamReader) StringReader(java.io.StringReader) BufferedReader(java.io.BufferedReader)

Example 3 with JFlexTokenizer

use of org.opensolaris.opengrok.analysis.JFlexTokenizer in project OpenGrok by OpenGrok.

The class PhpSymbolTokenizerTest, method getTermsFor.

/**
 * Collects all terms produced by this test's analyzer for the "refs"
 * field over the specified input.
 * <p>
 * Fix: the original never called {@code end()} or closed the token
 * stream, violating the Lucene TokenStream workflow (reset &rarr;
 * incrementToken &rarr; end &rarr; close) and leaking the analyzer's
 * reusable stream. try-with-resources now guarantees release.
 * @param r the source to tokenize
 * @return the terms, in stream order
 * @throws RuntimeException wrapping any {@link IOException} from tokenizing
 */
private String[] getTermsFor(Reader r) {
    List<String> l = new LinkedList<>();
    try (JFlexTokenizer ts = (JFlexTokenizer) this.analyzer.tokenStream("refs", r)) {
        ts.setReader(r);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            l.add(term.toString());
        }
        // Complete the stream contract so end-of-stream attributes are set.
        ts.end();
    } catch (IOException ex) {
        throw new RuntimeException(ex);
    }
    // Zero-length array is the preferred toArray idiom (JDK optimizes it).
    return l.toArray(new String[0]);
}
Also used : JFlexTokenizer(org.opensolaris.opengrok.analysis.JFlexTokenizer) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) IOException(java.io.IOException) LinkedList(java.util.LinkedList)

Example 4 with JFlexTokenizer

use of org.opensolaris.opengrok.analysis.JFlexTokenizer in project OpenGrok by OpenGrok.

The class HaskellSymbolTokenizerTest, method getTermsFor.

/**
 * Collects all terms produced by this test's analyzer for the "refs"
 * field over the specified input.
 * <p>
 * Fix: the original never called {@code end()} or closed the token
 * stream, violating the Lucene TokenStream workflow (reset &rarr;
 * incrementToken &rarr; end &rarr; close) and leaking the analyzer's
 * reusable stream. try-with-resources now guarantees release.
 * @param r the source to tokenize
 * @return the terms, in stream order
 * @throws RuntimeException wrapping any {@link IOException} from tokenizing
 */
private String[] getTermsFor(Reader r) {
    List<String> l = new LinkedList<>();
    try (JFlexTokenizer ts = (JFlexTokenizer) this.analyzer.tokenStream("refs", r)) {
        ts.setReader(r);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            l.add(term.toString());
        }
        // Complete the stream contract so end-of-stream attributes are set.
        ts.end();
    } catch (IOException ex) {
        throw new RuntimeException(ex);
    }
    // Zero-length array is the preferred toArray idiom (JDK optimizes it).
    return l.toArray(new String[0]);
}
Also used : JFlexTokenizer(org.opensolaris.opengrok.analysis.JFlexTokenizer) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) IOException(java.io.IOException) LinkedList(java.util.LinkedList)

Example 5 with JFlexTokenizer

use of org.opensolaris.opengrok.analysis.JFlexTokenizer in project OpenGrok by OpenGrok.

The class CustomAssertions, method assertSymbolStream.

/**
 * Asserts the specified tokenizer class produces an expected stream of
 * symbols from the specified input.
 * @param klass the test class
 * @param iss the input stream
 * @param expectedTokens the expected, ordered token list
 * @throws java.lang.Exception if an error occurs constructing a
 * {@code klass} instance or testing the stream
 */
/**
 * Asserts the specified tokenizer class produces an expected stream of
 * symbols from the specified input.
 * @param klass the test class
 * @param iss the input stream
 * @param expectedTokens the expected, ordered token list
 * @throws java.lang.Exception if an error occurs constructing a
 * {@code klass} instance or testing the stream
 */
public static void assertSymbolStream(Class<? extends JFlexSymbolMatcher> klass, InputStream iss, List<String> expectedTokens) throws Exception {
    // Snapshot the stream so the same bytes can be both decoded for
    // offset checks and fed to the matcher under test.
    byte[] bytes = copyStream(iss);
    String input = new String(bytes, StandardCharsets.UTF_8);
    Reader rdr = new InputStreamReader(new ByteArrayInputStream(bytes), StandardCharsets.UTF_8);
    JFlexTokenizer tokenizer = new JFlexTokenizer(klass.getConstructor(Reader.class).newInstance(rdr));
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsets = tokenizer.addAttribute(OffsetAttribute.class);
    List<String> tokens = new ArrayList<>();
    int i = 0;
    while (tokenizer.incrementToken()) {
        String termValue = term.toString();
        tokens.add(termValue);
        // Each term must equal the exact input slice its offsets identify.
        String cutValue = input.substring(offsets.startOffset(), offsets.endOffset());
        assertEquals("cut term" + (1 + i), cutValue, termValue);
        i++;
    }
    // Second pass: compare against expectations, dumping all tokens on
    // any mismatch to ease debugging.
    i = 0;
    for (String token : tokens) {
        // 1-based offset to accord with line #
        if (i >= expectedTokens.size()) {
            printTokens(tokens);
            assertTrue("too many tokens at term" + (1 + i) + ": " + token, i < expectedTokens.size());
        }
        String expected = expectedTokens.get(i);
        if (!token.equals(expected)) {
            printTokens(tokens);
            assertEquals("term" + (1 + i), expected, token);
        }
        i++;
    }
    assertEquals("wrong number of tokens", expectedTokens.size(), i);
}
Also used : JFlexTokenizer(org.opensolaris.opengrok.analysis.JFlexTokenizer) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) InputStreamReader(java.io.InputStreamReader) ByteArrayInputStream(java.io.ByteArrayInputStream) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) ArrayList(java.util.ArrayList)

Aggregations

CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute)5 JFlexTokenizer (org.opensolaris.opengrok.analysis.JFlexTokenizer)5 IOException (java.io.IOException)3 LinkedList (java.util.LinkedList)3 InputStreamReader (java.io.InputStreamReader)2 OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute)2 BufferedReader (java.io.BufferedReader)1 ByteArrayInputStream (java.io.ByteArrayInputStream)1 Reader (java.io.Reader)1 StringReader (java.io.StringReader)1 ArrayList (java.util.ArrayList)1 JFlexSymbolMatcher (org.opensolaris.opengrok.analysis.JFlexSymbolMatcher)1