Search in sources :

Example 6 with JFlexTokenizer

use of org.opengrok.indexer.analysis.JFlexTokenizer in project OpenGrok by OpenGrok.

the class TroffAnalyzer method analyze.

/**
 * Adds the {@code QueryBuilder.FULL} field to {@code doc} and, when an xref
 * writer is supplied, writes the xref and records the line count and LOC
 * reported by the xrefer.
 *
 * @param doc the document being populated
 * @param src the source of the content streams
 * @param xrefOut destination for the xref, or {@code null} to skip xref output
 * @throws IOException if reading the source or writing the xref fails
 */
@Override
public void analyze(Document doc, StreamSource src, Writer xrefOut) throws IOException {
    // Feed the full-text field from the symbol tokenizer explicitly; this is
    // the workaround for #1376 (symbols search works like full text search).
    JFlexTokenizer tokenizer = symbolTokenizerFactory.get();
    tokenizer.setReader(getReader(src.getStream()));
    doc.add(new OGKTextField(QueryBuilder.FULL, tokenizer));

    // Without an xref destination there is nothing further to produce.
    if (xrefOut == null) {
        return;
    }

    // A second, independent reader over the source is used for the xref pass.
    try (Reader xrefIn = getReader(src.getStream())) {
        WriteXrefArgs xrefArgs = new WriteXrefArgs(xrefIn, xrefOut);
        xrefArgs.setProject(project);
        Xrefer xrefer = writeXref(xrefArgs);
        NumLinesLOC counts = new NumLinesLOC(doc.get(QueryBuilder.PATH), xrefer.getLineNumber(), xrefer.getLOC());
        addNumLinesLOC(doc, counts);
    }
}
Also used : JFlexTokenizer(org.opengrok.indexer.analysis.JFlexTokenizer) OGKTextField(org.opengrok.indexer.analysis.OGKTextField) NumLinesLOC(org.opengrok.indexer.analysis.NumLinesLOC) Xrefer(org.opengrok.indexer.analysis.Xrefer) Reader(java.io.Reader) WriteXrefArgs(org.opengrok.indexer.analysis.WriteXrefArgs)

Example 7 with JFlexTokenizer

use of org.opengrok.indexer.analysis.JFlexTokenizer in project OpenGrok by OpenGrok.

the class CustomAssertions method assertSymbolStream.

/**
 * Verifies that a symbol matcher of the given class, wrapped in a
 * {@link JFlexTokenizer}, emits exactly the expected sequence of symbols for
 * the supplied input.
 * <p>
 * Each emitted term is also compared against the substring of the input
 * delimited by the term's reported start and end offsets.
 * @param klass the matcher class; it is instantiated through its
 * {@code (Reader)} constructor
 * @param iss the input stream, decoded as UTF-8
 * @param expectedTokens the expected, ordered token list
 * @throws java.lang.Exception if an error occurs constructing a
 * {@code klass} instance or testing the stream
 */
public static void assertSymbolStream(Class<? extends JFlexSymbolMatcher> klass, InputStream iss, List<String> expectedTokens) throws Exception {
    // Buffer the input so it can serve both as tokenizer input and as the
    // reference text for offset checks.
    byte[] rawInput = copyStream(iss);
    String text = new String(rawInput, StandardCharsets.UTF_8);
    Reader matcherInput = new InputStreamReader(new ByteArrayInputStream(rawInput), StandardCharsets.UTF_8);
    JFlexTokenizer tokenizer = new JFlexTokenizer(klass.getConstructor(Reader.class).newInstance(matcherInput));
    CharTermAttribute termAttr = tokenizer.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAttr = tokenizer.addAttribute(OffsetAttribute.class);

    // Pass 1: drain the tokenizer, checking each term against its offsets.
    List<String> tokens = new ArrayList<>();
    while (tokenizer.incrementToken()) {
        String termValue = termAttr.toString();
        tokens.add(termValue);
        String cutValue = text.substring(offsetAttr.startOffset(), offsetAttr.endOffset());
        // Messages use a 1-based term number.
        assertEquals(cutValue, termValue, "cut term" + tokens.size());
    }

    // Pass 2: compare the collected terms with the expectation, dumping the
    // full token list before any failing assertion is raised.
    for (int i = 0; i < tokens.size(); i++) {
        String token = tokens.get(i);
        if (i >= expectedTokens.size()) {
            printTokens(tokens);
            assertTrue(i < expectedTokens.size(), "too many tokens at term" + (1 + i) + ": " + token);
        }
        String expected = expectedTokens.get(i);
        if (!token.equals(expected)) {
            printTokens(tokens);
            assertEquals(expected, token, "term" + (1 + i));
        }
    }

    // Fewer tokens than expected is only detectable after the loop.
    int produced = tokens.size();
    if (expectedTokens.size() != produced) {
        printTokens(tokens);
        assertEquals(expectedTokens.size(), produced, "wrong number of tokens");
    }
}
Also used : JFlexTokenizer(org.opengrok.indexer.analysis.JFlexTokenizer) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) InputStreamReader(java.io.InputStreamReader) ByteArrayInputStream(java.io.ByteArrayInputStream) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) ArrayList(java.util.ArrayList)

Example 8 with JFlexTokenizer

use of org.opengrok.indexer.analysis.JFlexTokenizer in project OpenGrok by OpenGrok.

the class HaskellSymbolTokenizerTest method getTermsFor.

/**
 * Runs the analyzer's {@code "refs"} token stream over the given reader and
 * returns the emitted terms in order.
 * <p>
 * The stream is consumed following the Lucene {@code TokenStream} workflow
 * ({@code reset()}, {@code incrementToken()} until exhausted, {@code end()},
 * {@code close()}) so that the tokenizer is released even when reading fails.
 *
 * @param r the reader supplying the text to tokenize
 * @return the terms produced by the token stream, in emission order
 * @throws RuntimeException wrapping any {@link IOException} raised while
 * consuming or closing the stream
 */
private String[] getTermsFor(Reader r) {
    List<String> l = new LinkedList<>();
    // try-with-resources guarantees close() on every path; previously the
    // stream was neither ended nor closed.
    try (JFlexTokenizer ts = (JFlexTokenizer) this.analyzer.tokenStream("refs", r)) {
        // NOTE(review): tokenStream(...) was already handed the reader; this
        // explicit setReader is kept from the original -- confirm it is needed.
        ts.setReader(r);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            l.add(term.toString());
        }
        ts.end();
    } catch (IOException ex) {
        throw new RuntimeException(ex);
    }
    return l.toArray(new String[0]);
}
Also used : JFlexTokenizer(org.opengrok.indexer.analysis.JFlexTokenizer) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) IOException(java.io.IOException) LinkedList(java.util.LinkedList)

Example 9 with JFlexTokenizer

use of org.opengrok.indexer.analysis.JFlexTokenizer in project OpenGrok by OpenGrok.

the class JavaSymbolTokenizerTest method getTermsFor.

/**
 * Runs the analyzer's {@code "refs"} token stream over the given reader and
 * returns the emitted terms in order.
 * <p>
 * The stream is consumed following the Lucene {@code TokenStream} workflow
 * ({@code reset()}, {@code incrementToken()} until exhausted, {@code end()},
 * {@code close()}) so that the tokenizer is released even when reading fails.
 *
 * @param r the reader supplying the text to tokenize
 * @return the terms produced by the token stream, in emission order
 * @throws RuntimeException wrapping any {@link IOException} raised while
 * consuming or closing the stream
 */
private String[] getTermsFor(Reader r) {
    List<String> l = new LinkedList<>();
    // try-with-resources guarantees close() on every path; previously the
    // stream was neither ended nor closed.
    try (JFlexTokenizer ts = (JFlexTokenizer) this.analyzer.tokenStream("refs", r)) {
        // NOTE(review): tokenStream(...) was already handed the reader; this
        // explicit setReader is kept from the original -- confirm it is needed.
        ts.setReader(r);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            l.add(term.toString());
        }
        ts.end();
    } catch (IOException ex) {
        throw new RuntimeException(ex);
    }
    return l.toArray(new String[0]);
}
Also used : JFlexTokenizer(org.opengrok.indexer.analysis.JFlexTokenizer) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) IOException(java.io.IOException) LinkedList(java.util.LinkedList)

Aggregations

JFlexTokenizer (org.opengrok.indexer.analysis.JFlexTokenizer)9 Reader (java.io.Reader)5 CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute)5 IOException (java.io.IOException)4 OGKTextField (org.opengrok.indexer.analysis.OGKTextField)4 WriteXrefArgs (org.opengrok.indexer.analysis.WriteXrefArgs)4 LinkedList (java.util.LinkedList)3 NumLinesLOC (org.opengrok.indexer.analysis.NumLinesLOC)3 Xrefer (org.opengrok.indexer.analysis.Xrefer)3 OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute)2 ByteArrayInputStream (java.io.ByteArrayInputStream)1 InputStreamReader (java.io.InputStreamReader)1 StringReader (java.io.StringReader)1 ArrayList (java.util.ArrayList)1 ExecutionException (java.util.concurrent.ExecutionException)1 StoredField (org.apache.lucene.document.StoredField)1 Definitions (org.opengrok.indexer.analysis.Definitions)1 ExpandTabsReader (org.opengrok.indexer.analysis.ExpandTabsReader)1 JFlexSymbolMatcher (org.opengrok.indexer.analysis.JFlexSymbolMatcher)1 Scopes (org.opengrok.indexer.analysis.Scopes)1