
Example 76 with OffsetAttribute

Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project textdb by TextDB.

From the class WordCloudOpPartialExec, the method calculateWordCount:

private static List<Tuple> calculateWordCount(List<String> texts, Analyzer luceneAnalyzer) throws Exception {
    HashMap<String, Integer> termFreqMap = new HashMap<>();
    for (String text : texts) {
        TokenStream tokenStream = luceneAnalyzer.tokenStream(null, new StringReader(text));
        OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
        // reset() is required before the first call to incrementToken().
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            // Cut each term out of the original text by its character offsets.
            int charStart = offsetAttribute.startOffset();
            int charEnd = offsetAttribute.endOffset();
            String termStr = text.substring(charStart, charEnd).toLowerCase();
            if (!EnglishAnalyzer.ENGLISH_STOP_WORDS_SET.contains(termStr)) {
                termFreqMap.merge(termStr, 1, Integer::sum);
            }
        }
        // Complete the TokenStream contract: end(), then close().
        tokenStream.end();
        tokenStream.close();
    }
    List<Tuple> termFreqTuples = new ArrayList<>();
    for (Map.Entry<String, Integer> e : termFreqMap.entrySet()) {
        termFreqTuples.add(Tuple.newBuilder(partialAggregateSchema).addSequentially(new Object[] { e.getKey(), e.getValue() }).build());
    }
    return termFreqTuples;
}
Also used: TokenStream (org.apache.lucene.analysis.TokenStream), StringReader (java.io.StringReader), OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute), Tuple (edu.uci.ics.texera.workflow.common.tuple.Tuple)
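For comparison, here is a minimal self-contained sketch of the same counting loop that reads terms from CharTermAttribute instead of cutting substrings by offset. The StandardAnalyzer and the field name "text" are illustrative assumptions, not part of the original operator.

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public final class TermCounter {

    // Counts terms produced by a StandardAnalyzer (which already
    // lower-cases, so no explicit toLowerCase() call is needed).
    public static Map<String, Integer> count(String text) throws IOException {
        Map<String, Integer> freq = new HashMap<>();
        try (StandardAnalyzer analyzer = new StandardAnalyzer();
             TokenStream ts = analyzer.tokenStream("text", text)) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            // reset() is required before the first incrementToken() call.
            ts.reset();
            while (ts.incrementToken()) {
                freq.merge(term.toString(), 1, Integer::sum);
            }
            // end() finishes the stream; close() is handled by try-with-resources.
            ts.end();
        }
        return freq;
    }
}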

Example 77 with OffsetAttribute

Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project HongsCORE by ihongs.

From the class DemoTest, the method main:

public static void main(String[] args) throws IOException {
    Analyzer az = CustomAnalyzer.builder().withTokenizer("Name").addTokenFilter("EdgeNGram", "minGramSize", "1", "maxGramSize", "20").build();
    StringReader sr = new StringReader(args[0]);
    TokenStream ts = az.tokenStream("", sr);
    OffsetAttribute oa = ts.addAttribute(OffsetAttribute.class);
    CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class);
    try {
        // Resets this stream to the beginning. (Required)
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(ta.toString() + "|" + ta.length() + "[" + oa.startOffset() + "," + oa.endOffset() + "]");
        }
        // Perform end-of-stream operations, e.g. set the final offset.
        ts.end();
    } finally {
        // Release resources associated with this stream.
        ts.close();
    }
}
Also used: TokenStream (org.apache.lucene.analysis.TokenStream), CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), StringReader (java.io.StringReader), OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute), Analyzer (org.apache.lucene.analysis.Analyzer), CustomAnalyzer (org.apache.lucene.analysis.custom.CustomAnalyzer)
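Assuming the custom "Name" tokenizer emits the whole input as a single token, running this with an argument such as "wang" would print the edge n-grams of the term, one per line, roughly as:

w|1[0,4]
wa|2[0,4]
wan|3[0,4]
wang|4[0,4]

Note that EdgeNGramTokenFilter reports the offsets of the original token for every gram, which is why each line would show the same [0,4] range.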

Example 78 with OffsetAttribute

Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project OpenGrok by OpenGrok.

From the class DefinitionsTokenStreamTest, the method testDefinitionsVsContent:

// DefinitionsTokenStream should not be used in try-with-resources
@SuppressWarnings("java:S2095")
private void testDefinitionsVsContent(boolean expandTabs, String sourceResource, String tagsResource, int expectedCount, boolean doSupplement, Map<Integer, SimpleEntry<String, String>> overrides) throws IOException {
    StreamSource src = getSourceFromResource(sourceResource);
    // Deserialize the ctags.
    int tabSize = expandTabs ? 8 : 0;
    String suppResource = doSupplement ? sourceResource : null;
    Definitions defs = StreamUtils.readTagsFromResource(tagsResource, suppResource, tabSize);
    // Read the whole input.
    StringBuilder bld = new StringBuilder();
    String source;
    try (Reader rdr = ExpandTabsReader.wrap(IOUtils.createBOMStrippedReader(src.getStream(), StandardCharsets.UTF_8.name()), tabSize)) {
        int c;
        while ((c = rdr.read()) != -1) {
            bld.append((char) c);
        }
        source = bld.toString();
    }
    // Deserialize the token stream.
    DefinitionsTokenStream tokstream = new DefinitionsTokenStream();
    tokstream.initialize(defs, src, in -> ExpandTabsReader.wrap(in, tabSize));
    // Iterate through stream.
    CharTermAttribute term = tokstream.getAttribute(CharTermAttribute.class);
    assertNotNull(term, "CharTermAttribute");
    OffsetAttribute offs = tokstream.getAttribute(OffsetAttribute.class);
    assertNotNull(offs, "OffsetAttribute");
    int count = 0;
    while (tokstream.incrementToken()) {
        ++count;
        String termValue = term.toString();
        String cutValue = source.substring(offs.startOffset(), offs.endOffset());
        // If an override exists, test it specially.
        if (overrides != null && overrides.containsKey(count)) {
            SimpleEntry<String, String> overkv = overrides.get(count);
            assertEquals(overkv.getKey(), cutValue, "cut term override" + count);
            assertEquals(overkv.getValue(), termValue, "cut term w.r.t. term override" + count);
            continue;
        }
        boolean cutContainsTerm = cutValue.endsWith(termValue);
        assertTrue(cutContainsTerm, "cut term" + count + " at " + (offs.startOffset()) + "-" + (offs.endOffset()) + "[" + cutValue + "] vs [" + termValue + "]");
    }
    assertEquals(expectedCount, count, "token count");
}
Also used: CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), StreamSource (org.opengrok.indexer.analysis.StreamSource), Definitions (org.opengrok.indexer.analysis.Definitions), OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute), Reader (java.io.Reader), ExpandTabsReader (org.opengrok.indexer.analysis.ExpandTabsReader)
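The per-character read loop above is correct but slow for large inputs; a behaviorally equivalent buffered variant in plain Java (nothing OpenGrok-specific) would be:

// Drains a Reader into a String using a char buffer instead of
// single-character reads.
private static String readFully(Reader rdr) throws IOException {
    StringBuilder bld = new StringBuilder();
    char[] buf = new char[4096];
    int n;
    while ((n = rdr.read(buf)) != -1) {
        bld.append(buf, 0, n);
    }
    return bld.toString();
}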

Example 79 with OffsetAttribute

Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project OpenGrok by OpenGrok.

From the class JFlexTokenizerTest, the method testOffsetAttribute:

/**
 * Helper method for {@link #testOffsetAttribute()} that runs the test on
 * a single implementation class with the specified input text and
 * expected tokens.
 */
private void testOffsetAttribute(Class<? extends JFlexSymbolMatcher> klass, String inputText, String[] expectedTokens) throws Exception {
    JFlexSymbolMatcher matcher = klass.getConstructor(Reader.class).newInstance(new StringReader(inputText));
    JFlexTokenizer tokenizer = new JFlexTokenizer(matcher);
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    OffsetAttribute offset = tokenizer.addAttribute(OffsetAttribute.class);
    int count = 0;
    while (tokenizer.incrementToken()) {
        assertTrue(count < expectedTokens.length, "too many tokens");
        String expected = expectedTokens[count];
        assertEquals(expected, term.toString(), "term");
        assertEquals(inputText.indexOf(expected), offset.startOffset(), "start");
        assertEquals(inputText.indexOf(expected) + expected.length(), offset.endOffset(), "end");
        count++;
    }
    assertEquals(expectedTokens.length, count, "wrong number of tokens");
}
Also used: CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), StringReader (java.io.StringReader), OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute), Reader (java.io.Reader)
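A call could look like the following; SomeSymbolMatcher is a hypothetical stand-in for any concrete JFlexSymbolMatcher implementation:

// Hypothetical invocation; the matcher class and inputs are illustrative only.
testOffsetAttribute(SomeSymbolMatcher.class,
        "alpha beta gamma",
        new String[] { "alpha", "beta", "gamma" });

Note that the offset assertions rely on inputText.indexOf(expected), so they implicitly assume each expected token's first occurrence in the input is the tokenized one; inputs with repeated tokens would need a different check.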

Example 80 with OffsetAttribute

Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project OpenGrok by OpenGrok.

From the class CustomAssertions, the method assertSymbolStream:

/**
 * Asserts the specified tokenizer class produces an expected stream of
 * symbols from the specified input.
 * @param klass the test class
 * @param iss the input stream
 * @param expectedTokens the expected, ordered token list
 * @throws java.lang.Exception if an error occurs constructing a
 * {@code klass} instance or testing the stream
 */
public static void assertSymbolStream(Class<? extends JFlexSymbolMatcher> klass, InputStream iss, List<String> expectedTokens) throws Exception {
    byte[] inputCopy = copyStream(iss);
    String input = new String(inputCopy, StandardCharsets.UTF_8);
    JFlexTokenizer tokenizer = new JFlexTokenizer(klass.getConstructor(Reader.class).newInstance(new InputStreamReader(new ByteArrayInputStream(inputCopy), StandardCharsets.UTF_8)));
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    OffsetAttribute offs = tokenizer.addAttribute(OffsetAttribute.class);
    int count = 0;
    List<String> tokens = new ArrayList<>();
    while (tokenizer.incrementToken()) {
        String termValue = term.toString();
        tokens.add(termValue);
        String cutValue = input.substring(offs.startOffset(), offs.endOffset());
        assertEquals(cutValue, termValue, "cut term" + (1 + count));
        ++count;
    }
    count = 0;
    for (String token : tokens) {
        // 1-based offset to accord with line #
        if (count >= expectedTokens.size()) {
            printTokens(tokens);
            assertTrue(count < expectedTokens.size(), "too many tokens at term" + (1 + count) + ": " + token);
        }
        String expected = expectedTokens.get(count);
        if (!token.equals(expected)) {
            printTokens(tokens);
            assertEquals(expected, token, "term" + (1 + count));
        }
        count++;
    }
    if (expectedTokens.size() != count) {
        printTokens(tokens);
        assertEquals(expectedTokens.size(), count, "wrong number of tokens");
    }
}
Also used: JFlexTokenizer (org.opengrok.indexer.analysis.JFlexTokenizer), CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), InputStreamReader (java.io.InputStreamReader), ByteArrayInputStream (java.io.ByteArrayInputStream), OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute), ArrayList (java.util.ArrayList)
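The copyStream helper is not shown on this page; a minimal sketch, assuming it simply buffers the whole stream so the bytes can be read twice:

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;

// Sketch of the copyStream helper referenced above (assumption: it drains
// the input stream into a byte array).
private static byte[] copyStream(InputStream iss) throws IOException {
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    byte[] buf = new byte[8192];
    int n;
    while ((n = iss.read(buf)) != -1) {
        baos.write(buf, 0, n);
    }
    return baos.toByteArray();
}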

Aggregations

OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute): 82 uses
CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 59 uses
TokenStream (org.apache.lucene.analysis.TokenStream): 47 uses
StringReader (java.io.StringReader): 36 uses
PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute): 33 uses
IOException (java.io.IOException): 25 uses
ArrayList (java.util.ArrayList): 23 uses
TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute): 17 uses
BytesRef (org.apache.lucene.util.BytesRef): 14 uses
PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute): 12 uses
Tokenizer (org.apache.lucene.analysis.Tokenizer): 10 uses
Reader (java.io.Reader): 9 uses
FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute): 8 uses
Analyzer (org.apache.lucene.analysis.Analyzer): 7 uses
Token (org.apache.lucene.analysis.Token): 7 uses
TermToBytesRefAttribute (org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute): 7 uses
List (java.util.List): 6 uses
PackedTokenAttributeImpl (org.apache.lucene.analysis.tokenattributes.PackedTokenAttributeImpl): 5 uses
PositionLengthAttribute (org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute): 5 uses
IndexReader (org.apache.lucene.index.IndexReader): 5 uses