
Example 71 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project Anserini by castorini.

The class TweetTokenizationTest, method parseKeywords:

public List<String> parseKeywords(Analyzer analyzer, String keywords) throws IOException {
    List<String> list = new ArrayList<>();
    TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(keywords));
    CharTermAttribute cattr = tokenStream.addAttribute(CharTermAttribute.class);
    // Standard TokenStream lifecycle: reset, consume, end, close.
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        String term = cattr.toString();
        // Skip any empty terms the analyzer may emit.
        if (term.isEmpty()) {
            continue;
        }
        list.add(term);
    }
    tokenStream.end();
    tokenStream.close();
    return list;
}
Also used: TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) ArrayList(java.util.ArrayList) StringReader(java.io.StringReader)
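The helper above follows the standard TokenStream contract: reset() before the first incrementToken(), then end() and close() once the stream is exhausted. A minimal, hypothetical caller might look like the sketch below; StandardAnalyzer and the literal query string are illustrative stand-ins, not Anserini's actual tweet analyzer.

import java.io.IOException;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

public class ParseKeywordsDemo {
    public static void main(String[] args) throws IOException {
        // StandardAnalyzer is a generic word-boundary tokenizer; the real
        // test supplies a tweet-specific analyzer instead. Instantiating the
        // test class directly is only for illustration.
        try (Analyzer analyzer = new StandardAnalyzer()) {
            List<String> tokens = new TweetTokenizationTest()
                    .parseKeywords(analyzer, "Hello, Lucene tokenization!");
            System.out.println(tokens); // [hello, lucene, tokenization]
        }
    }
}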

Example 72 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project tika by apache.

The class TokenCounterTest, method testCJKFilter:

@Test
public void testCJKFilter() throws Exception {
    String s = "then quickbrownfoxjumpedoverthelazy dogss dog 普林斯顿大学";
    Analyzer analyzer = analyzerManager.getCommonTokensAnalyzer();
    TokenStream ts = analyzer.tokenStream(FIELD, s);
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    ts.reset();
    Map<String, Integer> tokens = new HashMap<>();
    while (ts.incrementToken()) {
        String t = termAtt.toString();
        // Increment the per-token count; merge() handles the first occurrence.
        tokens.merge(t, 1, Integer::sum);
    }
    ts.end();
    ts.close();
    assertEquals(7, tokens.size());
    assertEquals(Integer.valueOf(1), tokens.get("林斯"));
}
Also used: TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) HashMap(java.util.HashMap) Analyzer(org.apache.lucene.analysis.Analyzer) Test(org.junit.Test)
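The expected count of 1 for "林斯" comes from CJK bigram tokenization: runs of ideographs are emitted as overlapping two-character tokens. The sketch below uses Lucene's stock CJKAnalyzer to show where those bigrams come from; Tika's actual analyzer chain layers further filters on top, so this is an approximation.

import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class CjkBigramDemo {
    public static void main(String[] args) throws IOException {
        // CJKAnalyzer emits overlapping two-character tokens for ideographic runs.
        try (CJKAnalyzer analyzer = new CJKAnalyzer()) {
            TokenStream ts = analyzer.tokenStream("f", "普林斯顿大学");
            CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(termAtt); // 普林, 林斯, 斯顿, 顿大, 大学
            }
            ts.end();
            ts.close();
        }
    }
}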

Example 73 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project tika by apache.

The class AnalyzerManagerTest, method testTokenCountFilter:

@Test
public void testTokenCountFilter() throws Exception {
    AnalyzerManager analyzerManager = AnalyzerManager.newInstance(1000000);
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < 1001000; i++) {
        sb.append("the ");
    }
    TokenStream ts = analyzerManager.getGeneralAnalyzer().tokenStream("f", sb.toString());
    ts.reset();
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    int tokens = 0;
    while (ts.incrementToken()) {
        tokens++;
    }
    ts.end();
    ts.close();
    assertEquals(1000000, tokens);
}
Also used: TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) AnalyzerManager(org.apache.tika.eval.tokens.AnalyzerManager) Test(org.junit.Test)
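The test feeds 1,001,000 tokens in and asserts that exactly 1,000,000 come out. AnalyzerManager's implementation is not shown here, but Lucene's LimitTokenCountAnalyzer is the standard way to impose such a cap, so the following sketch assumes it as a stand-in.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.LimitTokenCountAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

public class TokenLimitDemo {
    public static void main(String[] args) throws IOException {
        // Wrap any analyzer so each stream it produces stops after at most five tokens.
        try (Analyzer limited = new LimitTokenCountAnalyzer(new StandardAnalyzer(), 5)) {
            TokenStream ts = limited.tokenStream("f", "one two three four five six seven");
            ts.reset();
            int count = 0;
            while (ts.incrementToken()) {
                count++;
            }
            ts.end();
            ts.close();
            System.out.println(count); // 5: the sixth and seventh tokens are dropped
        }
    }
}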

Example 74 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project tika by apache.

The class AnalyzerManagerTest, method testCommon:

@Test
public void testCommon() throws Exception {
    AnalyzerManager analyzerManager = AnalyzerManager.newInstance(100000);
    Analyzer common = analyzerManager.getCommonTokensAnalyzer();
    TokenStream ts = common.tokenStream("f", "the 5,000.12 and dirty dog");
    ts.reset();
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    Set<String> seen = new HashSet<>();
    while (ts.incrementToken()) {
        String t = termAtt.toString();
        if (AlphaIdeographFilterFactory.isAlphabetic(t.toCharArray()) && t.contains("5")) {
            fail("Shouldn't have found a numeric");
        }
        seen.add(t);
    }
    ts.end();
    ts.close();
    assertTrue(seen.contains("dirty"));
    assertFalse(seen.contains("the"));
}
Also used: TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) Analyzer(org.apache.lucene.analysis.Analyzer) AnalyzerManager(org.apache.tika.eval.tokens.AnalyzerManager) HashSet(java.util.HashSet) Test(org.junit.Test)
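The assertions show two behaviors of the common-tokens analyzer: stopwords such as "the" are removed, while ordinary words like "dirty" pass through. As a rough stand-in, the sketch below assumes a plain StandardAnalyzer configured with Lucene's English stop set; Tika's real chain additionally strips numeric and non-alphabetic tokens via AlphaIdeographFilterFactory, which this sketch does not model.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class StopwordSketch {
    public static void main(String[] args) throws IOException {
        // StandardAnalyzer with an explicit English stop set.
        try (Analyzer analyzer = new StandardAnalyzer(EnglishAnalyzer.getDefaultStopSet())) {
            TokenStream ts = analyzer.tokenStream("f", "the dirty dog");
            CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(termAtt); // dirty, dog ("the" is removed)
            }
            ts.end();
            ts.close();
        }
    }
}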

Example 75 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project Anserini by castorini.

The class CountBigramPairs, method countPairs:

/**
 * Counts co-occurrences of the pairs specified in queryPairMap
 * and stores the counts for each window size in counters.
 * NOTE: this method mutates its inputs.
 * @param singleCountMap    running count of single tokens as they are encountered, useful for smoothing
 * @param queryPairMap      all ordered pairs of strings we are looking for
 * @param backQueryPairMap  the same pairs reversed, i.e. if the query is "test query", this map would contain "query test"
 * @param gapSizes          list of window sizes to compute counts for
 * @param counters          map from window size to counter
 * @param terms             term vector to stream tokens from
 * @throws IOException if the term vector cannot be read
 */
public static void countPairs(Map<String, Integer> singleCountMap, Map<String, Set<String>> queryPairMap, Map<String, Set<String>> backQueryPairMap, ArrayList<Integer> gapSizes, Map<Integer, PhraseCounter> counters, Terms terms) throws IOException {
    // Reconstruct a token stream from the term vector (-1 means no limit on start offset)
    TokenStreamFromTermVector stream = new TokenStreamFromTermVector(terms, -1);
    CharTermAttribute termAttribute = stream.addAttribute(CharTermAttribute.class);
    int docSize = 0;
    int maxGapSize = 0;
    for (Integer windowSize : gapSizes) {
        if (windowSize > maxGapSize) {
            maxGapSize = windowSize;
        }
    }
    // We maintain a FIFO queue holding the current window of tokens
    LinkedList<String> window = new LinkedList<>();
    // Fill the window first, then process the leading tokens
    stream.reset();
    while (docSize < maxGapSize * 2 + 2 && stream.incrementToken()) {
        // First construct the window that we need to test on
        docSize++;
        String token = termAttribute.toString();
        window.add(token);
    }
    // Account for the case where the token stream has fewer tokens than the full window
    for (int i = 0; i < Math.min(maxGapSize + 1, docSize); i++) {
        String firstToken = window.get(i);
        // Look ahead for token
        if (queryPairMap.containsKey(firstToken)) {
            // Count unigram for this token
            singleCountMap.put(firstToken, singleCountMap.get(firstToken) + 1);
            for (int j = i + 1; j < Math.min(i + maxGapSize + 1, docSize); j++) {
                if (queryPairMap.get(firstToken).contains(window.get(j))) {
                    for (int windowSize : counters.keySet()) {
                        if (j - i <= windowSize)
                            counters.get(windowSize).incrementCount(firstToken);
                    }
                }
            }
        }
        if (backQueryPairMap.containsKey(firstToken)) {
            // Count unigram for this token
            for (int j = i + 1; j < Math.min(i + maxGapSize + 1, docSize); j++) {
                if (backQueryPairMap.get(firstToken).contains(window.get(j))) {
                    for (int windowSize : counters.keySet()) {
                        if (j - i <= windowSize)
                            counters.get(windowSize).incrementCount(window.get(j));
                    }
                }
            }
        }
    }
    // Now we continue
    while (stream.incrementToken()) {
        docSize++;
        String token = termAttribute.toString();
        window.add(token);
        // Slide the window along. At this point the window is guaranteed to hold
        // maxGapSize * 2 + 2 tokens because of the previous loop; with fewer tokens
        // this loop would not execute at all.
        window.removeFirst();
        // Now test for pairs anchored at the middle of the window, index maxGapSize
        String firstToken = window.get(maxGapSize);
        if (queryPairMap.containsKey(firstToken)) {
            // Count unigram for this token
            singleCountMap.put(firstToken, singleCountMap.get(firstToken) + 1);
            for (int j = maxGapSize + 1; j < maxGapSize * 2 + 2; j++) {
                if (queryPairMap.get(firstToken).contains(window.get(j))) {
                    for (int windowSize : counters.keySet()) {
                        if (j - maxGapSize <= windowSize)
                            counters.get(windowSize).incrementCount(firstToken);
                    }
                }
            }
        }
        if (backQueryPairMap.containsKey(firstToken)) {
            // Count unigram for this token
            for (int j = maxGapSize + 1; j < maxGapSize * 2 + 2; j++) {
                if (backQueryPairMap.get(firstToken).contains(window.get(j))) {
                    for (int windowSize : counters.keySet()) {
                        if (j - maxGapSize <= windowSize)
                            counters.get(windowSize).incrementCount(window.get(j));
                    }
                }
            }
        }
    }
    // The unprocessed portion is the tail of the window, from index maxGapSize + 1 to the end
    for (int i = maxGapSize + 1; i < Math.min(maxGapSize * 2 + 1, docSize); i++) {
        String firstToken = window.get(i);
        if (queryPairMap.containsKey(firstToken)) {
            // Count unigram for this token
            singleCountMap.put(firstToken, singleCountMap.get(firstToken) + 1);
            for (int j = i + 1; j < Math.min(maxGapSize * 2 + 2, docSize); j++) {
                if (queryPairMap.get(firstToken).contains(window.get(j))) {
                    for (int windowSize : counters.keySet()) {
                        if (j - i <= windowSize)
                            counters.get(windowSize).incrementCount(firstToken);
                    }
                }
            }
        }
        if (backQueryPairMap.containsKey(firstToken)) {
            // Count unigram for this token
            for (int j = i + 1; j < Math.min(maxGapSize * 2 + 2, docSize); j++) {
                if (backQueryPairMap.get(firstToken).contains(window.get(j))) {
                    for (int windowSize : counters.keySet()) {
                        if (j - i <= windowSize)
                            counters.get(windowSize).incrementCount(window.get(j));
                    }
                }
            }
        }
    }
    stream.end();
    stream.close();
}
Also used: TokenStreamFromTermVector(org.apache.lucene.search.highlight.TokenStreamFromTermVector) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute)
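Stripped of the Lucene plumbing and the FIFO-window streaming, the core of countPairs is a bounded co-occurrence count: for each occurrence of a left-hand token, look ahead and credit every window size large enough to contain a matching right-hand token. The following self-contained sketch illustrates that idea on a plain token list; the names and the List-based input are illustrative, not Anserini's API, and it omits the streaming optimization that keeps countPairs memory-bounded.

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

public class PairCountSketch {
    /**
     * For each window size, count how often a token from seconds appears
     * within that many positions after an occurrence of first.
     */
    static Map<Integer, Integer> countWithinWindows(List<String> tokens,
                                                    String first,
                                                    Set<String> seconds,
                                                    List<Integer> windowSizes) {
        Map<Integer, Integer> counts = new HashMap<>();
        for (int w : windowSizes) {
            counts.put(w, 0);
        }
        for (int i = 0; i < tokens.size(); i++) {
            if (!tokens.get(i).equals(first)) continue;
            for (int j = i + 1; j < tokens.size(); j++) {
                if (!seconds.contains(tokens.get(j))) continue;
                // Credit every window size large enough to span the gap.
                for (int w : windowSizes) {
                    if (j - i <= w) counts.merge(w, 1, Integer::sum);
                }
            }
        }
        return counts;
    }

    public static void main(String[] args) {
        List<String> doc = List.of("test", "a", "query", "b", "test", "query");
        System.out.println(countWithinWindows(doc, "test", Set.of("query"), List.of(1, 2)));
        // {1=1, 2=2}: with window 1 only the adjacent pair at positions 4-5 counts;
        // with window 2 the pair at positions 0-2 also counts.
    }
}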

Aggregations

CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 151 uses
TokenStream (org.apache.lucene.analysis.TokenStream): 95 uses
StringReader (java.io.StringReader): 46 uses
OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute): 35 uses
PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute): 34 uses
IOException (java.io.IOException): 27 uses
ArrayList (java.util.ArrayList): 27 uses
Tokenizer (org.apache.lucene.analysis.Tokenizer): 25 uses
Analyzer (org.apache.lucene.analysis.Analyzer): 20 uses
PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute): 16 uses
BytesRef (org.apache.lucene.util.BytesRef): 15 uses
TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute): 13 uses
LinkedList (java.util.LinkedList): 11 uses
FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute): 10 uses
Term (org.apache.lucene.index.Term): 10 uses
HashMap (java.util.HashMap): 9 uses
Token (org.apache.lucene.analysis.Token): 8 uses
Document (org.apache.lucene.document.Document): 8 uses
List (java.util.List): 7 uses
HashSet (java.util.HashSet): 6 uses