Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project Anserini by castorini.
In the class TweetTokenizationTest, the method parseKeywords:
public List<String> parseKeywords(Analyzer analyzer, String keywords) throws IOException {
  List<String> list = new ArrayList<>();
  TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(keywords));
  CharTermAttribute cattr = tokenStream.addAttribute(CharTermAttribute.class);
  tokenStream.reset();
  while (tokenStream.incrementToken()) {
    if (cattr.toString().length() == 0) {
      continue;
    }
    list.add(cattr.toString());
  }
  tokenStream.end();
  tokenStream.close();
  return list;
}
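A minimal usage sketch (not part of the Anserini test) of the parseKeywords helper above, using a stock Lucene EnglishAnalyzer; the analyzer choice and input string are illustrative assumptions rather than Anserini's own analysis chain.
// Sketch only: EnglishAnalyzer stands in for whatever analyzer the caller supplies.
try (Analyzer analyzer = new EnglishAnalyzer()) {  // org.apache.lucene.analysis.en.EnglishAnalyzer
  List<String> tokens = parseKeywords(analyzer, "Jumping over the lazy dog");
  // Stop words are dropped and remaining terms are stemmed, e.g. [jump, over, lazi, dog]
  System.out.println(tokens);
}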
Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project tika by apache.
In the class TokenCounterTest, the method testCJKFilter:
@Test
public void testCJKFilter() throws Exception {
  String s = "then quickbrownfoxjumpedoverthelazy dogss dog 普林斯顿大学";
  Analyzer analyzer = analyzerManager.getCommonTokensAnalyzer();
  TokenStream ts = analyzer.tokenStream(FIELD, s);
  CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
  ts.reset();
  Map<String, Integer> tokens = new HashMap<>();
  while (ts.incrementToken()) {
    String t = termAtt.toString();
    Integer count = tokens.get(t);
    count = (count == null) ? 0 : count;
    count++;
    tokens.put(t, count);
  }
  ts.end();
  ts.close();
  assertEquals(7, tokens.size());
  assertEquals(new Integer(1), tokens.get("林斯"));
}
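The assertion on "林斯" reflects CJK bigramming: 普林斯顿大学 is emitted as overlapping two-character tokens. Below is a minimal sketch of that behavior using Lucene's StandardTokenizer and CJKBigramFilter; that this is the filter behind tika's common-tokens analyzer is an assumption, not shown by the test itself.
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cjk.CJKBigramFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class CjkBigramSketch {
  public static void main(String[] args) throws IOException {
    // Analyzer wiring is an assumption: tokenize, then form CJK bigrams.
    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer source = new StandardTokenizer();
        return new TokenStreamComponents(source, new CJKBigramFilter(source));
      }
    };
    try (TokenStream ts = analyzer.tokenStream("f", "普林斯顿大学")) {
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        // Prints the overlapping bigrams: 普林, 林斯, 斯顿, 顿大, 大学
        System.out.println(termAtt.toString());
      }
      ts.end();
    }
  }
}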
Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project tika by apache.
In the class AnalyzerManagerTest, the method testTokenCountFilter:
@Test
public void testTokenCountFilter() throws Exception {
  AnalyzerManager analyzerManager = AnalyzerManager.newInstance(1000000);
  StringBuilder sb = new StringBuilder();
  for (int i = 0; i < 1001000; i++) {
    sb.append("the ");
  }
  TokenStream ts = analyzerManager.getGeneralAnalyzer().tokenStream("f", sb.toString());
  ts.reset();
  // termAtt is not used below; the test only counts how many tokens are emitted.
  CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
  int tokens = 0;
  while (ts.incrementToken()) {
    tokens++;
  }
  // Release the stream, as in the other tests.
  ts.end();
  ts.close();
  assertEquals(1000000, tokens);
}
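The expected count of exactly 1,000,000 suggests the general analyzer caps its output at the limit passed to AnalyzerManager.newInstance. Below is a sketch of that capping behavior using Lucene's LimitTokenCountFilter; whether AnalyzerManager uses this filter internally is an assumption, and WhitespaceTokenizer is an assumed stand-in.
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter;

public class TokenLimitSketch {
  public static void main(String[] args) throws IOException {
    final int maxTokens = 1_000_000;
    Analyzer capped = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer source = new WhitespaceTokenizer();
        // consumeAllTokens=true drains the remaining input once the cap is reached
        return new TokenStreamComponents(source, new LimitTokenCountFilter(source, maxTokens, true));
      }
    };
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < 1_001_000; i++) {
      sb.append("the ");
    }
    int tokens = 0;
    try (TokenStream ts = capped.tokenStream("f", sb.toString())) {
      ts.reset();
      while (ts.incrementToken()) {
        tokens++;
      }
      ts.end();
    }
    System.out.println(tokens); // 1000000
  }
}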
Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project tika by apache.
In the class AnalyzerManagerTest, the method testCommon:
@Test
public void testCommon() throws Exception {
  AnalyzerManager analyzerManager = AnalyzerManager.newInstance(100000);
  Analyzer common = analyzerManager.getCommonTokensAnalyzer();
  TokenStream ts = common.tokenStream("f", "the 5,000.12 and dirty dog");
  ts.reset();
  CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
  Set<String> seen = new HashSet<>();
  while (ts.incrementToken()) {
    String t = termAtt.toString();
    if (AlphaIdeographFilterFactory.isAlphabetic(t.toCharArray()) && t.contains("5")) {
      fail("Shouldn't have found a numeric");
    }
    seen.add(termAtt.toString());
  }
  ts.end();
  ts.close();
  assertTrue(seen.contains("dirty"));
  assertFalse(seen.contains("the"));
}
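For comparison, a stock Lucene EnglishAnalyzer also drops "the" as a stop word, though unlike the common-tokens analyzer it stems the surviving terms. The fragment below is purely illustrative and not part of the tika test.
// Sketch: stock EnglishAnalyzer, shown only to contrast stop-word removal and stemming.
try (Analyzer en = new EnglishAnalyzer();
     TokenStream ts = en.tokenStream("f", "the 5,000.12 and dirty dog")) {
  CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
  ts.reset();
  Set<String> seen = new HashSet<>();
  while (ts.incrementToken()) {
    seen.add(termAtt.toString());
  }
  ts.end();
  System.out.println(seen.contains("the"));   // false: removed as a stop word
  System.out.println(seen.contains("dirti")); // true: "dirty" after Porter stemming
}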
Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project Anserini by castorini.
In the class CountBigramPairs, the method countPairs:
/**
 * Counts co-occurrences of the pairs specified in queryPairMap and stores the
 * counts for each window size in counters.
 * NOTE: this method mutates its inputs.
 * @param singleCountMap a count of single tokens as we encounter them, useful if any smoothing is applied
 * @param queryPairMap all ordered pairs of strings we are looking for
 * @param backQueryPairMap all reversed pairs, i.e. if the query is "test query", this map would contain "query test"
 * @param gapSizes list of window sizes to compute counts for
 * @param counters map from window size to its counter
 * @param terms the term vector of the document to scan
 * @throws IOException if the token stream cannot be read
 */
public static void countPairs(Map<String, Integer> singleCountMap, Map<String, Set<String>> queryPairMap, Map<String, Set<String>> backQueryPairMap, ArrayList<Integer> gapSizes, Map<Integer, PhraseCounter> counters, Terms terms) throws IOException {
  // Construct a token stream from the term vector (-1: no limit on start offset)
  TokenStreamFromTermVector stream = new TokenStreamFromTermVector(terms, -1);
  CharTermAttribute termAttribute = stream.addAttribute(CharTermAttribute.class);
  int docSize = 0;
  int maxGapSize = 0;
  for (Integer windowSize : gapSizes) {
    if (windowSize > maxGapSize) {
      maxGapSize = windowSize;
    }
  }
  // We maintain a FIFO queue as the sliding window
  LinkedList<String> window = new LinkedList<>();
  // Fill the window first and process the leading tokens
  stream.reset();
  while (docSize < maxGapSize * 2 + 2 && stream.incrementToken()) {
    // First construct the window that we need to test on
    docSize++;
    String token = termAttribute.toString();
    window.add(token);
  }
  // But we need to account for the case when the token stream just doesn't have that many tokens
  for (int i = 0; i < Math.min(maxGapSize + 1, docSize); i++) {
    String firstToken = window.get(i);
    // Look ahead for the paired token
    if (queryPairMap.containsKey(firstToken)) {
      // Count unigram for this token
      singleCountMap.put(firstToken, singleCountMap.get(firstToken) + 1);
      for (int j = i + 1; j < Math.min(i + maxGapSize + 1, docSize); j++) {
        if (queryPairMap.get(firstToken).contains(window.get(j))) {
          for (int windowSize : counters.keySet()) {
            if (j - i <= windowSize)
              counters.get(windowSize).incrementCount(firstToken);
          }
        }
      }
    }
    if (backQueryPairMap.containsKey(firstToken)) {
      // Same scan for the reversed pairs
      for (int j = i + 1; j < Math.min(i + maxGapSize + 1, docSize); j++) {
        if (backQueryPairMap.get(firstToken).contains(window.get(j))) {
          for (int windowSize : counters.keySet()) {
            if (j - i <= windowSize)
              counters.get(windowSize).incrementCount(window.get(j));
          }
        }
      }
    }
  }
  // Now we continue with the rest of the stream
  while (stream.incrementToken()) {
    docSize++;
    String token = termAttribute.toString();
    window.add(token);
    // Move the window along.
    // After removing the head, the window is guaranteed to hold maxGapSize * 2 + 2 tokens because of the
    // previous loop; if there are not enough tokens this loop would not even execute.
    window.removeFirst();
    // Now test for the phrase at the middle index maxGapSize
    String firstToken = window.get(maxGapSize);
    if (queryPairMap.containsKey(firstToken)) {
      // Count unigram for this token
      singleCountMap.put(firstToken, singleCountMap.get(firstToken) + 1);
      for (int j = maxGapSize + 1; j < maxGapSize * 2 + 2; j++) {
        if (queryPairMap.get(firstToken).contains(window.get(j))) {
          for (int windowSize : counters.keySet()) {
            if (j - maxGapSize <= windowSize)
              counters.get(windowSize).incrementCount(firstToken);
          }
        }
      }
    }
    if (backQueryPairMap.containsKey(firstToken)) {
      // Same scan for the reversed pairs
      for (int j = maxGapSize + 1; j < maxGapSize * 2 + 2; j++) {
        if (backQueryPairMap.get(firstToken).contains(window.get(j))) {
          for (int windowSize : counters.keySet()) {
            if (j - maxGapSize <= windowSize)
              counters.get(windowSize).incrementCount(window.get(j));
          }
        }
      }
    }
  }
  // The unprocessed portion runs from index maxGapSize + 1 to the end of the window
  for (int i = maxGapSize + 1; i < Math.min(maxGapSize * 2 + 1, docSize); i++) {
    String firstToken = window.get(i);
    if (queryPairMap.containsKey(firstToken)) {
      // Count unigram for this token
      singleCountMap.put(firstToken, singleCountMap.get(firstToken) + 1);
      for (int j = i + 1; j < Math.min(maxGapSize * 2 + 2, docSize); j++) {
        if (queryPairMap.get(firstToken).contains(window.get(j))) {
          for (int windowSize : counters.keySet()) {
            if (j - i <= windowSize)
              counters.get(windowSize).incrementCount(firstToken);
          }
        }
      }
    }
    if (backQueryPairMap.containsKey(firstToken)) {
      // Same scan for the reversed pairs
      for (int j = i + 1; j < Math.min(maxGapSize * 2 + 2, docSize); j++) {
        if (backQueryPairMap.get(firstToken).contains(window.get(j))) {
          for (int windowSize : counters.keySet()) {
            if (j - i <= windowSize)
              counters.get(windowSize).incrementCount(window.get(j));
          }
        }
      }
    }
  }
  stream.end();
  stream.close();
}
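A hypothetical setup sketch for the two-term query "test query" described in the Javadoc above. The index reader, document id, "contents" field name, and the no-argument PhraseCounter constructor are assumptions for illustration, not taken from Anserini.
// Hypothetical caller of countPairs; reader, docId and "contents" are placeholders.
Map<String, Integer> singleCountMap = new HashMap<>();
singleCountMap.put("test", 0);
singleCountMap.put("query", 0);

Map<String, Set<String>> queryPairMap = new HashMap<>();
queryPairMap.put("test", new HashSet<>(Arrays.asList("query")));      // ordered pair (test, query)

Map<String, Set<String>> backQueryPairMap = new HashMap<>();
backQueryPairMap.put("query", new HashSet<>(Arrays.asList("test")));  // reversed pair (query, test)

ArrayList<Integer> gapSizes = new ArrayList<>(Arrays.asList(2, 5));   // window sizes to count for
Map<Integer, PhraseCounter> counters = new HashMap<>();
for (int gap : gapSizes) {
  counters.put(gap, new PhraseCounter());                             // assumed no-arg constructor
}

Terms terms = reader.getTermVector(docId, "contents");                // term vector for one document
CountBigramPairs.countPairs(singleCountMap, queryPairMap, backQueryPairMap, gapSizes, counters, terms);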