Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr by apache.
Class TestSimplePatternSplitTokenizer, method testEmptyStringPatternOneMatch: the split pattern "a*" matches only the middle of "bbab", so the surrounding spans "bb" and "b" come out as tokens with their offsets.
public void testEmptyStringPatternOneMatch() throws Exception {
  Tokenizer t = new SimplePatternSplitTokenizer("a*");
  CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
  t.setReader(new StringReader("bbab"));
  assertTokenStreamContents(t, new String[] { "bb", "b" }, new int[] { 0, 3 }, new int[] { 2, 4 });
}
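For comparison, a minimal hand-rolled version of the same check, assuming only the standard TokenStream contract (reset, incrementToken, end, close) and that the tokenizer exposes an OffsetAttribute; this is a sketch of what assertTokenStreamContents verifies, not the helper itself:
Tokenizer t = new SimplePatternSplitTokenizer("a*");
CharTermAttribute termAtt = t.addAttribute(CharTermAttribute.class);
OffsetAttribute offsetAtt = t.addAttribute(OffsetAttribute.class);
t.setReader(new StringReader("bbab"));
t.reset();
while (t.incrementToken()) {
  // For "bbab" this visits "bb" [0,2) and then "b" [3,4)
  System.out.println(termAtt.toString() + " " + offsetAtt.startOffset() + "-" + offsetAtt.endOffset());
}
t.end();
t.close();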
Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr by apache.
Class TestSimplePatternSplitTokenizer, method testNoTokens: when the split pattern ".*" consumes the entire input, the tokenizer emits nothing.
public void testNoTokens() throws Exception {
  Tokenizer t = new SimplePatternSplitTokenizer(".*");
  CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
  String s;
  while (true) {
    // Keep drawing random strings until we get a non-empty one
    s = TestUtil.randomUnicodeString(random());
    if (s.length() > 0) {
      break;
    }
  }
  t.setReader(new StringReader(s));
  t.reset();
  // The split pattern matches the whole input, so nothing is left over to emit
  assertFalse(t.incrementToken());
}
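As a counterpoint, a sketch (with a made-up input string) of the same consumption loop when the split pattern matches only part of the input, so tokens are produced:
Tokenizer t = new SimplePatternSplitTokenizer(",");
CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
t.setReader(new StringReader("foo,bar"));
t.reset();
while (t.incrementToken()) {
  System.out.println(termAtt.toString()); // prints "foo" then "bar"
}
t.end();
t.close();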
Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr by apache.
Class TestElision, method filter: a helper that drains a TokenFilter and collects the text of every token via CharTermAttribute.
private List<String> filter(TokenFilter filter) throws IOException {
  List<String> tas = new ArrayList<>();
  CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
  filter.reset();
  while (filter.incrementToken()) {
    tas.add(termAtt.toString());
  }
  filter.end();
  filter.close();
  return tas;
}
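A usage sketch for the helper above (hypothetical input text; ElisionFilter and CharArraySet come from the Lucene analysis modules, and the article set here is assumed rather than taken from the test):
Tokenizer tokenizer = new StandardTokenizer();
tokenizer.setReader(new StringReader("l'avion d'exemple"));
CharArraySet articles = new CharArraySet(Arrays.asList("l", "d"), true);
List<String> terms = filter(new ElisionFilter(tokenizer, articles));
// terms should now contain ["avion", "exemple"]: the elided articles are stripped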
Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr by apache.
Class TestDocumentWriter, method testTokenReuse: an inline TokenFilter injects a synonym token "b" after every input token using captureState/restoreState, and the test then checks positions and payloads in the resulting segment.
public void testTokenReuse() throws IOException {
  Analyzer analyzer = new Analyzer() {
    @Override
    public TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      return new TokenStreamComponents(tokenizer, new TokenFilter(tokenizer) {
        boolean first = true;
        AttributeSource.State state;

        @Override
        public boolean incrementToken() throws IOException {
          if (state != null) {
            restoreState(state);
            payloadAtt.setPayload(null);
            posIncrAtt.setPositionIncrement(0);
            termAtt.setEmpty().append("b");
            state = null;
            return true;
          }
          boolean hasNext = input.incrementToken();
          if (!hasNext)
            return false;
          if (Character.isDigit(termAtt.buffer()[0])) {
            posIncrAtt.setPositionIncrement(termAtt.buffer()[0] - '0');
          }
          if (first) {
            // set payload on first position only
            payloadAtt.setPayload(new BytesRef(new byte[] { 100 }));
            first = false;
          }
          // index a "synonym" for every token
          state = captureState();
          return true;
        }

        @Override
        public void reset() throws IOException {
          super.reset();
          first = true;
          state = null;
        }

        final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
        final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
        final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
      });
    }
  };
  IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(analyzer));
  Document doc = new Document();
  doc.add(newTextField("f1", "a 5 a a", Field.Store.YES));
  writer.addDocument(doc);
  writer.commit();
  SegmentCommitInfo info = writer.newestSegment();
  writer.close();
  SegmentReader reader = new SegmentReader(info, Version.LATEST.major, newIOContext(random()));
  PostingsEnum termPositions = MultiFields.getTermPositionsEnum(reader, "f1", new BytesRef("a"));
  assertTrue(termPositions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  int freq = termPositions.freq();
  assertEquals(3, freq);
  assertEquals(0, termPositions.nextPosition());
  assertNotNull(termPositions.getPayload());
  assertEquals(6, termPositions.nextPosition());
  assertNull(termPositions.getPayload());
  assertEquals(7, termPositions.nextPosition());
  assertNull(termPositions.getPayload());
  reader.close();
}
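The captureState/restoreState idiom in that anonymous filter is the part that exercises CharTermAttribute most directly; pulled out on its own it looks roughly like the sketch below (hypothetical class name, reduced to the behavior of stacking a "b" synonym on each position):
final class SynonymEveryToken extends TokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
  private AttributeSource.State state;

  SynonymEveryToken(TokenStream in) {
    super(in);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (state != null) {
      restoreState(state);                 // bring back all attributes of the previous token
      posIncrAtt.setPositionIncrement(0);  // stack the synonym on the same position
      termAtt.setEmpty().append("b");      // overwrite the term text
      state = null;
      return true;
    }
    if (!input.incrementToken()) {
      return false;
    }
    state = captureState();                // remember this token so its synonym follows
    return true;
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    state = null;
  }
}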
Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project Anserini by castorini.
Class SequentialDependenceModel, method computeUnorderedFrequencyScore: streams a document's term vector and counts how often adjacent query-term pairs co-occur inside a sliding window.
private float computeUnorderedFrequencyScore(Document doc, Terms terms, RerankerContext context) throws IOException {
  List<String> queryTokens = context.getQueryTokens();
  // Construct token stream with offset 0
  TokenStream stream = new TokenStreamFromTermVector(terms, 0);
  CharTermAttribute termAttribute = stream.addAttribute(CharTermAttribute.class);
  Map<String, String> queryPairMap = new HashMap<>();
  Map<String, Integer> phraseCountMap = new HashMap<>();
  Map<String, Integer> singleCountMap = new HashMap<>();
  // Construct a count map and a map of phrase pair x y, x->y
  for (int i = 0; i < queryTokens.size() - 1; i++) {
    queryPairMap.put(queryTokens.get(i), queryTokens.get(i + 1));
    phraseCountMap.put(queryTokens.get(i), 0);
    // This will serve as our smoothing param
    singleCountMap.put(queryTokens.get(i), 1);
  }
  int docSize = 0;
  // We will maintain a FIFO queue of window size
  LinkedList<String> window = new LinkedList<>();
  while (stream.incrementToken() && docSize <= WINDOW_SIZE * 2) {
    // First construct the window that we need to test on
    docSize++;
    String token = termAttribute.toString();
    window.add(token);
  }
  // But we need to account for the case when the token stream just doesn't have that many tokens
  for (int i = 0; i < Math.min(WINDOW_SIZE - 1, docSize); i++) {
    String firstToken = window.get(i);
    if (queryPairMap.containsKey(firstToken) && window.contains(queryPairMap.get(firstToken))) {
      phraseCountMap.put(firstToken, phraseCountMap.get(firstToken) + 1);
      singleCountMap.put(firstToken, singleCountMap.get(firstToken) + 1);
    }
  }
  // Now we continue
  while (stream.incrementToken()) {
    docSize++;
    String token = termAttribute.toString();
    window.add(token);
    // Move the window along
    // The window at this point is guaranteed to be of size WINDOW_SIZE * 2 because of the previous loop;
    // if there are not enough tokens this would not even execute
    window.removeFirst();
    // Now test for the phrase at the test index WINDOW_SIZE - 1
    String firstToken = window.get(WINDOW_SIZE - 1);
    if (queryPairMap.containsKey(firstToken) && window.contains(queryPairMap.get(firstToken))) {
      phraseCountMap.put(firstToken, phraseCountMap.get(firstToken) + 1);
      singleCountMap.put(firstToken, singleCountMap.get(firstToken) + 1);
    }
  }
  float score = 0.0f;
  // Smoothing count of 1
  docSize++;
  for (String queryToken : phraseCountMap.keySet()) {
    float countToUse = phraseCountMap.get(queryToken);
    if (countToUse == 0) {
      countToUse = singleCountMap.get(queryToken);
    }
    score += Math.log(countToUse / (float) docSize);
  }
  return score;
}
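Stripped of the windowing logic, the CharTermAttribute pattern this scorer relies on is the standard consume loop; a minimal sketch (hypothetical helper name, and with the reset/end/close calls the TokenStream contract normally expects):
private Map<String, Integer> countTerms(TokenStream stream) throws IOException {
  Map<String, Integer> counts = new HashMap<>();
  CharTermAttribute termAttribute = stream.addAttribute(CharTermAttribute.class);
  stream.reset();
  while (stream.incrementToken()) {
    counts.merge(termAttribute.toString(), 1, Integer::sum);
  }
  stream.end();
  stream.close();
  return counts;
}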