Use of org.apache.lucene.analysis.Token in project lucene-solr by apache.
From the class BasePostingsFormatTestCase, the method testPostingsEnumAll, which indexes a document with two canned "bar" tokens and exercises PostingsEnum at every flag level (freqs, positions, offsets, payloads), including enum reuse:
public void testPostingsEnumAll() throws Exception {
  Directory dir = newDirectory();
  IndexWriterConfig iwc = new IndexWriterConfig(null);
  IndexWriter iw = new IndexWriter(dir, iwc);
  Document doc = new Document();
  Token token1 = new Token("bar", 0, 3);
  token1.setPayload(new BytesRef("pay1"));
  Token token2 = new Token("bar", 4, 7);
  token2.setPayload(new BytesRef("pay2"));
  FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
  ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
  doc.add(new Field("foo", new CannedTokenStream(token1, token2), ft));
  iw.addDocument(doc);
  DirectoryReader reader = DirectoryReader.open(iw);
  // sugar method (FREQS)
  PostingsEnum postings = getOnlyLeafReader(reader).postings(new Term("foo", "bar"));
  assertEquals(-1, postings.docID());
  assertEquals(0, postings.nextDoc());
  assertEquals(2, postings.freq());
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, postings.nextDoc());
  // termsenum reuse (FREQS)
  TermsEnum termsEnum = getOnlyLeafReader(reader).terms("foo").iterator();
  termsEnum.seekExact(new BytesRef("bar"));
  PostingsEnum postings2 = termsEnum.postings(postings);
  assertNotNull(postings2);
  assertReused("foo", postings, postings2);
  // and it had better work
  assertEquals(-1, postings2.docID());
  assertEquals(0, postings2.nextDoc());
  assertEquals(2, postings2.freq());
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, postings2.nextDoc());
  // asking for docs only: ok
  PostingsEnum docsOnly = termsEnum.postings(null, PostingsEnum.NONE);
  assertEquals(-1, docsOnly.docID());
  assertEquals(0, docsOnly.nextDoc());
  // we don't define what it is, but if it's something else, we should look into it?
  assertTrue(docsOnly.freq() == 1 || docsOnly.freq() == 2);
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsOnly.nextDoc());
  // reuse that too
  PostingsEnum docsOnly2 = termsEnum.postings(docsOnly, PostingsEnum.NONE);
  assertNotNull(docsOnly2);
  assertReused("foo", docsOnly, docsOnly2);
  // and it had better work
  assertEquals(-1, docsOnly2.docID());
  assertEquals(0, docsOnly2.nextDoc());
  // we don't define what it is, but if it's something else, we should look into it?
  assertTrue(docsOnly2.freq() == 1 || docsOnly2.freq() == 2);
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsOnly2.nextDoc());
  // asking for positions, ok
  PostingsEnum docsAndPositionsEnum = getOnlyLeafReader(reader).postings(new Term("foo", "bar"), PostingsEnum.POSITIONS);
  assertEquals(-1, docsAndPositionsEnum.docID());
  assertEquals(0, docsAndPositionsEnum.nextDoc());
  assertEquals(2, docsAndPositionsEnum.freq());
  assertEquals(0, docsAndPositionsEnum.nextPosition());
  // we don't define what it is, but if it's something else, we should look into it?
  assertTrue(docsAndPositionsEnum.startOffset() == -1 || docsAndPositionsEnum.startOffset() == 0);
  assertTrue(docsAndPositionsEnum.endOffset() == -1 || docsAndPositionsEnum.endOffset() == 3);
  // we don't define what it is, but if it's something else, we should look into it?
  assertTrue(docsAndPositionsEnum.getPayload() == null || new BytesRef("pay1").equals(docsAndPositionsEnum.getPayload()));
  assertEquals(1, docsAndPositionsEnum.nextPosition());
  // we don't define what it is, but if it's something else, we should look into it?
  assertTrue(docsAndPositionsEnum.startOffset() == -1 || docsAndPositionsEnum.startOffset() == 4);
  assertTrue(docsAndPositionsEnum.endOffset() == -1 || docsAndPositionsEnum.endOffset() == 7);
  // we don't define what it is, but if it's something else, we should look into it?
  assertTrue(docsAndPositionsEnum.getPayload() == null || new BytesRef("pay2").equals(docsAndPositionsEnum.getPayload()));
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsAndPositionsEnum.nextDoc());
  // now reuse the positions
  PostingsEnum docsAndPositionsEnum2 = termsEnum.postings(docsAndPositionsEnum, PostingsEnum.POSITIONS);
  assertReused("foo", docsAndPositionsEnum, docsAndPositionsEnum2);
  assertEquals(-1, docsAndPositionsEnum2.docID());
  assertEquals(0, docsAndPositionsEnum2.nextDoc());
  assertEquals(2, docsAndPositionsEnum2.freq());
  assertEquals(0, docsAndPositionsEnum2.nextPosition());
  // we don't define what it is, but if it's something else, we should look into it?
  assertTrue(docsAndPositionsEnum2.startOffset() == -1 || docsAndPositionsEnum2.startOffset() == 0);
  assertTrue(docsAndPositionsEnum2.endOffset() == -1 || docsAndPositionsEnum2.endOffset() == 3);
  // we don't define what it is, but if it's something else, we should look into it?
  assertTrue(docsAndPositionsEnum2.getPayload() == null || new BytesRef("pay1").equals(docsAndPositionsEnum2.getPayload()));
  assertEquals(1, docsAndPositionsEnum2.nextPosition());
  // we don't define what it is, but if it's something else, we should look into it?
  assertTrue(docsAndPositionsEnum2.startOffset() == -1 || docsAndPositionsEnum2.startOffset() == 4);
  assertTrue(docsAndPositionsEnum2.endOffset() == -1 || docsAndPositionsEnum2.endOffset() == 7);
  // we don't define what it is, but if it's something else, we should look into it?
  assertTrue(docsAndPositionsEnum2.getPayload() == null || new BytesRef("pay2").equals(docsAndPositionsEnum2.getPayload()));
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsAndPositionsEnum2.nextDoc());
  // payloads
  docsAndPositionsEnum = getOnlyLeafReader(reader).postings(new Term("foo", "bar"), PostingsEnum.PAYLOADS);
  assertNotNull(docsAndPositionsEnum);
  assertEquals(-1, docsAndPositionsEnum.docID());
  assertEquals(0, docsAndPositionsEnum.nextDoc());
  assertEquals(2, docsAndPositionsEnum.freq());
  assertEquals(0, docsAndPositionsEnum.nextPosition());
  // we don't define what it is, but if it's something else, we should look into it?
  assertTrue(docsAndPositionsEnum.startOffset() == -1 || docsAndPositionsEnum.startOffset() == 0);
  assertTrue(docsAndPositionsEnum.endOffset() == -1 || docsAndPositionsEnum.endOffset() == 3);
  assertEquals(new BytesRef("pay1"), docsAndPositionsEnum.getPayload());
  assertEquals(1, docsAndPositionsEnum.nextPosition());
  // we don't define what it is, but if it's something else, we should look into it?
  assertTrue(docsAndPositionsEnum.startOffset() == -1 || docsAndPositionsEnum.startOffset() == 4);
  assertTrue(docsAndPositionsEnum.endOffset() == -1 || docsAndPositionsEnum.endOffset() == 7);
  assertEquals(new BytesRef("pay2"), docsAndPositionsEnum.getPayload());
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsAndPositionsEnum.nextDoc());
  // reuse
  docsAndPositionsEnum2 = termsEnum.postings(docsAndPositionsEnum, PostingsEnum.PAYLOADS);
  assertReused("foo", docsAndPositionsEnum, docsAndPositionsEnum2);
  assertEquals(-1, docsAndPositionsEnum2.docID());
  assertEquals(0, docsAndPositionsEnum2.nextDoc());
  assertEquals(2, docsAndPositionsEnum2.freq());
  assertEquals(0, docsAndPositionsEnum2.nextPosition());
  // we don't define what it is, but if it's something else, we should look into it?
  assertTrue(docsAndPositionsEnum2.startOffset() == -1 || docsAndPositionsEnum2.startOffset() == 0);
  assertTrue(docsAndPositionsEnum2.endOffset() == -1 || docsAndPositionsEnum2.endOffset() == 3);
  assertEquals(new BytesRef("pay1"), docsAndPositionsEnum2.getPayload());
  assertEquals(1, docsAndPositionsEnum2.nextPosition());
  // we don't define what it is, but if it's something else, we should look into it?
  assertTrue(docsAndPositionsEnum2.startOffset() == -1 || docsAndPositionsEnum2.startOffset() == 4);
  assertTrue(docsAndPositionsEnum2.endOffset() == -1 || docsAndPositionsEnum2.endOffset() == 7);
  assertEquals(new BytesRef("pay2"), docsAndPositionsEnum2.getPayload());
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsAndPositionsEnum2.nextDoc());
  // offsets
  docsAndPositionsEnum = getOnlyLeafReader(reader).postings(new Term("foo", "bar"), PostingsEnum.OFFSETS);
  assertNotNull(docsAndPositionsEnum);
  assertEquals(-1, docsAndPositionsEnum.docID());
  assertEquals(0, docsAndPositionsEnum.nextDoc());
  assertEquals(2, docsAndPositionsEnum.freq());
  assertEquals(0, docsAndPositionsEnum.nextPosition());
  assertEquals(0, docsAndPositionsEnum.startOffset());
  assertEquals(3, docsAndPositionsEnum.endOffset());
  // we don't define what it is, but if it's something else, we should look into it?
  assertTrue(docsAndPositionsEnum.getPayload() == null || new BytesRef("pay1").equals(docsAndPositionsEnum.getPayload()));
  assertEquals(1, docsAndPositionsEnum.nextPosition());
  assertEquals(4, docsAndPositionsEnum.startOffset());
  assertEquals(7, docsAndPositionsEnum.endOffset());
  // we don't define what it is, but if it's something else, we should look into it?
  assertTrue(docsAndPositionsEnum.getPayload() == null || new BytesRef("pay2").equals(docsAndPositionsEnum.getPayload()));
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsAndPositionsEnum.nextDoc());
  // reuse
  docsAndPositionsEnum2 = termsEnum.postings(docsAndPositionsEnum, PostingsEnum.OFFSETS);
  assertReused("foo", docsAndPositionsEnum, docsAndPositionsEnum2);
  assertEquals(-1, docsAndPositionsEnum2.docID());
  assertEquals(0, docsAndPositionsEnum2.nextDoc());
  assertEquals(2, docsAndPositionsEnum2.freq());
  assertEquals(0, docsAndPositionsEnum2.nextPosition());
  assertEquals(0, docsAndPositionsEnum2.startOffset());
  assertEquals(3, docsAndPositionsEnum2.endOffset());
  // we don't define what it is, but if it's something else, we should look into it?
  assertTrue(docsAndPositionsEnum2.getPayload() == null || new BytesRef("pay1").equals(docsAndPositionsEnum2.getPayload()));
  assertEquals(1, docsAndPositionsEnum2.nextPosition());
  assertEquals(4, docsAndPositionsEnum2.startOffset());
  assertEquals(7, docsAndPositionsEnum2.endOffset());
  // we don't define what it is, but if it's something else, we should look into it?
  assertTrue(docsAndPositionsEnum2.getPayload() == null || new BytesRef("pay2").equals(docsAndPositionsEnum2.getPayload()));
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsAndPositionsEnum2.nextDoc());
  // everything (offsets + payloads)
  docsAndPositionsEnum = getOnlyLeafReader(reader).postings(new Term("foo", "bar"), PostingsEnum.ALL);
  assertNotNull(docsAndPositionsEnum);
  assertEquals(-1, docsAndPositionsEnum.docID());
  assertEquals(0, docsAndPositionsEnum.nextDoc());
  assertEquals(2, docsAndPositionsEnum.freq());
  assertEquals(0, docsAndPositionsEnum.nextPosition());
  assertEquals(0, docsAndPositionsEnum.startOffset());
  assertEquals(3, docsAndPositionsEnum.endOffset());
  assertEquals(new BytesRef("pay1"), docsAndPositionsEnum.getPayload());
  assertEquals(1, docsAndPositionsEnum.nextPosition());
  assertEquals(4, docsAndPositionsEnum.startOffset());
  assertEquals(7, docsAndPositionsEnum.endOffset());
  assertEquals(new BytesRef("pay2"), docsAndPositionsEnum.getPayload());
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsAndPositionsEnum.nextDoc());
  // reuse
  docsAndPositionsEnum2 = termsEnum.postings(docsAndPositionsEnum, PostingsEnum.ALL);
  assertReused("foo", docsAndPositionsEnum, docsAndPositionsEnum2);
  assertEquals(-1, docsAndPositionsEnum2.docID());
  assertEquals(0, docsAndPositionsEnum2.nextDoc());
  assertEquals(2, docsAndPositionsEnum2.freq());
  assertEquals(0, docsAndPositionsEnum2.nextPosition());
  assertEquals(0, docsAndPositionsEnum2.startOffset());
  assertEquals(3, docsAndPositionsEnum2.endOffset());
  assertEquals(new BytesRef("pay1"), docsAndPositionsEnum2.getPayload());
  assertEquals(1, docsAndPositionsEnum2.nextPosition());
  assertEquals(4, docsAndPositionsEnum2.startOffset());
  assertEquals(7, docsAndPositionsEnum2.endOffset());
  assertEquals(new BytesRef("pay2"), docsAndPositionsEnum2.getPayload());
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsAndPositionsEnum2.nextDoc());
  iw.close();
  reader.close();
  dir.close();
}
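
The test calls an assertReused helper that this excerpt does not include. A minimal sketch of what such a helper could look like, purely as an assumption (the real one in BasePostingsFormatTestCase may apply extra codec-specific checks):

// hypothetical sketch of the assertReused helper referenced above; not the
// actual lucene-solr implementation. Reuse is optional for codecs, so a
// different concrete class just means "no reuse"; the same concrete class,
// however, should mean the codec handed back the very instance we passed in.
protected static void assertReused(String field, PostingsEnum prev, PostingsEnum next) {
  if (prev != null && prev.getClass() == next.getClass()) {
    assertSame("codec should reuse the " + prev.getClass().getSimpleName() + " for field " + field, prev, next);
  }
}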
Use of org.apache.lucene.analysis.Token in project lucene-solr by apache.
From the class ConjunctionSolrSpellChecker, the method mergeCheckers, which interleaves the suggestions of several spell checkers into one combined SpellingResult:
// TODO: This just interleaves the results. In the future, we might want to let users give each checker its
// own weight and use that in combination with score and frequency to sort the results.
private SpellingResult mergeCheckers(SpellingResult[] results, int numSug) {
  Map<Token, Integer> combinedTokenFrequency = new HashMap<>();
  Map<Token, List<LinkedHashMap<String, Integer>>> allSuggestions = new LinkedHashMap<>();
  for (SpellingResult result : results) {
    if (result.getTokenFrequency() != null) {
      combinedTokenFrequency.putAll(result.getTokenFrequency());
    }
    for (Map.Entry<Token, LinkedHashMap<String, Integer>> entry : result.getSuggestions().entrySet()) {
      List<LinkedHashMap<String, Integer>> allForThisToken = allSuggestions.get(entry.getKey());
      if (allForThisToken == null) {
        allForThisToken = new ArrayList<>();
        allSuggestions.put(entry.getKey(), allForThisToken);
      }
      allForThisToken.add(entry.getValue());
    }
  }
  SpellingResult combinedResult = new SpellingResult();
  for (Map.Entry<Token, List<LinkedHashMap<String, Integer>>> entry : allSuggestions.entrySet()) {
    Token original = entry.getKey();
    List<Iterator<Map.Entry<String, Integer>>> corrIters = new ArrayList<>(entry.getValue().size());
    for (LinkedHashMap<String, Integer> corrections : entry.getValue()) {
      corrIters.add(corrections.entrySet().iterator());
    }
    int numberAdded = 0;
    while (numberAdded < numSug) {
      boolean anyData = false;
      for (Iterator<Map.Entry<String, Integer>> iter : corrIters) {
        if (iter.hasNext()) {
          anyData = true;
          Map.Entry<String, Integer> corr = iter.next();
          combinedResult.add(original, corr.getKey(), corr.getValue());
          Integer tokenFrequency = combinedTokenFrequency.get(original);
          combinedResult.addFrequency(original, tokenFrequency == null ? 0 : tokenFrequency);
          if (++numberAdded == numSug) {
            break;
          }
        }
      }
      if (!anyData) {
        if (numberAdded == 0) {
          combinedResult.add(original, Collections.<String>emptyList());
          Integer tokenFrequency = combinedTokenFrequency.get(original);
          combinedResult.addFrequency(original, tokenFrequency == null ? 0 : tokenFrequency);
        }
        break;
      }
    }
  }
  return combinedResult;
}
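
The TODO above hints at weighting each checker instead of plain interleaving. A hypothetical sketch of that idea (class name, method, and scoring rule are assumptions, not project code): tag every suggestion with the weight of the checker that produced it, then order by weight times frequency.

// hypothetical weighted-merge sketch for the TODO above; not part of lucene-solr.
// Each suggestion remembers the weight of the checker that produced it, and the
// merged list is ordered by weight * frequency rather than interleaved.
static class WeightedSuggestion {
  final String suggestion;
  final int frequency;
  final float checkerWeight;
  WeightedSuggestion(String suggestion, int frequency, float checkerWeight) {
    this.suggestion = suggestion;
    this.frequency = frequency;
    this.checkerWeight = checkerWeight;
  }
  // assumed scoring rule: a checker's weight scales the observed frequency
  float score() {
    return checkerWeight * frequency;
  }
}

static List<WeightedSuggestion> rankMerged(List<WeightedSuggestion> merged) {
  merged.sort(Comparator.comparingDouble(WeightedSuggestion::score).reversed());
  return merged;
}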
Use of org.apache.lucene.analysis.Token in project lucene-solr by apache.
From the class SpellCheckCollator, the method getCollation, which splices spelling corrections back into the original query string while keeping track of offset drift:
private String getCollation(String origQuery, List<SpellCheckCorrection> corrections) {
  StringBuilder collation = new StringBuilder(origQuery);
  int offset = 0;
  String corr = "";
  for (int i = 0; i < corrections.size(); i++) {
    SpellCheckCorrection correction = corrections.get(i);
    Token tok = correction.getOriginal();
    // tokens with a position increment of 0 overlap an earlier token; replacing
    // them would create illegal offsets due to previous replacements, so skip them
    if (tok.getPositionIncrement() == 0)
      continue;
    corr = correction.getCorrection();
    boolean addParenthesis = false;
    Character requiredOrProhibited = null;
    int indexOfSpace = corr.indexOf(' ');
    StringBuilder corrSb = new StringBuilder(corr);
    int bump = 1;
    // if the correction contains whitespace (one word was corrected to several),
    // then be sure all of the new words have the same optional/required/prohibited status in the query
    while (indexOfSpace > -1 && indexOfSpace < corr.length() - 1) {
      addParenthesis = true;
      char previousChar = tok.startOffset() > 0 ? origQuery.charAt(tok.startOffset() - 1) : ' ';
      if (previousChar == '-' || previousChar == '+') {
        corrSb.insert(indexOfSpace + bump, previousChar);
        if (requiredOrProhibited == null) {
          requiredOrProhibited = previousChar;
        }
        bump++;
      } else if ((tok.getFlags() & QueryConverter.TERM_IN_BOOLEAN_QUERY_FLAG) == QueryConverter.TERM_IN_BOOLEAN_QUERY_FLAG) {
        corrSb.insert(indexOfSpace + bump, "AND ");
        bump += 4;
      }
      indexOfSpace = correction.getCorrection().indexOf(' ', indexOfSpace + bump);
    }
    int oneForReqOrProhib = 0;
    if (addParenthesis) {
      if (requiredOrProhibited != null) {
        corrSb.insert(0, requiredOrProhibited);
        oneForReqOrProhib++;
      }
      corrSb.insert(0, '(');
      corrSb.append(')');
    }
    corr = corrSb.toString();
    int startIndex = tok.startOffset() + offset - oneForReqOrProhib;
    int endIndex = tok.endOffset() + offset;
    collation.replace(startIndex, endIndex, corr);
    offset += corr.length() - oneForReqOrProhib - (tok.endOffset() - tok.startOffset());
  }
  return collation.toString();
}
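
The offset variable above absorbs the length drift that each replacement introduces, so later corrections can still be located by their original token offsets. A toy illustration of that bookkeeping (made-up query, not project code):

// toy illustration of getCollation's offset arithmetic; not project code
StringBuilder collation = new StringBuilder("holliday in miam");
int offset = 0;
// correction 1: "holliday" sat at original offsets 0..8, corrected to "holiday"
collation.replace(0 + offset, 8 + offset, "holiday");
offset += "holiday".length() - (8 - 0); // drift is now -1
// correction 2: "miam" sat at original offsets 12..16; apply the drift
collation.replace(12 + offset, 16 + offset, "miami");
System.out.println(collation); // prints: holiday in miami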
Use of org.apache.lucene.analysis.Token in project lucene-solr by apache.
From the class SpellingQueryConverter, the method analyze, which runs the analyzer over raw text and materializes every emitted term as a Token:
protected void analyze(Collection<Token> result, String text, int offset, int flagsAttValue) throws IOException {
  TokenStream stream = analyzer.tokenStream("", text);
  // TODO: support custom attributes
  CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
  TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);
  PayloadAttribute payloadAtt = stream.addAttribute(PayloadAttribute.class);
  PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
  OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
  stream.reset();
  while (stream.incrementToken()) {
    Token token = new Token();
    token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
    token.setOffset(offset + offsetAtt.startOffset(), offset + offsetAtt.endOffset());
    // overwriting any flags already set...
    token.setFlags(flagsAttValue);
    token.setType(typeAtt.type());
    token.setPayload(payloadAtt.getPayload());
    token.setPositionIncrement(posIncAtt.getPositionIncrement());
    result.add(token);
  }
  stream.end();
  stream.close();
}
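
A hypothetical call site for the helper above (the input text and the flags value of 0 are assumptions): every emitted Token carries the supplied flags, and its offsets are shifted by the given base offset.

// hypothetical usage of analyze above; inputs are made up for illustration
Collection<Token> tokens = new ArrayList<>();
analyze(tokens, "quick brown fox", 0, 0 /* no flags */);
for (Token t : tokens) {
  System.out.println(t + " [" + t.startOffset() + "," + t.endOffset() + ")");
}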
Use of org.apache.lucene.analysis.Token in project lucene-solr by apache.
From the class DirectSolrSpellCheckerTest, the method test, which expects "fob" to be corrected to "foo" and "super" to produce no suggestions:
@Test
public void test() throws Exception {
  DirectSolrSpellChecker checker = new DirectSolrSpellChecker();
  NamedList<Object> spellchecker = new NamedList<>();
  spellchecker.add("classname", DirectSolrSpellChecker.class.getName());
  spellchecker.add(SolrSpellChecker.FIELD, "teststop");
  // we will try "fob"
  spellchecker.add(DirectSolrSpellChecker.MINQUERYLENGTH, 2);
  SolrCore core = h.getCore();
  checker.init(spellchecker, core);
  RefCounted<SolrIndexSearcher> searcher = core.getSearcher();
  Collection<Token> tokens = queryConverter.convert("fob");
  SpellingOptions spellOpts = new SpellingOptions(tokens, searcher.get().getIndexReader());
  SpellingResult result = checker.getSuggestions(spellOpts);
  assertNotNull("result shouldn't be null", result);
  Map<String, Integer> suggestions = result.get(tokens.iterator().next());
  Map.Entry<String, Integer> entry = suggestions.entrySet().iterator().next();
  assertEquals("fob should be corrected to foo", "foo", entry.getKey());
  assertFalse(entry.getValue() + " equals: " + SpellingResult.NO_FREQUENCY_INFO, entry.getValue() == SpellingResult.NO_FREQUENCY_INFO);
  spellOpts.tokens = queryConverter.convert("super");
  result = checker.getSuggestions(spellOpts);
  assertNotNull("result shouldn't be null", result);
  // look up the freshly converted token, not the stale "fob" token, so this check is meaningful
  suggestions = result.get(spellOpts.tokens.iterator().next());
  assertNull("there should be no suggestions for 'super'", suggestions);
  searcher.decref();
}
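
The test leans on a queryConverter field inherited from the test base class. A minimal stand-in might be wired up like this (an assumption about the harness, not its actual setup):

// hypothetical stand-in for the inherited queryConverter; the real test harness may differ
SpellingQueryConverter queryConverter = new SpellingQueryConverter();
queryConverter.init(new NamedList<Object>());
queryConverter.setAnalyzer(new WhitespaceAnalyzer());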