Use of org.apache.lucene.analysis.Token in project lucene-solr by apache.
From the class ShingleFilterTest, method testTwoTrailingHolesTriShingleWithTokenFiller:
public void testTwoTrailingHolesTriShingleWithTokenFiller() throws IOException {
  // Analyzing "purple wizard of the", where "of" and "the" are removed as
  // stopwords, leaving two trailing holes:
  Token[] inputTokens = new Token[] { createToken("purple", 0, 6), createToken("wizard", 7, 13) };
  ShingleFilter filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
  filter.setFillerToken("--");
  assertTokenStreamContents(filter,
      new String[] { "purple", "purple wizard", "purple wizard --", "wizard", "wizard --", "wizard -- --" },
      new int[] { 0, 0, 0, 7, 7, 7 },
      new int[] { 6, 13, 20, 13, 20, 20 },
      new int[] { 1, 0, 0, 1, 0, 0 },
      20);
  filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
  filter.setFillerToken("");
  assertTokenStreamContents(filter,
      new String[] { "purple", "purple wizard", "purple wizard ", "wizard", "wizard ", "wizard " },
      new int[] { 0, 0, 0, 7, 7, 7 },
      new int[] { 6, 13, 20, 13, 20, 20 },
      new int[] { 1, 0, 0, 1, 0, 0 },
      20);
  filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
  filter.setFillerToken(null);
  assertTokenStreamContents(filter,
      new String[] { "purple", "purple wizard", "purple wizard ", "wizard", "wizard ", "wizard " },
      new int[] { 0, 0, 0, 7, 7, 7 },
      new int[] { 6, 13, 20, 13, 20, 20 },
      new int[] { 1, 0, 0, 1, 0, 0 },
      20);
  filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
  filter.setFillerToken(null);
  filter.setTokenSeparator(null);
  assertTokenStreamContents(filter,
      new String[] { "purple", "purplewizard", "purplewizard", "wizard", "wizard", "wizard" },
      new int[] { 0, 0, 0, 7, 7, 7 },
      new int[] { 6, 13, 20, 13, 20, 20 },
      new int[] { 1, 0, 0, 1, 0, 0 },
      20);
}
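For context, a minimal sketch (assumed setup, not part of the test) of how ShingleFilter is typically wired into an analyzer chain; the WhitespaceTokenizer is illustrative, while the 2/3 shingle sizes and the "--" filler mirror the calls above:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.shingle.ShingleFilter;

Analyzer shingleAnalyzer = new Analyzer() {
  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer source = new WhitespaceTokenizer();
    // emit unigrams plus 2- and 3-gram shingles, as in new ShingleFilter(input, 2, 3) above
    ShingleFilter shingles = new ShingleFilter(source, 2, 3);
    // placeholder written into shingles where removed stopwords left position holes
    shingles.setFillerToken("--");
    return new TokenStreamComponents(source, shingles);
  }
};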
Use of org.apache.lucene.analysis.Token in project lucene-solr by apache.
From the class WordBreakSolrSpellCheckerTest, method testStandAlone:
@Test
public void testStandAlone() throws Exception {
  SolrCore core = h.getCore();
  WordBreakSolrSpellChecker checker = new WordBreakSolrSpellChecker();
  NamedList<String> params = new NamedList<>();
  params.add("field", "lowerfilt");
  params.add(WordBreakSolrSpellChecker.PARAM_BREAK_WORDS, "true");
  params.add(WordBreakSolrSpellChecker.PARAM_COMBINE_WORDS, "true");
  params.add(WordBreakSolrSpellChecker.PARAM_MAX_CHANGES, "10");
  checker.init(params, core);
  RefCounted<SolrIndexSearcher> searcher = core.getSearcher();
  QueryConverter qc = new SpellingQueryConverter();
  qc.setAnalyzer(new MockAnalyzer(random()));
  {
    // Prior to SOLR-8175, the required term would cause an AIOOBE.
    Collection<Token> tokens = qc.convert("+pine apple good ness");
    SpellingOptions spellOpts = new SpellingOptions(tokens, searcher.get().getIndexReader(), 10);
    SpellingResult result = checker.getSuggestions(spellOpts);
    searcher.decref();
    assertTrue(result != null && result.getSuggestions() != null);
    assertTrue(result.getSuggestions().size() == 5);
  }
  Collection<Token> tokens = qc.convert("paintable pine apple good ness");
  SpellingOptions spellOpts = new SpellingOptions(tokens, searcher.get().getIndexReader(), 10);
  SpellingResult result = checker.getSuggestions(spellOpts);
  searcher.decref();
  assertTrue(result != null && result.getSuggestions() != null);
  assertTrue(result.getSuggestions().size() == 9);
  for (Map.Entry<Token, LinkedHashMap<String, Integer>> s : result.getSuggestions().entrySet()) {
    Token orig = s.getKey();
    String[] corr = s.getValue().keySet().toArray(new String[0]);
    if (orig.toString().equals("paintable")) {
      assertTrue(orig.startOffset() == 0);
      assertTrue(orig.endOffset() == 9);
      assertTrue(orig.length() == 9);
      assertTrue(corr.length == 3);
      // 1 op; max doc freq=5
      assertTrue(corr[0].equals("paint able"));
      // 1 op; max doc freq=2
      assertTrue(corr[1].equals("pain table"));
      // 2 ops
      assertTrue(corr[2].equals("pa in table"));
    } else if (orig.toString().equals("pine apple")) {
      assertTrue(orig.startOffset() == 10);
      assertTrue(orig.endOffset() == 20);
      assertTrue(orig.length() == 10);
      assertTrue(corr.length == 1);
      assertTrue(corr[0].equals("pineapple"));
    } else if (orig.toString().equals("paintable pine")) {
      assertTrue(orig.startOffset() == 0);
      assertTrue(orig.endOffset() == 14);
      assertTrue(orig.length() == 14);
      assertTrue(corr.length == 1);
      assertTrue(corr[0].equals("paintablepine"));
    } else if (orig.toString().equals("good ness")) {
      assertTrue(orig.startOffset() == 21);
      assertTrue(orig.endOffset() == 30);
      assertTrue(orig.length() == 9);
      assertTrue(corr.length == 1);
      assertTrue(corr[0].equals("goodness"));
    } else if (orig.toString().equals("pine apple good ness")) {
      assertTrue(orig.startOffset() == 10);
      assertTrue(orig.endOffset() == 30);
      assertTrue(orig.length() == 20);
      assertTrue(corr.length == 1);
      assertTrue(corr[0].equals("pineapplegoodness"));
    } else if (orig.toString().equals("pine")) {
      assertTrue(orig.startOffset() == 10);
      assertTrue(orig.endOffset() == 14);
      assertTrue(orig.length() == 4);
      assertTrue(corr.length == 1);
      assertTrue(corr[0].equals("pi ne"));
    } else if (orig.toString().equals("apple")) {
      assertTrue(orig.startOffset() == 15);
      assertTrue(orig.endOffset() == 20);
      assertTrue(orig.length() == 5);
      assertTrue(corr.length == 0);
    } else if (orig.toString().equals("good")) {
      assertTrue(orig.startOffset() == 21);
      assertTrue(orig.endOffset() == 25);
      assertTrue(orig.length() == 4);
      assertTrue(corr.length == 0);
    } else if (orig.toString().equals("ness")) {
      assertTrue(orig.startOffset() == 26);
      assertTrue(orig.endOffset() == 30);
      assertTrue(orig.length() == 4);
      assertTrue(corr.length == 0);
    } else {
      fail("Unexpected original result: " + orig);
    }
  }
}
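The offset assertions above are what make these suggestions applicable back to the original query string. A minimal sketch (hypothetical helper, not part of the test) that splices a suggestion into the query using the token's offsets:

// Hypothetical helper: replace the original token's span with a suggestion.
static String applySuggestion(String query, Token original, String suggestion) {
  return query.substring(0, original.startOffset()) + suggestion + query.substring(original.endOffset());
}
// e.g. for the "pine apple" token above (offsets 10-20):
// applySuggestion("paintable pine apple good ness", orig, "pineapple")
// -> "paintable pineapple good ness"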
Use of org.apache.lucene.analysis.Token in project lucene-solr by apache.
From the class ShingleFilterTest, method createToken:
private static Token createToken(String term, int start, int offset) {
  // convenience overload called by the snippets above; position increment defaults to 1
  return createToken(term, start, offset, 1);
}

private static Token createToken(String term, int start, int offset, int positionIncrement) {
  Token token = new Token();
  token.setOffset(start, offset); // here "offset" is the token's end offset
  token.copyBuffer(term.toCharArray(), 0, term.length());
  token.setPositionIncrement(positionIncrement);
  return token;
}
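Since Token also offers a (text, startOffset, endOffset) constructor (used in the TestAsciiFoldingFilterFactory snippet below), an equivalent sketch of the same helper would be:

private static Token createToken(String term, int start, int end, int positionIncrement) {
  Token token = new Token(term, start, end); // sets the term text and both offsets in one call
  token.setPositionIncrement(positionIncrement);
  return token;
}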
Use of org.apache.lucene.analysis.Token in project lucene-solr by apache.
From the class TestAsciiFoldingFilterFactory, method testMultiTermAnalysis:
public void testMultiTermAnalysis() throws IOException {
  TokenFilterFactory factory = new ASCIIFoldingFilterFactory(Collections.emptyMap());
  TokenStream stream = new CannedTokenStream(new Token("Été", 0, 3));
  stream = factory.create(stream);
  assertTokenStreamContents(stream, new String[] { "Ete" });
  factory = (TokenFilterFactory) ((MultiTermAwareComponent) factory).getMultiTermComponent();
  stream = new CannedTokenStream(new Token("Été", 0, 3));
  stream = factory.create(stream);
  assertTokenStreamContents(stream, new String[] { "Ete" });
  factory = new ASCIIFoldingFilterFactory(new HashMap<>(Collections.singletonMap("preserveOriginal", "true")));
  stream = new CannedTokenStream(new Token("Été", 0, 3));
  stream = factory.create(stream);
  assertTokenStreamContents(stream, new String[] { "Ete", "Été" });
  factory = (TokenFilterFactory) ((MultiTermAwareComponent) factory).getMultiTermComponent();
  stream = new CannedTokenStream(new Token("Été", 0, 3));
  stream = factory.create(stream);
  assertTokenStreamContents(stream, new String[] { "Ete" });
}
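The factory here wraps ASCIIFoldingFilter; a minimal sketch (assumed analyzer setup, not from the test) of using the filter directly, with preserveOriginal enabled as in the third case above:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;

Analyzer foldingAnalyzer = new Analyzer() {
  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer source = new WhitespaceTokenizer();
    // preserveOriginal=true keeps "Été" alongside the folded "Ete", matching the assertion above
    TokenStream sink = new ASCIIFoldingFilter(source, true);
    return new TokenStreamComponents(source, sink);
  }
};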
Use of org.apache.lucene.analysis.Token in project lucene-solr by apache.
From the class TestPostingsOffsets, method testRandom:
public void testRandom() throws Exception {
  // token -> docID -> tokens
  final Map<String, Map<Integer, List<Token>>> actualTokens = new HashMap<>();
  Directory dir = newDirectory();
  // iwc is an IndexWriterConfig field initialized elsewhere in the test class (not shown here)
  RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
  final int numDocs = atLeast(20);
  //final int numDocs = atLeast(5);
  FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
  // TODO: randomize what IndexOptions we use; also test
  // changing this up in one IW buffered segment...:
  ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
  if (random().nextBoolean()) {
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorOffsets(random().nextBoolean());
    ft.setStoreTermVectorPositions(random().nextBoolean());
  }
  for (int docCount = 0; docCount < numDocs; docCount++) {
    Document doc = new Document();
    doc.add(new NumericDocValuesField("id", docCount));
    List<Token> tokens = new ArrayList<>();
    final int numTokens = atLeast(100);
    //final int numTokens = atLeast(20);
    int pos = -1;
    int offset = 0;
    //System.out.println("doc id=" + docCount);
    for (int tokenCount = 0; tokenCount < numTokens; tokenCount++) {
      final String text;
      if (random().nextBoolean()) {
        text = "a";
      } else if (random().nextBoolean()) {
        text = "b";
      } else if (random().nextBoolean()) {
        text = "c";
      } else {
        text = "d";
      }
      int posIncr = random().nextBoolean() ? 1 : random().nextInt(5);
      if (tokenCount == 0 && posIncr == 0) {
        posIncr = 1;
      }
      final int offIncr = random().nextBoolean() ? 0 : random().nextInt(5);
      final int tokenOffset = random().nextInt(5);
      final Token token = makeToken(text, posIncr, offset + offIncr, offset + offIncr + tokenOffset);
      if (!actualTokens.containsKey(text)) {
        actualTokens.put(text, new HashMap<Integer, List<Token>>());
      }
      final Map<Integer, List<Token>> postingsByDoc = actualTokens.get(text);
      if (!postingsByDoc.containsKey(docCount)) {
        postingsByDoc.put(docCount, new ArrayList<Token>());
      }
      postingsByDoc.get(docCount).add(token);
      tokens.add(token);
      pos += posIncr;
      // stuff abs position into type:
      token.setType("" + pos);
      offset += offIncr + tokenOffset;
      //System.out.println(" " + token + " posIncr=" + token.getPositionIncrement() + " pos=" + pos + " off=" + token.startOffset() + "/" + token.endOffset() + " (freq=" + postingsByDoc.get(docCount).size() + ")");
    }
    doc.add(new Field("content", new CannedTokenStream(tokens.toArray(new Token[tokens.size()])), ft));
    w.addDocument(doc);
  }
  final DirectoryReader r = w.getReader();
  w.close();
  final String[] terms = new String[] { "a", "b", "c", "d" };
  for (LeafReaderContext ctx : r.leaves()) {
    // TODO: improve this
    LeafReader sub = ctx.reader();
    //System.out.println("\nsub=" + sub);
    final TermsEnum termsEnum = sub.fields().terms("content").iterator();
    PostingsEnum docs = null;
    PostingsEnum docsAndPositions = null;
    PostingsEnum docsAndPositionsAndOffsets = null;
    int[] docIDToID = new int[sub.maxDoc()];
    NumericDocValues values = DocValues.getNumeric(sub, "id");
    for (int i = 0; i < sub.maxDoc(); i++) {
      assertEquals(i, values.nextDoc());
      docIDToID[i] = (int) values.longValue();
    }
    for (String term : terms) {
      //System.out.println(" term=" + term);
      if (termsEnum.seekExact(new BytesRef(term))) {
        docs = termsEnum.postings(docs);
        assertNotNull(docs);
        int doc;
        //System.out.println(" doc/freq");
        while ((doc = docs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
          final List<Token> expected = actualTokens.get(term).get(docIDToID[doc]);
          //System.out.println(" doc=" + docIDToID[doc] + " docID=" + doc + " " + expected.size() + " freq");
          assertNotNull(expected);
          assertEquals(expected.size(), docs.freq());
        }
        // now also check positions (PostingsEnum.ALL requests positions and offsets)
        docsAndPositions = termsEnum.postings(docsAndPositions, PostingsEnum.ALL);
        assertNotNull(docsAndPositions);
        //System.out.println(" doc/freq/pos");
        while ((doc = docsAndPositions.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
          final List<Token> expected = actualTokens.get(term).get(docIDToID[doc]);
          //System.out.println(" doc=" + docIDToID[doc] + " " + expected.size() + " freq");
          assertNotNull(expected);
          assertEquals(expected.size(), docsAndPositions.freq());
          for (Token token : expected) {
            int pos = Integer.parseInt(token.type());
            //System.out.println(" pos=" + pos);
            assertEquals(pos, docsAndPositions.nextPosition());
          }
        }
        docsAndPositionsAndOffsets = termsEnum.postings(docsAndPositions, PostingsEnum.ALL);
        assertNotNull(docsAndPositionsAndOffsets);
        //System.out.println(" doc/freq/pos/offs");
        while ((doc = docsAndPositionsAndOffsets.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
          final List<Token> expected = actualTokens.get(term).get(docIDToID[doc]);
          //System.out.println(" doc=" + docIDToID[doc] + " " + expected.size() + " freq");
          assertNotNull(expected);
          assertEquals(expected.size(), docsAndPositionsAndOffsets.freq());
          for (Token token : expected) {
            int pos = Integer.parseInt(token.type());
            //System.out.println(" pos=" + pos);
            assertEquals(pos, docsAndPositionsAndOffsets.nextPosition());
            assertEquals(token.startOffset(), docsAndPositionsAndOffsets.startOffset());
            assertEquals(token.endOffset(), docsAndPositionsAndOffsets.endOffset());
          }
        }
      }
    }
    // TODO: test advance:
  }
  r.close();
  dir.close();
}
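The snippet relies on a makeToken helper that is not shown in this excerpt; a minimal version consistent with its call sites above (text, position increment, start offset, end offset) would be:

private Token makeToken(String text, int posIncr, int startOffset, int endOffset) {
  final Token t = new Token();
  t.append(text); // set the term text
  t.setPositionIncrement(posIncr);
  t.setOffset(startOffset, endOffset);
  return t;
}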