Use of org.apache.lucene.analysis.MockTokenizer in project lucene-solr by apache.
From the class TestSynonymMapFilter, method testRandom:
public void testRandom() throws Exception {
  final int alphabetSize = TestUtil.nextInt(random(), 2, 7);
  final int docLen = atLeast(3000);
  //final int docLen = 50;
  final String document = getRandomString('a', alphabetSize, docLen);
  if (VERBOSE) {
    System.out.println("TEST: doc=" + document);
  }
  final int numSyn = atLeast(5);
  //final int numSyn = 2;
  final Map<String, OneSyn> synMap = new HashMap<>();
  final List<OneSyn> syns = new ArrayList<>();
  final boolean dedup = random().nextBoolean();
  if (VERBOSE) {
    System.out.println(" dedup=" + dedup);
  }
  b = new SynonymMap.Builder(dedup);
  for (int synIDX = 0; synIDX < numSyn; synIDX++) {
    final String synIn = getRandomString('a', alphabetSize, TestUtil.nextInt(random(), 1, 5)).trim();
    OneSyn s = synMap.get(synIn);
    if (s == null) {
      s = new OneSyn();
      s.in = synIn;
      syns.add(s);
      s.out = new ArrayList<>();
      synMap.put(synIn, s);
      s.keepOrig = random().nextBoolean();
    }
    final String synOut = getRandomString('0', 10, TestUtil.nextInt(random(), 1, 5)).trim();
    s.out.add(synOut);
    add(synIn, synOut, s.keepOrig);
    if (VERBOSE) {
      System.out.println(" syns[" + synIDX + "] = " + s.in + " -> " + s.out + " keepOrig=" + s.keepOrig);
    }
  }
  tokensIn = new MockTokenizer(MockTokenizer.WHITESPACE, true);
  tokensIn.setReader(new StringReader("a"));
  tokensIn.reset();
  assertTrue(tokensIn.incrementToken());
  assertFalse(tokensIn.incrementToken());
  tokensIn.end();
  tokensIn.close();
  tokensOut = new SynonymFilter(tokensIn, b.build(), true);
  termAtt = tokensOut.addAttribute(CharTermAttribute.class);
  posIncrAtt = tokensOut.addAttribute(PositionIncrementAttribute.class);
  posLenAtt = tokensOut.addAttribute(PositionLengthAttribute.class);
  offsetAtt = tokensOut.addAttribute(OffsetAttribute.class);
  if (dedup) {
    pruneDups(syns);
  }
  final String expected = slowSynMatcher(document, syns, 5);
  if (VERBOSE) {
    System.out.println("TEST: expected=" + expected);
  }
  verify(document, expected);
}
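The fields b, tokensIn, tokensOut, termAtt, posIncrAtt, posLenAtt and offsetAtt, as well as the helpers add(...), pruneDups(...), slowSynMatcher(...) and verify(...), belong to the enclosing test class. A minimal, self-contained sketch of the same wiring, building a SynonymMap and feeding a MockTokenizer through a SynonymFilter, might look like this (the synonym entry and the method name are illustrative, not taken from the test):

import java.io.StringReader;

import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.CharsRef;

static void printSynonymExpansion() throws Exception {
  SynonymMap.Builder builder = new SynonymMap.Builder(true);        // true = dedup duplicate rules
  builder.add(new CharsRef("fast"), new CharsRef("quick"), true);   // keepOrig = true
  SynonymMap map = builder.build();
  MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
  tokenizer.setReader(new StringReader("fast car"));
  TokenStream stream = new SynonymFilter(tokenizer, map, true);     // true = ignoreCase
  CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
  stream.reset();
  while (stream.incrementToken()) {
    // prints the original token and the injected synonym for "fast", then "car"
    System.out.println(term.toString());
  }
  stream.end();
  stream.close();
}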
Use of org.apache.lucene.analysis.MockTokenizer in project lucene-solr by apache.
From the class TestDocument, method testInvalidFields:
// LUCENE-3616
public void testInvalidFields() {
  expectThrows(IllegalArgumentException.class, () -> {
    Tokenizer tok = new MockTokenizer();
    tok.setReader(new StringReader(""));
    new Field("foo", tok, StringField.TYPE_STORED);
  });
}
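The IllegalArgumentException is expected: a Field whose value is a TokenStream must use an indexed, tokenized, non-stored FieldType, and StringField.TYPE_STORED is untokenized and stored. A sketch of the accepting counterpart (illustrative, not part of the test):

Tokenizer tok = new MockTokenizer();
tok.setReader(new StringReader("some text"));
// TextField.TYPE_NOT_STORED is indexed and tokenized, so a TokenStream value is accepted
Field ok = new Field("foo", tok, TextField.TYPE_NOT_STORED);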
Use of org.apache.lucene.analysis.MockTokenizer in project lucene-solr by apache.
From the class TestIndexWriter, method testStopwordsPosIncHole:
// LUCENE-3849
public void testStopwordsPosIncHole() throws Exception {
  Directory dir = newDirectory();
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer();
      TokenStream stream = new MockTokenFilter(tokenizer, MockTokenFilter.ENGLISH_STOPSET);
      return new TokenStreamComponents(tokenizer, stream);
    }
  };
  RandomIndexWriter iw = new RandomIndexWriter(random(), dir, a);
  Document doc = new Document();
  doc.add(new TextField("body", "just a", Field.Store.NO));
  doc.add(new TextField("body", "test of gaps", Field.Store.NO));
  iw.addDocument(doc);
  IndexReader ir = iw.getReader();
  iw.close();
  IndexSearcher is = newSearcher(ir);
  PhraseQuery.Builder builder = new PhraseQuery.Builder();
  builder.add(new Term("body", "just"), 0);
  builder.add(new Term("body", "test"), 2);
  PhraseQuery pq = builder.build();
  // body:"just ? test"
  assertEquals(1, is.search(pq, 5).totalHits);
  ir.close();
  dir.close();
}
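The phrase positions 0 and 2 account for the stopword hole: MockTokenFilter drops "a" but preserves its position increment, and the default position-increment gap between the two field values is 0, so "test" is indexed two positions after "just". The same hole can be observed directly on a token stream (an illustrative fragment, not part of the test, assuming an analyzer a built as above):

try (TokenStream ts = a.tokenStream("body", "just a test")) {
  CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
  PositionIncrementAttribute posIncr = ts.addAttribute(PositionIncrementAttribute.class);
  ts.reset();
  assertTrue(ts.incrementToken());
  assertEquals("just", term.toString());
  assertEquals(1, posIncr.getPositionIncrement());
  assertTrue(ts.incrementToken());
  assertEquals("test", term.toString());
  assertEquals(2, posIncr.getPositionIncrement());   // hole left by the removed "a"
  assertFalse(ts.incrementToken());
  ts.end();
}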
Use of org.apache.lucene.analysis.MockTokenizer in project lucene-solr by apache.
From the class TestIndexWriter, method testIndexStoreCombos:
public void testIndexStoreCombos() throws Exception {
  Directory dir = newDirectory();
  IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));
  byte[] b = new byte[50];
  for (int i = 0; i < 50; i++) b[i] = (byte) (i + 77);
  Document doc = new Document();
  FieldType customType = new FieldType(StoredField.TYPE);
  customType.setTokenized(true);
  Field f = new Field("binary", b, 10, 17, customType);
  // TODO: this is evil, changing the type after creating the field:
  customType.setIndexOptions(IndexOptions.DOCS);
  final MockTokenizer doc1field1 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  doc1field1.setReader(new StringReader("doc1field1"));
  f.setTokenStream(doc1field1);
  FieldType customType2 = new FieldType(TextField.TYPE_STORED);
  Field f2 = newField("string", "value", customType2);
  final MockTokenizer doc1field2 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  doc1field2.setReader(new StringReader("doc1field2"));
  f2.setTokenStream(doc1field2);
  doc.add(f);
  doc.add(f2);
  w.addDocument(doc);
  // add 2 docs to test in-memory merging
  final MockTokenizer doc2field1 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  doc2field1.setReader(new StringReader("doc2field1"));
  f.setTokenStream(doc2field1);
  final MockTokenizer doc2field2 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  doc2field2.setReader(new StringReader("doc2field2"));
  f2.setTokenStream(doc2field2);
  w.addDocument(doc);
  // force segment flush so we can force a segment merge with doc3 later.
  w.commit();
  final MockTokenizer doc3field1 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  doc3field1.setReader(new StringReader("doc3field1"));
  f.setTokenStream(doc3field1);
  final MockTokenizer doc3field2 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  doc3field2.setReader(new StringReader("doc3field2"));
  f2.setTokenStream(doc3field2);
  w.addDocument(doc);
  w.commit();
  // force segment merge.
  w.forceMerge(1);
  w.close();
  IndexReader ir = DirectoryReader.open(dir);
  Document doc2 = ir.document(0);
  IndexableField f3 = doc2.getField("binary");
  b = f3.binaryValue().bytes;
  assertTrue(b != null);
  // the stored slice is 17 bytes starting at offset 10 of the original array, so its first byte is 10 + 77 = 87
  assertEquals(17, b.length);
  assertEquals(87, b[0]);
  assertTrue(ir.document(0).getField("binary").binaryValue() != null);
  assertTrue(ir.document(1).getField("binary").binaryValue() != null);
  assertTrue(ir.document(2).getField("binary").binaryValue() != null);
  assertEquals("value", ir.document(0).get("string"));
  assertEquals("value", ir.document(1).get("string"));
  assertEquals("value", ir.document(2).get("string"));
  // test that the terms were indexed.
  assertTrue(TestUtil.docs(random(), ir, "binary", new BytesRef("doc1field1"), null, PostingsEnum.NONE).nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  assertTrue(TestUtil.docs(random(), ir, "binary", new BytesRef("doc2field1"), null, PostingsEnum.NONE).nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  assertTrue(TestUtil.docs(random(), ir, "binary", new BytesRef("doc3field1"), null, PostingsEnum.NONE).nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  assertTrue(TestUtil.docs(random(), ir, "string", new BytesRef("doc1field2"), null, PostingsEnum.NONE).nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  assertTrue(TestUtil.docs(random(), ir, "string", new BytesRef("doc2field2"), null, PostingsEnum.NONE).nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  assertTrue(TestUtil.docs(random(), ir, "string", new BytesRef("doc3field2"), null, PostingsEnum.NONE).nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  ir.close();
  dir.close();
}
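TestUtil.docs is a test-framework helper; the same term-existence check can be expressed against the public postings API. An illustrative helper (not part of the test) that assumes a single-segment reader, which holds here after forceMerge(1):

private static boolean hasTerm(IndexReader ir, String field, String text) throws IOException {
  // only one leaf to consult after forceMerge(1)
  LeafReader leaf = ir.leaves().get(0).reader();
  PostingsEnum postings = leaf.postings(new Term(field, text), PostingsEnum.NONE);
  return postings != null && postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS;
}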
Use of org.apache.lucene.analysis.MockTokenizer in project lucene-solr by apache.
From the class TestGermanStemFilter, method testKeyword:
public void testKeyword() throws IOException {
  final CharArraySet exclusionSet = new CharArraySet(asSet("sängerinnen"), false);
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
      return new TokenStreamComponents(source, new GermanStemFilter(sink));
    }
  };
  checkOneTerm(a, "sängerinnen", "sängerinnen");
  a.close();
}
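checkOneTerm comes from BaseTokenStreamTestCase: it asserts that the analyzer emits exactly one token with the given text, showing that SetKeywordMarkerFilter keeps the excluded word from being stemmed. It is roughly equivalent to:

assertAnalyzesTo(a, "sängerinnen", new String[] { "sängerinnen" });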