Use of org.apache.lucene.analysis.TokenFilter in project lucene-solr by apache.
Class TestBlockPostingsFormat3, method test.
// creates 8 fields with different options and does "duels" of fields against each other
public void test() throws Exception {
  Directory dir = newDirectory();
  Analyzer analyzer = new Analyzer(Analyzer.PER_FIELD_REUSE_STRATEGY) {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer();
      if (fieldName.contains("payloadsFixed")) {
        TokenFilter filter = new MockFixedLengthPayloadFilter(new Random(0), tokenizer, 1);
        return new TokenStreamComponents(tokenizer, filter);
      } else if (fieldName.contains("payloadsVariable")) {
        TokenFilter filter = new MockVariableLengthPayloadFilter(new Random(0), tokenizer);
        return new TokenStreamComponents(tokenizer, filter);
      } else {
        return new TokenStreamComponents(tokenizer);
      }
    }
  };
  IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
  iwc.setCodec(TestUtil.alwaysPostingsFormat(new Lucene50PostingsFormat()));
  // TODO we could actually add more fields implemented with different PFs
  // or, just put this test into the usual rotation?
  RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
  Document doc = new Document();
  FieldType docsOnlyType = new FieldType(TextField.TYPE_NOT_STORED);
  // turn this on for a cross-check
  docsOnlyType.setStoreTermVectors(true);
  docsOnlyType.setIndexOptions(IndexOptions.DOCS);
  FieldType docsAndFreqsType = new FieldType(TextField.TYPE_NOT_STORED);
  // turn this on for a cross-check
  docsAndFreqsType.setStoreTermVectors(true);
  docsAndFreqsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
  FieldType positionsType = new FieldType(TextField.TYPE_NOT_STORED);
  // turn these on for a cross-check
  positionsType.setStoreTermVectors(true);
  positionsType.setStoreTermVectorPositions(true);
  positionsType.setStoreTermVectorOffsets(true);
  positionsType.setStoreTermVectorPayloads(true);
  FieldType offsetsType = new FieldType(positionsType);
  offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
  Field field1 = new Field("field1docs", "", docsOnlyType);
  Field field2 = new Field("field2freqs", "", docsAndFreqsType);
  Field field3 = new Field("field3positions", "", positionsType);
  Field field4 = new Field("field4offsets", "", offsetsType);
  Field field5 = new Field("field5payloadsFixed", "", positionsType);
  Field field6 = new Field("field6payloadsVariable", "", positionsType);
  Field field7 = new Field("field7payloadsFixedOffsets", "", offsetsType);
  Field field8 = new Field("field8payloadsVariableOffsets", "", offsetsType);
  doc.add(field1);
  doc.add(field2);
  doc.add(field3);
  doc.add(field4);
  doc.add(field5);
  doc.add(field6);
  doc.add(field7);
  doc.add(field8);
  for (int i = 0; i < MAXDOC; i++) {
    String stringValue = Integer.toString(i) + " verycommon " + English.intToEnglish(i).replace('-', ' ') + " " + TestUtil.randomSimpleString(random());
    field1.setStringValue(stringValue);
    field2.setStringValue(stringValue);
    field3.setStringValue(stringValue);
    field4.setStringValue(stringValue);
    field5.setStringValue(stringValue);
    field6.setStringValue(stringValue);
    field7.setStringValue(stringValue);
    field8.setStringValue(stringValue);
    iw.addDocument(doc);
  }
  iw.close();
  verify(dir);
  // for some extra coverage, checkIndex before we forceMerge
  TestUtil.checkIndex(dir);
  iwc = newIndexWriterConfig(analyzer);
  iwc.setCodec(TestUtil.alwaysPostingsFormat(new Lucene50PostingsFormat()));
  iwc.setOpenMode(OpenMode.APPEND);
  IndexWriter iw2 = new IndexWriter(dir, iwc);
  iw2.forceMerge(1);
  iw2.close();
  verify(dir);
  dir.close();
}
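The duel above feeds payloads through MockFixedLengthPayloadFilter and MockVariableLengthPayloadFilter, both TokenFilters that stamp a payload onto every token they see. As a rough, hypothetical sketch of that pattern (not the mock classes themselves), a fixed-payload filter can be written with a PayloadAttribute like this:

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.util.BytesRef;

// Hypothetical example, not part of lucene-solr: attach the same one-byte
// payload to every token that passes through the filter.
final class FixedPayloadFilter extends TokenFilter {
  private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
  private final BytesRef payload = new BytesRef(new byte[] { 42 });

  FixedPayloadFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false; // end of stream
    }
    payloadAtt.setPayload(payload); // stamp the payload onto the current token
    return true;
  }
}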
Use of org.apache.lucene.analysis.TokenFilter in project lucene-solr by apache.
Class TestTermAutomatonQuery, method testRandom.
public void testRandom() throws Exception {
  int numDocs = atLeast(100);
  Directory dir = newDirectory();
  // Adds occasional random synonyms:
  Analyzer analyzer = new Analyzer() {
    @Override
    public TokenStreamComponents createComponents(String fieldName) {
      MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true, 100);
      tokenizer.setEnableChecks(true);
      TokenFilter filt = new MockTokenFilter(tokenizer, MockTokenFilter.EMPTY_STOPSET);
      filt = new RandomSynonymFilter(filt);
      return new TokenStreamComponents(tokenizer, filt);
    }
  };
  IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
  RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
  for (int i = 0; i < numDocs; i++) {
    Document doc = new Document();
    int numTokens = atLeast(10);
    StringBuilder sb = new StringBuilder();
    for (int j = 0; j < numTokens; j++) {
      sb.append(' ');
      sb.append((char) (97 + random().nextInt(3)));
    }
    String contents = sb.toString();
    doc.add(newTextField("field", contents, Field.Store.NO));
    doc.add(new StoredField("id", "" + i));
    if (VERBOSE) {
      System.out.println(" doc " + i + " -> " + contents);
    }
    w.addDocument(doc);
  }
  IndexReader r = w.getReader();
  IndexSearcher s = newSearcher(r);
  // Used to match ANY using MultiPhraseQuery:
  Term[] allTerms = new Term[] { new Term("field", "a"), new Term("field", "b"), new Term("field", "c") };
  int numIters = atLeast(1000);
  for (int iter = 0; iter < numIters; iter++) {
    // Build the (finite, no ANY transitions) TermAutomatonQuery and
    // also the "equivalent" BooleanQuery and make sure they match the
    // same docs:
    BooleanQuery.Builder bq = new BooleanQuery.Builder();
    int count = TestUtil.nextInt(random(), 1, 5);
    Set<BytesRef> strings = new HashSet<>();
    for (int i = 0; i < count; i++) {
      StringBuilder sb = new StringBuilder();
      int numTokens = TestUtil.nextInt(random(), 1, 5);
      for (int j = 0; j < numTokens; j++) {
        if (j > 0 && j < numTokens - 1 && random().nextInt(5) == 3) {
          sb.append('*');
        } else {
          sb.append((char) (97 + random().nextInt(3)));
        }
      }
      String string = sb.toString();
      MultiPhraseQuery.Builder mpqb = new MultiPhraseQuery.Builder();
      for (int j = 0; j < string.length(); j++) {
        if (string.charAt(j) == '*') {
          mpqb.add(allTerms);
        } else {
          mpqb.add(new Term("field", "" + string.charAt(j)));
        }
      }
      bq.add(mpqb.build(), BooleanClause.Occur.SHOULD);
      strings.add(new BytesRef(string));
    }
    List<BytesRef> stringsList = new ArrayList<>(strings);
    Collections.sort(stringsList);
    Automaton a = Automata.makeStringUnion(stringsList);
    // Translate automaton to query:
    TermAutomatonQuery q = new TermAutomatonQuery("field");
    int numStates = a.getNumStates();
    for (int i = 0; i < numStates; i++) {
      q.createState();
      q.setAccept(i, a.isAccept(i));
    }
    Transition t = new Transition();
    for (int i = 0; i < numStates; i++) {
      int transCount = a.initTransition(i, t);
      for (int j = 0; j < transCount; j++) {
        a.getNextTransition(t);
        for (int label = t.min; label <= t.max; label++) {
          if ((char) label == '*') {
            q.addAnyTransition(t.source, t.dest);
          } else {
            q.addTransition(t.source, t.dest, "" + (char) label);
          }
        }
      }
    }
    q.finish();
    if (VERBOSE) {
      System.out.println("TEST: iter=" + iter);
      for (BytesRef string : stringsList) {
        System.out.println(" string: " + string.utf8ToString());
      }
      System.out.println(q.toDot());
    }
    Query q1 = q;
    Query q2 = bq.build();
    if (random().nextInt(5) == 1) {
      if (VERBOSE) {
        System.out.println(" use random filter");
      }
      RandomQuery filter = new RandomQuery(random().nextLong(), random().nextFloat());
      q1 = new BooleanQuery.Builder().add(q1, Occur.MUST).add(filter, Occur.FILTER).build();
      q2 = new BooleanQuery.Builder().add(q2, Occur.MUST).add(filter, Occur.FILTER).build();
    }
    TopDocs hits1 = s.search(q1, numDocs);
    TopDocs hits2 = s.search(q2, numDocs);
    Set<String> hits1Docs = toDocIDs(s, hits1);
    Set<String> hits2Docs = toDocIDs(s, hits2);
    try {
      assertEquals(hits2.totalHits, hits1.totalHits);
      assertEquals(hits2Docs, hits1Docs);
    } catch (AssertionError ae) {
      System.out.println("FAILED:");
      for (String id : hits1Docs) {
        if (hits2Docs.contains(id) == false) {
          System.out.println(String.format(Locale.ROOT, " id=%3s matched but should not have", id));
        }
      }
      for (String id : hits2Docs) {
        if (hits1Docs.contains(id) == false) {
          System.out.println(String.format(Locale.ROOT, " id=%3s did not match but should have", id));
        }
      }
      throw ae;
    }
  }
  IOUtils.close(w, r, dir, analyzer);
}
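RandomSynonymFilter used above is a test helper that occasionally stacks a synonym token on top of the current one. A minimal, hypothetical sketch of that stacking pattern (the class name SimpleSynonymFilter and the synonym text are made up, not the test class) captures the current token's attribute state and re-emits it with a zero position increment:

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

// Hypothetical sketch of synonym stacking; not the actual RandomSynonymFilter.
final class SimpleSynonymFilter extends TokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
  private State pending; // saved attribute state of the token we will stack a synonym on

  SimpleSynonymFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (pending != null) {
      restoreState(pending);              // reuse offsets etc. of the original token
      pending = null;
      termAtt.setEmpty().append("alpha"); // made-up synonym text
      posIncAtt.setPositionIncrement(0);  // zero increment = same position as the original
      return true;
    }
    if (!input.incrementToken()) {
      return false;
    }
    if ("a".contentEquals(termAtt)) {
      pending = captureState();           // remember this token so we can emit a synonym next
    }
    return true;
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    pending = null;
  }
}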
Use of org.apache.lucene.analysis.TokenFilter in project lucene-solr by apache.
Class TestPerFieldAnalyzerWrapper, method testReuseWrapped.
public void testReuseWrapped() throws Exception {
  final String text = "Qwerty";
  final Analyzer specialAnalyzer = new SimpleAnalyzer();
  final Analyzer defaultAnalyzer = new WhitespaceAnalyzer();
  TokenStream ts1, ts2, ts3, ts4;
  final PerFieldAnalyzerWrapper wrapper1 = new PerFieldAnalyzerWrapper(defaultAnalyzer, Collections.<String, Analyzer>singletonMap("special", specialAnalyzer));
  // test that the PerFieldWrapper returns the same instance as the original Analyzer:
  ts1 = defaultAnalyzer.tokenStream("something", text);
  ts2 = wrapper1.tokenStream("something", text);
  assertSame(ts1, ts2);
  ts1 = specialAnalyzer.tokenStream("special", text);
  ts2 = wrapper1.tokenStream("special", text);
  assertSame(ts1, ts2);
  // Wrap with another wrapper, which does *not* extend DelegatingAnalyzerWrapper:
  final AnalyzerWrapper wrapper2 = new AnalyzerWrapper(wrapper1.getReuseStrategy()) {
    @Override
    protected Analyzer getWrappedAnalyzer(String fieldName) {
      return wrapper1;
    }

    @Override
    protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
      assertNotSame(specialAnalyzer.tokenStream("special", text), components.getTokenStream());
      TokenFilter filter = new ASCIIFoldingFilter(components.getTokenStream());
      return new TokenStreamComponents(components.getTokenizer(), filter);
    }
  };
  ts3 = wrapper2.tokenStream("special", text);
  assertNotSame(ts1, ts3);
  assertTrue(ts3 instanceof ASCIIFoldingFilter);
  // check that the cache did not get corrupted:
  ts2 = wrapper1.tokenStream("special", text);
  assertSame(ts1, ts2);
  // Wrap PerField with another PerField. In that case all TokenStreams returned must be the same:
  final PerFieldAnalyzerWrapper wrapper3 = new PerFieldAnalyzerWrapper(wrapper1, Collections.<String, Analyzer>singletonMap("moreSpecial", specialAnalyzer));
  ts1 = specialAnalyzer.tokenStream("special", text);
  ts2 = wrapper3.tokenStream("special", text);
  assertSame(ts1, ts2);
  ts3 = specialAnalyzer.tokenStream("moreSpecial", text);
  ts4 = wrapper3.tokenStream("moreSpecial", text);
  assertSame(ts3, ts4);
  assertSame(ts2, ts3);
  IOUtils.close(wrapper3, wrapper2, wrapper1, specialAnalyzer, defaultAnalyzer);
}
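For context, the usual (non-test) way to build a PerFieldAnalyzerWrapper is simply a default analyzer plus a per-field map; a small sketch, with hypothetical field names and illustrative analyzer choices:

import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

// Hypothetical field names; any per-field analyzers would do.
Map<String, Analyzer> perField = new HashMap<>();
perField.put("id", new KeywordAnalyzer());

Analyzer analyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer(), perField);
// tokenStream("id", ...) now delegates to KeywordAnalyzer,
// while every other field falls back to StandardAnalyzer.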
Use of org.apache.lucene.analysis.TokenFilter in project lucene-solr by apache.
Class TestBugInSomething, method test.
public void test() throws Exception {
  final CharArraySet cas = new CharArraySet(3, false);
  cas.add("jjp");
  cas.add("wlmwoknt");
  cas.add("tcgyreo");
  final NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
  builder.add("mtqlpi", "");
  builder.add("mwoknt", "jjp");
  builder.add("tcgyreo", "zpfpajyws");
  final NormalizeCharMap map = builder.build();
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer t = new MockTokenizer(MockTokenFilter.ENGLISH_STOPSET, false, -65);
      TokenFilter f = new CommonGramsFilter(t, cas);
      return new TokenStreamComponents(t, f);
    }

    @Override
    protected Reader initReader(String fieldName, Reader reader) {
      reader = new MockCharFilter(reader, 0);
      reader = new MappingCharFilter(map, reader);
      reader = new TestRandomChains.CheckThatYouDidntReadAnythingReaderWrapper(reader);
      return reader;
    }
  };
  checkAnalysisConsistency(random(), a, false, "wmgddzunizdomqyj");
  a.close();
}
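The two hooks exercised here, initReader for char-level filtering and createComponents for the tokenizer/TokenFilter chain, are the same ones a production analyzer uses. A hedged sketch built from stock components (StandardTokenizer, ASCIIFoldingFilter, MappingCharFilter) rather than the mock classes, with an illustrative character mapping:

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;

// Illustrative mapping: expand "&" to " and " before tokenization.
NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
builder.add("&", " and ");
final NormalizeCharMap map = builder.build();

Analyzer a = new Analyzer() {
  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer t = new StandardTokenizer();
    TokenFilter f = new ASCIIFoldingFilter(t); // token-level post-processing
    return new TokenStreamComponents(t, f);
  }

  @Override
  protected Reader initReader(String fieldName, Reader reader) {
    return new MappingCharFilter(map, reader); // char-level rewriting runs before the tokenizer
  }
};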
Use of org.apache.lucene.analysis.TokenFilter in project lucene-solr by apache.
Class TestAllAnalyzersHaveFactories, method test.
public void test() throws Exception {
  List<Class<?>> analysisClasses = TestRandomChains.getClassesForPackage("org.apache.lucene.analysis");
  for (final Class<?> c : analysisClasses) {
    final int modifiers = c.getModifiers();
    if (Modifier.isAbstract(modifiers) // don't waste time with abstract classes
        || !Modifier.isPublic(modifiers)
        || c.isSynthetic()
        || c.isAnonymousClass()
        || c.isMemberClass()
        || c.isInterface()
        || testComponents.contains(c)
        || crazyComponents.contains(c)
        || oddlyNamedComponents.contains(c)
        || tokenFiltersWithoutFactory.contains(c)
        || c.isAnnotationPresent(Deprecated.class) // deprecated ones are typically back compat hacks
        || !(Tokenizer.class.isAssignableFrom(c) || TokenFilter.class.isAssignableFrom(c) || CharFilter.class.isAssignableFrom(c))) {
      continue;
    }
    Map<String, String> args = new HashMap<>();
    args.put("luceneMatchVersion", Version.LATEST.toString());
    if (Tokenizer.class.isAssignableFrom(c)) {
      String clazzName = c.getSimpleName();
      assertTrue(clazzName.endsWith("Tokenizer"));
      String simpleName = clazzName.substring(0, clazzName.length() - 9);
      assertNotNull(TokenizerFactory.lookupClass(simpleName));
      TokenizerFactory instance = null;
      try {
        instance = TokenizerFactory.forName(simpleName, args);
        assertNotNull(instance);
        if (instance instanceof ResourceLoaderAware) {
          ((ResourceLoaderAware) instance).inform(loader);
        }
        assertSame(c, instance.create().getClass());
      } catch (IllegalArgumentException e) {
        // TODO: for now we pass, because some factories do not yet have a default config that always works
      }
    } else if (TokenFilter.class.isAssignableFrom(c)) {
      String clazzName = c.getSimpleName();
      assertTrue(clazzName.endsWith("Filter"));
      String simpleName = clazzName.substring(0, clazzName.length() - (clazzName.endsWith("TokenFilter") ? 11 : 6));
      assertNotNull(TokenFilterFactory.lookupClass(simpleName));
      TokenFilterFactory instance = null;
      try {
        instance = TokenFilterFactory.forName(simpleName, args);
        assertNotNull(instance);
        if (instance instanceof ResourceLoaderAware) {
          ((ResourceLoaderAware) instance).inform(loader);
        }
        Class<? extends TokenStream> createdClazz = instance.create(new KeywordTokenizer()).getClass();
        // only check the instance if the factory actually wrapped at all!
        if (KeywordTokenizer.class != createdClazz) {
          assertSame(c, createdClazz);
        }
      } catch (IllegalArgumentException e) {
        // TODO: for now we pass, because some factories do not yet have a default config that always works
      }
    } else if (CharFilter.class.isAssignableFrom(c)) {
      String clazzName = c.getSimpleName();
      assertTrue(clazzName.endsWith("CharFilter"));
      String simpleName = clazzName.substring(0, clazzName.length() - 10);
      assertNotNull(CharFilterFactory.lookupClass(simpleName));
      CharFilterFactory instance = null;
      try {
        instance = CharFilterFactory.forName(simpleName, args);
        assertNotNull(instance);
        if (instance instanceof ResourceLoaderAware) {
          ((ResourceLoaderAware) instance).inform(loader);
        }
        Class<? extends Reader> createdClazz = instance.create(new StringReader("")).getClass();
        // only check the instance if the factory actually wrapped at all!
        if (StringReader.class != createdClazz) {
          assertSame(c, createdClazz);
        }
      } catch (IllegalArgumentException e) {
        // TODO: for now we pass, because some factories do not yet have a default config that always works
      }
    }
  }
}
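Outside the test, the same SPI machinery lets you obtain a filter factory by its short name and wrap a TokenStream with it; a small sketch follows (the "lowercase" name resolves to LowerCaseFilterFactory in stock Lucene, but treat the exact name and required arguments as version-dependent assumptions):

import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.util.Version;

// Look up a TokenFilterFactory by SPI name, then let it wrap a TokenStream.
Map<String, String> args = new HashMap<>();
args.put("luceneMatchVersion", Version.LATEST.toString());
TokenFilterFactory factory = TokenFilterFactory.forName("lowercase", args);
TokenStream stream = factory.create(new KeywordTokenizer());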