Use of org.apache.lucene.analysis.Analyzer in the lucene-solr project by Apache:
class TestCapitalizationFilter, method testRandomString.
/** Feeds random strings through CapitalizationFilter to smoke out analysis bugs. */
public void testRandomString() throws Exception {
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
// Whitespace tokenization, no lowercasing, then the filter under test.
Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(source, new CapitalizationFilter(source));
}
};
checkRandomData(random(), analyzer, 1000 * RANDOM_MULTIPLIER);
analyzer.close();
}
Use of org.apache.lucene.analysis.Analyzer in the lucene-solr project by Apache:
class TestSynonymGraphFilter, method testRandomSyns.
/**
 * Builds a random synonym map over two small alphabets ('a'/'b' inputs mapping
 * to 'x'/'y' outputs), runs random documents through the synonym (and optionally
 * flatten) filter, and verifies the produced token graph against a slow
 * reference implementation ({@code slowSynFilter}).  Also asserts the filters
 * never look ahead further than the computed theoretical maximum.
 */
public void testRandomSyns() throws Exception {
int synCount = atLeast(10);
double bias = random().nextDouble();
boolean dedup = random().nextBoolean();
boolean flatten = random().nextBoolean();
SynonymMap.Builder b = new SynonymMap.Builder(dedup);
List<OneSyn> syns = new ArrayList<>();
// Makes random syns from random a / b tokens, mapping to random x / y tokens
if (VERBOSE) {
System.out.println("TEST: make " + synCount + " syns");
System.out.println("  bias for a over b=" + bias);
System.out.println("  dedup=" + dedup);
System.out.println("  flatten=" + flatten);
}
int maxSynLength = 0;
for (int i = 0; i < synCount; i++) {
OneSyn syn = new OneSyn();
syn.in = randomBinaryChars(1, 5, bias, 'a');
syn.out = randomBinaryChars(1, 5, 0.5, 'x');
syn.keepOrig = random().nextBoolean();
syns.add(syn);
// Longest input side bounds the synonym filter's lookahead:
maxSynLength = Math.max(maxSynLength, syn.in.length);
if (VERBOSE) {
System.out.println("  " + syn);
}
add(b, toTokenString(syn.in), toTokenString(syn.out), syn.keepOrig);
}
// Compute max allowed lookahead for flatten filter: for each synonym, sum the
// output lengths of ALL synonyms sharing the same input (they all fire at the
// same position), plus the original input tokens if any of them keeps them.
int maxFlattenLookahead = 0;
if (flatten) {
for (int i = 0; i < synCount; i++) {
OneSyn syn1 = syns.get(i);
int count = syn1.out.length;
boolean keepOrig = syn1.keepOrig;
for (int j = 0; j < synCount; j++) {
// BUGFIX: was syns.get(i), which compared each synonym only against
// itself and never accumulated overlapping synonyms' contributions.
OneSyn syn2 = syns.get(j);
keepOrig |= syn2.keepOrig;
if (syn1.in.equals(syn2.in)) {
count += syn2.out.length;
}
}
if (keepOrig) {
count += syn1.in.length;
}
maxFlattenLookahead = Math.max(maxFlattenLookahead, count);
}
}
// Only used w/ VERBOSE:
Analyzer aNoFlattened;
if (VERBOSE) {
aNoFlattened = getAnalyzer(b, true);
} else {
aNoFlattened = null;
}
Analyzer a;
if (flatten) {
a = getFlattenAnalyzer(b, true);
} else {
a = getAnalyzer(b, true);
}
int iters = atLeast(20);
for (int iter = 0; iter < iters; iter++) {
String doc = toTokenString(randomBinaryChars(50, 100, bias, 'a'));
if (VERBOSE) {
System.out.println("TEST: iter=" + iter + " doc=" + doc);
}
// Reference automaton from the slow, obviously-correct implementation:
Automaton expected = slowSynFilter(doc, syns, flatten);
if (VERBOSE) {
System.out.println("  expected:\n" + expected.toDot());
if (flatten) {
Automaton unflattened = toAutomaton(aNoFlattened.tokenStream("field", new StringReader(doc)));
System.out.println("  actual unflattened:\n" + unflattened.toDot());
}
}
Automaton actual = toAutomaton(a.tokenStream("field", new StringReader(doc)));
if (VERBOSE) {
System.out.println("  actual:\n" + actual.toDot());
}
// Lookahead actually used must never exceed the theoretical bound:
assertTrue("maxLookaheadUsed=" + synFilter.getMaxLookaheadUsed() + " maxSynLength=" + maxSynLength, synFilter.getMaxLookaheadUsed() <= maxSynLength);
if (flatten) {
assertTrue("flatten maxLookaheadUsed=" + flattenFilter.getMaxLookaheadUsed() + " maxFlattenLookahead=" + maxFlattenLookahead, flattenFilter.getMaxLookaheadUsed() <= maxFlattenLookahead);
}
checkAnalysisConsistency(random(), a, random().nextBoolean(), doc);
// output token that also happens to be in the input:
try {
actual = Operations.determinize(actual, 50000);
} catch (TooComplexToDeterminizeException tctde) {
// Unfortunately the syns can easily create difficult-to-determinize graphs:
assertTrue(approxEquals(actual, expected));
continue;
}
try {
expected = Operations.determinize(expected, 50000);
} catch (TooComplexToDeterminizeException tctde) {
// Unfortunately the syns can easily create difficult-to-determinize graphs:
assertTrue(approxEquals(actual, expected));
continue;
}
assertTrue(approxEquals(actual, expected));
assertTrue(Operations.sameLanguage(actual, expected));
}
a.close();
}
Use of org.apache.lucene.analysis.Analyzer in the lucene-solr project by Apache:
class TestSynonymGraphFilter, method solrSynsToAnalyzer.
/**
 * Parses Solr-format synonym rules and returns a flattening analyzer built
 * from them.  The MockAnalyzer is only needed while parsing, so it is closed
 * before the result analyzer is constructed.
 */
private Analyzer solrSynsToAnalyzer(String syns) throws IOException, ParseException {
Analyzer parseTimeAnalyzer = new MockAnalyzer(random());
SolrSynonymParser synParser = new SolrSynonymParser(true, true, parseTimeAnalyzer);
synParser.parse(new StringReader(syns));
parseTimeAnalyzer.close();
return getFlattenAnalyzer(synParser, true);
}
Use of org.apache.lucene.analysis.Analyzer in the lucene-solr project by Apache:
class TestSynonymGraphFilter, method testBasic2.
/**
 * Verifies flattened synonym expansion with and without keeping the original
 * token.  With keepOrig, the original token is emitted at the same position
 * as the first replacement token (position increment 0).
 */
public void testBasic2() throws Exception {
// BUGFIX: the previous do/while flipped keepOrig from true to false before the
// first pass and then exited (while(false)), so the keepOrig==true branch was
// never exercised.  Iterate both cases explicitly instead.
for (boolean keepOrig : new boolean[] { false, true }) {
SynonymMap.Builder b = new SynonymMap.Builder(true);
add(b, "aaa", "aaaa1 aaaa2 aaaa3", keepOrig);
add(b, "bbb", "bbbb1 bbbb2", keepOrig);
Analyzer a = getFlattenAnalyzer(b, true);
if (keepOrig) {
assertAnalyzesTo(a, "xyzzy bbb pot of gold", new String[] { "xyzzy", "bbbb1", "bbb", "bbbb2", "pot", "of", "gold" }, new int[] { 1, 1, 0, 1, 1, 1, 1 });
// BUGFIX: expected tokens previously duplicated "aaaa2" where "aaaa3"
// belongs, and the position-increment array had 7 entries for 8 tokens.
assertAnalyzesTo(a, "xyzzy aaa pot of gold", new String[] { "xyzzy", "aaaa1", "aaa", "aaaa2", "aaaa3", "pot", "of", "gold" }, new int[] { 1, 1, 0, 1, 1, 1, 1, 1 });
} else {
assertAnalyzesTo(a, "xyzzy bbb pot of gold", new String[] { "xyzzy", "bbbb1", "bbbb2", "pot", "of", "gold" }, new int[] { 1, 1, 1, 1, 1, 1 });
assertAnalyzesTo(a, "xyzzy aaa pot of gold", new String[] { "xyzzy", "aaaa1", "aaaa2", "aaaa3", "pot", "of", "gold" }, new int[] { 1, 1, 1, 1, 1, 1, 1 });
}
a.close();
}
}
Use of org.apache.lucene.analysis.Analyzer in the lucene-solr project by Apache:
class TestSynonymGraphFilter, method testFlattenedGraph.
/** If we expand synonyms during indexing, it's a bit better than
 * SynonymFilter is today, but still necessarily has false
 * positive and negative PhraseQuery matches because we do not
 * index posLength, so we lose information. */
public void testFlattenedGraph() throws Exception {
SynonymMap.Builder builder = new SynonymMap.Builder();
add(builder, "wtf", "what the fudge", true);
Analyzer analyzer = getFlattenAnalyzer(builder, true);
// Direct check of the flattened token graph (tokens, offsets, posInc, posLength):
assertAnalyzesTo(analyzer, "wtf happened", new String[] { "what", "wtf", "the", "fudge", "happened" }, new int[] { 0, 0, 0, 0, 4 }, new int[] { 3, 3, 3, 3, 12 }, null, new int[] { 1, 0, 1, 1, 1 }, new int[] { 1, 3, 1, 1, 1 }, true);
// Now index a document with the flattened analyzer and probe PhraseQuery behavior:
Directory directory = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), directory, analyzer);
Document document = new Document();
document.add(newTextField("field", "wtf happened", Field.Store.NO));
writer.addDocument(document);
IndexReader reader = writer.getReader();
writer.close();
IndexSearcher searcher = newSearcher(reader);
// Good (this should not match, and doesn't):
assertEquals(0, searcher.count(new PhraseQuery("field", "what", "happened")));
// Bad (this should match, but doesn't):
assertEquals(0, searcher.count(new PhraseQuery("field", "wtf", "happened")));
// Good (this should match, and does):
assertEquals(1, searcher.count(new PhraseQuery("field", "what", "the", "fudge", "happened")));
// Bad (this should not match, but does):
assertEquals(1, searcher.count(new PhraseQuery("field", "wtf", "the")));
IOUtils.close(reader, directory);
}
Aggregations