use of org.apache.lucene.analysis.synonym.SynonymMap in project lucene-solr by apache.
the class ManagedSynonymGraphFilterFactory method onManagedResourceInitialized.
/**
* Called once, during core initialization, to initialize any analysis components
* that depend on the data managed by this resource. It is important that the
* analysis component is only initialized once during core initialization so that
* text analysis is consistent, especially in a distributed environment, as we
* don't want one server applying a different set of stop words than other servers.
*/
@SuppressWarnings("unchecked")
@Override
public void onManagedResourceInitialized(NamedList<?> initArgs, final ManagedResource res) throws SolrException {
NamedList<Object> args = (NamedList<Object>) initArgs;
args.add("synonyms", getResourceId());
args.add("expand", "false");
args.add("format", "solr");
Map<String, String> filtArgs = new HashMap<>();
for (Map.Entry<String, ?> entry : args) {
filtArgs.put(entry.getKey(), entry.getValue().toString());
}
// create the actual filter factory that pulls the synonym mappings
// from synonymMappings using a custom parser implementation
delegate = new SynonymGraphFilterFactory(filtArgs) {
@Override
protected SynonymMap loadSynonyms(ResourceLoader loader, String cname, boolean dedup, Analyzer analyzer) throws IOException, ParseException {
ManagedSynonymParser parser = new ManagedSynonymParser((SynonymManager) res, dedup, analyzer);
// null is safe here because there's no actual parsing done against a input Reader
parser.parse(null);
return parser.build();
}
};
try {
delegate.inform(res.getResourceLoader());
} catch (IOException e) {
throw new SolrException(ErrorCode.SERVER_ERROR, e);
}
}
use of org.apache.lucene.analysis.synonym.SynonymMap in project lucene-solr by apache.
the class TestLimitTokenPositionFilter method testMaxPosition3WithSynomyms.
public void testMaxPosition3WithSynomyms() throws IOException {
for (final boolean consumeAll : new boolean[] { true, false }) {
MockTokenizer tokenizer = whitespaceMockTokenizer("one two three four five");
// if we are consuming all tokens, we can use the checks, otherwise we can't
tokenizer.setEnableChecks(consumeAll);
SynonymMap.Builder builder = new SynonymMap.Builder(true);
builder.add(new CharsRef("one"), new CharsRef("first"), true);
builder.add(new CharsRef("one"), new CharsRef("alpha"), true);
builder.add(new CharsRef("one"), new CharsRef("beguine"), true);
CharsRefBuilder multiWordCharsRef = new CharsRefBuilder();
SynonymMap.Builder.join(new String[] { "and", "indubitably", "single", "only" }, multiWordCharsRef);
builder.add(new CharsRef("one"), multiWordCharsRef.get(), true);
SynonymMap.Builder.join(new String[] { "dopple", "ganger" }, multiWordCharsRef);
builder.add(new CharsRef("two"), multiWordCharsRef.get(), true);
SynonymMap synonymMap = builder.build();
TokenStream stream = new SynonymFilter(tokenizer, synonymMap, true);
stream = new LimitTokenPositionFilter(stream, 3, consumeAll);
// "only", the 4th word of multi-word synonym "and indubitably single only" is not emitted, since its position is greater than 3.
assertTokenStreamContents(stream, new String[] { "one", "first", "alpha", "beguine", "and", "two", "indubitably", "dopple", "three", "single", "ganger" }, new int[] { 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0 });
}
}
use of org.apache.lucene.analysis.synonym.SynonymMap in project lucene-solr by apache.
the class TestRemoveDuplicatesTokenFilter method testRandomStrings.
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
final int numIters = atLeast(10);
for (int i = 0; i < numIters; i++) {
SynonymMap.Builder b = new SynonymMap.Builder(random().nextBoolean());
final int numEntries = atLeast(10);
for (int j = 0; j < numEntries; j++) {
add(b, randomNonEmptyString(), randomNonEmptyString(), random().nextBoolean());
}
final SynonymMap map = b.build();
final boolean ignoreCase = random().nextBoolean();
final Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
TokenStream stream = new SynonymFilter(tokenizer, map, ignoreCase);
return new TokenStreamComponents(tokenizer, new RemoveDuplicatesTokenFilter(stream));
}
};
checkRandomData(random(), analyzer, 200);
analyzer.close();
}
}
use of org.apache.lucene.analysis.synonym.SynonymMap in project lucene-solr by apache.
the class ManagedSynonymFilterFactory method onManagedResourceInitialized.
/**
* Called once, during core initialization, to initialize any analysis components
* that depend on the data managed by this resource. It is important that the
* analysis component is only initialized once during core initialization so that
* text analysis is consistent, especially in a distributed environment, as we
* don't want one server applying a different set of stop words than other servers.
*/
@SuppressWarnings("unchecked")
@Override
public void onManagedResourceInitialized(NamedList<?> initArgs, final ManagedResource res) throws SolrException {
NamedList<Object> args = (NamedList<Object>) initArgs;
args.add("synonyms", getResourceId());
args.add("expand", "false");
args.add("format", "solr");
Map<String, String> filtArgs = new HashMap<>();
for (Map.Entry<String, ?> entry : args) {
filtArgs.put(entry.getKey(), entry.getValue().toString());
}
// create the actual filter factory that pulls the synonym mappings
// from synonymMappings using a custom parser implementation
delegate = new SynonymFilterFactory(filtArgs) {
@Override
protected SynonymMap loadSynonyms(ResourceLoader loader, String cname, boolean dedup, Analyzer analyzer) throws IOException, ParseException {
ManagedSynonymParser parser = new ManagedSynonymParser((SynonymManager) res, dedup, analyzer);
// null is safe here because there's no actual parsing done against a input Reader
parser.parse(null);
return parser.build();
}
};
try {
delegate.inform(res.getResourceLoader());
} catch (IOException e) {
throw new SolrException(ErrorCode.SERVER_ERROR, e);
}
}
Aggregations