Use of org.apache.lucene.analysis.util.TokenizerFactory in project lucene-solr by apache.
The class TestFactories, method doTestTokenizer:
private void doTestTokenizer(String tokenizer) throws IOException {
  Class<? extends TokenizerFactory> factoryClazz = TokenizerFactory.lookupClass(tokenizer);
  TokenizerFactory factory = (TokenizerFactory) initialize(factoryClazz);
  if (factory != null) {
    // if it implements MultiTermAware, sanity check its impl
    if (factory instanceof MultiTermAwareComponent) {
      AbstractAnalysisFactory mtc = ((MultiTermAwareComponent) factory).getMultiTermComponent();
      assertNotNull(mtc);
      // it's not ok to return e.g. a charfilter here: but a tokenizer could wrap a filter around it
      assertFalse(mtc instanceof CharFilterFactory);
    }
    // beast it just a little, it shouldn't throw exceptions:
    // (it should have thrown them in initialize)
    Analyzer a = new FactoryAnalyzer(factory, null, null);
    checkRandomData(random(), a, 20, 20, false, false);
    a.close();
  }
}
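For callers that want an instance rather than the Class object that lookupClass(tokenizer) resolves, TokenizerFactory also offers SPI lookup by name via forName. A minimal sketch, assuming lucene-analyzers-common is on the classpath; the "whitespace" name and the wrapper class are illustrative:

import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.TokenizerFactory;

public class TokenizerFactoryLookup {
  public static void main(String[] args) {
    // SPI lookup by registered name; "whitespace" resolves to WhitespaceTokenizerFactory.
    // The args map must be mutable: the factory removes the keys it recognizes.
    Map<String, String> factoryArgs = new HashMap<>();
    TokenizerFactory factory = TokenizerFactory.forName("whitespace", factoryArgs);
    Tokenizer tokenizer = factory.create();
    System.out.println(tokenizer.getClass().getName()); // org.apache.lucene.analysis.core.WhitespaceTokenizer
  }
}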
Use of org.apache.lucene.analysis.util.TokenizerFactory in project lucene-solr by apache.
The class TestHMMChineseTokenizerFactory, method testSimple:
/** Test showing the behavior */
public void testSimple() throws Exception {
  Reader reader = new StringReader("我购买了道具和服装。");
  TokenizerFactory factory = new HMMChineseTokenizerFactory(new HashMap<String, String>());
  Tokenizer tokenizer = factory.create(newAttributeFactory());
  tokenizer.setReader(reader);
  // TODO: fix smart chinese to not emit punctuation tokens
  // at the moment: you have to clean up with WDF, or use the stoplist, etc
  assertTokenStreamContents(tokenizer, new String[] { "我", "购买", "了", "道具", "和", "服装", "," });
}
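Outside assertTokenStreamContents, the same factory-built Tokenizer is consumed with the standard reset/incrementToken/end/close protocol. A minimal sketch, assuming lucene-analyzers-smartcn is on the classpath; the wrapper class is illustrative:

import java.io.StringReader;
import java.util.HashMap;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cn.smart.HMMChineseTokenizerFactory;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class PrintHMMChineseTokens {
  public static void main(String[] args) throws Exception {
    Tokenizer tokenizer = new HMMChineseTokenizerFactory(new HashMap<String, String>()).create();
    tokenizer.setReader(new StringReader("我购买了道具和服装。"));
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    tokenizer.reset();                     // mandatory before the first incrementToken()
    while (tokenizer.incrementToken()) {
      System.out.println(term.toString()); // 我, 购买, 了, 道具, 和, 服装, ","
    }
    tokenizer.end();
    tokenizer.close();
  }
}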
Use of org.apache.lucene.analysis.util.TokenizerFactory in project lucene-solr by apache.
The class AnalysisRequestHandlerBase, method analyzeValue:
/**
 * Analyzes the given value using the given Analyzer.
 *
 * @param value   Value to analyze
 * @param context The {@link AnalysisContext analysis context}.
 *
 * @return NamedList containing the tokens produced by analyzing the given value
 */
protected NamedList<? extends Object> analyzeValue(String value, AnalysisContext context) {
  Analyzer analyzer = context.getAnalyzer();
  if (!TokenizerChain.class.isInstance(analyzer)) {
    try (TokenStream tokenStream = analyzer.tokenStream(context.getFieldName(), value)) {
      NamedList<List<NamedList>> namedList = new NamedList<>();
      namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(analyzeTokenStream(tokenStream), context));
      return namedList;
    } catch (IOException e) {
      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e);
    }
  }
  TokenizerChain tokenizerChain = (TokenizerChain) analyzer;
  CharFilterFactory[] cfiltfacs = tokenizerChain.getCharFilterFactories();
  TokenizerFactory tfac = tokenizerChain.getTokenizerFactory();
  TokenFilterFactory[] filtfacs = tokenizerChain.getTokenFilterFactories();
  NamedList<Object> namedList = new NamedList<>();
  if (0 < cfiltfacs.length) {
    String source = value;
    for (CharFilterFactory cfiltfac : cfiltfacs) {
      Reader reader = new StringReader(source);
      reader = cfiltfac.create(reader);
      source = writeCharStream(namedList, reader);
    }
  }
  TokenStream tokenStream = tfac.create();
  ((Tokenizer) tokenStream).setReader(tokenizerChain.initReader(null, new StringReader(value)));
  List<AttributeSource> tokens = analyzeTokenStream(tokenStream);
  namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(tokens, context));
  ListBasedTokenStream listBasedTokenStream = new ListBasedTokenStream(tokenStream, tokens);
  for (TokenFilterFactory tokenFilterFactory : filtfacs) {
    for (final AttributeSource tok : tokens) {
      tok.getAttribute(TokenTrackingAttribute.class).freezeStage();
    }
    // overwrite the vars "tokenStream", "tokens", and "listBasedTokenStream"
    tokenStream = tokenFilterFactory.create(listBasedTokenStream);
    tokens = analyzeTokenStream(tokenStream);
    namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(tokens, context));
    listBasedTokenStream = new ListBasedTokenStream(listBasedTokenStream, tokens);
  }
  return namedList;
}
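The handler deliberately re-runs each stage in isolation so the intermediate tokens can be reported per stage. At index or query time the same factories compose into a single pass: char filters wrap the Reader, the tokenizer consumes it, and each token filter wraps the stream before it. A minimal sketch of that one-pass composition; the class and method names are illustrative:

import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.CharFilterFactory;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.util.TokenizerFactory;

public class ChainSketch {
  /** Applies char filters, the tokenizer, and token filters in one pass. */
  static TokenStream buildStream(CharFilterFactory[] charFilters,
                                 TokenizerFactory tokenizerFactory,
                                 TokenFilterFactory[] tokenFilters,
                                 String value) {
    Reader reader = new StringReader(value);
    for (CharFilterFactory cf : charFilters) {
      reader = cf.create(reader);   // each char filter wraps the previous reader
    }
    Tokenizer tokenizer = tokenizerFactory.create();
    tokenizer.setReader(reader);
    TokenStream stream = tokenizer;
    for (TokenFilterFactory tf : tokenFilters) {
      stream = tf.create(stream);   // each token filter wraps the previous stream
    }
    return stream;
  }
}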
Use of org.apache.lucene.analysis.util.TokenizerFactory in project lucene-solr by apache.
The class LukeRequestHandler, method getAnalyzerInfo:
private static SimpleOrderedMap<Object> getAnalyzerInfo(Analyzer analyzer) {
  SimpleOrderedMap<Object> aninfo = new SimpleOrderedMap<>();
  aninfo.add("className", analyzer.getClass().getName());
  if (analyzer instanceof TokenizerChain) {
    TokenizerChain tchain = (TokenizerChain) analyzer;
    CharFilterFactory[] cfiltfacs = tchain.getCharFilterFactories();
    if (0 < cfiltfacs.length) {
      SimpleOrderedMap<Map<String, Object>> cfilters = new SimpleOrderedMap<>();
      for (CharFilterFactory cfiltfac : cfiltfacs) {
        Map<String, Object> tok = new HashMap<>();
        String className = cfiltfac.getClass().getName();
        tok.put("className", className);
        tok.put("args", cfiltfac.getOriginalArgs());
        cfilters.add(className.substring(className.lastIndexOf('.') + 1), tok);
      }
      aninfo.add("charFilters", cfilters);
    }
    SimpleOrderedMap<Object> tokenizer = new SimpleOrderedMap<>();
    TokenizerFactory tfac = tchain.getTokenizerFactory();
    tokenizer.add("className", tfac.getClass().getName());
    tokenizer.add("args", tfac.getOriginalArgs());
    aninfo.add("tokenizer", tokenizer);
    TokenFilterFactory[] filtfacs = tchain.getTokenFilterFactories();
    if (0 < filtfacs.length) {
      SimpleOrderedMap<Map<String, Object>> filters = new SimpleOrderedMap<>();
      for (TokenFilterFactory filtfac : filtfacs) {
        Map<String, Object> tok = new HashMap<>();
        String className = filtfac.getClass().getName();
        tok.put("className", className);
        tok.put("args", filtfac.getOriginalArgs());
        filters.add(className.substring(className.lastIndexOf('.') + 1), tok);
      }
      aninfo.add("filters", filters);
    }
  }
  return aninfo;
}
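In the Luke handler's response each char filter and token filter entry is keyed by the simple class name computed above, alongside the tokenizer entry. An illustrative, abbreviated JSON rendering for a chain with one char filter and one token filter; the args maps depend entirely on how the schema configured the factories and are shown empty here:

"analyzer": {
  "className": "org.apache.solr.analysis.TokenizerChain",
  "charFilters": {
    "HTMLStripCharFilterFactory": {
      "className": "org.apache.lucene.analysis.charfilter.HTMLStripCharFilterFactory",
      "args": {}
    }
  },
  "tokenizer": {
    "className": "org.apache.lucene.analysis.standard.StandardTokenizerFactory",
    "args": {}
  },
  "filters": {
    "LowerCaseFilterFactory": {
      "className": "org.apache.lucene.analysis.core.LowerCaseFilterFactory",
      "args": {}
    }
  }
}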
Use of org.apache.lucene.analysis.util.TokenizerFactory in project lucene-solr by apache.
The class ManagedIndexSchema, method informResourceLoaderAwareObjectsInChain:
/**
 * After creating a new FieldType, it may contain components that implement
 * the ResourceLoaderAware interface, which need to be informed after they
 * are loaded (as they depend on this callback to complete initialization work)
 */
protected void informResourceLoaderAwareObjectsInChain(TokenizerChain chain) {
  CharFilterFactory[] charFilters = chain.getCharFilterFactories();
  for (CharFilterFactory next : charFilters) {
    if (next instanceof ResourceLoaderAware) {
      try {
        ((ResourceLoaderAware) next).inform(loader);
      } catch (IOException e) {
        throw new SolrException(ErrorCode.SERVER_ERROR, e);
      }
    }
  }
  TokenizerFactory tokenizerFactory = chain.getTokenizerFactory();
  if (tokenizerFactory instanceof ResourceLoaderAware) {
    try {
      ((ResourceLoaderAware) tokenizerFactory).inform(loader);
    } catch (IOException e) {
      throw new SolrException(ErrorCode.SERVER_ERROR, e);
    }
  }
  TokenFilterFactory[] filters = chain.getTokenFilterFactories();
  for (TokenFilterFactory next : filters) {
    if (next instanceof ResourceLoaderAware) {
      try {
        ((ResourceLoaderAware) next).inform(loader);
      } catch (IOException e) {
        throw new SolrException(ErrorCode.SERVER_ERROR, e);
      }
    }
  }
}
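The callback these loops drive is the single inform(ResourceLoader) method of ResourceLoaderAware. A minimal sketch of a factory that needs it, with a hypothetical class name and a hypothetical "words" parameter; require and getWordSet are inherited from AbstractAnalysisFactory:

import java.io.IOException;
import java.util.Map;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;

public class HypotheticalStopWordsFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
  private final String wordFiles;
  private CharArraySet words;

  public HypotheticalStopWordsFilterFactory(Map<String, String> args) {
    super(args);
    wordFiles = require(args, "words"); // consume our single (hypothetical) argument
    if (!args.isEmpty()) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
  }

  @Override
  public void inform(ResourceLoader loader) throws IOException {
    // Invoked by the schema code above once a loader exists; the constructor
    // cannot read the word file because no loader is passed to it.
    words = getWordSet(loader, wordFiles, true);
  }

  @Override
  public TokenStream create(TokenStream input) {
    return new StopFilter(input, words);
  }
}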