Use of org.apache.lucene.analysis.icu.segmentation.DefaultICUTokenizerConfig in the elasticsearch project by elastic.
From the class IcuTokenizerFactory, method getIcuConfig.
/**
 * Builds the ICU tokenizer configuration from the {@code RULE_FILES} setting.
 *
 * <p>Each entry of the setting must have the form {@code "scriptCode:ruleFile"}. Both halves are
 * trimmed and must be non-empty. The script code is resolved to an ICU script constant with
 * {@code UCharacter.getPropertyValueEnum(UProperty.SCRIPT, ...)} and the rule file is handed to
 * {@code parseRules} to build the break iterator used for that script. When the same script
 * appears more than once, the last entry wins.
 *
 * @param env      passed through to {@code parseRules} when loading each rule file
 * @param settings settings from which the {@code RULE_FILES} entries are read
 * @return a configuration that hands out a clone of the tailored break iterator for every script
 *         that has one and defers to {@code DefaultICUTokenizerConfig} for all other scripts, or
 *         {@code null} when no rule files are configured
 * @throws ElasticsearchException if an entry is malformed, or if any exception is raised while
 *         resolving a script code or parsing a rule file (the original exception is the cause)
 */
private ICUTokenizerConfig getIcuConfig(Environment env, Settings settings) {
    Map<Integer, String> tailored = new HashMap<>();
    try {
        final String malformedMessage = RULE_FILES + " should contain comma-separated \"code:rulefile\" pairs";
        for (String scriptAndResourcePath : settings.getAsArray(RULE_FILES)) {
            int colonPos = scriptAndResourcePath.indexOf(':');
            if (colonPos == -1) {
                throw new IllegalArgumentException(malformedMessage);
            }
            String scriptCode = scriptAndResourcePath.substring(0, colonPos).trim();
            String resourcePath = scriptAndResourcePath.substring(colonPos + 1).trim();
            // Validate after trimming so that entries such as "Latn:   " or ":rules.rbbi" are
            // rejected here with a descriptive message instead of failing later on.
            if (scriptCode.isEmpty() || resourcePath.isEmpty()) {
                throw new IllegalArgumentException(malformedMessage);
            }
            tailored.put(UCharacter.getPropertyValueEnum(UProperty.SCRIPT, scriptCode), resourcePath);
        }
        if (tailored.isEmpty()) {
            // No tailoring requested; null is this method's "no custom config" result.
            return null;
        }
        // Indexed by ICU script code; a null slot means "no tailored rules for this script".
        final BreakIterator[] breakers = new BreakIterator[UScript.CODE_LIMIT];
        for (Map.Entry<Integer, String> scriptToPath : tailored.entrySet()) {
            int code = scriptToPath.getKey();
            breakers[code] = parseRules(scriptToPath.getValue(), env);
        }
        // Neither cjkAsWords nor myanmarAsWords is configurable yet; both are passed as true.
        return new DefaultICUTokenizerConfig(true, true) {
            @Override
            public BreakIterator getBreakIterator(int script) {
                if (breakers[script] != null) {
                    // Return a clone so the stored iterator itself is never handed to a caller.
                    return (BreakIterator) breakers[script].clone();
                } else {
                    return super.getBreakIterator(script);
                }
            }
        };
    } catch (Exception e) {
        throw new ElasticsearchException("failed to load ICU rule files", e);
    }
}
Aggregations