use of com.ibm.icu.text.RuleBasedCollator in project elasticsearch by elastic.
the class SimpleIcuCollationTokenFilterTests method testCustomRules.
/*
* For german, you might want oe to sort and match with o umlaut.
* This is not the default, but you can make a customized ruleset to do this.
*
* The default is DIN 5007-1, this shows how to tailor a collator to get DIN 5007-2 behavior.
* http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4423383
*/
public void testCustomRules() throws Exception {
RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new ULocale("de_DE"));
String DIN5007_2_tailorings = "& ae , ä & AE , Ä" + "& oe , ö & OE , Ö" + "& ue , ü & UE , ü";
RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings);
String tailoredRules = tailoredCollator.getRules();
Settings settings = Settings.builder().put("index.analysis.filter.myCollator.type", "icu_collation").put("index.analysis.filter.myCollator.rules", tailoredRules).put("index.analysis.filter.myCollator.strength", "primary").build();
TestAnalysis analysis = createTestAnalysis(new Index("test", "_na_"), settings, new AnalysisICUPlugin());
TokenFilterFactory filterFactory = analysis.tokenFilter.get("myCollator");
assertCollatesToSame(filterFactory, "Töne", "Toene");
}
use of com.ibm.icu.text.RuleBasedCollator in project lucene-solr by apache.
the class ICUCollationField method setup.
/**
* Setup the field according to the provided parameters
*/
private void setup(ResourceLoader loader, Map<String, String> args) {
String custom = args.remove("custom");
String localeID = args.remove("locale");
String strength = args.remove("strength");
String decomposition = args.remove("decomposition");
String alternate = args.remove("alternate");
String caseLevel = args.remove("caseLevel");
String caseFirst = args.remove("caseFirst");
String numeric = args.remove("numeric");
String variableTop = args.remove("variableTop");
if (custom == null && localeID == null)
throw new SolrException(ErrorCode.SERVER_ERROR, "Either custom or locale is required.");
if (custom != null && localeID != null)
throw new SolrException(ErrorCode.SERVER_ERROR, "Cannot specify both locale and custom. " + "To tailor rules for a built-in language, see the javadocs for RuleBasedCollator. " + "Then save the entire customized ruleset to a file, and use with the custom parameter");
final Collator collator;
if (localeID != null) {
// create from a system collator, based on Locale.
collator = createFromLocale(localeID);
} else {
// create from a custom ruleset
collator = createFromRules(custom, loader);
}
// set the strength flag, otherwise it will be the default.
if (strength != null) {
if (strength.equalsIgnoreCase("primary"))
collator.setStrength(Collator.PRIMARY);
else if (strength.equalsIgnoreCase("secondary"))
collator.setStrength(Collator.SECONDARY);
else if (strength.equalsIgnoreCase("tertiary"))
collator.setStrength(Collator.TERTIARY);
else if (strength.equalsIgnoreCase("quaternary"))
collator.setStrength(Collator.QUATERNARY);
else if (strength.equalsIgnoreCase("identical"))
collator.setStrength(Collator.IDENTICAL);
else
throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid strength: " + strength);
}
// set the decomposition flag, otherwise it will be the default.
if (decomposition != null) {
if (decomposition.equalsIgnoreCase("no"))
collator.setDecomposition(Collator.NO_DECOMPOSITION);
else if (decomposition.equalsIgnoreCase("canonical"))
collator.setDecomposition(Collator.CANONICAL_DECOMPOSITION);
else
throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid decomposition: " + decomposition);
}
// expert options: concrete subclasses are always a RuleBasedCollator
RuleBasedCollator rbc = (RuleBasedCollator) collator;
if (alternate != null) {
if (alternate.equalsIgnoreCase("shifted")) {
rbc.setAlternateHandlingShifted(true);
} else if (alternate.equalsIgnoreCase("non-ignorable")) {
rbc.setAlternateHandlingShifted(false);
} else {
throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid alternate: " + alternate);
}
}
if (caseLevel != null) {
rbc.setCaseLevel(Boolean.parseBoolean(caseLevel));
}
if (caseFirst != null) {
if (caseFirst.equalsIgnoreCase("lower")) {
rbc.setLowerCaseFirst(true);
} else if (caseFirst.equalsIgnoreCase("upper")) {
rbc.setUpperCaseFirst(true);
} else {
throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid caseFirst: " + caseFirst);
}
}
if (numeric != null) {
rbc.setNumericCollation(Boolean.parseBoolean(numeric));
}
if (variableTop != null) {
rbc.setVariableTop(variableTop);
}
analyzer = new ICUCollationKeyAnalyzer(collator);
}
use of com.ibm.icu.text.RuleBasedCollator in project lucene-solr by apache.
the class ICUCollationField method createFromRules.
/**
* Read custom rules from a file, and create a RuleBasedCollator
* The file cannot support comments, as # might be in the rules!
*/
static Collator createFromRules(String fileName, ResourceLoader loader) {
InputStream input = null;
try {
input = loader.openResource(fileName);
String rules = IOUtils.toString(input, "UTF-8");
return new RuleBasedCollator(rules);
} catch (Exception e) {
// io error or invalid rules
throw new RuntimeException(e);
} finally {
IOUtils.closeQuietly(input);
}
}
use of com.ibm.icu.text.RuleBasedCollator in project lucene-solr by apache.
the class TestICUCollationFieldDocValues method setupSolrHome.
/**
* Ugly: but what to do? We want to test custom sort, which reads rules in as a resource.
* These are largish files, and jvm-specific (as our documentation says, you should always
* look out for jvm differences with collation).
* So it's preferable to create this file on-the-fly.
*/
public static String setupSolrHome() throws Exception {
File tmpFile = createTempDir().toFile();
// make data and conf dirs
new File(tmpFile + "/collection1", "data").mkdirs();
File confDir = new File(tmpFile + "/collection1", "conf");
confDir.mkdirs();
// copy over configuration files
FileUtils.copyFile(getFile("analysis-extras/solr/collection1/conf/solrconfig-icucollate.xml"), new File(confDir, "solrconfig.xml"));
FileUtils.copyFile(getFile("analysis-extras/solr/collection1/conf/schema-icucollate-dv.xml"), new File(confDir, "schema.xml"));
// generate custom collation rules (DIN 5007-2), saving to customrules.dat
RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new ULocale("de", "DE"));
String DIN5007_2_tailorings = "& ae , ä & AE , Ä" + "& oe , ö & OE , Ö" + "& ue , ü & UE , ü";
RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings);
String tailoredRules = tailoredCollator.getRules();
FileOutputStream os = new FileOutputStream(new File(confDir, "customrules.dat"));
IOUtils.write(tailoredRules, os, "UTF-8");
os.close();
return tmpFile.getAbsolutePath();
}
use of com.ibm.icu.text.RuleBasedCollator in project es6draft by anba.
the class CollatorObject method createCollator.
private Collator createCollator() {
ULocale locale = ULocale.forLanguageTag(this.locale);
if ("search".equals(usage)) {
// "search" usage cannot be set through unicode extensions (u-co-search), handle here:
locale = locale.setKeywordValue("collation", "search");
}
RuleBasedCollator collator = (RuleBasedCollator) Collator.getInstance(locale);
collator.setDecomposition(Collator.CANONICAL_DECOMPOSITION);
collator.setNumericCollation(numeric);
switch(caseFirst) {
case "upper":
collator.setUpperCaseFirst(true);
break;
case "lower":
collator.setLowerCaseFirst(true);
break;
}
switch(sensitivity) {
case "base":
collator.setStrength(Collator.PRIMARY);
break;
case "accent":
collator.setStrength(Collator.SECONDARY);
break;
case "case":
collator.setStrength(Collator.PRIMARY);
collator.setCaseLevel(true);
break;
case "variant":
collator.setStrength(Collator.TERTIARY);
break;
}
collator.setAlternateHandlingShifted(ignorePunctuation);
return collator;
}
Aggregations