Search in sources :

Example 1 with RuleBasedCollator

use of com.ibm.icu.text.RuleBasedCollator in project elasticsearch by elastic.

the class SimpleIcuCollationTokenFilterTests method testCustomRules.

/*
 * For German, you might want "oe" to sort and match with o-umlaut.
 * This is not the default, but you can make a customized ruleset to do this.
 *
 * The default is DIN 5007-1; this shows how to tailor a collator to get DIN 5007-2 behavior.
 *  http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4423383
 */
public void testCustomRules() throws Exception {
    RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new ULocale("de_DE"));
    // Tailor each umlaut (lower and upper case) to its two-letter transcription.
    // The last rule must target the upper-case "Ü": repeating the lower-case "ü" would
    // re-tailor "ü" relative to "UE" and leave "Ü" without any tailoring at all.
    String DIN5007_2_tailorings = "& ae , ä & AE , Ä" + "& oe , ö & OE , Ö" + "& ue , ü & UE , Ü";
    // Append the tailorings to the stock German rules and round-trip them through a collator
    // so that the rules handed to the filter are the ones ICU itself reports.
    RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings);
    String tailoredRules = tailoredCollator.getRules();
    // Configure an icu_collation filter that uses the custom rules at primary strength.
    Settings settings = Settings.builder()
            .put("index.analysis.filter.myCollator.type", "icu_collation")
            .put("index.analysis.filter.myCollator.rules", tailoredRules)
            .put("index.analysis.filter.myCollator.strength", "primary")
            .build();
    TestAnalysis analysis = createTestAnalysis(new Index("test", "_na_"), settings, new AnalysisICUPlugin());
    TokenFilterFactory filterFactory = analysis.tokenFilter.get("myCollator");
    // With the tailoring in place, the umlaut spelling and its transcription must collate identically.
    assertCollatesToSame(filterFactory, "Töne", "Toene");
}
Also used : RuleBasedCollator(com.ibm.icu.text.RuleBasedCollator) ULocale(com.ibm.icu.util.ULocale) Index(org.elasticsearch.index.Index) AnalysisICUPlugin(org.elasticsearch.plugin.analysis.icu.AnalysisICUPlugin) Settings(org.elasticsearch.common.settings.Settings)

Example 2 with RuleBasedCollator

use of com.ibm.icu.text.RuleBasedCollator in project lucene-solr by apache.

the class ICUCollationField method setup.

/**
 * Configures this field type from its schema attributes.
 * <p>
 * The recognised attributes ({@code custom}, {@code locale}, {@code strength},
 * {@code decomposition}, {@code alternate}, {@code caseLevel}, {@code caseFirst},
 * {@code numeric}, {@code variableTop}) are removed from {@code args}. Exactly one of
 * {@code custom} / {@code locale} must be given; the resulting collator is wrapped in an
 * {@link ICUCollationKeyAnalyzer} and stored in {@code analyzer}.
 *
 * @param loader used to open the rules file when {@code custom} is specified
 * @param args   attribute map; consumed entries are removed
 * @throws SolrException if the collator source is missing or ambiguous, or an attribute
 *                       has an unsupported value
 */
private void setup(ResourceLoader loader, Map<String, String> args) {
    // Pull out every attribute we understand before doing any validation.
    final String custom = args.remove("custom");
    final String localeID = args.remove("locale");
    final String strength = args.remove("strength");
    final String decomposition = args.remove("decomposition");
    final String alternate = args.remove("alternate");
    final String caseLevel = args.remove("caseLevel");
    final String caseFirst = args.remove("caseFirst");
    final String numeric = args.remove("numeric");
    final String variableTop = args.remove("variableTop");

    final boolean hasCustom = custom != null;
    final boolean hasLocale = localeID != null;
    if (!hasCustom && !hasLocale) {
        throw new SolrException(ErrorCode.SERVER_ERROR, "Either custom or locale is required.");
    }
    if (hasCustom && hasLocale) {
        throw new SolrException(ErrorCode.SERVER_ERROR, "Cannot specify both locale and custom. " + "To tailor rules for a built-in language, see the javadocs for RuleBasedCollator. " + "Then save the entire customized ruleset to a file, and use with the custom parameter");
    }

    // A locale selects a system collator; otherwise the rules come from a custom ruleset file.
    final Collator collator = hasLocale ? createFromLocale(localeID) : createFromRules(custom, loader);

    // Strength: left at the collator's default unless explicitly requested.
    if (strength != null) {
        final int level;
        if (strength.equalsIgnoreCase("primary")) {
            level = Collator.PRIMARY;
        } else if (strength.equalsIgnoreCase("secondary")) {
            level = Collator.SECONDARY;
        } else if (strength.equalsIgnoreCase("tertiary")) {
            level = Collator.TERTIARY;
        } else if (strength.equalsIgnoreCase("quaternary")) {
            level = Collator.QUATERNARY;
        } else if (strength.equalsIgnoreCase("identical")) {
            level = Collator.IDENTICAL;
        } else {
            throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid strength: " + strength);
        }
        collator.setStrength(level);
    }

    // Decomposition: left at the collator's default unless explicitly requested.
    if (decomposition != null) {
        final int mode;
        if (decomposition.equalsIgnoreCase("no")) {
            mode = Collator.NO_DECOMPOSITION;
        } else if (decomposition.equalsIgnoreCase("canonical")) {
            mode = Collator.CANONICAL_DECOMPOSITION;
        } else {
            throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid decomposition: " + decomposition);
        }
        collator.setDecomposition(mode);
    }

    // Expert options live on RuleBasedCollator; concrete subclasses are always a RuleBasedCollator.
    final RuleBasedCollator ruleBased = (RuleBasedCollator) collator;

    if (alternate != null) {
        final boolean shifted = alternate.equalsIgnoreCase("shifted");
        if (!shifted && !alternate.equalsIgnoreCase("non-ignorable")) {
            throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid alternate: " + alternate);
        }
        ruleBased.setAlternateHandlingShifted(shifted);
    }

    if (caseLevel != null) {
        ruleBased.setCaseLevel(Boolean.parseBoolean(caseLevel));
    }

    if (caseFirst != null) {
        if (caseFirst.equalsIgnoreCase("lower")) {
            ruleBased.setLowerCaseFirst(true);
        } else if (caseFirst.equalsIgnoreCase("upper")) {
            ruleBased.setUpperCaseFirst(true);
        } else {
            throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid caseFirst: " + caseFirst);
        }
    }

    if (numeric != null) {
        ruleBased.setNumericCollation(Boolean.parseBoolean(numeric));
    }

    if (variableTop != null) {
        ruleBased.setVariableTop(variableTop);
    }

    analyzer = new ICUCollationKeyAnalyzer(collator);
}
Also used : RuleBasedCollator(com.ibm.icu.text.RuleBasedCollator) SolrException(org.apache.solr.common.SolrException) ICUCollationKeyAnalyzer(org.apache.lucene.collation.ICUCollationKeyAnalyzer) Collator(com.ibm.icu.text.Collator) RuleBasedCollator(com.ibm.icu.text.RuleBasedCollator)

Example 3 with RuleBasedCollator

use of com.ibm.icu.text.RuleBasedCollator in project lucene-solr by apache.

the class ICUCollationField method createFromRules.

/**
 * Builds a {@link RuleBasedCollator} from a rules file.
 * <p>
 * The whole resource is read as UTF-8 text and passed to the collator verbatim, so the
 * file cannot carry comments: a {@code #} may legitimately appear in the rules.
 *
 * @param fileName name of the resource holding the collation rules
 * @param loader   loader used to open {@code fileName}
 * @return a collator built from the file's contents
 * @throws RuntimeException wrapping any failure to read the resource or parse the rules
 */
static Collator createFromRules(String fileName, ResourceLoader loader) {
    InputStream ruleStream = null;
    try {
        ruleStream = loader.openResource(fileName);
        final String ruleText = IOUtils.toString(ruleStream, "UTF-8");
        final Collator fromRules = new RuleBasedCollator(ruleText);
        return fromRules;
    } catch (Exception cause) {
        // Covers both I/O errors and invalid rules.
        throw new RuntimeException(cause);
    } finally {
        // Best-effort close; also safe when the resource was never opened.
        IOUtils.closeQuietly(ruleStream);
    }
}
Also used : RuleBasedCollator(com.ibm.icu.text.RuleBasedCollator) InputStream(java.io.InputStream) SolrException(org.apache.solr.common.SolrException) IOException(java.io.IOException)

Example 4 with RuleBasedCollator

use of com.ibm.icu.text.RuleBasedCollator in project lucene-solr by apache.

the class TestICUCollationFieldDocValues method setupSolrHome.

/**
 * Creates a temporary Solr home containing a {@code collection1} core configured for the
 * ICU collation tests, and returns its absolute path.
 * <p>
 * Ugly: but what to do? We want to test custom sort, which reads rules in as a resource.
 * These are largish files, and jvm-specific (as our documentation says, you should always
 * look out for jvm differences with collation).
 * So it's preferable to create this file on-the-fly.
 *
 * @return absolute path of the generated Solr home directory
 * @throws Exception if the directories or files cannot be created, or the rules are invalid
 */
public static String setupSolrHome() throws Exception {
    File tmpFile = createTempDir().toFile();
    // make data and conf dirs
    new File(tmpFile + "/collection1", "data").mkdirs();
    File confDir = new File(tmpFile + "/collection1", "conf");
    confDir.mkdirs();
    // copy over configuration files
    FileUtils.copyFile(getFile("analysis-extras/solr/collection1/conf/solrconfig-icucollate.xml"), new File(confDir, "solrconfig.xml"));
    FileUtils.copyFile(getFile("analysis-extras/solr/collection1/conf/schema-icucollate-dv.xml"), new File(confDir, "schema.xml"));
    // generate custom collation rules (DIN 5007-2), saving to customrules.dat
    RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new ULocale("de", "DE"));
    // The last rule must target the upper-case "Ü": repeating the lower-case "ü" would
    // re-tailor "ü" relative to "UE" and leave "Ü" without any tailoring at all.
    String DIN5007_2_tailorings = "& ae , ä & AE , Ä" + "& oe , ö & OE , Ö" + "& ue , ü & UE , Ü";
    RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings);
    String tailoredRules = tailoredCollator.getRules();
    // try-with-resources guarantees the stream is closed even if the write fails
    try (FileOutputStream os = new FileOutputStream(new File(confDir, "customrules.dat"))) {
        IOUtils.write(tailoredRules, os, "UTF-8");
    }
    return tmpFile.getAbsolutePath();
}
Also used : RuleBasedCollator(com.ibm.icu.text.RuleBasedCollator) ULocale(com.ibm.icu.util.ULocale) FileOutputStream(java.io.FileOutputStream) File(java.io.File)

Example 5 with RuleBasedCollator

use of com.ibm.icu.text.RuleBasedCollator in project es6draft by anba.

the class CollatorObject method createCollator.

/**
 * Builds the ICU collator described by this object's {@code locale}, {@code usage},
 * {@code numeric}, {@code caseFirst}, {@code sensitivity} and {@code ignorePunctuation}
 * fields.
 *
 * @return a newly configured collator
 */
private Collator createCollator() {
    final ULocale requested = ULocale.forLanguageTag(this.locale);
    // "search" usage cannot be set through unicode extensions (u-co-search), handle here:
    final ULocale effective = "search".equals(usage)
            ? requested.setKeywordValue("collation", "search")
            : requested;

    final RuleBasedCollator configured = (RuleBasedCollator) Collator.getInstance(effective);
    configured.setDecomposition(Collator.CANONICAL_DECOMPOSITION);
    configured.setNumericCollation(numeric);

    // Receiver-first equals() is deliberate here and below: a null option fails fast
    // with a NullPointerException instead of being silently skipped.
    if (caseFirst.equals("upper")) {
        configured.setUpperCaseFirst(true);
    } else if (caseFirst.equals("lower")) {
        configured.setLowerCaseFirst(true);
    }
    // Any other caseFirst value leaves the collator's case ordering untouched.

    if (sensitivity.equals("base")) {
        configured.setStrength(Collator.PRIMARY);
    } else if (sensitivity.equals("accent")) {
        configured.setStrength(Collator.SECONDARY);
    } else if (sensitivity.equals("case")) {
        // Primary strength plus the case level: case is compared, accents are not.
        configured.setStrength(Collator.PRIMARY);
        configured.setCaseLevel(true);
    } else if (sensitivity.equals("variant")) {
        configured.setStrength(Collator.TERTIARY);
    }
    // Any other sensitivity value leaves the collator's strength untouched.

    configured.setAlternateHandlingShifted(ignorePunctuation);
    return configured;
}
Also used : RuleBasedCollator(com.ibm.icu.text.RuleBasedCollator) ULocale(com.ibm.icu.util.ULocale)

Aggregations

RuleBasedCollator (com.ibm.icu.text.RuleBasedCollator)6 ULocale (com.ibm.icu.util.ULocale)4 Collator (com.ibm.icu.text.Collator)2 File (java.io.File)2 FileOutputStream (java.io.FileOutputStream)2 SolrException (org.apache.solr.common.SolrException)2 IOException (java.io.IOException)1 InputStream (java.io.InputStream)1 FilesystemResourceLoader (org.apache.lucene.analysis.util.FilesystemResourceLoader)1 ResourceLoader (org.apache.lucene.analysis.util.ResourceLoader)1 StringMockResourceLoader (org.apache.lucene.analysis.util.StringMockResourceLoader)1 ICUCollationKeyAnalyzer (org.apache.lucene.collation.ICUCollationKeyAnalyzer)1 Settings (org.elasticsearch.common.settings.Settings)1 Index (org.elasticsearch.index.Index)1 AnalysisICUPlugin (org.elasticsearch.plugin.analysis.icu.AnalysisICUPlugin)1