Search in sources :

Example 1 with UnicodeSet

use of com.ibm.icu.text.UnicodeSet in project antlr4 by antlr.

the class UnicodeDataTemplateController method addTR35ExtendedPictographicPropertyCodesToCodePointRanges.

/**
 * Registers three code point sets in {@code propertyCodePointRanges}:
 * <ul>
 *   <li>{@code "Extended_Pictographic"} - built from the hard-coded range table below;</li>
 *   <li>{@code "EmojiRK"} - built from a fixed UnicodeSet pattern (regional indicators,
 *       {@code *}, {@code #}, the digits and a handful of individual symbols);</li>
 *   <li>{@code "EmojiNRK"} - every code point with {@code Emoji=Yes} minus the
 *       {@code "EmojiRK"} set.</li>
 * </ul>
 *
 * @param propertyCodePointRanges map that receives the three entries; any existing
 *        entries under the same keys are replaced by {@code put}
 */
private static void addTR35ExtendedPictographicPropertyCodesToCodePointRanges(Map<String, IntervalSet> propertyCodePointRanges) {
    // Generated using scripts/parse-extended-pictographic/parse.py
    // Each row is either {codePoint} or {first, last} (inclusive); the rows are kept
    // in the order the generator emitted them.
    final int[][] pictographicRanges = {
        { 0x1F774, 0x1F77F },
        { 0x2700, 0x2701 },
        { 0x2703, 0x2704 },
        { 0x270E },
        { 0x2710, 0x2711 },
        { 0x2765, 0x2767 },
        { 0x1F030, 0x1F093 },
        { 0x1F094, 0x1F09F },
        { 0x1F10D, 0x1F10F },
        { 0x1F12F },
        { 0x1F16C, 0x1F16F },
        { 0x1F1AD, 0x1F1E5 },
        { 0x1F260, 0x1F265 },
        { 0x1F203, 0x1F20F },
        { 0x1F23C, 0x1F23F },
        { 0x1F249, 0x1F24F },
        { 0x1F252, 0x1F25F },
        { 0x1F266, 0x1F2FF },
        { 0x1F7D5, 0x1F7FF },
        { 0x1F000, 0x1F003 },
        { 0x1F005, 0x1F02B },
        { 0x1F02C, 0x1F02F },
        { 0x1F322, 0x1F323 },
        { 0x1F394, 0x1F395 },
        { 0x1F398 },
        { 0x1F39C, 0x1F39D },
        { 0x1F3F1, 0x1F3F2 },
        { 0x1F3F6 },
        { 0x1F4FE },
        { 0x1F53E, 0x1F548 },
        { 0x1F54F },
        { 0x1F568, 0x1F56E },
        { 0x1F571, 0x1F572 },
        { 0x1F57B, 0x1F586 },
        { 0x1F588, 0x1F589 },
        { 0x1F58E, 0x1F58F },
        { 0x1F591, 0x1F594 },
        { 0x1F597, 0x1F5A3 },
        { 0x1F5A6, 0x1F5A7 },
        { 0x1F5A9, 0x1F5B0 },
        { 0x1F5B3, 0x1F5BB },
        { 0x1F5BD, 0x1F5C1 },
        { 0x1F5C5, 0x1F5D0 },
        { 0x1F5D4, 0x1F5DB },
        { 0x1F5DF, 0x1F5E0 },
        { 0x1F5E2 },
        { 0x1F5E4, 0x1F5E7 },
        { 0x1F5E9, 0x1F5EE },
        { 0x1F5F0, 0x1F5F2 },
        { 0x1F5F4, 0x1F5F9 },
        { 0x2605 },
        { 0x2607, 0x260D },
        { 0x260F, 0x2610 },
        { 0x2612 },
        { 0x2616, 0x2617 },
        { 0x2619, 0x261C },
        { 0x261E, 0x261F },
        { 0x2621 },
        { 0x2624, 0x2625 },
        { 0x2627, 0x2629 },
        { 0x262B, 0x262D },
        { 0x2630, 0x2637 },
        { 0x263B, 0x2647 },
        { 0x2654, 0x265F },
        { 0x2661, 0x2662 },
        { 0x2664 },
        { 0x2667 },
        { 0x2669, 0x267A },
        { 0x267C, 0x267E },
        { 0x2680, 0x2691 },
        { 0x2695 },
        { 0x2698 },
        { 0x269A },
        { 0x269D, 0x269F },
        { 0x26A2, 0x26A9 },
        { 0x26AC, 0x26AF },
        { 0x26B2, 0x26BC },
        { 0x26BF, 0x26C3 },
        { 0x26C6, 0x26C7 },
        { 0x26C9, 0x26CD },
        { 0x26D0 },
        { 0x26D2 },
        { 0x26D5, 0x26E8 },
        { 0x26EB, 0x26EF },
        { 0x26F6 },
        { 0x26FB, 0x26FC },
        { 0x26FE, 0x26FF },
        { 0x2388 },
        { 0x1FA00, 0x1FFFD },
        { 0x1F0A0, 0x1F0AE },
        { 0x1F0B1, 0x1F0BF },
        { 0x1F0C1, 0x1F0CF },
        { 0x1F0D1, 0x1F0F5 },
        { 0x1F0AF, 0x1F0B0 },
        { 0x1F0C0 },
        { 0x1F0D0 },
        { 0x1F0F6, 0x1F0FF },
        { 0x1F80C, 0x1F80F },
        { 0x1F848, 0x1F84F },
        { 0x1F85A, 0x1F85F },
        { 0x1F888, 0x1F88F },
        { 0x1F8AE, 0x1F8FF },
        { 0x1F900, 0x1F90B },
        { 0x1F91F },
        { 0x1F928, 0x1F92F },
        { 0x1F931, 0x1F932 },
        { 0x1F94C },
        { 0x1F95F, 0x1F96B },
        { 0x1F992, 0x1F997 },
        { 0x1F9D0, 0x1F9E6 },
        { 0x1F90C, 0x1F90F },
        { 0x1F93F },
        { 0x1F94D, 0x1F94F },
        { 0x1F96C, 0x1F97F },
        { 0x1F998, 0x1F9BF },
        { 0x1F9C1, 0x1F9CF },
        { 0x1F9E7, 0x1F9FF },
        { 0x1F6C6, 0x1F6CA },
        { 0x1F6D3, 0x1F6D4 },
        { 0x1F6E6, 0x1F6E8 },
        { 0x1F6EA },
        { 0x1F6F1, 0x1F6F2 },
        { 0x1F6F7, 0x1F6F8 },
        { 0x1F6D5, 0x1F6DF },
        { 0x1F6ED, 0x1F6EF },
        { 0x1F6F9, 0x1F6FF },
    };

    final IntervalSet extendedPictographic = new IntervalSet();
    for (final int[] row : pictographicRanges) {
        if (row.length == 1) {
            // Single code point entry.
            extendedPictographic.add(row[0]);
        } else {
            // Inclusive [first, last] range entry.
            extendedPictographic.add(row[0], row[1]);
        }
    }
    propertyCodePointRanges.put("Extended_Pictographic", extendedPictographic);

    // "EmojiRK": the set described by the fixed pattern below.
    final UnicodeSet rkChars = new UnicodeSet("[\\p{GCB=Regional_Indicator}\\*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]");
    final IntervalSet rkIntervals = new IntervalSet();
    addUnicodeSetToIntervalSet(rkChars, rkIntervals);
    propertyCodePointRanges.put("EmojiRK", rkIntervals);

    // "EmojiNRK": everything with Emoji=Yes that is not already in the "EmojiRK" set.
    final UnicodeSet nrkChars = new UnicodeSet("[\\p{Emoji=Yes}]");
    nrkChars.removeAll(rkChars);
    final IntervalSet nrkIntervals = new IntervalSet();
    addUnicodeSetToIntervalSet(nrkChars, nrkIntervals);
    propertyCodePointRanges.put("EmojiNRK", nrkIntervals);
}
Also used: IntervalSet (org.antlr.v4.runtime.misc.IntervalSet), UnicodeSet (com.ibm.icu.text.UnicodeSet)

Example 2 with UnicodeSet

use of com.ibm.icu.text.UnicodeSet in project lucene-solr by apache.

the class GenerateUTR30DataFiles method expandSingleRule.

/**
 * Expands one rule into mapping lines appended to {@code builder}.
 *
 * <p>{@code leftHandSide} is parsed as a UnicodeSet pattern (whitespace ignored) and the
 * resulting set is walked range by range:
 * <ul>
 *   <li>if {@code rightHandSide} matches {@code NUMERIC_VALUE_PATTERN}, every code point of
 *       the range gets its own line of the form {@code XXXX>YYYY   # NAME}, where
 *       {@code YYYY} is {@code 0x30} plus the code point's numeric value;</li>
 *   <li>otherwise a single line {@code XXXX[..YYYY]>rightHandSide} is written per range.</li>
 * </ul>
 * If the set contains a string element, an error is printed to stderr and the JVM exits
 * with status 1.
 *
 * @param builder       receives the generated lines
 * @param leftHandSide  UnicodeSet pattern describing the source code points
 * @param rightHandSide mapping target, copied verbatim unless it matches the numeric pattern
 * @throws IllegalArgumentException presumably when {@code leftHandSide} is not a valid
 *         UnicodeSet pattern - confirm against the ICU constructor contract
 */
private static void expandSingleRule(StringBuilder builder, String leftHandSide, String rightHandSide) throws IllegalArgumentException {
    final UnicodeSet sources = new UnicodeSet(leftHandSide, UnicodeSet.IGNORE_SPACE);
    final boolean mapsToNumericValue = NUMERIC_VALUE_PATTERN.matcher(rightHandSide).matches();
    final UnicodeSetIterator ranges = new UnicodeSetIterator(sources);

    while (ranges.nextRange()) {
        // String elements cannot be expressed as code point mappings: report and abort.
        if (ranges.codepoint == UnicodeSetIterator.IS_STRING) {
            System.err.println("ERROR: String '" + ranges.getString() + "' found in UnicodeSet");
            System.exit(1);
            continue;
        }

        if (!mapsToNumericValue) {
            // One line per range: "START[..END]>rightHandSide".
            builder.append(String.format(Locale.ROOT, "%04X", ranges.codepoint));
            if (ranges.codepointEnd > ranges.codepoint) {
                builder.append("..");
                builder.append(String.format(Locale.ROOT, "%04X", ranges.codepointEnd));
            }
            builder.append('>');
            builder.append(rightHandSide);
            builder.append("\n");
            continue;
        }

        // Numeric rule: one line per code point, mapped to 0x30 + its numeric value.
        for (int cp = ranges.codepoint; cp <= ranges.codepointEnd; ++cp) {
            builder.append(String.format(Locale.ROOT, "%04X", cp));
            builder.append('>');
            builder.append(String.format(Locale.ROOT, "%04X", 0x30 + UCharacter.getNumericValue(cp)));
            builder.append("   # ");
            builder.append(UCharacter.getName(cp));
            builder.append("\n");
        }
    }
}
Also used : UnicodeSetIterator(com.ibm.icu.text.UnicodeSetIterator) UnicodeSet(com.ibm.icu.text.UnicodeSet)

Example 3 with UnicodeSet

use of com.ibm.icu.text.UnicodeSet in project lucene-solr by apache.

the class TestICUTransformFilter method testOptimizerSurrogate.

/**
 * Checks that wrapping a transliterator whose only rule targets a supplementary
 * code point in an {@code ICUTransformFilter} leaves the transliterator with a
 * filter equal to the set containing exactly that code point.
 */
public void testOptimizerSurrogate() throws Exception {
    // Single rule: rewrite CJK UNIFIED IDEOGRAPH-20087 as "x".
    final String surrogateRule = "\\U00020087 > x;";
    final Transliterator transliterator = Transliterator.createFromRules("test", surrogateRule, Transliterator.FORWARD);

    // Before the filter is constructed, no filter is set on the transliterator.
    assertTrue(transliterator.getFilter() == null);

    final KeywordTokenizer tokenizer = new KeywordTokenizer();
    tokenizer.setReader(new StringReader(""));

    // Constructed only for its effect on the transliterator; the instance itself is unused.
    new ICUTransformFilter(tokenizer, transliterator);

    assertTrue(transliterator.getFilter().equals(new UnicodeSet("[\\U00020087]")));
}
Also used : StringReader(java.io.StringReader) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer) UnicodeSet(com.ibm.icu.text.UnicodeSet) Transliterator(com.ibm.icu.text.Transliterator)

Example 4 with UnicodeSet

use of com.ibm.icu.text.UnicodeSet in project elasticsearch by elastic.

the class IcuFoldingTokenFilterFactory method create.

/**
 * Wraps {@code tokenStream} in a folding filter.
 *
 * <p>Without a configured {@code unicodeSetFilter} a plain {@code ICUFoldingFilter} is
 * returned. With one, the "utr30" normalizer is loaded from the resource next to
 * {@code ICUFoldingFilter}, restricted to the configured set, and applied through an
 * {@code ICUNormalizer2Filter}.
 *
 * @param tokenStream the stream to wrap
 * @return the wrapped stream
 */
@Override
public TokenStream create(TokenStream tokenStream) {
    if (unicodeSetFilter == null) {
        return new ICUFoldingFilter(tokenStream);
    }

    // ICUFoldingFilter lacks a constructor that accepts a filter, so the filtered
    // normalizer is assembled here instead.
    final Normalizer2 utr30 = Normalizer2.getInstance(ICUFoldingFilter.class.getResourceAsStream("utr30.nrm"), "utr30", Normalizer2.Mode.COMPOSE);
    final UnicodeSet filterSet = new UnicodeSet(unicodeSetFilter);
    filterSet.freeze();
    return new org.apache.lucene.analysis.icu.ICUNormalizer2Filter(tokenStream, new FilteredNormalizer2(utr30, filterSet));
}
Also used : FilteredNormalizer2(com.ibm.icu.text.FilteredNormalizer2) Normalizer2(com.ibm.icu.text.Normalizer2) UnicodeSet(com.ibm.icu.text.UnicodeSet) ICUFoldingFilter(org.apache.lucene.analysis.icu.ICUFoldingFilter)

Example 5 with UnicodeSet

use of com.ibm.icu.text.UnicodeSet in project antlr4 by antlr.

the class UnicodeDataTemplateController method addUnicodeBinaryPropertyCodesToCodePointRanges.

/**
 * For every property id in {@code [UProperty.BINARY_START, UProperty.BINARY_LIMIT)},
 * collects the code points for which the property has value 1 and stores them in
 * {@code propertyCodePointRanges} under the name returned by
 * {@code getShortPropertyName} for that id.
 *
 * @param propertyCodePointRanges map that receives one entry per binary property
 */
private static void addUnicodeBinaryPropertyCodesToCodePointRanges(Map<String, IntervalSet> propertyCodePointRanges) {
    int property = UProperty.BINARY_START;
    while (property < UProperty.BINARY_LIMIT) {
        final String shortName = getShortPropertyName(property);
        final IntervalSet codePoints = new IntervalSet();

        // Value 1 selects the code points for which the binary property is set.
        final UnicodeSet members = new UnicodeSet();
        members.applyIntPropertyValue(property, 1);

        // Copy each inclusive range of the UnicodeSet into the IntervalSet.
        for (UnicodeSet.EntryRange span : members.ranges()) {
            codePoints.add(span.codepoint, span.codepointEnd);
        }

        propertyCodePointRanges.put(shortName, codePoints);
        property++;
    }
}
Also used : IntervalSet(org.antlr.v4.runtime.misc.IntervalSet) UnicodeSet(com.ibm.icu.text.UnicodeSet)

Aggregations

UnicodeSet (com.ibm.icu.text.UnicodeSet) 8, IntervalSet (org.antlr.v4.runtime.misc.IntervalSet) 4, Transliterator (com.ibm.icu.text.Transliterator) 2, StringReader (java.io.StringReader) 2, KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer) 2, FilteredNormalizer2 (com.ibm.icu.text.FilteredNormalizer2) 1, Normalizer2 (com.ibm.icu.text.Normalizer2) 1, UnicodeSetIterator (com.ibm.icu.text.UnicodeSetIterator) 1, ICUFoldingFilter (org.apache.lucene.analysis.icu.ICUFoldingFilter) 1