Examples with UnicodeSet - android.icu.text.UnicodeSet

Example 71 with UnicodeSet

use of android.icu.text.UnicodeSet in project j2objc by google.

the class StringTokenizer method nextToken.

/**
 * Returns the next token in this string tokenizer's string. First,
 * the set of characters considered to be delimiters by this
 * <tt>StringTokenizer</tt> object is changed to be the characters in
 * the string <tt>delim</tt>. Then the next token in the string
 * after the current position is returned. The current position is
 * advanced beyond the recognized token.  The new delimiter set
 * remains the default after this call.
 * @param delim the new delimiters.
 * @return the next token, after switching to the new delimiter set.
 * @exception NoSuchElementException if there are no more tokens in
 *            this tokenizer's string.
 */
public String nextToken(String delim) {
    m_delimiters_ = EMPTY_DELIMITER_;
    if (delim != null && delim.length() > 0) {
        m_delimiters_ = new UnicodeSet();
        m_delimiters_.addAll(delim);
    }
    return nextToken(m_delimiters_);
}

Also used : UnicodeSet(android.icu.text.UnicodeSet)

Example 72 with UnicodeSet

use of android.icu.text.UnicodeSet in project j2objc by google.

the class TransliteratorTest method TestCompoundFilter.

/**
 * Compound filter semantics were orginially not implemented
 * correctly.  Originally, each component filter f(i) is replaced by
 * f'(i) = f(i) && g, where g is the filter for the compound
 * transliterator.
 *
 * From Mark:
 *
 * Suppose and I have a transliterator X. Internally X is
 * "Greek-Latin; Latin-Cyrillic; Any-Lower". I use a filter [^A].
 *
 * The compound should convert all greek characters (through latin) to
 * cyrillic, then lowercase the result. The filter should say "don't
 * touch 'A' in the original". But because an intermediate result
 * happens to go through "A", the Greek Alpha gets hung up.
 */
@Test
public void TestCompoundFilter() {
    Transliterator t = Transliterator.getInstance("Greek-Latin; Latin-Greek; Lower", Transliterator.FORWARD);
    t.setFilter(new UnicodeSet("[^A]"));
    // Only the 'A' at index 1 should remain unchanged
    expect(t, CharsToUnicodeString("BA\\u039A\\u0391"), CharsToUnicodeString("\\u03b2A\\u03ba\\u03b1"));
}

Also used : UnicodeSet(android.icu.text.UnicodeSet) Transliterator(android.icu.text.Transliterator) Test(org.junit.Test)

Example 73 with UnicodeSet

use of android.icu.text.UnicodeSet in project j2objc by google.

the class TransliteratorTest method TestSourceTargetSet2.

@Test
public void TestSourceTargetSet2() {
    Normalizer2 nfc = Normalizer2.getNFCInstance();
    Normalizer2 nfd = Normalizer2.getNFDInstance();
    // Normalizer2 nfkd = Normalizer2.getInstance(null, "nfkd", Mode.DECOMPOSE);
    // UnicodeSet nfkdSource = new UnicodeSet();
    // UnicodeSet nfkdTarget = new UnicodeSet();
    // for (int i = 0; i <= 0x10FFFF; ++i) {
    // if (nfkd.isInert(i)) {
    // continue;
    // }
    // nfkdSource.add(i);
    // String t = nfkd.getDecomposition(i);
    // if (t != null) {
    // nfkdTarget.addAll(t);
    // } else {
    // nfkdTarget.add(i);
    // }
    // }
    // nfkdSource.freeze();
    // nfkdTarget.freeze();
    // logln("NFKD Source: " + nfkdSource.toPattern(false));
    // logln("NFKD Target: " + nfkdTarget.toPattern(false));
    UnicodeMap<UnicodeSet> leadToTrail = new UnicodeMap();
    UnicodeMap<UnicodeSet> leadToSources = new UnicodeMap();
    UnicodeSet nonStarters = new UnicodeSet("[:^ccc=0:]").freeze();
    CanonicalIterator can = new CanonicalIterator("");
    UnicodeSet disorderedMarks = new UnicodeSet();
    for (int i = 0; i <= 0x10FFFF; ++i) {
        String s = nfd.getDecomposition(i);
        if (s == null) {
            continue;
        }
        can.setSource(s);
        for (String t = can.next(); t != null; t = can.next()) {
            disorderedMarks.add(t);
        }
        // if s has two code points, (or more), add the lead/trail information
        int first = s.codePointAt(0);
        int firstCount = Character.charCount(first);
        if (s.length() == firstCount)
            continue;
        String trailString = s.substring(firstCount);
        // add all the trail characters
        if (!nonStarters.containsSome(trailString)) {
            continue;
        }
        UnicodeSet trailSet = leadToTrail.get(first);
        if (trailSet == null) {
            leadToTrail.put(first, trailSet = new UnicodeSet());
        }
        // add remaining trails
        trailSet.addAll(trailString);
        // add the sources
        UnicodeSet sourcesSet = leadToSources.get(first);
        if (sourcesSet == null) {
            leadToSources.put(first, sourcesSet = new UnicodeSet());
        }
        sourcesSet.add(i);
    }
    for (Entry<String, UnicodeSet> x : leadToSources.entrySet()) {
        String lead = x.getKey();
        UnicodeSet sources = x.getValue();
        UnicodeSet trailSet = leadToTrail.get(lead);
        for (String source : sources) {
            for (String trail : trailSet) {
                can.setSource(source + trail);
                for (String t = can.next(); t != null; t = can.next()) {
                    if (t.endsWith(trail))
                        continue;
                    disorderedMarks.add(t);
                }
            }
        }
    }
    for (String s : nonStarters) {
        disorderedMarks.add("\u0345" + s);
        disorderedMarks.add(s + "\u0323");
        String xx = nfc.normalize("\u01EC" + s);
        if (!xx.startsWith("\u01EC")) {
            logln("??");
        }
    }
    // for (int i = 0; i <= 0x10FFFF; ++i) {
    // String s = nfkd.getDecomposition(i);
    // if (s != null) {
    // disorderedMarks.add(s);
    // disorderedMarks.add(nfc.normalize(s));
    // addDerivedStrings(nfc, disorderedMarks, s);
    // }
    // s = nfd.getDecomposition(i);
    // if (s != null) {
    // disorderedMarks.add(s);
    // }
    // if (!nfc.isInert(i)) {
    // if (i == 0x00C0) {
    // logln("\u00C0");
    // }
    // can.setSource(s+"\u0334");
    // for (String t = can.next(); t != null; t = can.next()) {
    // addDerivedStrings(nfc, disorderedMarks, t);
    // }
    // can.setSource(s+"\u0345");
    // for (String t = can.next(); t != null; t = can.next()) {
    // addDerivedStrings(nfc, disorderedMarks, t);
    // }
    // can.setSource(s+"\u0323");
    // for (String t = can.next(); t != null; t = can.next()) {
    // addDerivedStrings(nfc, disorderedMarks, t);
    // }
    // }
    // }
    logln("Test cases: " + disorderedMarks.size());
    disorderedMarks.addAll(0, 0x10FFFF).freeze();
    logln("isInert \u0104 " + nfc.isInert('\u0104'));
    Object[][] rules = { { ":: [:sc=COMMON:] any-name;", null }, { ":: [:Greek:] hex-any/C;", null }, { ":: [:Greek:] any-hex/C;", null }, { ":: [[:Mn:][:Me:]] remove;", null }, { ":: [[:Mn:][:Me:]] null;", null }, { ":: lower;", null }, { ":: upper;", null }, { ":: title;", null }, { ":: CaseFold;", null }, { ":: NFD;", null }, { ":: NFC;", null }, { ":: NFKD;", null }, { ":: NFKC;", null }, { ":: [[:Mn:][:Me:]] NFKD;", null }, { ":: Latin-Greek;", null }, { ":: [:Latin:] NFKD;", null }, { ":: NFKD;", null }, { ":: NFKD;\n" + ":: [[:Mn:][:Me:]] remove;\n" + ":: NFC;", null } };
    for (Object[] rulex : rules) {
        String rule = (String) rulex[0];
        Transliterator trans = Transliterator.createFromRules("temp", rule, Transliterator.FORWARD);
        UnicodeSet actualSource = trans.getSourceSet();
        UnicodeSet actualTarget = trans.getTargetSet();
        UnicodeSet empiricalSource = new UnicodeSet();
        UnicodeSet empiricalTarget = new UnicodeSet();
        String ruleDisplay = rule.replace("\n", "\t\t");
        UnicodeSet toTest = disorderedMarks;
        // if (rulex[1] != null) {
        // toTest = new UnicodeSet(disorderedMarks);
        // toTest.addAll((UnicodeSet) rulex[1]);
        // }
        String test = nfd.normalize("\u0104");
        boolean DEBUG = true;
        @SuppressWarnings("unused") int // for debugging
        count = 0;
        for (String s : toTest) {
            if (s.equals(test)) {
                logln(test);
            }
            String t = trans.transform(s);
            if (!s.equals(t)) {
                if (!isAtomic(s, t, trans)) {
                    isAtomic(s, t, trans);
                    continue;
                }
                // }
                if (DEBUG) {
                    if (!actualSource.containsAll(s)) {
                        count++;
                    }
                    if (!actualTarget.containsAll(t)) {
                        count++;
                    }
                }
                addSourceTarget(s, empiricalSource, t, empiricalTarget);
            }
        }
        assertEquals("getSource(" + ruleDisplay + ")", empiricalSource, actualSource, SetAssert.MISSING_OK);
        assertEquals("getTarget(" + ruleDisplay + ")", empiricalTarget, actualTarget, SetAssert.MISSING_OK);
    }
}

Also used : Normalizer2(android.icu.text.Normalizer2) CaseInsensitiveString(android.icu.util.CaseInsensitiveString) ReplaceableString(android.icu.text.ReplaceableString) UnicodeSet(android.icu.text.UnicodeSet) CanonicalIterator(android.icu.text.CanonicalIterator) UnicodeMap(android.icu.dev.util.UnicodeMap) Transliterator(android.icu.text.Transliterator) Test(org.junit.Test)

Example 74 with UnicodeSet

use of android.icu.text.UnicodeSet in project j2objc by google.

the class TransliteratorTest method checkRegistry.

private void checkRegistry(String id) {
    Transliterator fie = Transliterator.getInstance(id);
    final UnicodeSet fae = new UnicodeSet("[a-z5]");
    fie.setFilter(fae);
    Transliterator foe = Transliterator.getInstance(id);
    UnicodeFilter fee = foe.getFilter();
    if (fae.equals(fee)) {
        errln("Changed what is in registry for " + id);
    }
}

Also used : UnicodeFilter(android.icu.text.UnicodeFilter) UnicodeSet(android.icu.text.UnicodeSet) Transliterator(android.icu.text.Transliterator)

Example 75 with UnicodeSet

use of android.icu.text.UnicodeSet in project j2objc by google.

the class TransliteratorTest method TestGurmukhiDevanagari.

/**
 * Test Gurmukhi-Devanagari Tippi and Bindi
 */
@Test
public void TestGurmukhiDevanagari() {
    // the rule says:
    // (\u0902) (when preceded by vowel)      --->  (\u0A02)
    // (\u0902) (when preceded by consonant)  --->  (\u0A70)
    UnicodeSet vowel = new UnicodeSet("[\u0905-\u090A \u090F\u0910\u0913\u0914 \u093e-\u0942\u0947\u0948\u094B\u094C\u094D]");
    UnicodeSet non_vowel = new UnicodeSet("[\u0915-\u0928\u092A-\u0930]");
    UnicodeSetIterator vIter = new UnicodeSetIterator(vowel);
    UnicodeSetIterator nvIter = new UnicodeSetIterator(non_vowel);
    Transliterator trans = Transliterator.getInstance("Devanagari-Gurmukhi");
    StringBuffer src = new StringBuffer(" \u0902");
    StringBuffer expect = new StringBuffer(" \u0A02");
    while (vIter.next()) {
        src.setCharAt(0, (char) vIter.codepoint);
        expect.setCharAt(0, (char) (vIter.codepoint + 0x0100));
        expect(trans, src.toString(), expect.toString());
    }
    expect.setCharAt(1, '\u0A70');
    while (nvIter.next()) {
        // src.setCharAt(0,(char) nvIter.codepoint);
        src.setCharAt(0, (char) nvIter.codepoint);
        expect.setCharAt(0, (char) (nvIter.codepoint + 0x0100));
        expect(trans, src.toString(), expect.toString());
    }
}

Also used : UnicodeSetIterator(android.icu.text.UnicodeSetIterator) UnicodeSet(android.icu.text.UnicodeSet) Transliterator(android.icu.text.Transliterator) Test(org.junit.Test)

Aggregations

UnicodeSet (android.icu.text.UnicodeSet)158 Test (org.junit.Test)112 UnicodeSetIterator (android.icu.text.UnicodeSetIterator)25 Transliterator (android.icu.text.Transliterator)19 ReplaceableString (android.icu.text.ReplaceableString)14 ULocale (android.icu.util.ULocale)13 CaseInsensitiveString (android.icu.util.CaseInsensitiveString)9 Normalizer2 (android.icu.text.Normalizer2)7 RuleBasedCollator (android.icu.text.RuleBasedCollator)7 ArrayList (java.util.ArrayList)5 HashSet (java.util.HashSet)5 FilteredNormalizer2 (android.icu.text.FilteredNormalizer2)4 SpoofChecker (android.icu.text.SpoofChecker)4 TreeSet (java.util.TreeSet)4 UnicodeMap (android.icu.dev.util.UnicodeMap)3 AlphabeticIndex (android.icu.text.AlphabeticIndex)3 CollationKey (android.icu.text.CollationKey)3 RawCollationKey (android.icu.text.RawCollationKey)3 CheckResult (android.icu.text.SpoofChecker.CheckResult)3 SpanCondition (android.icu.text.UnicodeSet.SpanCondition)3