Examples with CanonicalIterator - android.icu.text.CanonicalIterator

Example 1 with CanonicalIterator

use of android.icu.text.CanonicalIterator in project j2objc by google.

In class TransliteratorTest, method TestSourceTargetSet2.

/**
 * Checks, for a fixed list of transliterator rules, the sets reported by
 * {@code Transliterator.getSourceSet()} and {@code getTargetSet()} against sets
 * gathered empirically by running the transliterator over a large collection of
 * test strings.
 *
 * <p>The test strings ({@code disorderedMarks}) are built in three passes: canonical
 * variants of every NFD decomposition, canonical variants of "source + trailing mark"
 * combinations that share a lead character, and a few fixed strings per non-starter.
 * All single code points are added at the end.
 *
 * <p>NOTE(review): the exact semantics of {@code isAtomic}, {@code addSourceTarget}
 * and {@code SetAssert.MISSING_OK} are defined outside this snippet; the comments
 * below only describe how they are called here.
 */
@Test
public void TestSourceTargetSet2() {
    Normalizer2 nfc = Normalizer2.getNFCInstance();
    Normalizer2 nfd = Normalizer2.getNFDInstance();
    // Disabled experiment: collecting NFKD source/target sets.
    // Normalizer2 nfkd = Normalizer2.getInstance(null, "nfkd", Mode.DECOMPOSE);
    // UnicodeSet nfkdSource = new UnicodeSet();
    // UnicodeSet nfkdTarget = new UnicodeSet();
    // for (int i = 0; i <= 0x10FFFF; ++i) {
    // if (nfkd.isInert(i)) {
    // continue;
    // }
    // nfkdSource.add(i);
    // String t = nfkd.getDecomposition(i);
    // if (t != null) {
    // nfkdTarget.addAll(t);
    // } else {
    // nfkdTarget.add(i);
    // }
    // }
    // nfkdSource.freeze();
    // nfkdTarget.freeze();
    // logln("NFKD Source: " + nfkdSource.toPattern(false));
    // logln("NFKD Target: " + nfkdTarget.toPattern(false));
    // Keyed by the first code point of a decomposition:
    //   leadToTrail   -> characters seen after that lead in some decomposition
    //   leadToSources -> code points whose decomposition starts with that lead
    UnicodeMap<UnicodeSet> leadToTrail = new UnicodeMap();
    UnicodeMap<UnicodeSet> leadToSources = new UnicodeMap();
    // Every character whose canonical combining class is not 0.
    UnicodeSet nonStarters = new UnicodeSet("[:^ccc=0:]").freeze();
    // A single iterator instance is re-targeted with setSource() throughout.
    CanonicalIterator can = new CanonicalIterator("");
    // Accumulates all test strings fed to the transliterators further down.
    UnicodeSet disorderedMarks = new UnicodeSet();
    // Pass 1: walk every code point that has an NFD decomposition.
    for (int i = 0; i <= 0x10FFFF; ++i) {
        String s = nfd.getDecomposition(i);
        if (s == null) {
            continue;
        }
        // Add every string the iterator produces for the decomposition.
        can.setSource(s);
        for (String t = can.next(); t != null; t = can.next()) {
            disorderedMarks.add(t);
        }
        // if s has two code points, (or more), add the lead/trail information
        int first = s.codePointAt(0);
        int firstCount = Character.charCount(first);
        // The decomposition is a single code point: there is no trailing part.
        if (s.length() == firstCount)
            continue;
        String trailString = s.substring(firstCount);
        // add all the trail characters
        // (only when the trailing part contains at least one non-starter)
        if (!nonStarters.containsSome(trailString)) {
            continue;
        }
        UnicodeSet trailSet = leadToTrail.get(first);
        if (trailSet == null) {
            leadToTrail.put(first, trailSet = new UnicodeSet());
        }
        // add remaining trails
        trailSet.addAll(trailString);
        // add the sources
        UnicodeSet sourcesSet = leadToSources.get(first);
        if (sourcesSet == null) {
            leadToSources.put(first, sourcesSet = new UnicodeSet());
        }
        sourcesSet.add(i);
    }
    // Pass 2: for each lead, combine every recorded source with every recorded trail
    // and add the iterator results that do not simply end with the appended trail.
    for (Entry<String, UnicodeSet> x : leadToSources.entrySet()) {
        String lead = x.getKey();
        UnicodeSet sources = x.getValue();
        UnicodeSet trailSet = leadToTrail.get(lead);
        for (String source : sources) {
            for (String trail : trailSet) {
                can.setSource(source + trail);
                for (String t = can.next(); t != null; t = can.next()) {
                    if (t.endsWith(trail))
                        continue;
                    disorderedMarks.add(t);
                }
            }
        }
    }
    // Pass 3: two fixed combinations per non-starter (U+0345 before it, U+0323 after it).
    for (String s : nonStarters) {
        disorderedMarks.add("\u0345" + s);
        disorderedMarks.add(s + "\u0323");
        // Diagnostic only: logs when NFC of U+01EC followed by s no longer starts with U+01EC.
        String xx = nfc.normalize("\u01EC" + s);
        if (!xx.startsWith("\u01EC")) {
            logln("??");
        }
    }
    // Disabled experiment: deriving more test strings from NFKD/NFD decompositions.
    // for (int i = 0; i <= 0x10FFFF; ++i) {
    // String s = nfkd.getDecomposition(i);
    // if (s != null) {
    // disorderedMarks.add(s);
    // disorderedMarks.add(nfc.normalize(s));
    // addDerivedStrings(nfc, disorderedMarks, s);
    // }
    // s = nfd.getDecomposition(i);
    // if (s != null) {
    // disorderedMarks.add(s);
    // }
    // if (!nfc.isInert(i)) {
    // if (i == 0x00C0) {
    // logln("\u00C0");
    // }
    // can.setSource(s+"\u0334");
    // for (String t = can.next(); t != null; t = can.next()) {
    // addDerivedStrings(nfc, disorderedMarks, t);
    // }
    // can.setSource(s+"\u0345");
    // for (String t = can.next(); t != null; t = can.next()) {
    // addDerivedStrings(nfc, disorderedMarks, t);
    // }
    // can.setSource(s+"\u0323");
    // for (String t = can.next(); t != null; t = can.next()) {
    // addDerivedStrings(nfc, disorderedMarks, t);
    // }
    // }
    // }
    // The size is logged before the full code point range is added below.
    logln("Test cases: " + disorderedMarks.size());
    disorderedMarks.addAll(0, 0x10FFFF).freeze();
    logln("isInert \u0104 " + nfc.isInert('\u0104'));
    // Each entry is { rule text, unused second slot (always null here) }.
    Object[][] rules = { { ":: [:sc=COMMON:] any-name;", null }, { ":: [:Greek:] hex-any/C;", null }, { ":: [:Greek:] any-hex/C;", null }, { ":: [[:Mn:][:Me:]] remove;", null }, { ":: [[:Mn:][:Me:]] null;", null }, { ":: lower;", null }, { ":: upper;", null }, { ":: title;", null }, { ":: CaseFold;", null }, { ":: NFD;", null }, { ":: NFC;", null }, { ":: NFKD;", null }, { ":: NFKC;", null }, { ":: [[:Mn:][:Me:]] NFKD;", null }, { ":: Latin-Greek;", null }, { ":: [:Latin:] NFKD;", null }, { ":: NFKD;", null }, { ":: NFKD;\n" + ":: [[:Mn:][:Me:]] remove;\n" + ":: NFC;", null } };
    for (Object[] rulex : rules) {
        String rule = (String) rulex[0];
        Transliterator trans = Transliterator.createFromRules("temp", rule, Transliterator.FORWARD);
        // Sets reported by the transliterator itself.
        UnicodeSet actualSource = trans.getSourceSet();
        UnicodeSet actualTarget = trans.getTargetSet();
        // Sets filled in from observed transformations.
        UnicodeSet empiricalSource = new UnicodeSet();
        UnicodeSet empiricalTarget = new UnicodeSet();
        // Multi-line rules are flattened with tabs for the assertion messages.
        String ruleDisplay = rule.replace("\n", "\t\t");
        UnicodeSet toTest = disorderedMarks;
        // if (rulex[1] != null) {
        // toTest = new UnicodeSet(disorderedMarks);
        // toTest.addAll((UnicodeSet) rulex[1]);
        // }
        // NFD form of U+0104; only used to log when that string is encountered.
        String test = nfd.normalize("\u0104");
        boolean DEBUG = true;
        @SuppressWarnings("unused") int // for debugging
        count = 0;
        for (String s : toTest) {
            if (s.equals(test)) {
                logln(test);
            }
            String t = trans.transform(s);
            // Only strings that the transliterator actually changes are considered.
            if (!s.equals(t)) {
                if (!isAtomic(s, t, trans)) {
                    // NOTE(review): result discarded; presumably a hook for stepping
                    // into isAtomic with a debugger -- confirm before removing.
                    isAtomic(s, t, trans);
                    continue;
                }
                // }
                // Counts strings not covered by the reported sets; the counter is never read.
                if (DEBUG) {
                    if (!actualSource.containsAll(s)) {
                        count++;
                    }
                    if (!actualTarget.containsAll(t)) {
                        count++;
                    }
                }
                addSourceTarget(s, empiricalSource, t, empiricalTarget);
            }
        }
        assertEquals("getSource(" + ruleDisplay + ")", empiricalSource, actualSource, SetAssert.MISSING_OK);
        assertEquals("getTarget(" + ruleDisplay + ")", empiricalTarget, actualTarget, SetAssert.MISSING_OK);
    }
}

Also used : Normalizer2(android.icu.text.Normalizer2) CaseInsensitiveString(android.icu.util.CaseInsensitiveString) ReplaceableString(android.icu.text.ReplaceableString) UnicodeSet(android.icu.text.UnicodeSet) CanonicalIterator(android.icu.text.CanonicalIterator) UnicodeMap(android.icu.dev.util.UnicodeMap) Transliterator(android.icu.text.Transliterator) Test(org.junit.Test)

Example 2 with CanonicalIterator

use of android.icu.text.CanonicalIterator in project j2objc by google.

In class CollationBuilder, method addOnlyClosure.

/**
 * Walks the strings produced by {@link CanonicalIterator} for the given prefix and
 * string and passes each accepted (prefix, string) pair to {@code addIfDifferent}.
 *
 * <p>Pairs rejected by {@code ignorePrefix}/{@code ignoreString} are skipped, and so is
 * the pair that is identical to the original {@code nfdPrefix}/{@code nfdString} input.
 *
 * @param nfdPrefix    the prefix; when empty, only the string is iterated and an
 *                     empty prefix is passed on
 * @param nfdString    the string whose variants are enumerated
 * @param newCEs       forwarded unchanged to {@code addIfDifferent}
 * @param newCEsLength forwarded unchanged to {@code addIfDifferent}
 * @param ce32         initial value, threaded through every {@code addIfDifferent} call
 * @return the value returned by the last {@code addIfDifferent} call, or the incoming
 *         {@code ce32} if no pair was accepted
 */
private int addOnlyClosure(CharSequence nfdPrefix, CharSequence nfdString, long[] newCEs, int newCEsLength, int ce32) {
    // TODO: make CanonicalIterator work with CharSequence, or maybe change arguments here to String
    if (nfdPrefix.length() == 0) {
        // No prefix: enumerate the variants of the string alone.
        final String noPrefix = "";
        CanonicalIterator variants = new CanonicalIterator(nfdString.toString());
        for (String candidate = variants.next(); candidate != null; candidate = variants.next()) {
            if (!ignoreString(candidate) && !candidate.contentEquals(nfdString)) {
                ce32 = addIfDifferent(noPrefix, candidate, newCEs, newCEsLength, ce32);
            }
        }
        return ce32;
    }

    // Prefix present: pair every accepted prefix variant with every accepted string variant.
    CanonicalIterator prefixVariants = new CanonicalIterator(nfdPrefix.toString());
    CanonicalIterator stringVariants = new CanonicalIterator(nfdString.toString());
    for (String prefixCandidate = prefixVariants.next();
            prefixCandidate != null;
            prefixCandidate = prefixVariants.next()) {
        if (ignorePrefix(prefixCandidate)) {
            // The string iterator has not been advanced for this prefix, so no reset is needed.
            continue;
        }
        boolean prefixUnchanged = prefixCandidate.contentEquals(nfdPrefix);
        for (String candidate = stringVariants.next(); candidate != null; candidate = stringVariants.next()) {
            boolean isOriginalPair = prefixUnchanged && candidate.contentEquals(nfdString);
            if (!ignoreString(candidate) && !isOriginalPair) {
                ce32 = addIfDifferent(prefixCandidate, candidate, newCEs, newCEsLength, ce32);
            }
        }
        // Rewind so the next prefix sees the full list of string variants again.
        stringVariants.reset();
    }
    return ce32;
}

Also used : CanonicalIterator(android.icu.text.CanonicalIterator)

Example 3 with CanonicalIterator

use of android.icu.text.CanonicalIterator in project j2objc by google.

In class TestCanonicalIterator, method TestExhaustive.

/**
 * Runs {@code characterTest} over every code point that is not unassigned,
 * private-use or surrogate, once for the character alone and once with U+0345
 * appended.
 *
 * <p>A progress line is logged for every 5000th character that is tested.
 * NOTE(review): what {@code characterTest} verifies is defined outside this
 * snippet.
 */
@Test
public void TestExhaustive() {
    int counter = 0;
    // One iterator instance is shared by all characterTest calls.
    CanonicalIterator it = new CanonicalIterator("");
    // Inclusive upper bound so that the final code point U+10FFFF is part of the scan.
    for (int i = 0; i <= Character.MAX_CODE_POINT; ++i) {
        // skip characters we know don't have decomps
        int type = UCharacter.getType(i);
        if (type == Character.UNASSIGNED || type == Character.PRIVATE_USE || type == Character.SURROGATE) {
            continue;
        }
        if ((++counter % 5000) == 0) {
            logln("Testing " + Utility.hex(i, 0));
        }
        String s = UTF16.valueOf(i);
        characterTest(s, i, it);
        characterTest(s + "\u0345", i, it);
    }
}

Also used : CanonicalIterator(android.icu.text.CanonicalIterator) Test(org.junit.Test)

Example 4 with CanonicalIterator

use of android.icu.text.CanonicalIterator in project j2objc by google.

In class TestCanonicalIterator, method TestSpeed.

/**
 * Times repeated full iterations of a {@link CanonicalIterator} over a fixed
 * two-character string and logs the average time per iteration in milliseconds.
 *
 * <p>Does nothing unless the test is running in verbose mode.
 *
 * @return 0 when not verbose; otherwise the summed length of every string the
 *         iterator produced (returned so the loop body has an observable result)
 */
public int TestSpeed() {
    // skip unless verbose
    if (!isVerbose()) {
        return 0;
    }

    final String source = "\uAC01\u0345";
    final int iterations = 10000;
    CanonicalIterator it = new CanonicalIterator(source);

    // A "slow" comparison run (a second iterator with SKIP_ZEROS disabled, timed the
    // same way as below) used to set this value; it is disabled, so the value stays 0
    // and no ratio is appended to the log line.
    double slowDelta = 0;

    // just to keep code from optimizing away.
    int x = 0;

    double start = System.currentTimeMillis();
    for (int pass = 0; pass < iterations; ++pass) {
        it.setSource(source);
        for (String item = it.next(); item != null; item = it.next()) {
            x += item.length();
        }
    }
    double end = System.currentTimeMillis();

    double fastDelta = (end - start) / iterations;
    String ratioSuffix = slowDelta != 0 ? ", " + (fastDelta / slowDelta) : "";
    logln("Fast iteration: " + fastDelta + ratioSuffix);
    return x;
}

Also used : CanonicalIterator(android.icu.text.CanonicalIterator)

Example 5 with CanonicalIterator

use of android.icu.text.CanonicalIterator in project j2objc by google.

In class TestCanonicalIterator, method TestBasic.

/**
 * Basic checks for {@link CanonicalIterator}: the static {@code permute} helper,
 * the set of strings produced for each {@code testArray} sample, {@code reset()},
 * and {@code getSource()}.
 *
 * <p>Each {@code testArray} row is used as { input, expected output string }.
 */
@Test
public void TestBasic() {
    // The former "check build" section (CanonicalIterator.getSafeStart() and
    // getStarts('a') compared against a fixed UnicodeSet) is no longer exercised:
    // it is not interesting anymore as the data is already built beforehand.

    // check permute
    // NOTE: a TreeSet is used to sort the output, which is not guaranteed to be sorted!
    Set results = new TreeSet();
    CanonicalIterator.permute("ABC", false, results);
    expectEqual("Simple permutation ", "", collectionToString(results), "ABC, ACB, BAC, BCA, CAB, CBA");

    // try samples
    SortedSet set = new TreeSet();
    for (int i = 0; i < testArray.length; ++i) {
        String source = testArray[i][0];
        CanonicalIterator it = new CanonicalIterator(source);
        set.clear();

        // Remember the first result (possibly null) and collect all results, sorted.
        String first = it.next();
        for (String result = first; result != null; result = it.next()) {
            set.add(result);
        }
        expectEqual(i + ": ", source, collectionToString(set), testArray[i][1]);

        // After reset() the iterator must start over with the same first result.
        it.reset();
        String afterReset = it.next();
        if (!afterReset.equals(first)) {
            errln("CanonicalIterator.reset() failed");
        }

        // getSource() is expected to be the NFD form of the constructor argument.
        String reportedSource = it.getSource();
        String expectedSource = Normalizer.normalize(source, Normalizer.NFD);
        if (!reportedSource.equals(expectedSource)) {
            errln("CanonicalIterator.getSource() does not return NFD of input source");
        }
    }
}

Also used : CanonicalIterator(android.icu.text.CanonicalIterator) SortedSet(java.util.SortedSet) Set(java.util.Set) TreeSet(java.util.TreeSet) Test(org.junit.Test)

Aggregations

CanonicalIterator (android.icu.text.CanonicalIterator)5 Test (org.junit.Test)3 UnicodeMap (android.icu.dev.util.UnicodeMap)1 Normalizer2 (android.icu.text.Normalizer2)1 ReplaceableString (android.icu.text.ReplaceableString)1 Transliterator (android.icu.text.Transliterator)1 UnicodeSet (android.icu.text.UnicodeSet)1 CaseInsensitiveString (android.icu.util.CaseInsensitiveString)1 Set (java.util.Set)1 SortedSet (java.util.SortedSet)1 TreeSet (java.util.TreeSet)1