Examples with UnicodeSetIterator - android.icu.text.UnicodeSetIterator

Example 21 with UnicodeSetIterator

Use of android.icu.text.UnicodeSetIterator in project j2objc by Google.

In class RoundTripTest, method TestHan.

/**
 * Transliterates every code point of the "zh" exemplar set with Han-Latin and checks that
 * no Han characters survive, then checks that Latin-NumericPinyin output contains no marks
 * after NFC, and finally checks the inverse of Latin-NumericPinyin.
 *
 * <p>If the last check fails, both strings are dumped to {@code numeric-pinyin.log.txt} in
 * the working directory. A missing locale resource only produces a warning.
 *
 * @throws UnsupportedEncodingException if the "UTF8" charset name is rejected when the log
 *     file is opened
 * @throws FileNotFoundException if the log file cannot be created
 */
@Test
public void TestHan() throws UnsupportedEncodingException, FileNotFoundException {
    try {
        UnicodeSet exemplars = LocaleData.getExemplarSet(new ULocale("zh"), 0);
        // create string with all chars
        StringBuffer b = new StringBuffer();
        for (UnicodeSetIterator it = new UnicodeSetIterator(exemplars); it.next(); ) {
            UTF16.append(b, it.codepoint);
        }
        String source = b.toString();
        // transform with Han translit
        Transliterator han = Transliterator.getInstance("Han-Latin");
        String target = han.transliterate(source);
        // now verify that there are no Han characters left
        UnicodeSet allHan = new UnicodeSet("[:han:]");
        assertFalse("No Han must be left after Han-Latin transliteration", allHan.containsSome(target));
        // check the pinyin translit
        Transliterator pn = Transliterator.getInstance("Latin-NumericPinyin");
        String target2 = pn.transliterate(target);
        // verify that there are no marks
        Transliterator nfc = Transliterator.getInstance("nfc");
        String nfced = nfc.transliterate(target2);
        UnicodeSet allMarks = new UnicodeSet("[:mark:]");
        assertFalse("NumericPinyin must contain no marks", allMarks.containsSome(nfced));
        // verify roundtrip
        // NOTE(review): the inverse is applied to `target` (the Han-Latin output), not to
        // `target2` (the NumericPinyin output), and the log below prints `target2` under the
        // "Pinyin-Numeric-Pinyin" label. Behaviour kept as-is; confirm the intended input.
        Transliterator np = pn.getInverse();
        String target3 = np.transliterate(target);
        boolean roundtripOK = target3.equals(target);
        assertTrue("NumericPinyin must roundtrip", roundtripOK);
        if (!roundtripOK) {
            String filename = "numeric-pinyin.log.txt";
            PrintWriter out = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(filename), "UTF8"), 4 * 1024));
            try {
                errln("Creating log file " + new File(filename).getAbsoluteFile());
                out.println("Pinyin:                " + target);
                out.println("Pinyin-Numeric-Pinyin: " + target2);
            } finally {
                // Always release the file handle, even if reporting the error throws.
                out.close();
            }
        }
    } catch (MissingResourceException ex) {
        warnln("Could not load the locale data for fetching the exemplar characters.");
    }
}

Also used : ULocale(android.icu.util.ULocale) MissingResourceException(java.util.MissingResourceException) UnicodeSet(android.icu.text.UnicodeSet) BufferedWriter(java.io.BufferedWriter) UnicodeSetIterator(android.icu.text.UnicodeSetIterator) FileOutputStream(java.io.FileOutputStream) OutputStreamWriter(java.io.OutputStreamWriter) File(java.io.File) Transliterator(android.icu.text.Transliterator) PrintWriter(java.io.PrintWriter) Test(org.junit.Test)

Example 22 with UnicodeSetIterator

Use of android.icu.text.UnicodeSetIterator in project j2objc by Google.

In class RoundTripTest, method TestHangul2.

/**
 * Abbreviated variant of the doubles test: many cases are skipped, but the ones expected to
 * be problematic (if any are) are still exercised.
 *
 * <p>Each entry of the representative Hangul set is passed through Latin-Hangul and its
 * inverse via {@code assertRoundTripTransform}.
 */
@Test
public void TestHangul2() {
    final Transliterator latinToHangul = Transliterator.getInstance("Latin-Hangul");
    final Transliterator hangulToLatin = latinToHangul.getInverse();
    final UnicodeSet samples = getRepresentativeHangul();
    final UnicodeSetIterator cursor = new UnicodeSetIterator(samples);
    while (cursor.next()) {
        final String sample = cursor.getString();
        assertRoundTripTransform("Transform", sample, latinToHangul, hangulToLatin);
    }
}

Also used : UnicodeSetIterator(android.icu.text.UnicodeSetIterator) UnicodeSet(android.icu.text.UnicodeSet) Transliterator(android.icu.text.Transliterator) Test(org.junit.Test)

Example 23 with UnicodeSetIterator

Use of android.icu.text.UnicodeSetIterator in project j2objc by Google.

In class BasicTest, method initSkippables.

/**
 * Fills the D, C, KD and KC entries of {@code skipSets} from property patterns, then removes
 * from the C and KC entries every character that stops being NFC-normalized when one of the
 * {@code NFC_QC=Maybe} characters is appended to it.
 *
 * @param skipSets array indexed by {@code D}, {@code C}, {@code KD} and {@code KC}; the sets
 *     at those indices are modified in place
 * @return the same {@code skipSets} array that was passed in
 */
private static UnicodeSet[] initSkippables(UnicodeSet[] skipSets) {
    skipSets[D].applyPattern("[[:NFD_QC=Yes:]&[:ccc=0:]]", false);
    skipSets[C].applyPattern("[[:NFC_QC=Yes:]&[:ccc=0:]-[:HST=LV:]]", false);
    skipSets[KD].applyPattern("[[:NFKD_QC=Yes:]&[:ccc=0:]]", false);
    skipSets[KC].applyPattern("[[:NFKC_QC=Yes:]&[:ccc=0:]-[:HST=LV:]]", false);

    // The NFC and NFKC sets must lose every character that changes when a back-combining
    // character follows it. Start by collecting the back-combining characters together
    // with their combining classes.
    final UnicodeSet backCombining = new UnicodeSet("[:NFC_QC=Maybe:]");
    final int backCount = backCombining.size();
    final int[] backChars = new int[backCount];
    final int[] backClasses = new int[backCount];
    final UnicodeSetIterator iter = new UnicodeSetIterator(backCombining);
    for (int k = 0; k < backCount; k++) {
        iter.next();
        final int backChar = iter.codepoint;
        backChars[k] = backChar;
        backClasses[k] = UCharacter.getCombiningClass(backChar);
    }

    // Control codes, Han characters and Hangul LVT syllables do not combine forward, so they
    // need no inspection. LV syllables were already excluded by the patterns above.
    final UnicodeSet skippedCandidates = new UnicodeSet("[[:C:][:Unified_Ideograph:][:HST=LVT:]]");
    final UnicodeSet candidates = (UnicodeSet) skipSets[C].clone();
    candidates.removeAll(skippedCandidates);

    // Probe each remaining candidate: append every relevant back-combining character in turn
    // and see whether the pair is still normalized.
    final Normalizer2 nfc = Normalizer2.getNFCInstance();
    final StringBuilder probe = new StringBuilder();
    iter.reset(candidates);
    while (iter.next()) {
        final int candidate = iter.codepoint;
        probe.setLength(0);
        probe.appendCodePoint(candidate);
        final int candidateLength = probe.length();
        final int trailClass =
                UCharacter.getIntPropertyValue(candidate, UProperty.TRAIL_CANONICAL_COMBINING_CLASS);
        for (int k = 0; k < backCount; k++) {
            // When the candidate's decomposition ends in a character with a non-zero
            // combining class, only a follower with a non-zero combining class can change it.
            if (trailClass != 0 && backClasses[k] == 0) {
                continue;
            }
            probe.appendCodePoint(backChars[k]);
            if (!nfc.isNormalized(probe)) {
                skipSets[C].remove(candidate);
                skipSets[KC].remove(candidate);
                break;
            }
            // Drop the follower again so the next one is tested against the bare candidate.
            probe.setLength(candidateLength);
        }
    }
    return skipSets;
}

Also used : UnicodeSetIterator(android.icu.text.UnicodeSetIterator) Normalizer2(android.icu.text.Normalizer2) FilteredNormalizer2(android.icu.text.FilteredNormalizer2) UnicodeSet(android.icu.text.UnicodeSet)

Example 24 with UnicodeSetIterator

Use of android.icu.text.UnicodeSetIterator in project j2objc by Google.

In class BasicTest, method TestSerializedSet.

/**
 * Loads a hand-written serialized set into a {@code USerializedSet}, copies its ranges into
 * a {@code UnicodeSet}, and reports an error for every code point in those ranges that
 * {@code USerializedSet.contains} rejects.
 */
@Test
public void TestSerializedSet() {
    final USerializedSet sset = new USerializedSet();
    final UnicodeSet merged = new UnicodeSet();
    final char[] serialized = {
        0x8007, // length
        3, // bmpLength
        0xc0, 0xfe, 0xfffc, 1, 9, 0x10, 0xfffc
    };
    sset.getSet(serialized, 0);

    // collect all ranges into one set for contiguous output
    final int[] bounds = new int[2];
    final int rangeCount = sset.countRanges();
    for (int r = 0; r < rangeCount; r++) {
        sset.getRange(r, bounds);
        merged.add(bounds[0], bounds[1]);
    }

    // test all of these characters, stopping at the first string entry
    final UnicodeSetIterator ranges = new UnicodeSetIterator(merged);
    while (ranges.nextRange()) {
        if (ranges.codepoint == UnicodeSetIterator.IS_STRING) {
            break;
        }
        final int last = ranges.codepointEnd;
        for (int cp = ranges.codepoint; cp <= last; cp++) {
            if (!sset.contains(cp)) {
                errln("USerializedSet.contains failed for " + Utility.hex(cp, 8));
            }
        }
    }
}

Also used : UnicodeSetIterator(android.icu.text.UnicodeSetIterator) USerializedSet(android.icu.impl.USerializedSet) UnicodeSet(android.icu.text.UnicodeSet) Test(org.junit.Test)

Example 25 with UnicodeSetIterator

Use of android.icu.text.UnicodeSetIterator in project j2objc by Google.

In class CollectionUtilities, method flatten.

/**
 * Flattens the string entries of a Unicode set into the set itself, e.g.
 * {@code [abc{da}]} becomes {@code [abcd]}.
 *
 * <p>The argument is overwritten only when at least one string entry was found; otherwise
 * it is left untouched.
 *
 * @param exemplar1 the set to flatten; modified in place when it contains strings
 * @return {@code exemplar1} itself, for chaining
 */
public static UnicodeSet flatten(UnicodeSet exemplar1) {
    final UnicodeSet flattened = new UnicodeSet();
    boolean sawString = false;
    final UnicodeSetIterator entries = new UnicodeSetIterator(exemplar1);
    while (entries.nextRange()) {
        if (entries.codepoint != UnicodeSetIterator.IS_STRING) {
            // Plain code point range: copy it over unchanged.
            flattened.add(entries.codepoint, entries.codepointEnd);
            continue;
        }
        // String entry: fold it into the result via addAll.
        flattened.addAll(entries.string);
        sawString = true;
    }
    if (sawString) {
        exemplar1.set(flattened);
    }
    return exemplar1;
}

Also used : UnicodeSetIterator(android.icu.text.UnicodeSetIterator) UnicodeSet(android.icu.text.UnicodeSet)

Aggregations

UnicodeSetIterator (android.icu.text.UnicodeSetIterator): 31, UnicodeSet (android.icu.text.UnicodeSet): 25, Test (org.junit.Test): 17, ULocale (android.icu.util.ULocale): 6, Transliterator (android.icu.text.Transliterator): 5, HashSet (java.util.HashSet): 4, RuleBasedCollator (android.icu.text.RuleBasedCollator): 3, CollationData (android.icu.impl.coll.CollationData): 2, CollationKey (android.icu.text.CollationKey): 2, FilteredNormalizer2 (android.icu.text.FilteredNormalizer2): 2, Normalizer2 (android.icu.text.Normalizer2): 2, RawCollationKey (android.icu.text.RawCollationKey): 2, File (java.io.File): 2, FileOutputStream (java.io.FileOutputStream): 2, OutputStreamWriter (java.io.OutputStreamWriter): 2, PrintWriter (java.io.PrintWriter): 2, ArrayList (java.util.ArrayList): 2, Iterator (java.util.Iterator): 2, TreeSet (java.util.TreeSet): 2, Normalizer2Impl (android.icu.impl.Normalizer2Impl): 1