Examples with UnicodeSet - android.icu.text.UnicodeSet

Example 6 with UnicodeSet

Use of android.icu.text.UnicodeSet in project j2objc by Google.

From the class CollationBuilder, method addTailComposites.

/**
 * Adds mappings for the composites whose decomposition starts with the last starter
 * (combining class 0) of {@code nfdString}.
 *
 * <p>Returns without adding anything when the string has no starter, when the last
 * starter is a Hangul Jamo L, or when no canonical start set exists for that starter.
 *
 * @param nfdPrefix prefix handed through unchanged to the CE lookup and to the mappings added
 * @param nfdString string whose trailing starter and following marks are merged with composites
 */
private void addTailComposites(CharSequence nfdPrefix, CharSequence nfdString) {
    // Scan backwards from the end of the string for the last code point with combining class 0.
    int starterLimit = nfdString.length();
    int starter;
    while (true) {
        if (starterLimit == 0) {
            // Reached the front of the string: there is no starter at all.
            return;
        }
        starter = Character.codePointBefore(nfdString, starterLimit);
        if (nfd.getCombiningClass(starter) == 0) {
            break;
        }
        starterLimit -= Character.charCount(starter);
    }

    // Hangul syllables are decomposed on the fly, so no closure is built for them.
    if (Hangul.isJamoL(starter)) {
        return;
    }

    // Collect the composites whose decomposition starts with the last starter.
    // Note: Normalizer2Impl does not currently return start sets for NFC_QC=Maybe characters.
    // We might find some more equivalent mappings here if it did.
    UnicodeSet startSet = new UnicodeSet();
    if (!nfcImpl.getCanonStartSet(starter, startSet)) {
        return;
    }

    StringBuilder mergedNFD = new StringBuilder();
    StringBuilder mergedString = new StringBuilder();
    long[] ces = new long[Collation.MAX_EXPANSION_LENGTH];
    for (UnicodeSetIterator it = new UnicodeSetIterator(startSet); it.next(); ) {
        assert (it.codepoint != UnicodeSetIterator.IS_STRING);
        int composite = it.codepoint;
        String decomp = nfd.getDecomposition(composite);
        boolean merged = mergeCompositeIntoString(
                nfdString, starterLimit, composite, decomp, mergedNFD, mergedString);
        if (!merged) {
            continue;
        }
        int cesLength = dataBuilder.getCEs(nfdPrefix, mergedNFD, ces, 0);
        if (cesLength > Collation.MAX_EXPANSION_LENGTH) {
            // Mappings that cannot be stored are ignored.
            continue;
        }
        // Note: It is possible that the CEs fetched above do not make use of the mapping
        // for which we are adding the tail composites, in which case we might be adding
        // unnecessary mappings.
        // For example, when we add tail composites for ae^ (^=combining circumflex),
        // UCA discontiguous-contraction matching does not find any matches
        // for ae_^ (_=any combining diacritic below) *unless* there is also
        // a contraction mapping for ae.
        // Thus, if there is no ae contraction, then the ae^ mapping is ignored
        // while fetching the CEs for ae_^.
        // TODO: Try to detect this effectively.
        // (Alternatively, print a warning when prefix contractions are missing.)
        //
        // No explicit mapping is needed for the NFD strings themselves.
        // It is fine if the NFD input collates like this via a sequence of mappings.
        // That also saves a little space and may reduce the set of characters with contractions.
        int ce32 = addIfDifferent(nfdPrefix, mergedString, ces, cesLength, Collation.UNASSIGNED_CE32);
        if (ce32 != Collation.UNASSIGNED_CE32) {
            // The mapping was different and has been added; close over it as well.
            addOnlyClosure(nfdPrefix, mergedNFD, ces, cesLength, ce32);
        }
    }
}

Also used : UnicodeSetIterator(android.icu.text.UnicodeSetIterator) UnicodeSet(android.icu.text.UnicodeSet)

Example 7 with UnicodeSet

Use of android.icu.text.UnicodeSet in project j2objc by Google.

From the class AlphabeticIndexTest, method TestSchSt.

/**
 * Test labels with multiple primary weights.
 *
 * <p>Builds a German index with the extra labels Æ, Sch and St, then checks the bucket
 * count and, for each sample name, the bucket index and label reported by both the
 * mutable index and its immutable snapshot.
 */
@Test
public void TestSchSt() {
    AlphabeticIndex index = new AlphabeticIndex(ULocale.GERMAN);
    index.addLabels(new UnicodeSet("[Æ{Sch*}{St*}]"));
    // Expected bucket layout: ... A Æ B-R S Sch St T-Z ...
    ImmutableIndex immIndex = index.buildImmutableIndex();
    assertEquals("getBucketCount()", 31, index.getBucketCount());
    assertEquals("immutable getBucketCount()", 31, immIndex.getBucketCount());

    // Each row: name, expected bucket index, expected bucket label.
    String[][] expectations = {
        { "Adelbert", "1", "A" },
        { "Afrika", "1", "A" },
        { "Æsculap", "2", "Æ" },
        { "Aesthet", "2", "Æ" },
        { "Berlin", "3", "B" },
        { "Rilke", "19", "R" },
        { "Sacher", "20", "S" },
        { "Seiler", "20", "S" },
        { "Sultan", "20", "S" },
        { "Schiller", "21", "Sch" },
        { "Steiff", "22", "St" },
        { "Thomas", "23", "T" }
    };
    List<String> labels = index.getBucketLabels();
    for (String[] row : expectations) {
        String name = row[0];
        int expectedBucket = Integer.parseInt(row[1]);
        String expectedLabel = row[2];

        String indexMsg = "getBucketIndex(" + name + ")";
        assertEquals(indexMsg, expectedBucket, index.getBucketIndex(name));
        assertEquals("immutable " + indexMsg, expectedBucket, immIndex.getBucketIndex(name));

        String labelMsg = "bucket label (" + name + ")";
        assertEquals(labelMsg, expectedLabel, labels.get(index.getBucketIndex(name)));
        assertEquals("immutable " + labelMsg, expectedLabel, immIndex.getBucket(expectedBucket).getLabel());
    }
}

Also used : ImmutableIndex(android.icu.text.AlphabeticIndex.ImmutableIndex) AlphabeticIndex(android.icu.text.AlphabeticIndex) UnicodeSet(android.icu.text.UnicodeSet) Test(org.junit.Test)

Example 8 with UnicodeSet

Use of android.icu.text.UnicodeSet in project j2objc by Google.

From the class AlphabeticIndexTest, method firstStringsInScript.

/**
 * Returns a collection of all the "First" characters of scripts, according to the collation.
 *
 * <p>For every script code the string that sorts lowest under {@code ruleBasedCollator} is kept.
 * Candidates come from {@code TO_TRY} and from the collator's contractions and expansions.
 *
 * @param ruleBasedCollator collator used for all comparisons
 * @return the non-null per-script winners, in script-code order
 */
private static Collection<String> firstStringsInScript(RuleBasedCollator ruleBasedCollator) {
    String[] firstByScript = new String[UScript.CODE_LIMIT];

    for (String candidate : TO_TRY) {
        // Only "real" script characters are wanted, not symbols: skip what sorts before "a".
        if (ruleBasedCollator.compare(candidate, "a") >= 0) {
            int script = UScript.getScript(candidate.codePointAt(0));
            if (firstByScript[script] == null
                    || ruleBasedCollator.compare(candidate, firstByScript[script]) < 0) {
                firstByScript[script] = candidate;
            }
        }
    }

    try {
        UnicodeSet extraStrings = new UnicodeSet();
        UnicodeSet expansions = new UnicodeSet();
        ruleBasedCollator.getContractionsAndExpansions(extraStrings, expansions, true);
        extraStrings.addAll(expansions).removeAll(TO_TRY);
        if (extraStrings.size() != 0) {
            Normalizer2 nfkc = Normalizer2.getNFKCInstance();
            for (String candidate : extraStrings) {
                boolean usable = nfkc.isNormalized(candidate)
                        && ruleBasedCollator.compare(candidate, "9") > 0;
                if (!usable) {
                    continue;
                }
                int script = getFirstRealScript(candidate);
                if (script == UScript.UNKNOWN && !isUnassignedBoundary(candidate)) {
                    continue;
                }
                if (firstByScript[script] == null
                        || ruleBasedCollator.compare(candidate, firstByScript[script]) < 0) {
                    firstByScript[script] = candidate;
                }
            }
        }
    } catch (Exception ignored) {
        // Deliberately best-effort: whatever was collected before the failure is kept.
        // (Original author's remark: why have a checked exception???)
    }

    // TODO: We should not test that we get the same strings, but that we
    // get strings that sort primary-equal to those from the implementation.
    Collection<String> result = new ArrayList<String>();
    for (String first : firstByScript) {
        if (first != null) {
            result.add(first);
        }
    }
    return result;
}

Also used : Normalizer2(android.icu.text.Normalizer2) ArrayList(java.util.ArrayList) UnicodeSet(android.icu.text.UnicodeSet)

Example 9 with UnicodeSet

Use of android.icu.text.UnicodeSet in project j2objc by Google.

From the class AlphabeticIndexTest, method TestInflow.

/**
 * Checks the number of inflow buckets produced for several label combinations.
 *
 * <p>Each test row holds the expected inflow count, the base locale, and then any number of
 * extra {@code ULocale} or {@code UnicodeSet} label sources. Every resulting index must also
 * contain exactly one underflow and one overflow bucket.
 */
@Test
public void TestInflow() {
    Object[][] tests = { { 0, ULocale.ENGLISH }, { 0, ULocale.ENGLISH, new ULocale("el") }, { 1, ULocale.ENGLISH, new ULocale("ru") }, { 0, ULocale.ENGLISH, new ULocale("el"), new UnicodeSet("[\u2C80]"), new ULocale("ru") }, { 0, ULocale.ENGLISH }, { 2, ULocale.ENGLISH, new ULocale("ru"), ULocale.JAPANESE } };
    for (Object[] test : tests) {
        int expected = (Integer) test[0];
        AlphabeticIndex<Double> alphabeticIndex = buildInflowIndex(test);

        // Tally how many buckets of each label type the index contains.
        Counter<AlphabeticIndex.Bucket.LabelType> counter = new Counter<AlphabeticIndex.Bucket.LabelType>();
        for (Bucket<Double> bucket : alphabeticIndex) {
            LabelType labelType = bucket.getLabelType();
            counter.add(labelType, 1);
        }

        String printList = Arrays.asList(test).toString();
        assertEquals(LabelType.UNDERFLOW + "\t" + printList, 1, counter.get(LabelType.UNDERFLOW));
        assertEquals(LabelType.INFLOW + "\t" + printList, expected, counter.get(LabelType.INFLOW));
        if (expected != counter.get(LabelType.INFLOW)) {
            // for debugging: an identical index is rebuilt here (presumably as a place to
            // set a breakpoint); the buckets that get logged come from the first index.
            AlphabeticIndex<Double> indexCharacters2 = buildInflowIndex(test);
            List<Bucket<Double>> buckets = CollectionUtilities.addAll(alphabeticIndex.iterator(), new ArrayList<Bucket<Double>>());
            logln(buckets.toString());
        }
        assertEquals(LabelType.OVERFLOW + "\t" + printList, 1, counter.get(LabelType.OVERFLOW));
    }
}

/**
 * Builds the index described by one {@code TestInflow} row: {@code test[1]} is the base
 * locale and every element from index 2 onwards is an additional label source.
 */
private static AlphabeticIndex<Double> buildInflowIndex(Object[] test) {
    AlphabeticIndex<Double> index = new AlphabeticIndex<Double>((ULocale) test[1]);
    for (int i = 2; i < test.length; ++i) {
        if (test[i] instanceof ULocale) {
            index.addLabels((ULocale) test[i]);
        } else {
            index.addLabels((UnicodeSet) test[i]);
        }
    }
    return index;
}

Also used : ULocale(android.icu.util.ULocale) UnicodeSet(android.icu.text.UnicodeSet) AlphabeticIndex(android.icu.text.AlphabeticIndex) Bucket(android.icu.text.AlphabeticIndex.Bucket) LabelType(android.icu.text.AlphabeticIndex.Bucket.LabelType) Test(org.junit.Test)

Example 10 with UnicodeSet

Use of android.icu.text.UnicodeSet in project j2objc by Google.

From the class CollationMiscTest, method TestImportWithType.

/**
 * Compares a collator built by concatenating the "vi" and "de-u-co-phonebk" rule strings
 * with one built from the equivalent {@code [import ...]} rules.
 *
 * <p>A warning is logged when the tailored sets differ or when a string from the tailored
 * set yields different collation keys under the two collators.
 */
@Test
public void TestImportWithType() {
    try {
        RuleBasedCollator viCollator = (RuleBasedCollator) Collator.getInstance(new ULocale("vi"));
        RuleBasedCollator dePhonebookCollator = (RuleBasedCollator) Collator.getInstance(ULocale.forLanguageTag("de-u-co-phonebk"));
        String concatenatedRules = viCollator.getRules() + dePhonebookCollator.getRules();
        RuleBasedCollator concatenated = new RuleBasedCollator(concatenatedRules);
        RuleBasedCollator imported = new RuleBasedCollator("[import vi][import de-u-co-phonebk]");

        UnicodeSet concatenatedTailored = concatenated.getTailoredSet();
        UnicodeSet importedTailored = imported.getTailoredSet();
        if (!concatenatedTailored.equals(importedTailored)) {
            warnln("Tailored set not equal");
        }

        // Every tailored string must produce the same collation key under both collators.
        UnicodeSetIterator tailoredIter = new UnicodeSetIterator(concatenatedTailored);
        while (tailoredIter.next()) {
            String text = tailoredIter.getString();
            CollationKey concatenatedKey = concatenated.getCollationKey(text);
            CollationKey importedKey = imported.getCollationKey(text);
            if (!concatenatedKey.equals(importedKey)) {
                warnln("Collation key's not equal for " + text);
            }
        }
    } catch (Exception e) {
        // Android patch: Add --omitCollationRules to genrb.
        logln("ERROR: in creation of rule based collator");
        // Android patch end.
    }
}

Also used : UnicodeSetIterator(android.icu.text.UnicodeSetIterator) RuleBasedCollator(android.icu.text.RuleBasedCollator) ULocale(android.icu.util.ULocale) CollationKey(android.icu.text.CollationKey) RawCollationKey(android.icu.text.RawCollationKey) UnicodeSet(android.icu.text.UnicodeSet) Test(org.junit.Test)

Aggregations

UnicodeSet (android.icu.text.UnicodeSet): 158; Test (org.junit.Test): 112; UnicodeSetIterator (android.icu.text.UnicodeSetIterator): 25; Transliterator (android.icu.text.Transliterator): 19; ReplaceableString (android.icu.text.ReplaceableString): 14; ULocale (android.icu.util.ULocale): 13; CaseInsensitiveString (android.icu.util.CaseInsensitiveString): 9; Normalizer2 (android.icu.text.Normalizer2): 7; RuleBasedCollator (android.icu.text.RuleBasedCollator): 7; ArrayList (java.util.ArrayList): 5; HashSet (java.util.HashSet): 5; FilteredNormalizer2 (android.icu.text.FilteredNormalizer2): 4; SpoofChecker (android.icu.text.SpoofChecker): 4; TreeSet (java.util.TreeSet): 4; UnicodeMap (android.icu.dev.util.UnicodeMap): 3; AlphabeticIndex (android.icu.text.AlphabeticIndex): 3; CollationKey (android.icu.text.CollationKey): 3; RawCollationKey (android.icu.text.RawCollationKey): 3; CheckResult (android.icu.text.SpoofChecker.CheckResult): 3; SpanCondition (android.icu.text.UnicodeSet.SpanCondition): 3