Examples with Normalizer2 - android.icu.text.Normalizer2

Example 16 with Normalizer2

use of android.icu.text.Normalizer2 in project j2objc by google.

the class BasicTest method initSkippables.

/**
 * Populates the per-mode "skippable" sets and hands the same array back.
 *
 * <p>The entries indexed by {@code D}, {@code C}, {@code KD} and {@code KC} (constants declared
 * outside this method) are first set from property patterns. The {@code C} and {@code KC}
 * entries are then pruned: a character is removed from both when appending a single
 * back-combining character ({@code NFC_QC=Maybe}) to it yields a string that the NFC
 * normalizer reports as not normalized.
 *
 * @param skipSets already-constructed sets, modified in place
 * @return the {@code skipSets} array that was passed in
 */
private static UnicodeSet[] initSkippables(UnicodeSet[] skipSets) {
    skipSets[D].applyPattern("[[:NFD_QC=Yes:]&[:ccc=0:]]", false);
    skipSets[C].applyPattern("[[:NFC_QC=Yes:]&[:ccc=0:]-[:HST=LV:]]", false);
    skipSets[KD].applyPattern("[[:NFKD_QC=Yes:]&[:ccc=0:]]", false);
    skipSets[KC].applyPattern("[[:NFKC_QC=Yes:]&[:ccc=0:]-[:HST=LV:]]", false);

    // Gather every back-combining character along with its combining class,
    // kept in two parallel arrays that share the same index.
    final UnicodeSet backCombiners = new UnicodeSet("[:NFC_QC=Maybe:]");
    final int backCount = backCombiners.size();
    final int[] backChars = new int[backCount];
    final int[] backClasses = new int[backCount];
    final UnicodeSetIterator setIter = new UnicodeSetIterator(backCombiners);
    for (int k = 0; k < backCount; k++) {
        setIter.next();
        backChars[k] = setIter.codepoint;
        backClasses[k] = UCharacter.getCombiningClass(setIter.codepoint);
    }

    // Control codes, Han characters and Hangul LVT syllables do not combine forward, so
    // they need no probing. LV syllables were already excluded by the patterns above.
    final UnicodeSet noProbeNeeded = new UnicodeSet("[[:C:][:Unified_Ideograph:][:HST=LVT:]]");
    final UnicodeSet candidates = (UnicodeSet) skipSets[C].clone();
    candidates.removeAll(noProbeNeeded);

    // Probe each remaining candidate: does it change when a back-combining
    // character is appended to it?
    final Normalizer2 nfc = Normalizer2.getNFCInstance();
    final StringBuilder probe = new StringBuilder();
    setIter.reset(candidates);
    while (setIter.next()) {
        final int base = setIter.codepoint;
        probe.setLength(0);
        probe.appendCodePoint(base);
        final int baseLength = probe.length();
        final int trailCc =
                UCharacter.getIntPropertyValue(base, UProperty.TRAIL_CANONICAL_COMBINING_CLASS);
        for (int k = 0; k < backCount; k++) {
            // When the candidate's decomposition ends in a character with a non-zero
            // combining class, only a combiner with a non-zero class can affect it.
            if (trailCc != 0 && backClasses[k] == 0) {
                continue;
            }
            probe.appendCodePoint(backChars[k]);
            if (!nfc.isNormalized(probe)) {
                skipSets[C].remove(base);
                skipSets[KC].remove(base);
                break;
            }
            // Drop the appended combiner again before trying the next one.
            probe.setLength(baseLength);
        }
    }
    return skipSets;
}

Also used : UnicodeSetIterator(android.icu.text.UnicodeSetIterator) Normalizer2(android.icu.text.Normalizer2) FilteredNormalizer2(android.icu.text.FilteredNormalizer2) UnicodeSet(android.icu.text.UnicodeSet)

Example 17 with Normalizer2

use of android.icu.text.Normalizer2 in project j2objc by google.

the class UCharacterTest method TestUnicodeData.

/**
 * Tests for the character types, direction.<br>
 * This method reads in UnicodeData.txt file for testing purposes. A
 * default path is provided relative to the src path, however the user
 * could set a system property to change the directory path.<br>
 * e.g. java -DUnicodeData="data_directory_path"
 * android.icu.dev.test.lang.UCharacterTest
 *
 * <p>For each data line the semicolon-separated fields are compared against the
 * {@code UCharacter} / {@code Normalizer2} APIs: general category (field 2), combining
 * class (field 3), direction (field 4), decomposition type and mapping (field 5),
 * ISO comment (field 11) and upper/lower/title case mappings (fields 12-14).
 * Most mismatches report an error and stop reading the file. After the file has been
 * processed, a few fixed code point ranges are checked for block, unassigned and
 * private-use properties.
 */
@Test
public void TestUnicodeData() {
    // this is the 2 char category types used in the UnicodeData file
    final String TYPE = "LuLlLtLmLoMnMeMcNdNlNoZsZlZpCcCfCoCsPdPsPePcPoSmScSkSoPiPf";
    // directionality types used in the UnicodeData file
    // padded by spaces to make each type size 4
    final String DIR = "L   R   EN  ES  ET  AN  CS  B   S   WS  ON  LRE LRO AL  RLE RLO PDF NSM BN  FSI LRI RLI PDI ";
    Normalizer2 nfc = Normalizer2.getNFCInstance();
    Normalizer2 nfkc = Normalizer2.getNFKCInstance();
    BufferedReader input = null;
    try {
        input = TestUtil.getDataReader("unicode/UnicodeData.txt");
        // counts ISO-comment lookups that failed because unames.icu was not found
        int numErrors = 0;
        for (; ; ) {
            String s = input.readLine();
            if (s == null) {
                break;
            }
            // skip lines that are too short to hold data, and comment lines
            if (s.length() < 4 || s.startsWith("#")) {
                continue;
            }
            // limit -1 keeps trailing empty fields so the field count stays fixed
            String[] fields = s.split(";", -1);
            assert (fields.length == 15) : "Number of fields is " + fields.length + ": " + s;
            int ch = Integer.parseInt(fields[0], 16);
            // testing the general category
            // Each abbreviation in TYPE is 2 chars wide, so index/2 is its position;
            // 0 is used when the abbreviation is not listed in TYPE.
            int type = TYPE.indexOf(fields[2]);
            if (type < 0)
                type = 0;
            else
                type = (type >> 1) + 1;
            if (UCharacter.getType(ch) != type) {
                errln("FAIL \\u" + hex(ch) + " expected type " + type);
                break;
            }
            if (UCharacter.getIntPropertyValue(ch, UProperty.GENERAL_CATEGORY_MASK) != (1 << type)) {
                errln("error: getIntPropertyValue(\\u" + Integer.toHexString(ch) + ", UProperty.GENERAL_CATEGORY_MASK) != " + "getMask(getType(ch))");
            }
            // testing combining class
            int cc = Integer.parseInt(fields[3]);
            if (UCharacter.getCombiningClass(ch) != cc) {
                errln("FAIL \\u" + hex(ch) + " expected combining " + "class " + cc);
                break;
            }
            if (nfkc.getCombiningClass(ch) != cc) {
                errln("FAIL \\u" + hex(ch) + " expected NFKC combining " + "class " + cc);
                break;
            }
            // testing the direction
            // Single-letter codes are padded to the 4-char slot width used in DIR;
            // the slot index (offset / 4) is the expected direction value.
            String d = fields[4];
            if (d.length() == 1)
                d = d + "   ";
            int dir = DIR.indexOf(d) >> 2;
            if (UCharacter.getDirection(ch) != dir) {
                errln("FAIL \\u" + hex(ch) + " expected direction " + dir + " but got " + UCharacter.getDirection(ch));
                break;
            }
            byte bdir = (byte) dir;
            if (UCharacter.getDirectionality(ch) != bdir) {
                errln("FAIL \\u" + hex(ch) + " expected directionality " + bdir + " but got " + UCharacter.getDirectionality(ch));
                break;
            }
            /* get Decomposition_Type & Decomposition_Mapping, field 5 */
            int dt;
            if (fields[5].length() == 0) {
                /* no decomposition, except UnicodeData.txt omits Hangul syllable decompositions */
                if (ch == 0xac00 || ch == 0xd7a3) {
                    dt = UCharacter.DecompositionType.CANONICAL;
                } else {
                    dt = UCharacter.DecompositionType.NONE;
                }
            } else {
                d = fields[5];
                // stays -1 (reported as a syntax error below) if a '<' tag is never closed
                dt = -1;
                if (d.charAt(0) == '<') {
                    // "<type> mapping": the tag names the decomposition type
                    int end = d.indexOf('>', 1);
                    if (end >= 0) {
                        dt = UCharacter.getPropertyValueEnum(UProperty.DECOMPOSITION_TYPE, d.substring(1, end));
                        // skip spaces
                        while (d.charAt(++end) == ' ') {
                        }
                        d = d.substring(end);
                    }
                } else {
                    // no tag: the mapping is a canonical decomposition
                    dt = UCharacter.DecompositionType.CANONICAL;
                }
            }
            String dm;
            if (dt > UCharacter.DecompositionType.NONE) {
                // the two Hangul syllables special-cased above get hard-coded mappings
                if (ch == 0xac00) {
                    dm = "\u1100\u1161";
                } else if (ch == 0xd7a3) {
                    dm = "\ud788\u11c2";
                } else {
                    // the mapping is a space-separated list of hex code points
                    String[] dmChars = d.split(" +");
                    StringBuilder dmb = new StringBuilder(dmChars.length);
                    for (String dmc : dmChars) {
                        dmb.appendCodePoint(Integer.parseInt(dmc, 16));
                    }
                    dm = dmb.toString();
                }
            } else {
                dm = null;
            }
            if (dt < 0) {
                errln(String.format("error in UnicodeData.txt: syntax error in U+%04x decomposition field", ch));
                return;
            }
            int i = UCharacter.getIntPropertyValue(ch, UProperty.DECOMPOSITION_TYPE);
            assertEquals(String.format("error: UCharacter.getIntPropertyValue(U+%04x, UProperty.DECOMPOSITION_TYPE) is wrong", ch), dt, i);
            /* Expect Decomposition_Mapping=nfkc.getRawDecomposition(c). */
            String mapping = nfkc.getRawDecomposition(ch);
            assertEquals(String.format("error: nfkc.getRawDecomposition(U+%04x) is wrong", ch), dm, mapping);
            /* For canonical decompositions only, expect Decomposition_Mapping=nfc.getRawDecomposition(c). */
            if (dt != UCharacter.DecompositionType.CANONICAL) {
                dm = null;
            }
            mapping = nfc.getRawDecomposition(ch);
            assertEquals(String.format("error: nfc.getRawDecomposition(U+%04x) is wrong", ch), dm, mapping);
            /* recompose */
            if (dt == UCharacter.DecompositionType.CANONICAL && !UCharacter.hasBinaryProperty(ch, UProperty.FULL_COMPOSITION_EXCLUSION)) {
                // first and last code point of the mapping
                int a = dm.codePointAt(0);
                int b = dm.codePointBefore(dm.length());
                int composite = nfc.composePair(a, b);
                assertEquals(String.format("error: nfc U+%04X decomposes to U+%04X+U+%04X " + "but does not compose back (instead U+%04X)", ch, a, b, composite), ch, composite);
                /*
                 * Note: NFKC has fewer round-trip mappings than NFC,
                 * so we can't just test nfkc.composePair(a, b) here without further data.
                 */
            }
            // testing iso comment
            try {
                String isocomment = fields[11];
                String comment = UCharacter.getISOComment(ch);
                if (comment == null) {
                    comment = "";
                }
                if (!comment.equals(isocomment)) {
                    errln("FAIL \\u" + hex(ch) + " expected iso comment " + isocomment);
                    break;
                }
            } catch (Exception e) {
                // Only a failure that mentions unames.icu is tolerated (counted and warned
                // about after the loop). getMessage() may be null, so guard it; otherwise a
                // NullPointerException here would replace the original exception.
                String message = e.getMessage();
                if (message != null && message.indexOf("unames.icu") >= 0) {
                    numErrors++;
                } else {
                    throw e;
                }
            }
            // case mappings: an empty field means the character maps to itself
            String upper = fields[12];
            int tempchar = ch;
            if (upper.length() > 0) {
                tempchar = Integer.parseInt(upper, 16);
            }
            int resultCp = UCharacter.toUpperCase(ch);
            if (resultCp != tempchar) {
                errln("FAIL \\u" + Utility.hex(ch, 4) + " expected uppercase \\u" + Utility.hex(tempchar, 4) + " but got \\u" + Utility.hex(resultCp, 4));
                break;
            }
            String lower = fields[13];
            tempchar = ch;
            if (lower.length() > 0) {
                tempchar = Integer.parseInt(lower, 16);
            }
            if (UCharacter.toLowerCase(ch) != tempchar) {
                errln("FAIL \\u" + Utility.hex(ch, 4) + " expected lowercase \\u" + Utility.hex(tempchar, 4));
                break;
            }
            String title = fields[14];
            tempchar = ch;
            if (title.length() > 0) {
                tempchar = Integer.parseInt(title, 16);
            }
            if (UCharacter.toTitleCase(ch) != tempchar) {
                errln("FAIL \\u" + Utility.hex(ch, 4) + " expected titlecase \\u" + Utility.hex(tempchar, 4));
                break;
            }
        }
        if (numErrors > 0) {
            warnln("Could not find unames.icu");
        }
    } catch (Exception e) {
        // NOTE(review): any exception from reading or parsing the data file is only
        // printed, so the test does not fail on it - confirm this is intentional.
        e.printStackTrace();
    } finally {
        if (input != null) {
            try {
                input.close();
            } catch (IOException ignored) {
            }
        }
    }
    if (UCharacter.UnicodeBlock.of(0x0041) != UCharacter.UnicodeBlock.BASIC_LATIN || UCharacter.getIntPropertyValue(0x41, UProperty.BLOCK) != UCharacter.UnicodeBlock.BASIC_LATIN.getID()) {
        errln("UCharacter.UnicodeBlock.of(\\u0041) property failed! " + "Expected : " + UCharacter.UnicodeBlock.BASIC_LATIN.getID() + " got " + UCharacter.UnicodeBlock.of(0x0041));
    }
    // sanity check on repeated properties
    // visits only the code points ending in FFFE and FFFF, from U+FFFE up to U+10FFFF
    for (int ch = 0xfffe; ch <= 0x10ffff; ) {
        int type = UCharacter.getType(ch);
        if (UCharacter.getIntPropertyValue(ch, UProperty.GENERAL_CATEGORY_MASK) != (1 << type)) {
            errln("error: UCharacter.getIntPropertyValue(\\u" + Integer.toHexString(ch) + ", UProperty.GENERAL_CATEGORY_MASK) != " + "getMask(getType())");
        }
        if (type != UCharacterCategory.UNASSIGNED) {
            errln("error: UCharacter.getType(\\u" + Utility.hex(ch, 4) + " != UCharacterCategory.UNASSIGNED (returns " + UCharacterCategory.toString(UCharacter.getType(ch)) + ")");
        }
        if ((ch & 0xffff) == 0xfffe) {
            // xFFFE -> xFFFF
            ++ch;
        } else {
            // xFFFF -> (x+1)FFFE
            ch += 0xffff;
        }
    }
    // test that PUA is not "unassigned"
    // walks U+E000..U+F8FF, U+F0000..U+FFFFD and U+100000..U+10FFFD
    for (int ch = 0xe000; ch <= 0x10fffd; ) {
        int type = UCharacter.getType(ch);
        if (UCharacter.getIntPropertyValue(ch, UProperty.GENERAL_CATEGORY_MASK) != (1 << type)) {
            errln("error: UCharacter.getIntPropertyValue(\\u" + Integer.toHexString(ch) + ", UProperty.GENERAL_CATEGORY_MASK) != " + "getMask(getType())");
        }
        if (type == UCharacterCategory.UNASSIGNED) {
            errln("error: UCharacter.getType(\\u" + Utility.hex(ch, 4) + ") == UCharacterCategory.UNASSIGNED");
        } else if (type != UCharacterCategory.PRIVATE_USE) {
            logln("PUA override: UCharacter.getType(\\u" + Utility.hex(ch, 4) + ")=" + type);
        }
        // jump from the end of one range to the start of the next
        if (ch == 0xf8ff) {
            ch = 0xf0000;
        } else if (ch == 0xffffd) {
            ch = 0x100000;
        } else {
            ++ch;
        }
    }
}

Also used : Normalizer2(android.icu.text.Normalizer2) BufferedReader(java.io.BufferedReader) IOException(java.io.IOException) IOException(java.io.IOException) Test(org.junit.Test)

Example 18 with Normalizer2

use of android.icu.text.Normalizer2 in project j2objc by google.

the class UCharacterTest method TestConsistency.

/*
 * Various tests for consistency of UCD data and API behavior:
 * Hyphen vs. Dash, Cf vs. Hyphen/Dash/Alphabetic, the names of Lowercase characters,
 * and the canonical start set of U+0049 versus NFD decompositions starting with U+0049.
 */
@Test
public void TestConsistency() {
    UnicodeSet set1, set2, set3, set4;
    int start, end;
    int i;
    String hyphenPattern = "[:Hyphen:]";
    String dashPattern = "[:Dash:]";
    String lowerPattern = "[:Lowercase:]";
    String formatPattern = "[:Cf:]";
    String alphaPattern = "[:Alphabetic:]";
    /*
     * It used to be that UCD.html and its precursors said
     * "Those dashes used to mark connections between pieces of words,
     *  plus the Katakana middle dot."
     *
     * Unicode 4 changed 00AD Soft Hyphen to Cf and removed it from Dash
     * but not from Hyphen.
     * UTC 94 (2003mar) decided to leave it that way and to change UCD.html.
     * Therefore, do not show errors when testing the Hyphen property.
     */
    logln("Starting with Unicode 4, inconsistencies with [:Hyphen:] are\n" + "known to the UTC and not considered errors.\n");
    set1 = new UnicodeSet(hyphenPattern);
    set2 = new UnicodeSet(dashPattern);
    /* remove the Katakana middle dot(s) from set1 */
    set1.remove(0x30fb);
    /* halfwidth variant; also removed from set1 (the Hyphen set), not from the Dash set */
    set1.remove(0xff65);
    showAMinusB(set1, set2, "[:Hyphen:]", "[:Dash:]", false);
    /* check that Cf is neither Hyphen nor Dash nor Alphabetic */
    set3 = new UnicodeSet(formatPattern);
    set4 = new UnicodeSet(alphaPattern);
    showAIntersectB(set3, set1, "[:Cf:]", "[:Hyphen:]", false);
    showAIntersectB(set3, set2, "[:Cf:]", "[:Dash:]", true);
    showAIntersectB(set3, set4, "[:Cf:]", "[:Alphabetic:]", true);
    /*
     * Check that each lowercase character has "small" in its name
     * and not "capital".
     * There are some such characters, some of which seem odd.
     * Use the verbose flag to see these notices.
     */
    set1 = new UnicodeSet(lowerPattern);
    /* walk every code point range of the set; the bound keeps the index valid */
    int rangeCount = set1.getRangeCount();
    for (i = 0; i < rangeCount; ++i) {
        start = set1.getRangeStart(i);
        end = set1.getRangeEnd(i);
        while (start <= end) {
            String name = UCharacter.getName(start);
            /*
             * Note a character whose name lacks "SMALL" or contains "CAPITAL",
             * unless the name contains "SMALL CAPITAL".
             * A missing name cannot be judged and is skipped.
             */
            if (name != null && (name.indexOf("SMALL") < 0 || name.indexOf("CAPITAL") >= 0) && name.indexOf("SMALL CAPITAL") == -1) {
                logln("info: [:Lowercase:] contains U+" + hex(start) + " whose name does not suggest lowercase: " + name);
            }
            ++start;
        }
    }
    /*
     * Test for an example that unorm_getCanonStartSet() delivers
     * all characters that compose from the input one,
     * even in multiple steps.
     * For example, the set for "I" (0049) should contain both
     * I-diaeresis (00CF) and I-diaeresis-acute (1E2E).
     * In general, the set for the middle such character should be a subset
     * of the set for the first.
     */
    Normalizer2 norm2 = Normalizer2.getNFDInstance();
    set1 = new UnicodeSet();
    Norm2AllModes.getNFCInstance().impl.ensureCanonIterData().getCanonStartSet(0x49, set1);
    set2 = new UnicodeSet();
    /* enumerate all characters that are plausible to be latin letters */
    for (start = 0xa0; start < 0x2000; ++start) {
        String decomp = norm2.normalize(UTF16.valueOf(start));
        /* keep characters whose NFD form is longer than one unit and begins with U+0049 */
        if (decomp.length() > 1 && decomp.charAt(0) == 0x49) {
            set2.add(start);
        }
    }
    compareUSets(set1, set2, "[canon start set of 0049]", "[all c with canon decomp with 0049]", false);
}

Also used : Normalizer2(android.icu.text.Normalizer2) UnicodeSet(android.icu.text.UnicodeSet) Test(org.junit.Test)

Aggregations

Normalizer2 (android.icu.text.Normalizer2)18 Test (org.junit.Test)16 FilteredNormalizer2 (android.icu.text.FilteredNormalizer2)13 UnicodeSet (android.icu.text.UnicodeSet)7 UnicodeSetIterator (android.icu.text.UnicodeSetIterator)2 BufferedReader (java.io.BufferedReader)2 IOException (java.io.IOException)2 UnicodeMap (android.icu.dev.util.UnicodeMap)1 Norm2AllModes (android.icu.impl.Norm2AllModes)1 Normalizer2Impl (android.icu.impl.Normalizer2Impl)1 CanonicalIterator (android.icu.text.CanonicalIterator)1 ReplaceableString (android.icu.text.ReplaceableString)1 SpoofChecker (android.icu.text.SpoofChecker)1 Transliterator (android.icu.text.Transliterator)1 UTF16 (android.icu.text.UTF16)1 CaseInsensitiveString (android.icu.util.CaseInsensitiveString)1 ArrayList (java.util.ArrayList)1 Matcher (java.util.regex.Matcher)1