use of android.icu.text.Normalizer2 in project j2objc by google.
the class BasicTest method initSkippables.
/**
 * Fills the given sets with the code points that a normalizer may skip for
 * each mode (indexed by D, C, KD, KC) and returns the same array.
 *
 * The NFC and NFKC sets are additionally pruned of every character that stops
 * being NFC-normalized when one of the NFC_QC=Maybe ("back-combining")
 * characters is appended to it.
 *
 * @param skipSets the sets to populate, modified in place
 * @return {@code skipSets}
 */
private static UnicodeSet[] initSkippables(UnicodeSet[] skipSets) {
    skipSets[D].applyPattern("[[:NFD_QC=Yes:]&[:ccc=0:]]", false);
    skipSets[C].applyPattern("[[:NFC_QC=Yes:]&[:ccc=0:]-[:HST=LV:]]", false);
    skipSets[KD].applyPattern("[[:NFKD_QC=Yes:]&[:ccc=0:]]", false);
    skipSets[KC].applyPattern("[[:NFKC_QC=Yes:]&[:ccc=0:]-[:HST=LV:]]", false);

    // Collect every back-combining character together with its combining class.
    UnicodeSet backCombining = new UnicodeSet("[:NFC_QC=Maybe:]");
    int backCount = backCombining.size();
    int[] backChars = new int[backCount];
    int[] backClasses = new int[backCount];
    UnicodeSetIterator backIter = new UnicodeSetIterator(backCombining);
    for (int k = 0; k < backCount; ++k) {
        backIter.next();
        int backChar = backIter.codepoint;
        backChars[k] = backChar;
        backClasses[k] = UCharacter.getCombiningClass(backChar);
    }

    // Control codes, Han characters and Hangul LVT syllables do not combine
    // forward, so they need no checking. LV syllables were removed above.
    UnicodeSet notInteresting = new UnicodeSet("[[:C:][:Unified_Ideograph:][:HST=LVT:]]");
    UnicodeSet candidates = (UnicodeSet) skipSets[C].clone();
    candidates.removeAll(notInteresting);

    // For each remaining candidate, append every back-combining character in
    // turn and drop the candidate as soon as one of them changes it.
    Normalizer2 nfc = Normalizer2.getNFCInstance();
    StringBuilder text = new StringBuilder();
    UnicodeSetIterator candidateIter = new UnicodeSetIterator(candidates);
    while (candidateIter.next()) {
        int candidate = candidateIter.codepoint;
        text.setLength(0);
        text.appendCodePoint(candidate);
        int prefixLength = text.length();
        int trailCc = UCharacter.getIntPropertyValue(
                candidate, UProperty.TRAIL_CANONICAL_COMBINING_CLASS);
        for (int k = 0; k < backCount; ++k) {
            int backCc = backClasses[k];
            // If the candidate's decomposition ends with a non-zero combining
            // class, only a character with a non-zero class can change it.
            if (trailCc != 0 && backCc == 0) {
                continue;
            }
            text.appendCodePoint(backChars[k]);
            if (!nfc.isNormalized(text)) {
                skipSets[C].remove(candidate);
                skipSets[KC].remove(candidate);
                break;
            }
            // Strip the appended character again before trying the next one.
            text.setLength(prefixLength);
        }
    }
    return skipSets;
}
use of android.icu.text.Normalizer2 in project j2objc by google.
the class UCharacterTest method TestUnicodeData.
/**
 * Tests for the character types, direction.<br>
 * This method reads in UnicodeData.txt file for testing purposes. A
 * default path is provided relative to the src path, however the user
 * could set a system property to change the directory path.<br>
 * e.g. java -DUnicodeData="data_directory_path"
 * android.icu.dev.test.lang.UCharacterTest
 *
 * For every data line it compares general category, combining class, bidi
 * direction, decomposition type and mapping, ISO comment and simple case
 * mappings against the UCharacter and Normalizer2 APIs. Afterwards it runs
 * block, noncharacter and private-use sanity checks that need no data file.
 */
@Test
public void TestUnicodeData() {
    // this is the 2 char category types used in the UnicodeData file
    final String TYPE = "LuLlLtLmLoMnMeMcNdNlNoZsZlZpCcCfCoCsPdPsPePcPoSmScSkSoPiPf";
    // directionality types used in the UnicodeData file,
    // padded by spaces to make each type size 4 so that (index >> 2) is the
    // numeric direction value. The padding is significant: do not collapse it.
    final String DIR = "L   " + "R   " + "EN  " + "ES  " + "ET  " + "AN  " + "CS  " + "B   "
            + "S   " + "WS  " + "ON  " + "LRE " + "LRO " + "AL  " + "RLE " + "RLO "
            + "PDF " + "NSM " + "BN  " + "FSI " + "LRI " + "RLI " + "PDI ";
    Normalizer2 nfc = Normalizer2.getNFCInstance();
    Normalizer2 nfkc = Normalizer2.getNFKCInstance();
    BufferedReader input = null;
    try {
        input = TestUtil.getDataReader("unicode/UnicodeData.txt");
        int numErrors = 0;
        for (; ; ) {
            String s = input.readLine();
            if (s == null) {
                break;
            }
            if (s.length() < 4 || s.startsWith("#")) {
                continue;
            }
            String[] fields = s.split(";", -1);
            assert (fields.length == 15) : "Number of fields is " + fields.length + ": " + s;
            int ch = Integer.parseInt(fields[0], 16);
            // testing the general category
            // Each category abbreviation is 2 chars wide; 0 is reserved for
            // "not found", hence the +1 on the halved index.
            int type = TYPE.indexOf(fields[2]);
            if (type < 0) {
                type = 0;
            } else {
                type = (type >> 1) + 1;
            }
            if (UCharacter.getType(ch) != type) {
                errln("FAIL \\u" + hex(ch) + " expected type " + type);
                break;
            }
            if (UCharacter.getIntPropertyValue(ch, UProperty.GENERAL_CATEGORY_MASK) != (1 << type)) {
                errln("error: getIntPropertyValue(\\u" + Integer.toHexString(ch) + ", UProperty.GENERAL_CATEGORY_MASK) != " + "getMask(getType(ch))");
            }
            // testing combining class
            int cc = Integer.parseInt(fields[3]);
            if (UCharacter.getCombiningClass(ch) != cc) {
                errln("FAIL \\u" + hex(ch) + " expected combining " + "class " + cc);
                break;
            }
            if (nfkc.getCombiningClass(ch) != cc) {
                errln("FAIL \\u" + hex(ch) + " expected NFKC combining " + "class " + cc);
                break;
            }
            // testing the direction
            String d = fields[4];
            if (d.length() == 1) {
                // Pad single-letter classes to the full slot width so that
                // e.g. "S" cannot match inside "ES" or "CS".
                d = d + "   ";
            }
            int dir = DIR.indexOf(d) >> 2;
            if (UCharacter.getDirection(ch) != dir) {
                errln("FAIL \\u" + hex(ch) + " expected direction " + dir + " but got " + UCharacter.getDirection(ch));
                break;
            }
            byte bdir = (byte) dir;
            if (UCharacter.getDirectionality(ch) != bdir) {
                errln("FAIL \\u" + hex(ch) + " expected directionality " + bdir + " but got " + UCharacter.getDirectionality(ch));
                break;
            }
            /* get Decomposition_Type & Decomposition_Mapping, field 5 */
            int dt;
            if (fields[5].length() == 0) {
                /* no decomposition, except UnicodeData.txt omits Hangul syllable decompositions */
                if (ch == 0xac00 || ch == 0xd7a3) {
                    dt = UCharacter.DecompositionType.CANONICAL;
                } else {
                    dt = UCharacter.DecompositionType.NONE;
                }
            } else {
                d = fields[5];
                // Stays -1 if a "<type" tag is opened but never closed.
                dt = -1;
                if (d.charAt(0) == '<') {
                    int end = d.indexOf('>', 1);
                    if (end >= 0) {
                        dt = UCharacter.getPropertyValueEnum(UProperty.DECOMPOSITION_TYPE, d.substring(1, end));
                        // skip spaces
                        while (d.charAt(++end) == ' ') {
                        }
                        d = d.substring(end);
                    }
                } else {
                    dt = UCharacter.DecompositionType.CANONICAL;
                }
            }
            String dm;
            if (dt > UCharacter.DecompositionType.NONE) {
                if (ch == 0xac00) {
                    dm = "\u1100\u1161";
                } else if (ch == 0xd7a3) {
                    dm = "\ud788\u11c2";
                } else {
                    String[] dmChars = d.split(" +");
                    StringBuilder dmb = new StringBuilder(dmChars.length);
                    for (String dmc : dmChars) {
                        dmb.appendCodePoint(Integer.parseInt(dmc, 16));
                    }
                    dm = dmb.toString();
                }
            } else {
                dm = null;
            }
            if (dt < 0) {
                errln(String.format("error in UnicodeData.txt: syntax error in U+%04x decomposition field", ch));
                return;
            }
            int i = UCharacter.getIntPropertyValue(ch, UProperty.DECOMPOSITION_TYPE);
            assertEquals(String.format("error: UCharacter.getIntPropertyValue(U+%04x, UProperty.DECOMPOSITION_TYPE) is wrong", ch), dt, i);
            /* Expect Decomposition_Mapping=nfkc.getRawDecomposition(c). */
            String mapping = nfkc.getRawDecomposition(ch);
            assertEquals(String.format("error: nfkc.getRawDecomposition(U+%04x) is wrong", ch), dm, mapping);
            /* For canonical decompositions only, expect Decomposition_Mapping=nfc.getRawDecomposition(c). */
            if (dt != UCharacter.DecompositionType.CANONICAL) {
                dm = null;
            }
            mapping = nfc.getRawDecomposition(ch);
            assertEquals(String.format("error: nfc.getRawDecomposition(U+%04x) is wrong", ch), dm, mapping);
            /* recompose */
            if (dt == UCharacter.DecompositionType.CANONICAL && !UCharacter.hasBinaryProperty(ch, UProperty.FULL_COMPOSITION_EXCLUSION)) {
                int a = dm.codePointAt(0);
                int b = dm.codePointBefore(dm.length());
                int composite = nfc.composePair(a, b);
                assertEquals(String.format("error: nfc U+%04X decomposes to U+%04X+U+%04X " + "but does not compose back (instead U+%04X)", ch, a, b, composite), ch, composite);
                /*
                 * Note: NFKC has fewer round-trip mappings than NFC,
                 * so we can't just test nfkc.composePair(a, b) here without further data.
                 */
            }
            // testing iso comment
            try {
                String isocomment = fields[11];
                String comment = UCharacter.getISOComment(ch);
                if (comment == null) {
                    comment = "";
                }
                if (!comment.equals(isocomment)) {
                    errln("FAIL \\u" + hex(ch) + " expected iso comment " + isocomment);
                    break;
                }
            } catch (Exception e) {
                // Only a failure mentioning unames.icu is tolerated and counted.
                // getMessage() may be null; in that case rethrow the original
                // exception instead of masking it with a NullPointerException.
                String message = e.getMessage();
                if (message != null && message.indexOf("unames.icu") >= 0) {
                    numErrors++;
                } else {
                    throw e;
                }
            }
            // Simple case mappings: an empty field means the character maps to itself.
            String upper = fields[12];
            int tempchar = ch;
            if (upper.length() > 0) {
                tempchar = Integer.parseInt(upper, 16);
            }
            int resultCp = UCharacter.toUpperCase(ch);
            if (resultCp != tempchar) {
                errln("FAIL \\u" + Utility.hex(ch, 4) + " expected uppercase \\u" + Utility.hex(tempchar, 4) + " but got \\u" + Utility.hex(resultCp, 4));
                break;
            }
            String lower = fields[13];
            tempchar = ch;
            if (lower.length() > 0) {
                tempchar = Integer.parseInt(lower, 16);
            }
            if (UCharacter.toLowerCase(ch) != tempchar) {
                errln("FAIL \\u" + Utility.hex(ch, 4) + " expected lowercase \\u" + Utility.hex(tempchar, 4));
                break;
            }
            String title = fields[14];
            tempchar = ch;
            if (title.length() > 0) {
                tempchar = Integer.parseInt(title, 16);
            }
            if (UCharacter.toTitleCase(ch) != tempchar) {
                errln("FAIL \\u" + Utility.hex(ch, 4) + " expected titlecase \\u" + Utility.hex(tempchar, 4));
                break;
            }
        }
        if (numErrors > 0) {
            warnln("Could not find unames.icu");
        }
    } catch (Exception e) {
        // NOTE(review): exceptions are only printed here, so the test does not
        // fail when the data file cannot be read or parsed; confirm this is intended.
        e.printStackTrace();
    } finally {
        if (input != null) {
            try {
                input.close();
            } catch (IOException ignored) {
            }
        }
    }
    if (UCharacter.UnicodeBlock.of(0x0041) != UCharacter.UnicodeBlock.BASIC_LATIN || UCharacter.getIntPropertyValue(0x41, UProperty.BLOCK) != UCharacter.UnicodeBlock.BASIC_LATIN.getID()) {
        errln("UCharacter.UnicodeBlock.of(\\u0041) property failed! " + "Expected : " + UCharacter.UnicodeBlock.BASIC_LATIN.getID() + " got " + UCharacter.UnicodeBlock.of(0x0041));
    }
    // sanity check on repeated properties
    // Visits the last two code points (xxFFFE, xxFFFF) of every plane.
    for (int ch = 0xfffe; ch <= 0x10ffff; ) {
        int type = UCharacter.getType(ch);
        if (UCharacter.getIntPropertyValue(ch, UProperty.GENERAL_CATEGORY_MASK) != (1 << type)) {
            errln("error: UCharacter.getIntPropertyValue(\\u" + Integer.toHexString(ch) + ", UProperty.GENERAL_CATEGORY_MASK) != " + "getMask(getType())");
        }
        if (type != UCharacterCategory.UNASSIGNED) {
            errln("error: UCharacter.getType(\\u" + Utility.hex(ch, 4) + " != UCharacterCategory.UNASSIGNED (returns " + UCharacterCategory.toString(UCharacter.getType(ch)) + ")");
        }
        if ((ch & 0xffff) == 0xfffe) {
            ++ch;
        } else {
            ch += 0xffff;
        }
    }
    // test that PUA is not "unassigned"
    // Walks E000..F8FF, then F0000..FFFFD, then 100000..10FFFD.
    for (int ch = 0xe000; ch <= 0x10fffd; ) {
        int type = UCharacter.getType(ch);
        if (UCharacter.getIntPropertyValue(ch, UProperty.GENERAL_CATEGORY_MASK) != (1 << type)) {
            errln("error: UCharacter.getIntPropertyValue(\\u" + Integer.toHexString(ch) + ", UProperty.GENERAL_CATEGORY_MASK) != " + "getMask(getType())");
        }
        if (type == UCharacterCategory.UNASSIGNED) {
            errln("error: UCharacter.getType(\\u" + Utility.hex(ch, 4) + ") == UCharacterCategory.UNASSIGNED");
        } else if (type != UCharacterCategory.PRIVATE_USE) {
            logln("PUA override: UCharacter.getType(\\u" + Utility.hex(ch, 4) + ")=" + type);
        }
        if (ch == 0xf8ff) {
            ch = 0xf0000;
        } else if (ch == 0xffffd) {
            ch = 0x100000;
        } else {
            ++ch;
        }
    }
}
use of android.icu.text.Normalizer2 in project j2objc by google.
the class UCharacterTest method TestConsistency.
/*
 * Various tests for consistency of UCD data and API behavior:
 * Hyphen vs. Dash, Cf vs. Hyphen/Dash/Alphabetic, the names of Lowercase
 * characters, and the canonical start set of U+0049.
 */
@Test
public void TestConsistency() {
    UnicodeSet set1, set2, set3, set4;
    int start, end;
    int i;
    String hyphenPattern = "[:Hyphen:]";
    String dashPattern = "[:Dash:]";
    String lowerPattern = "[:Lowercase:]";
    String formatPattern = "[:Cf:]";
    String alphaPattern = "[:Alphabetic:]";
    /*
     * It used to be that UCD.html and its precursors said
     * "Those dashes used to mark connections between pieces of words,
     * plus the Katakana middle dot."
     *
     * Unicode 4 changed 00AD Soft Hyphen to Cf and removed it from Dash
     * but not from Hyphen.
     * UTC 94 (2003mar) decided to leave it that way and to change UCD.html.
     * Therefore, do not show errors when testing the Hyphen property.
     */
    logln("Starting with Unicode 4, inconsistencies with [:Hyphen:] are\n" + "known to the UTC and not considered errors.\n");
    set1 = new UnicodeSet(hyphenPattern);
    set2 = new UnicodeSet(dashPattern);
    /* remove the Katakana middle dot(s) from set1 */
    set1.remove(0x30fb);
    /* halfwidth variant; it belongs to the Hyphen set (set1) as well */
    set1.remove(0xff65);
    showAMinusB(set1, set2, "[:Hyphen:]", "[:Dash:]", false);
    /* check that Cf is neither Hyphen nor Dash nor Alphabetic */
    set3 = new UnicodeSet(formatPattern);
    set4 = new UnicodeSet(alphaPattern);
    showAIntersectB(set3, set1, "[:Cf:]", "[:Hyphen:]", false);
    showAIntersectB(set3, set2, "[:Cf:]", "[:Dash:]", true);
    showAIntersectB(set3, set4, "[:Cf:]", "[:Alphabetic:]", true);
    /*
     * Check that each lowercase character has "small" in its name
     * and not "capital".
     * There are some such characters, some of which seem odd.
     * Use the verbose flag to see these notices.
     */
    set1 = new UnicodeSet(lowerPattern);
    int rangeCount = set1.getRangeCount();
    for (i = 0; i < rangeCount; ++i) {
        start = set1.getRangeStart(i);
        end = set1.getRangeEnd(i);
        while (start <= end) {
            String name = UCharacter.getName(start);
            // A name is suspicious when it lacks "SMALL" or contains "CAPITAL",
            // unless it is a "SMALL CAPITAL" letter. Skip characters for which
            // no name is available.
            if (name != null
                    && (name.indexOf("SMALL") < 0 || name.indexOf("CAPITAL") >= 0)
                    && name.indexOf("SMALL CAPITAL") == -1) {
                logln("info: [:Lowercase:] contains U+" + hex(start) + " whose name does not suggest lowercase: " + name);
            }
            ++start;
        }
    }
    /*
     * Test for an example that unorm_getCanonStartSet() delivers
     * all characters that compose from the input one,
     * even in multiple steps.
     * For example, the set for "I" (0049) should contain both
     * I-diaeresis (00CF) and I-diaeresis-acute (1E2E).
     * In general, the set for the middle such character should be a subset
     * of the set for the first.
     */
    Normalizer2 norm2 = Normalizer2.getNFDInstance();
    set1 = new UnicodeSet();
    Norm2AllModes.getNFCInstance().impl.ensureCanonIterData().getCanonStartSet(0x49, set1);
    set2 = new UnicodeSet();
    /* enumerate all characters that are plausible to be latin letters */
    for (start = 0xa0; start < 0x2000; ++start) {
        String decomp = norm2.normalize(UTF16.valueOf(start));
        if (decomp.length() > 1 && decomp.charAt(0) == 0x49) {
            set2.add(start);
        }
    }
    compareUSets(set1, set2, "[canon start set of 0049]", "[all c with canon decomp with 0049]", false);
}
Aggregations