use of android.icu.text.CanonicalIterator in project j2objc by google.
the class TransliteratorTest method TestSourceTargetSet2.
/**
 * Compares the source/target sets reported by a transliterator with the sets observed
 * empirically by transforming a large pool of test strings.
 *
 * <p>The pool ({@code disorderedMarks}) is built from:
 * <ul>
 *   <li>every string the {@code CanonicalIterator} yields for each NFD decomposition;</li>
 *   <li>for decompositions whose tail contains a non-starter, the iterator output for
 *       {@code source + trail} combinations that do not end with that trail;</li>
 *   <li>U+0345 followed by each non-starter, and each non-starter followed by U+0323;</li>
 *   <li>finally, every code point U+0000..U+10FFFF.</li>
 * </ul>
 *
 * <p>For each rule, every pool string that the transliterator changes (and that the
 * {@code isAtomic} helper accepts) is passed to {@code addSourceTarget}, and the resulting
 * empirical sets are asserted against {@code getSourceSet()} / {@code getTargetSet()} with
 * {@code SetAssert.MISSING_OK}.
 */
@Test
public void TestSourceTargetSet2() {
    Normalizer2 nfc = Normalizer2.getNFCInstance();
    Normalizer2 nfd = Normalizer2.getNFDInstance();

    // NOTE: an earlier variant of this test also seeded the pool from NFKD decompositions;
    // that path is disabled and has been left out here.

    // Keyed by the lead code point of an NFD decomposition:
    //   leadToTrail   -> all code points seen in the tails after that lead
    //   leadToSources -> all code points whose decomposition starts with that lead
    UnicodeMap<UnicodeSet> leadToTrail = new UnicodeMap<UnicodeSet>();
    UnicodeMap<UnicodeSet> leadToSources = new UnicodeMap<UnicodeSet>();
    UnicodeSet nonStarters = new UnicodeSet("[:^ccc=0:]").freeze();
    CanonicalIterator can = new CanonicalIterator("");
    UnicodeSet disorderedMarks = new UnicodeSet();

    for (int i = 0; i <= 0x10FFFF; ++i) {
        String s = nfd.getDecomposition(i);
        if (s == null) {
            continue;
        }
        can.setSource(s);
        for (String t = can.next(); t != null; t = can.next()) {
            disorderedMarks.add(t);
        }
        // if s has two code points (or more), add the lead/trail information
        int first = s.codePointAt(0);
        int firstCount = Character.charCount(first);
        if (s.length() == firstCount) {
            continue;
        }
        String trailString = s.substring(firstCount);
        // only tails that contain at least one non-starter are recorded
        if (!nonStarters.containsSome(trailString)) {
            continue;
        }
        UnicodeSet trailSet = leadToTrail.get(first);
        if (trailSet == null) {
            trailSet = new UnicodeSet();
            leadToTrail.put(first, trailSet);
        }
        // add remaining trails
        trailSet.addAll(trailString);
        // add the sources
        UnicodeSet sourcesSet = leadToSources.get(first);
        if (sourcesSet == null) {
            sourcesSet = new UnicodeSet();
            leadToSources.put(first, sourcesSet);
        }
        sourcesSet.add(i);
    }

    // Combine every recorded source with every recorded trail of the same lead and keep
    // the iterator results in which the trail did not stay at the end.
    for (Entry<String, UnicodeSet> x : leadToSources.entrySet()) {
        String lead = x.getKey();
        UnicodeSet sources = x.getValue();
        // Both maps are filled for the same lead in the loop above, so this is non-null.
        UnicodeSet trailSet = leadToTrail.get(lead);
        for (String source : sources) {
            for (String trail : trailSet) {
                can.setSource(source + trail);
                for (String t = can.next(); t != null; t = can.next()) {
                    if (t.endsWith(trail)) {
                        continue;
                    }
                    disorderedMarks.add(t);
                }
            }
        }
    }

    for (String s : nonStarters) {
        disorderedMarks.add("\u0345" + s);
        disorderedMarks.add(s + "\u0323");
        String xx = nfc.normalize("\u01EC" + s);
        if (!xx.startsWith("\u01EC")) {
            logln("??");
        }
    }

    logln("Test cases: " + disorderedMarks.size());
    disorderedMarks.addAll(0, 0x10FFFF).freeze();
    logln("isInert \u0104 " + nfc.isInert('\u0104'));

    // Each entry is { rules, extra }. The second slot was meant for an optional UnicodeSet
    // of additional test strings; it is null everywhere and currently not consulted.
    Object[][] rules = {
        { ":: [:sc=COMMON:] any-name;", null },
        { ":: [:Greek:] hex-any/C;", null },
        { ":: [:Greek:] any-hex/C;", null },
        { ":: [[:Mn:][:Me:]] remove;", null },
        { ":: [[:Mn:][:Me:]] null;", null },
        { ":: lower;", null },
        { ":: upper;", null },
        { ":: title;", null },
        { ":: CaseFold;", null },
        { ":: NFD;", null },
        { ":: NFC;", null },
        { ":: NFKD;", null },
        { ":: NFKC;", null },
        { ":: [[:Mn:][:Me:]] NFKD;", null },
        { ":: Latin-Greek;", null },
        { ":: [:Latin:] NFKD;", null },
        { ":: NFKD;", null },
        { ":: NFKD;\n" + ":: [[:Mn:][:Me:]] remove;\n" + ":: NFC;", null }
    };

    // Loop-invariant marker string: logged whenever it comes up in the pool.
    final String test = nfd.normalize("\u0104");
    // Debugging aid: when true, mismatches against the reported sets are counted below.
    final boolean debug = true;

    for (Object[] rulex : rules) {
        String rule = (String) rulex[0];
        Transliterator trans = Transliterator.createFromRules("temp", rule, Transliterator.FORWARD);
        UnicodeSet actualSource = trans.getSourceSet();
        UnicodeSet actualTarget = trans.getTargetSet();
        UnicodeSet empiricalSource = new UnicodeSet();
        UnicodeSet empiricalTarget = new UnicodeSet();
        String ruleDisplay = rule.replace("\n", "\t\t");
        UnicodeSet toTest = disorderedMarks;

        @SuppressWarnings("unused") int // for debugging
        count = 0;
        for (String s : toTest) {
            if (s.equals(test)) {
                logln(test);
            }
            String t = trans.transform(s);
            if (s.equals(t)) {
                continue;
            }
            if (!isAtomic(s, t, trans)) {
                continue;
            }
            if (debug) {
                if (!actualSource.containsAll(s)) {
                    count++;
                }
                if (!actualTarget.containsAll(t)) {
                    count++;
                }
            }
            addSourceTarget(s, empiricalSource, t, empiricalTarget);
        }
        assertEquals("getSource(" + ruleDisplay + ")", empiricalSource, actualSource, SetAssert.MISSING_OK);
        assertEquals("getTarget(" + ruleDisplay + ")", empiricalTarget, actualTarget, SetAssert.MISSING_OK);
    }
}
use of android.icu.text.CanonicalIterator in project j2objc by google.
the class CollationBuilder method addOnlyClosure.
/**
 * Feeds the canonical-iterator variants of an NFD prefix/string pair to
 * {@code addIfDifferent}, threading {@code ce32} through every call.
 *
 * <p>Variants rejected by {@code ignorePrefix} / {@code ignoreString} are skipped, and so is
 * the combination that is identical to the all-NFD input itself.
 *
 * @param nfdPrefix    prefix in NFD form; may be empty
 * @param nfdString    string in NFD form
 * @param newCEs       passed through unchanged to {@code addIfDifferent}
 * @param newCEsLength passed through unchanged to {@code addIfDifferent}
 * @param ce32         starting value; replaced by the result of each {@code addIfDifferent} call
 * @return the {@code ce32} value after the last {@code addIfDifferent} call
 */
private int addOnlyClosure(CharSequence nfdPrefix, CharSequence nfdString, long[] newCEs, int newCEsLength, int ce32) {
    // TODO: make CanonicalIterator work with CharSequence, or maybe change arguments here to String
    if (nfdPrefix.length() == 0) {
        // No prefix: only the string has variants to walk.
        final String noPrefix = "";
        CanonicalIterator variants = new CanonicalIterator(nfdString.toString());
        for (String variant = variants.next(); variant != null; variant = variants.next()) {
            if (!ignoreString(variant) && !variant.contentEquals(nfdString)) {
                ce32 = addIfDifferent(noPrefix, variant, newCEs, newCEsLength, ce32);
            }
        }
        return ce32;
    }

    // Cross product of prefix variants and string variants.
    CanonicalIterator prefixVariants = new CanonicalIterator(nfdPrefix.toString());
    CanonicalIterator stringVariants = new CanonicalIterator(nfdString.toString());
    for (String prefixVariant = prefixVariants.next();
            prefixVariant != null;
            prefixVariant = prefixVariants.next()) {
        if (ignorePrefix(prefixVariant)) {
            continue;
        }
        boolean isInputPrefix = prefixVariant.contentEquals(nfdPrefix);
        for (String variant = stringVariants.next();
                variant != null;
                variant = stringVariants.next()) {
            // Skip ignorable strings and the exact (prefix, string) input pair.
            if (!ignoreString(variant) && !(isInputPrefix && variant.contentEquals(nfdString))) {
                ce32 = addIfDifferent(prefixVariant, variant, newCEs, newCEsLength, ce32);
            }
        }
        // Rewind so the next prefix variant sees all string variants again.
        stringVariants.reset();
    }
    return ce32;
}
use of android.icu.text.CanonicalIterator in project j2objc by google.
the class TestCanonicalIterator method TestExhaustive.
/**
 * Runs {@code characterTest} on every code point (alone and followed by U+0345), skipping
 * unassigned, private-use and surrogate code points.
 */
@Test
public void TestExhaustive() {
    int counter = 0;
    CanonicalIterator it = new CanonicalIterator("");
    // Inclusive upper bound so that the last code point, U+10FFFF, is also visited.
    for (int i = 0; i <= 0x10FFFF; ++i) {
        // skip characters we know don't have decomps
        // UCharacter.getType() reports ICU general-category values, so the comparison must
        // use the ICU constants (inherited by UCharacter). java.lang.Character numbers the
        // categories differently from 17 upward (ICU: PRIVATE_USE=17, SURROGATE=18;
        // Java: PRIVATE_USE=18, SURROGATE=19), which would filter the wrong categories.
        int type = UCharacter.getType(i);
        if (type == UCharacter.UNASSIGNED
                || type == UCharacter.PRIVATE_USE
                || type == UCharacter.SURROGATE) {
            continue;
        }
        // progress output every 5000 tested code points
        if ((++counter % 5000) == 0) {
            logln("Testing " + Utility.hex(i, 0));
        }
        String s = UTF16.valueOf(i);
        characterTest(s, i, it);
        characterTest(s + "\u0345", i, it);
    }
}
use of android.icu.text.CanonicalIterator in project j2objc by google.
the class TestCanonicalIterator method TestSpeed.
/**
 * Times repeated full iterations of a {@code CanonicalIterator} over a fixed source string
 * and logs the average milliseconds per pass. Does nothing unless verbose mode is on.
 *
 * @return 0 when not verbose; otherwise the summed length of every string produced, which
 *         exists only so the timed work has an observable result
 */
public int TestSpeed() {
    // skip unless verbose
    if (!isVerbose()) {
        return 0;
    }

    final String source = "\uAC01\u0345";
    CanonicalIterator it = new CanonicalIterator(source);
    final int iterations = 10000;

    // just to keep code from optimizing away.
    int x = 0;

    // A comparison run with a second iterator (SKIP_ZEROS = false) used to fill this in;
    // that run is disabled, so the value stays 0 and no ratio is logged.
    double slowDelta = 0;

    double start = System.currentTimeMillis();
    for (int pass = 0; pass < iterations; ++pass) {
        it.setSource(source);
        for (String item = it.next(); item != null; item = it.next()) {
            x += item.length();
        }
    }
    double end = System.currentTimeMillis();

    double fastDelta = (end - start) / iterations;
    String ratio = "";
    if (slowDelta != 0) {
        ratio = ", " + (fastDelta / slowDelta);
    }
    logln("Fast iteration: " + fastDelta + ratio);
    return x;
}
use of android.icu.text.CanonicalIterator in project j2objc by google.
the class TestCanonicalIterator method TestBasic.
/**
 * Basic checks for {@code CanonicalIterator}: the static {@code permute} helper, the set of
 * results produced for each {@code testArray} sample, {@code reset()}, and {@code getSource()}.
 */
@Test
public void TestBasic() {
    // The former "check build" section (getSafeStart / getStarts('a')) is not interesting
    // anymore as the data is already built beforehand, so it is not exercised here.

    // check permute
    // NOTE: we use a TreeSet below to sort the output, which is not guaranteed to be sorted!
    Set<String> results = new TreeSet<String>();
    CanonicalIterator.permute("ABC", false, results);
    expectEqual("Simple permutation ", "", collectionToString(results), "ABC, ACB, BAC, BCA, CAB, CBA");

    // try samples: testArray[i][0] is the input, testArray[i][1] the expected sorted output
    SortedSet<String> set = new TreeSet<String>();
    for (int i = 0; i < testArray.length; ++i) {
        CanonicalIterator it = new CanonicalIterator(testArray[i][0]);
        set.clear();
        String first = null;
        for (String result = it.next(); result != null; result = it.next()) {
            if (first == null) {
                // remember the first result so reset() can be verified below
                first = result;
            }
            // sort them
            set.add(result);
        }
        expectEqual(i + ": ", testArray[i][0], collectionToString(set), testArray[i][1]);

        // After reset() the iterator must start over with the same first result. Compare
        // null-safely so a broken reset() is reported via errln rather than as an NPE.
        it.reset();
        String afterReset = it.next();
        boolean restarted = (afterReset == null) ? (first == null) : afterReset.equals(first);
        if (!restarted) {
            errln("CanonicalIterator.reset() failed");
        }
        if (!it.getSource().equals(Normalizer.normalize(testArray[i][0], Normalizer.NFD))) {
            errln("CanonicalIterator.getSource() does not return NFD of input source");
        }
    }
}
Aggregations