use of android.icu.text.UnicodeSetIterator in project j2objc by google.
the class RoundTripTest method TestHan.
/**
 * Runs the "zh" exemplar characters through the Han-Latin and Latin-NumericPinyin
 * transliterators and checks the results.
 *
 * <p>The test asserts that no character matching {@code [:han:]} is left after Han-Latin,
 * that the NFC-transliterated NumericPinyin output contains no character matching
 * {@code [:mark:]}, and that the inverse of Latin-NumericPinyin leaves the Han-Latin output
 * unchanged. When that last comparison fails and execution continues past the assertion,
 * both intermediate strings are dumped to {@code numeric-pinyin.log.txt}.
 *
 * <p>A {@link MissingResourceException} raised anywhere in the test is reported through
 * {@code warnln} instead of failing the test.
 *
 * @throws UnsupportedEncodingException if the "UTF8" encoding is rejected when opening the log
 * @throws FileNotFoundException if the log file cannot be opened for writing
 */
@Test
public void TestHan() throws UnsupportedEncodingException, FileNotFoundException {
    try {
        UnicodeSet exemplars = LocaleData.getExemplarSet(new ULocale("zh"), 0);
        // create string with all chars
        StringBuffer b = new StringBuffer();
        for (UnicodeSetIterator it = new UnicodeSetIterator(exemplars); it.next(); ) {
            UTF16.append(b, it.codepoint);
        }
        String source = b.toString();
        // transform with Han translit
        Transliterator han = Transliterator.getInstance("Han-Latin");
        String target = han.transliterate(source);
        // now verify that there are no Han characters left
        UnicodeSet allHan = new UnicodeSet("[:han:]");
        assertFalse("No Han must be left after Han-Latin transliteration", allHan.containsSome(target));
        // check the pinyin translit
        Transliterator pn = Transliterator.getInstance("Latin-NumericPinyin");
        String target2 = pn.transliterate(target);
        // verify that there are no marks
        Transliterator nfc = Transliterator.getInstance("nfc");
        String nfced = nfc.transliterate(target2);
        UnicodeSet allMarks = new UnicodeSet("[:mark:]");
        assertFalse("NumericPinyin must contain no marks", allMarks.containsSome(nfced));
        // verify roundtrip
        // NOTE(review): the inverse is applied to `target`, not to `target2` (the
        // NumericPinyin form) -- confirm this is the intended round-trip input.
        Transliterator np = pn.getInverse();
        String target3 = np.transliterate(target);
        boolean roundtripOK = target3.equals(target);
        assertTrue("NumericPinyin must roundtrip", roundtripOK);
        if (!roundtripOK) {
            String filename = "numeric-pinyin.log.txt";
            PrintWriter out = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(filename), "UTF8"), 4 * 1024));
            try {
                errln("Creating log file " + new File(filename).getAbsoluteFile());
                out.println("Pinyin: " + target);
                out.println("Pinyin-Numeric-Pinyin: " + target2);
            } finally {
                // Always release the file handle (and flush what was buffered), even when
                // errln or a write aborts this block with an exception.
                out.close();
            }
        }
    } catch (MissingResourceException ex) {
        warnln("Could not load the locale data for fetching the exemplar characters.");
    }
}
use of android.icu.text.UnicodeSetIterator in project j2objc by google.
the class RoundTripTest method TestHangul2.
/**
 * Reduced variant of the test for doubles: many cases are skipped, but the ones that
 * should cause problems (if any do) are still covered.
 *
 * <p>Every element produced by iterating the set returned from
 * {@code getRepresentativeHangul()} is passed to {@code assertRoundTripTransform} together
 * with the Latin-Hangul transliterator and its inverse.
 */
@Test
public void TestHangul2() {
    Transliterator latinToHangul = Transliterator.getInstance("Latin-Hangul");
    Transliterator hangulToLatin = latinToHangul.getInverse();
    final UnicodeSet representativeHangul = getRepresentativeHangul();

    UnicodeSetIterator hangulIter = new UnicodeSetIterator(representativeHangul);
    while (hangulIter.next()) {
        String item = hangulIter.getString();
        assertRoundTripTransform("Transform", item, latinToHangul, hangulToLatin);
    }
}
use of android.icu.text.UnicodeSetIterator in project j2objc by google.
the class BasicTest method initSkippables.
/**
 * Populates the skippable sets stored at indexes {@code D}, {@code C}, {@code KD} and
 * {@code KC} of the given array.
 *
 * <p>Each set is first initialized from a property pattern. The {@code C} and {@code KC}
 * sets are then pruned: a character is removed from both when appending some
 * back-combining character ({@code [:NFC_QC=Maybe:]}) to it yields a string that is not
 * NFC-normalized.
 *
 * @param skipSets array of already-constructed sets; its elements are modified in place
 * @return the same {@code skipSets} array
 */
private static UnicodeSet[] initSkippables(UnicodeSet[] skipSets) {
    skipSets[D].applyPattern("[[:NFD_QC=Yes:]&[:ccc=0:]]", false);
    skipSets[C].applyPattern("[[:NFC_QC=Yes:]&[:ccc=0:]-[:HST=LV:]]", false);
    skipSets[KD].applyPattern("[[:NFKD_QC=Yes:]&[:ccc=0:]]", false);
    skipSets[KC].applyPattern("[[:NFKC_QC=Yes:]&[:ccc=0:]-[:HST=LV:]]", false);

    // The NFC and NFKC sets must lose every character that changes when a
    // back-combining character follows it. Start by collecting the back-combining
    // characters as flat (code point, combining class) pairs.
    UnicodeSet backCombining = new UnicodeSet("[:NFC_QC=Maybe:]");
    int backCount = backCombining.size();
    int[] backCharAndCcc = new int[backCount * 2];
    UnicodeSetIterator iter = new UnicodeSetIterator(backCombining);
    for (int slot = 0; slot < backCharAndCcc.length; slot += 2) {
        iter.next();
        int backChar = iter.codepoint;
        backCharAndCcc[slot] = backChar;
        backCharAndCcc[slot + 1] = UCharacter.getCombiningClass(backChar);
    }

    // Control codes, Han characters and Hangul LVT syllables do not combine forward, so
    // they need no checking. LV syllables were already excluded by the patterns above.
    UnicodeSet notInteresting = new UnicodeSet("[[:C:][:Unified_Ideograph:][:HST=LVT:]]");
    UnicodeSet unsure = (UnicodeSet) skipSets[C].clone();
    unsure.removeAll(notInteresting);
    // Debug aid: System.out.format("unsure.size()=%d\n", unsure.size());

    // Probe each remaining candidate with every relevant back-combining character.
    Normalizer2 nfc = Normalizer2.getNFCInstance();
    StringBuilder candidate = new StringBuilder();
    iter.reset(unsure);
    while (iter.next()) {
        int c = iter.codepoint;
        candidate.setLength(0);
        candidate.appendCodePoint(c);
        int baseLength = candidate.length();
        int tccc = UCharacter.getIntPropertyValue(c, UProperty.TRAIL_CANONICAL_COMBINING_CLASS);
        for (int slot = 0; slot < backCharAndCcc.length; slot += 2) {
            int cc2 = backCharAndCcc[slot + 1];
            // When c's decomposition ends in a character with a non-zero combining class,
            // c can only change by combining with a character whose class is non-zero too,
            // so pairs with cc2 == 0 are skipped in that case.
            if (tccc != 0 && cc2 == 0) {
                continue;
            }
            int c2 = backCharAndCcc[slot];
            candidate.appendCodePoint(c2);
            if (!nfc.isNormalized(candidate)) {
                // Debug aid:
                // System.out.format("remove U+%04x (tccc=%d) + U+%04x (cc=%d)\n", c, tccc, c2, cc2);
                skipSets[C].remove(c);
                skipSets[KC].remove(c);
                break;
            }
            // Drop the appended character again before trying the next one.
            candidate.setLength(baseLength);
        }
    }
    return skipSets;
}
use of android.icu.text.UnicodeSetIterator in project j2objc by google.
the class BasicTest method TestSerializedSet.
/**
 * Loads a fixed serialized set into a {@code USerializedSet}, copies all of its ranges
 * into a {@code UnicodeSet}, and then checks that {@code USerializedSet.contains} accepts
 * every code point of every range in that copy. Each miss is reported through
 * {@code errln}.
 */
@Test
public void TestSerializedSet() {
    USerializedSet sset = new USerializedSet();
    UnicodeSet set = new UnicodeSet();
    char[] serialized = {
        0x8007, // length
        3, // bmpLength
        0xc0, 0xfe, 0xfffc, 1, 9, 0x10, 0xfffc
    };
    sset.getSet(serialized, 0);

    // Gather all ranges into a single set so they can be walked contiguously.
    int[] range = new int[2];
    final int rangeCount = sset.countRanges();
    for (int index = 0; index < rangeCount; ++index) {
        sset.getRange(index, range);
        set.add(range[0], range[1]);
    }

    // Check every code point of every range; stop at the first string element.
    UnicodeSetIterator it = new UnicodeSetIterator(set);
    while (it.nextRange()) {
        if (it.codepoint == UnicodeSetIterator.IS_STRING) {
            break;
        }
        final int last = it.codepointEnd;
        for (int cp = it.codepoint; cp <= last; ++cp) {
            if (!sset.contains(cp)) {
                errln("USerializedSet.contains failed for " + Utility.hex(cp, 8));
            }
        }
    }
}
use of android.icu.text.UnicodeSetIterator in project j2objc by google.
the class CollectionUtilities method flatten.
/**
 * Flattens the strings of a Unicode set into their individual characters,
 * e.g. [abc{da}] => [abcd].
 *
 * <p>The argument is only rewritten when the iteration encounters at least one string
 * element; otherwise it is left untouched.
 *
 * @param exemplar1 the set to flatten; modified in place when it contains strings
 * @return {@code exemplar1} itself, for chaining
 */
public static UnicodeSet flatten(UnicodeSet exemplar1) {
    UnicodeSet flattened = new UnicodeSet();
    boolean sawString = false;

    UnicodeSetIterator ranges = new UnicodeSetIterator(exemplar1);
    while (ranges.nextRange()) {
        if (ranges.codepoint != UnicodeSetIterator.IS_STRING) {
            // Plain code point range: copy it over unchanged.
            flattened.add(ranges.codepoint, ranges.codepointEnd);
            continue;
        }
        // String element: add its characters instead of the string itself.
        flattened.addAll(ranges.string);
        sawString = true;
    }

    if (sawString) {
        exemplar1.set(flattened);
    }
    return exemplar1;
}
Aggregations