use of android.icu.text.UnicodeSet in project j2objc by google.
the class StringTokenizer method nextToken.
/**
 * Replaces this tokenizer's delimiter set with the characters of
 * <tt>delim</tt> and then returns the next token found after the
 * current position. The current position is moved past the returned
 * token, and the delimiter set installed here stays in effect for
 * subsequent calls.
 * <p>
 * A <tt>null</tt> or empty <tt>delim</tt> installs the shared empty
 * delimiter set instead of building a new one.
 *
 * @param delim the characters to use as the new delimiters.
 * @return the next token, located using the new delimiter set.
 * @exception NoSuchElementException if there are no more tokens in
 * this tokenizer's string.
 */
public String nextToken(String delim) {
    final boolean hasDelimiters = delim != null && delim.length() != 0;
    if (hasDelimiters) {
        // Build the set from the individual characters of delim.
        UnicodeSet replacement = new UnicodeSet();
        replacement.addAll(delim);
        m_delimiters_ = replacement;
    } else {
        m_delimiters_ = EMPTY_DELIMITER_;
    }
    return nextToken(m_delimiters_);
}
use of android.icu.text.UnicodeSet in project j2objc by google.
the class TransliteratorTest method TestCompoundFilter.
/**
 * Regression test for filter semantics on compound transliterators.
 *
 * Compound filter semantics were originally not implemented correctly:
 * each component filter f(i) was replaced by f'(i) = f(i) && g, where g
 * is the filter of the compound transliterator.
 *
 * From Mark:
 *
 * Suppose I have a transliterator X. Internally X is
 * "Greek-Latin; Latin-Cyrillic; Any-Lower". I use a filter [^A].
 *
 * The compound should convert all Greek characters (through Latin) to
 * Cyrillic, then lowercase the result. The filter should say "don't
 * touch 'A' in the original". But because an intermediate result
 * happens to go through "A", the Greek Alpha gets hung up.
 */
@Test
public void TestCompoundFilter() {
    Transliterator compound = Transliterator.getInstance(
            "Greek-Latin; Latin-Greek; Lower",
            Transliterator.FORWARD);
    UnicodeSet everythingExceptA = new UnicodeSet("[^A]");
    compound.setFilter(everythingExceptA);

    // The literal 'A' at index 1 is the only character that must come
    // through untouched; the rest is expected in lowercase Greek.
    expect(
            compound,
            CharsToUnicodeString("BA\\u039A\\u0391"),
            CharsToUnicodeString("\\u03b2A\\u03ba\\u03b1"));
}
use of android.icu.text.UnicodeSet in project j2objc by google.
the class TransliteratorTest method TestSourceTargetSet2.
/**
 * Compares the source and target sets reported by rule-based
 * transliterators against sets gathered empirically.
 *
 * The method first builds a large set of test strings
 * (<code>disorderedMarks</code>) from NFD decompositions, their canonical
 * iterator results, lead/trail combinations and every code point. For each
 * rule string it then creates a transliterator, transforms every test
 * string, collects what actually changed via <code>addSourceTarget</code>,
 * and asserts the collected sets against <code>getSourceSet()</code> and
 * <code>getTargetSet()</code> using <code>SetAssert.MISSING_OK</code>.
 */
@Test
public void TestSourceTargetSet2() {
Normalizer2 nfc = Normalizer2.getNFCInstance();
Normalizer2 nfd = Normalizer2.getNFDInstance();
// Disabled: NFKD-based collection of source/target sets, kept for reference.
// Normalizer2 nfkd = Normalizer2.getInstance(null, "nfkd", Mode.DECOMPOSE);
// UnicodeSet nfkdSource = new UnicodeSet();
// UnicodeSet nfkdTarget = new UnicodeSet();
// for (int i = 0; i <= 0x10FFFF; ++i) {
// if (nfkd.isInert(i)) {
// continue;
// }
// nfkdSource.add(i);
// String t = nfkd.getDecomposition(i);
// if (t != null) {
// nfkdTarget.addAll(t);
// } else {
// nfkdTarget.add(i);
// }
// }
// nfkdSource.freeze();
// nfkdTarget.freeze();
// logln("NFKD Source: " + nfkdSource.toPattern(false));
// logln("NFKD Target: " + nfkdTarget.toPattern(false));
// leadToTrail: first code point of a decomposition -> characters seen after it.
// leadToSources: first code point of a decomposition -> code points that decompose to it.
UnicodeMap<UnicodeSet> leadToTrail = new UnicodeMap();
UnicodeMap<UnicodeSet> leadToSources = new UnicodeMap();
// Every code point whose canonical combining class is not 0.
UnicodeSet nonStarters = new UnicodeSet("[:^ccc=0:]").freeze();
CanonicalIterator can = new CanonicalIterator("");
// Accumulates all test strings fed to the transliterators below.
UnicodeSet disorderedMarks = new UnicodeSet();
// Pass 1: walk every code point that has an NFD decomposition.
for (int i = 0; i <= 0x10FFFF; ++i) {
String s = nfd.getDecomposition(i);
if (s == null) {
continue;
}
// Add every string the canonical iterator yields for the decomposition.
can.setSource(s);
for (String t = can.next(); t != null; t = can.next()) {
disorderedMarks.add(t);
}
// if s has two code points, (or more), add the lead/trail information
int first = s.codePointAt(0);
int firstCount = Character.charCount(first);
// Decomposition is a single code point: nothing trails the lead.
if (s.length() == firstCount)
continue;
String trailString = s.substring(firstCount);
// add all the trail characters
// Only record the pair when the trail holds at least one non-starter.
if (!nonStarters.containsSome(trailString)) {
continue;
}
UnicodeSet trailSet = leadToTrail.get(first);
if (trailSet == null) {
leadToTrail.put(first, trailSet = new UnicodeSet());
}
// add remaining trails
trailSet.addAll(trailString);
// add the sources
UnicodeSet sourcesSet = leadToSources.get(first);
if (sourcesSet == null) {
leadToSources.put(first, sourcesSet = new UnicodeSet());
}
sourcesSet.add(i);
}
// Pass 2: for each lead, append every recorded trail to every recorded
// source and keep the canonical-iterator results that do not end with that trail.
for (Entry<String, UnicodeSet> x : leadToSources.entrySet()) {
String lead = x.getKey();
UnicodeSet sources = x.getValue();
UnicodeSet trailSet = leadToTrail.get(lead);
for (String source : sources) {
for (String trail : trailSet) {
can.setSource(source + trail);
for (String t = can.next(); t != null; t = can.next()) {
if (t.endsWith(trail))
continue;
disorderedMarks.add(t);
}
}
}
}
// Pass 3: pair every non-starter with U+0345 in front and U+0323 behind.
for (String s : nonStarters) {
disorderedMarks.add("\u0345" + s);
disorderedMarks.add(s + "\u0323");
// Logs "??" when NFC of U+01EC followed by s no longer starts with U+01EC.
String xx = nfc.normalize("\u01EC" + s);
if (!xx.startsWith("\u01EC")) {
logln("??");
}
}
// Disabled: earlier approach that derived test strings from NFKD/NFD decompositions.
// for (int i = 0; i <= 0x10FFFF; ++i) {
// String s = nfkd.getDecomposition(i);
// if (s != null) {
// disorderedMarks.add(s);
// disorderedMarks.add(nfc.normalize(s));
// addDerivedStrings(nfc, disorderedMarks, s);
// }
// s = nfd.getDecomposition(i);
// if (s != null) {
// disorderedMarks.add(s);
// }
// if (!nfc.isInert(i)) {
// if (i == 0x00C0) {
// logln("\u00C0");
// }
// can.setSource(s+"\u0334");
// for (String t = can.next(); t != null; t = can.next()) {
// addDerivedStrings(nfc, disorderedMarks, t);
// }
// can.setSource(s+"\u0345");
// for (String t = can.next(); t != null; t = can.next()) {
// addDerivedStrings(nfc, disorderedMarks, t);
// }
// can.setSource(s+"\u0323");
// for (String t = can.next(); t != null; t = can.next()) {
// addDerivedStrings(nfc, disorderedMarks, t);
// }
// }
// }
// The size is logged before the full code point range is added on the next line.
logln("Test cases: " + disorderedMarks.size());
// Add every code point as a test case, then freeze the set.
disorderedMarks.addAll(0, 0x10FFFF).freeze();
logln("isInert \u0104 " + nfc.isInert('\u0104'));
// Each entry is { rule text, extra set }; the second slot is null in every
// entry and is only referenced from the commented-out code further down.
Object[][] rules = { { ":: [:sc=COMMON:] any-name;", null }, { ":: [:Greek:] hex-any/C;", null }, { ":: [:Greek:] any-hex/C;", null }, { ":: [[:Mn:][:Me:]] remove;", null }, { ":: [[:Mn:][:Me:]] null;", null }, { ":: lower;", null }, { ":: upper;", null }, { ":: title;", null }, { ":: CaseFold;", null }, { ":: NFD;", null }, { ":: NFC;", null }, { ":: NFKD;", null }, { ":: NFKC;", null }, { ":: [[:Mn:][:Me:]] NFKD;", null }, { ":: Latin-Greek;", null }, { ":: [:Latin:] NFKD;", null }, { ":: NFKD;", null }, { ":: NFKD;\n" + ":: [[:Mn:][:Me:]] remove;\n" + ":: NFC;", null } };
for (Object[] rulex : rules) {
String rule = (String) rulex[0];
Transliterator trans = Transliterator.createFromRules("temp", rule, Transliterator.FORWARD);
// Sets reported by the transliterator itself.
UnicodeSet actualSource = trans.getSourceSet();
UnicodeSet actualTarget = trans.getTargetSet();
// Sets filled in from the transformations observed below.
UnicodeSet empiricalSource = new UnicodeSet();
UnicodeSet empiricalTarget = new UnicodeSet();
// Multi-line rules are flattened with tabs for the assertion message.
String ruleDisplay = rule.replace("\n", "\t\t");
UnicodeSet toTest = disorderedMarks;
// if (rulex[1] != null) {
// toTest = new UnicodeSet(disorderedMarks);
// toTest.addAll((UnicodeSet) rulex[1]);
// }
// NFD form of U+0104; it is only logged when encountered in the loop.
String test = nfd.normalize("\u0104");
boolean DEBUG = true;
// count is incremented below but never read in this method.
@SuppressWarnings("unused") int // for debugging
count = 0;
for (String s : toTest) {
if (s.equals(test)) {
logln(test);
}
String t = trans.transform(s);
// Only strings that the transliterator actually changed are examined.
if (!s.equals(t)) {
if (!isAtomic(s, t, trans)) {
// NOTE(review): the result of this second call is discarded; presumably a
// leftover hook for stepping into isAtomic under a debugger - confirm.
isAtomic(s, t, trans);
continue;
}
// }
if (DEBUG) {
if (!actualSource.containsAll(s)) {
count++;
}
if (!actualTarget.containsAll(t)) {
count++;
}
}
addSourceTarget(s, empiricalSource, t, empiricalTarget);
}
}
// NOTE(review): MISSING_OK presumably tolerates elements present in only one
// of the two sets; its exact direction is defined by SetAssert - confirm there.
assertEquals("getSource(" + ruleDisplay + ")", empiricalSource, actualSource, SetAssert.MISSING_OK);
assertEquals("getTarget(" + ruleDisplay + ")", empiricalTarget, actualTarget, SetAssert.MISSING_OK);
}
}
use of android.icu.text.UnicodeSet in project j2objc by google.
the class TransliteratorTest method checkRegistry.
/**
 * Checks that installing a filter on one instance obtained for
 * <code>id</code> does not show up on a second instance obtained for the
 * same id. Reports an error through <code>errln</code> if the second
 * instance's filter equals the one set on the first.
 *
 * @param id the transliterator ID to look up twice.
 */
private void checkRegistry(String id) {
    Transliterator firstInstance = Transliterator.getInstance(id);
    final UnicodeSet probeFilter = new UnicodeSet("[a-z5]");
    firstInstance.setFilter(probeFilter);

    // A fresh lookup must not carry the filter applied to the first instance.
    Transliterator secondInstance = Transliterator.getInstance(id);
    UnicodeFilter filterOnSecond = secondInstance.getFilter();
    if (!probeFilter.equals(filterOnSecond)) {
        return;
    }
    errln("Changed what is in registry for " + id);
}
use of android.icu.text.UnicodeSet in project j2objc by google.
the class TransliteratorTest method TestGurmukhiDevanagari.
/**
 * Tests how Devanagari-Gurmukhi maps U+0902 depending on the character
 * in front of it (Tippi versus Bindi).
 *
 * The rule under test says:
 *   U+0902 preceded by a vowel     becomes U+0A02
 *   U+0902 preceded by a consonant becomes U+0A70
 * In both cases the preceding character itself is expected to map to
 * the code point 0x0100 higher.
 */
@Test
public void TestGurmukhiDevanagari() {
    UnicodeSet vowel = new UnicodeSet("[\u0905-\u090A \u090F\u0910\u0913\u0914 \u093e-\u0942\u0947\u0948\u094B\u094C\u094D]");
    UnicodeSet nonVowel = new UnicodeSet("[\u0915-\u0928\u092A-\u0930]");
    UnicodeSetIterator vowelIter = new UnicodeSetIterator(vowel);
    UnicodeSetIterator consonantIter = new UnicodeSetIterator(nonVowel);
    Transliterator trans = Transliterator.getInstance("Devanagari-Gurmukhi");

    // Vowel followed by U+0902: the sign is expected to become U+0A02.
    while (vowelIter.next()) {
        char lead = (char) vowelIter.codepoint;
        char mappedLead = (char) (vowelIter.codepoint + 0x0100);
        String input = new String(new char[] { lead, '\u0902' });
        String wanted = new String(new char[] { mappedLead, '\u0A02' });
        expect(trans, input, wanted);
    }

    // Consonant followed by U+0902: the sign is expected to become U+0A70.
    while (consonantIter.next()) {
        char lead = (char) consonantIter.codepoint;
        char mappedLead = (char) (consonantIter.codepoint + 0x0100);
        String input = new String(new char[] { lead, '\u0902' });
        String wanted = new String(new char[] { mappedLead, '\u0A70' });
        expect(trans, input, wanted);
    }
}
Aggregations