use of android.icu.text.UnicodeSet in project j2objc by google.
the class CollationBuilder method addTailComposites.
/**
 * Adds closure mappings for composites whose canonical decomposition begins
 * with the last starter (combining class 0 code point) of {@code nfdString}.
 *
 * <p>Returns without doing anything when the string contains no starter, when
 * the last starter is a Hangul Jamo L, or when the normalizer reports no
 * canonical start set for that starter.
 *
 * @param nfdPrefix prefix handed unchanged to the CE lookup and to the add helpers
 * @param nfdString string whose trailing starter-plus-marks sequence is examined
 */
private void addTailComposites(CharSequence nfdPrefix, CharSequence nfdString) {
    // Walk backwards over trailing non-starters until a starter is found.
    int tailStart = nfdString.length();
    int lastStarter = 0;
    boolean starterFound = false;
    while (!starterFound && tailStart > 0) {
        lastStarter = Character.codePointBefore(nfdString, tailStart);
        if (nfd.getCombiningClass(lastStarter) == 0) {
            starterFound = true;
        } else {
            tailStart -= Character.charCount(lastStarter);
        }
    }
    if (!starterFound) {
        // The string has no starter at all.
        return;
    }

    // Hangul syllables are decomposed on the fly, so they get no closure.
    if (Hangul.isJamoL(lastStarter)) {
        return;
    }

    // Collect the composites whose decomposition starts with lastStarter.
    // Note: Normalizer2Impl does not currently return start sets for
    // NFC_QC=Maybe characters; more equivalent mappings might be found if it did.
    UnicodeSet startSet = new UnicodeSet();
    boolean hasStartSet = nfcImpl.getCanonStartSet(lastStarter, startSet);
    if (!hasStartSet) {
        return;
    }

    StringBuilder mergedNFD = new StringBuilder();
    StringBuilder mergedString = new StringBuilder();
    long[] ces = new long[Collation.MAX_EXPANSION_LENGTH];
    UnicodeSetIterator compositeIter = new UnicodeSetIterator(startSet);
    while (compositeIter.next()) {
        assert (compositeIter.codepoint != UnicodeSetIterator.IS_STRING);
        int composite = compositeIter.codepoint;
        String decomp = nfd.getDecomposition(composite);
        boolean merged = mergeCompositeIntoString(
                nfdString, tailStart, composite, decomp, mergedNFD, mergedString);
        if (!merged) {
            continue;
        }

        int ceCount = dataBuilder.getCEs(nfdPrefix, mergedNFD, ces, 0);
        if (ceCount > Collation.MAX_EXPANSION_LENGTH) {
            // Mappings that cannot be stored are skipped.
            continue;
        }

        // Note: the CEs fetched above may not actually use the mapping whose
        // tail composites are being added, so some of the mappings added here
        // may be unnecessary.
        // Example: when adding tail composites for ae^ (^=combining circumflex),
        // UCA discontiguous-contraction matching finds no match for ae_^
        // (_=any combining diacritic below) *unless* a contraction mapping for
        // ae also exists. Without an ae contraction, the ae^ mapping is ignored
        // while fetching the CEs for ae_^.
        // TODO: Try to detect this effectively.
        // (Alternatively, print a warning when prefix contractions are missing.)

        // The NFD strings need no explicit mapping of their own: it is fine if
        // NFD input collates this way through a sequence of mappings. That also
        // saves a little space and may reduce the set of characters with
        // contractions.
        int ce32 = addIfDifferent(
                nfdPrefix, mergedString, ces, ceCount, Collation.UNASSIGNED_CE32);
        if (ce32 != Collation.UNASSIGNED_CE32) {
            // The mapping differed and was added; close over it as well.
            addOnlyClosure(nfdPrefix, mergedNFD, ces, ceCount, ce32);
        }
    }
}
use of android.icu.text.UnicodeSet in project j2objc by google.
the class AlphabeticIndexTest method TestSchSt.
/**
 * Test labels with multiple primary weights: adds "Æ", "Sch*" and "St*" to a
 * German index and checks bucket counts, bucket indexes and bucket labels on
 * both the mutable index and its immutable snapshot.
 */
@Test
public void TestSchSt() {
    AlphabeticIndex index = new AlphabeticIndex(ULocale.GERMAN);
    index.addLabels(new UnicodeSet("[Æ{Sch*}{St*}]"));
    // ... A Æ B-R S Sch St T-Z ...
    ImmutableIndex immIndex = index.buildImmutableIndex();
    assertEquals("getBucketCount()", 31, index.getBucketCount());
    assertEquals("immutable getBucketCount()", 31, immIndex.getBucketCount());

    // Each row: name, expected bucket index, expected bucket label.
    String[][] testCases = new String[][] {
        { "Adelbert", "1", "A" },
        { "Afrika", "1", "A" },
        { "Æsculap", "2", "Æ" },
        { "Aesthet", "2", "Æ" },
        { "Berlin", "3", "B" },
        { "Rilke", "19", "R" },
        { "Sacher", "20", "S" },
        { "Seiler", "20", "S" },
        { "Sultan", "20", "S" },
        { "Schiller", "21", "Sch" },
        { "Steiff", "22", "St" },
        { "Thomas", "23", "T" }
    };
    List<String> labels = index.getBucketLabels();
    for (String[] row : testCases) {
        String name = row[0];
        int expectedIndex = Integer.parseInt(row[1]);
        String expectedLabel = row[2];

        String indexMsg = "getBucketIndex(" + name + ")";
        assertEquals(indexMsg, expectedIndex, index.getBucketIndex(name));
        assertEquals("immutable " + indexMsg, expectedIndex, immIndex.getBucketIndex(name));

        String labelMsg = "bucket label (" + name + ")";
        assertEquals(labelMsg, expectedLabel, labels.get(index.getBucketIndex(name)));
        assertEquals("immutable " + labelMsg, expectedLabel,
                immIndex.getBucket(expectedIndex).getLabel());
    }
}
use of android.icu.text.UnicodeSet in project j2objc by google.
the class AlphabeticIndexTest method firstStringsInScript.
/**
 * Returns a collection of all the "First" characters of scripts, according to the collation.
 *
 * <p>Candidates come from {@code TO_TRY} and, where available, from the
 * collator's contractions and expansions. For each script code, the candidate
 * that the collator sorts earliest is kept.
 *
 * @param ruleBasedCollator collator used for every comparison and as the
 *        source of contractions and expansions
 * @return the non-null per-script winners, in script-code order
 */
private static Collection<String> firstStringsInScript(RuleBasedCollator ruleBasedCollator) {
    String[] firstByScript = new String[UScript.CODE_LIMIT];
    for (String candidate : TO_TRY) {
        // Only "real" script characters are wanted, not symbols,
        // so anything sorting before "a" is skipped.
        if (ruleBasedCollator.compare(candidate, "a") >= 0) {
            int script = UScript.getScript(candidate.codePointAt(0));
            String best = firstByScript[script];
            if (best == null || ruleBasedCollator.compare(candidate, best) < 0) {
                firstByScript[script] = candidate;
            }
        }
    }

    try {
        UnicodeSet extras = new UnicodeSet();
        UnicodeSet expansions = new UnicodeSet();
        ruleBasedCollator.getContractionsAndExpansions(extras, expansions, true);
        extras.addAll(expansions).removeAll(TO_TRY);
        if (extras.size() != 0) {
            Normalizer2 normalizer = Normalizer2.getNFKCInstance();
            for (String candidate : extras) {
                // Keep only NFKC-normalized strings that sort after "9".
                boolean eligible = normalizer.isNormalized(candidate)
                        && ruleBasedCollator.compare(candidate, "9") > 0;
                if (!eligible) {
                    continue;
                }
                int script = getFirstRealScript(candidate);
                boolean unknownScript = script == UScript.UNKNOWN;
                if (unknownScript && !isUnassignedBoundary(candidate)) {
                    continue;
                }
                String best = firstByScript[script];
                if (best == null || ruleBasedCollator.compare(candidate, best) < 0) {
                    firstByScript[script] = candidate;
                }
            }
        }
    } catch (Exception ignored) {
        // why have a checked exception???
        // Nothing is done on failure; whatever was collected so far is returned.
    }

    // TODO: We should not test that we get the same strings, but that we
    // get strings that sort primary-equal to those from the implementation.
    Collection<String> result = new ArrayList<String>();
    for (String first : firstByScript) {
        if (first != null) {
            result.add(first);
        }
    }
    return result;
}
use of android.icu.text.UnicodeSet in project j2objc by google.
the class AlphabeticIndexTest method TestInflow.
/**
 * Checks how many inflow buckets appear for various label combinations.
 *
 * <p>Each test row is: the expected number of INFLOW buckets, the locale used
 * to construct the index, then zero or more extra label sources (a
 * {@code ULocale} or a {@code UnicodeSet}) added in order. Every index is also
 * expected to contain exactly one UNDERFLOW and one OVERFLOW bucket.
 */
@Test
public void TestInflow() {
    Object[][] tests = { { 0, ULocale.ENGLISH }, { 0, ULocale.ENGLISH, new ULocale("el") }, { 1, ULocale.ENGLISH, new ULocale("ru") }, { 0, ULocale.ENGLISH, new ULocale("el"), new UnicodeSet("[\u2C80]"), new ULocale("ru") }, { 0, ULocale.ENGLISH }, { 2, ULocale.ENGLISH, new ULocale("ru"), ULocale.JAPANESE } };
    for (Object[] test : tests) {
        int expected = (Integer) test[0];
        // Explicit type arguments instead of raw types, so the assignment is checked.
        AlphabeticIndex<Double> alphabeticIndex = new AlphabeticIndex<Double>((ULocale) test[1]);
        for (int i = 2; i < test.length; ++i) {
            if (test[i] instanceof ULocale) {
                alphabeticIndex.addLabels((ULocale) test[i]);
            } else {
                alphabeticIndex.addLabels((UnicodeSet) test[i]);
            }
        }
        // Tally the buckets by label type.
        Counter<AlphabeticIndex.Bucket.LabelType> counter = new Counter<AlphabeticIndex.Bucket.LabelType>();
        for (Bucket<Double> bucket : alphabeticIndex) {
            LabelType labelType = bucket.getLabelType();
            counter.add(labelType, 1);
        }
        String printList = Arrays.asList(test).toString();
        assertEquals(LabelType.UNDERFLOW + "\t" + printList, 1, counter.get(LabelType.UNDERFLOW));
        assertEquals(LabelType.INFLOW + "\t" + printList, expected, counter.get(LabelType.INFLOW));
        if (expected != counter.get(LabelType.INFLOW)) {
            // for debugging
            // NOTE(review): the index is rebuilt here, presumably so that construction
            // can be stepped through in a debugger; the buckets logged below still come
            // from alphabeticIndex.
            AlphabeticIndex<Double> indexCharacters2 = new AlphabeticIndex<Double>((ULocale) test[1]);
            for (int i = 2; i < test.length; ++i) {
                if (test[i] instanceof ULocale) {
                    indexCharacters2.addLabels((ULocale) test[i]);
                } else {
                    indexCharacters2.addLabels((UnicodeSet) test[i]);
                }
            }
            List<Bucket<Double>> buckets = CollectionUtilities.addAll(alphabeticIndex.iterator(), new ArrayList<Bucket<Double>>());
            logln(buckets.toString());
        }
        assertEquals(LabelType.OVERFLOW + "\t" + printList, 1, counter.get(LabelType.OVERFLOW));
    }
}
use of android.icu.text.UnicodeSet in project j2objc by google.
the class CollationMiscTest method TestImportWithType.
/**
 * Compares a collator built by concatenating the "vi" and "de-u-co-phonebk"
 * rule strings with one built from the equivalent {@code [import ...]} rules.
 * Mismatches in the tailored sets or in the collation keys are reported as
 * warnings; any exception along the way is only logged.
 */
@Test
public void TestImportWithType() {
    try {
        RuleBasedCollator vicoll = (RuleBasedCollator) Collator.getInstance(new ULocale("vi"));
        RuleBasedCollator decoll =
                (RuleBasedCollator) Collator.getInstance(ULocale.forLanguageTag("de-u-co-phonebk"));
        String combinedRules = vicoll.getRules() + decoll.getRules();
        RuleBasedCollator videcoll = new RuleBasedCollator(combinedRules);
        RuleBasedCollator importvidecoll =
                new RuleBasedCollator("[import vi][import de-u-co-phonebk]");

        UnicodeSet tailoredSet = videcoll.getTailoredSet();
        UnicodeSet importTailoredSet = importvidecoll.getTailoredSet();
        boolean sameTailoring = tailoredSet.equals(importTailoredSet);
        if (!sameTailoring) {
            warnln("Tailored set not equal");
        }

        // Every tailored string must produce the same key from both collators.
        UnicodeSetIterator it = new UnicodeSetIterator(tailoredSet);
        while (it.next()) {
            String t = it.getString();
            CollationKey concatenatedKey = videcoll.getCollationKey(t);
            CollationKey importedKey = importvidecoll.getCollationKey(t);
            if (!concatenatedKey.equals(importedKey)) {
                warnln("Collation key's not equal for " + t);
            }
        }
    } catch (Exception e) {
        // Android patch: Add --omitCollationRules to genrb.
        logln("ERROR: in creation of rule based collator");
        // Android patch end.
    }
}
Aggregations