use of android.icu.text.UnicodeSet in project j2objc by google.
the class RoundTripTest method getRepresentativeBoundaryHangul.
/**
 * Collects two-character composed strings obtained by NFC-composing short Hangul jamo
 * sequences built from the {@code hst=L}, {@code hst=V} and {@code hst=T} sets.
 *
 * <p>Three families of samples are composed; a composed result is kept only when its
 * length is exactly 2:
 * <ul>
 *   <li>(U+1100 or U+110B) + V + U+110B + V, for every pair of V characters;</li>
 *   <li>U+1100 + V + L + U+1161, for every V and L;</li>
 *   <li>U+1100 U+1161 + T + L + U+1161, for every T and L.</li>
 * </ul>
 *
 * @return a new set containing every kept composed string
 */
private static UnicodeSet getRepresentativeBoundaryHangul() {
    final UnicodeSet collected = new UnicodeSet();

    final UnicodeSet leadSet = new UnicodeSet("[:hst=L:]");
    final UnicodeSet vowelSet = new UnicodeSet("[:hst=V:]");
    final UnicodeSet trailSet = new UnicodeSet("[:hst=T:]");

    // U+1100 HANGUL CHOSEONG KIYEOK followed by U+1161 HANGUL JUNGSEONG A
    final String kiyeokA = "\u1100\u1161";
    final String kiyeok = "\u1100";
    final String jungseongA = "\u1161";
    // U+110B HANGUL CHOSEONG IEUNG
    final String ieung = "\u110B";

    final UnicodeSet firstLeads = new UnicodeSet("[\u1100\u110B]");

    // All combinations of (KIYEOK | IEUNG) + V + IEUNG + V.
    UnicodeSetIterator firstIt = new UnicodeSetIterator(firstLeads);
    while (firstIt.next()) {
        UnicodeSetIterator vowelIt = new UnicodeSetIterator(vowelSet);
        while (vowelIt.next()) {
            UnicodeSetIterator secondVowelIt = new UnicodeSetIterator(vowelSet);
            while (secondVowelIt.next()) {
                addIfComposesToTwoChars(
                        collected,
                        firstIt.getString() + vowelIt.getString() + ieung + secondVowelIt.getString());
            }
        }
    }

    UnicodeSetIterator leadIt = new UnicodeSetIterator(leadSet);
    while (leadIt.next()) {
        final String tail = leadIt.getString() + jungseongA;

        // All combinations of KIYEOK + V + L + A.
        UnicodeSetIterator vowelIt = new UnicodeSetIterator(vowelSet);
        while (vowelIt.next()) {
            addIfComposesToTwoChars(collected, kiyeok + vowelIt.getString() + tail);
        }

        // All combinations of KIYEOK A + T + L + A.
        UnicodeSetIterator trailIt = new UnicodeSetIterator(trailSet);
        while (trailIt.next()) {
            addIfComposesToTwoChars(collected, kiyeokA + trailIt.getString() + tail);
        }
    }
    return collected;
}

/** Composes {@code sample} and adds the result to {@code target} only if it is 2 chars long. */
private static void addIfComposesToTwoChars(UnicodeSet target, String sample) {
    String composed = Normalizer.compose(sample, false);
    if (composed.length() == 2) {
        target.add(composed);
    }
}
use of android.icu.text.UnicodeSet in project j2objc by google.
the class CollationDataReader method read.
/**
 * Reads serialized collation data from {@code inBytes} into {@code tailoring}.
 *
 * <p>After the ICU binary header the data starts with an array of int indexes. For each data
 * part, two consecutive index slots give its start and limit byte offsets. The parts are
 * consumed strictly in index order, because every read or skip advances the buffer position;
 * bytes of a part that are not consumed are skipped before the next part is read.
 *
 * <p>When a part is absent and base data is available, the tailoring's data aliases the
 * corresponding base field. When a part is present that requires tailored mappings but no
 * mappings trie was read, an {@code ICUException} is thrown.
 *
 * <p>At the end, the options/reordering/fast-Latin settings from the data are compared with the
 * tailoring's current settings; a writable settings object is requested via
 * {@code copyOnWrite()} only when they differ.
 *
 * @param base the base tailoring, or {@code null} if there is none
 * @param inBytes the buffer positioned at the start of the ICU binary header
 * @param tailoring the tailoring that receives the version, data and settings
 * @throws IOException declared by this method; presumably propagated from the ICUBinary and
 *     Trie2 readers (TODO confirm against those helpers)
 * @throws ICUException if the data is too short, inconsistent with the base data, or is
 *     missing a part that cannot be taken from the base data
 */
static void read(CollationTailoring base, ByteBuffer inBytes, CollationTailoring tailoring) throws IOException {
// ---- Header and UCA version check ----
tailoring.version = ICUBinary.readHeader(inBytes, DATA_FORMAT, IS_ACCEPTABLE);
if (base != null && base.getUCAVersion() != tailoring.getUCAVersion()) {
throw new ICUException("Tailoring UCA version differs from base data UCA version");
}
// ---- Indexes array ----
int inLength = inBytes.remaining();
if (inLength < 8) {
throw new ICUException("not enough bytes");
}
// inIndexes[IX_INDEXES_LENGTH]
int indexesLength = inBytes.getInt();
if (indexesLength < 2 || inLength < indexesLength * 4) {
throw new ICUException("not enough indexes");
}
int[] inIndexes = new int[IX_TOTAL_SIZE + 1];
inIndexes[0] = indexesLength;
// Read as many indexes as the data provides and this reader has slots for.
for (int i = 1; i < indexesLength && i < inIndexes.length; ++i) {
inIndexes[i] = inBytes.getInt();
}
// Slots that the data does not provide are marked with -1.
for (int i = indexesLength; i < inIndexes.length; ++i) {
inIndexes[i] = -1;
}
// Skip any additional indexes beyond the slots this reader keeps.
if (indexesLength > inIndexes.length) {
ICUBinary.skipBytes(inBytes, (indexesLength - inIndexes.length) * 4);
}
// Assume that the tailoring data is in initial state,
// with null pointers and 0 lengths.
// Set pointers to non-empty data parts.
// Do this in order of their byte offsets. (Should help porting to Java.)
// one of the indexes[] slots
int index;
// byte offset for the index part
int offset;
// number of bytes in the index part
int length;
// Determine the expected total length from the last available offset and verify
// that the buffer held at least that many bytes.
if (indexesLength > IX_TOTAL_SIZE) {
length = inIndexes[IX_TOTAL_SIZE];
} else if (indexesLength > IX_REORDER_CODES_OFFSET) {
length = inIndexes[indexesLength - 1];
} else {
// only indexes, and inLength was already checked for them
length = 0;
}
if (inLength < length) {
throw new ICUException("not enough bytes");
}
CollationData baseData = base == null ? null : base.data;
// ---- Reorder codes (and trailing reorder ranges) ----
int[] reorderCodes;
int reorderCodesLength;
index = IX_REORDER_CODES_OFFSET;
offset = inIndexes[index];
length = inIndexes[index + 1] - offset;
if (length >= 4) {
if (baseData == null) {
// Reorder codes are only accepted in data that has a base;
// data without a base must not carry a reordering.
throw new ICUException("Collation base data must not reorder scripts");
}
reorderCodesLength = length / 4;
reorderCodes = ICUBinary.getInts(inBytes, reorderCodesLength, length & 3);
// The reorderRanges (if any) are the trailing reorderCodes entries.
// Split the array at the boundary.
// Script or reorder codes do not exceed 16-bit values.
// Range limits are stored in the upper 16 bits, and are never 0.
int reorderRangesLength = 0;
while (reorderRangesLength < reorderCodesLength && (reorderCodes[reorderCodesLength - reorderRangesLength - 1] & 0xffff0000) != 0) {
++reorderRangesLength;
}
assert (reorderRangesLength < reorderCodesLength);
// reorderCodesLength now counts only the codes; the ranges stay in the same array after them.
reorderCodesLength -= reorderRangesLength;
} else {
reorderCodes = new int[0];
reorderCodesLength = 0;
ICUBinary.skipBytes(inBytes, length);
}
// ---- Reorder table ----
// There should be a reorder table only if there are reorder codes.
// However, when there are reorder codes the reorder table may be omitted to reduce
// the data size.
byte[] reorderTable = null;
index = IX_REORDER_TABLE_OFFSET;
offset = inIndexes[index];
length = inIndexes[index + 1] - offset;
if (length >= 256) {
if (reorderCodesLength == 0) {
throw new ICUException("Reordering table without reordering codes");
}
reorderTable = new byte[256];
inBytes.get(reorderTable);
length -= 256;
} else {
// If we have reorder codes, then build the reorderTable at the end,
// when the CollationData is otherwise complete.
}
ICUBinary.skipBytes(inBytes, length);
// The numeric primary is carried in the top byte of the options index.
if (baseData != null && baseData.numericPrimary != (inIndexes[IX_OPTIONS] & 0xff000000L)) {
throw new ICUException("Tailoring numeric primary weight differs from base data");
}
// ---- Mappings trie ----
// Remains null if there are no mappings.
CollationData data = null;
index = IX_TRIE_OFFSET;
offset = inIndexes[index];
length = inIndexes[index + 1] - offset;
if (length >= 8) {
tailoring.ensureOwnedData();
data = tailoring.ownedData;
data.base = baseData;
data.numericPrimary = inIndexes[IX_OPTIONS] & 0xff000000L;
data.trie = tailoring.trie = Trie2_32.createFromSerialized(inBytes);
int trieLength = data.trie.getSerializedLength();
if (trieLength > length) {
// No mappings.
throw new ICUException("Not enough bytes for the mappings trie");
}
length -= trieLength;
} else if (baseData != null) {
// Use the base data. Only the settings are tailored.
tailoring.data = baseData;
} else {
// No mappings.
throw new ICUException("Missing collation data mappings");
}
ICUBinary.skipBytes(inBytes, length);
// ---- Reserved part (skipped) ----
index = IX_RESERVED8_OFFSET;
offset = inIndexes[index];
length = inIndexes[index + 1] - offset;
ICUBinary.skipBytes(inBytes, length);
// ---- 64-bit CEs ----
index = IX_CES_OFFSET;
offset = inIndexes[index];
length = inIndexes[index + 1] - offset;
if (length >= 8) {
if (data == null) {
throw new ICUException("Tailored ces without tailored trie");
}
data.ces = ICUBinary.getLongs(inBytes, length / 8, length & 7);
} else {
ICUBinary.skipBytes(inBytes, length);
}
// ---- Reserved part (skipped) ----
index = IX_RESERVED10_OFFSET;
offset = inIndexes[index];
length = inIndexes[index + 1] - offset;
ICUBinary.skipBytes(inBytes, length);
// ---- 32-bit CE32s ----
index = IX_CE32S_OFFSET;
offset = inIndexes[index];
length = inIndexes[index + 1] - offset;
if (length >= 4) {
if (data == null) {
throw new ICUException("Tailored ce32s without tailored trie");
}
data.ce32s = ICUBinary.getInts(inBytes, length / 4, length & 3);
} else {
ICUBinary.skipBytes(inBytes, length);
}
// ---- Jamo CE32s: a slice of ce32s[] if a start index is given, else aliased from the base ----
int jamoCE32sStart = inIndexes[IX_JAMO_CE32S_START];
if (jamoCE32sStart >= 0) {
if (data == null || data.ce32s == null) {
throw new ICUException("JamoCE32sStart index into non-existent ce32s[]");
}
data.jamoCE32s = new int[CollationData.JAMO_CE32S_LENGTH];
System.arraycopy(data.ce32s, jamoCE32sStart, data.jamoCE32s, 0, CollationData.JAMO_CE32S_LENGTH);
} else if (data == null) {
// Nothing to do.
} else if (baseData != null) {
data.jamoCE32s = baseData.jamoCE32s;
} else {
throw new ICUException("Missing Jamo CE32s for Hangul processing");
}
// ---- Root elements ----
index = IX_ROOT_ELEMENTS_OFFSET;
offset = inIndexes[index];
length = inIndexes[index + 1] - offset;
if (length >= 4) {
int rootElementsLength = length / 4;
if (data == null) {
throw new ICUException("Root elements but no mappings");
}
if (rootElementsLength <= CollationRootElements.IX_SEC_TER_BOUNDARIES) {
throw new ICUException("Root elements array too short");
}
data.rootElements = new long[rootElementsLength];
for (int i = 0; i < rootElementsLength; ++i) {
// unsigned int -> long
data.rootElements[i] = inBytes.getInt() & 0xffffffffL;
}
long commonSecTer = data.rootElements[CollationRootElements.IX_COMMON_SEC_AND_TER_CE];
if (commonSecTer != Collation.COMMON_SEC_AND_TER_CE) {
throw new ICUException("Common sec/ter weights in base data differ from the hardcoded value");
}
long secTerBoundaries = data.rootElements[CollationRootElements.IX_SEC_TER_BOUNDARIES];
if ((secTerBoundaries >>> 24) < CollationKeys.SEC_COMMON_HIGH) {
// [fixed last secondary common byte] is too low,
// and secondary weights would collide with compressed common secondaries.
throw new ICUException("[fixed last secondary common byte] is too low");
}
// Only the bytes beyond the last whole int remain to be skipped.
length &= 3;
}
ICUBinary.skipBytes(inBytes, length);
// ---- Contexts string ----
index = IX_CONTEXTS_OFFSET;
offset = inIndexes[index];
length = inIndexes[index + 1] - offset;
if (length >= 2) {
if (data == null) {
throw new ICUException("Tailored contexts without tailored trie");
}
data.contexts = ICUBinary.getString(inBytes, length / 2, length & 1);
} else {
ICUBinary.skipBytes(inBytes, length);
}
// ---- Unsafe-backward set ----
index = IX_UNSAFE_BWD_OFFSET;
offset = inIndexes[index];
length = inIndexes[index + 1] - offset;
if (length >= 2) {
if (data == null) {
throw new ICUException("Unsafe-backward-set but no mappings");
}
if (baseData == null) {
// Create the unsafe-backward set for the root collator.
// Include all non-zero combining marks and trail surrogates.
// We do this at load time, rather than at build time,
// to simplify Unicode version bootstrapping:
// The root data builder only needs the new FractionalUCA.txt data,
// but it need not be built with a version of ICU already updated to
// the corresponding new Unicode Character Database.
//
// The following is an optimized version of
// new UnicodeSet("[[:^lccc=0:][\\udc00-\\udfff]]").
// It is faster and requires fewer code dependencies.
// trail surrogates
tailoring.unsafeBackwardSet = new UnicodeSet(0xdc00, 0xdfff);
data.nfcImpl.addLcccChars(tailoring.unsafeBackwardSet);
} else {
// Clone the root collator's set contents.
tailoring.unsafeBackwardSet = baseData.unsafeBackwardSet.cloneAsThawed();
}
// Add the ranges from the data file to the unsafe-backward set.
USerializedSet sset = new USerializedSet();
char[] unsafeData = ICUBinary.getChars(inBytes, length / 2, length & 1);
// The whole part has been consumed; nothing is left for the skip below.
length = 0;
sset.getSet(unsafeData, 0);
int count = sset.countRanges();
int[] range = new int[2];
for (int i = 0; i < count; ++i) {
sset.getRange(i, range);
tailoring.unsafeBackwardSet.add(range[0], range[1]);
}
// Mark each lead surrogate as "unsafe"
// if any of its 1024 associated supplementary code points is "unsafe".
int c = 0x10000;
for (int lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) {
if (!tailoring.unsafeBackwardSet.containsNone(c, c + 0x3ff)) {
tailoring.unsafeBackwardSet.add(lead);
}
}
tailoring.unsafeBackwardSet.freeze();
data.unsafeBackwardSet = tailoring.unsafeBackwardSet;
} else if (data == null) {
// Nothing to do.
} else if (baseData != null) {
// No tailoring-specific data: Alias the root collator's set.
data.unsafeBackwardSet = baseData.unsafeBackwardSet;
} else {
throw new ICUException("Missing unsafe-backward-set");
}
ICUBinary.skipBytes(inBytes, length);
// ---- Fast Latin table ----
// If the fast Latin format version is different,
// or the version is set to 0 for "no fast Latin table",
// then just always use the normal string comparison path.
index = IX_FAST_LATIN_TABLE_OFFSET;
offset = inIndexes[index];
length = inIndexes[index + 1] - offset;
if (data != null) {
data.fastLatinTable = null;
data.fastLatinTableHeader = null;
// Bits 23..16 of the options index carry the fast Latin version of the data.
if (((inIndexes[IX_OPTIONS] >> 16) & 0xff) == CollationFastLatin.VERSION) {
if (length >= 2) {
// The low byte of the first header unit is the header length (in chars),
// the high byte is the table's version (checked below).
char header0 = inBytes.getChar();
int headerLength = header0 & 0xff;
data.fastLatinTableHeader = new char[headerLength];
data.fastLatinTableHeader[0] = header0;
for (int i = 1; i < headerLength; ++i) {
data.fastLatinTableHeader[i] = inBytes.getChar();
}
int tableLength = length / 2 - headerLength;
data.fastLatinTable = ICUBinary.getChars(inBytes, tableLength, length & 1);
length = 0;
if ((header0 >> 8) != CollationFastLatin.VERSION) {
throw new ICUException("Fast-Latin table version differs from version in data header");
}
} else if (baseData != null) {
data.fastLatinTable = baseData.fastLatinTable;
data.fastLatinTableHeader = baseData.fastLatinTableHeader;
}
}
}
ICUBinary.skipBytes(inBytes, length);
// ---- Script order data ----
index = IX_SCRIPTS_OFFSET;
offset = inIndexes[index];
length = inIndexes[index + 1] - offset;
if (length >= 2) {
if (data == null) {
throw new ICUException("Script order data but no mappings");
}
int scriptsLength = length / 2;
// The script data is read through a separate char view; length is left unchanged,
// so the skipBytes call after this block skips the whole part in inBytes.
CharBuffer inChars = inBytes.asCharBuffer();
data.numScripts = inChars.get();
// There must be enough entries for both arrays, including more than two range starts.
int scriptStartsLength = scriptsLength - (1 + data.numScripts + 16);
if (scriptStartsLength <= 2) {
throw new ICUException("Script order data too short");
}
inChars.get(data.scriptsIndex = new char[data.numScripts + 16]);
inChars.get(data.scriptStarts = new char[scriptStartsLength]);
// Sanity-check the first two and the last range start.
if (!(data.scriptStarts[0] == 0 && data.scriptStarts[1] == ((Collation.MERGE_SEPARATOR_BYTE + 1) << 8) && data.scriptStarts[scriptStartsLength - 1] == (Collation.TRAIL_WEIGHT_BYTE << 8))) {
throw new ICUException("Script order data not valid");
}
} else if (data == null) {
// Nothing to do.
} else if (baseData != null) {
data.numScripts = baseData.numScripts;
data.scriptsIndex = baseData.scriptsIndex;
data.scriptStarts = baseData.scriptStarts;
}
ICUBinary.skipBytes(inBytes, length);
// ---- Compressible primary lead bytes: one flag byte per lead byte value ----
index = IX_COMPRESSIBLE_BYTES_OFFSET;
offset = inIndexes[index];
length = inIndexes[index + 1] - offset;
if (length >= 256) {
if (data == null) {
throw new ICUException("Data for compressible primary lead bytes but no mappings");
}
data.compressibleBytes = new boolean[256];
for (int i = 0; i < 256; ++i) {
data.compressibleBytes[i] = inBytes.get() != 0;
}
length -= 256;
} else if (data == null) {
// Nothing to do.
} else if (baseData != null) {
data.compressibleBytes = baseData.compressibleBytes;
} else {
throw new ICUException("Missing data for compressible primary lead bytes");
}
ICUBinary.skipBytes(inBytes, length);
// ---- Reserved part (skipped) ----
index = IX_RESERVED18_OFFSET;
offset = inIndexes[index];
length = inIndexes[index + 1] - offset;
ICUBinary.skipBytes(inBytes, length);
// ---- Settings ----
// Compare the settings derived from the data with the tailoring's current ones;
// return early, without requesting a writable copy, when nothing differs.
CollationSettings ts = tailoring.settings.readOnly();
int options = inIndexes[IX_OPTIONS] & 0xffff;
char[] fastLatinPrimaries = new char[CollationFastLatin.LATIN_LIMIT];
int fastLatinOptions = CollationFastLatin.getOptions(tailoring.data, ts, fastLatinPrimaries);
if (options == ts.options && ts.variableTop != 0 && Arrays.equals(reorderCodes, ts.reorderCodes) && fastLatinOptions == ts.fastLatinOptions && (fastLatinOptions < 0 || Arrays.equals(fastLatinPrimaries, ts.fastLatinPrimaries))) {
return;
}
CollationSettings settings = tailoring.settings.copyOnWrite();
settings.options = options;
// Set variableTop from options and scripts data.
settings.variableTop = tailoring.data.getLastPrimaryForGroup(Collator.ReorderCodes.FIRST + settings.getMaxVariable());
if (settings.variableTop == 0) {
throw new ICUException("The maxVariable could not be mapped to a variableTop");
}
if (reorderCodesLength != 0) {
settings.aliasReordering(baseData, reorderCodes, reorderCodesLength, reorderTable);
}
// Recompute the fast Latin options against the updated settings.
settings.fastLatinOptions = CollationFastLatin.getOptions(tailoring.data, settings, settings.fastLatinPrimaries);
}
use of android.icu.text.UnicodeSet in project j2objc by google.
the class Normalizer2Impl method addToStartSet.
/**
 * Records {@code origin} as a character associated with {@code decompLead} in
 * {@code newData}.
 *
 * <p>While the trie value for {@code decompLead} has neither the {@code CANON_HAS_SET} flag
 * nor any {@code CANON_VALUE_MASK} bits set, a single non-zero origin is stored inline in the
 * trie value. Otherwise the origins are kept in a {@link UnicodeSet} held in
 * {@code canonStartSets}, and the trie value stores that set's list index together with the
 * {@code CANON_HAS_SET} flag.
 *
 * @param newData the writable trie whose value for {@code decompLead} is read and updated
 * @param origin the code point to record
 * @param decompLead the code point whose trie value is updated
 */
private void addToStartSet(Trie2Writable newData, int origin, int decompLead) {
    final int current = newData.get(decompLead);

    // Fast path: nothing recorded yet and origin can be stored inline (0 cannot, since it
    // would be indistinguishable from "no value").
    final boolean nothingRecorded = (current & (CANON_HAS_SET | CANON_VALUE_MASK)) == 0;
    if (nothingRecorded && origin != 0) {
        newData.set(decompLead, current | origin);
        return;
    }

    // origin is not the first character, or it is U+0000: a set is needed.
    final UnicodeSet origins;
    if ((current & CANON_HAS_SET) != 0) {
        // A set already exists; the value bits are its index in canonStartSets.
        origins = canonStartSets.get(current & CANON_VALUE_MASK);
    } else {
        // Promote the inline value (if any) to a newly appended set.
        final int inlineOrigin = current & CANON_VALUE_MASK;
        final int promoted = (current & ~CANON_VALUE_MASK) | CANON_HAS_SET | canonStartSets.size();
        newData.set(decompLead, promoted);
        origins = new UnicodeSet();
        canonStartSets.add(origins);
        if (inlineOrigin != 0) {
            origins.add(inlineOrigin);
        }
    }
    origins.add(origin);
}
use of android.icu.text.UnicodeSet in project j2objc by google.
the class UnicodeRegex method transform.
/**
 * Adds full Unicode property support, with the latest version of Unicode,
 * to Java Regex, bringing it up to Level 1 (see
 * http://www.unicode.org/reports/tr18/). It does this by preprocessing the
 * regex pattern string and interpreting the character classes (\p{...},
 * \P{...}, [...]) according to their syntax and meaning in UnicodeSet. With
 * this utility, Java regex expressions can be updated to work with the
 * latest version of Unicode, and with all Unicode properties. Note that the
 * UnicodeSet syntax has not yet, however, been updated to be completely
 * consistent with Java regex, so be careful of the differences.
 *
 * <p>Text that is escaped with a single backslash, or quoted between
 * {@code \Q} and {@code \E}, is copied to the output unchanged and is never
 * interpreted as a UnicodeSet.
 *
 * <p>Not thread-safe; create a separate copy for different threads.
 * <p>In the future, we may extend this to support other regex packages.
 *
 * @param regex A modified Java regex pattern, as in the input to
 *         Pattern.compile(), except that all "character classes" are
 *         processed as if they were UnicodeSet patterns. Example:
 *         "abc[:bc=N:]". See UnicodeSet for the differences in syntax.
 * @return A processed Java regex pattern, suitable for input to
 *         Pattern.compile().
 * @throws IllegalArgumentException if a character class cannot be parsed
 *         (raised by {@code processSet})
 */
@Override
public String transform(String regex) {
    StringBuilder result = new StringBuilder();
    UnicodeSet temp = new UnicodeSet();
    ParsePosition pos = new ParsePosition(0);
    // Scanner state:
    //   0 = normal text
    //   1 = just after a backslash
    //   2 = inside a \Q... quoted section
    //   3 = inside a \Q... quoted section, just after a backslash
    int state = 0;
    for (int i = 0; i < regex.length(); ++i) {
        // look for UnicodeSets, allowing for quoting with \ and \Q
        char ch = regex.charAt(i);
        switch (state) {
        case 0: // we only care about \, and '['.
            if (ch == '\\') {
                if (UnicodeSet.resemblesPattern(regex, i)) {
                    // should only happen with \p
                    // processSet appended the replacement and moved i to the set's last
                    // char, so skip the append of ch below.
                    i = processSet(regex, i, result, temp, pos);
                    continue;
                }
                state = 1;
            } else if (ch == '[') {
                // if we have what looks like a UnicodeSet
                if (UnicodeSet.resemblesPattern(regex, i)) {
                    i = processSet(regex, i, result, temp, pos);
                    continue;
                }
            }
            break;
        case 1: // we are after a \
            // \Q opens a quoted section; any other escaped char returns to normal text.
            // (Previously 'Q' left the state at 1, so quoted sections were never entered.)
            state = (ch == 'Q') ? 2 : 0;
            break;
        case 2: // we are in a \Q...
            if (ch == '\\') {
                state = 3;
            }
            break;
        case 3: // we are in a \Q...\
            if (ch == 'E') {
                // \E closes the quoted section.
                // (Previously the state was unconditionally reset to 2 here.)
                state = 0;
            } else if (ch != '\\') {
                // The backslash was literal; a further backslash may still start \E,
                // so only leave this state on a non-backslash char.
                state = 2;
            }
            break;
        default:
            break;
        }
        result.append(ch);
    }
    return result.toString();
}
use of android.icu.text.UnicodeSet in project j2objc by google.
the class UnicodeRegex method processSet.
// ===== PRIVATES =====
/**
 * Parses the UnicodeSet pattern that starts at index {@code i} of {@code regex} and appends
 * its {@code toPattern(false)} form to {@code result}.
 *
 * @param regex the full pattern string being transformed
 * @param i the index in {@code regex} at which the set pattern starts
 * @param result the output buffer that receives the set's pattern
 * @param temp a scratch set; it is cleared and overwritten by this call
 * @param pos a scratch parse position; it is set to {@code i} and advanced by the parse
 * @return the parse position after the set, minus one, so that the caller's loop increment
 *         lands on the first char after the set
 * @throws IllegalArgumentException wrapping any exception raised while parsing, with the
 *         message {@code "Error in " + regex}
 */
private int processSet(String regex, int i, StringBuilder result, UnicodeSet temp, ParsePosition pos) {
    try {
        pos.setIndex(i);
        UnicodeSet parsed = temp.clear().applyPattern(regex, pos, symbolTable, 0);
        // Double complement: workaround kept from the original code ("hack to fix toPattern").
        parsed.complement().complement();
        String replacement = parsed.toPattern(false);
        result.append(replacement);
        // Step back one char to allow for the caller's loop increment.
        return pos.getIndex() - 1;
    } catch (Exception e) {
        throw new IllegalArgumentException("Error in " + regex, e);
    }
}
Aggregations