Search in sources :

Example 81 with UnicodeSet

use of android.icu.text.UnicodeSet in project j2objc by google.

the class RoundTripTest method getRepresentativeBoundaryHangul.

/**
 * Collects sample strings built from Hangul jamo sequences whose composed
 * form (per {@code Normalizer.compose(sample, false)}) is exactly two
 * UTF-16 code units long.
 *
 * <p>Three families of samples are tried:
 * <ul>
 *   <li>{U+1100, U+110B} + V + U+110B + V, for every pair of V jamo;</li>
 *   <li>U+1100 + V + L + U+1161, for every L and V jamo;</li>
 *   <li>U+1100 U+1161 + T + L + U+1161, for every L and T jamo.</li>
 * </ul>
 *
 * @return a new set containing every composed sample of length 2
 */
private static UnicodeSet getRepresentativeBoundaryHangul() {
    UnicodeSet collected = new UnicodeSet();

    // Jamo classes selected by Hangul_Syllable_Type.
    UnicodeSet leadingJamo = new UnicodeSet("[:hst=L:]");
    UnicodeSet vowelJamo = new UnicodeSet("[:hst=V:]");
    UnicodeSet trailingJamo = new UnicodeSet("[:hst=T:]");

    // U+1100 HANGUL CHOSEONG KIYEOK
    String kiyeok = "\u1100";
    // U+1161 HANGUL JUNGSEONG A
    String jungseongA = "\u1161";
    // KIYEOK followed by A
    String kiyeokA = "\u1100\u1161";
    // U+110B HANGUL CHOSEONG IEUNG
    String ieung = "\u110B";

    // The two leading consonants used for the L0 + V + IEUNG + V samples.
    UnicodeSet firstConsonants = new UnicodeSet("[\u1100\u110B]");

    UnicodeSetIterator firstIt = new UnicodeSetIterator(firstConsonants);
    while (firstIt.next()) {
        UnicodeSetIterator vowelIt = new UnicodeSetIterator(vowelJamo);
        while (vowelIt.next()) {
            UnicodeSetIterator secondVowelIt = new UnicodeSetIterator(vowelJamo);
            while (secondVowelIt.next()) {
                String composed = Normalizer.compose(
                        firstIt.getString() + vowelIt.getString() + ieung + secondVowelIt.getString(),
                        false);
                if (composed.length() != 2) {
                    continue;
                }
                collected.add(composed);
            }
        }
    }

    UnicodeSetIterator leadIt = new UnicodeSetIterator(leadingJamo);
    while (leadIt.next()) {
        // Every sample in this pass ends with the current L followed by A.
        final String tail = leadIt.getString() + jungseongA;

        // All combinations of KIYEOK + V + L + A.
        UnicodeSetIterator vowelIt = new UnicodeSetIterator(vowelJamo);
        while (vowelIt.next()) {
            String composed = Normalizer.compose(kiyeok + vowelIt.getString() + tail, false);
            if (composed.length() != 2) {
                continue;
            }
            collected.add(composed);
        }

        // All combinations of KIYEOK A + T + L + A.
        UnicodeSetIterator trailIt = new UnicodeSetIterator(trailingJamo);
        while (trailIt.next()) {
            String composed = Normalizer.compose(kiyeokA + trailIt.getString() + tail, false);
            if (composed.length() != 2) {
                continue;
            }
            collected.add(composed);
        }
    }
    return collected;
}
Also used : UnicodeSetIterator(android.icu.text.UnicodeSetIterator) UnicodeSet(android.icu.text.UnicodeSet)

Example 82 with UnicodeSet

use of android.icu.text.UnicodeSet in project j2objc by google.

the class CollationDataReader method read.

/**
 * Reads serialized collation data from {@code inBytes} into {@code tailoring}.
 *
 * <p>The method reads the binary header and the index array, then consumes the
 * data parts strictly in the order of their index slots (reorder codes, reorder
 * table, trie, reserved, CEs, reserved, CE32s, root elements, contexts,
 * unsafe-backward set, fast Latin table, scripts, compressible bytes, reserved).
 * For each part, {@code length} is the byte count between two adjacent index
 * offsets; whatever a part does not consume itself is skipped so that the buffer
 * position lines up with the next part. Finally the tailoring's settings are
 * updated if the options, reorder codes or fast Latin options from the data
 * differ from the current ones.
 *
 * @param base the base tailoring, or {@code null}; when {@code null}, parts that
 *        would otherwise be taken from the base data must be present in the input
 * @param inBytes the buffer positioned at the start of the data header
 * @param tailoring the tailoring object that receives the data and settings
 * @throws IOException declared by this method; presumably raised by the
 *         {@code ICUBinary} helpers -- confirm against their declarations
 * @throws ICUException if the data is too short or internally inconsistent
 */
static void read(CollationTailoring base, ByteBuffer inBytes, CollationTailoring tailoring) throws IOException {
    tailoring.version = ICUBinary.readHeader(inBytes, DATA_FORMAT, IS_ACCEPTABLE);
    if (base != null && base.getUCAVersion() != tailoring.getUCAVersion()) {
        throw new ICUException("Tailoring UCA version differs from base data UCA version");
    }
    int inLength = inBytes.remaining();
    if (inLength < 8) {
        throw new ICUException("not enough bytes");
    }
    // inIndexes[IX_INDEXES_LENGTH]
    int indexesLength = inBytes.getInt();
    if (indexesLength < 2 || inLength < indexesLength * 4) {
        throw new ICUException("not enough indexes");
    }
    // Copy as many indexes as this code knows about; slots that the data does
    // not provide are set to -1, and extra indexes in the data are skipped.
    int[] inIndexes = new int[IX_TOTAL_SIZE + 1];
    inIndexes[0] = indexesLength;
    for (int i = 1; i < indexesLength && i < inIndexes.length; ++i) {
        inIndexes[i] = inBytes.getInt();
    }
    for (int i = indexesLength; i < inIndexes.length; ++i) {
        inIndexes[i] = -1;
    }
    if (indexesLength > inIndexes.length) {
        ICUBinary.skipBytes(inBytes, (indexesLength - inIndexes.length) * 4);
    }
    // Assume that the tailoring data is in initial state,
    // with null pointers and 0 lengths.
    // Set pointers to non-empty data parts.
    // Do this in order of their byte offsets. (Should help porting to Java.)
    // one of the indexes[] slots
    int index;
    // byte offset for the index part
    int offset;
    // number of bytes in the index part
    int length;
    // Determine the total data size from the last available offset
    // and verify that the buffer holds at least that many bytes.
    if (indexesLength > IX_TOTAL_SIZE) {
        length = inIndexes[IX_TOTAL_SIZE];
    } else if (indexesLength > IX_REORDER_CODES_OFFSET) {
        length = inIndexes[indexesLength - 1];
    } else {
        // only indexes, and inLength was already checked for them
        length = 0;
    }
    if (inLength < length) {
        throw new ICUException("not enough bytes");
    }
    CollationData baseData = base == null ? null : base.data;
    int[] reorderCodes;
    int reorderCodesLength;
    index = IX_REORDER_CODES_OFFSET;
    offset = inIndexes[index];
    length = inIndexes[index + 1] - offset;
    if (length >= 4) {
        if (baseData == null) {
            // Reorder codes are only accepted when there is base data:
            // data read without a base must not reorder scripts.
            throw new ICUException("Collation base data must not reorder scripts");
        }
        reorderCodesLength = length / 4;
        reorderCodes = ICUBinary.getInts(inBytes, reorderCodesLength, length & 3);
        // The reorderRanges (if any) are the trailing reorderCodes entries.
        // Split the array at the boundary.
        // Script or reorder codes do not exceed 16-bit values.
        // Range limits are stored in the upper 16 bits, and are never 0.
        int reorderRangesLength = 0;
        while (reorderRangesLength < reorderCodesLength && (reorderCodes[reorderCodesLength - reorderRangesLength - 1] & 0xffff0000) != 0) {
            ++reorderRangesLength;
        }
        assert (reorderRangesLength < reorderCodesLength);
        reorderCodesLength -= reorderRangesLength;
    } else {
        reorderCodes = new int[0];
        reorderCodesLength = 0;
        ICUBinary.skipBytes(inBytes, length);
    }
    // There should be a reorder table only if there are reorder codes.
    // However, when there are reorder codes the reorder table may be omitted to reduce
    // the data size.
    byte[] reorderTable = null;
    index = IX_REORDER_TABLE_OFFSET;
    offset = inIndexes[index];
    length = inIndexes[index + 1] - offset;
    if (length >= 256) {
        if (reorderCodesLength == 0) {
            throw new ICUException("Reordering table without reordering codes");
        }
        reorderTable = new byte[256];
        inBytes.get(reorderTable);
        length -= 256;
    } else {
        // If we have reorder codes, then build the reorderTable at the end,
        // when the CollationData is otherwise complete.
    }
    ICUBinary.skipBytes(inBytes, length);
    // The top byte of the options word carries the numeric primary weight.
    if (baseData != null && baseData.numericPrimary != (inIndexes[IX_OPTIONS] & 0xff000000L)) {
        throw new ICUException("Tailoring numeric primary weight differs from base data");
    }
    // Remains null if there are no mappings.
    CollationData data = null;
    index = IX_TRIE_OFFSET;
    offset = inIndexes[index];
    length = inIndexes[index + 1] - offset;
    if (length >= 8) {
        tailoring.ensureOwnedData();
        data = tailoring.ownedData;
        data.base = baseData;
        data.numericPrimary = inIndexes[IX_OPTIONS] & 0xff000000L;
        data.trie = tailoring.trie = Trie2_32.createFromSerialized(inBytes);
        int trieLength = data.trie.getSerializedLength();
        if (trieLength > length) {
            // No mappings.
            throw new ICUException("Not enough bytes for the mappings trie");
        }
        length -= trieLength;
    } else if (baseData != null) {
        // Use the base data. Only the settings are tailored.
        tailoring.data = baseData;
    } else {
        // No mappings.
        throw new ICUException("Missing collation data mappings");
    }
    ICUBinary.skipBytes(inBytes, length);
    // Reserved part: skipped without interpretation.
    index = IX_RESERVED8_OFFSET;
    offset = inIndexes[index];
    length = inIndexes[index + 1] - offset;
    ICUBinary.skipBytes(inBytes, length);
    // 64-bit CEs; the "length & 7" remainder bytes are passed to getLongs().
    index = IX_CES_OFFSET;
    offset = inIndexes[index];
    length = inIndexes[index + 1] - offset;
    if (length >= 8) {
        if (data == null) {
            throw new ICUException("Tailored ces without tailored trie");
        }
        data.ces = ICUBinary.getLongs(inBytes, length / 8, length & 7);
    } else {
        ICUBinary.skipBytes(inBytes, length);
    }
    // Reserved part: skipped without interpretation.
    index = IX_RESERVED10_OFFSET;
    offset = inIndexes[index];
    length = inIndexes[index + 1] - offset;
    ICUBinary.skipBytes(inBytes, length);
    // 32-bit CE32s; the "length & 3" remainder bytes are passed to getInts().
    index = IX_CE32S_OFFSET;
    offset = inIndexes[index];
    length = inIndexes[index + 1] - offset;
    if (length >= 4) {
        if (data == null) {
            throw new ICUException("Tailored ce32s without tailored trie");
        }
        data.ce32s = ICUBinary.getInts(inBytes, length / 4, length & 3);
    } else {
        ICUBinary.skipBytes(inBytes, length);
    }
    // The Jamo CE32s are a slice of ce32s[] starting at the given index;
    // a negative start means the data does not provide them.
    int jamoCE32sStart = inIndexes[IX_JAMO_CE32S_START];
    if (jamoCE32sStart >= 0) {
        if (data == null || data.ce32s == null) {
            throw new ICUException("JamoCE32sStart index into non-existent ce32s[]");
        }
        data.jamoCE32s = new int[CollationData.JAMO_CE32S_LENGTH];
        System.arraycopy(data.ce32s, jamoCE32sStart, data.jamoCE32s, 0, CollationData.JAMO_CE32S_LENGTH);
    } else if (data == null) {
        // Nothing to do.
    } else if (baseData != null) {
        data.jamoCE32s = baseData.jamoCE32s;
    } else {
        throw new ICUException("Missing Jamo CE32s for Hangul processing");
    }
    index = IX_ROOT_ELEMENTS_OFFSET;
    offset = inIndexes[index];
    length = inIndexes[index + 1] - offset;
    if (length >= 4) {
        int rootElementsLength = length / 4;
        if (data == null) {
            throw new ICUException("Root elements but no mappings");
        }
        if (rootElementsLength <= CollationRootElements.IX_SEC_TER_BOUNDARIES) {
            throw new ICUException("Root elements array too short");
        }
        data.rootElements = new long[rootElementsLength];
        for (int i = 0; i < rootElementsLength; ++i) {
            // unsigned int -> long
            data.rootElements[i] = inBytes.getInt() & 0xffffffffL;
        }
        long commonSecTer = data.rootElements[CollationRootElements.IX_COMMON_SEC_AND_TER_CE];
        if (commonSecTer != Collation.COMMON_SEC_AND_TER_CE) {
            throw new ICUException("Common sec/ter weights in base data differ from the hardcoded value");
        }
        long secTerBoundaries = data.rootElements[CollationRootElements.IX_SEC_TER_BOUNDARIES];
        if ((secTerBoundaries >>> 24) < CollationKeys.SEC_COMMON_HIGH) {
            // [fixed last secondary common byte] is too low,
            // and secondary weights would collide with compressed common secondaries.
            throw new ICUException("[fixed last secondary common byte] is too low");
        }
        // Only the bytes beyond the last whole int remain to be skipped.
        length &= 3;
    }
    ICUBinary.skipBytes(inBytes, length);
    index = IX_CONTEXTS_OFFSET;
    offset = inIndexes[index];
    length = inIndexes[index + 1] - offset;
    if (length >= 2) {
        if (data == null) {
            throw new ICUException("Tailored contexts without tailored trie");
        }
        data.contexts = ICUBinary.getString(inBytes, length / 2, length & 1);
    } else {
        ICUBinary.skipBytes(inBytes, length);
    }
    index = IX_UNSAFE_BWD_OFFSET;
    offset = inIndexes[index];
    length = inIndexes[index + 1] - offset;
    if (length >= 2) {
        if (data == null) {
            throw new ICUException("Unsafe-backward-set but no mappings");
        }
        if (baseData == null) {
            // Create the unsafe-backward set for the root collator.
            // Include all non-zero combining marks and trail surrogates.
            // We do this at load time, rather than at build time,
            // to simplify Unicode version bootstrapping:
            // The root data builder only needs the new FractionalUCA.txt data,
            // but it need not be built with a version of ICU already updated to
            // the corresponding new Unicode Character Database.
            // 
            // The following is an optimized version of
            // new UnicodeSet("[[:^lccc=0:][\\udc00-\\udfff]]").
            // It is faster and requires fewer code dependencies.
            // trail surrogates
            tailoring.unsafeBackwardSet = new UnicodeSet(0xdc00, 0xdfff);
            data.nfcImpl.addLcccChars(tailoring.unsafeBackwardSet);
        } else {
            // Clone the root collator's set contents.
            tailoring.unsafeBackwardSet = baseData.unsafeBackwardSet.cloneAsThawed();
        }
        // Add the ranges from the data file to the unsafe-backward set.
        USerializedSet sset = new USerializedSet();
        char[] unsafeData = ICUBinary.getChars(inBytes, length / 2, length & 1);
        // The whole part has been consumed; nothing is left for the skip below.
        length = 0;
        sset.getSet(unsafeData, 0);
        int count = sset.countRanges();
        int[] range = new int[2];
        for (int i = 0; i < count; ++i) {
            sset.getRange(i, range);
            tailoring.unsafeBackwardSet.add(range[0], range[1]);
        }
        // Mark each lead surrogate as "unsafe"
        // if any of its 1024 associated supplementary code points is "unsafe".
        int c = 0x10000;
        for (int lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) {
            if (!tailoring.unsafeBackwardSet.containsNone(c, c + 0x3ff)) {
                tailoring.unsafeBackwardSet.add(lead);
            }
        }
        tailoring.unsafeBackwardSet.freeze();
        data.unsafeBackwardSet = tailoring.unsafeBackwardSet;
    } else if (data == null) {
        // Nothing to do.
    } else if (baseData != null) {
        // No tailoring-specific data: Alias the root collator's set.
        data.unsafeBackwardSet = baseData.unsafeBackwardSet;
    } else {
        throw new ICUException("Missing unsafe-backward-set");
    }
    ICUBinary.skipBytes(inBytes, length);
    // If the fast Latin format version is different,
    // or the version is set to 0 for "no fast Latin table",
    // then just always use the normal string comparison path.
    index = IX_FAST_LATIN_TABLE_OFFSET;
    offset = inIndexes[index];
    length = inIndexes[index + 1] - offset;
    if (data != null) {
        data.fastLatinTable = null;
        data.fastLatinTableHeader = null;
        // Bits 23..16 of the options word hold the fast Latin version.
        if (((inIndexes[IX_OPTIONS] >> 16) & 0xff) == CollationFastLatin.VERSION) {
            if (length >= 2) {
                // The first char holds the version (high byte)
                // and the header length in chars (low byte).
                char header0 = inBytes.getChar();
                int headerLength = header0 & 0xff;
                data.fastLatinTableHeader = new char[headerLength];
                data.fastLatinTableHeader[0] = header0;
                for (int i = 1; i < headerLength; ++i) {
                    data.fastLatinTableHeader[i] = inBytes.getChar();
                }
                int tableLength = length / 2 - headerLength;
                data.fastLatinTable = ICUBinary.getChars(inBytes, tableLength, length & 1);
                // The whole part has been consumed; nothing is left for the skip below.
                length = 0;
                if ((header0 >> 8) != CollationFastLatin.VERSION) {
                    throw new ICUException("Fast-Latin table version differs from version in data header");
                }
            } else if (baseData != null) {
                data.fastLatinTable = baseData.fastLatinTable;
                data.fastLatinTableHeader = baseData.fastLatinTableHeader;
            }
        }
    }
    ICUBinary.skipBytes(inBytes, length);
    index = IX_SCRIPTS_OFFSET;
    offset = inIndexes[index];
    length = inIndexes[index + 1] - offset;
    if (length >= 2) {
        if (data == null) {
            throw new ICUException("Script order data but no mappings");
        }
        int scriptsLength = length / 2;
        // Reads go through a separate CharBuffer view; length is left unchanged
        // here and the full part is skipped on inBytes after this branch.
        CharBuffer inChars = inBytes.asCharBuffer();
        data.numScripts = inChars.get();
        // There must be enough entries for both arrays, including more than two range starts.
        int scriptStartsLength = scriptsLength - (1 + data.numScripts + 16);
        if (scriptStartsLength <= 2) {
            throw new ICUException("Script order data too short");
        }
        inChars.get(data.scriptsIndex = new char[data.numScripts + 16]);
        inChars.get(data.scriptStarts = new char[scriptStartsLength]);
        if (!(data.scriptStarts[0] == 0 && data.scriptStarts[1] == ((Collation.MERGE_SEPARATOR_BYTE + 1) << 8) && data.scriptStarts[scriptStartsLength - 1] == (Collation.TRAIL_WEIGHT_BYTE << 8))) {
            throw new ICUException("Script order data not valid");
        }
    } else if (data == null) {
        // Nothing to do.
    } else if (baseData != null) {
        data.numScripts = baseData.numScripts;
        data.scriptsIndex = baseData.scriptsIndex;
        data.scriptStarts = baseData.scriptStarts;
    }
    ICUBinary.skipBytes(inBytes, length);
    // One flag byte per primary lead byte; non-zero means compressible.
    index = IX_COMPRESSIBLE_BYTES_OFFSET;
    offset = inIndexes[index];
    length = inIndexes[index + 1] - offset;
    if (length >= 256) {
        if (data == null) {
            throw new ICUException("Data for compressible primary lead bytes but no mappings");
        }
        data.compressibleBytes = new boolean[256];
        for (int i = 0; i < 256; ++i) {
            data.compressibleBytes[i] = inBytes.get() != 0;
        }
        length -= 256;
    } else if (data == null) {
        // Nothing to do.
    } else if (baseData != null) {
        data.compressibleBytes = baseData.compressibleBytes;
    } else {
        throw new ICUException("Missing data for compressible primary lead bytes");
    }
    ICUBinary.skipBytes(inBytes, length);
    // Reserved part: skipped without interpretation.
    index = IX_RESERVED18_OFFSET;
    offset = inIndexes[index];
    length = inIndexes[index + 1] - offset;
    ICUBinary.skipBytes(inBytes, length);
    // Compare the settings from the data with the tailoring's current settings
    // and return early, without a copy-on-write, when nothing would change.
    CollationSettings ts = tailoring.settings.readOnly();
    int options = inIndexes[IX_OPTIONS] & 0xffff;
    char[] fastLatinPrimaries = new char[CollationFastLatin.LATIN_LIMIT];
    int fastLatinOptions = CollationFastLatin.getOptions(tailoring.data, ts, fastLatinPrimaries);
    if (options == ts.options && ts.variableTop != 0 && Arrays.equals(reorderCodes, ts.reorderCodes) && fastLatinOptions == ts.fastLatinOptions && (fastLatinOptions < 0 || Arrays.equals(fastLatinPrimaries, ts.fastLatinPrimaries))) {
        return;
    }
    CollationSettings settings = tailoring.settings.copyOnWrite();
    settings.options = options;
    // Set variableTop from options and scripts data.
    settings.variableTop = tailoring.data.getLastPrimaryForGroup(Collator.ReorderCodes.FIRST + settings.getMaxVariable());
    if (settings.variableTop == 0) {
        throw new ICUException("The maxVariable could not be mapped to a variableTop");
    }
    if (reorderCodesLength != 0) {
        settings.aliasReordering(baseData, reorderCodes, reorderCodesLength, reorderTable);
    }
    // Recompute the fast Latin options against the updated settings.
    settings.fastLatinOptions = CollationFastLatin.getOptions(tailoring.data, settings, settings.fastLatinPrimaries);
}
Also used : ICUException(android.icu.util.ICUException) CharBuffer(java.nio.CharBuffer) UnicodeSet(android.icu.text.UnicodeSet) USerializedSet(android.icu.impl.USerializedSet)

Example 83 with UnicodeSet

use of android.icu.text.UnicodeSet in project j2objc by google.

the class Normalizer2Impl method addToStartSet.

/**
 * Records {@code origin} as a character whose decomposition starts with
 * {@code decompLead}.
 *
 * <p>The trie value for {@code decompLead} stores a single origin inline as
 * long as there is only one and it is non-zero. Otherwise the value carries
 * {@code CANON_HAS_SET} plus an index into {@code canonStartSets}, and the
 * origins are kept in that set.
 *
 * @param newData the trie being built; its value for {@code decompLead} is read and updated
 * @param origin the code point to record
 * @param decompLead the code point whose trie value is updated
 */
private void addToStartSet(Trie2Writable newData, int origin, int decompLead) {
    final int oldValue = newData.get(decompLead);

    // Simple case: nothing stored yet, and origin can be stored inline
    // (U+0000 cannot, since 0 means "no origin").
    final boolean nothingStored = (oldValue & (CANON_HAS_SET | CANON_VALUE_MASK)) == 0;
    if (nothingStored && origin != 0) {
        newData.set(decompLead, oldValue | origin);
        return;
    }

    // origin is not the first character, or it is U+0000: use a set.
    UnicodeSet origins;
    if ((oldValue & CANON_HAS_SET) != 0) {
        // A set already exists; the value bits are its index.
        origins = canonStartSets.get(oldValue & CANON_VALUE_MASK);
    } else {
        // Switch from an inline origin to a newly allocated set,
        // carrying over the previously stored origin if there was one.
        int inlineOrigin = oldValue & CANON_VALUE_MASK;
        int newSetIndex = canonStartSets.size();
        newData.set(decompLead, (oldValue & ~CANON_VALUE_MASK) | CANON_HAS_SET | newSetIndex);
        origins = new UnicodeSet();
        canonStartSets.add(origins);
        if (inlineOrigin != 0) {
            origins.add(inlineOrigin);
        }
    }
    origins.add(origin);
}
Also used : UnicodeSet(android.icu.text.UnicodeSet)

Example 84 with UnicodeSet

use of android.icu.text.UnicodeSet in project j2objc by google.

the class UnicodeRegex method transform.

/**
 * Adds full Unicode property support, with the latest version of Unicode,
 * to Java Regex, bringing it up to Level 1 (see
 * http://www.unicode.org/reports/tr18/). It does this by preprocessing the
 * regex pattern string and interpreting the character classes (\p{...},
 * \P{...}, [...]) according to their syntax and meaning in UnicodeSet. With
 * this utility, Java regex expressions can be updated to work with the
 * latest version of Unicode, and with all Unicode properties. Note that the
 * UnicodeSet syntax has not yet, however, been updated to be completely
 * consistent with Java regex, so be careful of the differences.
 * <p>Text quoted with {@code \Q...\E} and characters escaped with a single
 * backslash are copied to the output unchanged and are never interpreted
 * as UnicodeSet patterns.
 * <p>Not thread-safe; create a separate copy for different threads.
 * <p>In the future, we may extend this to support other regex packages.
 *
 * @param regex A modified Java regex pattern, as in the input to
 *        Pattern.compile(), except that all "character classes" are
 *        processed as if they were UnicodeSet patterns. Example:
 *        "abc[:bc=N:]". See UnicodeSet for the differences in syntax.
 * @return A processed Java regex pattern, suitable for input to
 *         Pattern.compile().
 */
@Override
public String transform(String regex) {
    StringBuilder result = new StringBuilder();
    UnicodeSet temp = new UnicodeSet();
    ParsePosition pos = new ParsePosition(0);
    // Scanner state:
    //   0 = ordinary pattern text
    //   1 = just after a \ in ordinary text
    //   2 = inside a \Q... quotation
    //   3 = inside a \Q... quotation, just after a \
    int state = 0;
    // Each character is copied unmodified to the output unless it starts a
    // UnicodeSet, in which case processSet() appends the converted set and
    // returns the index of the set's last character.
    for (int i = 0; i < regex.length(); ++i) {
        // look for UnicodeSets, allowing for quoting with \ and \Q
        char ch = regex.charAt(i);
        switch (state) {
            case 0:
                // we only care about \, and '['.
                if (ch == '\\') {
                    if (UnicodeSet.resemblesPattern(regex, i)) {
                        // should only happen with \p
                        i = processSet(regex, i, result, temp, pos);
                        continue;
                    }
                    state = 1;
                } else if (ch == '[') {
                    // if we have what looks like a UnicodeSet
                    if (UnicodeSet.resemblesPattern(regex, i)) {
                        i = processSet(regex, i, result, temp, pos);
                        continue;
                    }
                }
                break;
            case 1:
                // we are after a \ : \Q opens a quotation, anything else is
                // a single escaped character.
                if (ch == 'Q') {
                    state = 2;
                } else {
                    state = 0;
                }
                break;
            case 2:
                // we are in a \Q... : only a \ can lead to the end of the quotation.
                if (ch == '\\') {
                    state = 3;
                }
                break;
            case 3:
                // we are in a \Q...\ : \E closes the quotation. Another \ may
                // itself be the start of \E, so stay here; any other character
                // returns to plain quoted text.
                if (ch == 'E') {
                    state = 0;
                } else if (ch != '\\') {
                    state = 2;
                }
                break;
        }
        result.append(ch);
    }
    return result.toString();
}
Also used : UnicodeSet(android.icu.text.UnicodeSet) ParsePosition(java.text.ParsePosition)

Example 85 with UnicodeSet

use of android.icu.text.UnicodeSet in project j2objc by google.

the class UnicodeRegex method processSet.

// ===== PRIVATES =====
/**
 * Parses the UnicodeSet pattern that starts at index {@code i} of
 * {@code regex} and appends its {@code toPattern(false)} form to
 * {@code result}.
 *
 * @param regex the full pattern string being transformed
 * @param i index in {@code regex} at which the set pattern starts
 * @param result receives the converted set pattern
 * @param temp scratch set; it is cleared and overwritten with the parsed pattern
 * @param pos scratch parse position; it is set to {@code i} before parsing
 * @return the parse position after the set, minus one, so that the caller's
 *         loop increment lands on the first character after the set
 * @throws IllegalArgumentException if anything fails while parsing or
 *         converting the set; the original exception is attached as the cause
 */
private int processSet(String regex, int i, StringBuilder result, UnicodeSet temp, ParsePosition pos) {
    try {
        pos.setIndex(i);
        UnicodeSet parsedSet = temp.clear().applyPattern(regex, pos, symbolTable, 0);
        // hack to fix toPattern
        parsedSet.complement().complement();
        String convertedPattern = parsedSet.toPattern(false);
        result.append(convertedPattern);
        // allow for the loop increment
        return pos.getIndex() - 1;
    } catch (Exception e) {
        IllegalArgumentException wrapped = new IllegalArgumentException("Error in " + regex);
        wrapped.initCause(e);
        throw wrapped;
    }
}
Also used : UnicodeSet(android.icu.text.UnicodeSet) IOException(java.io.IOException) UnsupportedEncodingException(java.io.UnsupportedEncodingException)

Aggregations

UnicodeSet (android.icu.text.UnicodeSet)158 Test (org.junit.Test)112 UnicodeSetIterator (android.icu.text.UnicodeSetIterator)25 Transliterator (android.icu.text.Transliterator)19 ReplaceableString (android.icu.text.ReplaceableString)14 ULocale (android.icu.util.ULocale)13 CaseInsensitiveString (android.icu.util.CaseInsensitiveString)9 Normalizer2 (android.icu.text.Normalizer2)7 RuleBasedCollator (android.icu.text.RuleBasedCollator)7 ArrayList (java.util.ArrayList)5 HashSet (java.util.HashSet)5 FilteredNormalizer2 (android.icu.text.FilteredNormalizer2)4 SpoofChecker (android.icu.text.SpoofChecker)4 TreeSet (java.util.TreeSet)4 UnicodeMap (android.icu.dev.util.UnicodeMap)3 AlphabeticIndex (android.icu.text.AlphabeticIndex)3 CollationKey (android.icu.text.CollationKey)3 RawCollationKey (android.icu.text.RawCollationKey)3 CheckResult (android.icu.text.SpoofChecker.CheckResult)3 SpanCondition (android.icu.text.UnicodeSet.SpanCondition)3