Examples with USerializedSet - android.icu.impl.USerializedSet

Example 1 with USerializedSet

use of android.icu.impl.USerializedSet in project j2objc by google.

The method read of the class CollationDataReader.

/**
 * Reads serialized collation data from {@code inBytes} and stores the result in {@code tailoring}.
 *
 * <p>Layout as consumed by this method: an ICU binary header, then an array of int indexes,
 * then a sequence of data parts. For each part, two consecutive index slots give its start and
 * limit byte offsets, so {@code inIndexes[index + 1] - inIndexes[index]} is the part's byte
 * length. The parts are consumed strictly in file order; after each part the bytes that were not
 * used are skipped, so the buffer position always lines up with the start of the next part.
 *
 * <p>When {@code base} is null the data is treated as data without a base (the unsafe-backward-set
 * section below refers to this case as the root collator); several parts are then mandatory.
 * When {@code base} is non-null, absent parts are aliased from {@code base.data}.
 *
 * @param base the base tailoring whose data is inherited, or null
 * @param inBytes the serialized data; the first thing read from it is the binary header
 * @param tailoring receives the version, the data parts, the unsafe-backward set and the settings
 * @throws IOException declared by this method; presumably propagated from the ICUBinary helpers
 *     (not visible here)
 * @throws ICUException if the data is too short, internally inconsistent, or inconsistent with
 *     {@code base}
 */
static void read(CollationTailoring base, ByteBuffer inBytes, CollationTailoring tailoring) throws IOException {
    // Consume the binary header; whatever readHeader() returns is recorded as the tailoring version.
    tailoring.version = ICUBinary.readHeader(inBytes, DATA_FORMAT, IS_ACCEPTABLE);
    if (base != null && base.getUCAVersion() != tailoring.getUCAVersion()) {
        throw new ICUException("Tailoring UCA version differs from base data UCA version");
    }
    // At least two int indexes (8 bytes) must follow the header; see the indexesLength check below.
    int inLength = inBytes.remaining();
    if (inLength < 8) {
        throw new ICUException("not enough bytes");
    }
    // inIndexes[IX_INDEXES_LENGTH]
    int indexesLength = inBytes.getInt();
    if (indexesLength < 2 || inLength < indexesLength * 4) {
        throw new ICUException("not enough indexes");
    }
    // Local copy of the indexes with a fixed number of slots (IX_TOTAL_SIZE + 1):
    // slot 0 is the count just read, slots present in the file are copied,
    // slots missing from the file are set to -1,
    // and any extra indexes in the file beyond the last slot are skipped.
    int[] inIndexes = new int[IX_TOTAL_SIZE + 1];
    inIndexes[0] = indexesLength;
    for (int i = 1; i < indexesLength && i < inIndexes.length; ++i) {
        inIndexes[i] = inBytes.getInt();
    }
    for (int i = indexesLength; i < inIndexes.length; ++i) {
        inIndexes[i] = -1;
    }
    if (indexesLength > inIndexes.length) {
        ICUBinary.skipBytes(inBytes, (indexesLength - inIndexes.length) * 4);
    }
    // Assume that the tailoring data is in initial state,
    // with null pointers and 0 lengths.
    // Set pointers to non-empty data parts.
    // Do this in order of their byte offsets. (Should help porting to Java.)
    // one of the indexes[] slots
    int index;
    // byte offset for the index part
    int offset;
    // number of bytes in the index part
    int length;
    // Determine the total data length that the indexes claim:
    // the IX_TOTAL_SIZE slot if the file has it, otherwise the last index present,
    // otherwise 0 when the file contains nothing but the leading indexes.
    if (indexesLength > IX_TOTAL_SIZE) {
        length = inIndexes[IX_TOTAL_SIZE];
    } else if (indexesLength > IX_REORDER_CODES_OFFSET) {
        length = inIndexes[indexesLength - 1];
    } else {
        // only indexes, and inLength was already checked for them
        length = 0;
    }
    if (inLength < length) {
        throw new ICUException("not enough bytes");
    }
    CollationData baseData = base == null ? null : base.data;
    // ---- Part: reorder codes (ints), optionally followed by reorder ranges ----
    int[] reorderCodes;
    int reorderCodesLength;
    index = IX_REORDER_CODES_OFFSET;
    offset = inIndexes[index];
    length = inIndexes[index + 1] - offset;
    if (length >= 4) {
        if (baseData == null) {
            // Data without a base is rejected if it carries reorder codes:
            // the base data does not have a reordering.
            throw new ICUException("Collation base data must not reorder scripts");
        }
        reorderCodesLength = length / 4;
        // The last argument is the number of leftover bytes (length mod 4);
        // presumably getInts() skips them after reading - confirm against ICUBinary.
        reorderCodes = ICUBinary.getInts(inBytes, reorderCodesLength, length & 3);
        // The reorderRanges (if any) are the trailing reorderCodes entries.
        // Split the array at the boundary.
        // Script or reorder codes do not exceed 16-bit values.
        // Range limits are stored in the upper 16 bits, and are never 0.
        int reorderRangesLength = 0;
        while (reorderRangesLength < reorderCodesLength && (reorderCodes[reorderCodesLength - reorderRangesLength - 1] & 0xffff0000) != 0) {
            ++reorderRangesLength;
        }
        // Only checked when assertions are enabled: at least one entry must be a plain code.
        assert (reorderRangesLength < reorderCodesLength);
        // The array itself keeps the range entries; only the logical code count shrinks.
        reorderCodesLength -= reorderRangesLength;
    } else {
        reorderCodes = new int[0];
        reorderCodesLength = 0;
        ICUBinary.skipBytes(inBytes, length);
    }
    // ---- Part: reorder table (256 bytes) ----
    // There should be a reorder table only if there are reorder codes.
    // However, when there are reorder codes the reorder table may be omitted to reduce
    // the data size.
    byte[] reorderTable = null;
    index = IX_REORDER_TABLE_OFFSET;
    offset = inIndexes[index];
    length = inIndexes[index + 1] - offset;
    if (length >= 256) {
        if (reorderCodesLength == 0) {
            throw new ICUException("Reordering table without reordering codes");
        }
        reorderTable = new byte[256];
        inBytes.get(reorderTable);
        // Whatever remains of this part is skipped below.
        length -= 256;
    } else {
    // If we have reorder codes, then build the reorderTable at the end,
    // when the CollationData is otherwise complete.
    }
    ICUBinary.skipBytes(inBytes, length);
    // The numeric primary is the top byte of the options index. The long mask keeps bits 24..31
    // as a non-negative long value even though the int is sign-extended first.
    if (baseData != null && baseData.numericPrimary != (inIndexes[IX_OPTIONS] & 0xff000000L)) {
        throw new ICUException("Tailoring numeric primary weight differs from base data");
    }
    // ---- Part: mappings trie ----
    // Remains null if there are no mappings.
    CollationData data = null;
    index = IX_TRIE_OFFSET;
    offset = inIndexes[index];
    length = inIndexes[index + 1] - offset;
    if (length >= 8) {
        // This tailoring has its own mappings, stored in tailoring.ownedData.
        tailoring.ensureOwnedData();
        data = tailoring.ownedData;
        data.base = baseData;
        data.numericPrimary = inIndexes[IX_OPTIONS] & 0xff000000L;
        data.trie = tailoring.trie = Trie2_32.createFromSerialized(inBytes);
        int trieLength = data.trie.getSerializedLength();
        if (trieLength > length) {
            // The trie reports more serialized bytes than this part contains.
            throw new ICUException("Not enough bytes for the mappings trie");
        }
        length -= trieLength;
    } else if (baseData != null) {
        // Use the base data. Only the settings are tailored.
        tailoring.data = baseData;
    } else {
        // No mappings.
        throw new ICUException("Missing collation data mappings");
    }
    ICUBinary.skipBytes(inBytes, length);
    // ---- Part: reserved (index 8), skipped entirely ----
    index = IX_RESERVED8_OFFSET;
    offset = inIndexes[index];
    length = inIndexes[index + 1] - offset;
    ICUBinary.skipBytes(inBytes, length);
    // ---- Part: 64-bit CEs ----
    index = IX_CES_OFFSET;
    offset = inIndexes[index];
    length = inIndexes[index + 1] - offset;
    if (length >= 8) {
        if (data == null) {
            throw new ICUException("Tailored ces without tailored trie");
        }
        // length / 8 longs; the last argument is the leftover byte count (length mod 8).
        data.ces = ICUBinary.getLongs(inBytes, length / 8, length & 7);
    } else {
        ICUBinary.skipBytes(inBytes, length);
    }
    // ---- Part: reserved (index 10), skipped entirely ----
    index = IX_RESERVED10_OFFSET;
    offset = inIndexes[index];
    length = inIndexes[index + 1] - offset;
    ICUBinary.skipBytes(inBytes, length);
    // ---- Part: 32-bit CE32s ----
    index = IX_CE32S_OFFSET;
    offset = inIndexes[index];
    length = inIndexes[index + 1] - offset;
    if (length >= 4) {
        if (data == null) {
            throw new ICUException("Tailored ce32s without tailored trie");
        }
        // length / 4 ints; the last argument is the leftover byte count (length mod 4).
        data.ce32s = ICUBinary.getInts(inBytes, length / 4, length & 3);
    } else {
        ICUBinary.skipBytes(inBytes, length);
    }
    // Jamo CE32s are not a separate part: the index is a start position inside ce32s[].
    // A negative value (including the -1 fill for an index missing from the file) means "absent".
    int jamoCE32sStart = inIndexes[IX_JAMO_CE32S_START];
    if (jamoCE32sStart >= 0) {
        if (data == null || data.ce32s == null) {
            throw new ICUException("JamoCE32sStart index into non-existent ce32s[]");
        }
        data.jamoCE32s = new int[CollationData.JAMO_CE32S_LENGTH];
        System.arraycopy(data.ce32s, jamoCE32sStart, data.jamoCE32s, 0, CollationData.JAMO_CE32S_LENGTH);
    } else if (data == null) {
    // Nothing to do.
    } else if (baseData != null) {
        // Share the base data's array.
        data.jamoCE32s = baseData.jamoCE32s;
    } else {
        throw new ICUException("Missing Jamo CE32s for Hangul processing");
    }
    // ---- Part: root elements (unsigned 32-bit values widened to long) ----
    index = IX_ROOT_ELEMENTS_OFFSET;
    offset = inIndexes[index];
    length = inIndexes[index + 1] - offset;
    if (length >= 4) {
        int rootElementsLength = length / 4;
        if (data == null) {
            throw new ICUException("Root elements but no mappings");
        }
        // The array must be long enough to contain the IX_SEC_TER_BOUNDARIES slot read below.
        if (rootElementsLength <= CollationRootElements.IX_SEC_TER_BOUNDARIES) {
            throw new ICUException("Root elements array too short");
        }
        data.rootElements = new long[rootElementsLength];
        for (int i = 0; i < rootElementsLength; ++i) {
            // unsigned int -> long
            data.rootElements[i] = inBytes.getInt() & 0xffffffffL;
        }
        long commonSecTer = data.rootElements[CollationRootElements.IX_COMMON_SEC_AND_TER_CE];
        if (commonSecTer != Collation.COMMON_SEC_AND_TER_CE) {
            throw new ICUException("Common sec/ter weights in base data differ from the hardcoded value");
        }
        long secTerBoundaries = data.rootElements[CollationRootElements.IX_SEC_TER_BOUNDARIES];
        if ((secTerBoundaries >>> 24) < CollationKeys.SEC_COMMON_HIGH) {
            // [fixed last secondary common byte] is too low,
            // and secondary weights would collide with compressed common secondaries.
            throw new ICUException("[fixed last secondary common byte] is too low");
        }
        // Only the leftover bytes (length mod 4) remain to be skipped.
        length &= 3;
    }
    ICUBinary.skipBytes(inBytes, length);
    // ---- Part: contexts (16-bit units, read as a string) ----
    index = IX_CONTEXTS_OFFSET;
    offset = inIndexes[index];
    length = inIndexes[index + 1] - offset;
    if (length >= 2) {
        if (data == null) {
            throw new ICUException("Tailored contexts without tailored trie");
        }
        // length / 2 chars; the last argument is the leftover byte count (length mod 2).
        data.contexts = ICUBinary.getString(inBytes, length / 2, length & 1);
    } else {
        ICUBinary.skipBytes(inBytes, length);
    }
    // ---- Part: unsafe-backward set (serialized set of 16-bit units) ----
    index = IX_UNSAFE_BWD_OFFSET;
    offset = inIndexes[index];
    length = inIndexes[index + 1] - offset;
    if (length >= 2) {
        if (data == null) {
            throw new ICUException("Unsafe-backward-set but no mappings");
        }
        if (baseData == null) {
            // Create the unsafe-backward set for the root collator.
            // Include all non-zero combining marks and trail surrogates.
            // We do this at load time, rather than at build time,
            // to simplify Unicode version bootstrapping:
            // The root data builder only needs the new FractionalUCA.txt data,
            // but it need not be built with a version of ICU already updated to
            // the corresponding new Unicode Character Database.
            // 
            // The following is an optimized version of
            // new UnicodeSet("[[:^lccc=0:][\\udc00-\\udfff]]").
            // It is faster and requires fewer code dependencies.
            // trail surrogates
            tailoring.unsafeBackwardSet = new UnicodeSet(0xdc00, 0xdfff);
            data.nfcImpl.addLcccChars(tailoring.unsafeBackwardSet);
        } else {
            // Clone the root collator's set contents.
            tailoring.unsafeBackwardSet = baseData.unsafeBackwardSet.cloneAsThawed();
        }
        // Add the ranges from the data file to the unsafe-backward set.
        USerializedSet sset = new USerializedSet();
        char[] unsafeData = ICUBinary.getChars(inBytes, length / 2, length & 1);
        // The whole part has been consumed; nothing is left for the skipBytes() after this section.
        length = 0;
        sset.getSet(unsafeData, 0);
        int count = sset.countRanges();
        // Reused holder: getRange() fills in [0] = range start, [1] = range end.
        int[] range = new int[2];
        for (int i = 0; i < count; ++i) {
            sset.getRange(i, range);
            tailoring.unsafeBackwardSet.add(range[0], range[1]);
        }
        // Mark each lead surrogate as "unsafe"
        // if any of its 1024 associated supplementary code points is "unsafe".
        int c = 0x10000;
        for (int lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) {
            if (!tailoring.unsafeBackwardSet.containsNone(c, c + 0x3ff)) {
                tailoring.unsafeBackwardSet.add(lead);
            }
        }
        tailoring.unsafeBackwardSet.freeze();
        data.unsafeBackwardSet = tailoring.unsafeBackwardSet;
    } else if (data == null) {
    // Nothing to do.
    } else if (baseData != null) {
        // No tailoring-specific data: Alias the root collator's set.
        data.unsafeBackwardSet = baseData.unsafeBackwardSet;
    } else {
        throw new ICUException("Missing unsafe-backward-set");
    }
    ICUBinary.skipBytes(inBytes, length);
    // ---- Part: fast Latin table ----
    // If the fast Latin format version is different,
    // or the version is set to 0 for "no fast Latin table",
    // then just always use the normal string comparison path.
    index = IX_FAST_LATIN_TABLE_OFFSET;
    offset = inIndexes[index];
    length = inIndexes[index + 1] - offset;
    if (data != null) {
        data.fastLatinTable = null;
        data.fastLatinTableHeader = null;
        // Bits 16..23 of the options index carry the fast Latin format version of this file.
        if (((inIndexes[IX_OPTIONS] >> 16) & 0xff) == CollationFastLatin.VERSION) {
            if (length >= 2) {
                // First 16-bit unit: low byte = header length in units, high byte = table version.
                char header0 = inBytes.getChar();
                int headerLength = header0 & 0xff;
                // NOTE(review): a headerLength of 0 would make the [0] store below throw
                // ArrayIndexOutOfBoundsException - confirm that the format guarantees >= 1.
                data.fastLatinTableHeader = new char[headerLength];
                data.fastLatinTableHeader[0] = header0;
                for (int i = 1; i < headerLength; ++i) {
                    data.fastLatinTableHeader[i] = inBytes.getChar();
                }
                // The rest of the part, after the header, is the table proper.
                int tableLength = length / 2 - headerLength;
                data.fastLatinTable = ICUBinary.getChars(inBytes, tableLength, length & 1);
                length = 0;
                // The version inside the table header must match the expected version as well.
                if ((header0 >> 8) != CollationFastLatin.VERSION) {
                    throw new ICUException("Fast-Latin table version differs from version in data header");
                }
            } else if (baseData != null) {
                // No table in this file: share the base data's table and header.
                data.fastLatinTable = baseData.fastLatinTable;
                data.fastLatinTableHeader = baseData.fastLatinTableHeader;
            }
        }
    }
    // When the table was not read (version mismatch or no data), this skips the whole part.
    ICUBinary.skipBytes(inBytes, length);
    // ---- Part: script order data (16-bit units) ----
    index = IX_SCRIPTS_OFFSET;
    offset = inIndexes[index];
    length = inIndexes[index + 1] - offset;
    if (length >= 2) {
        if (data == null) {
            throw new ICUException("Script order data but no mappings");
        }
        int scriptsLength = length / 2;
        // Read through a char view of the buffer. The view has its own position, so inBytes
        // does not advance here, and the skipBytes() after this section skips the whole part.
        CharBuffer inChars = inBytes.asCharBuffer();
        data.numScripts = inChars.get();
        // There must be enough entries for both arrays, including more than two range starts.
        // Unit budget: 1 (numScripts) + (numScripts + 16) index entries + the range starts.
        int scriptStartsLength = scriptsLength - (1 + data.numScripts + 16);
        if (scriptStartsLength <= 2) {
            throw new ICUException("Script order data too short");
        }
        inChars.get(data.scriptsIndex = new char[data.numScripts + 16]);
        inChars.get(data.scriptStarts = new char[scriptStartsLength]);
        // Sanity check on the first, second and last range starts.
        if (!(data.scriptStarts[0] == 0 && data.scriptStarts[1] == ((Collation.MERGE_SEPARATOR_BYTE + 1) << 8) && data.scriptStarts[scriptStartsLength - 1] == (Collation.TRAIL_WEIGHT_BYTE << 8))) {
            throw new ICUException("Script order data not valid");
        }
    } else if (data == null) {
    // Nothing to do.
    } else if (baseData != null) {
        // Share the base data's script order data.
        data.numScripts = baseData.numScripts;
        data.scriptsIndex = baseData.scriptsIndex;
        data.scriptStarts = baseData.scriptStarts;
    }
    ICUBinary.skipBytes(inBytes, length);
    // ---- Part: compressible primary lead bytes (256 flags, one byte each) ----
    index = IX_COMPRESSIBLE_BYTES_OFFSET;
    offset = inIndexes[index];
    length = inIndexes[index + 1] - offset;
    if (length >= 256) {
        if (data == null) {
            throw new ICUException("Data for compressible primary lead bytes but no mappings");
        }
        data.compressibleBytes = new boolean[256];
        for (int i = 0; i < 256; ++i) {
            // Any non-zero byte counts as true.
            data.compressibleBytes[i] = inBytes.get() != 0;
        }
        length -= 256;
    } else if (data == null) {
    // Nothing to do.
    } else if (baseData != null) {
        data.compressibleBytes = baseData.compressibleBytes;
    } else {
        throw new ICUException("Missing data for compressible primary lead bytes");
    }
    ICUBinary.skipBytes(inBytes, length);
    // ---- Part: reserved (index 18), skipped entirely ----
    index = IX_RESERVED18_OFFSET;
    offset = inIndexes[index];
    length = inIndexes[index + 1] - offset;
    ICUBinary.skipBytes(inBytes, length);
    // ---- Settings ----
    // Compare what the file specifies with the current settings and return early
    // without modifying them when everything already matches.
    CollationSettings ts = tailoring.settings.readOnly();
    // The low 16 bits of the options index are the collation options.
    int options = inIndexes[IX_OPTIONS] & 0xffff;
    char[] fastLatinPrimaries = new char[CollationFastLatin.LATIN_LIMIT];
    int fastLatinOptions = CollationFastLatin.getOptions(tailoring.data, ts, fastLatinPrimaries);
    // NOTE(review): reorderCodes is compared as a whole array here, and it still contains any
    // trailing reorder-range entries that were only excluded from reorderCodesLength above -
    // confirm that this is the intended comparison.
    // The primaries are compared only when fastLatinOptions is non-negative
    // (presumably a negative value means fast Latin is not used - confirm in CollationFastLatin).
    if (options == ts.options && ts.variableTop != 0 && Arrays.equals(reorderCodes, ts.reorderCodes) && fastLatinOptions == ts.fastLatinOptions && (fastLatinOptions < 0 || Arrays.equals(fastLatinPrimaries, ts.fastLatinPrimaries))) {
        return;
    }
    // Something differs: obtain a writable settings object and update it.
    CollationSettings settings = tailoring.settings.copyOnWrite();
    settings.options = options;
    // Set variableTop from options and scripts data.
    settings.variableTop = tailoring.data.getLastPrimaryForGroup(Collator.ReorderCodes.FIRST + settings.getMaxVariable());
    if (settings.variableTop == 0) {
        throw new ICUException("The maxVariable could not be mapped to a variableTop");
    }
    if (reorderCodesLength != 0) {
        // reorderTable may be null here if the file omitted it (see the reorder table part above).
        settings.aliasReordering(baseData, reorderCodes, reorderCodesLength, reorderTable);
    }
    // Recompute the fast Latin options against the updated settings; this call is also handed
    // settings.fastLatinPrimaries as its output array.
    settings.fastLatinOptions = CollationFastLatin.getOptions(tailoring.data, settings, settings.fastLatinPrimaries);
}

Also used : ICUException(android.icu.util.ICUException) CharBuffer(java.nio.CharBuffer) UnicodeSet(android.icu.text.UnicodeSet) USerializedSet(android.icu.impl.USerializedSet)

Example 2 with USerializedSet

use of android.icu.impl.USerializedSet in project j2objc by google.

The method TestSerializedSet of the class BasicTest.

/**
 * Loads a hand-written serialized set into a {@code USerializedSet}, copies all of its ranges
 * into a {@code UnicodeSet}, and then verifies that {@code USerializedSet.contains()} reports
 * true for every code point of every copied range.
 */
@Test
public void TestSerializedSet() {
    // Hand-written serialized form; the first two units are labelled as in the original data.
    char[] serialized = {
        0x8007, // length
        3, // bmpLength
        0xc0, 0xfe, 0xfffc,
        1, 9, 0x10, 0xfffc
    };
    USerializedSet sset = new USerializedSet();
    sset.getSet(serialized, 0);

    // Gather every range into a single UnicodeSet so that the ranges can be walked contiguously.
    UnicodeSet collected = new UnicodeSet();
    int[] bounds = new int[2];
    int rangeCount = sset.countRanges();
    for (int r = 0; r < rangeCount; ++r) {
        sset.getRange(r, bounds);
        collected.add(bounds[0], bounds[1]);
    }

    // Every code point of every collected range must be contained in the serialized set.
    // Iteration stops at the end of the ranges or at the first string element.
    for (UnicodeSetIterator it = new UnicodeSetIterator(collected);
            it.nextRange() && it.codepoint != UnicodeSetIterator.IS_STRING; ) {
        int last = it.codepointEnd;
        for (int cp = it.codepoint; cp <= last; ++cp) {
            if (!sset.contains(cp)) {
                errln("USerializedSet.contains failed for " + Utility.hex(cp, 8));
            }
        }
    }
}

Also used : UnicodeSetIterator(android.icu.text.UnicodeSetIterator) USerializedSet(android.icu.impl.USerializedSet) UnicodeSet(android.icu.text.UnicodeSet) Test(org.junit.Test)

Aggregations

USerializedSet (android.icu.impl.USerializedSet): 2, UnicodeSet (android.icu.text.UnicodeSet): 2, UnicodeSetIterator (android.icu.text.UnicodeSetIterator): 1, ICUException (android.icu.util.ICUException): 1, CharBuffer (java.nio.CharBuffer): 1, Test (org.junit.Test): 1