Search in sources :

Example 1 with IntsRefFSTEnum

Use of org.apache.lucene.util.fst.IntsRefFSTEnum in the Apache lucene-solr project.

The example is the method testEnumerateAll of the class TestTokenInfoDictionary.

/**
 * Walks every entry of the token-info dictionary FST and performs basic sanity checks on the
 * data reachable from it.
 *
 * <p>For each FST entry the input is converted to a string and checked to be valid UTF-16, and
 * the output (used as the source id) must be strictly greater than the previous one. For each
 * word id returned for that source id, the word id must be strictly greater than the previous
 * one, every string attribute must be valid UTF-16, and the inflection form, inflection type and
 * part-of-speech tag must each have a translation in {@code ToStringUtil}. The connection cost
 * and word cost are fetched only to make sure the lookups complete without throwing.
 */
public void testEnumerateAll() throws Exception {
    // Counters are only reported in VERBOSE mode; they are not asserted on.
    int termCount = 0;
    int wordCount = 0;
    // Previous ids seen so far, used to verify strictly increasing order.
    int prevWordId = -1;
    int prevSourceId = -1;

    TokenInfoDictionary dictionary = TokenInfoDictionary.getInstance();
    ConnectionCosts costs = ConnectionCosts.getInstance();
    FST<Long> fst = dictionary.getFST().getInternalFST();
    IntsRefFSTEnum<Long> entries = new IntsRefFSTEnum<>(fst);
    IntsRef wordIds = new IntsRef();

    for (InputOutput<Long> entry = entries.next(); entry != null; entry = entries.next()) {
        termCount++;

        // Narrow each int of the FST input to a char to rebuild the surface form.
        IntsRef key = entry.input;
        char[] surface = new char[key.length];
        for (int pos = 0; pos < surface.length; pos++) {
            surface[pos] = (char) key.ints[key.offset + pos];
        }
        String term = new String(surface);
        assertTrue(UnicodeUtil.validUTF16String(term));

        // The FST is walked in order, so source ids must keep growing.
        int sourceId = entry.output.intValue();
        assertTrue(sourceId > prevSourceId);
        prevSourceId = sourceId;

        dictionary.lookupWordIds(sourceId, wordIds);
        for (int k = 0; k < wordIds.length; k++) {
            wordCount++;

            // Word ids must keep growing as well, across all terms.
            int wordId = wordIds.ints[wordIds.offset + k];
            assertTrue(wordId > prevWordId);
            prevWordId = wordId;

            String baseForm = dictionary.getBaseForm(wordId, surface, 0, surface.length);
            if (baseForm != null) {
                assertTrue(UnicodeUtil.validUTF16String(baseForm));
            }

            String inflectionForm = dictionary.getInflectionForm(wordId);
            if (inflectionForm != null) {
                assertTrue(UnicodeUtil.validUTF16String(inflectionForm));
                // must be a known ipadic inflection form
                assertNotNull(ToStringUtil.getInflectedFormTranslation(inflectionForm));
            }

            String inflectionType = dictionary.getInflectionType(wordId);
            if (inflectionType != null) {
                assertTrue(UnicodeUtil.validUTF16String(inflectionType));
                // must be a known ipadic inflection type
                assertNotNull(ToStringUtil.getInflectionTypeTranslation(inflectionType));
            }

            // Results are discarded; these calls only need to complete without throwing.
            int leftId = dictionary.getLeftId(wordId);
            int rightId = dictionary.getRightId(wordId);
            costs.get(rightId, leftId);
            dictionary.getWordCost(wordId);

            String posTag = dictionary.getPartOfSpeech(wordId);
            assertNotNull(posTag);
            assertTrue(UnicodeUtil.validUTF16String(posTag));
            // must be a known ipadic pos tag
            assertNotNull(ToStringUtil.getPOSTranslation(posTag));

            String pronunciation = dictionary.getPronunciation(wordId, surface, 0, surface.length);
            assertNotNull(pronunciation);
            assertTrue(UnicodeUtil.validUTF16String(pronunciation));

            String reading = dictionary.getReading(wordId, surface, 0, surface.length);
            assertNotNull(reading);
            assertTrue(UnicodeUtil.validUTF16String(reading));
        }
    }

    if (VERBOSE) {
        System.out.println("checked " + termCount + " terms, " + wordCount + " words.");
    }
}
Also used: IntsRefFSTEnum (org.apache.lucene.util.fst.IntsRefFSTEnum), IntsRef (org.apache.lucene.util.IntsRef)

Aggregations

IntsRef (org.apache.lucene.util.IntsRef): 1, IntsRefFSTEnum (org.apache.lucene.util.fst.IntsRefFSTEnum): 1