Search in sources :

Example 46 with IntsRef

use of org.apache.lucene.util.IntsRef in project lucene-solr by apache.

the class TestTokenInfoDictionary method testEnumerateAll.

/**
 * Walks every entry of the dictionary FST together with its lookup data and runs basic sanity
 * checks on each term and on each word id attached to that term.
 */
public void testEnumerateAll() throws Exception {
    TokenInfoDictionary dictionary = TokenInfoDictionary.getInstance();
    ConnectionCosts costs = ConnectionCosts.getInstance();
    FST<Long> fst = dictionary.getFST().getInternalFST();
    IntsRefFSTEnum<Long> entries = new IntsRefFSTEnum<>(fst);
    IntsRef wordIds = new IntsRef();
    // the two counters are only reported in the VERBOSE message at the end
    int termCount = 0;
    int wordCount = 0;
    int previousWordId = -1;
    int previousSourceId = -1;
    for (InputOutput<Long> entry = entries.next(); entry != null; entry = entries.next()) {
        termCount++;
        // rebuild the term's surface form from the int slice [offset, offset + length)
        IntsRef term = entry.input;
        char[] surface = new char[term.length];
        int src = term.offset;
        for (int dst = 0; dst < surface.length; dst++, src++) {
            surface[dst] = (char) term.ints[src];
        }
        assertTrue(UnicodeUtil.validUTF16String(new String(surface)));
        int sourceId = entry.output.intValue();
        // the FST is walked in order, so source ids (and word ids below) must strictly increase
        assertTrue(sourceId > previousSourceId);
        previousSourceId = sourceId;
        dictionary.lookupWordIds(sourceId, wordIds);
        for (int k = 0; k < wordIds.length; k++) {
            wordCount++;
            int wordId = wordIds.ints[wordIds.offset + k];
            assertTrue(wordId > previousWordId);
            previousWordId = wordId;

            String baseForm = dictionary.getBaseForm(wordId, surface, 0, surface.length);
            if (baseForm != null) {
                assertTrue(UnicodeUtil.validUTF16String(baseForm));
            }

            String inflectionForm = dictionary.getInflectionForm(wordId);
            if (inflectionForm != null) {
                assertTrue(UnicodeUtil.validUTF16String(inflectionForm));
                // must be a known ipadic inflection form
                assertNotNull(ToStringUtil.getInflectedFormTranslation(inflectionForm));
            }

            String inflectionType = dictionary.getInflectionType(wordId);
            if (inflectionType != null) {
                assertTrue(UnicodeUtil.validUTF16String(inflectionType));
                // must be a known ipadic inflection type
                assertNotNull(ToStringUtil.getInflectionTypeTranslation(inflectionType));
            }

            // the return values of the cost lookups are discarded; the calls are made but not checked
            int leftId = dictionary.getLeftId(wordId);
            int rightId = dictionary.getRightId(wordId);
            costs.get(rightId, leftId);
            dictionary.getWordCost(wordId);

            String partOfSpeech = dictionary.getPartOfSpeech(wordId);
            assertNotNull(partOfSpeech);
            assertTrue(UnicodeUtil.validUTF16String(partOfSpeech));
            // must be a known ipadic pos tag
            assertNotNull(ToStringUtil.getPOSTranslation(partOfSpeech));

            String pronunciation = dictionary.getPronunciation(wordId, surface, 0, surface.length);
            assertNotNull(pronunciation);
            assertTrue(UnicodeUtil.validUTF16String(pronunciation));

            String reading = dictionary.getReading(wordId, surface, 0, surface.length);
            assertNotNull(reading);
            assertTrue(UnicodeUtil.validUTF16String(reading));
        }
    }
    if (VERBOSE) {
        System.out.println("checked " + termCount + " terms, " + wordCount + " words.");
    }
}
Also used : IntsRefFSTEnum(org.apache.lucene.util.fst.IntsRefFSTEnum) IntsRef(org.apache.lucene.util.IntsRef)

Example 47 with IntsRef

use of org.apache.lucene.util.IntsRef in project lucene-solr by apache.

the class TestGraphTokenizers method toPathStrings.

/**
 * Returns all paths of the automaton as strings, with every
 * {@link TokenStreamToAutomaton#POS_SEP} character replaced by a space.
 */
private Set<String> toPathStrings(Automaton a) {
    Set<String> result = new HashSet<>();
    // one builder is reused as scratch space for every conversion
    BytesRefBuilder utf8Scratch = new BytesRefBuilder();
    char separator = (char) TokenStreamToAutomaton.POS_SEP;
    for (IntsRef path : AutomatonTestUtil.getFiniteStringsRecursive(a, -1)) {
        String decoded = Util.toBytesRef(path, utf8Scratch).utf8ToString();
        result.add(decoded.replace(separator, ' '));
    }
    return result;
}
Also used : BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) IntsRef(org.apache.lucene.util.IntsRef) HashSet(java.util.HashSet)

Example 48 with IntsRef

use of org.apache.lucene.util.IntsRef in project lucene-solr by apache.

the class DocValuesOrdinalsReader method getReader.

/**
 * Creates a per-segment reader that decodes the ordinals stored in this reader's binary doc
 * values field. When the segment has no values for the field, an empty instance is used instead.
 */
@Override
public OrdinalsSegmentReader getReader(LeafReaderContext context) throws IOException {
    BinaryDocValues segmentValues = context.reader().getBinaryDocValues(field);
    final BinaryDocValues docValues = (segmentValues != null) ? segmentValues : DocValues.emptyBinary();
    return new OrdinalsSegmentReader() {

        // most recent doc id passed to get(); used to reject backwards requests
        private int lastDocID;

        /** Decodes the ordinals of {@code docID} into {@code ordinals}; doc ids must not decrease between calls. */
        @Override
        public void get(int docID, IntsRef ordinals) throws IOException {
            if (docID < lastDocID) {
                throw new AssertionError("docs out of order: lastDocID=" + lastDocID + " vs docID=" + docID);
            }
            lastDocID = docID;
            // only move the iterator forward when it is still behind the requested document
            if (docValues.docID() < docID) {
                docValues.advance(docID);
            }
            // a document the iterator is not positioned on is decoded from an empty byte sequence
            final BytesRef bytes = (docValues.docID() == docID)
                    ? docValues.binaryValue()
                    : new BytesRef(BytesRef.EMPTY_BYTES);
            decode(bytes, ordinals);
        }
    };
}
Also used : IntsRef(org.apache.lucene.util.IntsRef) BinaryDocValues(org.apache.lucene.index.BinaryDocValues) BytesRef(org.apache.lucene.util.BytesRef)

Example 49 with IntsRef

use of org.apache.lucene.util.IntsRef in project lucene-solr by apache.

the class TaxonomyFacetSumValueSource method sumValues.

/**
 * For every matching document that has a value from {@code valueSource}, adds that value
 * (narrowed to {@code float}) to the {@code values} slot of each ordinal of the document,
 * then calls {@code rollup()}.
 *
 * @param matchingDocs per-segment hits to aggregate over
 * @param keepScores whether scores are obtained via {@code scores(hits)} and passed to the
 *     value source; when {@code false}, {@code null} is passed instead
 * @param valueSource source of the per-document value to sum
 * @throws IOException if reading ordinals or values fails
 */
private void sumValues(List<MatchingDocs> matchingDocs, boolean keepScores, DoubleValuesSource valueSource) throws IOException {
    // reused across all documents; each ords.get() call refills it
    IntsRef scratch = new IntsRef();
    for (MatchingDocs hits : matchingDocs) {
        OrdinalsReader.OrdinalsSegmentReader ords = ordinalsReader.getReader(hits.context);
        DoubleValues scores = keepScores ? scores(hits) : null;
        DoubleValues functionValues = valueSource.getValues(hits.context, scores);
        DocIdSetIterator docs = hits.bits.iterator();
        int doc;
        while ((doc = docs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
            ords.get(doc, scratch);
            if (functionValues.advanceExact(doc)) {
                float value = (float) functionValues.doubleValue();
                // An IntsRef is the slice ints[offset, offset + length); indexing from 0 would
                // read the wrong ordinals whenever the reader returns a non-zero offset.
                for (int i = 0; i < scratch.length; i++) {
                    values[scratch.ints[scratch.offset + i]] += value;
                }
            }
        }
    }
    rollup();
}
Also used : MatchingDocs(org.apache.lucene.facet.FacetsCollector.MatchingDocs) DoubleValues(org.apache.lucene.search.DoubleValues) IntsRef(org.apache.lucene.util.IntsRef) DocIdSetIterator(org.apache.lucene.search.DocIdSetIterator)

Example 50 with IntsRef

use of org.apache.lucene.util.IntsRef in project lucene-solr by apache.

the class BaseTokenStreamTestCase method getGraphStrings.

/**
 * Returns all paths accepted by the token stream graph produced by the already initialized
 * {@link TokenStream}, with every {@link TokenStreamToAutomaton#POS_SEP} character replaced
 * by a space.
 */
public static Set<String> getGraphStrings(TokenStream tokenStream) throws IOException {
    TokenStreamToAutomaton converter = new TokenStreamToAutomaton();
    Automaton graph = converter.toAutomaton(tokenStream);
    Set<String> result = new HashSet<>();
    // one builder is reused as scratch space for every conversion
    BytesRefBuilder utf8Scratch = new BytesRefBuilder();
    char separator = (char) TokenStreamToAutomaton.POS_SEP;
    for (IntsRef path : AutomatonTestUtil.getFiniteStringsRecursive(graph, -1)) {
        String decoded = Util.toBytesRef(path, utf8Scratch).utf8ToString();
        result.add(decoded.replace(separator, ' '));
    }
    return result;
}
Also used : BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) Automaton(org.apache.lucene.util.automaton.Automaton) IntsRef(org.apache.lucene.util.IntsRef)

Aggregations

IntsRef (org.apache.lucene.util.IntsRef) 63, BytesRef (org.apache.lucene.util.BytesRef) 19, IntsRefBuilder (org.apache.lucene.util.IntsRefBuilder) 19, HashSet (java.util.HashSet) 16, ArrayList (java.util.ArrayList) 13, Automaton (org.apache.lucene.util.automaton.Automaton) 13, BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder) 12, TokenStreamToAutomaton (org.apache.lucene.analysis.TokenStreamToAutomaton) 9, IOException (java.io.IOException) 7, Directory (org.apache.lucene.store.Directory) 7, HashMap (java.util.HashMap) 5, FiniteStringsIterator (org.apache.lucene.util.automaton.FiniteStringsIterator) 5, BytesReader (org.apache.lucene.util.fst.FST.BytesReader) 5, Pair (org.apache.lucene.util.fst.PairOutputs.Pair) 5, ByteArrayInputStream (java.io.ByteArrayInputStream) 4, FilterInputStream (java.io.FilterInputStream) 4, InputStream (java.io.InputStream) 4, Map (java.util.Map) 4, Random (java.util.Random) 4, TokenStream (org.apache.lucene.analysis.TokenStream) 4