Search in sources :

Example 56 with IntsRef

use of org.apache.lucene.util.IntsRef in project lucene-solr by apache.

the class TestFSTsMisc method testRandomWords.

private void testRandomWords(int maxNumWords, int numIter) throws IOException {
    Random random = new Random(random().nextLong());
    for (int iter = 0; iter < numIter; iter++) {
        if (VERBOSE) {
            System.out.println("\nTEST: iter " + iter);
        for (int inputMode = 0; inputMode < 2; inputMode++) {
            final int numWords = random.nextInt(maxNumWords + 1);
            Set<IntsRef> termsSet = new HashSet<>();
            IntsRef[] terms = new IntsRef[numWords];
            while (termsSet.size() < numWords) {
                final String term = getRandomString(random);
                termsSet.add(toIntsRef(term, inputMode));
            doTest(inputMode, termsSet.toArray(new IntsRef[termsSet.size()]));
Also used : Random(java.util.Random) IntsRef(org.apache.lucene.util.IntsRef) FSTTester.toIntsRef(org.apache.lucene.util.fst.FSTTester.toIntsRef) FSTTester.getRandomString(org.apache.lucene.util.fst.FSTTester.getRandomString) HashSet(java.util.HashSet)

Example 57 with IntsRef

use of org.apache.lucene.util.IntsRef in project lucene-solr by apache.

the class TestTokenInfoDictionary method testEnumerateAll.

/** enumerates the entire FST/lookup data and just does basic sanity checks */
public void testEnumerateAll() throws Exception {
    // just for debugging
    int numTerms = 0;
    int numWords = 0;
    int lastWordId = -1;
    int lastSourceId = -1;
    TokenInfoDictionary tid = TokenInfoDictionary.getInstance();
    ConnectionCosts matrix = ConnectionCosts.getInstance();
    FST<Long> fst = tid.getFST().getInternalFST();
    IntsRefFSTEnum<Long> fstEnum = new IntsRefFSTEnum<>(fst);
    InputOutput<Long> mapping;
    IntsRef scratch = new IntsRef();
    while ((mapping = != null) {
        IntsRef input = mapping.input;
        char[] chars = new char[input.length];
        for (int i = 0; i < chars.length; i++) {
            chars[i] = (char) input.ints[input.offset + i];
        assertTrue(UnicodeUtil.validUTF16String(new String(chars)));
        Long output = mapping.output;
        int sourceId = output.intValue();
        // we walk in order, terms, sourceIds, and wordIds should always be increasing
        assertTrue(sourceId > lastSourceId);
        lastSourceId = sourceId;
        tid.lookupWordIds(sourceId, scratch);
        for (int i = 0; i < scratch.length; i++) {
            int wordId = scratch.ints[scratch.offset + i];
            assertTrue(wordId > lastWordId);
            lastWordId = wordId;
            String baseForm = tid.getBaseForm(wordId, chars, 0, chars.length);
            assertTrue(baseForm == null || UnicodeUtil.validUTF16String(baseForm));
            String inflectionForm = tid.getInflectionForm(wordId);
            assertTrue(inflectionForm == null || UnicodeUtil.validUTF16String(inflectionForm));
            if (inflectionForm != null) {
                // check that it's actually an ipadic inflection form
            String inflectionType = tid.getInflectionType(wordId);
            assertTrue(inflectionType == null || UnicodeUtil.validUTF16String(inflectionType));
            if (inflectionType != null) {
                // check that it's actually an ipadic inflection type
            int leftId = tid.getLeftId(wordId);
            int rightId = tid.getRightId(wordId);
            matrix.get(rightId, leftId);
            String pos = tid.getPartOfSpeech(wordId);
            // check that it's actually an ipadic pos tag
            String pronunciation = tid.getPronunciation(wordId, chars, 0, chars.length);
            String reading = tid.getReading(wordId, chars, 0, chars.length);
    if (VERBOSE) {
        System.out.println("checked " + numTerms + " terms, " + numWords + " words.");
Also used : IntsRefFSTEnum(org.apache.lucene.util.fst.IntsRefFSTEnum) IntsRef(org.apache.lucene.util.IntsRef)

Example 58 with IntsRef

use of org.apache.lucene.util.IntsRef in project lucene-solr by apache.

the class TestGraphTokenizers method toPathStrings.

/** Returns all paths */
private Set<String> toPathStrings(Automaton a) {
    BytesRefBuilder scratchBytesRefBuilder = new BytesRefBuilder();
    Set<String> paths = new HashSet<>();
    for (IntsRef ir : AutomatonTestUtil.getFiniteStringsRecursive(a, -1)) {
        paths.add(Util.toBytesRef(ir, scratchBytesRefBuilder).utf8ToString().replace((char) TokenStreamToAutomaton.POS_SEP, ' '));
    return paths;
Also used : BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) IntsRef(org.apache.lucene.util.IntsRef) HashSet(java.util.HashSet)

Example 59 with IntsRef

use of org.apache.lucene.util.IntsRef in project lucene-solr by apache.

the class TestFSTs method main.

// TODO: try experiment: reverse terms before
// compressing -- how much smaller?
// TODO: can FST be used to index all internal substrings,
// mapping to term?
// java -cp ../build/codecs/classes/java:../test-framework/lib/randomizedtesting-runner-*.jar:../build/core/classes/test:../build/core/classes/test-framework:../build/core/classes/java:../build/test-framework/classes/java:../test-framework/lib/junit-4.10.jar org.apache.lucene.util.fst.TestFSTs /xold/tmp/allTerms3.txt out
public static void main(String[] args) throws IOException {
    int prune = 0;
    int limit = Integer.MAX_VALUE;
    // utf8
    int inputMode = 0;
    boolean storeOrds = false;
    boolean storeDocFreqs = false;
    boolean verify = true;
    boolean noArcArrays = false;
    Path wordsFileIn = null;
    Path dirOut = null;
    int idx = 0;
    while (idx < args.length) {
        if (args[idx].equals("-prune")) {
            prune = Integer.parseInt(args[1 + idx]);
        } else if (args[idx].equals("-limit")) {
            limit = Integer.parseInt(args[1 + idx]);
        } else if (args[idx].equals("-utf8")) {
            inputMode = 0;
        } else if (args[idx].equals("-utf32")) {
            inputMode = 1;
        } else if (args[idx].equals("-docFreq")) {
            storeDocFreqs = true;
        } else if (args[idx].equals("-noArcArrays")) {
            noArcArrays = true;
        } else if (args[idx].equals("-ords")) {
            storeOrds = true;
        } else if (args[idx].equals("-noverify")) {
            verify = false;
        } else if (args[idx].startsWith("-")) {
            System.err.println("Unrecognized option: " + args[idx]);
        } else {
            if (wordsFileIn == null) {
                wordsFileIn = Paths.get(args[idx]);
            } else if (dirOut == null) {
                dirOut = Paths.get(args[idx]);
            } else {
                System.err.println("Too many arguments, expected: input [output]");
    if (wordsFileIn == null) {
        System.err.println("No input file.");
    if (storeOrds && storeDocFreqs) {
        // Store both ord & docFreq:
        final PositiveIntOutputs o1 = PositiveIntOutputs.getSingleton();
        final PositiveIntOutputs o2 = PositiveIntOutputs.getSingleton();
        final PairOutputs<Long, Long> outputs = new PairOutputs<>(o1, o2);
        new VisitTerms<PairOutputs.Pair<Long, Long>>(dirOut, wordsFileIn, inputMode, prune, outputs, noArcArrays) {

            Random rand;

            public PairOutputs.Pair<Long, Long> getOutput(IntsRef input, int ord) {
                if (ord == 0) {
                    rand = new Random(17);
                return outputs.newPair((long) ord, (long) TestUtil.nextInt(rand, 1, 5000));
        }.run(limit, verify, false);
    } else if (storeOrds) {
        // Store only ords
        final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
        new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs, noArcArrays) {

            public Long getOutput(IntsRef input, int ord) {
                return (long) ord;
        }.run(limit, verify, true);
    } else if (storeDocFreqs) {
        // Store only docFreq
        final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
        new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs, noArcArrays) {

            Random rand;

            public Long getOutput(IntsRef input, int ord) {
                if (ord == 0) {
                    rand = new Random(17);
                return (long) TestUtil.nextInt(rand, 1, 5000);
        }.run(limit, verify, false);
    } else {
        // Store nothing
        final NoOutputs outputs = NoOutputs.getSingleton();
        final Object NO_OUTPUT = outputs.getNoOutput();
        new VisitTerms<Object>(dirOut, wordsFileIn, inputMode, prune, outputs, noArcArrays) {

            public Object getOutput(IntsRef input, int ord) {
                return NO_OUTPUT;
        }.run(limit, verify, false);
Also used : Path(java.nio.file.Path) Random(java.util.Random) IntsRef(org.apache.lucene.util.IntsRef) FSTTester.toIntsRef(org.apache.lucene.util.fst.FSTTester.toIntsRef) Pair(org.apache.lucene.util.fst.PairOutputs.Pair)

Example 60 with IntsRef

use of org.apache.lucene.util.IntsRef in project lucene-solr by apache.

the class Test2BFST method test.

public void test() throws Exception {
    int[] ints = new int[7];
    IntsRef input = new IntsRef(ints, 0, ints.length);
    long seed = random().nextLong();
    Directory dir = new MMapDirectory(createTempDir("2BFST"));
    for (int iter = 0; iter < 1; iter++) {
        // Build FST w/ NoOutputs and stop when nodeCount > 2.2B
            System.out.println("\nTEST: 3B nodes; doPack=false output=NO_OUTPUTS");
            Outputs<Object> outputs = NoOutputs.getSingleton();
            Object NO_OUTPUT = outputs.getNoOutput();
            final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
            int count = 0;
            Random r = new Random(seed);
            int[] ints2 = new int[200];
            IntsRef input2 = new IntsRef(ints2, 0, ints2.length);
            while (true) {
                //System.out.println("add: " + input + " -> " + output);
                for (int i = 10; i < ints2.length; i++) {
                    ints2[i] = r.nextInt(256);
                b.add(input2, NO_OUTPUT);
                if (count % 100000 == 0) {
                    System.out.println(count + ": " + b.fstRamBytesUsed() + " bytes; " + b.getNodeCount() + " nodes");
                if (b.getNodeCount() > Integer.MAX_VALUE + 100L * 1024 * 1024) {
                nextInput(r, ints2);
            FST<Object> fst = b.finish();
            for (int verify = 0; verify < 2; verify++) {
                System.out.println("\nTEST: now verify [fst size=" + fst.ramBytesUsed() + "; nodeCount=" + b.getNodeCount() + "; arcCount=" + b.getArcCount() + "]");
                Arrays.fill(ints2, 0);
                r = new Random(seed);
                for (int i = 0; i < count; i++) {
                    if (i % 1000000 == 0) {
                        System.out.println(i + "...: ");
                    for (int j = 10; j < ints2.length; j++) {
                        ints2[j] = r.nextInt(256);
                    assertEquals(NO_OUTPUT, Util.get(fst, input2));
                    nextInput(r, ints2);
                System.out.println("\nTEST: enum all input/outputs");
                IntsRefFSTEnum<Object> fstEnum = new IntsRefFSTEnum<>(fst);
                Arrays.fill(ints2, 0);
                r = new Random(seed);
                int upto = 0;
                while (true) {
                    IntsRefFSTEnum.InputOutput<Object> pair =;
                    if (pair == null) {
                    for (int j = 10; j < ints2.length; j++) {
                        ints2[j] = r.nextInt(256);
                    assertEquals(input2, pair.input);
                    assertEquals(NO_OUTPUT, pair.output);
                    nextInput(r, ints2);
                assertEquals(count, upto);
                if (verify == 0) {
                    System.out.println("\nTEST: save/load FST and re-verify");
                    IndexOutput out = dir.createOutput("fst", IOContext.DEFAULT);
                    IndexInput in = dir.openInput("fst", IOContext.DEFAULT);
                    fst = new FST<>(in, outputs);
                } else {
        // Build FST w/ ByteSequenceOutputs and stop when FST
        // size = 3GB
            System.out.println("\nTEST: 3 GB size; outputs=bytes");
            Outputs<BytesRef> outputs = ByteSequenceOutputs.getSingleton();
            final Builder<BytesRef> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
            byte[] outputBytes = new byte[20];
            BytesRef output = new BytesRef(outputBytes);
            Arrays.fill(ints, 0);
            int count = 0;
            Random r = new Random(seed);
            while (true) {
                //System.out.println("add: " + input + " -> " + output);
                b.add(input, BytesRef.deepCopyOf(output));
                if (count % 1000000 == 0) {
                    System.out.println(count + "...: " + b.fstRamBytesUsed() + " bytes");
                if (b.fstRamBytesUsed() > LIMIT) {
                nextInput(r, ints);
            FST<BytesRef> fst = b.finish();
            for (int verify = 0; verify < 2; verify++) {
                System.out.println("\nTEST: now verify [fst size=" + fst.ramBytesUsed() + "; nodeCount=" + b.getNodeCount() + "; arcCount=" + b.getArcCount() + "]");
                r = new Random(seed);
                Arrays.fill(ints, 0);
                for (int i = 0; i < count; i++) {
                    if (i % 1000000 == 0) {
                        System.out.println(i + "...: ");
                    assertEquals(output, Util.get(fst, input));
                    nextInput(r, ints);
                System.out.println("\nTEST: enum all input/outputs");
                IntsRefFSTEnum<BytesRef> fstEnum = new IntsRefFSTEnum<>(fst);
                Arrays.fill(ints, 0);
                r = new Random(seed);
                int upto = 0;
                while (true) {
                    IntsRefFSTEnum.InputOutput<BytesRef> pair =;
                    if (pair == null) {
                    assertEquals(input, pair.input);
                    assertEquals(output, pair.output);
                    nextInput(r, ints);
                assertEquals(count, upto);
                if (verify == 0) {
                    System.out.println("\nTEST: save/load FST and re-verify");
                    IndexOutput out = dir.createOutput("fst", IOContext.DEFAULT);
                    IndexInput in = dir.openInput("fst", IOContext.DEFAULT);
                    fst = new FST<>(in, outputs);
                } else {
        // Build FST w/ PositiveIntOutputs and stop when FST
        // size = 3GB
            System.out.println("\nTEST: 3 GB size; outputs=long");
            Outputs<Long> outputs = PositiveIntOutputs.getSingleton();
            final Builder<Long> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
            long output = 1;
            Arrays.fill(ints, 0);
            int count = 0;
            Random r = new Random(seed);
            while (true) {
                //System.out.println("add: " + input + " -> " + output);
                b.add(input, output);
                output += 1 + r.nextInt(10);
                if (count % 1000000 == 0) {
                    System.out.println(count + "...: " + b.fstRamBytesUsed() + " bytes");
                if (b.fstRamBytesUsed() > LIMIT) {
                nextInput(r, ints);
            FST<Long> fst = b.finish();
            for (int verify = 0; verify < 2; verify++) {
                System.out.println("\nTEST: now verify [fst size=" + fst.ramBytesUsed() + "; nodeCount=" + b.getNodeCount() + "; arcCount=" + b.getArcCount() + "]");
                Arrays.fill(ints, 0);
                output = 1;
                r = new Random(seed);
                for (int i = 0; i < count; i++) {
                    if (i % 1000000 == 0) {
                        System.out.println(i + "...: ");
                    // forward lookup:
                    assertEquals(output, Util.get(fst, input).longValue());
                    // reverse lookup:
                    assertEquals(input, Util.getByOutput(fst, output));
                    output += 1 + r.nextInt(10);
                    nextInput(r, ints);
                System.out.println("\nTEST: enum all input/outputs");
                IntsRefFSTEnum<Long> fstEnum = new IntsRefFSTEnum<>(fst);
                Arrays.fill(ints, 0);
                r = new Random(seed);
                int upto = 0;
                output = 1;
                while (true) {
                    IntsRefFSTEnum.InputOutput<Long> pair =;
                    if (pair == null) {
                    assertEquals(input, pair.input);
                    assertEquals(output, pair.output.longValue());
                    output += 1 + r.nextInt(10);
                    nextInput(r, ints);
                assertEquals(count, upto);
                if (verify == 0) {
                    System.out.println("\nTEST: save/load FST and re-verify");
                    IndexOutput out = dir.createOutput("fst", IOContext.DEFAULT);
                    IndexInput in = dir.openInput("fst", IOContext.DEFAULT);
                    fst = new FST<>(in, outputs);
                } else {
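Also used : IndexOutput(org.apache.lucene.store.IndexOutput) MMapDirectory(org.apache.lucene.store.MMapDirectory) Random(java.util.Random) IndexInput(org.apache.lucene.store.IndexInput) IntsRef(org.apache.lucene.util.IntsRef) BytesRef(org.apache.lucene.util.BytesRef) MMapDirectory(org.apache.lucene.store.MMapDirectory) Directory(org.apache.lucene.store.Directory)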


IntsRef (org.apache.lucene.util.IntsRef)63 BytesRef (org.apache.lucene.util.BytesRef)19 IntsRefBuilder (org.apache.lucene.util.IntsRefBuilder)19 HashSet (java.util.HashSet)16 ArrayList (java.util.ArrayList)13 Automaton (org.apache.lucene.util.automaton.Automaton)13 BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder)12 TokenStreamToAutomaton (org.apache.lucene.analysis.TokenStreamToAutomaton)9 IOException (java.io.IOException) Directory (org.apache.lucene.store.Directory) HashMap (java.util.HashMap)5 FiniteStringsIterator (org.apache.lucene.util.automaton.FiniteStringsIterator)5 BytesReader (org.apache.lucene.util.fst.FST.BytesReader)5 Pair (org.apache.lucene.util.fst.PairOutputs.Pair)5 ByteArrayInputStream (java.io.ByteArrayInputStream) FilterInputStream (java.io.FilterInputStream) InputStream (java.io.InputStream) Map (java.util.Map)4 Random (java.util.Random)4 TokenStream (org.apache.lucene.analysis.TokenStream)4