Search in sources :

Example 26 with IntsRefBuilder

use of org.apache.lucene.util.IntsRefBuilder in project lucene-solr by apache.

the class FSTTester method verifyPruned.

/**
 * Verifies a pruned FST against a brute-force model built from {@code pairs}.
 *
 * <p>Every prefix of every input (from the empty prefix up to the full input) is tallied
 * into a map holding how many inputs share that prefix and the {@code outputs.common(...)}
 * of their outputs. The map is then pruned according to {@code prune1} / {@code prune2},
 * and the FST is checked in both directions: each entry the FST enumerates must be a
 * surviving prefix with the expected output, and each surviving non-empty prefix must be
 * fully consumed by {@code run(...)} with the expected output.
 *
 * @param inputMode only forwarded to {@code inputToString} for verbose logging
 * @param fst       the pruned FST under test; asserted to be {@code null} when at most one
 *                  prefix survives pruning, non-null otherwise
 * @param prune1    when {@code > 0}, a prefix is kept iff its count is {@code >= prune1}
 * @param prune2    consulted only when {@code prune1 <= 0} (then asserted {@code > 0}); a
 *                  prefix is kept based on its own count and/or its parent's count
 * @throws IOException presumably propagated from enumerating / running the FST
 */
private void verifyPruned(int inputMode, FST<T> fst, int prune1, int prune2) throws IOException {
    if (LuceneTestCase.VERBOSE) {
        System.out.println("TEST: now verify pruned " + pairs.size() + " terms; outputs=" + outputs);
        for (InputOutput<T> pair : pairs) {
            System.out.println("  " + inputToString(inputMode, pair.input) + ": " + outputs.outputToString(pair.output));
        }
    }
    // To validate the FST, we brute-force compute all prefixes
    // in the terms, matched to their "common" outputs, prune that
    // set according to the prune thresholds, then assert the FST
    // matches that same set.
    // NOTE: Crazy RAM intensive!!
    //System.out.println("TEST: tally prefixes");
    // build all prefixes
    final Map<IntsRef, CountMinOutput<T>> prefixes = new HashMap<>();
    // Single reusable buffer: it is truncated to each prefix length in turn and used as the
    // lookup key, and is reused again by the pruning phase below.
    final IntsRefBuilder scratch = new IntsRefBuilder();
    for (InputOutput<T> pair : pairs) {
        scratch.copyInts(pair.input);
        // idx runs from 0 (empty prefix) through pair.input.length (the whole input).
        for (int idx = 0; idx <= pair.input.length; idx++) {
            scratch.setLength(idx);
            CountMinOutput<T> cmo = prefixes.get(scratch.get());
            if (cmo == null) {
                // First time this prefix is seen: start its tally with this pair's output.
                cmo = new CountMinOutput<>();
                cmo.count = 1;
                cmo.output = pair.output;
                // NOTE(review): toIntsRef() is used for the stored key instead of get(),
                // presumably because it yields an independent copy while scratch keeps
                // being mutated -- confirm against IntsRefBuilder.
                prefixes.put(scratch.toIntsRef(), cmo);
            } else {
                cmo.count++;
                // Replace anything equal to "no output" with the exact instance returned by
                // getNoOutput() before calling common(); presumably common() relies on
                // identity for the no-output case -- TODO confirm.
                T output1 = cmo.output;
                if (output1.equals(outputs.getNoOutput())) {
                    output1 = outputs.getNoOutput();
                }
                T output2 = pair.output;
                if (output2.equals(outputs.getNoOutput())) {
                    output2 = outputs.getNoOutput();
                }
                cmo.output = outputs.common(output1, output2);
            }
            if (idx == pair.input.length) {
                // The prefix equals a complete input, so it is a final (accepting) entry;
                // finalOutput snapshots the common output as of this point.
                cmo.isFinal = true;
                cmo.finalOutput = cmo.output;
            }
        }
    }
    if (LuceneTestCase.VERBOSE) {
        System.out.println("TEST: now prune");
    }
    // prune 'em
    // An explicit iterator is used so entries can be removed while walking the map.
    final Iterator<Map.Entry<IntsRef, CountMinOutput<T>>> it = prefixes.entrySet().iterator();
    while (it.hasNext()) {
        Map.Entry<IntsRef, CountMinOutput<T>> ent = it.next();
        final IntsRef prefix = ent.getKey();
        final CountMinOutput<T> cmo = ent.getValue();
        if (LuceneTestCase.VERBOSE) {
            System.out.println("  term prefix=" + inputToString(inputMode, prefix, false) + " count=" + cmo.count + " isLeaf=" + cmo.isLeaf + " output=" + outputs.outputToString(cmo.output) + " isFinal=" + cmo.isFinal);
        }
        final boolean keep;
        if (prune1 > 0) {
            // prune1 mode: keep purely on this prefix's own count.
            keep = cmo.count >= prune1;
        } else {
            assert prune2 > 0;
            if (prune2 > 1 && cmo.count >= prune2) {
                keep = true;
            } else if (prefix.length > 0) {
                // consult our parent
                // Build the parent key (prefix minus its last int) in scratch.
                // NOTE(review): writes straight into scratch.ints(); assumes the backing
                // array is already large enough from the copyInts calls above -- confirm.
                scratch.setLength(prefix.length - 1);
                System.arraycopy(prefix.ints, prefix.offset, scratch.ints(), 0, scratch.length());
                final CountMinOutput<T> cmo2 = prefixes.get(scratch.get());
                //System.out.println("    parent count = " + (cmo2 == null ? -1 : cmo2.count));
                // A parent missing from the map (cmo2 == null) means this prefix is dropped.
                // Otherwise: with prune2 > 1 the parent's count must reach prune2; with
                // prune2 == 1 the parent must have count >= 2 or this prefix has length <= 1.
                keep = cmo2 != null && ((prune2 > 1 && cmo2.count >= prune2) || (prune2 == 1 && (cmo2.count >= 2 || prefix.length <= 1)));
            } else if (cmo.count >= prune2) {
                // Empty prefix: it has no parent, so fall back to its own count.
                keep = true;
            } else {
                keep = false;
            }
        }
        if (!keep) {
            it.remove();
        //System.out.println("    remove");
        } else {
            // clear isLeaf for all ancestors
            //System.out.println("    keep");
            // Walk upward by repeatedly shrinking scratch by one int; the loop ends once the
            // length drops to -1, i.e. after the empty prefix has been visited.
            // NOTE(review): isLeaf is only ever cleared here; it presumably defaults to true
            // in CountMinOutput -- confirm.
            scratch.copyInts(prefix);
            scratch.setLength(scratch.length() - 1);
            while (scratch.length() >= 0) {
                final CountMinOutput<T> cmo2 = prefixes.get(scratch.get());
                if (cmo2 != null) {
                    //System.out.println("    clear isLeaf " + inputToString(inputMode, scratch));
                    cmo2.isLeaf = false;
                }
                scratch.setLength(scratch.length() - 1);
            }
        }
    }
    if (LuceneTestCase.VERBOSE) {
        System.out.println("TEST: after prune");
        for (Map.Entry<IntsRef, CountMinOutput<T>> ent : prefixes.entrySet()) {
            System.out.println("  " + inputToString(inputMode, ent.getKey(), false) + ": isLeaf=" + ent.getValue().isLeaf + " isFinal=" + ent.getValue().isFinal);
            if (ent.getValue().isFinal) {
                System.out.println("    finalOutput=" + outputs.outputToString(ent.getValue().finalOutput));
            }
        }
    }
    // With at most one surviving prefix the FST is expected not to exist at all.
    if (prefixes.size() <= 1) {
        assertNull(fst);
        return;
    }
    assertNotNull(fst);
    // make sure FST only enums valid prefixes
    if (LuceneTestCase.VERBOSE) {
        System.out.println("TEST: check pruned enum");
    }
    IntsRefFSTEnum<T> fstEnum = new IntsRefFSTEnum<>(fst);
    IntsRefFSTEnum.InputOutput<T> current;
    while ((current = fstEnum.next()) != null) {
        if (LuceneTestCase.VERBOSE) {
            System.out.println("  fstEnum.next prefix=" + inputToString(inputMode, current.input, false) + " output=" + outputs.outputToString(current.output));
        }
        // Everything the FST enumerates must be a surviving prefix that is a leaf or final.
        final CountMinOutput<T> cmo = prefixes.get(current.input);
        assertNotNull(cmo);
        assertTrue(cmo.isLeaf || cmo.isFinal);
        //if (cmo.isFinal && !cmo.isLeaf) {
        // Final entries are compared against the snapshot taken when the full input was
        // tallied; non-final ones against the running common output.
        if (cmo.isFinal) {
            assertEquals(cmo.finalOutput, current.output);
        } else {
            assertEquals(cmo.output, current.output);
        }
    }
    // make sure all non-pruned prefixes are present in the FST
    if (LuceneTestCase.VERBOSE) {
        System.out.println("TEST: verify all prefixes");
    }
    // One-element array used as an out-parameter for run(...); it is compared to the
    // prefix length below, so run presumably stores how many ints it consumed -- confirm.
    final int[] stopNode = new int[1];
    for (Map.Entry<IntsRef, CountMinOutput<T>> ent : prefixes.entrySet()) {
        // The empty prefix is skipped; only non-empty prefixes are run through the FST.
        if (ent.getKey().length > 0) {
            final CountMinOutput<T> cmo = ent.getValue();
            final T output = run(fst, ent.getKey(), stopNode);
            if (LuceneTestCase.VERBOSE) {
                System.out.println("TEST: verify prefix=" + inputToString(inputMode, ent.getKey(), false) + " output=" + outputs.outputToString(cmo.output));
            }
            // if (cmo.isFinal && !cmo.isLeaf) {
            if (cmo.isFinal) {
                assertEquals(cmo.finalOutput, output);
            } else {
                assertEquals(cmo.output, output);
            }
            assertEquals(ent.getKey().length, stopNode[0]);
        }
    }
}
Also used : HashMap(java.util.HashMap) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) IntsRef(org.apache.lucene.util.IntsRef) HashMap(java.util.HashMap) Map(java.util.Map)

Example 27 with IntsRefBuilder

use of org.apache.lucene.util.IntsRefBuilder in project lucene-solr by apache.

the class BaseSynonymParserTestCase method assertEntryEquals.

/**
 * Asserts that a parsed synonym map contains the expected entry for a single input.
 *
 * <p>The entry is looked up in the map's FST, its header is decoded (lowest bit: keep-original
 * flag, remaining bits: synonym count), and every stored synonym is checked for membership
 * in the expected set.
 *
 * @param synonynMap  the synonym map produced by the parser under test
 * @param word        the input word or phrase as emitted by the analyzer; spaces are replaced
 *                    by {@code SynonymMap.WORD_SEPARATOR} before the lookup
 * @param includeOrig whether the entry is expected to keep the original token
 * @param synonyms    the expected synonyms, with words separated by a single space
 * @throws Exception if the lookup or decoding fails
 */
public static void assertEntryEquals(SynonymMap synonynMap, String word, boolean includeOrig, String[] synonyms) throws Exception {
    // Normalise the phrase to the separator-joined form before looking it up.
    word = word.replace(' ', SynonymMap.WORD_SEPARATOR);
    final IntsRefBuilder keyBuilder = new IntsRefBuilder();
    final BytesRef encoded = Util.get(synonynMap.fst, Util.toUTF32(new CharsRef(word), keyBuilder));
    assertNotNull("No synonyms found for: " + word, encoded);

    // Header vInt: bit 0 cleared means "keep original"; the remaining bits hold the count.
    final ByteArrayDataInput in = new ByteArrayDataInput(encoded.bytes, encoded.offset, encoded.length);
    final int header = in.readVInt();
    final boolean keepOrig = (header & 0x1) == 0;
    assertEquals("Include original different than expected. Expected " + includeOrig + " was " + keepOrig, includeOrig, keepOrig);
    final int count = header >>> 1;
    assertEquals("Invalid synonym count. Expected " + synonyms.length + " was " + count, synonyms.length, count);

    // Each following vInt is an id into the map's word table.
    final Set<String> expected = new HashSet<>(Arrays.asList(synonyms));
    final BytesRef wordBytes = new BytesRef();
    int remaining = count;
    while (remaining-- > 0) {
        final int wordId = in.readVInt();
        synonynMap.words.get(wordId, wordBytes);
        final String found = wordBytes.utf8ToString().replace(SynonymMap.WORD_SEPARATOR, ' ');
        assertTrue("Unexpected synonym found: " + found, expected.contains(found));
    }
}
Also used : IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) ByteArrayDataInput(org.apache.lucene.store.ByteArrayDataInput) BytesRef(org.apache.lucene.util.BytesRef) CharsRef(org.apache.lucene.util.CharsRef) HashSet(java.util.HashSet)

Example 28 with IntsRefBuilder

use of org.apache.lucene.util.IntsRefBuilder in project lucene-solr by apache.

the class MemoryDocValuesConsumer method writeFST.

/**
 * Writes one field's values as an FST that maps each value to its position in the iteration.
 *
 * <p>The metadata stream receives the field number, the {@code FST} marker byte, the data
 * file pointer at which the FST begins, and finally the number of values. The FST itself
 * is saved to the data stream only when {@code finish()} returns a non-null result.
 *
 * @param field  the field whose number is recorded in the metadata
 * @param values the values to add, in the order they are iterated
 * @throws IOException if writing to the metadata or data output fails
 */
private void writeFST(FieldInfo field, Iterable<BytesRef> values) throws IOException {
    // Metadata header: which field, which format, and where the FST starts.
    meta.writeVInt(field.number);
    meta.writeByte(FST);
    meta.writeLong(data.getFilePointer());

    final Builder<Long> fstBuilder = new Builder<>(INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton());
    final IntsRefBuilder intsScratch = new IntsRefBuilder();
    long nextOrd = 0;
    for (BytesRef value : values) {
        // The output for each value is its zero-based position in the iteration.
        fstBuilder.add(Util.toIntsRef(value, intsScratch), nextOrd++);
    }

    final FST<Long> built = fstBuilder.finish();
    if (built != null) {
        built.save(data);
    }
    // After the loop nextOrd equals the number of values added.
    meta.writeVLong(nextOrd);
}
Also used : PositiveIntOutputs(org.apache.lucene.util.fst.PositiveIntOutputs) Builder(org.apache.lucene.util.fst.Builder) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) BytesRef(org.apache.lucene.util.BytesRef)

Example 29 with IntsRefBuilder

use of org.apache.lucene.util.IntsRefBuilder in project lucene-solr by apache.

the class MemoryDocValuesProducer method getSortedNonIterator.

/**
 * Builds a {@code LegacySortedDocValues} view for a field, backed by that field's FST.
 *
 * <p>Returns the empty instance when the field's entry reports zero ordinals. Otherwise the
 * FST is fetched from {@code fstInstances}, or loaded from a clone of the data input at the
 * entry's offset; a freshly loaded FST is cached and counted in {@code ramBytesUsed} only
 * when {@code merging} is false.
 *
 * @param field the field to expose
 * @return sorted doc values whose ord/term lookups go through the FST
 * @throws IOException if loading the FST or the numeric doc-to-ord values fails
 */
private LegacySortedDocValues getSortedNonIterator(FieldInfo field) throws IOException {
    final FSTEntry entry = fsts.get(field.name);
    if (entry.numOrds == 0) {
        return DocValues.emptyLegacySorted();
    }

    final FST<Long> fst;
    synchronized (this) {
        FST<Long> cached = fstInstances.get(field.name);
        if (cached == null) {
            // Not loaded yet: read it from a private clone positioned at the entry's offset.
            final IndexInput input = this.data.clone();
            input.seek(entry.offset);
            cached = new FST<>(input, PositiveIntOutputs.getSingleton());
            if (!merging) {
                ramBytesUsed.addAndGet(cached.ramBytesUsed());
                fstInstances.put(field.name, cached);
            }
        }
        fst = cached;
    }

    final LegacyNumericDocValues docToOrd = getNumericNonIterator(field);
    // per-thread resources
    final BytesReader fstReader = fst.getBytesReader();
    final Arc<Long> rootArc = new Arc<>();
    final Arc<Long> workArc = new Arc<>();
    final IntsRefBuilder intsScratch = new IntsRefBuilder();
    final BytesRefFSTEnum<Long> termEnum = new BytesRefFSTEnum<>(fst);

    return new LegacySortedDocValues() {

        // Reused buffer that receives the term bytes for each ord lookup.
        final BytesRefBuilder termBuffer = new BytesRefBuilder();

        @Override
        public int getOrd(int docID) {
            return (int) docToOrd.get(docID);
        }

        @Override
        public BytesRef lookupOrd(int ord) {
            try {
                // Restart from the FST's first arc for every lookup.
                fstReader.setPosition(0);
                fst.getFirstArc(rootArc);
                return Util.toBytesRef(Util.getByOutput(fst, ord, fstReader, rootArc, workArc, intsScratch), termBuffer);
            } catch (IOException bogus) {
                throw new RuntimeException(bogus);
            }
        }

        @Override
        public int lookupTerm(BytesRef key) {
            final InputOutput<Long> ceiling;
            try {
                ceiling = termEnum.seekCeil(key);
            } catch (IOException bogus) {
                throw new RuntimeException(bogus);
            }
            if (ceiling == null) {
                // Nothing at or after the key: encode an insertion point past the last ord.
                return -getValueCount() - 1;
            }
            if (ceiling.input.equals(key)) {
                return ceiling.output.intValue();
            }
            // Key is absent: encode the ceiling term's ord as the insertion point.
            return (int) -ceiling.output - 1;
        }

        @Override
        public int getValueCount() {
            return (int) entry.numOrds;
        }

        @Override
        public TermsEnum termsEnum() {
            return new FSTTermsEnum(fst);
        }
    };
}
Also used : BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) IOException(java.io.IOException) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) BytesRefFSTEnum(org.apache.lucene.util.fst.BytesRefFSTEnum) BytesReader(org.apache.lucene.util.fst.FST.BytesReader) Arc(org.apache.lucene.util.fst.FST.Arc) AtomicLong(java.util.concurrent.atomic.AtomicLong) ChecksumIndexInput(org.apache.lucene.store.ChecksumIndexInput) IndexInput(org.apache.lucene.store.IndexInput) IntsRef(org.apache.lucene.util.IntsRef) BytesRef(org.apache.lucene.util.BytesRef)

Example 30 with IntsRefBuilder

use of org.apache.lucene.util.IntsRefBuilder in project lucene-solr by apache.

the class FreeTextSuggester method build.

/**
 * Build the suggest index, using up to the specified amount of temporary RAM while
 * building. Note that the weights for the suggestions are ignored.
 *
 * <p>Every surface form from {@code iterator} is indexed into a temporary on-disk index;
 * the resulting terms are then moved into an FST and the temporary index is rolled back,
 * closed and deleted, whether or not the build succeeds.
 *
 * @param iterator        source of surface forms; must expose neither payloads nor contexts
 * @param ramBufferSizeMB RAM buffer size handed to the temporary {@code IndexWriter}
 * @throws IllegalArgumentException if the iterator has payloads or contexts, if no
 *         suggestion produces any term, or if a term holds more grams than {@code grams}
 * @throws IOException if the temporary index cannot be created, written or read
 */
public void build(InputIterator iterator, double ramBufferSizeMB) throws IOException {
    if (iterator.hasPayloads()) {
        throw new IllegalArgumentException("this suggester doesn't support payloads");
    }
    if (iterator.hasContexts()) {
        throw new IllegalArgumentException("this suggester doesn't support contexts");
    }
    String prefix = getClass().getSimpleName();
    Path tempIndexPath = Files.createTempDirectory(prefix + ".index.");
    // Declared before the try and opened inside it, so that a failure while opening the
    // directory or the writer still reaches the cleanup below (IOUtils ignores nulls).
    Directory dir = null;
    IndexWriter writer = null;
    IndexReader reader = null;
    boolean success = false;
    try {
        dir = FSDirectory.open(tempIndexPath);
        IndexWriterConfig iwc = new IndexWriterConfig(indexAnalyzer);
        iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        iwc.setRAMBufferSizeMB(ramBufferSizeMB);
        writer = new IndexWriter(dir, iwc);
        FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
        // TODO: if only we had IndexOptions.TERMS_ONLY...
        ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
        ft.setOmitNorms(true);
        ft.freeze();
        // One Document/Field pair is reused for every surface form.
        Document doc = new Document();
        Field field = new Field("body", "", ft);
        doc.add(field);
        totTokens = 0;
        count = 0;
        while (true) {
            BytesRef surfaceForm = iterator.next();
            if (surfaceForm == null) {
                break;
            }
            field.setStringValue(surfaceForm.utf8ToString());
            writer.addDocument(doc);
            count++;
        }
        reader = DirectoryReader.open(writer);
        Terms terms = MultiFields.getTerms(reader, "body");
        if (terms == null) {
            throw new IllegalArgumentException("need at least one suggestion");
        }
        // Move all ngrams into an FST:
        TermsEnum termsEnum = terms.iterator();
        Outputs<Long> outputs = PositiveIntOutputs.getSingleton();
        Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
        IntsRefBuilder scratchInts = new IntsRefBuilder();
        while (true) {
            BytesRef term = termsEnum.next();
            if (term == null) {
                break;
            }
            int ngramCount = countGrams(term);
            if (ngramCount > grams) {
                throw new IllegalArgumentException("tokens must not contain separator byte; got token=" + term + " but gramCount=" + ngramCount + ", which is greater than expected max ngram size=" + grams);
            }
            // Only unigrams contribute to the total token count.
            if (ngramCount == 1) {
                totTokens += termsEnum.totalTermFreq();
            }
            builder.add(Util.toIntsRef(term, scratchInts), encodeWeight(termsEnum.totalTermFreq()));
        }
        fst = builder.finish();
        if (fst == null) {
            throw new IllegalArgumentException("need at least one suggestion");
        }
        // Writer was only temporary, to count up bigrams,
        // which we transferred to the FST, so now we
        // rollback:
        writer.rollback();
        success = true;
    } finally {
        try {
            if (success) {
                // The writer was already rolled back above, so only reader and dir remain.
                IOUtils.close(reader, dir);
            } else {
                // Failure path: close quietly so the original exception is the one thrown.
                IOUtils.closeWhileHandlingException(reader, writer, dir);
            }
        } finally {
            IOUtils.rm(tempIndexPath);
        }
    }
}
Also used : Path(java.nio.file.Path) Builder(org.apache.lucene.util.fst.Builder) BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) Terms(org.apache.lucene.index.Terms) Document(org.apache.lucene.document.Document) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) FieldType(org.apache.lucene.document.FieldType) TermsEnum(org.apache.lucene.index.TermsEnum) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) IndexWriter(org.apache.lucene.index.IndexWriter) IndexReader(org.apache.lucene.index.IndexReader) BytesRef(org.apache.lucene.util.BytesRef) Directory(org.apache.lucene.store.Directory) FSDirectory(org.apache.lucene.store.FSDirectory) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Aggregations

IntsRefBuilder (org.apache.lucene.util.IntsRefBuilder)55 BytesRef (org.apache.lucene.util.BytesRef)32 BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder)25 IntsRef (org.apache.lucene.util.IntsRef)19 ArrayList (java.util.ArrayList)10 HashSet (java.util.HashSet)10 Builder (org.apache.lucene.util.fst.Builder)10 Arc (org.apache.lucene.util.fst.FST.Arc)9 BytesReader (org.apache.lucene.util.fst.FST.BytesReader)8 Map (java.util.Map)7 HashMap (java.util.HashMap)5 ByteArrayDataInput (org.apache.lucene.store.ByteArrayDataInput)5 CharsRef (org.apache.lucene.util.CharsRef)5 TestUtil (org.apache.lucene.util.TestUtil)5 FSTTester.getRandomString (org.apache.lucene.util.fst.FSTTester.getRandomString)5 FSTTester.simpleRandomString (org.apache.lucene.util.fst.FSTTester.simpleRandomString)5 TreeMap (java.util.TreeMap)4 CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder)4 Pair (org.apache.lucene.util.fst.PairOutputs.Pair)4 IOException (java.io.IOException)3