Search in sources :

Example 41 with IntsRefBuilder

use of org.apache.lucene.util.IntsRefBuilder in project lucene-solr by apache.

the class TestFSTsMisc method testListOfOutputsEmptyString.

/**
 * Builds a {@code BYTE1} FST over {@link ListOfOutputs}, adding several outputs for the
 * empty input as well as for "a" and "b", then asserts that each lookup returns exactly
 * the outputs that were added for that input, in the order they were added.
 */
public void testListOfOutputsEmptyString() throws Exception {
    final ListOfOutputs<Long> outputs = new ListOfOutputs<>(PositiveIntOutputs.getSingleton());
    final Builder<Object> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
    final IntsRefBuilder scratch = new IntsRefBuilder();

    // inputs[i] is expected to map to the outputs listed in expected[i].
    final String[] inputs = { "", "a", "b" };
    final long[][] expected = { { 0L, 1L, 17L, 1L }, { 1L, 3L, 0L }, { 0L } };

    // Nothing has been written into scratch yet, so scratch.get() is the empty input.
    for (long value : expected[0]) {
        builder.add(scratch.get(), value);
    }
    for (int i = 1; i < inputs.length; i++) {
        for (long value : expected[i]) {
            builder.add(Util.toIntsRef(new BytesRef(inputs[i]), scratch), value);
        }
    }
    final FST<Object> fst = builder.finish();

    for (int i = 0; i < inputs.length; i++) {
        final Object output = Util.get(fst, new BytesRef(inputs[i]));
        assertNotNull(output);
        final List<Long> outputList = outputs.asList(output);
        assertEquals(expected[i].length, outputList.size());
        for (int j = 0; j < expected[i].length; j++) {
            assertEquals(expected[i][j], outputList.get(j).longValue());
        }
    }
}
Also used : IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) BytesRef(org.apache.lucene.util.BytesRef)

Example 42 with IntsRefBuilder

use of org.apache.lucene.util.IntsRefBuilder in project lucene-solr by apache.

the class FacetsConfig method processFacetFields.

/**
 * For every index field in {@code byField}, registers each facet's category with the
 * taxonomy writer, adds one drill-down {@link StringField} per path prefix to
 * {@code doc}, and finally adds a single {@link BinaryDocValuesField} holding the
 * collected ordinals (after {@code dedupAndEncode}).
 *
 * @param taxoWriter taxonomy writer used to resolve category and parent ordinals
 * @param byField facet fields grouped by the index field name they are written to
 * @param doc document receiving the drill-down terms and the ordinals field
 * @throws IllegalArgumentException if a facet has more than one path component but its
 *         dimension is not configured as hierarchical
 * @throws IOException declared by the signature; presumably raised by the taxonomy writer
 */
private void processFacetFields(TaxonomyWriter taxoWriter, Map<String, List<FacetField>> byField, Document doc) throws IOException {
    for (Map.Entry<String, List<FacetField>> entry : byField.entrySet()) {
        final String fieldName = entry.getKey();
        final IntsRefBuilder collectedOrds = new IntsRefBuilder();

        for (FacetField facet : entry.getValue()) {
            final FacetsConfig.DimConfig dimConfig = getDimConfig(facet.dim);
            if (facet.path.length > 1 && !dimConfig.hierarchical) {
                throw new IllegalArgumentException("dimension \"" + facet.dim + "\" is not hierarchical yet has " + facet.path.length + " components");
            }

            final FacetLabel label = new FacetLabel(facet.dim, facet.path);
            checkTaxoWriter(taxoWriter);
            final int categoryOrd = taxoWriter.addCategory(label);
            collectedOrds.append(categoryOrd);

            if (dimConfig.multiValued && (dimConfig.hierarchical || dimConfig.requireDimCount)) {
                // Walk up the taxonomy, recording every ancestor with a positive ordinal.
                for (int ancestor = taxoWriter.getParent(categoryOrd); ancestor > 0; ancestor = taxoWriter.getParent(ancestor)) {
                    collectedOrds.append(ancestor);
                }
                if (!dimConfig.requireDimCount) {
                    // Drop the last appended (dimension) ordinal again.
                    collectedOrds.setLength(collectedOrds.length() - 1);
                }
            }

            // Drill-down terms: one per path prefix, from length 1 up to the full label.
            int prefixLength = 1;
            while (prefixLength <= label.length) {
                doc.add(new StringField(fieldName, pathToString(label.components, prefixLength), Field.Store.NO));
                prefixLength++;
            }
        }

        // Facet counts are read from this doc-values field.
        doc.add(new BinaryDocValuesField(fieldName, dedupAndEncode(collectedOrds.get())));
    }
}
Also used : FacetLabel(org.apache.lucene.facet.taxonomy.FacetLabel) FloatAssociationFacetField(org.apache.lucene.facet.taxonomy.FloatAssociationFacetField) AssociationFacetField(org.apache.lucene.facet.taxonomy.AssociationFacetField) IntAssociationFacetField(org.apache.lucene.facet.taxonomy.IntAssociationFacetField) SortedSetDocValuesFacetField(org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) BinaryDocValuesField(org.apache.lucene.document.BinaryDocValuesField) StringField(org.apache.lucene.document.StringField) ArrayList(java.util.ArrayList) List(java.util.List) HashMap(java.util.HashMap) Map(java.util.Map) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap)

Example 43 with IntsRefBuilder

use of org.apache.lucene.util.IntsRefBuilder in project lucene-solr by apache.

the class MemoryDocValuesConsumer method writeFST.

/**
 * Writes the metadata header for {@code field} (field number, the {@code FST} type
 * byte and the current data file pointer), builds an FST that maps each value to its
 * position in iteration order, saves the FST to {@code data} when one was produced,
 * and finally records the number of values in {@code meta}.
 *
 * @param field field whose number is written to the metadata
 * @param values values to add to the FST, in iteration order
 * @throws IOException if writing to {@code meta} or {@code data} fails
 */
private void writeFST(FieldInfo field, Iterable<BytesRef> values) throws IOException {
    meta.writeVInt(field.number);
    meta.writeByte(FST);
    meta.writeLong(data.getFilePointer());

    final Builder<Long> fstBuilder = new Builder<>(INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton());
    final IntsRefBuilder intsScratch = new IntsRefBuilder();
    long valueCount = 0;
    for (BytesRef value : values) {
        // The output for each value is its zero-based position in the iteration.
        fstBuilder.add(Util.toIntsRef(value, intsScratch), valueCount++);
    }

    final FST<Long> built = fstBuilder.finish();
    // finish() may yield null (the original guards against it), so only save a real FST.
    if (built != null) {
        built.save(data);
    }
    meta.writeVLong(valueCount);
}
Also used : PositiveIntOutputs(org.apache.lucene.util.fst.PositiveIntOutputs) Builder(org.apache.lucene.util.fst.Builder) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) BytesRef(org.apache.lucene.util.BytesRef)

Example 44 with IntsRefBuilder

use of org.apache.lucene.util.IntsRefBuilder in project lucene-solr by apache.

the class MemoryDocValuesProducer method getSortedNonIterator.

/**
 * Returns a {@link LegacySortedDocValues} view for {@code field}, backed by the
 * field's FST (ordinal/term mapping) and by the numeric doc-to-ordinal values
 * obtained from {@code getNumericNonIterator(field)}.
 *
 * @param field the field to load; its name is the key into {@code fsts} and {@code fstInstances}
 * @return an empty instance when the entry has no ordinals, otherwise an FST-backed instance
 * @throws IOException if reading the FST from the cloned data input fails
 */
private LegacySortedDocValues getSortedNonIterator(FieldInfo field) throws IOException {
    final FSTEntry entry = fsts.get(field.name);
    // Nothing to look up when the field has no ordinals at all.
    if (entry.numOrds == 0) {
        return DocValues.emptyLegacySorted();
    }
    FST<Long> instance;
    // Load the FST lazily under the producer's monitor so the cache lookup and the
    // load-and-store below happen as one step.
    synchronized (this) {
        instance = fstInstances.get(field.name);
        if (instance == null) {
            // Read from a clone positioned at this entry's offset; the shared input is not moved.
            IndexInput data = this.data.clone();
            data.seek(entry.offset);
            instance = new FST<>(data, PositiveIntOutputs.getSingleton());
            // Instances loaded while merging are neither cached nor counted in ramBytesUsed.
            if (!merging) {
                ramBytesUsed.addAndGet(instance.ramBytesUsed());
                fstInstances.put(field.name, instance);
            }
        }
    }
    final LegacyNumericDocValues docToOrd = getNumericNonIterator(field);
    final FST<Long> fst = instance;
    // per-thread resources
    final BytesReader in = fst.getBytesReader();
    final Arc<Long> firstArc = new Arc<>();
    final Arc<Long> scratchArc = new Arc<>();
    final IntsRefBuilder scratchInts = new IntsRefBuilder();
    final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<>(fst);
    return new LegacySortedDocValues() {

        // Reused buffer for the term produced by lookupOrd.
        final BytesRefBuilder term = new BytesRefBuilder();

        @Override
        public int getOrd(int docID) {
            // The ordinal is stored as a numeric value per document and narrowed to int here.
            return (int) docToOrd.get(docID);
        }

        @Override
        public BytesRef lookupOrd(int ord) {
            try {
                // Rewind the reader and restart from the first arc before searching by output.
                in.setPosition(0);
                fst.getFirstArc(firstArc);
                IntsRef output = Util.getByOutput(fst, ord, in, firstArc, scratchArc, scratchInts);
                return Util.toBytesRef(output, term);
            } catch (IOException bogus) {
                // The interface method declares no checked exception, so rethrow unchecked.
                throw new RuntimeException(bogus);
            }
        }

        @Override
        public int lookupTerm(BytesRef key) {
            try {
                InputOutput<Long> o = fstEnum.seekCeil(key);
                if (o == null) {
                    // seekCeil found no entry: return -(valueCount) - 1.
                    return -getValueCount() - 1;
                } else if (o.input.equals(key)) {
                    // Exact match: the FST output is the ordinal.
                    return o.output.intValue();
                } else {
                    // Not an exact match: encode the found entry's output as -(output) - 1.
                    return (int) -o.output - 1;
                }
            } catch (IOException bogus) {
                // The interface method declares no checked exception, so rethrow unchecked.
                throw new RuntimeException(bogus);
            }
        }

        @Override
        public int getValueCount() {
            return (int) entry.numOrds;
        }

        @Override
        public TermsEnum termsEnum() {
            return new FSTTermsEnum(fst);
        }
    };
}
Also used : BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) IOException(java.io.IOException) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) BytesRefFSTEnum(org.apache.lucene.util.fst.BytesRefFSTEnum) BytesReader(org.apache.lucene.util.fst.FST.BytesReader) Arc(org.apache.lucene.util.fst.FST.Arc) AtomicLong(java.util.concurrent.atomic.AtomicLong) ChecksumIndexInput(org.apache.lucene.store.ChecksumIndexInput) IndexInput(org.apache.lucene.store.IndexInput) IntsRef(org.apache.lucene.util.IntsRef) BytesRef(org.apache.lucene.util.BytesRef)

Example 45 with IntsRefBuilder

use of org.apache.lucene.util.IntsRefBuilder in project lucene-solr by apache.

the class FreeTextSuggester method build.

/**
 * Build the suggest index, using up to the specified
 * amount of temporary RAM while building.  Note that
 * the weights for the suggestions are ignored.
 *
 * <p>The surface forms are first indexed into a temporary on-disk index; the terms of
 * its "body" field are then moved into an FST, and the temporary index is rolled back
 * and deleted.
 *
 * @param iterator source of surface forms; must not provide payloads or contexts
 * @param ramBufferSizeMB RAM buffer size, in MB, given to the temporary index writer
 * @throws IllegalArgumentException if the iterator has payloads or contexts, if no
 *         terms (and hence no FST) result from the input, or if a term's gram count
 *         exceeds {@code grams}
 * @throws IOException if creating, writing, reading or removing the temporary index fails
 */
public void build(InputIterator iterator, double ramBufferSizeMB) throws IOException {
    if (iterator.hasPayloads()) {
        throw new IllegalArgumentException("this suggester doesn't support payloads");
    }
    if (iterator.hasContexts()) {
        throw new IllegalArgumentException("this suggester doesn't support contexts");
    }
    String prefix = getClass().getSimpleName();
    Path tempIndexPath = Files.createTempDirectory(prefix + ".index.");
    // Declared outside the try and assigned inside it, so that a failure while opening
    // the directory or the writer still reaches the cleanup in the finally block
    // (IOUtils ignores null arguments).
    Directory dir = null;
    IndexWriter writer = null;
    IndexReader reader = null;
    boolean success = false;
    try {
        dir = FSDirectory.open(tempIndexPath);
        IndexWriterConfig iwc = new IndexWriterConfig(indexAnalyzer);
        iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        iwc.setRAMBufferSizeMB(ramBufferSizeMB);
        writer = new IndexWriter(dir, iwc);
        FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
        // TODO: if only we had IndexOptions.TERMS_ONLY...
        ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
        ft.setOmitNorms(true);
        ft.freeze();
        // A single document/field pair is reused for every surface form.
        Document doc = new Document();
        Field field = new Field("body", "", ft);
        doc.add(field);
        totTokens = 0;
        count = 0;
        while (true) {
            BytesRef surfaceForm = iterator.next();
            if (surfaceForm == null) {
                break;
            }
            field.setStringValue(surfaceForm.utf8ToString());
            writer.addDocument(doc);
            count++;
        }
        reader = DirectoryReader.open(writer);
        Terms terms = MultiFields.getTerms(reader, "body");
        if (terms == null) {
            throw new IllegalArgumentException("need at least one suggestion");
        }
        // Move all ngrams into an FST:
        TermsEnum termsEnum = terms.iterator();
        Outputs<Long> outputs = PositiveIntOutputs.getSingleton();
        Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
        IntsRefBuilder scratchInts = new IntsRefBuilder();
        while (true) {
            BytesRef term = termsEnum.next();
            if (term == null) {
                break;
            }
            int ngramCount = countGrams(term);
            if (ngramCount > grams) {
                throw new IllegalArgumentException("tokens must not contain separator byte; got token=" + term + " but gramCount=" + ngramCount + ", which is greater than expected max ngram size=" + grams);
            }
            // Only unigrams contribute to the total token count.
            if (ngramCount == 1) {
                totTokens += termsEnum.totalTermFreq();
            }
            builder.add(Util.toIntsRef(term, scratchInts), encodeWeight(termsEnum.totalTermFreq()));
        }
        fst = builder.finish();
        if (fst == null) {
            throw new IllegalArgumentException("need at least one suggestion");
        }
        // Writer was only temporary, to count up bigrams,
        // which we transferred to the FST, so now we
        // rollback:
        writer.rollback();
        success = true;
    } finally {
        try {
            if (success) {
                // The writer was already rolled back above; only reader and dir remain open.
                IOUtils.close(reader, dir);
            } else {
                // Best-effort cleanup that must not mask the exception already in flight.
                IOUtils.closeWhileHandlingException(reader, writer, dir);
            }
        } finally {
            // Always remove the temporary index directory, whatever happened above.
            IOUtils.rm(tempIndexPath);
        }
    }
}
Also used : Path(java.nio.file.Path) Builder(org.apache.lucene.util.fst.Builder) BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) Terms(org.apache.lucene.index.Terms) Document(org.apache.lucene.document.Document) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) FieldType(org.apache.lucene.document.FieldType) TermsEnum(org.apache.lucene.index.TermsEnum) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) IndexWriter(org.apache.lucene.index.IndexWriter) IndexReader(org.apache.lucene.index.IndexReader) BytesRef(org.apache.lucene.util.BytesRef) Directory(org.apache.lucene.store.Directory) FSDirectory(org.apache.lucene.store.FSDirectory) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Aggregations

IntsRefBuilder (org.apache.lucene.util.IntsRefBuilder)55 BytesRef (org.apache.lucene.util.BytesRef)32 BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder)25 IntsRef (org.apache.lucene.util.IntsRef)19 ArrayList (java.util.ArrayList)10 HashSet (java.util.HashSet)10 Builder (org.apache.lucene.util.fst.Builder)10 Arc (org.apache.lucene.util.fst.FST.Arc)9 BytesReader (org.apache.lucene.util.fst.FST.BytesReader)8 Map (java.util.Map)7 HashMap (java.util.HashMap)5 ByteArrayDataInput (org.apache.lucene.store.ByteArrayDataInput)5 CharsRef (org.apache.lucene.util.CharsRef)5 TestUtil (org.apache.lucene.util.TestUtil)5 FSTTester.getRandomString (org.apache.lucene.util.fst.FSTTester.getRandomString)5 FSTTester.simpleRandomString (org.apache.lucene.util.fst.FSTTester.simpleRandomString)5 TreeMap (java.util.TreeMap)4 CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder)4 Pair (org.apache.lucene.util.fst.PairOutputs.Pair)4 IOException (java.io.IOException)3