Search in sources :

Example 66 with BytesRefBuilder

use of org.apache.lucene.util.BytesRefBuilder in project lucene-solr by apache.

the class SimpleTextUtil method checkFooter.

/**
 * Verifies the trailing checksum line of a SimpleText file.
 *
 * <p>The checksum reported by {@code input} is captured before the footer line is
 * read, formatted as a zero-padded 20-digit decimal, and compared with the text that
 * follows the {@code CHECKSUM} prefix on that line. Finally the file pointer must sit
 * exactly at the end of the file.
 *
 * @param input the checksumming input positioned at the start of the footer line
 * @throws CorruptIndexException if the line does not start with {@code CHECKSUM}, the
 *         checksums differ, or bytes remain after the footer
 * @throws IOException if reading from {@code input} fails
 */
public static void checkFooter(ChecksumIndexInput input) throws IOException {
    final BytesRefBuilder line = new BytesRefBuilder();
    // Snapshot the running checksum first; the footer line is read only afterwards.
    final String expected = String.format(Locale.ROOT, "%020d", input.getChecksum());
    readLine(input, line);

    if (!StringHelper.startsWith(line.get(), CHECKSUM)) {
        throw new CorruptIndexException("SimpleText failure: expected checksum line but got " + line.get().utf8ToString(), input);
    }

    // Everything after the CHECKSUM prefix is the recorded checksum text.
    final int prefixLength = CHECKSUM.length;
    final BytesRef recorded = new BytesRef(line.bytes(), prefixLength, line.length() - prefixLength);
    final String actual = recorded.utf8ToString();
    if (expected.equals(actual) == false) {
        throw new CorruptIndexException("SimpleText checksum failure: " + actual + " != " + expected, input);
    }

    final boolean atEndOfFile = input.length() == input.getFilePointer();
    if (!atEndOfFile) {
        throw new CorruptIndexException("Unexpected stuff at the end of file, please be careful with your text editor!", input);
    }
}
Also used : BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) CorruptIndexException(org.apache.lucene.index.CorruptIndexException) BytesRef(org.apache.lucene.util.BytesRef)

Example 67 with BytesRefBuilder

use of org.apache.lucene.util.BytesRefBuilder in project lucene-solr by apache.

the class SimpleTextCompoundFormat method getCompoundReader.

/**
 * Opens the compound data file of segment {@code si} and exposes the sub-files listed
 * in its table of contents as a read-only {@link Directory}.
 *
 * <p>The trailer at the end of the data file gives the position of the table of
 * contents; the table lists, per entry, a file name, a start offset and an end offset.
 * Sub-files are served as slices of the single underlying input, which is closed when
 * the returned directory is closed.
 *
 * <p>If anything fails before the directory is returned, the underlying input is
 * closed here so that the file handle is not leaked.
 *
 * @param dir directory holding the compound data file
 * @param si segment whose compound file is opened
 * @param context I/O context used to open the data file
 * @return a read-only directory view; all write operations throw
 *         {@link UnsupportedOperationException}
 * @throws CorruptIndexException if the trailer offset cannot be parsed
 * @throws IOException if the data file cannot be opened or read
 */
@Override
public Directory getCompoundReader(Directory dir, SegmentInfo si, IOContext context) throws IOException {
    String dataFile = IndexFileNames.segmentFileName(si.name, "", DATA_EXTENSION);
    final IndexInput in = dir.openInput(dataFile, context);
    try {
        BytesRefBuilder scratch = new BytesRefBuilder();
        // first get to TOC:
        DecimalFormat df = new DecimalFormat(OFFSETPATTERN, DecimalFormatSymbols.getInstance(Locale.ROOT));
        // The trailer line is TABLEPOS + an offset formatted with OFFSETPATTERN + one more byte.
        long pos = in.length() - TABLEPOS.length - OFFSETPATTERN.length() - 1;
        in.seek(pos);
        SimpleTextUtil.readLine(in, scratch);
        assert StringHelper.startsWith(scratch.get(), TABLEPOS);
        long tablePos = -1;
        try {
            tablePos = df.parse(stripPrefix(scratch, TABLEPOS)).longValue();
        } catch (ParseException e) {
            // Keep the parse failure as the cause so the root problem is not lost.
            throw new CorruptIndexException("can't parse CFS trailer, got: " + scratch.get().utf8ToString(), in, e);
        }
        // seek to TOC and read it
        in.seek(tablePos);
        SimpleTextUtil.readLine(in, scratch);
        assert StringHelper.startsWith(scratch.get(), TABLE);
        int numEntries = Integer.parseInt(stripPrefix(scratch, TABLE));
        final String[] fileNames = new String[numEntries];
        final long[] startOffsets = new long[numEntries];
        final long[] endOffsets = new long[numEntries];
        for (int i = 0; i < numEntries; i++) {
            SimpleTextUtil.readLine(in, scratch);
            assert StringHelper.startsWith(scratch.get(), TABLENAME);
            fileNames[i] = si.name + IndexFileNames.stripSegmentName(stripPrefix(scratch, TABLENAME));
            if (i > 0) {
                // files must be unique and in sorted order
                assert fileNames[i].compareTo(fileNames[i - 1]) > 0;
            }
            SimpleTextUtil.readLine(in, scratch);
            assert StringHelper.startsWith(scratch.get(), TABLESTART);
            startOffsets[i] = Long.parseLong(stripPrefix(scratch, TABLESTART));
            SimpleTextUtil.readLine(in, scratch);
            assert StringHelper.startsWith(scratch.get(), TABLEEND);
            endOffsets[i] = Long.parseLong(stripPrefix(scratch, TABLEEND));
        }
        return new Directory() {

            // Locates a sub-file by binary search; relies on fileNames being sorted.
            private int getIndex(String name) throws IOException {
                int index = Arrays.binarySearch(fileNames, name);
                if (index < 0) {
                    throw new FileNotFoundException("No sub-file found (fileName=" + name + " files: " + Arrays.toString(fileNames) + ")");
                }
                return index;
            }

            @Override
            public String[] listAll() throws IOException {
                ensureOpen();
                // Hand out a copy so callers cannot modify the table.
                return fileNames.clone();
            }

            @Override
            public long fileLength(String name) throws IOException {
                ensureOpen();
                int index = getIndex(name);
                return endOffsets[index] - startOffsets[index];
            }

            @Override
            public IndexInput openInput(String name, IOContext context) throws IOException {
                ensureOpen();
                int index = getIndex(name);
                return in.slice(name, startOffsets[index], endOffsets[index] - startOffsets[index]);
            }

            @Override
            public void close() throws IOException {
                in.close();
            }

            // write methods: disabled
            @Override
            public IndexOutput createOutput(String name, IOContext context) {
                throw new UnsupportedOperationException();
            }

            @Override
            public IndexOutput createTempOutput(String prefix, String suffix, IOContext context) {
                throw new UnsupportedOperationException();
            }

            @Override
            public void sync(Collection<String> names) {
                throw new UnsupportedOperationException();
            }

            @Override
            public void deleteFile(String name) {
                throw new UnsupportedOperationException();
            }

            @Override
            public void rename(String source, String dest) {
                throw new UnsupportedOperationException();
            }

            @Override
            public void syncMetaData() {
                throw new UnsupportedOperationException();
            }

            @Override
            public Lock obtainLock(String name) {
                throw new UnsupportedOperationException();
            }
        };
    } catch (Throwable t) {
        // No Directory was handed out, so nobody else will ever close the input.
        try {
            in.close();
        } catch (Throwable closeFailure) {
            // Report the original failure; keep the close failure attached to it.
            t.addSuppressed(closeFailure);
        }
        throw t;
    }
}
Also used : BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) DecimalFormat(java.text.DecimalFormat) FileNotFoundException(java.io.FileNotFoundException) CorruptIndexException(org.apache.lucene.index.CorruptIndexException) IndexInput(org.apache.lucene.store.IndexInput) IOContext(org.apache.lucene.store.IOContext) Collection(java.util.Collection) ParseException(java.text.ParseException) Directory(org.apache.lucene.store.Directory)

Example 68 with BytesRefBuilder

use of org.apache.lucene.util.BytesRefBuilder in project lucene-solr by apache.

the class MemoryDocValuesProducer method getSortedSet.

/**
 * Returns the sorted-set doc values for {@code field}.
 *
 * <p>Fields flagged as singleton are served by wrapping {@code getSorted(field)}; fields
 * whose FST entry has zero ordinals get the shared empty instance. Otherwise the
 * field's FST is loaded (once, under this object's lock) and the per-document ordinal
 * lists are decoded from the legacy binary values of the same field.
 *
 * @param field the field to read
 * @return sorted-set doc values for the field
 * @throws IOException if the FST or the binary values cannot be read
 */
@Override
public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
    SortedSetEntry sortedSetEntry = sortedSets.get(field.name);
    if (sortedSetEntry.singleton) {
        return DocValues.singleton(getSorted(field));
    }
    final FSTEntry entry = fsts.get(field.name);
    if (entry.numOrds == 0) {
        // empty FST!
        return DocValues.emptySortedSet();
    }
    FST<Long> instance;
    synchronized (this) {
        instance = fstInstances.get(field.name);
        if (instance == null) {
            // Load from a private clone so the shared input's position is left alone.
            IndexInput data = this.data.clone();
            data.seek(entry.offset);
            instance = new FST<>(data, PositiveIntOutputs.getSingleton());
            if (!merging) {
                // Only cache (and account for) the FST outside of merging.
                ramBytesUsed.addAndGet(instance.ramBytesUsed());
                fstInstances.put(field.name, instance);
            }
        }
    }
    final LegacyBinaryDocValues docToOrds = getLegacyBinary(field);
    final FST<Long> fst = instance;
    // per-thread resources
    final BytesReader in = fst.getBytesReader();
    final Arc<Long> firstArc = new Arc<>();
    final Arc<Long> scratchArc = new Arc<>();
    final IntsRefBuilder scratchInts = new IntsRefBuilder();
    final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<>(fst);
    final ByteArrayDataInput input = new ByteArrayDataInput();
    return new LegacySortedSetDocValuesWrapper(new LegacySortedSetDocValues() {

        // Reused buffer backing the BytesRef returned by lookupOrd.
        final BytesRefBuilder term = new BytesRefBuilder();

        // Encoded ordinal list of the current document.
        BytesRef ref;

        // Last ordinal returned for the current document; ordinals are stored as deltas.
        long currentOrd;

        @Override
        public long nextOrd() {
            if (input.eof()) {
                return NO_MORE_ORDS;
            } else {
                // Each vlong is the gap to the previous ordinal.
                currentOrd += input.readVLong();
                return currentOrd;
            }
        }

        @Override
        public void setDocument(int docID) {
            ref = docToOrds.get(docID);
            input.reset(ref.bytes, ref.offset, ref.length);
            currentOrd = 0;
        }

        @Override
        public BytesRef lookupOrd(long ord) {
            try {
                in.setPosition(0);
                fst.getFirstArc(firstArc);
                IntsRef output = Util.getByOutput(fst, ord, in, firstArc, scratchArc, scratchInts);
                return Util.toBytesRef(output, term);
            } catch (IOException bogus) {
                throw new RuntimeException(bogus);
            }
        }

        @Override
        public long lookupTerm(BytesRef key) {
            try {
                InputOutput<Long> o = fstEnum.seekCeil(key);
                if (o == null) {
                    // Key sorts after every term: insertion point is the value count.
                    return -getValueCount() - 1;
                } else if (o.input.equals(key)) {
                    // Return the full long ordinal; narrowing to int would truncate large ordinals.
                    return o.output;
                } else {
                    // Not found: encode the insertion point as (-ord - 1).
                    return -o.output - 1;
                }
            } catch (IOException bogus) {
                throw new RuntimeException(bogus);
            }
        }

        @Override
        public long getValueCount() {
            return entry.numOrds;
        }

        @Override
        public TermsEnum termsEnum() {
            return new FSTTermsEnum(fst);
        }
    }, maxDoc);
}
Also used : ChecksumIndexInput(org.apache.lucene.store.ChecksumIndexInput) IndexInput(org.apache.lucene.store.IndexInput) IntsRef(org.apache.lucene.util.IntsRef) BytesRef(org.apache.lucene.util.BytesRef) BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) InputOutput(org.apache.lucene.util.fst.BytesRefFSTEnum.InputOutput) IOException(java.io.IOException) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) BytesRefFSTEnum(org.apache.lucene.util.fst.BytesRefFSTEnum) ByteArrayDataInput(org.apache.lucene.store.ByteArrayDataInput) BytesReader(org.apache.lucene.util.fst.FST.BytesReader) Arc(org.apache.lucene.util.fst.FST.Arc) AtomicLong(java.util.concurrent.atomic.AtomicLong)

Example 69 with BytesRefBuilder

use of org.apache.lucene.util.BytesRefBuilder in project lucene-solr by apache.

the class FuzzyTermsEnum method getAutomatonEnum.

/**
 * Builds a terms enum that intersects the term dictionary with the automaton compiled
 * for {@code editDistance}.
 *
 * <p>When {@code lastTerm} is {@code null} (the first enum being pulled) no initial seek
 * term is used. Otherwise, e.g. when switching to ed=1 after iterating with ed=2, the
 * seek term is the automaton's floor of {@code lastTerm}.
 */
private TermsEnum getAutomatonEnum(int editDistance, BytesRef lastTerm) throws IOException {
    assert editDistance < automata.length;
    final CompiledAutomaton automaton = automata[editDistance];
    final BytesRef seekFrom = (lastTerm != null)
        ? automaton.floor(lastTerm, new BytesRefBuilder())
        : null;
    return terms.intersect(automaton, seekFrom);
}
Also used : BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) CompiledAutomaton(org.apache.lucene.util.automaton.CompiledAutomaton) BytesRef(org.apache.lucene.util.BytesRef)

Example 70 with BytesRefBuilder

use of org.apache.lucene.util.BytesRefBuilder in project lucene-solr by apache.

the class TopTermsRewrite method rewrite.

/**
 * Rewrites {@code query} into a query over at most {@code min(size, getMaxSize())}
 * terms, keeping the terms that compare highest in the priority queue.
 *
 * <p>Terms are collected through {@code collectTerms}. A term seen again (tracked in a
 * map keyed by its bytes) only has its term state extended; a new term is added to the
 * queue and, once the queue exceeds the limit, the head of the queue is evicted and its
 * {@code ScoreTerm} is recycled for the next candidate. After collection the surviving
 * terms are sorted with {@code scoreTermSortByTermComp} and added as clauses, with
 * negative boosts clamped to {@code 0.0f}.
 *
 * @param reader the reader whose terms are enumerated
 * @param query the multi-term query being rewritten
 * @return the query built from the selected terms
 * @throws IOException if enumerating terms fails
 */
@Override
public final Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException {
    final int maxSize = Math.min(size, getMaxSize());
    final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<>();
    collectTerms(reader, query, new TermCollector() {

        private final MaxNonCompetitiveBoostAttribute maxBoostAtt = attributes.addAttribute(MaxNonCompetitiveBoostAttribute.class);

        // Terms currently in the queue, keyed by their bytes.
        private final Map<BytesRef, ScoreTerm> visitedTerms = new HashMap<>();

        private TermsEnum termsEnum;

        private BoostAttribute boostAtt;

        // Spare ScoreTerm that the next new term is written into.
        private ScoreTerm st;

        @Override
        public void setNextEnum(TermsEnum termsEnum) {
            this.termsEnum = termsEnum;
            // A new enum starts a new term sequence, so forget the previous term.
            assert compareToLastTerm(null);
            // lazy init the initial ScoreTerm because comparator is not known on ctor:
            if (st == null)
                st = new ScoreTerm(new TermContext(topReaderContext));
            boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class);
        }

        // for assert:
        private BytesRefBuilder lastTerm;

        // Always returns true so it can be invoked from an assert; the real check is the
        // nested assert that terms arrive in strictly increasing order.
        private boolean compareToLastTerm(BytesRef t) {
            if (lastTerm == null && t != null) {
                lastTerm = new BytesRefBuilder();
                lastTerm.append(t);
            } else if (t == null) {
                lastTerm = null;
            } else {
                // Print the term bytes, not the builder object, so a failure is readable.
                assert lastTerm.get().compareTo(t) < 0 : "lastTerm=" + lastTerm.get() + " t=" + t;
                lastTerm.copyBytes(t);
            }
            return true;
        }

        @Override
        public boolean collect(BytesRef bytes) throws IOException {
            final float boost = boostAtt.getBoost();
            // terms in order
            assert compareToLastTerm(bytes);
            // ignore uncompetitive hits
            if (stQueue.size() == maxSize) {
                final ScoreTerm t = stQueue.peek();
                if (boost < t.boost)
                    return true;
                if (boost == t.boost && bytes.compareTo(t.bytes.get()) > 0)
                    return true;
            }
            ScoreTerm t = visitedTerms.get(bytes);
            final TermState state = termsEnum.termState();
            assert state != null;
            if (t != null) {
                // if the term is already in the PQ, only update docFreq of term in PQ
                assert t.boost == boost : "boost should be equal in all segment TermsEnums";
                t.termState.register(state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
            } else {
                // add new entry in PQ, we must clone the term, else it may get overwritten!
                st.bytes.copyBytes(bytes);
                st.boost = boost;
                visitedTerms.put(st.bytes.get(), st);
                assert st.termState.docFreq() == 0;
                st.termState.register(state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
                stQueue.offer(st);
                // possibly drop entries from queue
                if (stQueue.size() > maxSize) {
                    // Recycle the evicted entry as the next spare ScoreTerm.
                    st = stQueue.poll();
                    visitedTerms.remove(st.bytes.get());
                    // reset the termstate!
                    st.termState.clear();
                } else {
                    st = new ScoreTerm(new TermContext(topReaderContext));
                }
                assert stQueue.size() <= maxSize : "the PQ size must be limited to maxSize";
                // set maxBoostAtt with values to help FuzzyTermsEnum to optimize
                if (stQueue.size() == maxSize) {
                    t = stQueue.peek();
                    maxBoostAtt.setMaxNonCompetitiveBoost(t.boost);
                    maxBoostAtt.setCompetitiveTerm(t.bytes.get());
                }
            }
            return true;
        }
    });
    final B b = getTopLevelBuilder();
    final ScoreTerm[] scoreTerms = stQueue.toArray(new ScoreTerm[stQueue.size()]);
    ArrayUtil.timSort(scoreTerms, scoreTermSortByTermComp);
    for (final ScoreTerm st : scoreTerms) {
        final Term term = new Term(query.field, st.bytes.toBytesRef());
        // We allow negative term scores (fuzzy query does this, for example) while collecting the terms,
        // but truncate such boosts to 0.0f when building the query:
        // add to query
        addClause(b, term, st.termState.docFreq(), Math.max(0.0f, st.boost), st.termState);
    }
    return build(b);
}
Also used : BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) HashMap(java.util.HashMap) IOException(java.io.IOException) Term(org.apache.lucene.index.Term) PriorityQueue(java.util.PriorityQueue) TermContext(org.apache.lucene.index.TermContext) TermsEnum(org.apache.lucene.index.TermsEnum) TermState(org.apache.lucene.index.TermState) BytesRef(org.apache.lucene.util.BytesRef)

Aggregations

BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder)150 BytesRef (org.apache.lucene.util.BytesRef)79 ArrayList (java.util.ArrayList)21 IOException (java.io.IOException)17 Term (org.apache.lucene.index.Term)16 HashSet (java.util.HashSet)15 ChecksumIndexInput (org.apache.lucene.store.ChecksumIndexInput)14 FieldType (org.apache.solr.schema.FieldType)14 IndexInput (org.apache.lucene.store.IndexInput)12 BytesRefIterator (org.apache.lucene.util.BytesRefIterator)10 CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder)10 IntsRef (org.apache.lucene.util.IntsRef)10 SchemaField (org.apache.solr.schema.SchemaField)10 BufferedChecksumIndexInput (org.apache.lucene.store.BufferedChecksumIndexInput)9 ParseException (java.text.ParseException)8 IntsRefBuilder (org.apache.lucene.util.IntsRefBuilder)8 DecimalFormat (java.text.DecimalFormat)7 HashMap (java.util.HashMap)7 Map (java.util.Map)7 Directory (org.apache.lucene.store.Directory)7