Use of org.apache.lucene.util.BytesRefBuilder in the lucene-solr project (Apache).
Example: class SimpleTextUtil, method checkFooter.
public static void checkFooter(ChecksumIndexInput input) throws IOException {
  BytesRefBuilder scratch = new BytesRefBuilder();
  String expectedChecksum = String.format(Locale.ROOT, "%020d", input.getChecksum());
  readLine(input, scratch);
  if (StringHelper.startsWith(scratch.get(), CHECKSUM) == false) {
    throw new CorruptIndexException("SimpleText failure: expected checksum line but got " + scratch.get().utf8ToString(), input);
  }
  String actualChecksum = new BytesRef(scratch.bytes(), CHECKSUM.length, scratch.length() - CHECKSUM.length).utf8ToString();
  if (!expectedChecksum.equals(actualChecksum)) {
    throw new CorruptIndexException("SimpleText checksum failure: " + actualChecksum + " != " + expectedChecksum, input);
  }
  if (input.length() != input.getFilePointer()) {
    throw new CorruptIndexException("Unexpected stuff at the end of file, please be careful with your text editor!", input);
  }
}
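For context, a minimal sketch of how a caller might drive this check: a SimpleText reader typically opens the file as a ChecksumIndexInput, reads the body line by line into a shared BytesRefBuilder scratch, and calls checkFooter last. The file name and the surrounding parsing step are assumptions for illustration, not code from lucene-solr.

// Hypothetical caller; the file name "_0.dat" and the body parsing are illustrative only.
try (ChecksumIndexInput in = dir.openChecksumInput("_0.dat", IOContext.READONCE)) {
  BytesRefBuilder scratch = new BytesRefBuilder();
  SimpleTextUtil.readLine(in, scratch);             // read one SimpleText line into scratch
  String firstLine = scratch.get().utf8ToString();  // ... parse the body line by line ...
  SimpleTextUtil.checkFooter(in);                   // verifies the checksum line and the EOF position
}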
Use of org.apache.lucene.util.BytesRefBuilder in the lucene-solr project (Apache).
Example: class SimpleTextCompoundFormat, method getCompoundReader.
@Override
public Directory getCompoundReader(Directory dir, SegmentInfo si, IOContext context) throws IOException {
  String dataFile = IndexFileNames.segmentFileName(si.name, "", DATA_EXTENSION);
  final IndexInput in = dir.openInput(dataFile, context);
  BytesRefBuilder scratch = new BytesRefBuilder();
  // first get to TOC:
  DecimalFormat df = new DecimalFormat(OFFSETPATTERN, DecimalFormatSymbols.getInstance(Locale.ROOT));
  long pos = in.length() - TABLEPOS.length - OFFSETPATTERN.length() - 1;
  in.seek(pos);
  SimpleTextUtil.readLine(in, scratch);
  assert StringHelper.startsWith(scratch.get(), TABLEPOS);
  long tablePos = -1;
  try {
    tablePos = df.parse(stripPrefix(scratch, TABLEPOS)).longValue();
  } catch (ParseException e) {
    throw new CorruptIndexException("can't parse CFS trailer, got: " + scratch.get().utf8ToString(), in);
  }
  // seek to TOC and read it
  in.seek(tablePos);
  SimpleTextUtil.readLine(in, scratch);
  assert StringHelper.startsWith(scratch.get(), TABLE);
  int numEntries = Integer.parseInt(stripPrefix(scratch, TABLE));
  final String[] fileNames = new String[numEntries];
  final long[] startOffsets = new long[numEntries];
  final long[] endOffsets = new long[numEntries];
  for (int i = 0; i < numEntries; i++) {
    SimpleTextUtil.readLine(in, scratch);
    assert StringHelper.startsWith(scratch.get(), TABLENAME);
    fileNames[i] = si.name + IndexFileNames.stripSegmentName(stripPrefix(scratch, TABLENAME));
    if (i > 0) {
      // files must be unique and in sorted order
      assert fileNames[i].compareTo(fileNames[i - 1]) > 0;
    }
    SimpleTextUtil.readLine(in, scratch);
    assert StringHelper.startsWith(scratch.get(), TABLESTART);
    startOffsets[i] = Long.parseLong(stripPrefix(scratch, TABLESTART));
    SimpleTextUtil.readLine(in, scratch);
    assert StringHelper.startsWith(scratch.get(), TABLEEND);
    endOffsets[i] = Long.parseLong(stripPrefix(scratch, TABLEEND));
  }
  return new Directory() {
    private int getIndex(String name) throws IOException {
      int index = Arrays.binarySearch(fileNames, name);
      if (index < 0) {
        throw new FileNotFoundException("No sub-file found (fileName=" + name + " files: " + Arrays.toString(fileNames) + ")");
      }
      return index;
    }

    @Override
    public String[] listAll() throws IOException {
      ensureOpen();
      return fileNames.clone();
    }

    @Override
    public long fileLength(String name) throws IOException {
      ensureOpen();
      int index = getIndex(name);
      return endOffsets[index] - startOffsets[index];
    }

    @Override
    public IndexInput openInput(String name, IOContext context) throws IOException {
      ensureOpen();
      int index = getIndex(name);
      return in.slice(name, startOffsets[index], endOffsets[index] - startOffsets[index]);
    }

    @Override
    public void close() throws IOException {
      in.close();
    }

    // write methods: disabled

    @Override
    public IndexOutput createOutput(String name, IOContext context) {
      throw new UnsupportedOperationException();
    }

    @Override
    public IndexOutput createTempOutput(String prefix, String suffix, IOContext context) {
      throw new UnsupportedOperationException();
    }

    @Override
    public void sync(Collection<String> names) {
      throw new UnsupportedOperationException();
    }

    @Override
    public void deleteFile(String name) {
      throw new UnsupportedOperationException();
    }

    @Override
    public void rename(String source, String dest) {
      throw new UnsupportedOperationException();
    }

    @Override
    public void syncMetaData() {
      throw new UnsupportedOperationException();
    }

    @Override
    public Lock obtainLock(String name) {
      throw new UnsupportedOperationException();
    }
  };
}
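The returned Directory is a read-only view over one data file. A hedged usage sketch follows; the source dir, the SegmentInfo si, and the printing are assumptions for illustration.

// Illustrative only: open the compound view, list its sub-files, then close it
// (close() on the returned Directory closes the single underlying IndexInput).
Directory cfs = new SimpleTextCompoundFormat().getCompoundReader(dir, si, IOContext.DEFAULT);
try {
  for (String sub : cfs.listAll()) {
    System.out.println(sub + " length=" + cfs.fileLength(sub)); // endOffset - startOffset from the TOC
  }
} finally {
  cfs.close();
}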
Use of org.apache.lucene.util.BytesRefBuilder in the lucene-solr project (Apache).
Example: class MemoryDocValuesProducer, method getSortedSet.
@Override
public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
  SortedSetEntry sortedSetEntry = sortedSets.get(field.name);
  if (sortedSetEntry.singleton) {
    return DocValues.singleton(getSorted(field));
  }
  final FSTEntry entry = fsts.get(field.name);
  if (entry.numOrds == 0) {
    // empty FST!
    return DocValues.emptySortedSet();
  }
  FST<Long> instance;
  synchronized (this) {
    instance = fstInstances.get(field.name);
    if (instance == null) {
      IndexInput data = this.data.clone();
      data.seek(entry.offset);
      instance = new FST<>(data, PositiveIntOutputs.getSingleton());
      if (!merging) {
        ramBytesUsed.addAndGet(instance.ramBytesUsed());
        fstInstances.put(field.name, instance);
      }
    }
  }
  final LegacyBinaryDocValues docToOrds = getLegacyBinary(field);
  final FST<Long> fst = instance;
  // per-thread resources
  final BytesReader in = fst.getBytesReader();
  final Arc<Long> firstArc = new Arc<>();
  final Arc<Long> scratchArc = new Arc<>();
  final IntsRefBuilder scratchInts = new IntsRefBuilder();
  final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<>(fst);
  final ByteArrayDataInput input = new ByteArrayDataInput();
  return new LegacySortedSetDocValuesWrapper(new LegacySortedSetDocValues() {
    final BytesRefBuilder term = new BytesRefBuilder();
    BytesRef ref;
    long currentOrd;

    @Override
    public long nextOrd() {
      if (input.eof()) {
        return NO_MORE_ORDS;
      } else {
        currentOrd += input.readVLong();
        return currentOrd;
      }
    }

    @Override
    public void setDocument(int docID) {
      ref = docToOrds.get(docID);
      input.reset(ref.bytes, ref.offset, ref.length);
      currentOrd = 0;
    }

    @Override
    public BytesRef lookupOrd(long ord) {
      try {
        in.setPosition(0);
        fst.getFirstArc(firstArc);
        IntsRef output = Util.getByOutput(fst, ord, in, firstArc, scratchArc, scratchInts);
        return Util.toBytesRef(output, term);
      } catch (IOException bogus) {
        throw new RuntimeException(bogus);
      }
    }

    @Override
    public long lookupTerm(BytesRef key) {
      try {
        InputOutput<Long> o = fstEnum.seekCeil(key);
        if (o == null) {
          return -getValueCount() - 1;
        } else if (o.input.equals(key)) {
          return o.output.intValue();
        } else {
          return -o.output - 1;
        }
      } catch (IOException bogus) {
        throw new RuntimeException(bogus);
      }
    }

    @Override
    public long getValueCount() {
      return entry.numOrds;
    }

    @Override
    public TermsEnum termsEnum() {
      return new FSTTermsEnum(fst);
    }
  }, maxDoc);
}
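A hedged consumer-side sketch (the producer and field variables are assumptions) of how the wrapped SortedSetDocValues is typically iterated: advance by document, drain that document's ordinals, and resolve each ordinal back to its term.

// Illustrative only: iterate documents, then ords, using the standard doc-values API.
SortedSetDocValues dv = producer.getSortedSet(field);
for (int doc = dv.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = dv.nextDoc()) {
  for (long ord = dv.nextOrd(); ord != SortedSetDocValues.NO_MORE_ORDS; ord = dv.nextOrd()) {
    BytesRef term = dv.lookupOrd(ord);  // resolved through the FST lookupOrd above
    System.out.println(doc + " -> " + term.utf8ToString());
  }
}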
Use of org.apache.lucene.util.BytesRefBuilder in the lucene-solr project (Apache).
Example: class FuzzyTermsEnum, method getAutomatonEnum.
/**
 * return an automata-based enum for matching up to editDistance from
 * lastTerm, if possible
 */
private TermsEnum getAutomatonEnum(int editDistance, BytesRef lastTerm) throws IOException {
  assert editDistance < automata.length;
  final CompiledAutomaton compiled = automata[editDistance];
  BytesRef initialSeekTerm;
  if (lastTerm == null) {
    // This is the first enum we are pulling:
    initialSeekTerm = null;
  } else {
    // We are pulling this enum (e.g., ed=1) after iterating for a while already (e.g., ed=2):
    initialSeekTerm = compiled.floor(lastTerm, new BytesRefBuilder());
  }
  return terms.intersect(compiled, initialSeekTerm);
}
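A minimal sketch of consuming the enum this method returns. The edit distance, the lastVisitedTerm variable, and the printing are illustrative assumptions; inside FuzzyTermsEnum the result actually feeds the enum-switching logic rather than being printed.

// Illustrative only: walk every term within edit distance 1, resuming after the floored seek term.
TermsEnum te = getAutomatonEnum(1, lastVisitedTerm);  // lastVisitedTerm may be null on the first pull
for (BytesRef term = te.next(); term != null; term = te.next()) {
  System.out.println(term.utf8ToString() + " docFreq=" + te.docFreq());
}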
Use of org.apache.lucene.util.BytesRefBuilder in the lucene-solr project (Apache).
Example: class TopTermsRewrite, method rewrite.
@Override
public final Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException {
  final int maxSize = Math.min(size, getMaxSize());
  final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<>();
  collectTerms(reader, query, new TermCollector() {
    private final MaxNonCompetitiveBoostAttribute maxBoostAtt = attributes.addAttribute(MaxNonCompetitiveBoostAttribute.class);
    private final Map<BytesRef, ScoreTerm> visitedTerms = new HashMap<>();
    private TermsEnum termsEnum;
    private BoostAttribute boostAtt;
    private ScoreTerm st;

    @Override
    public void setNextEnum(TermsEnum termsEnum) {
      this.termsEnum = termsEnum;
      assert compareToLastTerm(null);
      // lazy init the initial ScoreTerm because comparator is not known on ctor:
      if (st == null)
        st = new ScoreTerm(new TermContext(topReaderContext));
      boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class);
    }

    // for assert:
    private BytesRefBuilder lastTerm;

    private boolean compareToLastTerm(BytesRef t) {
      if (lastTerm == null && t != null) {
        lastTerm = new BytesRefBuilder();
        lastTerm.append(t);
      } else if (t == null) {
        lastTerm = null;
      } else {
        assert lastTerm.get().compareTo(t) < 0 : "lastTerm=" + lastTerm + " t=" + t;
        lastTerm.copyBytes(t);
      }
      return true;
    }

    @Override
    public boolean collect(BytesRef bytes) throws IOException {
      final float boost = boostAtt.getBoost();
      // terms in order
      assert compareToLastTerm(bytes);
      // ignore uncompetitive hits
      if (stQueue.size() == maxSize) {
        final ScoreTerm t = stQueue.peek();
        if (boost < t.boost)
          return true;
        if (boost == t.boost && bytes.compareTo(t.bytes.get()) > 0)
          return true;
      }
      ScoreTerm t = visitedTerms.get(bytes);
      final TermState state = termsEnum.termState();
      assert state != null;
      if (t != null) {
        // if the term is already in the PQ, only update docFreq of term in PQ
        assert t.boost == boost : "boost should be equal in all segment TermsEnums";
        t.termState.register(state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
      } else {
        // add new entry in PQ, we must clone the term, else it may get overwritten!
        st.bytes.copyBytes(bytes);
        st.boost = boost;
        visitedTerms.put(st.bytes.get(), st);
        assert st.termState.docFreq() == 0;
        st.termState.register(state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
        stQueue.offer(st);
        // possibly drop entries from queue
        if (stQueue.size() > maxSize) {
          st = stQueue.poll();
          visitedTerms.remove(st.bytes.get());
          // reset the termstate!
          st.termState.clear();
        } else {
          st = new ScoreTerm(new TermContext(topReaderContext));
        }
        assert stQueue.size() <= maxSize : "the PQ size must be limited to maxSize";
        // set maxBoostAtt with values to help FuzzyTermsEnum to optimize
        if (stQueue.size() == maxSize) {
          t = stQueue.peek();
          maxBoostAtt.setMaxNonCompetitiveBoost(t.boost);
          maxBoostAtt.setCompetitiveTerm(t.bytes.get());
        }
      }
      return true;
    }
  });
  final B b = getTopLevelBuilder();
  final ScoreTerm[] scoreTerms = stQueue.toArray(new ScoreTerm[stQueue.size()]);
  ArrayUtil.timSort(scoreTerms, scoreTermSortByTermComp);
  for (final ScoreTerm st : scoreTerms) {
    final Term term = new Term(query.field, st.bytes.toBytesRef());
    // We allow negative term scores (fuzzy query does this, for example) while collecting the terms,
    // but truncate such boosts to 0.0f when building the query:
    // add to query
    addClause(b, term, st.termState.docFreq(), Math.max(0.0f, st.boost), st.termState);
  }
  return build(b);
}
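For orientation, a hedged sketch of how this rewrite is usually reached from user code: a MultiTermQuery such as FuzzyQuery is configured with a top-terms rewrite method, and the collector above then caps the expansion at that size. The field, term, and searcher are assumptions for illustration.

// Illustrative only: cap a fuzzy expansion at the 50 highest-boost terms before searching.
FuzzyQuery q = new FuzzyQuery(new Term("body", "lucene"), 2);
q.setRewriteMethod(new MultiTermQuery.TopTermsBlendedFreqScoringRewrite(50));
TopDocs hits = searcher.search(q, 10);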