use of org.apache.lucene.util.OfflineSorter in project elasticsearch by elastic.
the class XAnalyzingSuggester method build.
@Override
public void build(InputIterator iterator) throws IOException {
String prefix = getClass().getSimpleName();
Directory tempDir = getTempDir();
OfflineSorter sorter = new OfflineSorter(tempDir, prefix, new AnalyzingComparator(hasPayloads));
IndexOutput tempInput = tempDir.createTempOutput(prefix, "input", IOContext.DEFAULT);
OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
OfflineSorter.ByteSequencesReader reader = null;
hasPayloads = iterator.hasPayloads();
BytesRefBuilder scratch = new BytesRefBuilder();
TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();
String tempSortedFileName = null;
count = 0;
byte[] buffer = new byte[8];
try {
ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
for (BytesRef surfaceForm; (surfaceForm = iterator.next()) != null; ) {
LimitedFiniteStringsIterator finiteStrings = new LimitedFiniteStringsIterator(toAutomaton(surfaceForm, ts2a), maxGraphExpansions);
for (IntsRef string; (string = finiteStrings.next()) != null; count++) {
Util.toBytesRef(string, scratch);
// length of the analyzed text (FST input)
if (scratch.length() > Short.MAX_VALUE - 2) {
throw new IllegalArgumentException("cannot handle analyzed forms > " + (Short.MAX_VALUE - 2) + " in length (got " + scratch.length() + ")");
}
short analyzedLength = (short) scratch.length();
// compute the required length:
// analyzed sequence + weight (4) + surface + analyzedLength (short)
int requiredLength = analyzedLength + 4 + surfaceForm.length + 2;
BytesRef payload;
if (hasPayloads) {
if (surfaceForm.length > (Short.MAX_VALUE - 2)) {
throw new IllegalArgumentException("cannot handle surface form > " + (Short.MAX_VALUE - 2) + " in length (got " + surfaceForm.length + ")");
}
payload = iterator.payload();
// payload + surfaceLength (short)
requiredLength += payload.length + 2;
} else {
payload = null;
}
buffer = ArrayUtil.grow(buffer, requiredLength);
output.reset(buffer);
output.writeShort(analyzedLength);
output.writeBytes(scratch.bytes(), 0, scratch.length());
output.writeInt(encodeWeight(iterator.weight()));
if (hasPayloads) {
for (int i = 0; i < surfaceForm.length; i++) {
if (surfaceForm.bytes[i] == payloadSep) {
throw new IllegalArgumentException("surface form cannot contain unit separator character U+001F; this character is reserved");
}
}
output.writeShort((short) surfaceForm.length);
output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
output.writeBytes(payload.bytes, payload.offset, payload.length);
} else {
output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
}
assert output.getPosition() == requiredLength : output.getPosition() + " vs " + requiredLength;
writer.write(buffer, 0, output.getPosition());
}
maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, finiteStrings.size());
}
writer.close();
// Sort all input/output pairs (required by FST.Builder):
tempSortedFileName = sorter.sort(tempInput.getName());
// Free disk space:
tempDir.deleteFile(tempInput.getName());
reader = new OfflineSorter.ByteSequencesReader(tempDir.openChecksumInput(tempSortedFileName, IOContext.READONCE), prefix);
PairOutputs<Long, BytesRef> outputs = new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
Builder<Pair<Long, BytesRef>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
// Build FST:
BytesRefBuilder previousAnalyzed = null;
BytesRefBuilder analyzed = new BytesRefBuilder();
BytesRef surface = new BytesRef();
IntsRefBuilder scratchInts = new IntsRefBuilder();
ByteArrayDataInput input = new ByteArrayDataInput();
// Used to remove duplicate surface forms (but we
// still index the hightest-weight one). We clear
// this when we see a new analyzed form, so it cannot
// grow unbounded (at most 256 entries):
Set<BytesRef> seenSurfaceForms = new HashSet<>();
int dedup = 0;
while (true) {
BytesRef bytes = reader.next();
if (bytes == null) {
break;
}
input.reset(bytes.bytes, bytes.offset, bytes.length);
short analyzedLength = input.readShort();
analyzed.grow(analyzedLength + 2);
input.readBytes(analyzed.bytes(), 0, analyzedLength);
analyzed.setLength(analyzedLength);
long cost = input.readInt();
surface.bytes = bytes.bytes;
if (hasPayloads) {
surface.length = input.readShort();
surface.offset = input.getPosition();
} else {
surface.offset = input.getPosition();
surface.length = bytes.length - surface.offset;
}
if (previousAnalyzed == null) {
previousAnalyzed = new BytesRefBuilder();
previousAnalyzed.copyBytes(analyzed);
seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
} else if (analyzed.get().equals(previousAnalyzed.get())) {
dedup++;
if (dedup >= maxSurfaceFormsPerAnalyzedForm) {
// dups: skip the rest:
continue;
}
if (seenSurfaceForms.contains(surface)) {
continue;
}
seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
} else {
dedup = 0;
previousAnalyzed.copyBytes(analyzed);
seenSurfaceForms.clear();
seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
}
// TODO: I think we can avoid the extra 2 bytes when
// there is no dup (dedup==0), but we'd have to fix
// the exactFirst logic ... which would be sort of
// hairy because we'd need to special case the two
// (dup/not dup)...
// NOTE: must be byte 0 so we sort before whatever
// is next
analyzed.append((byte) 0);
analyzed.append((byte) dedup);
Util.toIntsRef(analyzed.get(), scratchInts);
//System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString());
if (!hasPayloads) {
builder.add(scratchInts.get(), outputs.newPair(cost, BytesRef.deepCopyOf(surface)));
} else {
int payloadOffset = input.getPosition() + surface.length;
int payloadLength = bytes.length - payloadOffset;
BytesRef br = new BytesRef(surface.length + 1 + payloadLength);
System.arraycopy(surface.bytes, surface.offset, br.bytes, 0, surface.length);
br.bytes[surface.length] = (byte) payloadSep;
System.arraycopy(bytes.bytes, payloadOffset, br.bytes, surface.length + 1, payloadLength);
br.length = br.bytes.length;
builder.add(scratchInts.get(), outputs.newPair(cost, br));
}
}
fst = builder.finish();
//PrintWriter pw = new PrintWriter("/tmp/out.dot");
//Util.toDot(fst, pw, true, true);
//pw.close();
} finally {
IOUtils.closeWhileHandlingException(reader, writer);
IOUtils.deleteFilesIgnoringExceptions(tempDir, tempInput.getName(), tempSortedFileName);
}
}
use of org.apache.lucene.util.OfflineSorter in project lucene-solr by apache.
the class Dictionary method readDictionaryFiles.
/**
* Reads the dictionary file through the provided InputStreams, building up the words map
*
* @param dictionaries InputStreams to read the dictionary file through
* @param decoder CharsetDecoder used to decode the contents of the file
* @throws IOException Can be thrown while reading from the file
*/
private void readDictionaryFiles(Directory tempDir, String tempFileNamePrefix, List<InputStream> dictionaries, CharsetDecoder decoder, Builder<IntsRef> words) throws IOException {
BytesRefBuilder flagsScratch = new BytesRefBuilder();
IntsRefBuilder scratchInts = new IntsRefBuilder();
StringBuilder sb = new StringBuilder();
IndexOutput unsorted = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT);
try (ByteSequencesWriter writer = new ByteSequencesWriter(unsorted)) {
for (InputStream dictionary : dictionaries) {
BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
// first line is number of entries (approximately, sometimes)
String line = lines.readLine();
while ((line = lines.readLine()) != null) {
// wild and unpredictable code comment rules
if (line.isEmpty() || line.charAt(0) == '/' || line.charAt(0) == '#' || line.charAt(0) == '\t') {
continue;
}
line = unescapeEntry(line);
// if we havent seen any stem exceptions, try to parse one
if (hasStemExceptions == false) {
int morphStart = line.indexOf(MORPH_SEPARATOR);
if (morphStart >= 0 && morphStart < line.length()) {
hasStemExceptions = parseStemException(line.substring(morphStart + 1)) != null;
}
}
if (needsInputCleaning) {
int flagSep = line.indexOf(FLAG_SEPARATOR);
if (flagSep == -1) {
flagSep = line.indexOf(MORPH_SEPARATOR);
}
if (flagSep == -1) {
CharSequence cleansed = cleanInput(line, sb);
writer.write(cleansed.toString().getBytes(StandardCharsets.UTF_8));
} else {
String text = line.substring(0, flagSep);
CharSequence cleansed = cleanInput(text, sb);
if (cleansed != sb) {
sb.setLength(0);
sb.append(cleansed);
}
sb.append(line.substring(flagSep));
writer.write(sb.toString().getBytes(StandardCharsets.UTF_8));
}
} else {
writer.write(line.getBytes(StandardCharsets.UTF_8));
}
}
}
CodecUtil.writeFooter(unsorted);
}
OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix, new Comparator<BytesRef>() {
BytesRef scratch1 = new BytesRef();
BytesRef scratch2 = new BytesRef();
@Override
public int compare(BytesRef o1, BytesRef o2) {
scratch1.bytes = o1.bytes;
scratch1.offset = o1.offset;
scratch1.length = o1.length;
for (int i = scratch1.length - 1; i >= 0; i--) {
if (scratch1.bytes[scratch1.offset + i] == FLAG_SEPARATOR || scratch1.bytes[scratch1.offset + i] == MORPH_SEPARATOR) {
scratch1.length = i;
break;
}
}
scratch2.bytes = o2.bytes;
scratch2.offset = o2.offset;
scratch2.length = o2.length;
for (int i = scratch2.length - 1; i >= 0; i--) {
if (scratch2.bytes[scratch2.offset + i] == FLAG_SEPARATOR || scratch2.bytes[scratch2.offset + i] == MORPH_SEPARATOR) {
scratch2.length = i;
break;
}
}
int cmp = scratch1.compareTo(scratch2);
if (cmp == 0) {
// tie break on whole row
return o1.compareTo(o2);
} else {
return cmp;
}
}
});
String sorted;
boolean success = false;
try {
sorted = sorter.sort(unsorted.getName());
success = true;
} finally {
if (success) {
tempDir.deleteFile(unsorted.getName());
} else {
IOUtils.deleteFilesIgnoringExceptions(tempDir, unsorted.getName());
}
}
boolean success2 = false;
try (ByteSequencesReader reader = new ByteSequencesReader(tempDir.openChecksumInput(sorted, IOContext.READONCE), sorted)) {
// TODO: the flags themselves can be double-chars (long) or also numeric
// either way the trick is to encode them as char... but they must be parsed differently
String currentEntry = null;
IntsRefBuilder currentOrds = new IntsRefBuilder();
while (true) {
BytesRef scratch = reader.next();
if (scratch == null) {
break;
}
String line = scratch.utf8ToString();
String entry;
char[] wordForm;
int end;
int flagSep = line.indexOf(FLAG_SEPARATOR);
if (flagSep == -1) {
wordForm = NOFLAGS;
end = line.indexOf(MORPH_SEPARATOR);
entry = line.substring(0, end);
} else {
end = line.indexOf(MORPH_SEPARATOR);
String flagPart = line.substring(flagSep + 1, end);
if (aliasCount > 0) {
flagPart = getAliasValue(Integer.parseInt(flagPart));
}
wordForm = flagParsingStrategy.parseFlags(flagPart);
Arrays.sort(wordForm);
entry = line.substring(0, flagSep);
}
// we possibly have morphological data
int stemExceptionID = 0;
if (hasStemExceptions && end + 1 < line.length()) {
String stemException = parseStemException(line.substring(end + 1));
if (stemException != null) {
if (stemExceptionCount == stemExceptions.length) {
int newSize = ArrayUtil.oversize(stemExceptionCount + 1, RamUsageEstimator.NUM_BYTES_OBJECT_REF);
stemExceptions = Arrays.copyOf(stemExceptions, newSize);
}
// we use '0' to indicate no exception for the form
stemExceptionID = stemExceptionCount + 1;
stemExceptions[stemExceptionCount++] = stemException;
}
}
int cmp = currentEntry == null ? 1 : entry.compareTo(currentEntry);
if (cmp < 0) {
throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry);
} else {
encodeFlags(flagsScratch, wordForm);
int ord = flagLookup.add(flagsScratch.get());
if (ord < 0) {
// already exists in our hash
ord = (-ord) - 1;
}
// finalize current entry, and switch "current" if necessary
if (cmp > 0 && currentEntry != null) {
Util.toUTF32(currentEntry, scratchInts);
words.add(scratchInts.get(), currentOrds.get());
}
// swap current
if (cmp > 0 || currentEntry == null) {
currentEntry = entry;
// must be this way
currentOrds = new IntsRefBuilder();
}
if (hasStemExceptions) {
currentOrds.append(ord);
currentOrds.append(stemExceptionID);
} else {
currentOrds.append(ord);
}
}
}
// finalize last entry
Util.toUTF32(currentEntry, scratchInts);
words.add(scratchInts.get(), currentOrds.get());
success2 = true;
} finally {
if (success2) {
tempDir.deleteFile(sorted);
} else {
IOUtils.deleteFilesIgnoringExceptions(tempDir, sorted);
}
}
}
use of org.apache.lucene.util.OfflineSorter in project lucene-solr by apache.
the class SortedInputIterator method sort.
private ByteSequencesReader sort() throws IOException {
OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix, tieBreakByCostComparator);
tempInput = tempDir.createTempOutput(tempFileNamePrefix, "input", IOContext.DEFAULT);
try (OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput)) {
BytesRef spare;
byte[] buffer = new byte[0];
ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
while ((spare = source.next()) != null) {
encode(writer, output, buffer, spare, source.payload(), source.contexts(), source.weight());
}
CodecUtil.writeFooter(tempInput);
}
tempSortedFileName = sorter.sort(tempInput.getName());
return new OfflineSorter.ByteSequencesReader(tempDir.openChecksumInput(tempSortedFileName, IOContext.READONCE), tempSortedFileName);
}
use of org.apache.lucene.util.OfflineSorter in project lucene-solr by apache.
the class AnalyzingSuggester method build.
@Override
public void build(InputIterator iterator) throws IOException {
if (iterator.hasContexts()) {
throw new IllegalArgumentException("this suggester doesn't support contexts");
}
hasPayloads = iterator.hasPayloads();
OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix, new AnalyzingComparator(hasPayloads));
IndexOutput tempInput = tempDir.createTempOutput(tempFileNamePrefix, "input", IOContext.DEFAULT);
OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
OfflineSorter.ByteSequencesReader reader = null;
BytesRefBuilder scratch = new BytesRefBuilder();
TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();
String tempSortedFileName = null;
count = 0;
byte[] buffer = new byte[8];
try {
ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
for (BytesRef surfaceForm; (surfaceForm = iterator.next()) != null; ) {
LimitedFiniteStringsIterator finiteStrings = new LimitedFiniteStringsIterator(toAutomaton(surfaceForm, ts2a), maxGraphExpansions);
for (IntsRef string; (string = finiteStrings.next()) != null; count++) {
Util.toBytesRef(string, scratch);
// length of the analyzed text (FST input)
if (scratch.length() > Short.MAX_VALUE - 2) {
throw new IllegalArgumentException("cannot handle analyzed forms > " + (Short.MAX_VALUE - 2) + " in length (got " + scratch.length() + ")");
}
short analyzedLength = (short) scratch.length();
// compute the required length:
// analyzed sequence + weight (4) + surface + analyzedLength (short)
int requiredLength = analyzedLength + 4 + surfaceForm.length + 2;
BytesRef payload;
if (hasPayloads) {
if (surfaceForm.length > (Short.MAX_VALUE - 2)) {
throw new IllegalArgumentException("cannot handle surface form > " + (Short.MAX_VALUE - 2) + " in length (got " + surfaceForm.length + ")");
}
payload = iterator.payload();
// payload + surfaceLength (short)
requiredLength += payload.length + 2;
} else {
payload = null;
}
buffer = ArrayUtil.grow(buffer, requiredLength);
output.reset(buffer);
output.writeShort(analyzedLength);
output.writeBytes(scratch.bytes(), 0, scratch.length());
output.writeInt(encodeWeight(iterator.weight()));
if (hasPayloads) {
for (int i = 0; i < surfaceForm.length; i++) {
if (surfaceForm.bytes[i] == PAYLOAD_SEP) {
throw new IllegalArgumentException("surface form cannot contain unit separator character U+001F; this character is reserved");
}
}
output.writeShort((short) surfaceForm.length);
output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
output.writeBytes(payload.bytes, payload.offset, payload.length);
} else {
output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
}
assert output.getPosition() == requiredLength : output.getPosition() + " vs " + requiredLength;
writer.write(buffer, 0, output.getPosition());
}
maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, finiteStrings.size());
}
CodecUtil.writeFooter(tempInput);
writer.close();
// Sort all input/output pairs (required by FST.Builder):
tempSortedFileName = sorter.sort(tempInput.getName());
// Free disk space:
tempDir.deleteFile(tempInput.getName());
reader = new OfflineSorter.ByteSequencesReader(tempDir.openChecksumInput(tempSortedFileName, IOContext.READONCE), tempSortedFileName);
PairOutputs<Long, BytesRef> outputs = new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
Builder<Pair<Long, BytesRef>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
// Build FST:
BytesRefBuilder previousAnalyzed = null;
BytesRefBuilder analyzed = new BytesRefBuilder();
BytesRef surface = new BytesRef();
IntsRefBuilder scratchInts = new IntsRefBuilder();
ByteArrayDataInput input = new ByteArrayDataInput();
// Used to remove duplicate surface forms (but we
// still index the hightest-weight one). We clear
// this when we see a new analyzed form, so it cannot
// grow unbounded (at most 256 entries):
Set<BytesRef> seenSurfaceForms = new HashSet<>();
int dedup = 0;
while (true) {
BytesRef bytes = reader.next();
if (bytes == null) {
break;
}
input.reset(bytes.bytes, bytes.offset, bytes.length);
short analyzedLength = input.readShort();
analyzed.grow(analyzedLength + 2);
input.readBytes(analyzed.bytes(), 0, analyzedLength);
analyzed.setLength(analyzedLength);
long cost = input.readInt();
surface.bytes = bytes.bytes;
if (hasPayloads) {
surface.length = input.readShort();
surface.offset = input.getPosition();
} else {
surface.offset = input.getPosition();
surface.length = bytes.length - surface.offset;
}
if (previousAnalyzed == null) {
previousAnalyzed = new BytesRefBuilder();
previousAnalyzed.copyBytes(analyzed.get());
seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
} else if (analyzed.get().equals(previousAnalyzed.get())) {
dedup++;
if (dedup >= maxSurfaceFormsPerAnalyzedForm) {
// dups: skip the rest:
continue;
}
if (seenSurfaceForms.contains(surface)) {
continue;
}
seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
} else {
dedup = 0;
previousAnalyzed.copyBytes(analyzed);
seenSurfaceForms.clear();
seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
}
// TODO: I think we can avoid the extra 2 bytes when
// there is no dup (dedup==0), but we'd have to fix
// the exactFirst logic ... which would be sort of
// hairy because we'd need to special case the two
// (dup/not dup)...
// NOTE: must be byte 0 so we sort before whatever
// is next
analyzed.append((byte) 0);
analyzed.append((byte) dedup);
Util.toIntsRef(analyzed.get(), scratchInts);
//System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString());
if (!hasPayloads) {
builder.add(scratchInts.get(), outputs.newPair(cost, BytesRef.deepCopyOf(surface)));
} else {
int payloadOffset = input.getPosition() + surface.length;
int payloadLength = bytes.length - payloadOffset;
BytesRef br = new BytesRef(surface.length + 1 + payloadLength);
System.arraycopy(surface.bytes, surface.offset, br.bytes, 0, surface.length);
br.bytes[surface.length] = PAYLOAD_SEP;
System.arraycopy(bytes.bytes, payloadOffset, br.bytes, surface.length + 1, payloadLength);
br.length = br.bytes.length;
builder.add(scratchInts.get(), outputs.newPair(cost, br));
}
}
fst = builder.finish();
//Util.dotToFile(fst, "/tmp/suggest.dot");
} finally {
IOUtils.closeWhileHandlingException(reader, writer);
IOUtils.deleteFilesIgnoringExceptions(tempDir, tempInput.getName(), tempSortedFileName);
}
}
use of org.apache.lucene.util.OfflineSorter in project lucene-solr by apache.
the class FSTCompletionLookup method build.
@Override
public void build(InputIterator iterator) throws IOException {
if (iterator.hasPayloads()) {
throw new IllegalArgumentException("this suggester doesn't support payloads");
}
if (iterator.hasContexts()) {
throw new IllegalArgumentException("this suggester doesn't support contexts");
}
OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix);
ExternalRefSorter externalSorter = new ExternalRefSorter(sorter);
IndexOutput tempInput = tempDir.createTempOutput(tempFileNamePrefix, "input", IOContext.DEFAULT);
String tempSortedFileName = null;
OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
OfflineSorter.ByteSequencesReader reader = null;
// Push floats up front before sequences to sort them. For now, assume they are non-negative.
// If negative floats are allowed some trickery needs to be done to find their byte order.
count = 0;
try {
byte[] buffer = new byte[0];
ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
BytesRef spare;
int inputLineCount = 0;
while ((spare = iterator.next()) != null) {
if (spare.length + 4 >= buffer.length) {
buffer = ArrayUtil.grow(buffer, spare.length + 4);
}
output.reset(buffer);
output.writeInt(encodeWeight(iterator.weight()));
output.writeBytes(spare.bytes, spare.offset, spare.length);
writer.write(buffer, 0, output.getPosition());
inputLineCount++;
}
CodecUtil.writeFooter(tempInput);
writer.close();
// We don't know the distribution of scores and we need to bucket them, so we'll sort
// and divide into equal buckets.
tempSortedFileName = sorter.sort(tempInput.getName());
tempDir.deleteFile(tempInput.getName());
FSTCompletionBuilder builder = new FSTCompletionBuilder(buckets, externalSorter, sharedTailLength);
reader = new OfflineSorter.ByteSequencesReader(tempDir.openChecksumInput(tempSortedFileName, IOContext.READONCE), tempSortedFileName);
long line = 0;
int previousBucket = 0;
int previousScore = 0;
ByteArrayDataInput input = new ByteArrayDataInput();
BytesRef tmp2 = new BytesRef();
while (true) {
BytesRef scratch = reader.next();
if (scratch == null) {
break;
}
input.reset(scratch.bytes, scratch.offset, scratch.length);
int currentScore = input.readInt();
int bucket;
if (line > 0 && currentScore == previousScore) {
bucket = previousBucket;
} else {
bucket = (int) (line * buckets / inputLineCount);
}
previousScore = currentScore;
previousBucket = bucket;
// Only append the input, discard the weight.
tmp2.bytes = scratch.bytes;
tmp2.offset = scratch.offset + input.getPosition();
tmp2.length = scratch.length - input.getPosition();
builder.add(tmp2, bucket);
line++;
count++;
}
// The two FSTCompletions share the same automaton.
this.higherWeightsCompletion = builder.build();
this.normalCompletion = new FSTCompletion(higherWeightsCompletion.getFST(), false, exactMatchFirst);
} finally {
IOUtils.closeWhileHandlingException(reader, writer, externalSorter);
IOUtils.deleteFilesIgnoringExceptions(tempDir, tempInput.getName(), tempSortedFileName);
}
}
Aggregations