use of org.apache.lucene.util.OfflineSorter.ByteSequencesWriter in project lucene-solr by apache.
the class Dictionary method readDictionaryFiles.
/**
 * Reads the dictionary file through the provided InputStreams, building up the words map.
 *
 * <p>The work is done in three phases:
 * <ol>
 *   <li>every usable dictionary line is unescaped (and cleaned, if input cleaning is enabled)
 *       and written to a temporary unsorted file;</li>
 *   <li>that file is sorted with an {@link OfflineSorter};</li>
 *   <li>the sorted file is read back, and for each distinct entry the flag ordinals (and stem
 *       exception ids, when stem exceptions are in use) are added to {@code words}.</li>
 * </ol>
 * Both temporary files are deleted before this method returns, whether it succeeds or fails.
 *
 * @param tempDir Directory in which the temporary unsorted and sorted files are created
 * @param tempFileNamePrefix prefix passed on when creating the temporary files
 * @param dictionaries InputStreams to read the dictionary file through
 * @param decoder CharsetDecoder used to decode the contents of the file
 * @param words Builder receiving one (entry, ords) pair per distinct entry; nothing is added
 *        when the dictionaries contain no entries
 * @throws IOException Can be thrown while reading from the file
 * @throws IllegalArgumentException if the sorted entries are found to be out of order
 */
private void readDictionaryFiles(Directory tempDir, String tempFileNamePrefix, List<InputStream> dictionaries, CharsetDecoder decoder, Builder<IntsRef> words) throws IOException {
  BytesRefBuilder flagsScratch = new BytesRefBuilder();
  IntsRefBuilder scratchInts = new IntsRefBuilder();

  StringBuilder sb = new StringBuilder();

  // Phase 1: write every usable line to a temporary unsorted file
  IndexOutput unsorted = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT);
  boolean written = false;
  try {
    try (ByteSequencesWriter writer = new ByteSequencesWriter(unsorted)) {
      for (InputStream dictionary : dictionaries) {
        // NOTE(review): the reader is intentionally not closed here; presumably the caller
        // owns the streams - confirm
        BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
        // first line is number of entries (approximately, sometimes)
        String line = lines.readLine();

        while ((line = lines.readLine()) != null) {
          // wild and unpredictable code comment rules
          if (line.isEmpty() || line.charAt(0) == '/' || line.charAt(0) == '#' || line.charAt(0) == '\t') {
            continue;
          }
          line = unescapeEntry(line);
          // if we haven't seen any stem exceptions, try to parse one
          if (hasStemExceptions == false) {
            int morphStart = line.indexOf(MORPH_SEPARATOR);
            if (morphStart >= 0 && morphStart < line.length()) {
              hasStemExceptions = parseStemException(line.substring(morphStart + 1)) != null;
            }
          }
          if (needsInputCleaning) {
            // only the word itself is cleaned; flags and morphological data are kept verbatim
            int flagSep = line.indexOf(FLAG_SEPARATOR);
            if (flagSep == -1) {
              flagSep = line.indexOf(MORPH_SEPARATOR);
            }
            if (flagSep == -1) {
              CharSequence cleansed = cleanInput(line, sb);
              writer.write(cleansed.toString().getBytes(StandardCharsets.UTF_8));
            } else {
              String text = line.substring(0, flagSep);
              CharSequence cleansed = cleanInput(text, sb);
              if (cleansed != sb) {
                sb.setLength(0);
                sb.append(cleansed);
              }
              sb.append(line.substring(flagSep));
              writer.write(sb.toString().getBytes(StandardCharsets.UTF_8));
            }
          } else {
            writer.write(line.getBytes(StandardCharsets.UTF_8));
          }
        }
      }
      CodecUtil.writeFooter(unsorted);
    }
    written = true;
  } finally {
    if (written == false) {
      // the writer (and with it the output) is already closed at this point; don't leave a
      // half-written temporary file behind
      IOUtils.deleteFilesIgnoringExceptions(tempDir, unsorted.getName());
    }
  }

  // Phase 2: sort the temporary file
  OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix, new Comparator<BytesRef>() {
    BytesRef scratch1 = new BytesRef();
    BytesRef scratch2 = new BytesRef();

    @Override
    public int compare(BytesRef o1, BytesRef o2) {
      // compare on the bytes preceding the last FLAG_SEPARATOR / MORPH_SEPARATOR of each row
      scratch1.bytes = o1.bytes;
      scratch1.offset = o1.offset;
      scratch1.length = o1.length;

      for (int i = scratch1.length - 1; i >= 0; i--) {
        if (scratch1.bytes[scratch1.offset + i] == FLAG_SEPARATOR || scratch1.bytes[scratch1.offset + i] == MORPH_SEPARATOR) {
          scratch1.length = i;
          break;
        }
      }

      scratch2.bytes = o2.bytes;
      scratch2.offset = o2.offset;
      scratch2.length = o2.length;

      for (int i = scratch2.length - 1; i >= 0; i--) {
        if (scratch2.bytes[scratch2.offset + i] == FLAG_SEPARATOR || scratch2.bytes[scratch2.offset + i] == MORPH_SEPARATOR) {
          scratch2.length = i;
          break;
        }
      }

      int cmp = scratch1.compareTo(scratch2);
      if (cmp == 0) {
        // tie break on whole row
        return o1.compareTo(o2);
      } else {
        return cmp;
      }
    }
  });

  String sorted;
  boolean success = false;
  try {
    sorted = sorter.sort(unsorted.getName());
    success = true;
  } finally {
    // the unsorted file is no longer needed either way; only surface deletion failures when
    // they would not mask an exception from the sort itself
    if (success) {
      tempDir.deleteFile(unsorted.getName());
    } else {
      IOUtils.deleteFilesIgnoringExceptions(tempDir, unsorted.getName());
    }
  }

  // Phase 3: read the sorted rows back and feed them to the builder
  boolean success2 = false;
  try (ByteSequencesReader reader = new ByteSequencesReader(tempDir.openChecksumInput(sorted, IOContext.READONCE), sorted)) {

    // TODO: the flags themselves can be double-chars (long) or also numeric
    // either way the trick is to encode them as char... but they must be parsed differently

    String currentEntry = null;
    IntsRefBuilder currentOrds = new IntsRefBuilder();

    while (true) {
      BytesRef scratch = reader.next();
      if (scratch == null) {
        break;
      }

      String line = scratch.utf8ToString();
      String entry;
      char[] wordForm;
      int end;

      // NOTE(review): both branches assume the row contains MORPH_SEPARATOR (otherwise end is
      // -1 and substring throws); presumably unescapeEntry guarantees that - confirm
      int flagSep = line.indexOf(FLAG_SEPARATOR);
      if (flagSep == -1) {
        wordForm = NOFLAGS;
        end = line.indexOf(MORPH_SEPARATOR);
        entry = line.substring(0, end);
      } else {
        end = line.indexOf(MORPH_SEPARATOR);
        String flagPart = line.substring(flagSep + 1, end);
        if (aliasCount > 0) {
          flagPart = getAliasValue(Integer.parseInt(flagPart));
        }

        wordForm = flagParsingStrategy.parseFlags(flagPart);
        Arrays.sort(wordForm);
        entry = line.substring(0, flagSep);
      }
      // we possibly have morphological data
      int stemExceptionID = 0;
      if (hasStemExceptions && end + 1 < line.length()) {
        String stemException = parseStemException(line.substring(end + 1));
        if (stemException != null) {
          if (stemExceptionCount == stemExceptions.length) {
            int newSize = ArrayUtil.oversize(stemExceptionCount + 1, RamUsageEstimator.NUM_BYTES_OBJECT_REF);
            stemExceptions = Arrays.copyOf(stemExceptions, newSize);
          }
          // we use '0' to indicate no exception for the form
          stemExceptionID = stemExceptionCount + 1;
          stemExceptions[stemExceptionCount++] = stemException;
        }
      }

      int cmp = currentEntry == null ? 1 : entry.compareTo(currentEntry);
      if (cmp < 0) {
        throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry);
      } else {
        encodeFlags(flagsScratch, wordForm);
        int ord = flagLookup.add(flagsScratch.get());
        if (ord < 0) {
          // already exists in our hash
          ord = (-ord) - 1;
        }
        // finalize current entry, and switch "current" if necessary
        if (cmp > 0 && currentEntry != null) {
          Util.toUTF32(currentEntry, scratchInts);
          words.add(scratchInts.get(), currentOrds.get());
        }
        // swap current
        if (cmp > 0 || currentEntry == null) {
          currentEntry = entry;
          // must be this way
          currentOrds = new IntsRefBuilder();
        }
        if (hasStemExceptions) {
          currentOrds.append(ord);
          currentOrds.append(stemExceptionID);
        } else {
          currentOrds.append(ord);
        }
      }
    }

    // finalize last entry; currentEntry is still null when no entry was read at all (e.g. an
    // empty dictionary), in which case there is nothing to add
    if (currentEntry != null) {
      Util.toUTF32(currentEntry, scratchInts);
      words.add(scratchInts.get(), currentOrds.get());
    }
    success2 = true;
  } finally {
    if (success2) {
      tempDir.deleteFile(sorted);
    } else {
      IOUtils.deleteFilesIgnoringExceptions(tempDir, sorted);
    }
  }
}
use of org.apache.lucene.util.OfflineSorter.ByteSequencesWriter in project lucene-solr by apache.
the class TestOfflineSorter method testOverNexting.
// Once my ByteSequencesReader.next() has returned null, OfflineSorter must never ask it for another value:
public void testOverNexting() throws Exception {
  Directory dir = newDirectory();
  IndexOutput unsorted = dir.createTempOutput("unsorted", "tmp", IOContext.DEFAULT);
  try (ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(unsorted)) {
    // a single random int-sized value is all the input this test needs
    byte[] value = new byte[Integer.BYTES];
    random().nextBytes(value);
    writer.write(value);
    CodecUtil.writeFooter(unsorted);
  }

  // sorter whose readers fail the test as soon as next() is called after the end was reported
  OfflineSorter sorter = new OfflineSorter(dir, "foo", OfflineSorter.DEFAULT_COMPARATOR, BufferSize.megabytes(4), OfflineSorter.MAX_TEMPFILES, Integer.BYTES, null, 0) {
    @Override
    protected ByteSequencesReader getReader(ChecksumIndexInput in, String name) throws IOException {
      final ByteSequencesReader delegate = super.getReader(in, name);
      return new ByteSequencesReader(in, name) {
        // flips to true once the delegate has reported the end of the sequence
        private boolean exhausted;

        @Override
        public BytesRef next() throws IOException {
          // a call after the end was already signalled is exactly what we are guarding against
          assertFalse(exhausted);
          BytesRef current = delegate.next();
          exhausted = (current == null);
          return current;
        }

        @Override
        public void close() throws IOException {
          delegate.close();
        }
      };
    }
  };
  sorter.sort(unsorted.getName());

  dir.close();
}
use of org.apache.lucene.util.OfflineSorter.ByteSequencesWriter in project lucene-solr by apache.
the class TestOfflineSorter method testFixedLengthLiesLiesLies.
public void testFixedLengthLiesLiesLies() throws Exception {
  // OfflineSorter must notice when the declared fixed value length does not match the data:
  Directory dir = newDirectory();
  IndexOutput unsorted = dir.createTempOutput("unsorted", "tmp", IOContext.DEFAULT);
  try (ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(unsorted)) {
    // write one 4 byte value ...
    byte[] value = new byte[Integer.BYTES];
    random().nextBytes(value);
    writer.write(value);
    CodecUtil.writeFooter(unsorted);
  }

  // ... but tell the sorter that every value is 8 bytes long
  OfflineSorter lyingSorter = new OfflineSorter(dir, "foo", OfflineSorter.DEFAULT_COMPARATOR, BufferSize.megabytes(4), OfflineSorter.MAX_TEMPFILES, Long.BYTES, null, 0);
  String unsortedName = unsorted.getName();
  IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> lyingSorter.sort(unsortedName));
  assertEquals("value length is 4 but is supposed to always be 8", expected.getMessage());

  dir.close();
}
use of org.apache.lucene.util.OfflineSorter.ByteSequencesWriter in project lucene-solr by apache.
the class TestOfflineSorter method testFixedLengthHeap.
public void testFixedLengthHeap() throws Exception {
  // Make sure the RAM accounting is correct, i.e. if we are sorting fixed width
  // ints (4 bytes) then the heap used is really only 4 bytes per value:
  Directory dir = newDirectory();
  IndexOutput out = dir.createTempOutput("unsorted", "tmp", IOContext.DEFAULT);
  try (ByteSequencesWriter w = new OfflineSorter.ByteSequencesWriter(out)) {
    byte[] bytes = new byte[Integer.BYTES];
    for (int i = 0; i < 1024 * 1024; i++) {
      random().nextBytes(bytes);
      w.write(bytes);
    }
    CodecUtil.writeFooter(out);
  }

  ExecutorService exec = randomExecutorServiceOrNull();
  OfflineSorter sorter;
  try {
    sorter = new OfflineSorter(dir, "foo", OfflineSorter.DEFAULT_COMPARATOR, BufferSize.megabytes(4), OfflineSorter.MAX_TEMPFILES, Integer.BYTES, exec, TestUtil.nextInt(random(), 1, 4));
    sorter.sort(out.getName());
  } finally {
    // shut the executor down even when the sort fails, so a failure does not leave it running
    if (exec != null) {
      exec.shutdownNow();
    }
  }

  // 1M ints with a 4 MB heap allowed should have been sorted in a single heap partition:
  assertEquals(0, sorter.sortInfo.mergeRounds);
  dir.close();
}
Aggregations