Example use of org.apache.lucene.util.IntsRefBuilder in the lucene-solr project (Apache): the assertEntryAbsent method of class BaseSynonymParserTestCase.
/**
 * Validates that there are no synonyms for the given word.
 *
 * @param synonymMap the generated synonym map after parsing
 * @param word word (phrase) we are validating the absence of synonyms for. Should be the value
 *             that comes out of the analyzer. All spaces will be replaced by word separators.
 * @throws IOException if the FST lookup fails
 */
public static void assertEntryAbsent(SynonymMap synonymMap, String word) throws IOException {
// multi-token phrases are stored in the map with WORD_SEPARATOR between tokens
word = word.replace(' ', SynonymMap.WORD_SEPARATOR);
// a null result from the FST lookup means the key is absent from the map
BytesRef value = Util.get(synonymMap.fst, Util.toUTF32(new CharsRef(word), new IntsRefBuilder()));
assertNull("There should be no synonyms for: " + word, value);
}
Example use of org.apache.lucene.util.IntsRefBuilder in the lucene-solr project (Apache): the readDictionaryFiles method of class Dictionary.
/**
 * Reads the dictionary file through the provided InputStreams, building up the words map.
 *
 * <p>Processing happens in two passes: entries are first normalized and spilled to a temporary
 * file, offline-sorted by their stem text, and then read back in sorted order so that all flag
 * ordinals belonging to the same stem can be accumulated into a single entry of the words FST.
 *
 * @param tempDir Directory used to hold the temporary unsorted/sorted dictionary files
 * @param tempFileNamePrefix prefix for the temporary file names
 * @param dictionaries InputStreams to read the dictionary file through
 * @param decoder CharsetDecoder used to decode the contents of the file
 * @param words FST builder that receives each unique stem with its flag/exception ordinals
 * @throws IOException Can be thrown while reading from the file
 */
private void readDictionaryFiles(Directory tempDir, String tempFileNamePrefix, List<InputStream> dictionaries, CharsetDecoder decoder, Builder<IntsRef> words) throws IOException {
  BytesRefBuilder flagsScratch = new BytesRefBuilder();
  IntsRefBuilder scratchInts = new IntsRefBuilder();
  StringBuilder sb = new StringBuilder();
  // Pass 1: normalize every entry line and write it to an unsorted temp file.
  IndexOutput unsorted = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT);
  try (ByteSequencesWriter writer = new ByteSequencesWriter(unsorted)) {
    for (InputStream dictionary : dictionaries) {
      BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
      // first line is number of entries (approximately, sometimes) -- read and discarded
      String line = lines.readLine();
      while ((line = lines.readLine()) != null) {
        // wild and unpredictable code comment rules: skip blank lines and lines that
        // start with '/', '#', or a tab, all of which are treated as comments
        if (line.isEmpty() || line.charAt(0) == '/' || line.charAt(0) == '#' || line.charAt(0) == '\t') {
          continue;
        }
        line = unescapeEntry(line);
        // if we havent seen any stem exceptions, try to parse one
        if (hasStemExceptions == false) {
          int morphStart = line.indexOf(MORPH_SEPARATOR);
          if (morphStart >= 0 && morphStart < line.length()) {
            hasStemExceptions = parseStemException(line.substring(morphStart + 1)) != null;
          }
        }
        if (needsInputCleaning) {
          // only the word itself (the text before any flag/morph separator) gets cleaned
          int flagSep = line.indexOf(FLAG_SEPARATOR);
          if (flagSep == -1) {
            flagSep = line.indexOf(MORPH_SEPARATOR);
          }
          if (flagSep == -1) {
            CharSequence cleansed = cleanInput(line, sb);
            writer.write(cleansed.toString().getBytes(StandardCharsets.UTF_8));
          } else {
            String text = line.substring(0, flagSep);
            CharSequence cleansed = cleanInput(text, sb);
            // cleanInput may return its input unchanged instead of filling sb;
            // make sure sb holds the cleansed text either way
            if (cleansed != sb) {
              sb.setLength(0);
              sb.append(cleansed);
            }
            // re-attach the untouched flag/morph suffix
            sb.append(line.substring(flagSep));
            writer.write(sb.toString().getBytes(StandardCharsets.UTF_8));
          }
        } else {
          writer.write(line.getBytes(StandardCharsets.UTF_8));
        }
      }
    }
    CodecUtil.writeFooter(unsorted);
  }
  // Sort entries by stem only (the bytes before the last flag/morph separator), so that
  // identical stems with different flags become adjacent; ties break on the whole row so
  // the sort is deterministic.
  OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix, new Comparator<BytesRef>() {
    BytesRef scratch1 = new BytesRef();
    BytesRef scratch2 = new BytesRef();
    @Override
    public int compare(BytesRef o1, BytesRef o2) {
      // shallow-alias o1, then truncate at the last flag/morph separator to compare stems only
      scratch1.bytes = o1.bytes;
      scratch1.offset = o1.offset;
      scratch1.length = o1.length;
      for (int i = scratch1.length - 1; i >= 0; i--) {
        if (scratch1.bytes[scratch1.offset + i] == FLAG_SEPARATOR || scratch1.bytes[scratch1.offset + i] == MORPH_SEPARATOR) {
          scratch1.length = i;
          break;
        }
      }
      // same truncation for o2
      scratch2.bytes = o2.bytes;
      scratch2.offset = o2.offset;
      scratch2.length = o2.length;
      for (int i = scratch2.length - 1; i >= 0; i--) {
        if (scratch2.bytes[scratch2.offset + i] == FLAG_SEPARATOR || scratch2.bytes[scratch2.offset + i] == MORPH_SEPARATOR) {
          scratch2.length = i;
          break;
        }
      }
      int cmp = scratch1.compareTo(scratch2);
      if (cmp == 0) {
        // tie break on whole row
        return o1.compareTo(o2);
      } else {
        return cmp;
      }
    }
  });
  String sorted;
  boolean success = false;
  try {
    sorted = sorter.sort(unsorted.getName());
    success = true;
  } finally {
    // the unsorted temp file is no longer needed either way; on failure, suppress
    // delete errors so the original exception propagates
    if (success) {
      tempDir.deleteFile(unsorted.getName());
    } else {
      IOUtils.deleteFilesIgnoringExceptions(tempDir, unsorted.getName());
    }
  }
  // Pass 2: read entries back in sorted order and build the words FST.
  boolean success2 = false;
  try (ByteSequencesReader reader = new ByteSequencesReader(tempDir.openChecksumInput(sorted, IOContext.READONCE), sorted)) {
    // TODO: the flags themselves can be double-chars (long) or also numeric
    // either way the trick is to encode them as char... but they must be parsed differently
    String currentEntry = null;
    IntsRefBuilder currentOrds = new IntsRefBuilder();
    while (true) {
      BytesRef scratch = reader.next();
      if (scratch == null) {
        break; // end of sorted input
      }
      String line = scratch.utf8ToString();
      String entry;
      char[] wordForm;
      int end;
      int flagSep = line.indexOf(FLAG_SEPARATOR);
      if (flagSep == -1) {
        // no flags: the stem runs up to the morph separator
        wordForm = NOFLAGS;
        end = line.indexOf(MORPH_SEPARATOR);
        entry = line.substring(0, end);
      } else {
        end = line.indexOf(MORPH_SEPARATOR);
        String flagPart = line.substring(flagSep + 1, end);
        if (aliasCount > 0) {
          // flags were declared via numeric aliases; resolve to the real flag string
          flagPart = getAliasValue(Integer.parseInt(flagPart));
        }
        wordForm = flagParsingStrategy.parseFlags(flagPart);
        Arrays.sort(wordForm); // keep flags sorted so lookups can binary-search
        entry = line.substring(0, flagSep);
      }
      // we possibly have morphological data
      int stemExceptionID = 0;
      if (hasStemExceptions && end + 1 < line.length()) {
        String stemException = parseStemException(line.substring(end + 1));
        if (stemException != null) {
          // grow the exceptions table when full
          if (stemExceptionCount == stemExceptions.length) {
            int newSize = ArrayUtil.oversize(stemExceptionCount + 1, RamUsageEstimator.NUM_BYTES_OBJECT_REF);
            stemExceptions = Arrays.copyOf(stemExceptions, newSize);
          }
          // we use '0' to indicate no exception for the form, so stored IDs are 1-based
          stemExceptionID = stemExceptionCount + 1;
          stemExceptions[stemExceptionCount++] = stemException;
        }
      }
      // the input was sorted, so all rows for the same stem arrive consecutively
      int cmp = currentEntry == null ? 1 : entry.compareTo(currentEntry);
      if (cmp < 0) {
        throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry);
      } else {
        encodeFlags(flagsScratch, wordForm);
        int ord = flagLookup.add(flagsScratch.get());
        if (ord < 0) {
          // already exists in our hash
          ord = (-ord) - 1;
        }
        // finalize current entry, and switch "current" if necessary
        if (cmp > 0 && currentEntry != null) {
          Util.toUTF32(currentEntry, scratchInts);
          words.add(scratchInts.get(), currentOrds.get());
        }
        // swap current
        if (cmp > 0 || currentEntry == null) {
          currentEntry = entry;
          // must be this way: a fresh builder, because the previous ints were handed to the FST
          currentOrds = new IntsRefBuilder();
        }
        // record the flag ordinal (plus the stem-exception ID when exceptions are in use)
        if (hasStemExceptions) {
          currentOrds.append(ord);
          currentOrds.append(stemExceptionID);
        } else {
          currentOrds.append(ord);
        }
      }
    }
    // finalize last entry
    // NOTE(review): assumes at least one entry was read; currentEntry would be null
    // for a dictionary with no entries -- confirm callers never pass an empty one
    Util.toUTF32(currentEntry, scratchInts);
    words.add(scratchInts.get(), currentOrds.get());
    success2 = true;
  } finally {
    // same cleanup contract as above, now for the sorted temp file
    if (success2) {
      tempDir.deleteFile(sorted);
    } else {
      IOUtils.deleteFilesIgnoringExceptions(tempDir, sorted);
    }
  }
}
Example use of org.apache.lucene.util.IntsRefBuilder in the lucene-solr project (Apache): the testReplacements method of class TestDictionary.
public void testReplacements() throws Exception {
  // Build a conversion FST holding the mappings a->b, ab->c, c->de, def->gh.
  // Keys must be added to the builder in sorted order.
  final Outputs<CharsRef> fstOutputs = CharSequenceOutputs.getSingleton();
  final Builder<CharsRef> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, fstOutputs);
  final IntsRefBuilder keyScratch = new IntsRefBuilder();
  final String[][] mappings = {
      {"a", "b"},
      {"ab", "c"},
      {"c", "de"},
      {"def", "gh"},
  };
  for (String[] mapping : mappings) {
    Util.toUTF16(mapping[0], keyScratch);
    fstBuilder.add(keyScratch.get(), new CharsRef(mapping[1]));
  }
  final FST<CharsRef> fst = fstBuilder.finish();
  // Each case pairs an input text with the expected text after applying the mappings.
  // Longest-match behavior is exercised by "ab" beating "a", and chained look-alikes
  // such as "abc" becoming "cde" (ab->c, then the standalone c->de applies separately).
  final String[][] cases = {
      {"atestanother", "btestbnother"},
      {"abtestanother", "ctestbnother"},
      {"atestabnother", "btestcnother"},
      {"abtestabnother", "ctestcnother"},
      {"abtestabcnother", "ctestcdenother"},
      {"defdefdefc", "ghghghde"},
  };
  for (String[] testCase : cases) {
    StringBuilder text = new StringBuilder(testCase[0]);
    Dictionary.applyMappings(fst, text);
    assertEquals(testCase[1], text.toString());
  }
}
Example use of org.apache.lucene.util.IntsRefBuilder in the lucene-solr project (Apache): the getSingleton method of class Operations.
/** If this automaton accepts exactly one input, return it. Else, return null.
 * The automaton must be deterministic. */
public static IntsRef getSingleton(Automaton a) {
  if (a.isDeterministic() == false) {
    throw new IllegalArgumentException("input automaton must be deterministic");
  }
  final IntsRefBuilder singleton = new IntsRefBuilder();
  final HashSet<Integer> seen = new HashSet<>();
  final Transition scratch = new Transition();
  int state = 0;
  for (;;) {
    seen.add(state);
    if (a.isAccept(state)) {
      // an accept state with no way out means the labels collected so far
      // form the unique accepted input; any outgoing transition would allow more
      return a.getNumTransitions(state) == 0 ? singleton.get() : null;
    }
    if (a.getNumTransitions(state) != 1) {
      // a branch (or a dead end) means the language is not a single string
      return null;
    }
    a.getTransition(state, 0, scratch);
    if (scratch.min != scratch.max || seen.contains(scratch.dest)) {
      // a label range admits several characters; a revisited state means a cycle
      return null;
    }
    singleton.append(scratch.min);
    state = scratch.dest;
  }
}
Example use of org.apache.lucene.util.IntsRefBuilder in the lucene-solr project (Apache): the intersectPrefixPaths method of class FSTUtil.
/**
 * Enumerates all minimal prefix paths in the automaton that also intersect the FST,
 * accumulating the FST end node and output for each path.
 *
 * @param a deterministic automaton whose accept states terminate the prefixes
 * @param fst the FST to intersect against
 * @return one Path per minimal accepted prefix that exists in the FST, carrying the
 *         reached FST arc, the accumulated output, and the accumulated input labels
 * @throws IOException if reading the FST fails
 */
public static <T> List<Path<T>> intersectPrefixPaths(Automaton a, FST<T> fst) throws IOException {
  assert a.isDeterministic();
  final List<Path<T>> queue = new ArrayList<>();
  final List<Path<T>> endNodes = new ArrayList<>();
  // an automaton with no states accepts nothing, so there are no paths to enumerate
  if (a.getNumStates() == 0) {
    return endNodes;
  }
  // seed the search: automaton start state, FST root arc, empty output, empty input
  queue.add(new Path<>(0, fst.getFirstArc(new FST.Arc<T>()), fst.outputs.getNoOutput(), new IntsRefBuilder()));
  final FST.Arc<T> scratchArc = new FST.Arc<>();
  final FST.BytesReader fstReader = fst.getBytesReader();
  Transition t = new Transition();
  // depth-first traversal of the automaton x FST product (pop from the end of the queue)
  while (queue.size() != 0) {
    final Path<T> path = queue.remove(queue.size() - 1);
    if (a.isAccept(path.state)) {
      endNodes.add(path);
      // we accept all further paths too, so this prefix is already minimal: stop extending
      continue;
    }
    IntsRefBuilder currentInput = path.input;
    int count = a.initTransition(path.state, t);
    for (int i = 0; i < count; i++) {
      a.getNextTransition(t);
      final int min = t.min;
      final int max = t.max;
      if (min == max) {
        // single-label transition: direct FST target-arc lookup
        final FST.Arc<T> nextArc = fst.findTargetArc(t.min, path.fstNode, scratchArc, fstReader);
        if (nextArc != null) {
          // each queued path needs its own copy of the accumulated input labels
          final IntsRefBuilder newInput = new IntsRefBuilder();
          newInput.copyInts(currentInput.get());
          newInput.append(t.min);
          queue.add(new Path<>(t.dest, new FST.Arc<T>().copyFrom(nextArc), fst.outputs.add(path.output, nextArc.output), newInput));
        }
      } else {
        // label range: walk every FST arc whose label lies within [min, max]
        // TODO: if this transition's TO state is accepting, and
        // it accepts the entire range possible in the FST (ie. 0 to 255),
        // we can simply use the prefix as the accepted state instead of
        // looking up all the ranges and terminate early
        // here. This just shifts the work from one queue
        // (this one) to another (the completion search
        // done in AnalyzingSuggester).
        FST.Arc<T> nextArc = Util.readCeilArc(min, fst, path.fstNode, scratchArc, fstReader);
        while (nextArc != null && nextArc.label <= max) {
          assert nextArc.label <= max;
          assert nextArc.label >= min : nextArc.label + " " + min;
          // independent input copy for this branch as well
          final IntsRefBuilder newInput = new IntsRefBuilder();
          newInput.copyInts(currentInput.get());
          newInput.append(nextArc.label);
          queue.add(new Path<>(t.dest, new FST.Arc<T>().copyFrom(nextArc), fst.outputs.add(path.output, nextArc.output), newInput));
          // used in assert: remember the label before advancing the scratch arc
          final int label = nextArc.label;
          nextArc = nextArc.isLast() ? null : fst.readNextRealArc(nextArc, fstReader);
          assert nextArc == null || label < nextArc.label : "last: " + label + " next: " + nextArc.label;
        }
      }
    }
  }
  return endNodes;
}
Aggregations