Search in sources:

Example 6 with IntsRefBuilder

use of org.apache.lucene.util.IntsRefBuilder in project lucene-solr by apache.

the class BaseSynonymParserTestCase method assertEntryAbsent.

/**
   * Asserts that the given synonym map holds no entry for a word.
   * <p>
   * Every space in {@code word} is swapped for {@link SynonymMap#WORD_SEPARATOR} before the
   * lookup, and the lookup key is the UTF-32 code point sequence of the resulting string.
   *
   * @param synonynMap  the generated synonym map after parsing
   * @param word        word (phrase) whose synonyms are being checked. Should be the value
   *                    that comes out of the analyzer.
   * @throws IOException if the lookup in the map's FST fails
   */
public static void assertEntryAbsent(SynonymMap synonynMap, String word) throws IOException {
    // Normalize the phrase the same way the map stores multi-word keys.
    final String lookupKey = word.replace(' ', SynonymMap.WORD_SEPARATOR);
    final IntsRefBuilder codePoints = new IntsRefBuilder();
    final BytesRef synonyms = Util.get(synonynMap.fst, Util.toUTF32(new CharsRef(lookupKey), codePoints));
    assertNull("There should be no synonyms for: " + lookupKey, synonyms);
}
Also used : IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) BytesRef(org.apache.lucene.util.BytesRef) CharsRef(org.apache.lucene.util.CharsRef)

Example 7 with IntsRefBuilder

use of org.apache.lucene.util.IntsRefBuilder in project lucene-solr by apache.

the class Dictionary method readDictionaryFiles.

/**
   * Reads the dictionary file through the provided InputStreams, building up the words map.
   * <p>
   * The work happens in three phases:
   * <ol>
   *   <li>every usable line of every dictionary is unescaped (and optionally cleaned) and
   *       written to a temporary output created in {@code tempDir};</li>
   *   <li>that temporary file is sorted with an {@link OfflineSorter}, ordering rows by the
   *       bytes that precede the last flag/morph separator and breaking ties on the whole row;</li>
   *   <li>the sorted rows are read back, rows with the same entry text are grouped, and each
   *       group is added to {@code words} as (UTF-32 entry, list of ordinals).</li>
   * </ol>
   * Both temporary files are deleted before the method returns from the sort and read phases.
   *
   * @param tempDir directory in which the unsorted and sorted temporary files are created
   * @param tempFileNamePrefix name prefix used for the temporary output and passed to the sorter
   * @param dictionaries InputStreams to read the dictionary file through
   * @param decoder CharsetDecoder used to decode the contents of the file
   * @param words builder that receives one entry per distinct word
   * @throws IOException Can be thrown while reading from the file
   * @throws IllegalArgumentException if the sorted rows are found to be out of order
   */
private void readDictionaryFiles(Directory tempDir, String tempFileNamePrefix, List<InputStream> dictionaries, CharsetDecoder decoder, Builder<IntsRef> words) throws IOException {
    // scratch buffers reused for every row: encoded flags, and the UTF-32 form of an entry
    BytesRefBuilder flagsScratch = new BytesRefBuilder();
    IntsRefBuilder scratchInts = new IntsRefBuilder();
    // shared buffer handed to cleanInput()
    StringBuilder sb = new StringBuilder();
    // ---- phase 1: dump all usable dictionary lines into an unsorted temp file ----
    IndexOutput unsorted = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT);
    try (ByteSequencesWriter writer = new ByteSequencesWriter(unsorted)) {
        for (InputStream dictionary : dictionaries) {
            BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
            // first line is number of entries (approximately, sometimes); it is read and discarded
            String line = lines.readLine();
            while ((line = lines.readLine()) != null) {
                // wild and unpredictable code comment rules:
                // skip empty lines and lines starting with '/', '#' or a tab
                if (line.isEmpty() || line.charAt(0) == '/' || line.charAt(0) == '#' || line.charAt(0) == '\t') {
                    continue;
                }
                line = unescapeEntry(line);
                // if we havent seen any stem exceptions, try to parse one
                // (once the flag flips to true this check is never repeated)
                if (hasStemExceptions == false) {
                    int morphStart = line.indexOf(MORPH_SEPARATOR);
                    // the second condition always holds when indexOf found a match
                    if (morphStart >= 0 && morphStart < line.length()) {
                        hasStemExceptions = parseStemException(line.substring(morphStart + 1)) != null;
                    }
                }
                if (needsInputCleaning) {
                    // only the word part is cleaned; it ends at the flag separator or,
                    // when there is none, at the morph separator
                    int flagSep = line.indexOf(FLAG_SEPARATOR);
                    if (flagSep == -1) {
                        flagSep = line.indexOf(MORPH_SEPARATOR);
                    }
                    if (flagSep == -1) {
                        // no separator at all: the whole line is the word
                        CharSequence cleansed = cleanInput(line, sb);
                        writer.write(cleansed.toString().getBytes(StandardCharsets.UTF_8));
                    } else {
                        String text = line.substring(0, flagSep);
                        CharSequence cleansed = cleanInput(text, sb);
                        // cleanInput may hand back something other than sb; in that case
                        // copy the result into sb so the remainder can be appended to it
                        if (cleansed != sb) {
                            sb.setLength(0);
                            sb.append(cleansed);
                        }
                        // re-attach the separator and everything after it, untouched
                        sb.append(line.substring(flagSep));
                        writer.write(sb.toString().getBytes(StandardCharsets.UTF_8));
                    }
                } else {
                    writer.write(line.getBytes(StandardCharsets.UTF_8));
                }
            }
        }
        CodecUtil.writeFooter(unsorted);
    }
    // ---- phase 2: sort the rows by entry text ----
    OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix, new Comparator<BytesRef>() {

        // views over o1/o2 that are reused on every compare() call
        BytesRef scratch1 = new BytesRef();

        BytesRef scratch2 = new BytesRef();

        @Override
        public int compare(BytesRef o1, BytesRef o2) {
            // point scratch1 at o1's bytes, then cut it at the LAST flag/morph separator
            // (the scan runs from the end of the row towards the start)
            scratch1.bytes = o1.bytes;
            scratch1.offset = o1.offset;
            scratch1.length = o1.length;
            for (int i = scratch1.length - 1; i >= 0; i--) {
                if (scratch1.bytes[scratch1.offset + i] == FLAG_SEPARATOR || scratch1.bytes[scratch1.offset + i] == MORPH_SEPARATOR) {
                    scratch1.length = i;
                    break;
                }
            }
            // same truncation for o2
            scratch2.bytes = o2.bytes;
            scratch2.offset = o2.offset;
            scratch2.length = o2.length;
            for (int i = scratch2.length - 1; i >= 0; i--) {
                if (scratch2.bytes[scratch2.offset + i] == FLAG_SEPARATOR || scratch2.bytes[scratch2.offset + i] == MORPH_SEPARATOR) {
                    scratch2.length = i;
                    break;
                }
            }
            int cmp = scratch1.compareTo(scratch2);
            if (cmp == 0) {
                // tie break on whole row
                return o1.compareTo(o2);
            } else {
                return cmp;
            }
        }
    });
    String sorted;
    boolean success = false;
    try {
        sorted = sorter.sort(unsorted.getName());
        success = true;
    } finally {
        // the unsorted file is no longer needed either way; on failure, deletion errors
        // are ignored so they do not mask the exception from the sort
        if (success) {
            tempDir.deleteFile(unsorted.getName());
        } else {
            IOUtils.deleteFilesIgnoringExceptions(tempDir, unsorted.getName());
        }
    }
    // ---- phase 3: read sorted rows, group identical entries, feed the builder ----
    boolean success2 = false;
    try (ByteSequencesReader reader = new ByteSequencesReader(tempDir.openChecksumInput(sorted, IOContext.READONCE), sorted)) {
        // TODO: the flags themselves can be double-chars (long) or also numeric
        // either way the trick is to encode them as char... but they must be parsed differently
        // entry text of the group currently being accumulated, and its ordinals
        String currentEntry = null;
        IntsRefBuilder currentOrds = new IntsRefBuilder();
        while (true) {
            BytesRef scratch = reader.next();
            if (scratch == null) {
                // end of the sorted file
                break;
            }
            String line = scratch.utf8ToString();
            String entry;
            char[] wordForm;
            int end;
            // NOTE(review): both branches use indexOf(MORPH_SEPARATOR) as a substring bound
            // without checking for -1 — presumably unescapeEntry() guarantees the separator
            // is present in every row; confirm.
            int flagSep = line.indexOf(FLAG_SEPARATOR);
            if (flagSep == -1) {
                // no flags on this row: the entry runs up to the morph separator
                wordForm = NOFLAGS;
                end = line.indexOf(MORPH_SEPARATOR);
                entry = line.substring(0, end);
            } else {
                // flags sit between the flag separator and the morph separator
                end = line.indexOf(MORPH_SEPARATOR);
                String flagPart = line.substring(flagSep + 1, end);
                if (aliasCount > 0) {
                    // with aliases in use the flag part is a number that is resolved via getAliasValue
                    flagPart = getAliasValue(Integer.parseInt(flagPart));
                }
                wordForm = flagParsingStrategy.parseFlags(flagPart);
                // the parsed flags are stored in sorted order
                Arrays.sort(wordForm);
                entry = line.substring(0, flagSep);
            }
            // we possibly have morphological data
            int stemExceptionID = 0;
            if (hasStemExceptions && end + 1 < line.length()) {
                String stemException = parseStemException(line.substring(end + 1));
                if (stemException != null) {
                    // grow the stemExceptions array when it is full
                    if (stemExceptionCount == stemExceptions.length) {
                        int newSize = ArrayUtil.oversize(stemExceptionCount + 1, RamUsageEstimator.NUM_BYTES_OBJECT_REF);
                        stemExceptions = Arrays.copyOf(stemExceptions, newSize);
                    }
                    // we use '0' to indicate no exception for the form
                    // (so the ID is the array index plus one)
                    stemExceptionID = stemExceptionCount + 1;
                    stemExceptions[stemExceptionCount++] = stemException;
                }
            }
            // cmp > 0: a new entry starts; cmp == 0: another row for the current entry;
            // the very first row is treated as "greater"
            int cmp = currentEntry == null ? 1 : entry.compareTo(currentEntry);
            if (cmp < 0) {
                throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry);
            } else {
                // map this row's flag set to an ordinal in flagLookup
                encodeFlags(flagsScratch, wordForm);
                int ord = flagLookup.add(flagsScratch.get());
                if (ord < 0) {
                    // already exists in our hash
                    ord = (-ord) - 1;
                }
                // finalize current entry, and switch "current" if necessary
                if (cmp > 0 && currentEntry != null) {
                    Util.toUTF32(currentEntry, scratchInts);
                    words.add(scratchInts.get(), currentOrds.get());
                }
                // swap current
                if (cmp > 0 || currentEntry == null) {
                    currentEntry = entry;
                    // must be this way
                    // (a fresh builder is allocated instead of reusing the old one — presumably
                    // because words.add() may keep a reference to the previous ords; confirm)
                    currentOrds = new IntsRefBuilder();
                }
                // with stem exceptions each form takes two slots: (ord, stemExceptionID)
                if (hasStemExceptions) {
                    currentOrds.append(ord);
                    currentOrds.append(stemExceptionID);
                } else {
                    currentOrds.append(ord);
                }
            }
        }
        // finalize last entry
        // NOTE(review): if the sorted file contained no rows, currentEntry is still null
        // here — confirm callers never supply dictionaries without entries.
        Util.toUTF32(currentEntry, scratchInts);
        words.add(scratchInts.get(), currentOrds.get());
        success2 = true;
    } finally {
        // remove the sorted temp file; on failure, deletion errors are ignored
        if (success2) {
            tempDir.deleteFile(sorted);
        } else {
            IOUtils.deleteFilesIgnoringExceptions(tempDir, sorted);
        }
    }
}
Also used : OfflineSorter(org.apache.lucene.util.OfflineSorter) BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) InputStreamReader(java.io.InputStreamReader) BufferedInputStream(java.io.BufferedInputStream) InputStream(java.io.InputStream) IndexOutput(org.apache.lucene.store.IndexOutput) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) ByteSequencesReader(org.apache.lucene.util.OfflineSorter.ByteSequencesReader) BufferedReader(java.io.BufferedReader) ByteSequencesWriter(org.apache.lucene.util.OfflineSorter.ByteSequencesWriter) BytesRef(org.apache.lucene.util.BytesRef)

Example 8 with IntsRefBuilder

use of org.apache.lucene.util.IntsRefBuilder in project lucene-solr by apache.

the class TestDictionary method testReplacements.

/**
   * Builds a small replacement FST (a -> b, ab -> c, c -> de, def -> gh) and checks the
   * result of {@code Dictionary.applyMappings} on several inputs.
   */
public void testReplacements() throws Exception {
    Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
    Builder<CharsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs);
    IntsRefBuilder scratchInts = new IntsRefBuilder();
    // { input, replacement } pairs, added to the builder in exactly this order
    final String[][] mappings = {
        { "a", "b" },
        { "ab", "c" },
        { "c", "de" },
        { "def", "gh" }
    };
    for (String[] mapping : mappings) {
        Util.toUTF16(mapping[0], scratchInts);
        builder.add(scratchInts.get(), new CharsRef(mapping[1]));
    }
    FST<CharsRef> fst = builder.finish();
    // { text before applyMappings, expected text afterwards }
    final String[][] expectations = {
        { "atestanother", "btestbnother" },
        { "abtestanother", "ctestbnother" },
        { "atestabnother", "btestcnother" },
        { "abtestabnother", "ctestcnother" },
        { "abtestabcnother", "ctestcdenother" },
        { "defdefdefc", "ghghghde" }
    };
    for (String[] expectation : expectations) {
        StringBuilder sb = new StringBuilder(expectation[0]);
        Dictionary.applyMappings(fst, sb);
        assertEquals(expectation[1], sb.toString());
    }
}
Also used : IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) Builder(org.apache.lucene.util.fst.Builder) CharsRef(org.apache.lucene.util.CharsRef)

Example 9 with IntsRefBuilder

use of org.apache.lucene.util.IntsRefBuilder in project lucene-solr by apache.

the class Operations method getSingleton.

/**
   * Returns the one input accepted by the automaton, or null if no such single input is found.
   * <p>
   * Starting at state 0, the walk follows a state's outgoing transition only while the state
   * is non-accepting, has exactly one transition, that transition carries a single label
   * ({@code min == max}) and leads to a state not visited before. The collected labels are
   * returned when an accepting state without outgoing transitions is reached; every other
   * situation yields null.
   *
   * @param a the automaton to inspect; must be deterministic
   * @return the labels of the single accepted input, or null
   * @throws IllegalArgumentException if {@code a} is not deterministic
   */
public static IntsRef getSingleton(Automaton a) {
    if (a.isDeterministic() == false) {
        throw new IllegalArgumentException("input automaton must be deterministic");
    }
    IntsRefBuilder labels = new IntsRefBuilder();
    HashSet<Integer> seen = new HashSet<>();
    Transition edge = new Transition();
    int state = 0;
    while (true) {
        seen.add(state);
        if (a.isAccept(state)) {
            // An accepting state ends the walk: it is a singleton only if nothing leaves it.
            return a.getNumTransitions(state) == 0 ? labels.get() : null;
        }
        if (a.getNumTransitions(state) != 1) {
            // Dead end or branching on a non-accepting state.
            return null;
        }
        a.getTransition(state, 0, edge);
        if (edge.min != edge.max || seen.contains(edge.dest)) {
            // A label range, or a loop back to an earlier state.
            return null;
        }
        labels.append(edge.min);
        state = edge.dest;
    }
}
Also used : IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) HashSet(java.util.HashSet)

Example 10 with IntsRefBuilder

use of org.apache.lucene.util.IntsRefBuilder in project lucene-solr by apache.

the class FSTUtil method intersectPrefixPaths.

/**
   * Walks the automaton and the FST in lock-step and collects every path that reaches an
   * accepting automaton state while still being present in the FST.
   * <p>
   * A path is not extended once its automaton state is accepting, so only minimal prefixes
   * are returned. Each returned path carries the automaton state, a copy of the FST arc it
   * ended on, the accumulated FST output and the input labels consumed so far.
   *
   * @param a   the automaton to intersect (asserted to be deterministic)
   * @param fst the FST to intersect with
   * @return the accepted prefix paths; empty if the automaton has no states
   * @throws IOException if reading from the FST fails
   */
public static <T> List<Path<T>> intersectPrefixPaths(Automaton a, FST<T> fst) throws IOException {
    assert a.isDeterministic();
    final List<Path<T>> accepted = new ArrayList<>();
    if (a.getNumStates() == 0) {
        return accepted;
    }
    // Work list used as a stack: paths are pushed and popped at the tail.
    final List<Path<T>> pending = new ArrayList<>();
    pending.add(new Path<>(0, fst.getFirstArc(new FST.Arc<T>()), fst.outputs.getNoOutput(), new IntsRefBuilder()));
    final FST.Arc<T> scratchArc = new FST.Arc<>();
    final FST.BytesReader fstReader = fst.getBytesReader();
    final Transition transition = new Transition();
    while (!pending.isEmpty()) {
        final Path<T> path = pending.remove(pending.size() - 1);
        if (a.isAccept(path.state)) {
            // we accept all further paths too
            accepted.add(path);
            continue;
        }
        final IntsRefBuilder prefix = path.input;
        final int transitionCount = a.initTransition(path.state, transition);
        for (int i = 0; i < transitionCount; i++) {
            a.getNextTransition(transition);
            final int lo = transition.min;
            final int hi = transition.max;
            if (lo != hi) {
                // TODO: if this transition's TO state is accepting, and
                // it accepts the entire range possible in the FST (ie. 0 to 255),
                // we can simply use the prefix as the accepted state instead of
                // looking up all the ranges and terminate early
                // here.  This just shifts the work from one queue
                // (this one) to another (the completion search
                // done in AnalyzingSuggester).
                FST.Arc<T> arc = Util.readCeilArc(lo, fst, path.fstNode, scratchArc, fstReader);
                while (arc != null && arc.label <= hi) {
                    assert arc.label <= hi;
                    assert arc.label >= lo : arc.label + " " + lo;
                    final IntsRefBuilder extended = new IntsRefBuilder();
                    extended.copyInts(prefix.get());
                    extended.append(arc.label);
                    pending.add(new Path<>(transition.dest, new FST.Arc<T>().copyFrom(arc), fst.outputs.add(path.output, arc.output), extended));
                    // remembered only for the ordering assert below
                    final int previousLabel = arc.label;
                    if (arc.isLast()) {
                        arc = null;
                    } else {
                        arc = fst.readNextRealArc(arc, fstReader);
                    }
                    assert arc == null || previousLabel < arc.label : "last: " + previousLabel + " next: " + arc.label;
                }
            } else {
                // Single label: a direct lookup of the matching arc is enough.
                final FST.Arc<T> arc = fst.findTargetArc(transition.min, path.fstNode, scratchArc, fstReader);
                if (arc != null) {
                    final IntsRefBuilder extended = new IntsRefBuilder();
                    extended.copyInts(prefix.get());
                    extended.append(transition.min);
                    pending.add(new Path<>(transition.dest, new FST.Arc<T>().copyFrom(arc), fst.outputs.add(path.output, arc.output), extended));
                }
            }
        }
    }
    return accepted;
}
Also used : FST(org.apache.lucene.util.fst.FST) ArrayList(java.util.ArrayList) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) Transition(org.apache.lucene.util.automaton.Transition)

Aggregations

IntsRefBuilder (org.apache.lucene.util.IntsRefBuilder) 55, BytesRef (org.apache.lucene.util.BytesRef) 32, BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder) 25, IntsRef (org.apache.lucene.util.IntsRef) 19, ArrayList (java.util.ArrayList) 10, HashSet (java.util.HashSet) 10, Builder (org.apache.lucene.util.fst.Builder) 10, Arc (org.apache.lucene.util.fst.FST.Arc) 9, BytesReader (org.apache.lucene.util.fst.FST.BytesReader) 8, Map (java.util.Map) 7, HashMap (java.util.HashMap) 5, ByteArrayDataInput (org.apache.lucene.store.ByteArrayDataInput) 5, CharsRef (org.apache.lucene.util.CharsRef) 5, TestUtil (org.apache.lucene.util.TestUtil) 5, FSTTester.getRandomString (org.apache.lucene.util.fst.FSTTester.getRandomString) 5, FSTTester.simpleRandomString (org.apache.lucene.util.fst.FSTTester.simpleRandomString) 5, TreeMap (java.util.TreeMap) 4, CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder) 4, Pair (org.apache.lucene.util.fst.PairOutputs.Pair) 4, IOException (java.io.IOException) 3