Search in sources :

Example 1 with Builder

use of org.apache.lucene.util.fst.Builder in project elasticsearch by elastic.

the class XAnalyzingSuggester method build.

@Override
public void build(InputIterator iterator) throws IOException {
    String prefix = getClass().getSimpleName();
    Directory tempDir = getTempDir();
    OfflineSorter sorter = new OfflineSorter(tempDir, prefix, new AnalyzingComparator(hasPayloads));
    IndexOutput tempInput = tempDir.createTempOutput(prefix, "input", IOContext.DEFAULT);
    OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
    OfflineSorter.ByteSequencesReader reader = null;
    hasPayloads = iterator.hasPayloads();
    BytesRefBuilder scratch = new BytesRefBuilder();
    TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();
    String tempSortedFileName = null;
    count = 0;
    byte[] buffer = new byte[8];
    try {
        ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
        for (BytesRef surfaceForm; (surfaceForm = iterator.next()) != null; ) {
            LimitedFiniteStringsIterator finiteStrings = new LimitedFiniteStringsIterator(toAutomaton(surfaceForm, ts2a), maxGraphExpansions);
            for (IntsRef string; (string = finiteStrings.next()) != null; count++) {
                Util.toBytesRef(string, scratch);
                // length of the analyzed text (FST input)
                if (scratch.length() > Short.MAX_VALUE - 2) {
                    throw new IllegalArgumentException("cannot handle analyzed forms > " + (Short.MAX_VALUE - 2) + " in length (got " + scratch.length() + ")");
                }
                short analyzedLength = (short) scratch.length();
                // compute the required length:
                // analyzed sequence + weight (4) + surface + analyzedLength (short)
                int requiredLength = analyzedLength + 4 + surfaceForm.length + 2;
                BytesRef payload;
                if (hasPayloads) {
                    if (surfaceForm.length > (Short.MAX_VALUE - 2)) {
                        throw new IllegalArgumentException("cannot handle surface form > " + (Short.MAX_VALUE - 2) + " in length (got " + surfaceForm.length + ")");
                    }
                    payload = iterator.payload();
                    // payload + surfaceLength (short)
                    requiredLength += payload.length + 2;
                } else {
                    payload = null;
                }
                buffer = ArrayUtil.grow(buffer, requiredLength);
                output.reset(buffer);
                output.writeShort(analyzedLength);
                output.writeBytes(scratch.bytes(), 0, scratch.length());
                output.writeInt(encodeWeight(iterator.weight()));
                if (hasPayloads) {
                    for (int i = 0; i < surfaceForm.length; i++) {
                        if (surfaceForm.bytes[i] == payloadSep) {
                            throw new IllegalArgumentException("surface form cannot contain unit separator character U+001F; this character is reserved");
                        }
                    }
                    output.writeShort((short) surfaceForm.length);
                    output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
                    output.writeBytes(payload.bytes, payload.offset, payload.length);
                } else {
                    output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
                }
                assert output.getPosition() == requiredLength : output.getPosition() + " vs " + requiredLength;
                writer.write(buffer, 0, output.getPosition());
            }
            maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, finiteStrings.size());
        }
        writer.close();
        // Sort all input/output pairs (required by FST.Builder):
        tempSortedFileName = sorter.sort(tempInput.getName());
        // Free disk space:
        tempDir.deleteFile(tempInput.getName());
        reader = new OfflineSorter.ByteSequencesReader(tempDir.openChecksumInput(tempSortedFileName, IOContext.READONCE), prefix);
        PairOutputs<Long, BytesRef> outputs = new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
        Builder<Pair<Long, BytesRef>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
        // Build FST:
        BytesRefBuilder previousAnalyzed = null;
        BytesRefBuilder analyzed = new BytesRefBuilder();
        BytesRef surface = new BytesRef();
        IntsRefBuilder scratchInts = new IntsRefBuilder();
        ByteArrayDataInput input = new ByteArrayDataInput();
        // Used to remove duplicate surface forms (but we
        // still index the hightest-weight one).  We clear
        // this when we see a new analyzed form, so it cannot
        // grow unbounded (at most 256 entries):
        Set<BytesRef> seenSurfaceForms = new HashSet<>();
        int dedup = 0;
        while (true) {
            BytesRef bytes = reader.next();
            if (bytes == null) {
                break;
            }
            input.reset(bytes.bytes, bytes.offset, bytes.length);
            short analyzedLength = input.readShort();
            analyzed.grow(analyzedLength + 2);
            input.readBytes(analyzed.bytes(), 0, analyzedLength);
            analyzed.setLength(analyzedLength);
            long cost = input.readInt();
            surface.bytes = bytes.bytes;
            if (hasPayloads) {
                surface.length = input.readShort();
                surface.offset = input.getPosition();
            } else {
                surface.offset = input.getPosition();
                surface.length = bytes.length - surface.offset;
            }
            if (previousAnalyzed == null) {
                previousAnalyzed = new BytesRefBuilder();
                previousAnalyzed.copyBytes(analyzed);
                seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
            } else if (analyzed.get().equals(previousAnalyzed.get())) {
                dedup++;
                if (dedup >= maxSurfaceFormsPerAnalyzedForm) {
                    // dups: skip the rest:
                    continue;
                }
                if (seenSurfaceForms.contains(surface)) {
                    continue;
                }
                seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
            } else {
                dedup = 0;
                previousAnalyzed.copyBytes(analyzed);
                seenSurfaceForms.clear();
                seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
            }
            // TODO: I think we can avoid the extra 2 bytes when
            // there is no dup (dedup==0), but we'd have to fix
            // the exactFirst logic ... which would be sort of
            // hairy because we'd need to special case the two
            // (dup/not dup)...
            // NOTE: must be byte 0 so we sort before whatever
            // is next
            analyzed.append((byte) 0);
            analyzed.append((byte) dedup);
            Util.toIntsRef(analyzed.get(), scratchInts);
            //System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString());
            if (!hasPayloads) {
                builder.add(scratchInts.get(), outputs.newPair(cost, BytesRef.deepCopyOf(surface)));
            } else {
                int payloadOffset = input.getPosition() + surface.length;
                int payloadLength = bytes.length - payloadOffset;
                BytesRef br = new BytesRef(surface.length + 1 + payloadLength);
                System.arraycopy(surface.bytes, surface.offset, br.bytes, 0, surface.length);
                br.bytes[surface.length] = (byte) payloadSep;
                System.arraycopy(bytes.bytes, payloadOffset, br.bytes, surface.length + 1, payloadLength);
                br.length = br.bytes.length;
                builder.add(scratchInts.get(), outputs.newPair(cost, br));
            }
        }
        fst = builder.finish();
    //PrintWriter pw = new PrintWriter("/tmp/out.dot");
    //Util.toDot(fst, pw, true, true);
    //pw.close();
    } finally {
        IOUtils.closeWhileHandlingException(reader, writer);
        IOUtils.deleteFilesIgnoringExceptions(tempDir, tempInput.getName(), tempSortedFileName);
    }
}
Also used : OfflineSorter(org.apache.lucene.util.OfflineSorter) Builder(org.apache.lucene.util.fst.Builder) BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) PairOutputs(org.apache.lucene.util.fst.PairOutputs) IntsRef(org.apache.lucene.util.IntsRef) BytesRef(org.apache.lucene.util.BytesRef) Directory(org.apache.lucene.store.Directory) FSDirectory(org.apache.lucene.store.FSDirectory) Pair(org.apache.lucene.util.fst.PairOutputs.Pair) HashSet(java.util.HashSet) BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) IndexOutput(org.apache.lucene.store.IndexOutput) LimitedFiniteStringsIterator(org.apache.lucene.util.automaton.LimitedFiniteStringsIterator) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) ByteArrayDataInput(org.apache.lucene.store.ByteArrayDataInput) ByteArrayDataOutput(org.apache.lucene.store.ByteArrayDataOutput) TokenStreamToAutomaton(org.apache.lucene.analysis.TokenStreamToAutomaton)

Example 2 with Builder

use of org.apache.lucene.util.fst.Builder in project lucene-solr by apache.

the class AnalyzingSuggester method build.

@Override
public void build(InputIterator iterator) throws IOException {
    if (iterator.hasContexts()) {
        throw new IllegalArgumentException("this suggester doesn't support contexts");
    }
    hasPayloads = iterator.hasPayloads();
    OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix, new AnalyzingComparator(hasPayloads));
    IndexOutput tempInput = tempDir.createTempOutput(tempFileNamePrefix, "input", IOContext.DEFAULT);
    OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
    OfflineSorter.ByteSequencesReader reader = null;
    BytesRefBuilder scratch = new BytesRefBuilder();
    TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();
    String tempSortedFileName = null;
    count = 0;
    byte[] buffer = new byte[8];
    try {
        ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
        for (BytesRef surfaceForm; (surfaceForm = iterator.next()) != null; ) {
            LimitedFiniteStringsIterator finiteStrings = new LimitedFiniteStringsIterator(toAutomaton(surfaceForm, ts2a), maxGraphExpansions);
            for (IntsRef string; (string = finiteStrings.next()) != null; count++) {
                Util.toBytesRef(string, scratch);
                // length of the analyzed text (FST input)
                if (scratch.length() > Short.MAX_VALUE - 2) {
                    throw new IllegalArgumentException("cannot handle analyzed forms > " + (Short.MAX_VALUE - 2) + " in length (got " + scratch.length() + ")");
                }
                short analyzedLength = (short) scratch.length();
                // compute the required length:
                // analyzed sequence + weight (4) + surface + analyzedLength (short)
                int requiredLength = analyzedLength + 4 + surfaceForm.length + 2;
                BytesRef payload;
                if (hasPayloads) {
                    if (surfaceForm.length > (Short.MAX_VALUE - 2)) {
                        throw new IllegalArgumentException("cannot handle surface form > " + (Short.MAX_VALUE - 2) + " in length (got " + surfaceForm.length + ")");
                    }
                    payload = iterator.payload();
                    // payload + surfaceLength (short)
                    requiredLength += payload.length + 2;
                } else {
                    payload = null;
                }
                buffer = ArrayUtil.grow(buffer, requiredLength);
                output.reset(buffer);
                output.writeShort(analyzedLength);
                output.writeBytes(scratch.bytes(), 0, scratch.length());
                output.writeInt(encodeWeight(iterator.weight()));
                if (hasPayloads) {
                    for (int i = 0; i < surfaceForm.length; i++) {
                        if (surfaceForm.bytes[i] == PAYLOAD_SEP) {
                            throw new IllegalArgumentException("surface form cannot contain unit separator character U+001F; this character is reserved");
                        }
                    }
                    output.writeShort((short) surfaceForm.length);
                    output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
                    output.writeBytes(payload.bytes, payload.offset, payload.length);
                } else {
                    output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
                }
                assert output.getPosition() == requiredLength : output.getPosition() + " vs " + requiredLength;
                writer.write(buffer, 0, output.getPosition());
            }
            maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, finiteStrings.size());
        }
        CodecUtil.writeFooter(tempInput);
        writer.close();
        // Sort all input/output pairs (required by FST.Builder):
        tempSortedFileName = sorter.sort(tempInput.getName());
        // Free disk space:
        tempDir.deleteFile(tempInput.getName());
        reader = new OfflineSorter.ByteSequencesReader(tempDir.openChecksumInput(tempSortedFileName, IOContext.READONCE), tempSortedFileName);
        PairOutputs<Long, BytesRef> outputs = new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
        Builder<Pair<Long, BytesRef>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
        // Build FST:
        BytesRefBuilder previousAnalyzed = null;
        BytesRefBuilder analyzed = new BytesRefBuilder();
        BytesRef surface = new BytesRef();
        IntsRefBuilder scratchInts = new IntsRefBuilder();
        ByteArrayDataInput input = new ByteArrayDataInput();
        // Used to remove duplicate surface forms (but we
        // still index the hightest-weight one).  We clear
        // this when we see a new analyzed form, so it cannot
        // grow unbounded (at most 256 entries):
        Set<BytesRef> seenSurfaceForms = new HashSet<>();
        int dedup = 0;
        while (true) {
            BytesRef bytes = reader.next();
            if (bytes == null) {
                break;
            }
            input.reset(bytes.bytes, bytes.offset, bytes.length);
            short analyzedLength = input.readShort();
            analyzed.grow(analyzedLength + 2);
            input.readBytes(analyzed.bytes(), 0, analyzedLength);
            analyzed.setLength(analyzedLength);
            long cost = input.readInt();
            surface.bytes = bytes.bytes;
            if (hasPayloads) {
                surface.length = input.readShort();
                surface.offset = input.getPosition();
            } else {
                surface.offset = input.getPosition();
                surface.length = bytes.length - surface.offset;
            }
            if (previousAnalyzed == null) {
                previousAnalyzed = new BytesRefBuilder();
                previousAnalyzed.copyBytes(analyzed.get());
                seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
            } else if (analyzed.get().equals(previousAnalyzed.get())) {
                dedup++;
                if (dedup >= maxSurfaceFormsPerAnalyzedForm) {
                    // dups: skip the rest:
                    continue;
                }
                if (seenSurfaceForms.contains(surface)) {
                    continue;
                }
                seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
            } else {
                dedup = 0;
                previousAnalyzed.copyBytes(analyzed);
                seenSurfaceForms.clear();
                seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
            }
            // TODO: I think we can avoid the extra 2 bytes when
            // there is no dup (dedup==0), but we'd have to fix
            // the exactFirst logic ... which would be sort of
            // hairy because we'd need to special case the two
            // (dup/not dup)...
            // NOTE: must be byte 0 so we sort before whatever
            // is next
            analyzed.append((byte) 0);
            analyzed.append((byte) dedup);
            Util.toIntsRef(analyzed.get(), scratchInts);
            //System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString());
            if (!hasPayloads) {
                builder.add(scratchInts.get(), outputs.newPair(cost, BytesRef.deepCopyOf(surface)));
            } else {
                int payloadOffset = input.getPosition() + surface.length;
                int payloadLength = bytes.length - payloadOffset;
                BytesRef br = new BytesRef(surface.length + 1 + payloadLength);
                System.arraycopy(surface.bytes, surface.offset, br.bytes, 0, surface.length);
                br.bytes[surface.length] = PAYLOAD_SEP;
                System.arraycopy(bytes.bytes, payloadOffset, br.bytes, surface.length + 1, payloadLength);
                br.length = br.bytes.length;
                builder.add(scratchInts.get(), outputs.newPair(cost, br));
            }
        }
        fst = builder.finish();
    //Util.dotToFile(fst, "/tmp/suggest.dot");
    } finally {
        IOUtils.closeWhileHandlingException(reader, writer);
        IOUtils.deleteFilesIgnoringExceptions(tempDir, tempInput.getName(), tempSortedFileName);
    }
}
Also used : OfflineSorter(org.apache.lucene.util.OfflineSorter) BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) Builder(org.apache.lucene.util.fst.Builder) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) PairOutputs(org.apache.lucene.util.fst.PairOutputs) IntsRef(org.apache.lucene.util.IntsRef) BytesRef(org.apache.lucene.util.BytesRef) Pair(org.apache.lucene.util.fst.PairOutputs.Pair) HashSet(java.util.HashSet) BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) IndexOutput(org.apache.lucene.store.IndexOutput) LimitedFiniteStringsIterator(org.apache.lucene.util.automaton.LimitedFiniteStringsIterator) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) ByteArrayDataInput(org.apache.lucene.store.ByteArrayDataInput) ByteArrayDataOutput(org.apache.lucene.store.ByteArrayDataOutput) TokenStreamToAutomaton(org.apache.lucene.analysis.TokenStreamToAutomaton)

Example 3 with Builder

use of org.apache.lucene.util.fst.Builder in project lucene-solr by apache.

the class WFSTCompletionLookup method build.

@Override
public void build(InputIterator iterator) throws IOException {
    if (iterator.hasPayloads()) {
        throw new IllegalArgumentException("this suggester doesn't support payloads");
    }
    if (iterator.hasContexts()) {
        throw new IllegalArgumentException("this suggester doesn't support contexts");
    }
    count = 0;
    BytesRef scratch = new BytesRef();
    InputIterator iter = new WFSTInputIterator(tempDir, tempFileNamePrefix, iterator);
    IntsRefBuilder scratchInts = new IntsRefBuilder();
    BytesRefBuilder previous = null;
    PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
    Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
    while ((scratch = iter.next()) != null) {
        long cost = iter.weight();
        if (previous == null) {
            previous = new BytesRefBuilder();
        } else if (scratch.equals(previous.get())) {
            // for duplicate suggestions, the best weight is actually
            continue;
        // added
        }
        Util.toIntsRef(scratch, scratchInts);
        builder.add(scratchInts.get(), cost);
        previous.copyBytes(scratch);
        count++;
    }
    fst = builder.finish();
}
Also used : InputIterator(org.apache.lucene.search.suggest.InputIterator) SortedInputIterator(org.apache.lucene.search.suggest.SortedInputIterator) BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) PositiveIntOutputs(org.apache.lucene.util.fst.PositiveIntOutputs) BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) Builder(org.apache.lucene.util.fst.Builder) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) BytesRef(org.apache.lucene.util.BytesRef)

Example 4 with Builder

use of org.apache.lucene.util.fst.Builder in project lucene-solr by apache.

the class TestDictionary method testReplacements.

public void testReplacements() throws Exception {
    Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
    Builder<CharsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs);
    IntsRefBuilder scratchInts = new IntsRefBuilder();
    // a -> b
    Util.toUTF16("a", scratchInts);
    builder.add(scratchInts.get(), new CharsRef("b"));
    // ab -> c
    Util.toUTF16("ab", scratchInts);
    builder.add(scratchInts.get(), new CharsRef("c"));
    // c -> de
    Util.toUTF16("c", scratchInts);
    builder.add(scratchInts.get(), new CharsRef("de"));
    // def -> gh
    Util.toUTF16("def", scratchInts);
    builder.add(scratchInts.get(), new CharsRef("gh"));
    FST<CharsRef> fst = builder.finish();
    StringBuilder sb = new StringBuilder("atestanother");
    Dictionary.applyMappings(fst, sb);
    assertEquals("btestbnother", sb.toString());
    sb = new StringBuilder("abtestanother");
    Dictionary.applyMappings(fst, sb);
    assertEquals("ctestbnother", sb.toString());
    sb = new StringBuilder("atestabnother");
    Dictionary.applyMappings(fst, sb);
    assertEquals("btestcnother", sb.toString());
    sb = new StringBuilder("abtestabnother");
    Dictionary.applyMappings(fst, sb);
    assertEquals("ctestcnother", sb.toString());
    sb = new StringBuilder("abtestabcnother");
    Dictionary.applyMappings(fst, sb);
    assertEquals("ctestcdenother", sb.toString());
    sb = new StringBuilder("defdefdefc");
    Dictionary.applyMappings(fst, sb);
    assertEquals("ghghghde", sb.toString());
}
Also used : IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) Builder(org.apache.lucene.util.fst.Builder) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) CharsRef(org.apache.lucene.util.CharsRef)

Example 5 with Builder

use of org.apache.lucene.util.fst.Builder in project lucene-solr by apache.

the class TokenInfoDictionaryBuilder method buildDictionary.

public TokenInfoDictionaryWriter buildDictionary(List<File> csvFiles) throws IOException {
    TokenInfoDictionaryWriter dictionary = new TokenInfoDictionaryWriter(10 * 1024 * 1024);
    // all lines in the file
    System.out.println("  parse...");
    List<String[]> lines = new ArrayList<>(400000);
    for (File file : csvFiles) {
        FileInputStream inputStream = new FileInputStream(file);
        Charset cs = Charset.forName(encoding);
        CharsetDecoder decoder = cs.newDecoder().onMalformedInput(CodingErrorAction.REPORT).onUnmappableCharacter(CodingErrorAction.REPORT);
        InputStreamReader streamReader = new InputStreamReader(inputStream, decoder);
        BufferedReader reader = new BufferedReader(streamReader);
        String line = null;
        while ((line = reader.readLine()) != null) {
            String[] entry = CSVUtil.parse(line);
            if (entry.length < 13) {
                System.out.println("Entry in CSV is not valid: " + line);
                continue;
            }
            String[] formatted = formatEntry(entry);
            lines.add(formatted);
            // NFKC normalize dictionary entry
            if (normalizeEntries) {
                if (normalizer.isNormalized(entry[0])) {
                    continue;
                }
                String[] normalizedEntry = new String[entry.length];
                for (int i = 0; i < entry.length; i++) {
                    normalizedEntry[i] = normalizer.normalize(entry[i]);
                }
                formatted = formatEntry(normalizedEntry);
                lines.add(formatted);
            }
        }
    }
    System.out.println("  sort...");
    // sort by term: we sorted the files already and use a stable sort.
    Collections.sort(lines, new Comparator<String[]>() {

        public int compare(String[] left, String[] right) {
            return left[0].compareTo(right[0]);
        }
    });
    System.out.println("  encode...");
    PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
    Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, true, 15);
    IntsRefBuilder scratch = new IntsRefBuilder();
    // first ord will be 0
    long ord = -1;
    String lastValue = null;
    // build tokeninfo dictionary
    for (String[] entry : lines) {
        int next = dictionary.put(entry);
        if (next == offset) {
            System.out.println("Failed to process line: " + Arrays.toString(entry));
            continue;
        }
        String token = entry[0];
        if (!token.equals(lastValue)) {
            // new word to add to fst
            ord++;
            lastValue = token;
            scratch.grow(token.length());
            scratch.setLength(token.length());
            for (int i = 0; i < token.length(); i++) {
                scratch.setIntAt(i, (int) token.charAt(i));
            }
            fstBuilder.add(scratch.get(), ord);
        }
        dictionary.addMapping((int) ord, offset);
        offset = next;
    }
    final FST<Long> fst = fstBuilder.finish();
    System.out.print("  " + fstBuilder.getNodeCount() + " nodes, " + fstBuilder.getArcCount() + " arcs, " + fst.ramBytesUsed() + " bytes...  ");
    dictionary.setFST(fst);
    System.out.println(" done");
    return dictionary;
}
Also used : CharsetDecoder(java.nio.charset.CharsetDecoder) InputStreamReader(java.io.InputStreamReader) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) Builder(org.apache.lucene.util.fst.Builder) ArrayList(java.util.ArrayList) Charset(java.nio.charset.Charset) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) FileInputStream(java.io.FileInputStream) PositiveIntOutputs(org.apache.lucene.util.fst.PositiveIntOutputs) BufferedReader(java.io.BufferedReader) File(java.io.File)

Aggregations

IntsRefBuilder (org.apache.lucene.util.IntsRefBuilder)10 Builder (org.apache.lucene.util.fst.Builder)10 BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder)7 BytesRef (org.apache.lucene.util.BytesRef)5 CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder)4 PositiveIntOutputs (org.apache.lucene.util.fst.PositiveIntOutputs)4 Map (java.util.Map)3 IntsRef (org.apache.lucene.util.IntsRef)3 ArrayList (java.util.ArrayList)2 HashMap (java.util.HashMap)2 HashSet (java.util.HashSet)2 LinkedHashMap (java.util.LinkedHashMap)2 TreeMap (java.util.TreeMap)2 TokenStreamToAutomaton (org.apache.lucene.analysis.TokenStreamToAutomaton)2 ByteArrayDataInput (org.apache.lucene.store.ByteArrayDataInput)2 ByteArrayDataOutput (org.apache.lucene.store.ByteArrayDataOutput)2 Directory (org.apache.lucene.store.Directory)2 FSDirectory (org.apache.lucene.store.FSDirectory)2 IndexOutput (org.apache.lucene.store.IndexOutput)2 CharsRef (org.apache.lucene.util.CharsRef)2