Search in sources :

Example 1 with PositiveIntOutputs

use of org.apache.lucene.util.fst.PositiveIntOutputs in project lucene-solr by apache.

the class WFSTCompletionLookup method build.

/**
 * Rebuilds this suggester's FST from the given entries.
 *
 * <p>The input is wrapped in a {@code WFSTInputIterator}; every key it returns is added to a
 * byte-oriented FST whose output is the value reported by {@code weight()} for that key. A key
 * equal to the one added immediately before it is skipped, so only the first of a run of equal
 * keys ends up in the FST. {@code count} is reset and then incremented once per key added.
 *
 * @param iterator source of suggestions; must not carry payloads or contexts
 * @throws IllegalArgumentException if the iterator reports payloads or contexts
 * @throws IOException propagated from the wrapped iterator or the FST builder
 */
@Override
public void build(InputIterator iterator) throws IOException {
    if (iterator.hasPayloads()) {
        throw new IllegalArgumentException("this suggester doesn't support payloads");
    }
    if (iterator.hasContexts()) {
        throw new IllegalArgumentException("this suggester doesn't support contexts");
    }
    count = 0;
    InputIterator entries = new WFSTInputIterator(tempDir, tempFileNamePrefix, iterator);
    IntsRefBuilder keyInts = new IntsRefBuilder();
    Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton());
    // Stays null until the first key has been seen, so that the first key is never
    // mistaken for a duplicate (not even when it is empty).
    BytesRefBuilder lastAdded = null;
    for (BytesRef key = entries.next(); key != null; key = entries.next()) {
        long weight = entries.weight();
        if (lastAdded == null) {
            lastAdded = new BytesRefBuilder();
        } else if (key.equals(lastAdded.get())) {
            // Duplicate suggestion: per the original note, the best weight is the one that
            // was already added, so later copies are dropped.
            // NOTE(review): this relies on the ordering produced by WFSTInputIterator - confirm.
            continue;
        }
        Util.toIntsRef(key, keyInts);
        fstBuilder.add(keyInts.get(), weight);
        lastAdded.copyBytes(key);
        count++;
    }
    fst = fstBuilder.finish();
}
Also used : InputIterator(org.apache.lucene.search.suggest.InputIterator) SortedInputIterator(org.apache.lucene.search.suggest.SortedInputIterator) BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) PositiveIntOutputs(org.apache.lucene.util.fst.PositiveIntOutputs) BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) Builder(org.apache.lucene.util.fst.Builder) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) BytesRef(org.apache.lucene.util.BytesRef)

Example 2 with PositiveIntOutputs

use of org.apache.lucene.util.fst.PositiveIntOutputs in project lucene-solr by apache.

the class MemoryDocValuesConsumer method writeFST.

/**
 * Writes the given values as an FST mapping each value to its ordinal.
 *
 * <p>Metadata written to {@code meta}, in order: the field number, the {@code FST} type byte,
 * the current file pointer of {@code data}, and (after the FST itself) the number of values.
 * The FST is saved to {@code data} only when the builder produced one.
 *
 * @param field  field whose number is recorded in the metadata
 * @param values values to add, in iteration order; ordinals are assigned 0, 1, 2, ...
 * @throws IOException propagated from the outputs or the FST builder
 */
private void writeFST(FieldInfo field, Iterable<BytesRef> values) throws IOException {
    // Metadata header: which field, what kind of entry, and where its data starts.
    meta.writeVInt(field.number);
    meta.writeByte(FST);
    meta.writeLong(data.getFilePointer());

    Builder<Long> fstBuilder = new Builder<>(INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton());
    IntsRefBuilder keyInts = new IntsRefBuilder();
    long nextOrd = 0;
    for (BytesRef value : values) {
        // Each value gets the next ordinal as its FST output.
        fstBuilder.add(Util.toIntsRef(value, keyInts), nextOrd++);
    }

    FST<Long> built = fstBuilder.finish();
    // finish() may yield null (checked by the original code too); nothing is saved then.
    if (built != null) {
        built.save(data);
    }
    // Total number of values written, i.e. one past the last ordinal.
    meta.writeVLong(nextOrd);
}
Also used : PositiveIntOutputs(org.apache.lucene.util.fst.PositiveIntOutputs) Builder(org.apache.lucene.util.fst.Builder) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) BytesRef(org.apache.lucene.util.BytesRef)

Example 3 with PositiveIntOutputs

use of org.apache.lucene.util.fst.PositiveIntOutputs in project lucene-solr by apache.

the class TokenInfoDictionaryBuilder method buildDictionary.

/**
 * Builds a token-info dictionary, including its FST, from CSV dictionary files.
 *
 * <p>Steps:
 * <ol>
 *   <li>Parse every line of every file with the configured {@code encoding}; malformed or
 *       unmappable input is reported as an error rather than replaced. Lines with fewer than
 *       13 columns are logged and skipped. When {@code normalizeEntries} is set and the
 *       surface form (column 0) is not already normalized, a normalized copy of the entry is
 *       added in addition to the original.</li>
 *   <li>Sort all entries by their first column using a stable sort.</li>
 *   <li>Put each entry into the dictionary, add each distinct term to the FST with a
 *       consecutive ordinal (starting at 0), and map that ordinal to the entry's offset.</li>
 * </ol>
 *
 * <p>Each input file is closed as soon as it has been read, also when reading fails.
 *
 * @param csvFiles CSV files to read
 * @return the populated dictionary writer, with its FST set
 * @throws IOException if a file cannot be read or decoded, or the FST cannot be built
 */
public TokenInfoDictionaryWriter buildDictionary(List<File> csvFiles) throws IOException {
    TokenInfoDictionaryWriter dictionary = new TokenInfoDictionaryWriter(10 * 1024 * 1024);
    // all lines in the file
    System.out.println("  parse...");
    List<String[]> lines = new ArrayList<>(400000);
    for (File file : csvFiles) {
        // A fresh decoder per file; REPORT makes bad bytes fail loudly instead of being replaced.
        Charset cs = Charset.forName(encoding);
        CharsetDecoder decoder = cs.newDecoder().onMalformedInput(CodingErrorAction.REPORT).onUnmappableCharacter(CodingErrorAction.REPORT);
        // try-with-resources: the stream and reader were previously never closed.
        try (FileInputStream inputStream = new FileInputStream(file);
                BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, decoder))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String[] entry = CSVUtil.parse(line);
                if (entry.length < 13) {
                    System.out.println("Entry in CSV is not valid: " + line);
                    continue;
                }
                lines.add(formatEntry(entry));
                // NFKC normalize dictionary entry
                if (normalizeEntries && !normalizer.isNormalized(entry[0])) {
                    String[] normalizedEntry = new String[entry.length];
                    for (int i = 0; i < entry.length; i++) {
                        normalizedEntry[i] = normalizer.normalize(entry[i]);
                    }
                    lines.add(formatEntry(normalizedEntry));
                }
            }
        }
    }
    System.out.println("  sort...");
    // sort by term: we sorted the files already and use a stable sort.
    Collections.sort(lines, new Comparator<String[]>() {

        @Override
        public int compare(String[] left, String[] right) {
            return left[0].compareTo(right[0]);
        }
    });
    System.out.println("  encode...");
    PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
    Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, true, 15);
    IntsRefBuilder scratch = new IntsRefBuilder();
    // first ord will be 0
    long ord = -1;
    String lastValue = null;
    // build tokeninfo dictionary
    for (String[] entry : lines) {
        int next = dictionary.put(entry);
        // An unchanged offset is treated as "nothing was written" for this entry.
        if (next == offset) {
            System.out.println("Failed to process line: " + Arrays.toString(entry));
            continue;
        }
        String token = entry[0];
        if (!token.equals(lastValue)) {
            // new word to add to fst
            ord++;
            lastValue = token;
            // FST input is the token's UTF-16 code units, one int per char.
            scratch.grow(token.length());
            scratch.setLength(token.length());
            for (int i = 0; i < token.length(); i++) {
                scratch.setIntAt(i, (int) token.charAt(i));
            }
            fstBuilder.add(scratch.get(), ord);
        }
        // Entries sharing a term share its ordinal; each one maps to its own offset.
        dictionary.addMapping((int) ord, offset);
        offset = next;
    }
    final FST<Long> fst = fstBuilder.finish();
    System.out.print("  " + fstBuilder.getNodeCount() + " nodes, " + fstBuilder.getArcCount() + " arcs, " + fst.ramBytesUsed() + " bytes...  ");
    dictionary.setFST(fst);
    System.out.println(" done");
    return dictionary;
}
Also used : CharsetDecoder(java.nio.charset.CharsetDecoder) InputStreamReader(java.io.InputStreamReader) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) Builder(org.apache.lucene.util.fst.Builder) ArrayList(java.util.ArrayList) Charset(java.nio.charset.Charset) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) FileInputStream(java.io.FileInputStream) PositiveIntOutputs(org.apache.lucene.util.fst.PositiveIntOutputs) BufferedReader(java.io.BufferedReader) File(java.io.File)

Example 4 with PositiveIntOutputs

use of org.apache.lucene.util.fst.PositiveIntOutputs in project lucene-solr by apache.

the class BooleanPerceptronClassifier method updateFST.

/**
 * Rebuilds the {@code fst} field from the given term weights.
 *
 * <p>Terms are added in the map's iteration order. Each weight is converted with
 * {@link Double#longValue()}, so any fractional part is discarded before it is stored.
 *
 * @param weights term-to-weight map to encode
 * @throws IOException propagated from the FST builder
 */
private void updateFST(SortedMap<String, Double> weights) throws IOException {
    Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton());
    // Scratch buffers reused for every term to avoid per-entry allocation.
    BytesRefBuilder termBytes = new BytesRefBuilder();
    IntsRefBuilder termInts = new IntsRefBuilder();
    for (Map.Entry<String, Double> weighted : weights.entrySet()) {
        termBytes.copyChars(weighted.getKey());
        // Truncates toward zero: the FST stores whole numbers only.
        long storedWeight = weighted.getValue().longValue();
        builder.add(Util.toIntsRef(termBytes.get(), termInts), storedWeight);
    }
    fst = builder.finish();
}
Also used : BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) PositiveIntOutputs(org.apache.lucene.util.fst.PositiveIntOutputs) BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) Builder(org.apache.lucene.util.fst.Builder) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) Map(java.util.Map) ConcurrentSkipListMap(java.util.concurrent.ConcurrentSkipListMap) SortedMap(java.util.SortedMap)

Aggregations

IntsRefBuilder (org.apache.lucene.util.IntsRefBuilder)4 Builder (org.apache.lucene.util.fst.Builder)4 PositiveIntOutputs (org.apache.lucene.util.fst.PositiveIntOutputs)4 BytesRef (org.apache.lucene.util.BytesRef)2 BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder)2 BufferedReader (java.io.BufferedReader)1 File (java.io.File)1 FileInputStream (java.io.FileInputStream)1 InputStreamReader (java.io.InputStreamReader)1 Charset (java.nio.charset.Charset)1 CharsetDecoder (java.nio.charset.CharsetDecoder)1 ArrayList (java.util.ArrayList)1 Map (java.util.Map)1 SortedMap (java.util.SortedMap)1 ConcurrentSkipListMap (java.util.concurrent.ConcurrentSkipListMap)1 InputIterator (org.apache.lucene.search.suggest.InputIterator)1 SortedInputIterator (org.apache.lucene.search.suggest.SortedInputIterator)1 CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder)1