Example 6 with Builder

Use of org.apache.lucene.util.fst.Builder in the Apache lucene-solr project.

From the class FreeTextSuggester, method build():

/** Build the suggest index, using up to the specified
   *  amount of temporary RAM while building.  Note that
   *  the weights for the suggestions are ignored. */
public void build(InputIterator iterator, double ramBufferSizeMB) throws IOException {
    if (iterator.hasPayloads()) {
        throw new IllegalArgumentException("this suggester doesn't support payloads");
    }
    if (iterator.hasContexts()) {
        throw new IllegalArgumentException("this suggester doesn't support contexts");
    }
    String prefix = getClass().getSimpleName();
    Path tempIndexPath = Files.createTempDirectory(prefix + ".index.");
    Directory dir = FSDirectory.open(tempIndexPath);
    IndexWriterConfig iwc = new IndexWriterConfig(indexAnalyzer);
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    iwc.setRAMBufferSizeMB(ramBufferSizeMB);
    IndexWriter writer = new IndexWriter(dir, iwc);
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    // TODO: if only we had IndexOptions.TERMS_ONLY...
    ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
    ft.setOmitNorms(true);
    ft.freeze();
    Document doc = new Document();
    Field field = new Field("body", "", ft);
    doc.add(field);
    totTokens = 0;
    IndexReader reader = null;
    boolean success = false;
    count = 0;
    try {
        while (true) {
            BytesRef surfaceForm = iterator.next();
            if (surfaceForm == null) {
                break;
            }
            field.setStringValue(surfaceForm.utf8ToString());
            writer.addDocument(doc);
            count++;
        }
        reader = DirectoryReader.open(writer);
        Terms terms = MultiFields.getTerms(reader, "body");
        if (terms == null) {
            throw new IllegalArgumentException("need at least one suggestion");
        }
        // Move all ngrams into an FST:
        TermsEnum termsEnum = terms.iterator();
        Outputs<Long> outputs = PositiveIntOutputs.getSingleton();
        Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
        IntsRefBuilder scratchInts = new IntsRefBuilder();
        while (true) {
            BytesRef term = termsEnum.next();
            if (term == null) {
                break;
            }
            int ngramCount = countGrams(term);
            if (ngramCount > grams) {
                throw new IllegalArgumentException("tokens must not contain separator byte; got token=" + term + " but gramCount=" + ngramCount + ", which is greater than expected max ngram size=" + grams);
            }
            if (ngramCount == 1) {
                totTokens += termsEnum.totalTermFreq();
            }
            builder.add(Util.toIntsRef(term, scratchInts), encodeWeight(termsEnum.totalTermFreq()));
        }
        fst = builder.finish();
        if (fst == null) {
            throw new IllegalArgumentException("need at least one suggestion");
        }
        //System.out.println("FST: " + fst.getNodeCount() + " nodes");
        /*
      PrintWriter pw = new PrintWriter("/x/tmp/out.dot");
      Util.toDot(fst, pw, true, true);
      pw.close();
      */
        // Writer was only temporary, to count up bigrams,
        // which we transferred to the FST, so now we
        // rollback:
        writer.rollback();
        success = true;
    } finally {
        try {
            if (success) {
                IOUtils.close(reader, dir);
            } else {
                IOUtils.closeWhileHandlingException(reader, writer, dir);
            }
        } finally {
            IOUtils.rm(tempIndexPath);
        }
    }
}
Also used : Path(java.nio.file.Path) Builder(org.apache.lucene.util.fst.Builder) BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) Terms(org.apache.lucene.index.Terms) Document(org.apache.lucene.document.Document) FieldType(org.apache.lucene.document.FieldType) TermsEnum(org.apache.lucene.index.TermsEnum) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) IndexWriter(org.apache.lucene.index.IndexWriter) IndexReader(org.apache.lucene.index.IndexReader) BytesRef(org.apache.lucene.util.BytesRef) Directory(org.apache.lucene.store.Directory) FSDirectory(org.apache.lucene.store.FSDirectory) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)
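The core pattern above (feed sorted keys into a Builder<Long> over PositiveIntOutputs, then call finish()) can be tried in isolation. The following is a minimal, self-contained sketch, not part of FreeTextSuggester; the class name and sample terms are invented, and it assumes a Lucene version in which org.apache.lucene.util.fst.Builder still exists. It builds a tiny weight FST and reads one weight back with Util.get:

import java.io.IOException;
import java.util.Map;
import java.util.TreeMap;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;

public class WeightFstSketch {

    public static void main(String[] args) throws IOException {
        // TreeMap iterates keys in sorted order; Builder.add requires sorted input.
        Map<String, Long> weights = new TreeMap<>();
        weights.put("foo", 10L);
        weights.put("foobar", 3L);
        PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
        Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
        IntsRefBuilder scratch = new IntsRefBuilder();
        for (Map.Entry<String, Long> e : weights.entrySet()) {
            builder.add(Util.toIntsRef(new BytesRef(e.getKey()), scratch), e.getValue());
        }
        FST<Long> fst = builder.finish();
        // Util.get returns the stored output for an exact key, or null if absent.
        // prints 10
        System.out.println(Util.get(fst, new BytesRef("foo")));
    }
}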

Example 7 with Builder

Use of org.apache.lucene.util.fst.Builder in the Apache lucene-solr project.

From the class TokenInfoDictionaryBuilder, method buildDictionary():

public TokenInfoDictionaryWriter buildDictionary(List<File> csvFiles) throws IOException {
    TokenInfoDictionaryWriter dictionary = new TokenInfoDictionaryWriter(10 * 1024 * 1024);
    // all lines in the file
    System.out.println("  parse...");
    List<String[]> lines = new ArrayList<>(400000);
    for (File file : csvFiles) {
        FileInputStream inputStream = new FileInputStream(file);
        Charset cs = Charset.forName(encoding);
        CharsetDecoder decoder = cs.newDecoder().onMalformedInput(CodingErrorAction.REPORT).onUnmappableCharacter(CodingErrorAction.REPORT);
        InputStreamReader streamReader = new InputStreamReader(inputStream, decoder);
        BufferedReader reader = new BufferedReader(streamReader);
        String line = null;
        while ((line = reader.readLine()) != null) {
            String[] entry = CSVUtil.parse(line);
            if (entry.length < 13) {
                System.out.println("Entry in CSV is not valid: " + line);
                continue;
            }
            String[] formatted = formatEntry(entry);
            lines.add(formatted);
            // NFKC normalize dictionary entry
            if (normalizeEntries) {
                if (normalizer.isNormalized(entry[0])) {
                    continue;
                }
                String[] normalizedEntry = new String[entry.length];
                for (int i = 0; i < entry.length; i++) {
                    normalizedEntry[i] = normalizer.normalize(entry[i]);
                }
                formatted = formatEntry(normalizedEntry);
                lines.add(formatted);
            }
        }
        // Close the per-file reader; this also closes the underlying stream.
        reader.close();
    }
    System.out.println("  sort...");
    // sort by term: we sorted the files already and use a stable sort.
    Collections.sort(lines, new Comparator<String[]>() {

        public int compare(String[] left, String[] right) {
            return left[0].compareTo(right[0]);
        }
    });
    System.out.println("  encode...");
    PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
    Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, true, 15);
    IntsRefBuilder scratch = new IntsRefBuilder();
    // first ord will be 0
    long ord = -1;
    String lastValue = null;
    // build tokeninfo dictionary
    for (String[] entry : lines) {
        int next = dictionary.put(entry);
        if (next == offset) {
            System.out.println("Failed to process line: " + Arrays.toString(entry));
            continue;
        }
        String token = entry[0];
        if (!token.equals(lastValue)) {
            // new word to add to fst
            ord++;
            lastValue = token;
            scratch.grow(token.length());
            scratch.setLength(token.length());
            for (int i = 0; i < token.length(); i++) {
                scratch.setIntAt(i, (int) token.charAt(i));
            }
            fstBuilder.add(scratch.get(), ord);
        }
        dictionary.addMapping((int) ord, offset);
        offset = next;
    }
    final FST<Long> fst = fstBuilder.finish();
    System.out.print("  " + fstBuilder.getNodeCount() + " nodes, " + fstBuilder.getArcCount() + " arcs, " + fst.ramBytesUsed() + " bytes...  ");
    dictionary.setFST(fst);
    System.out.println(" done");
    return dictionary;
}
Also used : CharsetDecoder(java.nio.charset.CharsetDecoder) InputStreamReader(java.io.InputStreamReader) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) Builder(org.apache.lucene.util.fst.Builder) ArrayList(java.util.ArrayList) Charset(java.nio.charset.Charset) FileInputStream(java.io.FileInputStream) PositiveIntOutputs(org.apache.lucene.util.fst.PositiveIntOutputs) BufferedReader(java.io.BufferedReader) File(java.io.File)
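The grow/setLength/setIntAt sequence in the loop above is how a Java String becomes a BYTE2 FST key: one int per UTF-16 code unit. A hedged helper sketch of that encoding step follows; the method name toUTF16Key is invented, and Lucene's own Util.toUTF16(CharSequence, IntsRefBuilder) performs the same conversion:

import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;

// Invented helper: encode a token as a BYTE2 key, reusing a scratch builder.
static IntsRef toUTF16Key(String token, IntsRefBuilder scratch) {
    scratch.clear();
    scratch.grow(token.length());
    scratch.setLength(token.length());
    for (int i = 0; i < token.length(); i++) {
        // each char widens to an int in the 0..0xFFFF range
        scratch.setIntAt(i, token.charAt(i));
    }
    return scratch.get();
}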

Example 8 with Builder

Use of org.apache.lucene.util.fst.Builder in the Apache lucene-solr project.

From the class Dictionary, method affixFST():

private FST<IntsRef> affixFST(TreeMap<String, List<Integer>> affixes) throws IOException {
    IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton();
    Builder<IntsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, outputs);
    IntsRefBuilder scratch = new IntsRefBuilder();
    for (Map.Entry<String, List<Integer>> entry : affixes.entrySet()) {
        Util.toUTF32(entry.getKey(), scratch);
        List<Integer> entries = entry.getValue();
        IntsRef output = new IntsRef(entries.size());
        for (Integer c : entries) {
            output.ints[output.length++] = c;
        }
        builder.add(scratch.get(), output);
    }
    return builder.finish();
}
Also used : Builder(org.apache.lucene.util.fst.Builder) BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) IntSequenceOutputs(org.apache.lucene.util.fst.IntSequenceOutputs) List(java.util.List) ArrayList(java.util.ArrayList) IntsRef(org.apache.lucene.util.IntsRef) Map(java.util.Map) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) TreeMap(java.util.TreeMap)
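Since affixFST encodes its keys as UTF-32 code points (BYTE4), a lookup probe must be encoded the same way. A small usage sketch, assuming a finished FST<IntsRef> named affixFst and the affix "pre" (both hypothetical names):

IntsRefBuilder probe = new IntsRefBuilder();
Util.toUTF32("pre", probe); // encode the probe exactly like the stored keys
IntsRef flagIds = Util.get(affixFst, probe.get()); // null if "pre" is absent
if (flagIds != null) {
    for (int i = 0; i < flagIds.length; i++) {
        System.out.println("affix entry: " + flagIds.ints[flagIds.offset + i]);
    }
}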

Example 9 with Builder

Use of org.apache.lucene.util.fst.Builder in the Apache lucene-solr project.

From the class Dictionary, method parseConversions():

private FST<CharsRef> parseConversions(LineNumberReader reader, int num) throws IOException, ParseException {
    Map<String, String> mappings = new TreeMap<>();
    for (int i = 0; i < num; i++) {
        String line = reader.readLine();
        String[] parts = line.split("\\s+");
        if (parts.length != 3) {
            throw new ParseException("invalid syntax: " + line, reader.getLineNumber());
        }
        if (mappings.put(parts[1], parts[2]) != null) {
            throw new IllegalStateException("duplicate mapping specified for: " + parts[1]);
        }
    }
    Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
    Builder<CharsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs);
    IntsRefBuilder scratchInts = new IntsRefBuilder();
    for (Map.Entry<String, String> entry : mappings.entrySet()) {
        Util.toUTF16(entry.getKey(), scratchInts);
        builder.add(scratchInts.get(), new CharsRef(entry.getValue()));
    }
    return builder.finish();
}
Also used : Builder(org.apache.lucene.util.fst.Builder) BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) TreeMap(java.util.TreeMap) CharsRef(org.apache.lucene.util.CharsRef) ParseException(java.text.ParseException) Map(java.util.Map) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap)
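The conversion FST maps UTF-16 keys to CharsRef replacements, so an exact-match lookup mirrors the build-time encoding. A hedged sketch, where conversionFst and the probe string "input" are invented names:

IntsRefBuilder probe = new IntsRefBuilder();
Util.toUTF16("input", probe); // same encoding used when the keys were added
CharsRef replacement = Util.get(conversionFst, probe.get());
if (replacement != null) {
    System.out.println("input -> " + replacement);
}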

Example 10 with Builder

Use of org.apache.lucene.util.fst.Builder in the Apache lucene-solr project.

From the class BooleanPerceptronClassifier, method updateFST():

private void updateFST(SortedMap<String, Double> weights) throws IOException {
    PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
    Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
    BytesRefBuilder scratchBytes = new BytesRefBuilder();
    IntsRefBuilder scratchInts = new IntsRefBuilder();
    for (Map.Entry<String, Double> entry : weights.entrySet()) {
        scratchBytes.copyChars(entry.getKey());
        fstBuilder.add(Util.toIntsRef(scratchBytes.get(), scratchInts), entry.getValue().longValue());
    }
    fst = fstBuilder.finish();
}
Also used : BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) PositiveIntOutputs(org.apache.lucene.util.fst.PositiveIntOutputs) Builder(org.apache.lucene.util.fst.Builder) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) Map(java.util.Map) ConcurrentSkipListMap(java.util.concurrent.ConcurrentSkipListMap) SortedMap(java.util.SortedMap)
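Note that entry.getValue().longValue() simply truncates each Double weight, so fractional precision is lost in the FST. A purely illustrative variant (not what the class actually does) that preserves two decimal digits by scaling before the add, since PositiveIntOutputs only accepts non-negative longs:

// Illustrative only: scale to keep two decimal digits of the weight.
long scaled = Math.round(entry.getValue() * 100d);
if (scaled < 0) {
    throw new IllegalArgumentException("PositiveIntOutputs requires outputs >= 0");
}
fstBuilder.add(Util.toIntsRef(scratchBytes.get(), scratchInts), scaled);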

Aggregations

IntsRefBuilder (org.apache.lucene.util.IntsRefBuilder): 10
Builder (org.apache.lucene.util.fst.Builder): 10
BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder): 7
BytesRef (org.apache.lucene.util.BytesRef): 5
CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder): 4
PositiveIntOutputs (org.apache.lucene.util.fst.PositiveIntOutputs): 4
Map (java.util.Map): 3
IntsRef (org.apache.lucene.util.IntsRef): 3
ArrayList (java.util.ArrayList): 2
HashMap (java.util.HashMap): 2
HashSet (java.util.HashSet): 2
LinkedHashMap (java.util.LinkedHashMap): 2
TreeMap (java.util.TreeMap): 2
TokenStreamToAutomaton (org.apache.lucene.analysis.TokenStreamToAutomaton): 2
ByteArrayDataInput (org.apache.lucene.store.ByteArrayDataInput): 2
ByteArrayDataOutput (org.apache.lucene.store.ByteArrayDataOutput): 2
Directory (org.apache.lucene.store.Directory): 2
FSDirectory (org.apache.lucene.store.FSDirectory): 2
IndexOutput (org.apache.lucene.store.IndexOutput): 2
CharsRef (org.apache.lucene.util.CharsRef): 2