Example 1 with Mphf

Use of zemberek.core.hash.Mphf in project zemberek-nlp by ahmetaa.

From the class CompressedCharNgramModel, the compress method:

public static void compress(MapBasedCharNgramLanguageModel model, File output) throws IOException {
    Mphf[] mphfs = new MultiLevelMphf[model.getOrder() + 1];
    DoubleLookup[] lookups = new DoubleLookup[model.getOrder() + 1];
    try (DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(output)))) {
        dos.writeInt(model.getOrder());
        dos.writeUTF(model.getId());
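        // Process each n-gram order: quantize its log probabilities and build a minimal perfect hash over its keys.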
        for (int i = 1; i <= model.getOrder(); i++) {
            Histogram<Double> histogram = new Histogram<>();
            histogram.add(model.gramLogProbs[i].values.values());
            double[] lookup = new double[histogram.size()];
            int j = 0;
            for (Double key : histogram) {
                lookup[j] = key;
                j++;
            }
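            // Quantize the distinct log-probability values into 8-bit bins; keep the dequantizer for decoding.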
            Quantizer quantizer = BinningQuantizer.linearBinning(lookup, 8);
            lookups[i] = quantizer.getDequantizer();
            List<String> keys = Lists.newArrayList(model.gramLogProbs[i].values.keySet());
            int[] fingerprints = new int[keys.size()];
            int[] probabilityIndexes = new int[keys.size()];
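            // Generate a minimal perfect hash for this order's keys; it maps each key to a unique array slot.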
            mphfs[i] = MultiLevelMphf.generate(new StringListKeyProvider(keys));
            for (final String key : keys) {
                final int index = mphfs[i].get(key);
                fingerprints[index] = MultiLevelMphf.hash(key, -1) & FINGER_PRINT_MASK;
                probabilityIndexes[index] = quantizer.getQuantizationIndex(model.gramLogProbs[i].values.get(key));
            }
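            // Persist: dequantization table, key count, a 16-bit fingerprint and an 8-bit probability index per key, and finally the MPHF itself.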
            lookups[i].save(dos);
            dos.writeInt(keys.size());
            for (int k = 0; k < keys.size(); k++) {
                dos.writeShort(fingerprints[k] & 0xffff);
                dos.writeByte(probabilityIndexes[k]);
            }
            mphfs[i].serialize(dos);
        }
    }
}
Also used: Histogram(zemberek.core.collections.Histogram) MultiLevelMphf(zemberek.core.hash.MultiLevelMphf) Mphf(zemberek.core.hash.Mphf) DataOutputStream(java.io.DataOutputStream) FileOutputStream(java.io.FileOutputStream) Quantizer(zemberek.core.quantization.Quantizer) BinningQuantizer(zemberek.core.quantization.BinningQuantizer) DoubleLookup(zemberek.core.quantization.DoubleLookup) BufferedOutputStream(java.io.BufferedOutputStream)
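
A note on the pattern: compress() relies on the MPHF assigning every key a distinct slot in [0, keyCount), and the 16-bit fingerprint stored at that slot is what later rejects unknown keys that happen to hash to the same slot. The following is a minimal sketch of that slot-plus-fingerprint idea on a toy key set; it is not part of the project, and both the mask value and the package of StringListKeyProvider are assumptions.

import java.util.Arrays;
import java.util.List;
import zemberek.core.hash.MultiLevelMphf;
import zemberek.core.hash.StringListKeyProvider; // package assumed from the snippet above

public class MphfFingerprintSketch {

    // Assumed 16-bit mask, mirroring the short-sized fingerprints compress() writes.
    static final int FINGER_PRINT_MASK = 0xffff;

    public static void main(String[] args) {
        List<String> keys = Arrays.asList("abc", "abd", "xyz");
        MultiLevelMphf mphf = MultiLevelMphf.generate(new StringListKeyProvider(keys));
        int[] fingerprints = new int[keys.size()];
        for (String key : keys) {
            // Each key gets a unique slot; store its fingerprint there.
            int slot = mphf.get(key);
            fingerprints[slot] = MultiLevelMphf.hash(key, -1) & FINGER_PRINT_MASK;
        }
        // At query time the slot is recomputed and the stored fingerprint compared,
        // which filters out most keys that were never inserted.
        String query = "abc";
        int slot = mphf.get(query);
        boolean probablyKnown =
            fingerprints[slot] == (MultiLevelMphf.hash(query, -1) & FINGER_PRINT_MASK);
        System.out.println(query + " probably known: " + probablyKnown);
    }
}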

Example 2 with Mphf

Use of zemberek.core.hash.Mphf in project zemberek-nlp by ahmetaa.

From the class LossyIntLookup, the deserialize method:

/**
 * Deserializes a LossyIntLookup structure from a {@link DataInputStream} {@code dis}.
 */
public static LossyIntLookup deserialize(DataInputStream dis) throws IOException {
    long magic = dis.readInt();
    if (magic != MAGIC) {
        throw new IllegalStateException("File does not carry expected value in the beginning.");
    }
    int length = dis.readInt();
    int[] data = new int[length];
    for (int i = 0; i < data.length; i++) {
        data[i] = dis.readInt();
    }
    Mphf mphf = MultiLevelMphf.deserialize(dis);
    return new LossyIntLookup(mphf, data);
}
Also used: MultiLevelMphf(zemberek.core.hash.MultiLevelMphf) Mphf(zemberek.core.hash.Mphf)
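
Because the MPHF is the last field in the stream, the serialize/deserialize pair that appears in these examples can be exercised on its own. Below is a minimal round-trip sketch, not taken from the project; it uses only calls visible in Examples 1 and 2, and the key set and the package of StringListKeyProvider are assumptions.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import zemberek.core.hash.Mphf;
import zemberek.core.hash.MultiLevelMphf;
import zemberek.core.hash.StringListKeyProvider; // package assumed

public class MphfRoundTripSketch {

    public static void main(String[] args) throws IOException {
        List<String> keys = Arrays.asList("foo", "bar", "baz");
        MultiLevelMphf original = MultiLevelMphf.generate(new StringListKeyProvider(keys));

        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        try (DataOutputStream dos = new DataOutputStream(bos)) {
            // Same call as in Example 1 (mphfs[i].serialize(dos)).
            original.serialize(dos);
        }
        Mphf restored;
        try (DataInputStream dis = new DataInputStream(new ByteArrayInputStream(bos.toByteArray()))) {
            // Same call as in Example 2 (MultiLevelMphf.deserialize(dis)).
            restored = MultiLevelMphf.deserialize(dis);
        }
        for (String key : keys) {
            // The restored function must map each key to the same slot as the original.
            System.out.println(key + " -> " + original.get(key) + " / " + restored.get(key));
        }
    }
}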

Example 3 with Mphf

Use of zemberek.core.hash.Mphf in project zemberek-nlp by ahmetaa.

From the class UncompressedToSmoothLmConverter, the convert method:

private void convert(File binaryUncompressedLmDir, NgramDataBlock block, SmoothLm.MphfType type, File[] oneBasedMphfFiles, int chunkBits) throws IOException {
    Log.info("Generating compressed language model.");
    MultiFileUncompressedLm lm = new MultiFileUncompressedLm(binaryUncompressedLmDir);
    lm.generateRankFiles(block.probabilitySize * 8, QuantizerType.BINNING);
    order = lm.order;
    DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(lmFile)));
    // generate Minimal Perfect Hash functions for 2,3...n grams and save them as separate files.
    File[] phfFiles = new File[order + 1];
    if (oneBasedMphfFiles != null) {
        phfFiles = oneBasedMphfFiles;
    } else {
        for (int i = 2; i <= order; i++) {
            Mphf mphf;
            if (type == SmoothLm.MphfType.LARGE) {
                mphf = LargeNgramMphf.generate(lm.getGramFile(i), chunkBits);
            } else {
                mphf = MultiLevelMphf.generate(lm.getGramFile(i));
            }
            Log.info("MPHF is generated for order %d with %d keys. Average bits per key: %.3f", i, mphf.size(), mphf.averageBitsPerKey());
            File mphfFile = new File(tempDir, lmFile.getName() + i + "gram.mphf");
            phfFiles[i] = mphfFile;
            mphf.serialize(mphfFile);
        }
    }
    // generate header.
    Log.info("Writing header");
    // write version and type info
    dos.writeInt(VERSION);
    // write Mphf type
    if (type == SmoothLm.MphfType.SMALL) {
        dos.writeInt(0);
    } else {
        dos.writeInt(1);
    }
    // write log-base
    dos.writeDouble(10d);
    // write n value for grams (3 for trigram model)
    dos.writeInt(order);
    for (int i = 1; i <= order; i++) {
        dos.writeInt(lm.getCount(i));
    }
    // write rank lookup data (contains size+doubles)
    for (int i = 1; i <= order; i++) {
        Files.copy(lm.getProbabilityLookupFile(i), dos);
    }
    for (int i = 1; i <= order; i++) {
        if (i < order) {
            Files.copy(lm.getBackoffLookupFile(i), dos);
        }
    }
    Log.info("Reordering probability data and saving it together with n-gram fingerprints");
    for (int i = 1; i <= order; i++) {
        InMemoryBigByteArray probData = new InMemoryBigByteArray(lm.getProbRankFile(i));
        InMemoryBigByteArray backoffData = null;
        if (i < order) {
            backoffData = new InMemoryBigByteArray(lm.getBackoffRankFile(i));
        }
        ReorderData reorderData;
        final int gramCount = probData.count;
        if (i == 1) {
            int[] reorderedIndexes = new int[gramCount];
            for (int j = 0; j < gramCount; j++) {
                reorderedIndexes[j] = j;
            }
            reorderData = new ReorderData(reorderedIndexes, new int[0]);
        } else {
            if (type == SmoothLm.MphfType.LARGE) {
                reorderData = reorderIndexes(block, lm, i, LargeNgramMphf.deserialize(phfFiles[i]));
            } else {
                reorderData = reorderIndexes(block, lm, i, MultiLevelMphf.deserialize(phfFiles[i]));
            }
        }
        Log.info("Validating reordered index array for order: %d", i);
        validateIndexArray(reorderData.reorderedKeyIndexes);
        int fingerPrintSize = block.fingerPrintSize;
        if (i == 1) {
            fingerPrintSize = 0;
        }
        int backOffSize = block.backoffSize;
        if (i == order) {
            backOffSize = 0;
        }
        dos.writeInt(gramCount);
        dos.writeInt(fingerPrintSize);
        dos.writeInt(block.probabilitySize);
        dos.writeInt(backOffSize);
        byte[] probBuff = new byte[block.probabilitySize];
        byte[] fpBuff = new byte[fingerPrintSize];
        byte[] backoffBuff = new byte[backOffSize];
        for (int k = 0; k < gramCount; k++) {
            // save fingerprint values for 2,3,.. grams.
            if (i > 1) {
                block.fingerprintAsBytes(reorderData.fingerprints[k], fpBuff);
                dos.write(fpBuff);
            }
            probData.get(reorderData.reorderedKeyIndexes[k], probBuff);
            dos.write(probBuff);
            // write backoff value if exists.
            if (backoffData != null) {
                backoffData.get(reorderData.reorderedKeyIndexes[k], backoffBuff);
                dos.write(backoffBuff);
            }
        }
    }
    // append size of the Perfect hash and its content.
    if (phfFiles.length > 0) {
        Log.info("Saving MPHF values.");
    }
    for (int i = 2; i <= order; i++) {
        Files.copy(phfFiles[i], dos);
    }
    // save vocabulary
    Log.info("Saving vocabulary.");
    Files.copy(lm.getVocabularyFile(), dos);
    dos.close();
}
Also used: LargeNgramMphf(zemberek.core.hash.LargeNgramMphf) MultiLevelMphf(zemberek.core.hash.MultiLevelMphf) Mphf(zemberek.core.hash.Mphf) DataOutputStream(java.io.DataOutputStream) FileOutputStream(java.io.FileOutputStream) BufferedOutputStream(java.io.BufferedOutputStream) File(java.io.File)
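
The header written at the top of convert() can be read back field by field in the same order it was written. The sketch below does only that; it is not part of the project, the input file name is hypothetical, and only fields whose write calls appear above are read.

import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.IOException;

public class SmoothLmHeaderSketch {

    public static void main(String[] args) throws IOException {
        try (DataInputStream dis = new DataInputStream(
                new BufferedInputStream(new FileInputStream("lm.smooth")))) { // hypothetical path
            int version = dis.readInt();       // matches dos.writeInt(VERSION)
            int typeFlag = dis.readInt();      // 0 = SMALL MPHF, 1 = LARGE MPHF
            double logBase = dis.readDouble(); // written as 10d
            int order = dis.readInt();         // n of the n-gram model
            int[] counts = new int[order + 1];
            for (int i = 1; i <= order; i++) {
                counts[i] = dis.readInt();     // gram count for each order
            }
            System.out.printf("version=%d type=%d logBase=%.1f order=%d%n",
                version, typeFlag, logBase, order);
        }
    }
}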

Aggregations

Mphf (zemberek.core.hash.Mphf): 3
MultiLevelMphf (zemberek.core.hash.MultiLevelMphf): 3
BufferedOutputStream (java.io.BufferedOutputStream): 2
DataOutputStream (java.io.DataOutputStream): 2
FileOutputStream (java.io.FileOutputStream): 2
File (java.io.File): 1
Histogram (zemberek.core.collections.Histogram): 1
LargeNgramMphf (zemberek.core.hash.LargeNgramMphf): 1
BinningQuantizer (zemberek.core.quantization.BinningQuantizer): 1
DoubleLookup (zemberek.core.quantization.DoubleLookup): 1
Quantizer (zemberek.core.quantization.Quantizer): 1