Use of zemberek.core.hash.Mphf in project zemberek-nlp by ahmetaa.
Class CompressedCharNgramModel, method compress:
public static void compress(MapBasedCharNgramLanguageModel model, File output) throws IOException {
  Mphf[] mphfs = new MultiLevelMphf[model.getOrder() + 1];
  DoubleLookup[] lookups = new DoubleLookup[model.getOrder() + 1];
  try (DataOutputStream dos = new DataOutputStream(
      new BufferedOutputStream(new FileOutputStream(output)))) {
    dos.writeInt(model.getOrder());
    dos.writeUTF(model.getId());
    for (int i = 1; i <= model.getOrder(); i++) {
      // collect the distinct log-probability values of order-i grams.
      Histogram<Double> histogram = new Histogram<>();
      histogram.add(model.gramLogProbs[i].values.values());
      double[] lookup = new double[histogram.size()];
      int j = 0;
      for (Double key : histogram) {
        lookup[j] = key;
        j++;
      }
      // quantize the values to 8 bits; keep the dequantizer for later lookups.
      Quantizer quantizer = BinningQuantizer.linearBinning(lookup, 8);
      lookups[i] = quantizer.getDequantizer();
      List<String> keys = Lists.newArrayList(model.gramLogProbs[i].values.keySet());
      int[] fingerprints = new int[keys.size()];
      int[] probabilityIndexes = new int[keys.size()];
      // build a minimal perfect hash function over the order-i gram keys.
      mphfs[i] = MultiLevelMphf.generate(new StringListKeyProvider(keys));
      for (final String key : keys) {
        final int index = mphfs[i].get(key);
        fingerprints[index] = MultiLevelMphf.hash(key, -1) & FINGER_PRINT_MASK;
        probabilityIndexes[index] =
            quantizer.getQuantizationIndex(model.gramLogProbs[i].values.get(key));
      }
      lookups[i].save(dos);
      dos.writeInt(keys.size());
      for (int k = 0; k < keys.size(); k++) {
        dos.writeShort(fingerprints[k] & 0xffff);
        dos.writeByte(probabilityIndexes[k]);
      }
      mphfs[i].serialize(dos);
    }
  }
}
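The fingerprint write above (MultiLevelMphf.hash(key, -1) & FINGER_PRINT_MASK) is what makes the structure safe to query with arbitrary strings: an MPHF maps any input to some slot, so each slot stores a 16-bit fingerprint that lookups compare against to reject keys that were never in the original set. Below is a minimal self-contained sketch of that check, with String.hashCode() standing in for zemberek's hash function purely for illustration; it is not the project's code.

// Illustrative only: a 16-bit fingerprint check over MPHF slots.
// String.hashCode() stands in for MultiLevelMphf.hash(key, -1).
class FingerprintSketch {

  static final int FINGER_PRINT_MASK = 0xffff; // same width the snippet writes with writeShort

  final short[] fingerprints;
  final byte[] probabilityIndexes;

  FingerprintSketch(int slotCount) {
    fingerprints = new short[slotCount];
    probabilityIndexes = new byte[slotCount];
  }

  // slot would come from mphf.get(key) in the real structure.
  void put(int slot, String key, int quantizedIndex) {
    fingerprints[slot] = (short) (key.hashCode() & FINGER_PRINT_MASK);
    probabilityIndexes[slot] = (byte) quantizedIndex;
  }

  // Returns the quantized probability index, or -1 when the fingerprint
  // rejects the key (it was almost certainly not in the original key set).
  int get(int slot, String key) {
    if ((fingerprints[slot] & 0xffff) != (key.hashCode() & FINGER_PRINT_MASK)) {
      return -1;
    }
    return probabilityIndexes[slot] & 0xff;
  }
}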
Use of zemberek.core.hash.Mphf in project zemberek-nlp by ahmetaa.
Class LossyIntLookup, method deserialize:
/**
 * Deserializes a LossyIntLookup structure from a {@link DataInputStream} {@code dis}.
 */
public static LossyIntLookup deserialize(DataInputStream dis) throws IOException {
  long magic = dis.readInt();
  if (magic != MAGIC) {
    throw new IllegalStateException("File does not carry expected value in the beginning.");
  }
  int length = dis.readInt();
  int[] data = new int[length];
  for (int i = 0; i < data.length; i++) {
    data[i] = dis.readInt();
  }
  Mphf mphf = MultiLevelMphf.deserialize(dis);
  return new LossyIntLookup(mphf, data);
}
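The matching write side is not shown on this page, but the stream layout follows directly from the reader: magic value, element count, the int array, then the serialized Mphf. A hedged reconstruction of such a serializer as a method of LossyIntLookup follows; the mphf and data field names and the MAGIC constant are taken from the snippet above, and this is a sketch, not necessarily the project's actual method.

// Sketch of a writer mirroring deserialize(); assumes the class's
// `mphf` and `data` fields and an int-valued MAGIC constant.
public void serialize(DataOutputStream dos) throws IOException {
  dos.writeInt(MAGIC);        // checked first by deserialize()
  dos.writeInt(data.length);  // element count
  for (int d : data) {
    dos.writeInt(d);
  }
  mphf.serialize(dos);        // Mphf.serialize(DataOutputStream), as used in compress() above
}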
Use of zemberek.core.hash.Mphf in project zemberek-nlp by ahmetaa.
Class UncompressedToSmoothLmConverter, method convert:
private void convert(File binaryUncompressedLmDir, NgramDataBlock block,
    SmoothLm.MphfType type, File[] oneBasedMphfFiles, int chunkBits) throws IOException {
  Log.info("Generating compressed language model.");
  MultiFileUncompressedLm lm = new MultiFileUncompressedLm(binaryUncompressedLmDir);
  lm.generateRankFiles(block.probabilitySize * 8, QuantizerType.BINNING);
  order = lm.order;
  DataOutputStream dos = new DataOutputStream(
      new BufferedOutputStream(new FileOutputStream(lmFile)));
  // generate minimal perfect hash functions for 2,3...n grams and save them as separate files.
  File[] phfFiles = new File[order + 1];
  if (oneBasedMphfFiles != null) {
    phfFiles = oneBasedMphfFiles;
  } else {
    for (int i = 2; i <= order; i++) {
      Mphf mphf;
      if (type == SmoothLm.MphfType.LARGE) {
        mphf = LargeNgramMphf.generate(lm.getGramFile(i), chunkBits);
      } else {
        mphf = MultiLevelMphf.generate(lm.getGramFile(i));
      }
      Log.info("MPHF is generated for order %d with %d keys. Average bits per key: %.3f",
          i, mphf.size(), mphf.averageBitsPerKey());
      File mphfFile = new File(tempDir, lmFile.getName() + i + "gram.mphf");
      phfFiles[i] = mphfFile;
      mphf.serialize(mphfFile);
    }
  }
  // generate header.
  Log.info("Writing header");
  // write version and type info
  dos.writeInt(VERSION);
  // write Mphf type
  if (type == SmoothLm.MphfType.SMALL) {
    dos.writeInt(0);
  } else {
    dos.writeInt(1);
  }
  // write log-base
  dos.writeDouble(10d);
  // write n value for grams (3 for a trigram model)
  dos.writeInt(order);
  for (int i = 1; i <= order; i++) {
    dos.writeInt(lm.getCount(i));
  }
  // write rank lookup data (contains size + doubles)
  for (int i = 1; i <= order; i++) {
    Files.copy(lm.getProbabilityLookupFile(i), dos);
  }
  for (int i = 1; i <= order; i++) {
    if (i < order) {
      Files.copy(lm.getBackoffLookupFile(i), dos);
    }
  }
  Log.info("Reordering probability data and saving it together with n-gram fingerprints");
  for (int i = 1; i <= order; i++) {
    InMemoryBigByteArray probData = new InMemoryBigByteArray(lm.getProbRankFile(i));
    InMemoryBigByteArray backoffData = null;
    if (i < order) {
      backoffData = new InMemoryBigByteArray(lm.getBackoffRankFile(i));
    }
    ReorderData reorderData;
    final int gramCount = probData.count;
    if (i == 1) {
      // unigrams are not hashed; keys keep their original order.
      int[] reorderedIndexes = new int[gramCount];
      for (int j = 0; j < gramCount; j++) {
        reorderedIndexes[j] = j;
      }
      reorderData = new ReorderData(reorderedIndexes, new int[0]);
    } else {
      if (type == SmoothLm.MphfType.LARGE) {
        reorderData = reorderIndexes(block, lm, i, LargeNgramMphf.deserialize(phfFiles[i]));
      } else {
        reorderData = reorderIndexes(block, lm, i, MultiLevelMphf.deserialize(phfFiles[i]));
      }
    }
    Log.info("Validating reordered index array for order: %d", i);
    validateIndexArray(reorderData.reorderedKeyIndexes);
    int fingerPrintSize = block.fingerPrintSize;
    if (i == 1) {
      fingerPrintSize = 0;
    }
    int backOffSize = block.backoffSize;
    if (i == order) {
      backOffSize = 0;
    }
    dos.writeInt(gramCount);
    dos.writeInt(fingerPrintSize);
    dos.writeInt(block.probabilitySize);
    dos.writeInt(backOffSize);
    byte[] probBuff = new byte[block.probabilitySize];
    byte[] fpBuff = new byte[fingerPrintSize];
    byte[] backoffBuff = new byte[backOffSize];
    for (int k = 0; k < gramCount; k++) {
      // save fingerprint values for 2,3,... grams.
      if (i > 1) {
        block.fingerprintAsBytes(reorderData.fingerprints[k], fpBuff);
        dos.write(fpBuff);
      }
      probData.get(reorderData.reorderedKeyIndexes[k], probBuff);
      dos.write(probBuff);
      // write the backoff value if it exists.
      if (backoffData != null) {
        backoffData.get(reorderData.reorderedKeyIndexes[k], backoffBuff);
        dos.write(backoffBuff);
      }
    }
  }
  // append the perfect hash files (size and content).
  if (phfFiles.length > 0) {
    Log.info("Saving MPHF values.");
  }
  for (int i = 2; i <= order; i++) {
    Files.copy(phfFiles[i], dos);
  }
  // save vocabulary
  Log.info("Saving vocabulary.");
  Files.copy(lm.getVocabularyFile(), dos);
  dos.close();
}
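Taken together, the three snippets cover the whole Mphf life cycle: generate over a key set, get to map a key to its slot, serialize to persist, and deserialize to restore. A condensed round trip using only calls that appear above follows; the import paths and sample keys are assumptions for illustration.

import java.util.List;
import com.google.common.collect.Lists;
import zemberek.core.hash.Mphf;
import zemberek.core.hash.MultiLevelMphf;
import zemberek.core.hash.StringListKeyProvider;

public class MphfRoundTrip {
  public static void main(String[] args) throws Exception {
    List<String> keys = Lists.newArrayList("merhaba", "dünya", "zemberek");
    // generate: builds a minimal perfect hash function over the key set.
    Mphf mphf = MultiLevelMphf.generate(new StringListKeyProvider(keys));
    // get: each original key maps to a distinct slot in [0, keys.size()).
    for (String key : keys) {
      System.out.println(key + " -> " + mphf.get(key));
    }
    System.out.printf("keys=%d, bits/key=%.3f%n", mphf.size(), mphf.averageBitsPerKey());
  }
}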