Use of zemberek.core.quantization.DoubleLookup in project zemberek-nlp by ahmetaa.
Class CompressedCharNgramModel, method load.
/**
 * Reads a {@link CompressedCharNgramModel} from the given stream.
 *
 * <p>The data is consumed in this sequence: the model order (int), the model id (UTF string),
 * and then, for every order from 1 up to the model order, a {@link DoubleLookup}, a
 * {@link ProbData} and a {@link MultiLevelMphf}. Index 0 of each per-order array is left
 * unset.
 *
 * <p>The stream is wrapped in a buffered {@link DataInputStream} that is closed when this
 * method returns or throws, which also closes {@code is}.
 *
 * @param is stream positioned at the start of the serialized model
 * @return the model assembled from the stream content
 * @throws IOException if reading from the stream fails
 */
public static CompressedCharNgramModel load(InputStream is) throws IOException {
    try (DataInputStream in = new DataInputStream(new BufferedInputStream(is))) {
        final int order = in.readInt();
        final String modelId = in.readUTF();

        // One slot per order plus the unused slot 0, so arrays can be indexed by order directly.
        final int slots = order + 1;
        DoubleLookup[] lookups = new DoubleLookup[slots];
        ProbData[] probDatas = new ProbData[slots];
        MultiLevelMphf[] mphfs = new MultiLevelMphf[slots];

        int level = 1;
        while (level <= order) {
            // The read sequence below must match the sequence in which the sections were written.
            lookups[level] = DoubleLookup.getLookup(in);
            probDatas[level] = new ProbData(in);
            mphfs[level] = MultiLevelMphf.deserialize(in);
            level++;
        }
        return new CompressedCharNgramModel(order, modelId, mphfs, probDatas, lookups);
    }
}
Use of zemberek.core.quantization.DoubleLookup in project zemberek-nlp by ahmetaa.
Class CompressedCharNgramModel, method compress.
/**
 * Writes a compressed form of the given map based character n-gram model to {@code output}.
 *
 * <p>Output layout: the model order (int), the model id (UTF string), and then for every
 * order from 1 up to the model order:
 * <ol>
 *   <li>the dequantization lookup, as written by {@link DoubleLookup#save},</li>
 *   <li>the number of grams of that order (int),</li>
 *   <li>for each hash slot, a fingerprint (short) followed by a quantization index (byte),</li>
 *   <li>the hash function, as written by its {@code serialize} method.</li>
 * </ol>
 *
 * @param model  the model whose log probabilities are quantized and written
 * @param output the file to write; it is opened with {@link FileOutputStream}
 * @throws IOException if writing to the file fails
 */
public static void compress(MapBasedCharNgramLanguageModel model, File output) throws IOException {
    final int order = model.getOrder();
    // Slot 0 is never used; arrays are indexed by n-gram order.
    Mphf[] mphfs = new MultiLevelMphf[order + 1];
    DoubleLookup[] lookups = new DoubleLookup[order + 1];

    try (DataOutputStream out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(output)))) {
        out.writeInt(order);
        out.writeUTF(model.getId());

        for (int n = 1; n <= order; n++) {
            // Collect the log probability values of this order through a histogram and
            // copy what the histogram iterates over into a primitive array.
            Histogram<Double> probabilityHistogram = new Histogram<>();
            probabilityHistogram.add(model.gramLogProbs[n].values.values());
            double[] distinctValues = new double[probabilityHistogram.size()];
            int cursor = 0;
            for (Double value : probabilityHistogram) {
                distinctValues[cursor++] = value;
            }

            // NOTE(review): the second argument is presumably the bit count (8 -> index fits a byte) - confirm.
            Quantizer quantizer = BinningQuantizer.linearBinning(distinctValues, 8);
            lookups[n] = quantizer.getDequantizer();

            List<String> grams = Lists.newArrayList(model.gramLogProbs[n].values.keySet());
            final int gramCount = grams.size();
            int[] fingerprints = new int[gramCount];
            int[] probabilityIndexes = new int[gramCount];
            mphfs[n] = MultiLevelMphf.generate(new StringListKeyProvider(grams));

            // Place each gram's fingerprint and quantized probability at the slot the hash assigns to it.
            for (final String gram : grams) {
                final int slot = mphfs[n].get(gram);
                fingerprints[slot] = MultiLevelMphf.hash(gram, -1) & FINGER_PRINT_MASK;
                probabilityIndexes[slot] = quantizer.getQuantizationIndex(model.gramLogProbs[n].values.get(gram));
            }

            lookups[n].save(out);
            out.writeInt(gramCount);
            for (int slot = 0; slot < gramCount; slot++) {
                out.writeShort(fingerprints[slot] & 0xffff);
                out.writeByte(probabilityIndexes[slot]);
            }
            mphfs[n].serialize(out);
        }
    }
}
Use of zemberek.core.quantization.DoubleLookup in project zemberek-nlp by ahmetaa.
Class MultiFileUncompressedLm, method generateRankFile.
/**
 * Quantizes the float values stored in {@code probFile} and writes their quantization indexes
 * ("ranks") to {@code rankFile}. Afterwards the dequantization lookup is saved to a file named
 * {@code probFile.getName() + ".lookup"} under {@code dir}.
 *
 * <p>Input layout read from {@code probFile}: a value count (int) followed by that many floats.
 * Output layout written to {@code rankFile}: the value count (int), the number of bytes used
 * per rank (int), then one rank per value written high byte first in 1, 2 or 3 bytes.
 *
 * @param bit           number of bits used for quantization; determines the bytes per rank
 * @param currentOrder  n-gram order, used only in log messages
 * @param probFile      file holding the values to quantize
 * @param rankFile      file that receives the ranks
 * @param quantizerType type of quantizer to build for {@code probFile}
 * @throws IOException              if reading or writing any of the files fails
 * @throws IllegalArgumentException if {@code bit} requires a rank width outside 1 to 3 bytes
 */
private void generateRankFile(int bit, int currentOrder, File probFile, File rankFile, QuantizerType quantizerType) throws IOException {
    // Whole bytes needed to hold a 'bit'-wide rank; a zero width is promoted to one byte.
    int bytecount = (bit % 8 == 0 ? bit / 8 : bit / 8 + 1);
    if (bytecount == 0) {
        bytecount = 1;
    }
    // Only 1, 2 and 3 byte ranks can be written below. Reject anything else before touching
    // the files; otherwise the rank file would get a header but no rank data.
    if (bytecount < 1 || bytecount > 3) {
        throw new IllegalArgumentException(
            "Unsupported bit count " + bit + ": rank width must be between 1 and 3 bytes.");
    }
    try (DataInputStream dis = new DataInputStream(new BufferedInputStream(new FileInputStream(probFile)));
         DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(rankFile)))) {
        int count = dis.readInt();
        Quantizer quantizer = BinaryFloatFileReader.getQuantizer(probFile, bit, quantizerType);
        dos.writeInt(count);
        Log.info("Writing Rank file for " + currentOrder + " grams");
        dos.writeInt(bytecount);
        // Reused buffer for the three byte case.
        byte[] bytez = new byte[3];
        for (int j = 0; j < count; j++) {
            final int rank = quantizer.getQuantizationIndex(dis.readFloat());
            switch (bytecount) {
                case 1:
                    dos.write(rank & 0xff);
                    break;
                case 2:
                    dos.writeShort(rank & 0xffff);
                    break;
                case 3:
                    bytez[0] = (byte) ((rank >>> 16) & 0xff);
                    bytez[1] = (byte) ((rank >>> 8) & 0xff);
                    bytez[2] = (byte) (rank & 0xff);
                    dos.write(bytez);
                    break;
                default:
                    // Not reachable because of the range check above; kept so that a future
                    // change to that check cannot silently drop rank data again.
                    throw new IllegalStateException("Unexpected rank byte count: " + bytecount);
            }
        }
        DoubleLookup lookup = quantizer.getDequantizer();
        Log.info("Writing lookups for " + currentOrder + " grams. Size= " + lookup.getRange());
        lookup.save(new File(dir, probFile.getName() + ".lookup"));
    }
}
Aggregations