use of zemberek.core.embeddings.FastText in project zemberek-nlp by ahmetaa.
the class AutomaticLabelingExperiment method getOrTrainFastText.
/**
 * Returns a FastText model for the given model path, training one only when needed.
 *
 * <p>If a file already exists at {@code modelPath}, the model is loaded from it. Otherwise a
 * supervised model is trained on {@code train} with the hyper-parameters fixed below and then
 * saved to {@code modelPath} before being returned.
 *
 * @param train path of the training data handed to the trainer
 * @param modelPath location the model is loaded from, or saved to after training
 * @return the loaded or freshly trained model
 * @throws Exception propagated from loading, training or saving the model
 */
private FastText getOrTrainFastText(Path train, Path modelPath) throws Exception {
  // A model file is already present: reuse it instead of training again.
  if (modelPath.toFile().exists()) {
    return FastText.load(modelPath);
  }

  // No model on disk yet: configure a supervised training run.
  Args trainingArgs = Args.forSupervised();
  trainingArgs.loss = Args.loss_name.hierarchicalSoftmax;
  trainingArgs.thread = 16;
  trainingArgs.epoch = 100;
  trainingArgs.lr = 0.2;
  trainingArgs.dim = 250;
  trainingArgs.wordNgrams = 2;
  trainingArgs.minCount = 10;
  trainingArgs.bucket = 7_000_000;

  FastText trainedModel = new FastTextTrainer(trainingArgs).train(train);
  // Save the result so the next call can take the load path above.
  trainedModel.saveModel(modelPath);
  return trainedModel;
}
use of zemberek.core.embeddings.FastText in project zemberek-nlp by ahmetaa.
the class AutomaticLabelingExperiment method runExperiment.
/**
 * Runs the labeling experiment using files located under {@code experimentRoot}.
 *
 * <p>Steps: build the experiment set from {@code label.corpus}, write it out as
 * {@code labels.train} / {@code labels.test}, load or train the model at {@code labels.model},
 * and run the test step, passing {@code labels.prediction} as the prediction path.
 *
 * @throws Exception propagated from any of the steps
 */
public void runExperiment() throws Exception {
  Path corpusFile = experimentRoot.resolve("label.corpus");
  // NOTE: the corpus extraction step, extractLabeledDocuments(rawCorpusRoot, corpusPath),
  // is disabled here, so the corpus file is presumably expected to exist already.
  Set<String> experimentSet = generateSetForLabelExperiment(corpusFile, morphology, true);

  Path trainFile = experimentRoot.resolve("labels.train");
  Path testFile = experimentRoot.resolve("labels.test");
  saveSets(trainFile, testFile, experimentSet);

  Path modelFile = experimentRoot.resolve("labels.model");
  FastText model = getOrTrainFastText(trainFile, modelFile);

  Path predictionFile = experimentRoot.resolve("labels.prediction");
  test(corpusFile, testFile, predictionFile, model);
}
use of zemberek.core.embeddings.FastText in project zemberek-nlp by ahmetaa.
the class DocumentSimilarityExperiment method checkSimilarity.
/**
 * Writes, for a limited number of corpus documents, each document followed by its nearest
 * neighbours according to FastText sentence vectors.
 *
 * <p>Every document's content is first replaced with the result of {@code hack(...)} applied to
 * its lines. Documents whose content length is below 500 are skipped. For the remaining ones a
 * vector is computed from at most the first 200 characters of the content string. The report is
 * produced for at most the first 100 vectorized documents, each with the result of
 * {@code nearestK(..., 5)}.
 *
 * @param model path of the FastText model to load
 * @param corpusFile path of the corpus whose documents are compared
 * @param outPath file the report is written to, using utf-8
 * @throws IOException propagated from model/corpus loading or from opening the output file
 */
public void checkSimilarity(Path model, Path corpusFile, Path outPath) throws IOException {
  FastText fastText = FastText.load(model);
  List<WebDocument> documents = WebCorpus.loadDocuments(corpusFile);
  List<DocumentSimilarity> vectorized = new ArrayList<>();

  Log.info("Calculating document vectors.");
  for (WebDocument document : documents) {
    document.setContent(hack(document.getLines()));
    if (document.contentLength() < 500) {
      continue;
    }
    // Only a prefix of the content (at most 200 characters) is used for the vector.
    String text = document.getContentAsString();
    if (text.length() > 200) {
      text = text.substring(0, 200);
    }
    // Alternative kept from the original code for reference:
    // float[] vec = fastText.textVectors(doc.getLines()).data_.clone();
    float[] vector = fastText.sentenceVector(text).clone();
    vectorized.add(new DocumentSimilarity(document, vector));
  }

  // The report covers at most the first 100 vectorized documents.
  int reportCount = Math.min(vectorized.size(), 100);
  try (PrintWriter writer = new PrintWriter(outPath.toFile(), "utf-8")) {
    for (DocumentSimilarity current : vectorized.subList(0, reportCount)) {
      List<ScoredItem<WebDocument>> neighbours = nearestK(current, vectorized, 5);
      writer.println("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@");
      writer.println(String.join("\n", current.document.getLines()));
      for (ScoredItem<WebDocument> neighbour : neighbours) {
        writer.println("----------------------------------");
        writer.println(String.join("\n", neighbour.item.getLines()));
      }
    }
  }
}
use of zemberek.core.embeddings.FastText in project zemberek-nlp by ahmetaa.
the class CategoryPredictionExperiment method runExperiment.
/**
 * Runs the category prediction experiment using files located under {@code experimentRoot}.
 *
 * <p>Steps: extract category documents from {@code rawCorpusRoot} into {@code category.corpus},
 * generate the train/test sets and the raw title set, load the model at {@code category.model}
 * or train and save it when that file does not exist, log the evaluation result on the test
 * set, and finally write a report to {@code category.predictions}. For each test line the
 * report holds the document id, its title, its actual category and up to three predictions.
 *
 * @throws Exception propagated from any of the steps
 */
private void runExperiment() throws Exception {
  Path corpusPath = experimentRoot.resolve("category.corpus");
  Path train = experimentRoot.resolve("category.train");
  Path test = experimentRoot.resolve("category.test");
  Path titleRaw = experimentRoot.resolve("category.title");
  Path modelPath = experimentRoot.resolve("category.model");
  Path predictionPath = experimentRoot.resolve("category.predictions");

  extractCategoryDocuments(rawCorpusRoot, corpusPath);

  // Flags passed to generateSets for building the train and test sets.
  final boolean titlesOnly = true;
  final boolean lemmatize = true;
  generateSets(corpusPath, train, test, titlesOnly, lemmatize);
  generateRawSet(corpusPath, titleRaw);

  FastText fastText;
  if (!modelPath.toFile().exists()) {
    // No saved model: train a supervised softmax classifier and store it.
    Args trainingArgs = Args.forSupervised();
    trainingArgs.model = Args.model_name.supervised;
    trainingArgs.loss = Args.loss_name.softmax;
    trainingArgs.thread = 4;
    trainingArgs.epoch = 50;
    trainingArgs.lr = 0.5;
    trainingArgs.dim = 100;
    trainingArgs.wordNgrams = 2;
    trainingArgs.minCount = 0;
    trainingArgs.bucket = 5_000_000;
    fastText = new FastTextTrainer(trainingArgs).train(train);
    fastText.saveModel(modelPath);
  } else {
    Log.info("Reusing existing model %s", modelPath);
    fastText = FastText.load(modelPath);
  }

  // Second argument is presumably the number of predictions considered per line -- confirm.
  EvaluationResult evaluation = fastText.test(test, 1);
  Log.info(evaluation.toString());

  WebCorpus corpus = new WebCorpus("corpus", "labeled");
  corpus.addDocuments(WebCorpus.loadDocuments(corpusPath));

  Log.info("Testing started.");
  List<String> testLines = Files.readAllLines(test, StandardCharsets.UTF_8);
  try (PrintWriter report = new PrintWriter(predictionPath.toFile(), "utf-8")) {
    for (String testLine : testLines) {
      // The id is the first space-delimited token of the line without its leading character.
      int firstSpace = testLine.indexOf(' ');
      String idToken = testLine.substring(0, firstSpace);
      String id = idToken.substring(1);

      WebDocument doc = corpus.getDocument(id);
      List<ScoredItem<String>> predictions = fastText.predict(testLine, 3);

      List<String> predictedCategories = new ArrayList<>();
      for (ScoredItem<String> prediction : predictions) {
        // Keep every prediction whose score is not below -10.
        if (!(prediction.score < -10)) {
          String readableLabel = prediction.item.replace("__label__", "").replace("_", " ");
          predictedCategories.add(
              String.format(Locale.ENGLISH, "%s (%.2f)", readableLabel, prediction.score));
        }
      }

      report.println("id = " + id);
      report.println();
      report.println(doc.getTitle());
      report.println();
      report.println("Actual Category = " + doc.getCategory());
      report.println("Predictions = " + String.join(", ", predictedCategories));
      report.println();
      report.println("------------------------------------------------------");
      report.println();
    }
  }
  Log.info("Done.");
}
use of zemberek.core.embeddings.FastText in project zemberek-nlp by ahmetaa.
the class GenerateWordVectors method run.
/**
 * Trains word vectors from {@code input} and saves them in text format to {@code output}.
 *
 * <p>The trainer is configured from this object's option fields. This object is registered on
 * the trainer's event bus before training starts; the handlers are not visible in this method
 * (presumably they receive training progress events).
 *
 * @throws IOException propagated from training or from saving the vectors
 */
@Override
public void run() throws IOException {
  Log.info("Generating word vectors from %s", input);

  WordVectorsTrainer trainer = WordVectorsTrainer.builder()
      .epochCount(epochCount)
      .learningRate(learningRate)
      .modelType(modelType)
      .minWordCount(minWordCount)
      .threadCount(threadCount)
      .wordNgramOrder(wordNGrams)
      .dimension(dimension)
      .contextWindowSize(contextWindowSize)
      .build();

  Log.info("Training Started.");
  trainer.getEventBus().register(this);
  FastText trainedModel = trainer.train(input);

  Log.info("Saving vectors in text format to %s", output);
  trainedModel.saveVectors(output);
}
Aggregations