use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
the class EvaluateNer method run.
/**
 * Evaluates named entity recognition output against a reference data set.
 *
 * <p>When {@code hypothesisPath} is {@code null}, a model is loaded from {@code modelRoot} and
 * applied to the reference data to produce the hypothesis. Otherwise the hypothesis is loaded
 * from {@code hypothesisPath}. In both cases a report is written to {@code eval-report} under
 * the output directory and the collected result is logged.
 *
 * @throws Exception if argument validation, data loading, or evaluation fails.
 */
@Override
public void run() throws Exception {
  initializeOutputDir();
  if (hypothesisPath == null) {
    IOUtil.checkDirectoryArgument(modelRoot, "Model Root");
  } else {
    // Validate the hypothesis file itself. Previously the reference path was checked here,
    // so a wrong hypothesis path was only detected later while loading the data set.
    IOUtil.checkFileArgument(hypothesisPath, "Hypothesis File");
  }
  IOUtil.checkFileArgument(referencePath, "Reference File");

  NerDataSet hypothesis;
  NerDataSet reference = NerDataSet.load(referencePath, annotationStyle);
  Log.info("Reference :");
  Log.info(reference.info());

  if (hypothesisPath == null) {
    // No precomputed hypothesis: run the model over the reference data and time it.
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    PerceptronNer ner = PerceptronNer.loadModel(modelRoot, morphology);
    Stopwatch sw = Stopwatch.createStarted();
    hypothesis = ner.evaluate(reference);
    double secs = sw.elapsed(TimeUnit.MILLISECONDS) / 1000d;
    Log.info("NER is applied to reference data in %.4f seconds.", secs);
  } else {
    hypothesis = NerDataSet.load(hypothesisPath, annotationStyle);
  }
  Log.info("Hypothesis :");
  Log.info(hypothesis.info());

  Path reportPath = outDir.resolve("eval-report");
  PerceptronNerTrainer.evaluationReport(reference, hypothesis, reportPath);
  TestResult result = PerceptronNerTrainer.collectEvaluationData(reference, hypothesis);
  Log.info("Evaluation Result:");
  Log.info(result.dump());
  Log.info("Detailed evaluation report is written in %s", reportPath);
}
use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
the class FindNamedEntities method run.
/**
 * Finds named entities in the sentences of the input file and writes the annotated sentences
 * to a file named after the input with a {@code .ne} suffix, inside the output directory.
 *
 * <p>Each sentence is normalized (apostrophes, quotes/hyphens, spaces/soft hyphens) and
 * tokenized before it is handed to the model loaded from {@code modelRoot}. Token count,
 * elapsed time and throughput are logged once processing finishes.
 *
 * @throws Exception if argument validation, reading, model loading, or writing fails.
 */
@Override
public void run() throws Exception {
  initializeOutputDir();
  IOUtil.checkDirectoryArgument(modelRoot, "Model Root");
  IOUtil.checkFileArgument(inputPath, "Input File");

  String outputName = inputPath.toFile().getName() + ".ne";
  Path out = outDir.resolve(outputName);

  List<String> paragraphs = Files.readAllLines(inputPath, StandardCharsets.UTF_8);
  List<String> sentences = TurkishSentenceExtractor.DEFAULT.fromParagraphs(paragraphs);
  Log.info("There are %d lines and about %d sentences", paragraphs.size(), sentences.size());

  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  PerceptronNer ner = PerceptronNer.loadModel(modelRoot, morphology);

  Stopwatch timer = Stopwatch.createStarted();
  int tokenCount = 0;
  try (PrintWriter writer = new PrintWriter(out.toFile(), "UTF-8")) {
    for (String raw : sentences) {
      // Normalization steps are applied in the same order: apostrophes, quotes/hyphens, spaces.
      String normalized = TextUtil.normalizeSpacesAndSoftHyphens(
          TextUtil.normalizeQuotesHyphens(
              TextUtil.normalizeApostrophes(raw)));
      List<String> tokens = TurkishTokenizer.DEFAULT.tokenizeToStrings(normalized);
      tokenCount += tokens.size();
      NerSentence annotated = ner.findNamedEntities(normalized, tokens);
      writer.println(annotated.getAsTrainingSentence(annotationStyle));
    }
  }
  double secs = timer.elapsed(TimeUnit.MILLISECONDS) / 1000d;

  Log.info("Token count = %s", tokenCount);
  Log.info("File processed in %.4f seconds.", secs);
  Log.info("Speed = %.2f tokens/sec", tokenCount / secs);
  Log.info("Result is written in %s", out);
}
use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
the class TrainNerModel method run.
/**
 * Trains a perceptron NER model from the training data and saves it in two forms under the
 * output directory: as text in {@code model} and compressed in {@code model-compressed}.
 *
 * <p>A log file handler is attached at {@code train-log} in the output directory. When
 * {@code developmentPath} is set, the development set is validated, loaded and passed to the
 * trainer; otherwise {@code null} is passed in its place.
 *
 * @throws Exception if argument validation, data loading, training, or saving fails.
 */
@Override
public void run() throws Exception {
  initializeOutputDir();
  IOUtil.checkFileArgument(trainDataPath, "Training file");

  Path textModelDir = outDir.resolve("model");
  Path compressedModelDir = outDir.resolve("model-compressed");
  Log.addFileHandler(outDir.resolve("train-log"));

  // The development set is optional; validate it up front, before any data is loaded.
  boolean hasDevelopmentSet = developmentPath != null;
  if (hasDevelopmentSet) {
    IOUtil.checkFileArgument(developmentPath, "Development file");
  }

  NerDataSet trainingSet = NerDataSet.load(trainDataPath, annotationStyle);
  Log.info("Training set information:");
  Log.info(trainingSet.info());

  NerDataSet devSet = null;
  if (hasDevelopmentSet) {
    devSet = NerDataSet.load(developmentPath, annotationStyle);
    Log.info("Development set information:");
    Log.info(devSet.info());
  }

  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  Log.info("------------ Training Started --------------------");
  PerceptronNerTrainer trainer = new PerceptronNerTrainer(morphology);
  PerceptronNer ner = trainer.train(trainingSet, devSet, iterationCount, learningRate);

  Files.createDirectories(textModelDir);
  Files.createDirectories(compressedModelDir);
  ner.saveModelAsText(textModelDir);
  ner.saveModelCompressed(compressedModelDir);

  Log.info("Text model is created in %s", textModelDir);
  Log.info("Compressed model is created in %s", compressedModelDir);
}
use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
the class SpellCheckerPerformanceTests method correctWordFindingTest.
/**
 * Manual run (ignored as a test): spell-checks every token of the
 * {@code spell-checker-test.txt} resource, logs the elapsed time and the total/unique counts,
 * and saves the accepted and rejected tokens sorted by count to {@code correct.txt} and
 * {@code incorrect.txt}.
 */
@Test
@Ignore(value = "Not a test.")
public void correctWordFindingTest() throws Exception {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  TurkishSpellChecker spellChecker = new TurkishSpellChecker(morphology);

  Path path = new File(Resources.getResource("spell-checker-test.txt").getFile()).toPath();
  List<String> lines = Files.readAllLines(path);
  List<String> sentences = TurkishSentenceExtractor.DEFAULT.fromParagraphs(lines);

  Stopwatch sw = Stopwatch.createStarted();
  Histogram<String> incorrectFound = new Histogram<>();
  Histogram<String> correctFound = new Histogram<>();
  for (String sentence : sentences) {
    for (Token token : TurkishTokenizer.DEFAULT.tokenize(sentence)) {
      String text = token.getText();
      // Route each token to the histogram matching the spell checker's verdict.
      Histogram<String> target = spellChecker.check(text) ? correctFound : incorrectFound;
      target.add(text);
    }
  }

  Log.info("Elapsed = %d", sw.elapsed(TimeUnit.MILLISECONDS));
  Log.info("Incorrect (total/unique) = %d / %d",
      incorrectFound.totalCount(), incorrectFound.size());
  Log.info("Correct (total/unique) = %d / %d",
      correctFound.totalCount(), correctFound.size());

  incorrectFound.saveSortedByCounts(Paths.get("incorrect.txt"), " : ");
  correctFound.saveSortedByCounts(Paths.get("correct.txt"), " : ");
  /*
  Path lmPath = Paths.get(ClassLoader.getSystemResource("lm-bigram.slm").toURI());
  SmoothLm model = SmoothLm.builder(lmPath.toFile()).build();
  */
}
use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
the class TurkishSpellCheckerTest method checkVerb1.
/**
 * Builds a spell checker from a lexicon containing only "okumak" and the single ending "dum",
 * then asserts that "okudum" is accepted.
 */
@Test
public void checkVerb1() {
  TurkishMorphology morphology = TurkishMorphology.builder()
      .setLexicon("okumak")
      .build();
  StemEndingGraph graph = new StemEndingGraph(morphology, Lists.newArrayList("dum"));
  TurkishSpellChecker checker = new TurkishSpellChecker(morphology, graph.stemGraph);
  Assert.assertTrue(checker.check("okudum"));
}
Aggregations