use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
the class PerceptronNer method loadModel.
/**
 * Builds a {@link PerceptronNer} from the model files stored directly under a directory.
 *
 * <p>Every entry found at depth at most 1 below {@code modelRoot} whose file name ends with
 * {@code ".ner.model"} is loaded with {@code ClassModel.load} and registered under the
 * loaded model's {@code id}. If two files yield the same id, the one visited later replaces
 * the earlier one.
 *
 * @param modelRoot directory that is scanned (non-recursively) for {@code *.ner.model} files
 * @param morphology morphology instance handed to the resulting {@link PerceptronNer}
 * @return a recognizer backed by all class models found under {@code modelRoot}
 * @throws IOException if the directory cannot be walked or a model file cannot be loaded
 */
public static PerceptronNer loadModel(Path modelRoot, TurkishMorphology morphology) throws IOException {
  List<Path> files;
  // Files.walk keeps the underlying directory open until the stream is closed, so it has to
  // be used inside try-with-resources to avoid leaking the handle.
  try (java.util.stream.Stream<Path> entries = Files.walk(modelRoot, 1)) {
    files = entries
        .filter(s -> s.toFile().getName().endsWith(".ner.model"))
        .collect(Collectors.toList());
  }

  Map<String, ClassModel> weightsMap = new HashMap<>();
  for (Path file : files) {
    ClassModel weights = ClassModel.load(file);
    weightsMap.put(weights.id, weights);
  }
  return new PerceptronNer(weightsMap, morphology);
}
use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
the class CharacterGraphDecoderTest method stemEndingTest1.
/**
 * Checks that a decoder built over the stem graph of a two-verb lexicon ("bakmak", "gelmek")
 * with the endings "acak" and "ecek" returns exactly one suggestion, "bakacak", for the
 * input "bakcaak".
 */
@Test
public void stemEndingTest1() {
  List<String> suffixes = Lists.newArrayList("acak", "ecek");
  TurkishMorphology lexiconMorphology =
      TurkishMorphology.builder().setLexicon("bakmak", "gelmek").build();

  StemEndingGraph stemEndingGraph = new StemEndingGraph(lexiconMorphology, suffixes);
  CharacterGraphDecoder decoder = new CharacterGraphDecoder(stemEndingGraph.stemGraph);

  List<String> suggestions = decoder.getSuggestions("bakcaak");

  // Exactly one candidate is expected, and it must be the corrected form.
  Assert.assertEquals(1, suggestions.size());
  Assert.assertEquals("bakacak", suggestions.get(0));
}
use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
the class NerExperiment method trainAndTest.
/**
 * Trains a perceptron NER model, saves it as text and writes an evaluation report.
 *
 * <p>Both data sets are read in {@code AnnotationStyle.BRACKET} format and their info is
 * logged. The trainer is called with the arguments {@code 7} and {@code 0.1f}; the resulting
 * model is written under {@code modelRoot} (created if missing), then evaluated on the test
 * set, and the report is produced via {@code PerceptronNerTrainer.evaluationReport}.
 *
 * @param trainPath file containing the training sentences
 * @param testPath file containing the test sentences
 * @param modelRoot directory the trained model is saved into
 * @param reportPath path passed to the evaluation report writer
 * @throws IOException if reading the data sets or writing the model/report fails
 */
public static void trainAndTest(Path trainPath, Path testPath, Path modelRoot, Path reportPath) throws IOException {
  NerDataSet trainData = NerDataSet.load(trainPath, AnnotationStyle.BRACKET);
  Log.info(trainData.info());

  NerDataSet testData = NerDataSet.load(testPath, AnnotationStyle.BRACKET);
  Log.info(testData.info());

  TurkishMorphology defaultMorphology = TurkishMorphology.createWithDefaults();
  PerceptronNerTrainer trainer = new PerceptronNerTrainer(defaultMorphology);
  PerceptronNer trainedNer = trainer.train(trainData, testData, 7, 0.1f);

  // The target directory must exist before the model files are written into it.
  Files.createDirectories(modelRoot);
  trainedNer.saveModelAsText(modelRoot);

  Log.info("Testing %d sentences.", testData.sentences.size());
  NerDataSet evaluated = trainedNer.evaluate(testData);
  PerceptronNerTrainer.evaluationReport(testData, evaluated, reportPath);

  Log.info("Done.");
}
use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
the class NerExperiment method main.
/**
 * Experiment driver: trains and evaluates a model, reloads it from disk, then annotates the
 * sentences of an input file and prints the elapsed time and the number of tokens processed.
 *
 * <p>All paths are resolved against a hard-coded data root. Input lines that already contain
 * a '[' or ']' character are skipped and not written to the output.
 *
 * @param args unused
 * @throws IOException if any of the data, model or output files cannot be read or written
 */
public static void main(String[] args) throws IOException {
  Path dataRoot = Paths.get("/home/ahmetaa/data/nlp/ner");
  Path trainFile = dataRoot.resolve("sentences.20k.result.txt");
  Path testFile = dataRoot.resolve("reyyan.test.txt");
  Path modelDir = dataRoot.resolve("ner/model-toy");
  Path reportFile = dataRoot.resolve("test-result.txt");
  trainAndTest(trainFile, testFile, modelDir, reportFile);

  TurkishMorphology defaultMorphology = TurkishMorphology.createWithDefaults();
  PerceptronNer recognizer = PerceptronNer.loadModel(modelDir, defaultMorphology);

  // The timer starts before the input is read, so the reported time includes file loading.
  Stopwatch timer = Stopwatch.createStarted();
  Path inputFile = dataRoot.resolve("sentences.1k");
  Path outputFile = dataRoot.resolve("sentences.1k.result.txt");
  List<String> lines = Files.readAllLines(inputFile);

  int totalTokens = 0;
  try (PrintWriter writer = new PrintWriter(outputFile.toFile(), "UTF-8")) {
    for (String line : lines) {
      // Only lines free of bracket characters are processed.
      if (!line.contains("[") && !line.contains("]")) {
        totalTokens += TurkishTokenizer.DEFAULT.tokenize(line).size();
        NerSentence annotated = recognizer.findNamedEntities(line);
        writer.println(annotated.getAsTrainingSentence(AnnotationStyle.BRACKET));
      }
    }
  }

  System.out.println("Elapsed = " + timer.elapsed(TimeUnit.MILLISECONDS));
  System.out.println("TokenCount = " + totalTokens);
}
use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
the class NormalizationVocabularyGenerator method main.
/**
 * Generates normalization vocabularies from a corpus directory.
 *
 * <p>The corpus is loaded through {@code BlockTextLoader.fromDirectoryRoot} using the
 * {@code clean-list} file under the corpus root, and the results are written to the output
 * directory, which is created if missing. Half of the available processors are used as
 * worker threads, limited to the range 1..22.
 *
 * @param args unused
 * @throws Exception if morphology creation, corpus loading or vocabulary generation fails
 */
public static void main(String[] args) throws Exception {
  TurkishMorphology morphology = getTurkishMorphology();
  NormalizationVocabularyGenerator generator = new NormalizationVocabularyGenerator(morphology);

  Path corporaRoot = Paths.get("/home/aaa/data/normalization/corpus");
  Path outRoot = Paths.get("/home/aaa/data/normalization/vocab-clean");
  Path rootList = corporaRoot.resolve("clean-list");
  BlockTextLoader corpusProvider = BlockTextLoader.fromDirectoryRoot(corporaRoot, rootList, 30_000);
  Files.createDirectories(outRoot);

  // create vocabularies
  // availableProcessors() / 2 evaluates to 0 on a single-CPU machine; clamp to at least one
  // worker (a zero thread count is presumably invalid for createVocabulary - confirm) and
  // keep the existing upper bound of 22.
  int halfOfProcessors = Runtime.getRuntime().availableProcessors() / 2;
  int threadCount = Math.max(1, Math.min(22, halfOfProcessors));
  generator.createVocabulary(corpusProvider, threadCount, outRoot);
}
Aggregations