use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
the class HunspellOperations method generateAnnotationFileMultiSplit.
/**
 * Builds a stem/ending annotation file from a vocabulary file.
 *
 * <p>Every line of {@code vocab} is treated as one word and analyzed. Words whose analysis is
 * not correct are logged and skipped. For each remaining analysis (ignoring those whose
 * dictionary item is a proper noun or an abbreviation) the surface sequence is split on single
 * spaces and every prefix of that sequence is tried as a stem: the prefix is concatenated into
 * one string, the remaining tokens form the ending. A split is kept only when the ending is
 * non-empty and the concatenated stem passes {@code isCorrectAndContainsNoProper}.
 *
 * <p>Each kept split is rendered as {@code word + " " + stem + " " + ending} (ending tokens are
 * space separated); the distinct splits of one word are joined with {@code ","} in first-seen
 * order and become one output line.
 *
 * @param vocab UTF-8 text file, one word per line
 * @param annotationsPath destination file, written as UTF-8
 * @throws IOException if the vocabulary cannot be read or the annotations cannot be written
 */
private static void generateAnnotationFileMultiSplit(Path vocab, Path annotationsPath) throws IOException {
  List<String> words = Files.readAllLines(vocab, StandardCharsets.UTF_8);
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  List<String> annotations = new ArrayList<>();
  for (String word : words) {
    WordAnalysis analysis = morphology.analyze(word);
    if (!analysis.isCorrect()) {
      Log.warn("Cannot analyze %s", word);
      continue;
    }
    // LinkedHashSet: drop duplicate splits produced by different analyses, keep first-seen order.
    LinkedHashSet<String> stemEndings = new LinkedHashSet<>();
    for (SingleAnalysis s : analysis) {
      SecondaryPos secondaryPos = s.getDictionaryItem().secondaryPos;
      if (secondaryPos == SecondaryPos.ProperNoun || secondaryPos == SecondaryPos.Abbreviation) {
        continue;
      }
      String surfaces = AnalysisFormatters.SURFACE_SEQUENCE.format(s);
      List<String> tokens = Splitter.on(" ").splitToList(surfaces);
      String stem = tokens.get(0);
      for (int i = 0; i < tokens.size(); i++) {
        if (i > 0) {
          // Grow the candidate stem by one more surface token.
          stem = stem + tokens.get(i);
        }
        // subList(size, size) is a valid empty view, so the last index needs no special case.
        String ending = String.join(" ", tokens.subList(i + 1, tokens.size()));
        if (ending.isEmpty()) {
          // Nothing would be recorded for this split, so do not pay for analyzing the stem.
          continue;
        }
        if (isCorrectAndContainsNoProper(morphology.analyze(stem))) {
          stemEndings.add(word + " " + stem + " " + ending);
        }
      }
    }
    // NOTE(review): a word with no accepted split still contributes an empty line here.
    annotations.add(String.join(",", stemEndings));
  }
  Files.write(annotationsPath, annotations, StandardCharsets.UTF_8);
}
use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
the class TurkishStopWords method generateFromDictionary.
/**
 * Derives a stop word list from the lexicon of a default {@link TurkishMorphology} instance.
 *
 * <p>The lemma of every dictionary item whose primary POS is one of Adverb, Conjunction,
 * Determiner, Interjection, PostPositive, Numeral, Pronoun or Question is collected,
 * de-duplicated, sorted with {@code Turkish.STRING_COMPARATOR_ASC} and handed to a new
 * {@code TurkishStopWords} as an insertion-ordered set.
 *
 * @return stop words built from the selected lexicon lemmas
 * @throws IOException declared by the signature; propagated from the calls below if raised
 */
static TurkishStopWords generateFromDictionary() throws IOException {
  Set<PrimaryPos> stopWordCategories = Sets.newHashSet(
      PrimaryPos.Adverb,
      PrimaryPos.Conjunction,
      PrimaryPos.Determiner,
      PrimaryPos.Interjection,
      PrimaryPos.PostPositive,
      PrimaryPos.Numeral,
      PrimaryPos.Pronoun,
      PrimaryPos.Question);
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();

  // Hash set first: several dictionary items may share the same lemma.
  Set<String> lemmas = new HashSet<>();
  RootLexicon lexicon = morphology.getLexicon();
  for (DictionaryItem entry : lexicon) {
    if (!stopWordCategories.contains(entry.primaryPos)) {
      continue;
    }
    lemmas.add(entry.lemma);
  }

  // Sort a list copy, then freeze that order in a LinkedHashSet.
  List<String> sortedLemmas = new ArrayList<>(lemmas);
  sortedLemmas.sort(Turkish.STRING_COMPARATOR_ASC);
  return new TurkishStopWords(new LinkedHashSet<>(sortedLemmas));
}
use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
the class GenerateNerModel method main.
/**
 * Example entry point: trains a perceptron NER model and saves it as text.
 *
 * <p>Loads bracket-annotated data sets from the relative paths {@code ner-train} and
 * {@code ner-test}, logs their info, trains with a default {@link TurkishMorphology}, creates
 * the {@code my-model} directory if needed and writes the model there.
 *
 * @param args unused
 * @throws IOException if loading the data sets or writing the model fails
 */
public static void main(String[] args) throws IOException {
  // you will need ner-train and ner-test files to run this example.
  Path trainPath = Paths.get("ner-train");
  Path testPath = Paths.get("ner-test");
  Path modelRoot = Paths.get("my-model");
  NerDataSet trainingSet = NerDataSet.load(trainPath, AnnotationStyle.BRACKET);
  // prints information
  Log.info(trainingSet.info());
  NerDataSet testSet = NerDataSet.load(testPath, AnnotationStyle.BRACKET);
  Log.info(testSet.info());
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  // Training occurs here. Result is a PerceptronNer instance.
  // There will be 13 iterations with 0.1 learning rate.
  // (The previous comment said 7 iterations, which did not match the value actually passed.)
  int iterationCount = 13;
  float learningRate = 0.1f;
  PerceptronNer ner = new PerceptronNerTrainer(morphology)
      .train(trainingSet, testSet, iterationCount, learningRate);
  Files.createDirectories(modelRoot);
  ner.saveModelAsText(modelRoot);
}
use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
the class NormalizeNoisyText method main.
/**
 * Example entry point: normalizes a handful of noisy Turkish sentences and prints, for each one,
 * the original text, the normalized text and an empty line to standard output.
 *
 * @param args unused
 * @throws IOException if the normalizer cannot be created from the configured paths
 */
public static void main(String[] args) throws IOException {
  String[] examples = {
      "Yrn okua gidicem",
      "Tmm, yarin havuza giricem ve aksama kadar yaticam :)",
      "ah aynen ya annemde fark ettı siz evinizden cıkmayın diyo",
      "gercek mı bu? Yuh! Artık unutulması bile beklenmiyo",
      "Hayır hayat telaşm olmasa alacam buraları gökdelen dikicem.",
      "yok hocam kesınlıkle oyle birşey yok",
      "herseyi soyle hayatında olmaması gerek bence boyle ınsanların falan baskı yapıyosa"
  };

  // Point the paths below at your own normalization data root folder and language model file.
  // Example data: https://drive.google.com/drive/folders/1tztjRiUs9BOTH-tb1v7FWyixl-iUpydW
  // Download the lm and normalization folders into a local directory first.
  Path dataRoot = Paths.get("/home/aaa/zemberek-data");
  Path normalizationDir = dataRoot.resolve("normalization");
  Path languageModel = dataRoot.resolve("lm/lm.2gram.slm");

  TurkishSentenceNormalizer normalizer = new TurkishSentenceNormalizer(
      TurkishMorphology.createWithDefaults(), normalizationDir, languageModel);

  for (int i = 0; i < examples.length; i++) {
    String noisy = examples[i];
    System.out.println(noisy);
    // Normalize only after echoing the input, so the input is shown even if this call fails.
    String cleaned = normalizer.normalize(noisy);
    System.out.println(cleaned);
    System.out.println();
  }
}
use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
the class AmbiguousExampleFinder method main.
/**
 * Example entry point: creates a default {@link TurkishMorphology}, opens a corpus searcher on a
 * hard-coded index directory, wraps it in an {@code AmbiguousExampleFinder} and passes both to
 * {@code extractSentences}.
 *
 * @param args unused
 * @throws Exception anything raised while building the components or extracting sentences
 */
public static void main(String[] args) throws Exception {
  // Morphology is created before the searcher, as in the original statement order.
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  AmbiguousExampleFinder finder = new AmbiguousExampleFinder(
      new CorpusSearcher(Paths.get("/home/aaa/data/zemberek/corpus-index")));
  extractSentences(morphology, finder);
}
Aggregations