Usage of zemberek.morphology.analysis.SentenceAnalysis in project zemberek-nlp by ahmetaa.
Class DisambiguateSentences, method main.
/**
 * Demo entry point: analyzes a fixed Turkish sentence, logs every candidate analysis of each
 * word, then disambiguates the sentence and logs the analysis selected for each word.
 *
 * @param args command line arguments; not used.
 */
public static void main(String[] args) {
  TurkishMorphology turkishMorphology = TurkishMorphology.createWithDefaults();

  String input = "Bol baharatlı bir yemek yaptıralım.";
  Log.info("Sentence = " + input);

  // Step 1: collect all candidate analyses for every word of the sentence.
  List<WordAnalysis> wordAnalyses = turkishMorphology.analyzeSentence(input);
  Log.info("Sentence word analysis result:");
  for (WordAnalysis wordAnalysis : wordAnalyses) {
    Log.info("Word = " + wordAnalysis.getInput());
    for (SingleAnalysis candidate : wordAnalysis) {
      Log.info(candidate.formatLong());
    }
  }

  // Step 2: let the disambiguator choose among the candidates computed above.
  SentenceAnalysis disambiguated = turkishMorphology.disambiguate(input, wordAnalyses);
  Log.info("\nAfter ambiguity resolution : ");
  disambiguated.bestAnalysis().forEach(Log::info);
}
Usage of zemberek.morphology.analysis.SentenceAnalysis in project zemberek-nlp by ahmetaa.
Class Scripts, method saveUnambiguous.
/**
 * Writes every sentence whose best analysis contains no unknown word to {@code out}, encoded as
 * UTF-8.
 *
 * <p>Output layout per sentence: an {@code S:<sentence>} line, then for each word its input
 * followed by the long form of each of its analyses, then an empty line. When a word has more
 * than one analysis, the analysis equal to the selected best one is suffixed with {@code *}.
 *
 * @param sentences disambiguated sentences to write.
 * @param out file to write to.
 * @throws IOException if the file cannot be opened, or if any write to it failed.
 */
public static void saveUnambiguous(List<SentenceAnalysis> sentences, Path out) throws IOException {
  try (PrintWriter pwMorph = new PrintWriter(out.toFile(), "utf-8")) {
    for (SentenceAnalysis analysis : sentences) {
      // Skip sentences that contain at least one word the analyzer could not resolve.
      if (analysis.bestAnalysis().stream().anyMatch(SingleAnalysis::isUnknown)) {
        continue;
      }
      pwMorph.format("S:%s%n", analysis.getSentence());
      for (SentenceWordAnalysis sw : analysis) {
        WordAnalysis wa = sw.getWordAnalysis();
        pwMorph.println(wa.getInput());
        SingleAnalysis best = sw.getBestAnalysis();
        // The marker is only meaningful when there is something to choose between.
        boolean singleCandidate = wa.analysisCount() == 1;
        for (SingleAnalysis singleAnalysis : wa) {
          if (singleCandidate) {
            pwMorph.println(singleAnalysis.formatLong());
          } else {
            String marker = singleAnalysis.equals(best) ? "*" : "";
            pwMorph.format("%s%s%n", singleAnalysis.formatLong(), marker);
          }
        }
      }
      pwMorph.println();
    }
    // PrintWriter records write failures instead of throwing them; surface them to the caller
    // so a truncated output file is not mistaken for a successful run.
    if (pwMorph.checkError()) {
      throw new IOException("Failed to write analysis output to " + out);
    }
  }
}
Usage of zemberek.morphology.analysis.SentenceAnalysis in project zemberek-nlp by ahmetaa.
Class ClassificationExampleBase, method splitWords.
/**
 * Rewrites a labeled sentence so that each analyzed word is replaced by its first lemma and, when
 * that lemma is shorter than the word, by an extra {@code _}-prefixed token holding the characters
 * of the word after the lemma's length.
 *
 * <p>The text before the first space is treated as the label and is copied to the front of the
 * result without being analyzed. Words whose best analysis is unknown are copied unchanged.
 *
 * @param sentence label and sentence text, separated by a single space.
 * @return the label and the processed tokens joined with single spaces, or an empty string when
 *     nothing follows the label.
 */
protected String splitWords(String sentence) {
  // Everything up to the first space is the label; only the remainder is analyzed.
  int firstSpace = sentence.indexOf(' ');
  String label = firstSpace < 0 ? sentence : sentence.substring(0, firstSpace);
  String text = firstSpace < 0 ? "" : sentence.substring(firstSpace + 1);
  if (text.isEmpty()) {
    return text;
  }

  SentenceAnalysis disambiguated = morphology.analyzeAndDisambiguate(text);

  List<String> parts = new ArrayList<>();
  parts.add(label);
  for (SentenceWordAnalysis wordResult : disambiguated) {
    SingleAnalysis chosen = wordResult.getBestAnalysis();
    String surface = wordResult.getWordAnalysis().getInput();
    if (chosen.isUnknown()) {
      // No lemma available: keep the word as written.
      parts.add(surface);
      continue;
    }
    String lemma = chosen.getLemmas().get(0);
    parts.add(lemma);
    // The split point is taken from the lemma's length only.
    if (lemma.length() < surface.length()) {
      parts.add("_" + surface.substring(lemma.length()));
    }
  }
  return String.join(" ", parts);
}
Usage of zemberek.morphology.analysis.SentenceAnalysis in project zemberek-nlp by ahmetaa.
Class FindPOS, method main.
/**
 * Demo entry point: disambiguates a fixed Turkish sentence and logs, for each word, the primary
 * part of speech of its best analysis.
 *
 * @param args command line arguments; not used.
 */
public static void main(String[] args) {
  TurkishMorphology turkishMorphology = TurkishMorphology.createWithDefaults();

  String input = "Keşke yarın hava güzel olsa.";
  Log.info("Sentence = " + input);

  SentenceAnalysis disambiguated = turkishMorphology.analyzeAndDisambiguate(input);
  for (SentenceWordAnalysis wordResult : disambiguated) {
    PrimaryPos pos = wordResult.getBestAnalysis().getPos();
    String word = wordResult.getWordAnalysis().getInput();
    Log.info("%s : %s ", word, pos);
  }
}
Usage of zemberek.morphology.analysis.SentenceAnalysis in project zemberek-nlp by ahmetaa.
Class ZemberekNlpScripts, method performance.
/**
 * Manual benchmark (ignored as a test) that times three stages over the lines of a corpus file:
 * tokenization, sentence word analysis, and analysis followed by disambiguation.
 *
 * <p>The corpus is read from a hard-coded absolute path. Token counts are gathered once, during
 * the tokenization stage, and the same counts are reused to compute the tokens/sec figures of the
 * two later stages.
 *
 * @throws IOException if the corpus file cannot be read.
 */
@Test
@Ignore("Not a Test.")
public void performance() throws IOException {
// The commented-out argument below is an alternative corpus location.
List<String> lines = Files.readAllLines(// Paths.get("/media/depo/data/aaa/corpora/dunya.100k")
Paths.get("/home/ahmetaa/data/nlp/corpora/dunya.100k"));
// Analyzer is built on the default lexicon with the unidentified-token analyzer and the cache
// switched off through the builder.
TurkishMorphology analyzer = TurkishMorphology.builder().setLexicon(RootLexicon.getDefault()).disableUnidentifiedTokenAnalyzer().disableCache().build();
Log.info(lines.size() + " lines will be processed.");
Log.info("Dictionary has " + analyzer.getLexicon().size() + " items.");
// ---- Stage 1: tokenization only. ----
long tokenCount = 0;
long tokenCountNoPunct = 0;
Stopwatch clock = Stopwatch.createStarted();
TurkishTokenizer lexer = TurkishTokenizer.DEFAULT;
for (String line : lines) {
List<Token> tokens = lexer.tokenize(line);
// tokenCount skips SpaceTab tokens; tokenCountNoPunct additionally skips Punctuation tokens.
tokenCount += tokens.stream().filter(s -> (s.getType() != Token.Type.SpaceTab)).count();
tokenCountNoPunct += tokens.stream().filter(s -> (s.getType() != Token.Type.Punctuation && s.getType() != Token.Type.SpaceTab)).count();
}
// Note: the elapsed time of this stage includes the two counting passes above.
long elapsed = clock.elapsed(TimeUnit.MILLISECONDS);
Log.info("Elapsed Time = " + elapsed);
Log.info("Token Count = " + tokenCount);
Log.info("Token Count (No Punctuation) = " + tokenCountNoPunct);
Log.info("Tokenization Speed = %.1f tokens/sec", tokenCount * 1000d / elapsed);
Log.info("Tokenization Speed (No Punctuation) = %.1f tokens/sec ", tokenCountNoPunct * 1000d / elapsed);
Log.info("");
// ---- Stage 2: sentence word analysis, one call per corpus line. ----
Log.info("Sentence word analysis test:");
int counter = 0;
clock.reset().start();
for (String line : lines) {
try {
List<WordAnalysis> res = analyzer.analyzeSentence(line);
// Accumulate a value derived from the result so the call is not optimized away by the VM.
counter += res.size();
} catch (Exception e) {
// A failing line is reported and skipped so the benchmark keeps running.
Log.info(line);
e.printStackTrace();
}
}
elapsed = clock.elapsed(TimeUnit.MILLISECONDS);
Log.info("Elapsed Time = " + elapsed);
Log.info("Tokenization + Analysis speed = %.1f tokens/sec", tokenCount * 1000d / elapsed);
Log.info("Tokenization + Analysis speed (no punctuation) = %.1f tokens/sec", tokenCountNoPunct * 1000d / elapsed);
Log.info(analyzer.toString());
Log.info("");
// ---- Stage 3: analysis plus disambiguation, one call per corpus line. ----
Log.info("Disambiguation Test:");
// NOTE(review): the analyzer was built with disableCache(); confirm whether this call has any effect.
analyzer.invalidateCache();
clock.reset().start();
for (String line : lines) {
try {
SentenceAnalysis results = analyzer.analyzeAndDisambiguate(line);
// Accumulate a value derived from the result so the call is not optimized away by the VM.
counter += results.size();
} catch (Exception e) {
// A failing line is reported and skipped so the benchmark keeps running.
Log.info(line);
e.printStackTrace();
}
}
elapsed = clock.elapsed(TimeUnit.MILLISECONDS);
Log.info("Elapsed Time = " + elapsed);
Log.info("Tokenization + Analysis + Disambiguation speed = %.1f tokens/sec", tokenCount * 1000d / elapsed);
Log.info("Tokenization + Analysis + Disambiguation speed (no punctuation) = %.1f tokens/sec", tokenCountNoPunct * 1000d / elapsed);
// Logging the accumulated counter makes the results of stages 2 and 3 observable.
Log.info(counter);
}
Aggregations