Use of zemberek.morphology.ambiguity.Z3MarkovModelDisambiguator in project zemberek-nlp by ahmetaa.
The class FindPOS, method main:
public static void main(String[] args) throws IOException {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  Z3MarkovModelDisambiguator disambiguator = new Z3MarkovModelDisambiguator();
  TurkishSentenceAnalyzer sentenceAnalyzer =
      new TurkishSentenceAnalyzer(morphology, disambiguator);
  new FindPOS(sentenceAnalyzer).test("Keşke yarın hava güzel olsa.");
}
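The FindPOS class itself is not shown on this page. As a rough sketch, a test method like the one called above could look as follows; the class layout and the printing are assumptions, only bestParse(String) and its List<WordAnalysis> result are taken from the snippets on this page, and imports are assumed to match the surrounding examples.

// Hypothetical sketch, not the project's actual FindPOS implementation.
static class FindPOS {

  private final TurkishSentenceAnalyzer analyzer;

  FindPOS(TurkishSentenceAnalyzer analyzer) {
    this.analyzer = analyzer;
  }

  void test(String sentence) {
    // bestParse tokenizes, analyzes and disambiguates the sentence,
    // returning one WordAnalysis per token; printing it shows the
    // selected morphological parse, which carries the POS information.
    for (WordAnalysis analysis : analyzer.bestParse(sentence)) {
      System.out.println(analysis);
    }
  }
}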
Use of zemberek.morphology.ambiguity.Z3MarkovModelDisambiguator in project zemberek-nlp by ahmetaa.
The class TurkishSentenceAnalyzerTest, method setUp:
@Before
public void setUp() throws Exception {
  TurkishMorphology morphParser = TurkishMorphology.createWithDefaults();
  parser = new TurkishSentenceAnalyzer(morphParser, new Z3MarkovModelDisambiguator());
}
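This setUp leaves a ready-to-use parser field for the test methods. A hedged illustration of the kind of test it could support is below; the test name, sentence and assertions are assumptions, while analyze(String), size() and bestParse(String) are the calls used elsewhere on this page.

// Illustrative test body; names and assertions are assumptions.
@Test
public void analyzesAndDisambiguatesASimpleSentence() {
  String sentence = "Keşke yarın hava güzel olsa.";
  // Each entry of the SentenceAnalysis holds the candidate parses of one token.
  SentenceAnalysis analysis = parser.analyze(sentence);
  Assert.assertTrue(analysis.size() > 0);
  // bestParse returns a single disambiguated parse per token.
  List<WordAnalysis> best = parser.bestParse(sentence);
  Assert.assertEquals(analysis.size(), best.size());
}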
Use of zemberek.morphology.ambiguity.Z3MarkovModelDisambiguator in project zemberek-nlp by ahmetaa.
The class ZemberekNlpScripts, method performance:
@Test
@Ignore("Not a Test.")
public void performance() throws IOException {
  List<String> lines = Files.readAllLines(
      // Paths.get("/media/depo/data/aaa/corpora/dunya.100k")
      Paths.get("/home/ahmetaa/data/nlp/corpora/dunya.100k"));
  TurkishMorphology analyzer = TurkishMorphology.builder()
      .addDefaultDictionaries()
      .disableUnidentifiedTokenAnalyzer()
      .disableCache()
      .build();
  TurkishSentenceAnalyzer sentenceAnalyzer =
      new TurkishSentenceAnalyzer(analyzer, new Z3MarkovModelDisambiguator());
  Log.info(lines.size() + " lines will be processed.");
  Log.info("Dictionary has " + analyzer.getLexicon().size() + " items.");

  long tokenCount = 0;
  long tokenCountNoPunct = 0;
  Stopwatch clock = Stopwatch.createStarted();
  TurkishTokenizer lexer = TurkishTokenizer.DEFAULT;
  // Pass 1: tokenization only. Count tokens with and without punctuation.
  for (String line : lines) {
    List<Token> tokens = lexer.tokenize(line);
    tokenCount += tokens.stream()
        .filter(s -> (s.getType() != TurkishLexer.SpaceTab))
        .count();
    tokenCountNoPunct += tokens.stream()
        .filter(s -> (s.getType() != TurkishLexer.Punctuation
            && s.getType() != TurkishLexer.SpaceTab))
        .count();
  }
  long elapsed = clock.elapsed(TimeUnit.MILLISECONDS);
  Log.info("Elapsed Time = " + elapsed);
  Log.info("Token Count = " + tokenCount);
  Log.info("Token Count (No Punctuation) = " + tokenCountNoPunct);
  Log.info("Tokenization Speed = %.1f tokens/sec", tokenCount * 1000d / elapsed);
  Log.info("Tokenization Speed (No Punctuation) = %.1f tokens/sec ",
      tokenCountNoPunct * 1000d / elapsed);
  Log.info("");

  Log.info("Sentence word analysis test:");
  int counter = 0;
  clock.reset().start();
  // Pass 2: tokenization + morphological analysis for every line.
  for (String line : lines) {
    try {
      SentenceAnalysis res = sentenceAnalyzer.analyze(line);
      // For preventing VM optimizations.
      counter += res.size();
    } catch (Exception e) {
      Log.info(line);
      e.printStackTrace();
    }
  }
  elapsed = clock.elapsed(TimeUnit.MILLISECONDS);
  Log.info("Elapsed Time = " + elapsed);
  Log.info("Tokenization + Analysis speed = %.1f tokens/sec",
      tokenCount * 1000d / elapsed);
  Log.info("Tokenization + Analysis speed (no punctuation) = %.1f tokens/sec",
      tokenCountNoPunct * 1000d / elapsed);
  Log.info(analyzer.toString());
  Log.info("");

  Log.info("Disambiguation Test:");
  analyzer.invalidateAllCache();
  clock.reset().start();
  // Pass 3: tokenization + analysis + disambiguation (bestParse) for every line.
  for (String line : lines) {
    try {
      List<WordAnalysis> results = sentenceAnalyzer.bestParse(line);
      // For preventing VM optimizations.
      counter += results.size();
    } catch (Exception e) {
      Log.info(line);
      e.printStackTrace();
    }
  }
  elapsed = clock.elapsed(TimeUnit.MILLISECONDS);
  Log.info("Elapsed Time = " + elapsed);
  Log.info("Tokenization + Analysis + Disambiguation speed = %.1f tokens/sec",
      tokenCount * 1000d / elapsed);
  Log.info("Tokenization + Analysis + Disambiguation speed (no punctuation) = %.1f tokens/sec",
      tokenCountNoPunct * 1000d / elapsed);
  Log.info(counter);
}
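To reproduce the same measurement pattern on another corpus without the test scaffolding above, a compact sketch follows; the file path is a placeholder, and the imports are assumed to match the performance snippet (it uses only calls that appear on this page).

// Minimal timing sketch; "my-corpus.txt" is a placeholder, one sentence per line.
public static void main(String[] args) throws IOException {
  List<String> sentences = Files.readAllLines(Paths.get("my-corpus.txt"));
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  TurkishSentenceAnalyzer analyzer =
      new TurkishSentenceAnalyzer(morphology, new Z3MarkovModelDisambiguator());

  Stopwatch watch = Stopwatch.createStarted();
  long parsedWords = 0;
  for (String sentence : sentences) {
    // bestParse = tokenization + analysis + disambiguation in one call.
    parsedWords += analyzer.bestParse(sentence).size();
  }
  long millis = Math.max(1, watch.elapsed(TimeUnit.MILLISECONDS));
  Log.info("Parsed %d words in %d ms (%.1f words/sec).",
      parsedWords, millis, parsedWords * 1000d / millis);
}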