Use of zemberek.morphology.analysis.tr.TurkishSentenceAnalyzer in project zemberek-nlp by ahmetaa.
From the class WordHistogram, method generateHistograms.
static void generateHistograms(List<String> paragraphs, Path outRoot) throws IOException {
  TurkishMorphology morphology = TurkishMorphology.builder()
      .addDefaultDictionaries()
      .cacheParameters(75_000, 150_000)
      .build();
  TurkishSentenceAnalyzer analyzer =
      new TurkishSentenceAnalyzer(morphology, new Z3MarkovModelDisambiguator());
  Histogram<String> roots = new Histogram<>(1000_000);
  Histogram<String> words = new Histogram<>(1000_000);
  int paragraphCounter = 0;
  int sentenceCounter = 0;
  int tokenCounter = 0;
  for (String paragraph : paragraphs) {
    List<String> sentences = TurkishSentenceExtractor.DEFAULT.fromParagraph(paragraph);
    sentenceCounter += sentences.size();
    for (String sentence : sentences) {
      List<Token> tokens = TurkishTokenizer.DEFAULT.tokenize(sentence);
      tokenCounter += tokens.size();
      SentenceAnalysis analysis = analyzer.analyze(sentence);
      analyzer.disambiguate(analysis);
      for (SentenceAnalysis.Entry e : analysis) {
        WordAnalysis best = e.parses.get(0);
        if (best.getPos() == PrimaryPos.Numeral || best.getPos() == PrimaryPos.Punctuation) {
          continue;
        }
        if (best.isUnknown()) {
          continue;
        }
        if (best.isRuntime() && !Strings.containsNone(e.input, "01234567890")) {
          continue;
        }
        List<String> lemmas = best.getLemmas();
        if (lemmas.size() == 0) {
          continue;
        }
        roots.add(best.getDictionaryItem().lemma);
        String w = e.input;
        if (best.getDictionaryItem().secondaryPos != SecondaryPos.ProperNoun) {
          w = w.toLowerCase(Turkish.LOCALE);
        } else {
          w = Turkish.capitalize(w);
        }
        words.add(w);
      }
    }
    paragraphCounter++;
    if (paragraphCounter % 1000 == 0) {
      System.out.println(paragraphCounter + " of " + paragraphs.size());
    }
  }
  System.out.println("tokenCounter = " + tokenCounter);
  System.out.println("sentenceCounter = " + sentenceCounter);
  Files.createDirectories(outRoot);
  roots.saveSortedByCounts(outRoot.resolve("roots.freq.txt"), " ");
  roots.saveSortedByKeys(outRoot.resolve("roots.keys.txt"), " ", Turkish.STRING_COMPARATOR_ASC);
  words.saveSortedByCounts(outRoot.resolve("words.freq.txt"), " ");
  words.saveSortedByKeys(outRoot.resolve("words.keys.txt"), " ", Turkish.STRING_COMPARATOR_ASC);
  words.removeSmaller(10);
  words.saveSortedByCounts(outRoot.resolve("words10.freq.txt"), " ");
  words.saveSortedByKeys(outRoot.resolve("words10.keys.txt"), " ", Turkish.STRING_COMPARATOR_ASC);
}
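For context, a minimal way to drive this method could look like the sketch below. It assumes a UTF-8 corpus file with one paragraph per line and a caller in the same package as WordHistogram (the method is package-private); the file name, output directory, and runner class name are hypothetical.

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;

public class WordHistogramRunner {
  public static void main(String[] args) throws IOException {
    // Placeholder input: one paragraph per line, UTF-8 encoded.
    List<String> paragraphs = Files.readAllLines(Paths.get("corpus.txt"), StandardCharsets.UTF_8);
    // Writes roots.freq.txt, roots.keys.txt and the words*.txt files under the given directory.
    WordHistogram.generateHistograms(paragraphs, Paths.get("histogram-out"));
  }
}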
Use of zemberek.morphology.analysis.tr.TurkishSentenceAnalyzer in project zemberek-nlp by ahmetaa.
From the class CategoryPredictionExperiment, method generateSets.
private void generateSets(Path input, Path train, Path test, boolean useOnlyTitle, boolean useRoots) throws IOException {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  TurkishSentenceAnalyzer analyzer =
      new TurkishSentenceAnalyzer(morphology, new Z3MarkovModelDisambiguator());
  WebCorpus corpus = new WebCorpus("category", "category");
  Log.info("Loading corpus from %s", input);
  corpus.addDocuments(WebCorpus.loadDocuments(input));
  List<String> set = new ArrayList<>(corpus.documentCount());
  TurkishTokenizer lexer = TurkishTokenizer.DEFAULT;
  Histogram<String> categoryCounts = new Histogram<>();
  for (WebDocument document : corpus.getDocuments()) {
    String category = document.getCategory();
    if (category.length() > 0) {
      categoryCounts.add(category);
    }
  }
  Log.info("All category count = %d", categoryCounts.size());
  categoryCounts.removeSmaller(20);
  Log.info("Reduced label count = %d", categoryCounts.size());
  Log.info("Extracting data from %d documents ", corpus.documentCount());
  int c = 0;
  for (WebDocument document : corpus.getDocuments()) {
    if (document.getCategory().length() == 0) {
      continue;
    }
    if (useOnlyTitle && document.getTitle().length() == 0) {
      continue;
    }
    String content = document.getContentAsString();
    String title = document.getTitle();
    List<Token> docTokens = useOnlyTitle ? lexer.tokenize(title) : lexer.tokenize(content);
    List<String> reduced = new ArrayList<>(docTokens.size());
    String category = document.getCategory();
    if (categoryCounts.contains(category)) {
      category = "__label__" + document.getCategory().replaceAll("[ ]+", "_").toLowerCase(Turkish.LOCALE);
    } else {
      continue;
    }
    for (Token token : docTokens) {
      if (token.getType() == TurkishLexer.PercentNumeral
          || token.getType() == TurkishLexer.Number
          || token.getType() == TurkishLexer.Punctuation
          || token.getType() == TurkishLexer.RomanNumeral
          || token.getType() == TurkishLexer.Time
          || token.getType() == TurkishLexer.UnknownWord
          || token.getType() == TurkishLexer.Unknown) {
        continue;
      }
      String tokenStr = token.getText();
      reduced.add(tokenStr);
    }
    String join = String.join(" ", reduced);
    if (useRoots) {
      SentenceAnalysis analysis = analyzer.analyze(join);
      analyzer.disambiguate(analysis);
      List<String> res = new ArrayList<>();
      for (SentenceAnalysis.Entry e : analysis) {
        WordAnalysis best = e.parses.get(0);
        if (best.isUnknown()) {
          res.add(e.input);
          continue;
        }
        List<String> lemmas = best.getLemmas();
        if (lemmas.size() == 0) {
          continue;
        }
        res.add(lemmas.get(lemmas.size() - 1));
      }
      join = String.join(" ", res);
    }
    set.add("#" + document.getId() + " " + category + " "
        + join.replaceAll("[']", "").toLowerCase(Turkish.LOCALE));
    if (c++ % 1000 == 0) {
      Log.info("%d of %d processed.", c, corpus.documentCount());
    }
  }
  Log.info("Generate train and test set.");
  saveSets(train, test, new LinkedHashSet<>(set));
}
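The category string is normalized into a fastText-style label: a "__label__" prefix, spaces replaced with underscores, and Turkish lowercasing. A small self-contained illustration, assuming Turkish.LOCALE is the Turkish java.util.Locale and using a hypothetical category value:

import java.util.Locale;

public class LabelExample {
  public static void main(String[] args) {
    Locale tr = new Locale("tr");       // stands in for Turkish.LOCALE here
    String raw = "Dış Politika";        // hypothetical raw category value
    String label = "__label__" + raw.replaceAll("[ ]+", "_").toLowerCase(tr);
    System.out.println(label);          // prints __label__dış_politika
  }
}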
Use of zemberek.morphology.analysis.tr.TurkishSentenceAnalyzer in project zemberek-nlp by ahmetaa.
From the class DisambiguateSentences, method main.
public static void main(String[] args) throws IOException {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  Z3MarkovModelDisambiguator disambiguator = new Z3MarkovModelDisambiguator();
  TurkishSentenceAnalyzer sentenceAnalyzer = new TurkishSentenceAnalyzer(morphology, disambiguator);
  new DisambiguateSentences(sentenceAnalyzer).analyzeAndDisambiguate("86 lira harcardım.");
}
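analyzeAndDisambiguate itself is not part of this excerpt, so its exact output is unknown; the same flow can be expressed directly with the analyzer calls used elsewhere on this page. A minimal sketch (zemberek imports as in the surrounding snippets):

// Prints the best-ranked parse of each token after disambiguation.
static void printBestParses(TurkishSentenceAnalyzer analyzer, String sentence) {
  SentenceAnalysis analysis = analyzer.analyze(sentence);
  analyzer.disambiguate(analysis);
  for (SentenceAnalysis.Entry entry : analysis) {
    // After disambiguation, the first parse is treated as the best one (see the snippets above).
    System.out.println(entry.input + " -> " + entry.parses.get(0));
  }
}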
Use of zemberek.morphology.analysis.tr.TurkishSentenceAnalyzer in project zemberek-nlp by ahmetaa.
From the class ZemberekNlpScripts, method testSentenceAnalysis.
@Test
@Ignore("Not a Test.")
public void testSentenceAnalysis() throws IOException {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  Z3MarkovModelDisambiguator disambiguator = new Z3MarkovModelDisambiguator();
  TurkishSentenceAnalyzer analyzer = new TurkishSentenceAnalyzer(morphology, disambiguator);
  String sentence = "Kırmızı kalemi al.";
  Log.info("Sentence = " + sentence);
  SentenceAnalysis analysis = analyzer.analyze(sentence);
  Log.info("Before disambiguation.");
  writeParseResult(analysis);
  Log.info("\nAfter disambiguation.");
  analyzer.disambiguate(analysis);
  writeParseResult(analysis);
}
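writeParseResult is not included in this excerpt. A plausible minimal version, based only on the SentenceAnalysis.Entry fields used elsewhere on this page, could look like this:

// Hypothetical helper: lists every candidate parse of every token in the sentence.
static void writeParseResult(SentenceAnalysis analysis) {
  for (SentenceAnalysis.Entry entry : analysis) {
    Log.info("Word = " + entry.input);
    // Before disambiguation the list holds all candidate parses; after it, the best one comes first.
    for (WordAnalysis parse : entry.parses) {
      Log.info("  " + parse);
    }
  }
}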
Use of zemberek.morphology.analysis.tr.TurkishSentenceAnalyzer in project zemberek-nlp by ahmetaa.
From the class ZemberekNlpScripts, method disambiguationMemoryTest.
@Test
@Ignore("Not a Test.")
public void disambiguationMemoryTest() throws IOException {
  List<String> lines = Files.readAllLines(Paths.get("/media/depo/data/aaa/corpora/dunya.100k"));
  TurkishMorphology parser = TurkishMorphology.createWithDefaults();
  TurkishSentenceAnalyzer sentenceAnalyzer =
      new TurkishSentenceAnalyzer(parser, new Z3MarkovModelDisambiguator());
  int k = 0;
  for (int i = 0; i < 100; i++) {
    Stopwatch sw = Stopwatch.createStarted();
    for (String line : lines) {
      k += sentenceAnalyzer.bestParse(line).size();
    }
    Log.info(sw.elapsed(TimeUnit.MILLISECONDS));
  }
  Log.info(k);
}
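Here bestParse is used only for its size; judging from the loop above, it returns one analysis per input token. A short fragment for inspecting its output, continuing from the test above (the List<WordAnalysis> return type is an assumption, since it is not shown in this excerpt):

// Assumption: bestParse returns one WordAnalysis per token; only size() is used in the test above.
List<WordAnalysis> best = sentenceAnalyzer.bestParse("Kırmızı kalemi al.");
for (WordAnalysis analysis : best) {
  Log.info(analysis.toString());
}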