use of zemberek.morphology.ambiguity.RuleBasedDisambiguator.ResultSentence in project zemberek-nlp by ahmetaa.
the class GenerateDataWithRules method extractData.
/**
 * Runs rule based disambiguation over the regular files found directly under {@code p} and
 * writes the collected results to two text files under {@code outRoot}.
 *
 * <p>The output files are named after the last path element of {@code p}:
 * {@code <name>-rule-result.txt} receives the training-format lines of each analysis, and
 * {@code <name>-rule-result-amb.txt} receives each token followed by the long form of every
 * candidate analysis. In both files a sentence starts with an {@code S:} line and ends with an
 * empty line.
 *
 * @param p directory (or single file) to read sentences from; traversed with depth 1
 * @param outRoot directory the two result files are written into
 * @param resultLimit when positive, processing stops once the number of collected results
 *     exceeds this value; a value of zero or less disables the limit
 * @param maxAmbigiousWordCount passed through to {@code collect} as the ambiguity threshold
 * @throws IOException if listing the input files or opening the output files fails
 */
private void extractData(Path p, Path outRoot, int resultLimit, int maxAmbigiousWordCount) throws IOException {
  List<Path> files;
  // Files.walk keeps directory handles open until the stream is closed, so it must be closed
  // explicitly; try-with-resources guarantees that even when the traversal fails.
  try (java.util.stream.Stream<Path> walk = Files.walk(p, 1)) {
    files = walk.filter(s -> s.toFile().isFile()).collect(Collectors.toList());
  }
  BatchResult result = new BatchResult();
  int i = 0;
  for (Path file : files) {
    Log.info("Processing %s", file);
    LinkedHashSet<String> sentences = getSentences(file);
    collect(result, sentences, maxAmbigiousWordCount, resultLimit);
    i++;
    Log.info("%d of %d", i, files.size());
    // Stop reading further files as soon as the result count exceeds the limit.
    if (resultLimit > 0 && result.results.size() > resultLimit) {
      break;
    }
  }
  String s = p.toFile().getName();
  Log.info("Saving.");
  Path out = outRoot.resolve(s + "-rule-result.txt");
  Path amb = outRoot.resolve(s + "-rule-result-amb.txt");
  try (PrintWriter pwu = new PrintWriter(out.toFile(), "utf-8");
      PrintWriter pwa = new PrintWriter(amb.toFile(), "utf-8")) {
    for (ResultSentence sentence : result.results) {
      pwu.println("S:" + sentence.sentence);
      pwa.println("S:" + sentence.sentence);
      for (AmbiguityAnalysis analysis : sentence.results) {
        // Training-format lines go to the main file.
        List<String> forTrain = analysis.getForTrainingOutput();
        forTrain.forEach(pwu::println);
        // The ambiguity file lists the token and then every candidate analysis.
        pwa.println(analysis.token);
        for (AnalysisDecision r : analysis.choices) {
          pwa.println(r.analysis.formatLong());
        }
      }
      // Blank line separates sentences in both files.
      pwu.println();
      pwa.println();
    }
  }
}
use of zemberek.morphology.ambiguity.RuleBasedDisambiguator.ResultSentence in project zemberek-nlp by ahmetaa.
the class RuleBasedDisambiguatorTest method test.
/**
 * Disambiguates a single hard-coded sentence with the bundled rules and prints the
 * all-ignored count followed by the training-format lines of every analysis.
 */
@Test
public void test() throws IOException {
  // Alternative inputs kept from the original; assign one to `sentence` to try it:
  //   "ABD Açık Serena Williams'ın"
  //   "Çünkü birbirine tezat oluşturuyor."
  //   "O anda gördüm."
  //   "Aklımıza ilk gelen emeği öncelemek."
  //   "Petrolün Türkiye üzerinden dünya pazarına satılması."
  //   "Sadece partimi iktidar yaptım."
  String sentence = "4 Neden önemli?";

  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  // A hand-built rule set could be passed instead of the bundled one, e.g.
  //   Rules rules = new Rules();
  //   rules.pairLexRules.add(PairRule.fromLine("Aklı*|aklı* [akıl:Noun] *"));
  RuleBasedDisambiguator ruleDisambiguator =
      new RuleBasedDisambiguator(morphology, Rules.fromResources());

  ResultSentence outcome = ruleDisambiguator.disambiguate(sentence);
  System.out.println(outcome.allIgnoredCount());
  for (AmbiguityAnalysis wordResult : outcome.results) {
    for (String line : wordResult.getForTrainingOutput()) {
      System.out.println(line);
    }
  }
}
use of zemberek.morphology.ambiguity.RuleBasedDisambiguator.ResultSentence in project zemberek-nlp by ahmetaa.
the class GenerateDataWithRules method collect.
/**
 * Disambiguates the given sentences in groups of 5000 and stores the ones that pass all
 * filters in {@code batchResult}.
 *
 * <p>A sentence is silently skipped when it has more ambiguous words than
 * {@code maxAmbigiousWordCount} or contains a word with zero analyses. It is skipped with a
 * warning when it contains a word whose analyses were all ignored. A sentence with a word
 * rejected by any of the {@code acceptWordPredicates} is recorded in
 * {@code batchResult.ignoredSentences}. Everything else is added to
 * {@code batchResult.acceptedSentences} and {@code batchResult.results}.
 *
 * @param batchResult accumulator that receives accepted and ignored sentences
 * @param sentences candidate sentences
 * @param maxAmbigiousWordCount upper bound on ambiguous words allowed in a sentence
 * @param resultLimit when positive, the method returns as soon as the number of results
 *     exceeds this value
 */
private void collect(BatchResult batchResult, Collection<String> sentences, int maxAmbigiousWordCount, int resultLimit) {
  for (List<String> chunk : group(new ArrayList<>(sentences), 5000)) {
    LinkedHashSet<String> candidates = getAccpetableSentences(chunk);
    Log.info("Processing.. %d found.", batchResult.acceptedSentences.size());
    for (String candidate : candidates) {
      ResultSentence disambiguated = ruleBasedDisambiguator.disambiguate(candidate);

      // Too ambiguous, or containing an unanalyzable word: drop without recording.
      if (disambiguated.ambiguousWordCount() > maxAmbigiousWordCount
          || disambiguated.zeroAnalysisCount() > 0) {
        continue;
      }
      if (disambiguated.allIgnoredCount() > 0) {
        Log.warn("Sentence [%s] contains word(s) that all analyses are ignored.", disambiguated.sentence);
        continue;
      }
      if (!everyWordAccepted(disambiguated)) {
        batchResult.ignoredSentences.add(candidate);
        continue;
      }

      batchResult.acceptedSentences.add(candidate);
      batchResult.results.add(disambiguated);
      boolean limitExceeded = resultLimit > 0 && batchResult.results.size() > resultLimit;
      if (limitExceeded) {
        return;
      }
    }
  }
}

/** Returns true when every word analysis of the sentence satisfies all accept predicates. */
private boolean everyWordAccepted(ResultSentence disambiguated) {
  for (WordAnalysis wordAnalysis : disambiguated.sentenceAnalysis) {
    for (Predicate<WordAnalysis> accept : acceptWordPredicates) {
      if (!accept.test(wordAnalysis)) {
        return false;
      }
    }
  }
  return true;
}
Aggregations