use of zemberek.morphology._MorphologicalAmbiguityResolverExperiment.SingleAnalysisSentence in project zemberek-nlp by ahmetaa.
The method extracData of the class _MorphologicalAmbiguityResolverExperiment is shown below.
/**
 * Collects sentences (from all {@code .corpus} files directly under {@code p}) whose tokens
 * have at most {@code maxAnalysisCount} morphological analyses, writes them to
 * {@code outRoot/<dirname>-unambigious.txt}, and saves failed-word statistics alongside.
 *
 * @param p                directory containing {@code .corpus} files (searched one level deep).
 * @param outRoot          directory where result files are written.
 * @param maxAnalysisCount maximum analysis count per token; passed through to collect().
 * @param resultLimit      if positive, collection stops once more than this many sentences
 *                         are gathered.
 * @throws IOException if reading the corpora or writing the results fails.
 */
public void extracData(Path p, Path outRoot, int maxAnalysisCount, int resultLimit)
    throws IOException {
  // Files.walk returns a lazily-populated Stream backed by open directory handles;
  // close it with try-with-resources to avoid leaking file descriptors.
  List<Path> files;
  try (java.util.stream.Stream<Path> paths = Files.walk(p, 1)) {
    files = paths
        .filter(s -> s.toFile().isFile() && s.toFile().getName().endsWith(".corpus"))
        .collect(Collectors.toList());
  }
  LinkedHashSet<SingleAnalysisSentence> result = new LinkedHashSet<>();
  int i = 0;
  for (Path file : files) {
    List<SingleAnalysisSentence> collect = collect(file, maxAnalysisCount);
    result.addAll(collect);
    i++;
    Log.info("%d of %d", i, files.size());
    // Early exit once the requested number of sentences has been gathered.
    if (resultLimit > 0 && result.size() > resultLimit) {
      break;
    }
  }
  String s = p.toFile().getName();
  Path out = outRoot.resolve(s + "-unambigious.txt");
  try (PrintWriter pw = new PrintWriter(out.toFile(), "utf-8")) {
    for (SingleAnalysisSentence sentence : result) {
      pw.println(sentence.sentence);
      for (Single single : sentence.tokens) {
        for (_SingleAnalysis r : single.res) {
          pw.println(r.formatSurfaceSequence());
        }
      }
      // Blank line separates sentence records in the output file.
      pw.println();
    }
  }
  // saving failed words.
  failedWords.saveSortedByKeys(outRoot.resolve(s + "-failed.txt"), " ",
      Turkish.STRING_COMPARATOR_ASC);
  // saving failed words by frequency.
  failedWords.saveSortedByCounts(outRoot.resolve(s + "-failed.freq.txt"), " ");
}
use of zemberek.morphology._MorphologicalAmbiguityResolverExperiment.SingleAnalysisSentence in project zemberek-nlp by ahmetaa.
The method extracData of the class _WordCollector is shown below.
/**
 * Builds a word-frequency histogram from all {@code .corpus} files directly under {@code p}
 * and writes four result files (counts and plain word lists, each sorted by name and by
 * frequency) into {@code outRoot}.
 *
 * @param p           directory containing {@code .corpus} files (searched one level deep).
 * @param outRoot     directory where result files are written.
 * @param resultLimit currently unused; kept for signature compatibility with callers.
 * @return the populated word histogram.
 * @throws IOException if reading the corpora or writing the results fails.
 */
public Histogram<String> extracData(Path p, Path outRoot, int resultLimit) throws IOException {
  Histogram<String> words = new Histogram<>(5_000_000);
  // Files.walk returns a lazily-populated Stream backed by open directory handles;
  // close it with try-with-resources to avoid leaking file descriptors.
  List<Path> files;
  try (java.util.stream.Stream<Path> paths = Files.walk(p, 1)) {
    files = paths
        .filter(s -> s.toFile().isFile() && s.toFile().getName().endsWith(".corpus"))
        .collect(Collectors.toList());
  }
  for (Path file : files) {
    Log.info("Processing %s", file);
    // Lines starting with '<' are corpus markup tags, not text.
    List<String> lines = Files.readAllLines(file, StandardCharsets.UTF_8).stream()
        .filter(s -> !s.startsWith("<"))
        .collect(Collectors.toList());
    List<String> sentences = TurkishSentenceExtractor.DEFAULT.fromParagraphs(lines);
    for (String sentence : sentences) {
      // Collapse whitespace runs (including no-break space, slash, hyphen) to a single space
      // and strip soft hyphens before tokenization.
      sentence = sentence.replaceAll("[\\s/\\-\\u00a0]+", " ");
      sentence = sentence.replaceAll("[\\u00ad]", "");
      List<Token> tokens = TurkishTokenizer.DEFAULT.tokenize(sentence);
      for (Token token : tokens) {
        String rawWord = token.getText();
        // Skip tokens containing digits or underscores.
        if (!Strings.containsNone(rawWord, "0123456789_")) {
          continue;
        }
        // Normalize case with Turkish locale rules (dotted/dotless i).
        String word = Character.isUpperCase(rawWord.charAt(0))
            ? Turkish.capitalize(rawWord)
            : rawWord.toLowerCase(Turkish.LOCALE);
        words.add(word);
      }
    }
    Log.info("Count = %d", words.size());
  }
  String s = p.toFile().getName();
  Log.info("Saving words.");
  // saving word counts sorted by word.
  words.saveSortedByKeys(outRoot.resolve(s + "-counts-sorted-name.txt"), " ",
      Turkish.STRING_COMPARATOR_ASC);
  // saving word counts sorted by frequency.
  words.saveSortedByCounts(outRoot.resolve(s + "-counts-sorted-freq.txt"), " ");
  Files.write(outRoot.resolve(s + "-words-sorted-freq.txt"), words.getSortedList());
  Files.write(outRoot.resolve(s + "-words-sorted-name.txt"),
      words.getSortedList(Turkish.STRING_COMPARATOR_ASC));
  return words;
}
Aggregations