Usage example of zemberek.core.text.TextUtil in the project zemberek-nlp by ahmetaa:
the run method of the class PreprocessTurkishCorpus.
/**
 * Collects corpus files under {@code input}, extracts sentences from them and writes the
 * processed sentences to {@code output} (UTF-8, one sentence per line).
 *
 * <p>If {@code input} is a single file, only that file is processed. Otherwise directories under
 * {@code input} are walked (recursively when {@code recurse} is set), optionally restricted to
 * the directory names listed in {@code dirList}, and files matching {@code extension} (if given)
 * are collected. Lines starting with {@code <} are skipped as meta tags, spaces/soft hyphens are
 * normalized, sentences containing combining diacritics are dropped, and each sentence is either
 * lemmatized (Operation.LEMMA) or tokenized, then optionally lower-cased with the Turkish locale.
 * Work is distributed over {@code threadCount} threads via a BlockingExecutor.
 *
 * @throws IOException if reading the corpus files or writing the output fails.
 * @throws InterruptedException if waiting for executor termination is interrupted.
 */
@Override
public void run() throws IOException, InterruptedException {
  List<Path> paths = new ArrayList<>();
  if (input.toFile().isFile()) {
    paths.add(input);
  } else {
    Set<String> dirNamesToProcess = new HashSet<>();
    if (dirList != null) {
      List<String> dirNames = TextIO.loadLines(dirList);
      Log.info("Directory names to process:");
      for (String dirName : dirNames) {
        Log.info(dirName);
      }
      dirNamesToProcess.addAll(dirNames);
    }
    List<Path> directories;
    // Files.walk returns a lazily-populated Stream backed by open directory handles;
    // it must be closed (try-with-resources) or the handles leak.
    try (java.util.stream.Stream<Path> walk =
        Files.walk(input, recurse ? Integer.MAX_VALUE : 1)) {
      directories = walk
          .filter(s -> s.toFile().isDirectory() && !s.equals(input))
          .collect(Collectors.toList());
    }
    for (Path directory : directories) {
      if (dirList != null && !dirNamesToProcess.contains(directory.toFile().getName())) {
        continue;
      }
      try (java.util.stream.Stream<Path> files = Files.walk(directory, 1)) {
        // Path.endsWith compares whole path segments, so a suffix such as ".txt" never
        // matches; compare against the String form of the path instead.
        paths.addAll(files
            .filter(s -> s.toFile().isFile()
                && (extension == null || s.toString().endsWith(extension)))
            .collect(Collectors.toList()));
      }
    }
  }
  Log.info("There are %d files to process.", paths.size());
  long totalLines = 0;
  for (Path path : paths) {
    totalLines += TextIO.lineCount(path);
  }
  // Effectively-final copy for use inside the worker lambda below.
  final long total = totalLines;
  if (paths.isEmpty()) {
    Log.info("No corpus files found for input : %s", input);
    System.exit(0);
  }
  AtomicLong sentenceCount = new AtomicLong(0);
  if (operation == Operation.LEMMA) {
    // Morphology is expensive to build; only load it when lemmatization is requested.
    morphology = TurkishMorphology.createWithDefaults();
  }
  try (PrintWriter pw = new PrintWriter(output.toFile(), "UTF-8")) {
    BlockTextLoader loader = BlockTextLoader.fromPaths(paths, 10_000);
    BlockingExecutor executor = new BlockingExecutor(threadCount);
    AtomicInteger count = new AtomicInteger(0);
    for (TextChunk chunk : loader) {
      executor.submit(() -> {
        List<String> processed = chunk.getData().stream()
            .filter(s -> !s.startsWith("<")) // ignore meta tag lines.
            .map(TextUtil::normalizeSpacesAndSoftHyphens)
            .collect(Collectors.toList());
        List<String> sentences = TurkishSentenceExtractor.DEFAULT.fromParagraphs(processed);
        sentences = sentences.stream()
            .filter(s -> !TextUtil.containsCombiningDiacritics(s))
            .map(s -> {
              if (operation == Operation.LEMMA) {
                return replaceWordsWithLemma(s);
              } else {
                return String.join(" ", TurkishTokenizer.DEFAULT.tokenizeToStrings(s));
              }
            })
            .map(s -> toLowercase ? s.toLowerCase(Turkish.LOCALE) : s)
            .collect(Collectors.toList());
        // Serialize writes and progress reporting; PrintWriter is not thread-safe for
        // interleaved use across worker threads.
        synchronized (this) {
          sentences.forEach(pw::println);
          sentenceCount.addAndGet(sentences.size());
          int c = count.addAndGet(chunk.size());
          System.out.println(String.format("(%d of %d lines) processed.", c, total));
        }
      });
    }
    executor.shutdown();
    executor.awaitTermination(1, TimeUnit.DAYS);
  }
  Log.info("%d sentences are written in %s", sentenceCount.get(), output);
}
Aggregations