Example use of zemberek.core.concurrency.BlockingExecutor in the zemberek-nlp project by ahmetaa: the process method of the ProcessNormalizationCorpus class.
/**
 * Processes a corpus concurrently: each text chunk is cleaned, split into sentences,
 * normalized with {@code normalizer.preProcess}, and written to a sequentially
 * numbered file under {@code outRoot}.
 *
 * @param corpusProvider supplies the corpus in chunks.
 * @param threadCount number of worker threads for the blocking executor.
 * @param outRoot output directory; each chunk is written to a file named with its
 *     sequence number.
 * @throws Exception if interrupted while waiting for the executor to terminate.
 */
void process(BlockTextLoader corpusProvider, int threadCount, Path outRoot) throws Exception {
ExecutorService service = new BlockingExecutor(threadCount);
AtomicInteger c = new AtomicInteger(0);
for (TextChunk chunk : corpusProvider) {
service.submit(() -> {
List<String> sentences = TextCleaner.cleanAndExtractSentences(chunk.getData());
sentences = sentences.stream().map(s -> normalizer.preProcess(s)).collect(Collectors.toList());
// Reserve the output index once; re-reading the counter later (as the old
// progress message did with c.get()) races with other worker threads.
int index = c.getAndIncrement();
Path p = outRoot.resolve(String.valueOf(index));
try {
Files.write(p, sentences, StandardCharsets.UTF_8);
} catch (IOException e) {
// Log through the project logger instead of printStackTrace so failures
// show up in the normal log stream.
Log.warn("Cannot write output file %s : %s", p, e.getMessage());
}
Log.info((index + 1) * BLOCK_SIZE + " Lines processed.");
});
}
service.shutdown();
// Wait for all in-flight chunks before returning; 1 day is effectively "forever".
service.awaitTermination(1, TimeUnit.DAYS);
}
Example use of zemberek.core.concurrency.BlockingExecutor in the zemberek-nlp project by ahmetaa: the doit method of the StemDisambiguationExperiment class.
/**
 * Collects sentences from the input corpus (a single file, or a directory tree
 * optionally filtered by a directory-name list), keeps only morphologically
 * unambiguous sentences, optionally lowercases them with Turkish rules, and
 * writes the result to {@code output}.
 *
 * @throws IOException if reading the corpus or writing the output fails.
 */
private void doit() throws IOException {
System.setProperty("org.jline.terminal.dumb", "true");
List<Path> paths = new ArrayList<>();
if (input.toFile().isFile()) {
paths.add(input);
} else {
// When a directory-list file is given, only directories whose names appear in
// it are processed ("#" marks comment lines in that file).
Set<String> dirNamesToProcess = new HashSet<>();
if (dirList != null) {
List<String> dirNames = TextIO.loadLines(dirList, "#");
Log.info("Directory names to process:");
for (String dirName : dirNames) {
Log.info(dirName);
}
dirNamesToProcess.addAll(dirNames);
}
// NOTE(review): Files.walk streams should be closed (try-with-resources);
// left as-is here to keep this change focused on the lost-output bug below.
List<Path> directories = Files.walk(input, recurse ? Integer.MAX_VALUE : 1).filter(s -> s.toFile().isDirectory() && !s.equals(input)).collect(Collectors.toList());
for (Path directory : directories) {
if (dirList != null && !dirNamesToProcess.contains(directory.toFile().getName())) {
continue;
}
paths.addAll(Files.walk(directory, 1).filter(s -> s.toFile().isFile()).collect(Collectors.toList()));
}
}
Log.info("There are %d files to process.", paths.size());
// (Removed a loop that summed TextIO.lineCount over all files: the total was
// never used, and counting lines reads every file once for nothing.)
if (paths.isEmpty()) {
Log.info("No corpus files found for input : %s", input);
System.exit(0);
}
AtomicLong sentenceCount = new AtomicLong(0);
try (PrintWriter pw = new PrintWriter(output.toFile(), "UTF-8")) {
BlockTextLoader loader = BlockTextLoader.fromPaths(paths, 30_000);
BlockingExecutor executor = new BlockingExecutor(threadCount);
for (TextChunk chunk : loader) {
executor.submit(() -> {
// De-duplicate lines within the chunk while preserving their order.
List<String> data = new ArrayList<>(new LinkedHashSet<>(chunk.getData()));
List<String> sentences = TextCleaner.cleanAndExtractSentences(data);
sentences = sentences.stream().filter(this::unambiguous).map(s -> toLowercase ? s.toLowerCase(Turkish.LOCALE) : s).collect(Collectors.toList());
// The writer and counter are shared by all workers; serialize access.
synchronized (this) {
sentences.forEach(pw::println);
sentenceCount.addAndGet(sentences.size());
System.out.println(chunk.size());
}
});
}
executor.shutdown();
// BUG FIX: previously the method only called shutdown() and fell out of the
// try block, so the PrintWriter could be closed while worker tasks were still
// running, silently losing their sentences. Wait for all tasks to finish
// before the writer is closed.
try {
executor.awaitTermination(1, TimeUnit.DAYS);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
}
Log.info("%d sentences are written in %s", sentenceCount.get(), output);
}
Example use of zemberek.core.concurrency.BlockingExecutor in the zemberek-nlp project by ahmetaa: the run method of the PreprocessTurkishCorpus class.
@Override
public void run() throws IOException, InterruptedException {
List<Path> paths = new ArrayList<>();
if (input.toFile().isFile()) {
paths.add(input);
} else {
Set<String> dirNamesToProcess = new HashSet<>();
if (dirList != null) {
List<String> dirNames = TextIO.loadLines(dirList);
Log.info("Directory names to process:");
for (String dirName : dirNames) {
Log.info(dirName);
}
dirNamesToProcess.addAll(dirNames);
}
List<Path> directories = Files.walk(input, recurse ? Integer.MAX_VALUE : 1).filter(s -> s.toFile().isDirectory() && !s.equals(input)).collect(Collectors.toList());
for (Path directory : directories) {
if (dirList != null && !dirNamesToProcess.contains(directory.toFile().getName())) {
continue;
}
paths.addAll(Files.walk(directory, 1).filter(s -> s.toFile().isFile() && (extension == null || s.endsWith(extension))).collect(Collectors.toList()));
}
}
Log.info("There are %d files to process.", paths.size());
long totalLines = 0;
for (Path path : paths) {
totalLines += TextIO.lineCount(path);
}
final long total = totalLines;
if (paths.size() == 0) {
Log.info("No corpus files found for input : %s", input);
System.exit(0);
}
AtomicLong sentenceCount = new AtomicLong(0);
if (operation == Operation.LEMMA) {
morphology = TurkishMorphology.createWithDefaults();
}
try (PrintWriter pw = new PrintWriter(output.toFile(), "UTF-8")) {
BlockTextLoader loader = BlockTextLoader.fromPaths(paths, 10_000);
BlockingExecutor executor = new BlockingExecutor(threadCount);
AtomicInteger count = new AtomicInteger(0);
for (TextChunk chunk : loader) {
executor.submit(() -> {
List<String> processed = chunk.getData().stream().filter(// ignore meta tag lines.
s -> !s.startsWith("<")).map(TextUtil::normalizeSpacesAndSoftHyphens).collect(Collectors.toList());
List<String> sentences = TurkishSentenceExtractor.DEFAULT.fromParagraphs(processed);
sentences = sentences.stream().filter(s -> !TextUtil.containsCombiningDiacritics(s)).map(s -> {
if (operation == Operation.LEMMA) {
return replaceWordsWithLemma(s);
} else {
return String.join(" ", TurkishTokenizer.DEFAULT.tokenizeToStrings(s));
}
}).map(s -> toLowercase ? s.toLowerCase(Turkish.LOCALE) : s).collect(Collectors.toList());
synchronized (this) {
sentences.forEach(pw::println);
sentenceCount.addAndGet(sentences.size());
int c = count.addAndGet(chunk.size());
System.out.println(String.format("(%d of %d lines) processed.", c, total));
}
});
}
executor.shutdown();
executor.awaitTermination(1, TimeUnit.DAYS);
}
Log.info("%d sentences are written in %s", sentenceCount.get(), output);
}
Example use of zemberek.core.concurrency.BlockingExecutor in the zemberek-nlp project by ahmetaa: the cleanTwitterData method of the NormalizationScripts class.
/**
 * Cleans a Twitter corpus concurrently: each block of {@code blockSize} lines is
 * analyzed with a cache-backed Turkish morphology (unidentified-token analysis
 * disabled) and routed by {@code TwitterTask}/{@code TwitterSaver} to either
 * {@code out} or a sibling ".foreign" file.
 *
 * @param in input corpus file, read in blocks.
 * @param out output path; foreign content goes to {@code out + ".foreign"}.
 * @throws Exception if interrupted while awaiting executor termination.
 */
static void cleanTwitterData(Path in, Path out) throws Exception {
// Dynamic analysis cache: grows between the given initial and maximum sizes.
AnalysisCache cache = AnalysisCache.builder().dynamicCacheSize(300_000, 500_000).build();
TurkishMorphology morphology = TurkishMorphology.builder().setCache(cache).setLexicon(RootLexicon.getDefault()).disableUnidentifiedTokenAnalyzer().build();
// Use half the available processors, capped at 20 threads.
int threadCount = Runtime.getRuntime().availableProcessors() / 2;
if (threadCount > 20) {
threadCount = 20;
}
ExecutorService executorService = new BlockingExecutor(threadCount);
// NOTE(review): the CompletionService queue is never drained (no take()/poll()
// calls), so task results and any task exceptions are silently discarded;
// completion only matters via awaitTermination below. Verify this is intended.
CompletionService<TwitterSaver> service = new ExecutorCompletionService<>(executorService);
int blockSize = 20_000;
BlockTextLoader loader = BlockTextLoader.fromPath(in, blockSize);
Path foreign = Paths.get(out.toString() + ".foreign");
TwitterSaver saver = new TwitterSaver(out, foreign, blockSize);
// bc is the running block index passed to each task.
int bc = 0;
for (TextChunk block : loader) {
service.submit(new TwitterTask(morphology, saver, block, bc));
bc++;
}
executorService.shutdown();
// Wait for all submitted blocks; 1 day is effectively "forever".
executorService.awaitTermination(1, TimeUnit.DAYS);
}
Example use of zemberek.core.concurrency.BlockingExecutor in the zemberek-nlp project by ahmetaa: the collectVocabularyHistogram method of the NormalizationVocabularyGenerator class.
/**
 * Builds a word-frequency vocabulary over the whole corpus by submitting one
 * {@code WordCollectorTask} per text chunk to a bounded (blocking) executor.
 *
 * @param corpora corpus supplied as an iterable of text chunks.
 * @param threadCount number of worker threads.
 * @return the shared {@code Vocabulary} populated by all collector tasks.
 * @throws Exception if interrupted while awaiting executor termination.
 */
Vocabulary collectVocabularyHistogram(BlockTextLoader corpora, int threadCount) throws Exception {
Vocabulary vocabulary = new Vocabulary();
ExecutorService pool = new BlockingExecutor(threadCount);
for (TextChunk chunk : corpora) {
Log.info("Processing %s", chunk);
// Each task folds its chunk's word counts into the shared vocabulary.
pool.submit(new WordCollectorTask(chunk, vocabulary));
}
pool.shutdown();
// Block until every collector task has finished (up to one day).
pool.awaitTermination(1, TimeUnit.DAYS);
return vocabulary;
}
Aggregations