Use of zemberek.core.text.TextChunk in project zemberek-nlp by ahmetaa.
Class ProcessNormalizationCorpus, method process:
void process(BlockTextLoader corpusProvider, int threadCount, Path outRoot) throws Exception {
  ExecutorService service = new BlockingExecutor(threadCount);
  AtomicInteger c = new AtomicInteger(0);
  for (TextChunk chunk : corpusProvider) {
    service.submit(() -> {
      // Clean the chunk and normalize every extracted sentence.
      List<String> sentences = TextCleaner.cleanAndExtractSentences(chunk.getData());
      sentences = sentences.stream()
          .map(s -> normalizer.preProcess(s))
          .collect(Collectors.toList());
      // Each chunk is written to its own numbered file under outRoot.
      Path p = outRoot.resolve(String.valueOf(c.getAndIncrement()));
      try {
        Files.write(p, sentences, StandardCharsets.UTF_8);
      } catch (IOException e) {
        e.printStackTrace();
      }
      Log.info(c.get() * BLOCK_SIZE + " Lines processed.");
    });
  }
  service.shutdown();
  service.awaitTermination(1, TimeUnit.DAYS);
}
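A minimal driver for process could look like the sketch below; the ProcessNormalizationCorpus instance (processor) and its normalizer are hypothetical, since the excerpt does not show how the class is constructed:

// Hypothetical usage sketch; `processor` stands for a constructed
// ProcessNormalizationCorpus instance, which this excerpt does not show.
Path corpusFile = Paths.get("corpus.txt");
Path outRoot = Paths.get("normalized");
Files.createDirectories(outRoot);
BlockTextLoader loader = BlockTextLoader.fromPath(corpusFile, 10_000);
processor.process(loader, Runtime.getRuntime().availableProcessors(), outRoot);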
Use of zemberek.core.text.TextChunk in project zemberek-nlp by ahmetaa.
Class RemoveDuplicateLines, method recreateCorpus:
private void recreateCorpus() throws IOException {
  int lineCounter = 0;
  int writtenLines = 0;
  try (PrintWriter writer = new PrintWriter(
      new OutputStreamWriter(IOUtil.geBufferedOutputStream(output), "UTF-8"))) {
    BlockTextLoader loader = BlockTextLoader.fromPath(corpus, 10_000);
    for (TextChunk block : loader) {
      for (String line : block.getData()) {
        String l = line;
        if (normalizeLines) {
          l = process(line);
        }
        lineCounter++;
        if (lineCounter % PROGRESS == 0) {
          Log.info("Total lines read: %d. Lines written: %d", lineCounter, writtenLines);
        }
        // Only the first occurrence of a line (recorded in `index` by
        // findDuplicates) is written back; later duplicates are skipped.
        long hash = longHash(l);
        if (index.get(hash) == lineCounter) {
          if (writeCounts) {
            writer.println(histogram.get(hash) + " " + line);
          } else {
            writer.println(line);
          }
          writtenLines++;
        }
      }
      if (count != -1 && lineCounter > count) {
        break;
      }
    }
    Log.info("Total lines read: %d. Lines written: %d", lineCounter, writtenLines);
  }
}
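Both methods of RemoveDuplicateLines key lines by longHash, whose implementation is not part of this excerpt. A self-contained stand-in, assuming a 64-bit FNV-1a hash of the line content is adequate for duplicate detection:

// Hypothetical stand-in for the longHash used above; the real method in
// RemoveDuplicateLines is not shown in this excerpt.
static long longHash(String s) {
  long hash = 0xcbf29ce484222325L; // FNV-1a 64-bit offset basis
  for (int i = 0; i < s.length(); i++) {
    hash ^= s.charAt(i);
    hash *= 0x100000001b3L; // FNV-1a 64-bit prime
  }
  return hash;
}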
Use of zemberek.core.text.TextChunk in project zemberek-nlp by ahmetaa.
Class RemoveDuplicateLines, method findDuplicates:
private void findDuplicates() {
  BlockTextLoader loader = BlockTextLoader.fromPath(corpus, 10_000);
  int lineCounter = 0;
  for (TextChunk block : loader) {
    for (String line : block.getData()) {
      String l = line;
      if (normalizeLines) {
        l = process(line);
      }
      totalCount++;
      lineCounter++;
      if (totalCount % PROGRESS == 0) {
        Log.info("Total lines read: %d. Duplicates: %d", totalCount, duplicateCount);
      }
      // Remember the line number of each hash's first occurrence;
      // any later occurrence counts as a duplicate.
      long hash = longHash(l);
      if (index.containsKey(hash)) {
        duplicateCount++;
      } else {
        index.put(hash, totalCount);
      }
      histogram.increment(hash);
    }
    if (count != -1 && lineCounter > count) {
      break;
    }
  }
  Log.info("Total lines read: %d. Duplicates: %d", totalCount, duplicateCount);
  Log.info("Duplicate ratio: %.3f%%", duplicateCount * 100.0d / totalCount);
}
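Taken together, findDuplicates and recreateCorpus form a two-pass pipeline: the first pass fills index (hash of line → line number of its first occurrence) and histogram (hash → occurrence count); the second pass re-reads the corpus and writes a line only when its line number matches the recorded first occurrence. A minimal self-contained rendering of the same idea, using plain JDK collections instead of zemberek's primitive-keyed maps (an assumption) and the longHash stand-in sketched above:

// Two-pass duplicate removal over an in-memory line list.
Map<Long, Integer> index = new HashMap<>();
List<String> lines = Files.readAllLines(Paths.get("corpus.txt"), StandardCharsets.UTF_8);
// Pass 1: record the line number of each hash's first occurrence.
int n = 0;
for (String line : lines) {
  n++;
  index.putIfAbsent(longHash(line), n);
}
// Pass 2: keep a line only if this is its first occurrence.
List<String> unique = new ArrayList<>();
n = 0;
for (String line : lines) {
  n++;
  if (index.get(longHash(line)) == n) {
    unique.add(line);
  }
}
Files.write(Paths.get("corpus.dedup.txt"), unique, StandardCharsets.UTF_8);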
Use of zemberek.core.text.TextChunk in project zemberek-nlp by ahmetaa.
Class ItemFindExperiment, method processCorpus:
static void processCorpus(Path in, Path out) throws IOException {
  BlockTextLoader loader = BlockTextLoader.fromPath(in, 10000);
  try (PrintWriter pw = new PrintWriter(out.toFile(), "utf-8")) {
    for (TextChunk chunk : loader) {
      // Drop duplicate lines within the chunk while keeping their order.
      LinkedHashSet<String> unique = new LinkedHashSet<>(chunk.getData());
      for (String l : unique) {
        // Skip lines containing markup-like characters.
        if (!Strings.containsNone(l, "[]#~|")) {
          continue;
        }
        // Lowercase with the Spanish locale, keep only digits and Spanish
        // letters, then collapse whitespace.
        l = l.toLowerCase(es)
            .replaceAll("[^0-9a-zñáéíóúü]", " ")
            .replaceAll("\\s+", " ")
            .trim();
        // Discard short lines (this also covers the empty case).
        if (l.length() < 20) {
          continue;
        }
        pw.println(l);
      }
    }
  }
}
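To make the cleanup chain concrete, here is a small worked example; the es field is assumed to be a Spanish Locale, which the excerpt does not show:

// Hypothetical demonstration of the cleanup chain used above.
Locale es = new Locale("es");
String l = "¡Hola, Señor! ¿Qué tal? 123";
l = l.toLowerCase(es)
    .replaceAll("[^0-9a-zñáéíóúü]", " ")
    .replaceAll("\\s+", " ")
    .trim();
// l is now "hola señor qué tal 123" (22 characters, so it passes the length filter)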
Use of zemberek.core.text.TextChunk in project zemberek-nlp by ahmetaa.
Class StemDisambiguationExperiment, method doit:
private void doit() throws IOException {
  System.setProperty("org.jline.terminal.dumb", "true");
  List<Path> paths = new ArrayList<>();
  if (input.toFile().isFile()) {
    paths.add(input);
  } else {
    // If a directory-name list is given, only those sub-directories are processed.
    Set<String> dirNamesToProcess = new HashSet<>();
    if (dirList != null) {
      List<String> dirNames = TextIO.loadLines(dirList, "#");
      Log.info("Directory names to process:");
      for (String dirName : dirNames) {
        Log.info(dirName);
      }
      dirNamesToProcess.addAll(dirNames);
    }
    List<Path> directories = Files.walk(input, recurse ? Integer.MAX_VALUE : 1)
        .filter(s -> s.toFile().isDirectory() && !s.equals(input))
        .collect(Collectors.toList());
    for (Path directory : directories) {
      if (dirList != null && !dirNamesToProcess.contains(directory.toFile().getName())) {
        continue;
      }
      paths.addAll(Files.walk(directory, 1)
          .filter(s -> s.toFile().isFile())
          .collect(Collectors.toList()));
    }
  }
  Log.info("There are %d files to process.", paths.size());
  // Line counts are gathered up front, before any processing starts.
  long totalLines = 0;
  for (Path path : paths) {
    totalLines += TextIO.lineCount(path);
  }
  if (paths.size() == 0) {
    Log.info("No corpus files found for input: %s", input);
    System.exit(0);
  }
  AtomicLong sentenceCount = new AtomicLong(0);
  try (PrintWriter pw = new PrintWriter(output.toFile(), "UTF-8")) {
    BlockTextLoader loader = BlockTextLoader.fromPaths(paths, 30_000);
    BlockingExecutor executor = new BlockingExecutor(threadCount);
    for (TextChunk chunk : loader) {
      executor.submit(() -> {
        // Deduplicate the chunk, clean it, and keep only morphologically
        // unambiguous sentences, optionally lowercased with Turkish rules.
        List<String> data = new ArrayList<>(new LinkedHashSet<>(chunk.getData()));
        List<String> sentences = TextCleaner.cleanAndExtractSentences(data);
        sentences = sentences.stream()
            .filter(this::unambiguous)
            .map(s -> toLowercase ? s.toLowerCase(Turkish.LOCALE) : s)
            .collect(Collectors.toList());
        // The PrintWriter is shared by all worker threads, so writes are synchronized.
        synchronized (this) {
          sentences.forEach(pw::println);
          sentenceCount.addAndGet(sentences.size());
          System.out.println(chunk.size());
        }
      });
    }
    executor.shutdown();
  }
  Log.info("%d sentences are written to %s", sentenceCount.get(), output);
}
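Both this snippet and the first one rely on BlockingExecutor to keep the loader from racing ahead of the workers. A rough self-contained stand-in for that backpressure behavior, built from the JDK's ThreadPoolExecutor (an assumption about intent, not zemberek's implementation):

// Hypothetical stand-in: a fixed pool over a bounded queue. When the queue
// is full, CallerRunsPolicy makes the submitting thread execute the task
// itself, which throttles how fast new chunks can be submitted.
int threads = Runtime.getRuntime().availableProcessors();
ThreadPoolExecutor executor = new ThreadPoolExecutor(
    threads, threads, 0L, TimeUnit.MILLISECONDS,
    new ArrayBlockingQueue<>(threads * 2),
    new ThreadPoolExecutor.CallerRunsPolicy());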