Use of zemberek.core.text.BlockTextLoader in project zemberek-nlp by ahmetaa.
Class ItemFindExperiment, method processCorpus.
static void processCorpus(Path in, Path out) throws IOException {
  BlockTextLoader loader = BlockTextLoader.fromPath(in, 10000);
  try (PrintWriter pw = new PrintWriter(out.toFile(), "utf-8")) {
    for (TextChunk chunk : loader) {
      // Deduplicate lines within the chunk, preserving order.
      LinkedHashSet<String> unique = new LinkedHashSet<>(chunk.getData());
      for (String l : unique) {
        // Skip lines containing markup-like characters.
        if (!Strings.containsNone(l, "[]#~|")) {
          continue;
        }
        // 'es' is a Locale field of the class (apparently Spanish, judging
        // by the accented characters kept by the regex below).
        l = l.toLowerCase(es)
            .replaceAll("[^0-9a-zñáéíóúü]", " ")
            .replaceAll("\\s+", " ")
            .trim();
        // Skip short lines; this also covers the empty-string case.
        if (l.length() < 20) {
          continue;
        }
        pw.println(l);
      }
    }
  }
}
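For reference, a minimal self-contained sketch of the BlockTextLoader iteration pattern used above: load a file in fixed-size line blocks and iterate over chunks. The corpus path is hypothetical; BlockTextLoader.fromPath, TextChunk, and chunk.size() are taken from the snippets on this page.

import java.nio.file.Path;
import java.nio.file.Paths;
import zemberek.core.text.BlockTextLoader;
import zemberek.core.text.TextChunk;

public class BlockLoaderSketch {
  public static void main(String[] args) {
    // Hypothetical corpus path; replace with a real file.
    Path corpus = Paths.get("corpus.txt");
    // Iterate the file in blocks of 10,000 lines, as processCorpus does.
    BlockTextLoader loader = BlockTextLoader.fromPath(corpus, 10_000);
    long lineCount = 0;
    for (TextChunk chunk : loader) {
      lineCount += chunk.size(); // number of lines in this block
    }
    System.out.println("Total lines: " + lineCount);
  }
}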
Use of zemberek.core.text.BlockTextLoader in project zemberek-nlp by ahmetaa.
Class StemDisambiguationExperiment, method doit.
private void doit() throws IOException, InterruptedException {
  System.setProperty("org.jline.terminal.dumb", "true");
  List<Path> paths = new ArrayList<>();
  if (input.toFile().isFile()) {
    paths.add(input);
  } else {
    Set<String> dirNamesToProcess = new HashSet<>();
    if (dirList != null) {
      List<String> dirNames = TextIO.loadLines(dirList, "#");
      Log.info("Directory names to process:");
      for (String dirName : dirNames) {
        Log.info(dirName);
      }
      dirNamesToProcess.addAll(dirNames);
    }
    List<Path> directories = Files.walk(input, recurse ? Integer.MAX_VALUE : 1)
        .filter(s -> s.toFile().isDirectory() && !s.equals(input))
        .collect(Collectors.toList());
    for (Path directory : directories) {
      if (dirList != null && !dirNamesToProcess.contains(directory.toFile().getName())) {
        continue;
      }
      paths.addAll(Files.walk(directory, 1)
          .filter(s -> s.toFile().isFile())
          .collect(Collectors.toList()));
    }
  }
  Log.info("There are %d files to process.", paths.size());
  long totalLines = 0;
  for (Path path : paths) {
    totalLines += TextIO.lineCount(path);
  }
  if (paths.size() == 0) {
    Log.info("No corpus files found for input: %s", input);
    System.exit(0);
  }
  AtomicLong sentenceCount = new AtomicLong(0);
  try (PrintWriter pw = new PrintWriter(output.toFile(), "UTF-8")) {
    BlockTextLoader loader = BlockTextLoader.fromPaths(paths, 30_000);
    BlockingExecutor executor = new BlockingExecutor(threadCount);
    for (TextChunk chunk : loader) {
      executor.submit(() -> {
        // Deduplicate lines in the chunk, then clean and split into sentences.
        List<String> data = new ArrayList<>(new LinkedHashSet<>(chunk.getData()));
        List<String> sentences = TextCleaner.cleanAndExtractSentences(data);
        sentences = sentences.stream()
            .filter(this::unambiguous)
            .map(s -> toLowercase ? s.toLowerCase(Turkish.LOCALE) : s)
            .collect(Collectors.toList());
        // Writer and counter are shared; serialize access across tasks.
        synchronized (this) {
          sentences.forEach(pw::println);
          sentenceCount.addAndGet(sentences.size());
          System.out.println(chunk.size()); // progress: lines in this chunk
        }
      });
    }
    executor.shutdown();
    // Wait for in-flight tasks before try-with-resources closes the writer;
    // otherwise late tasks may print to a closed stream.
    executor.awaitTermination(1, TimeUnit.DAYS);
  }
  Log.info("%d sentences were written to %s", sentenceCount.get(), output);
}
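The chunk-per-task pattern above is easy to get wrong around shutdown. Below is a minimal sketch of the same pattern using a plain java.util.concurrent pool standing in for zemberek's BlockingExecutor (which additionally bounds the task queue); the chunk type and the empty work sections are placeholders.

import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

public class ParallelChunkSketch {
  // Processes each chunk on a worker thread, then waits for completion so
  // that any shared writer can be closed safely afterwards.
  public static void process(Iterable<List<String>> chunks, int threadCount)
      throws InterruptedException {
    ExecutorService executor = Executors.newFixedThreadPool(threadCount);
    Object lock = new Object();
    for (List<String> chunk : chunks) {
      executor.submit(() -> {
        // ... transform the chunk ...
        synchronized (lock) {
          // ... write results to the shared sink ...
        }
      });
    }
    executor.shutdown();
    executor.awaitTermination(1, TimeUnit.DAYS);
  }
}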
Use of zemberek.core.text.BlockTextLoader in project zemberek-nlp by ahmetaa.
Class Dictionary, method readFromFile.
static Dictionary readFromFile(Path file, final Args args) throws IOException {
  Log.info("Initialize dictionary and histograms.");
  Dictionary dictionary = new Dictionary(args);
  Log.info("Loading text.");
  BlockTextLoader loader = new BlockTextLoader(file, 100_000);
  SpaceTabTokenizer tokenizer = new SpaceTabTokenizer();
  int blockCounter = 1;
  for (List<String> lines : loader) {
    for (String line : lines) {
      List<String> split = tokenizer.splitToList(line);
      split.add(EOS);
      for (String word : split) {
        if (word.startsWith("#")) {
          continue;
        }
        dictionary.addWithCount(word, 1);
      }
    }
    Log.info("Lines read: %d (thousands)", blockCounter * 100);
    blockCounter++;
  }
  Log.info("Word + Label count = %d", dictionary.words_.size());
  Log.info("Removing words and labels with small counts. Min word = %d, Min label = %d",
      args.minCount, args.minCountLabel);
  // Now we have the histograms. Remove based on count:
  // sort by type first, then by descending count within each type.
  dictionary.words_.sort((e1, e2) -> {
    if (e1.type != e2.type) {
      return Integer.compare(e1.type, e2.type);
    } else {
      return Long.compare(e2.count, e1.count);
    }
  });
  LinkedHashSet<Entry> all = new LinkedHashSet<>(dictionary.words_);
  List<Entry> toRemove = dictionary.words_.stream()
      .filter(s -> (s.type == TYPE_WORD && s.count < args.minCount
          || s.type == TYPE_LABEL && s.count < args.minCountLabel))
      .collect(Collectors.toList());
  all.removeAll(toRemove);
  dictionary.words_ = new ArrayList<>(all);
  dictionary.size_ = 0;
  dictionary.nwords_ = 0;
  dictionary.nlabels_ = 0;
  Arrays.fill(dictionary.word2int_, -1);
  for (Entry e : dictionary.words_) {
    // Re-register each surviving entry and rebuild the word -> index table.
    int i = dictionary.find(e.word);
    dictionary.word2int_[i] = dictionary.size_++;
    if (e.type == TYPE_WORD) {
      dictionary.nwords_++;
    }
    if (e.type == TYPE_LABEL) {
      dictionary.nlabels_++;
    }
  }
  Log.info("Word count = %d, Label count = %d", dictionary.nwords(), dictionary.nlabels());
  dictionary.initTableDiscard();
  Log.info("Adding character n-grams for words.");
  dictionary.initNgrams();
  Log.info("Done.");
  return dictionary;
}
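The sort-then-prune step is the core of this method. Here is a self-contained sketch of just that logic, with a simplified Entry class standing in for the dictionary's fastText-style entries; the TYPE_WORD/TYPE_LABEL constants and the thresholds are illustrative.

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

public class PruneSketch {
  static final int TYPE_WORD = 0;
  static final int TYPE_LABEL = 1;

  static class Entry {
    final String word;
    final int type;
    final long count;
    Entry(String word, int type, long count) {
      this.word = word;
      this.type = type;
      this.count = count;
    }
  }

  // Sorts words before labels, most frequent first within each type,
  // then drops entries below the per-type minimum counts.
  static List<Entry> prune(List<Entry> entries, long minCount, long minCountLabel) {
    List<Entry> sorted = new ArrayList<>(entries);
    sorted.sort(Comparator.<Entry>comparingInt(e -> e.type)
        .thenComparing(Comparator.<Entry>comparingLong(e -> e.count).reversed()));
    sorted.removeIf(e -> (e.type == TYPE_WORD && e.count < minCount)
        || (e.type == TYPE_LABEL && e.count < minCountLabel));
    return sorted;
  }
}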
Use of zemberek.core.text.BlockTextLoader in project zemberek-nlp by ahmetaa.
Class PreprocessTurkishCorpus, method run.
@Override
public void run() throws IOException, InterruptedException {
  List<Path> paths = new ArrayList<>();
  if (input.toFile().isFile()) {
    paths.add(input);
  } else {
    Set<String> dirNamesToProcess = new HashSet<>();
    if (dirList != null) {
      List<String> dirNames = TextIO.loadLines(dirList);
      Log.info("Directory names to process:");
      for (String dirName : dirNames) {
        Log.info(dirName);
      }
      dirNamesToProcess.addAll(dirNames);
    }
    List<Path> directories = Files.walk(input, recurse ? Integer.MAX_VALUE : 1)
        .filter(s -> s.toFile().isDirectory() && !s.equals(input))
        .collect(Collectors.toList());
    for (Path directory : directories) {
      if (dirList != null && !dirNamesToProcess.contains(directory.toFile().getName())) {
        continue;
      }
      // Note: Path.endsWith compares path elements, not string suffixes,
      // so the extension check must go through toString().
      paths.addAll(Files.walk(directory, 1)
          .filter(s -> s.toFile().isFile()
              && (extension == null || s.toString().endsWith(extension)))
          .collect(Collectors.toList()));
    }
  }
  Log.info("There are %d files to process.", paths.size());
  long totalLines = 0;
  for (Path path : paths) {
    totalLines += TextIO.lineCount(path);
  }
  final long total = totalLines;
  if (paths.size() == 0) {
    Log.info("No corpus files found for input: %s", input);
    System.exit(0);
  }
  AtomicLong sentenceCount = new AtomicLong(0);
  if (operation == Operation.LEMMA) {
    morphology = TurkishMorphology.createWithDefaults();
  }
  try (PrintWriter pw = new PrintWriter(output.toFile(), "UTF-8")) {
    BlockTextLoader loader = BlockTextLoader.fromPaths(paths, 10_000);
    BlockingExecutor executor = new BlockingExecutor(threadCount);
    AtomicInteger count = new AtomicInteger(0);
    for (TextChunk chunk : loader) {
      executor.submit(() -> {
        List<String> processed = chunk.getData().stream()
            .filter(s -> !s.startsWith("<")) // ignore meta tag lines.
            .map(TextUtil::normalizeSpacesAndSoftHyphens)
            .collect(Collectors.toList());
        List<String> sentences = TurkishSentenceExtractor.DEFAULT.fromParagraphs(processed);
        sentences = sentences.stream()
            .filter(s -> !TextUtil.containsCombiningDiacritics(s))
            .map(s -> {
              if (operation == Operation.LEMMA) {
                return replaceWordsWithLemma(s);
              } else {
                return String.join(" ", TurkishTokenizer.DEFAULT.tokenizeToStrings(s));
              }
            })
            .map(s -> toLowercase ? s.toLowerCase(Turkish.LOCALE) : s)
            .collect(Collectors.toList());
        // Writer and counters are shared; serialize access across tasks.
        synchronized (this) {
          sentences.forEach(pw::println);
          sentenceCount.addAndGet(sentences.size());
          int c = count.addAndGet(chunk.size());
          System.out.println(String.format("(%d of %d lines) processed.", c, total));
        }
      });
    }
    executor.shutdown();
    executor.awaitTermination(1, TimeUnit.DAYS);
  }
  Log.info("%d sentences were written to %s", sentenceCount.get(), output);
}
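The sentence-extraction and tokenization calls above can be tried in isolation. A minimal sketch follows; the package names assume the usual zemberek-nlp module layout, so verify them against the project version in use.

import java.util.Arrays;
import java.util.List;
import zemberek.tokenization.TurkishSentenceExtractor;
import zemberek.tokenization.TurkishTokenizer;

public class ExtractAndTokenizeSketch {
  public static void main(String[] args) {
    List<String> paragraphs = Arrays.asList("Merhaba dünya. Bu bir deneme.");
    // Split paragraphs into sentences, then re-join each sentence's tokens
    // with single spaces, as run() does on the non-LEMMA path.
    List<String> sentences = TurkishSentenceExtractor.DEFAULT.fromParagraphs(paragraphs);
    for (String s : sentences) {
      System.out.println(String.join(" ", TurkishTokenizer.DEFAULT.tokenizeToStrings(s)));
    }
  }
}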
Use of zemberek.core.text.BlockTextLoader in project zemberek-nlp by ahmetaa.
Class Dictionary, method readFromFile.
static Dictionary readFromFile(Path file, final Args args) {
  Log.info("Initialize dictionary and histograms.");
  Dictionary dictionary = new Dictionary(args);
  Log.info("Loading text.");
  BlockTextLoader loader = BlockTextLoader.fromPath(file, 100_000);
  SpaceTabTokenizer tokenizer = new SpaceTabTokenizer();
  int blockCounter = 1;
  for (TextChunk lines : loader) {
    for (String line : lines) {
      List<String> split = tokenizer.splitToList(line);
      split.add(EOS);
      for (String word : split) {
        if (word.startsWith("#")) {
          continue;
        }
        dictionary.add(word);
      }
    }
    Log.info("Lines read: %d (thousands)", blockCounter * 100);
    blockCounter++;
  }
  Log.info("Word + Label count = %d", dictionary.words_.size());
  Log.info("Removing words and labels with small counts. Min word = %d, Min label = %d",
      args.minCount, args.minCountLabel);
  // Now we have the histograms. Remove based on count:
  // sort by type first, then by descending count within each type.
  dictionary.words_.sort((e1, e2) -> {
    if (e1.type != e2.type) {
      return Integer.compare(e1.type, e2.type);
    } else {
      return Long.compare(e2.count, e1.count);
    }
  });
  // TODO: add threshold method.
  LinkedHashSet<Entry> all = new LinkedHashSet<>(dictionary.words_);
  List<Entry> toRemove = dictionary.words_.stream()
      .filter(s -> (s.type == TYPE_WORD && s.count < args.minCount
          || s.type == TYPE_LABEL && s.count < args.minCountLabel))
      .collect(Collectors.toList());
  all.removeAll(toRemove);
  dictionary.words_ = new ArrayList<>(all);
  dictionary.size_ = 0;
  dictionary.nwords_ = 0;
  dictionary.nlabels_ = 0;
  Arrays.fill(dictionary.word2int_, -1);
  for (Entry e : dictionary.words_) {
    // Re-register each surviving entry and rebuild the word -> index table.
    int i = dictionary.find(e.word);
    dictionary.word2int_[i] = dictionary.size_++;
    if (e.type == TYPE_WORD) {
      dictionary.nwords_++;
    }
    if (e.type == TYPE_LABEL) {
      dictionary.nlabels_++;
    }
  }
  Log.info("Word count = %d, Label count = %d", dictionary.nwords(), dictionary.nlabels());
  dictionary.initTableDiscard();
  dictionary.initNgrams();
  return dictionary;
}
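For context, find() in fastText-style dictionaries typically probes a fixed-size open-addressed table where each slot is either empty (-1) or holds an entry's index in the word list. The sketch below is an assumption about that behavior for illustration, not zemberek's exact implementation (which may use a different hash function).

import java.util.List;

public class FindSketch {
  // Returns the table slot for word: either the slot already holding it,
  // or the first empty slot (-1) where it should be registered.
  static int find(String word, int[] word2int, List<String> words) {
    int h = (word.hashCode() & 0x7fffffff) % word2int.length;
    while (word2int[h] != -1 && !words.get(word2int[h]).equals(word)) {
      h = (h + 1) % word2int.length; // linear probe on collision
    }
    return h;
  }
}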