Use of zemberek.core.text.BlockTextLoader in project zemberek-nlp by ahmetaa.
Class NoisyWordsLexiconGenerator, method main.
/**
 * Runs the noisy-word lexicon generation over a hard-coded corpus location.
 *
 * <p>Steps, in order: pick a worker thread count, set up the work directory and
 * vocabulary file paths, build the graph from the corpus, then create
 * candidates for the words read from the "incorrect" and "possibly-incorrect"
 * files.
 *
 * @param args ignored; all paths are hard-coded below
 * @throws Exception if any of the underlying file or generation steps fails
 */
public static void main(String[] args) throws Exception {
  // Use half of the available processors, capped at 22. The lower bound of 1
  // matters on single-core machines, where availableProcessors() / 2 is 0.
  int threadCount = Math.max(1, Math.min(22, Runtime.getRuntime().availableProcessors() / 2));

  Path corporaRoot = Paths.get("/home/aaa/data/normalization/corpus");
  Path workDir = Paths.get("/home/aaa/data/normalization/test-large");
  Path corpusDirList = corporaRoot.resolve("all-list");
  Files.createDirectories(workDir);

  Path correct = workDir.resolve("correct");
  Path incorrect = workDir.resolve("incorrect");
  Path maybeIncorrect = workDir.resolve("possibly-incorrect");

  // NOTE(review): the meaning of the trailing 1, 3, 1 arguments is not visible here;
  // confirm against the NormalizationVocabulary constructor.
  NormalizationVocabulary vocabulary =
      new NormalizationVocabulary(correct, incorrect, maybeIncorrect, 1, 3, 1);
  NoisyWordsLexiconGenerator generator = new NoisyWordsLexiconGenerator(vocabulary, threadCount);
  BlockTextLoader corpusProvider =
      BlockTextLoader.fromDirectoryRoot(corporaRoot, corpusDirList, 50_000);

  // create graph
  Path graphPath = workDir.resolve("graph");
  generator.createGraph(corpusProvider, graphPath);

  // Merge the "possibly-incorrect" histogram into the "incorrect" one before
  // generating candidates from the graph.
  Histogram<String> incorrectWords = Histogram.loadFromUtf8File(incorrect, ' ');
  incorrectWords.add(Histogram.loadFromUtf8File(maybeIncorrect, ' '));
  generator.createCandidates(graphPath, workDir, incorrectWords);
  Log.info("Done");
}
Use of zemberek.core.text.BlockTextLoader in project zemberek-nlp by ahmetaa.
Class NormalizationScripts, method splitSingleFileCorpus.
/**
 * Splits a single corpus file into numbered block files under {@code outRoot}.
 *
 * <p>Each chunk produced by the loader is written as UTF-8 to a file named
 * {@code <input file name>.<block number>}, with block numbers starting at 0.
 *
 * @param in the corpus file to split
 * @param outRoot directory that receives the block files; created if missing
 * @throws IOException if the directory cannot be created or a block cannot be written
 */
static void splitSingleFileCorpus(Path in, Path outRoot) throws IOException {
  final int chunkSize = 100_000;
  BlockTextLoader chunks = BlockTextLoader.fromPath(in, chunkSize);
  Files.createDirectories(outRoot);
  int blockNumber = 0;
  for (TextChunk chunk : chunks) {
    String sourceName = in.toFile().getName();
    Path target = outRoot.resolve(sourceName + "." + blockNumber);
    Files.write(target, chunk, StandardCharsets.UTF_8);
    blockNumber += 1;
  }
}
Use of zemberek.core.text.BlockTextLoader in project zemberek-nlp by ahmetaa.
Class CorpusNerCollector, method main.
/**
 * Collects automatically annotated NER training sentences from a corpus.
 *
 * <p>Sentences are extracted chunk by chunk, run through a previously trained
 * {@code PerceptronNer} model, filtered, and written out in bracket annotation
 * style in files of 1000 sentences each. The JVM exits once 11 files have been
 * written.
 *
 * <p>A sentence is kept only when all of the following hold:
 * <ul>
 *   <li>it is at most 100 characters long,</li>
 *   <li>it contains one or two named entities,</li>
 *   <li>no entity token is one of the punctuation marks in {@code illegal},</li>
 *   <li>every morphological analysis of every entity token has a dictionary item
 *       whose secondary POS is {@code Abbreviation} or {@code ProperNoun}.</li>
 * </ul>
 *
 * <p>NOTE: sentences still buffered when the corpus is exhausted (fewer than
 * 1000) are not written.
 *
 * @param args ignored; all paths are hard-coded below
 * @throws IOException if the output directory or an output file cannot be written
 */
public static void main(String[] args) throws IOException {
  Path corporaRoot = Paths.get("/media/ahmetaa/depo/corpora");
  Path corpusDirList = corporaRoot.resolve("ner-list");
  Path outRoot = Paths.get("/media/ahmetaa/depo/ner/out");
  Files.createDirectories(outRoot);
  BlockTextLoader corpusProvider =
      BlockTextLoader.fromDirectoryRoot(corporaRoot, corpusDirList, 10_000);

  // assumes you generated a model in my-model directory.
  Path modelRoot = Paths.get("my-model");
  TurkishMorphology morphology = TurkishMorphology.builder()
      .setLexicon(RootLexicon.getDefault())
      .disableUnidentifiedTokenAnalyzer()
      .build();
  PerceptronNer ner = PerceptronNer.loadModel(modelRoot, morphology);

  Set<String> illegal = Sets.newHashSet(".", ",", "!", "?", ":");
  List<String> lines = new ArrayList<>();
  // Index of the next output file; also counts the files written so far.
  int k = 0;
  for (TextChunk chunk : corpusProvider) {
    // LinkedHashSet drops duplicate sentences within the chunk while keeping their order.
    LinkedHashSet<String> sentences =
        new LinkedHashSet<>(TextCleaner.cleanAndExtractSentences(chunk.getData()));
    for (String sentence : sentences) {
      if (sentence.length() > 100) {
        continue;
      }
      NerSentence result = ner.findNamedEntities(sentence);
      List<NamedEntity> nes = result.getNamedEntities();
      int neCount = nes.size();
      // Cheap filter first: only sentences with one or two entities can be kept,
      // so there is no point analysing the tokens of any other sentence.
      if (neCount == 0 || neCount >= 3) {
        continue;
      }
      if (hasRejectedEntity(nes, illegal, morphology)) {
        continue;
      }
      lines.add(result.getAsTrainingSentence(AnnotationStyle.BRACKET));
      if (lines.size() == 1000) {
        // The file is named after the chunk being processed when the buffer fills up,
        // even though the buffer may hold sentences from earlier chunks.
        Path out = outRoot.resolve(chunk.id + "-" + k);
        Files.write(out, lines);
        Log.info("%s created. ", out);
        lines = new ArrayList<>();
        k++;
        if (k > 10) {
          System.exit(0);
        }
      }
    }
  }
}

/**
 * Tells whether any of the given named entities should cause its sentence to be rejected.
 *
 * <p>Returns as soon as the first offending token is found, so the remaining
 * tokens are not analysed.
 *
 * @param entities named entities found in one sentence
 * @param illegal token texts that must not appear inside an entity
 * @param morphology analyzer used to look up the dictionary items of each token
 * @return {@code true} if a token is in {@code illegal} or has an analysis whose
 *     dictionary item is neither an abbreviation nor a proper noun
 */
private static boolean hasRejectedEntity(
    List<NamedEntity> entities, Set<String> illegal, TurkishMorphology morphology) {
  for (NamedEntity ne : entities) {
    for (NerToken token : ne.tokens) {
      if (illegal.contains(token.word)) {
        return true;
      }
      WordAnalysis a = morphology.analyze(token.word);
      for (SingleAnalysis analysis : a) {
        DictionaryItem item = analysis.getDictionaryItem();
        if (item.secondaryPos != SecondaryPos.Abbreviation
            && item.secondaryPos != SecondaryPos.ProperNoun) {
          return true;
        }
      }
    }
  }
  return false;
}
Use of zemberek.core.text.BlockTextLoader in project zemberek-nlp by ahmetaa.
Class RemoveDuplicateLines, method recreateCorpus.
/**
 * Re-reads the corpus and writes the selected lines to {@code output} as UTF-8.
 *
 * <p>A line is written when its 1-based position in the corpus equals the value
 * stored in {@code index} for the line's hash. {@code findDuplicates} stores
 * that value the first time it sees a hash. When {@code normalizeLines} is set
 * the hash is computed on the processed line, but the original line is what
 * gets written. With {@code writeCounts} set, each written line is prefixed by
 * the {@code histogram} value for its hash and a space.
 *
 * <p>The {@code count} limit (ignored when -1) is only checked after a whole
 * block has been handled, so somewhat more than {@code count} lines may be read.
 *
 * @throws IOException if the output cannot be opened, or if any write to it failed
 */
private void recreateCorpus() throws IOException {
  int lineCounter = 0;
  int writtenLines = 0;
  try (PrintWriter writer = new PrintWriter(new OutputStreamWriter(IOUtil.geBufferedOutputStream(output), "UTF-8"))) {
    BlockTextLoader loader = BlockTextLoader.fromPath(corpus, 10_000);
    for (TextChunk block : loader) {
      for (String line : block.getData()) {
        String l = line;
        if (normalizeLines) {
          l = process(line);
        }
        lineCounter++;
        if (lineCounter % PROGRESS == 0) {
          Log.info("Total lines read: %d. Lines Written: %d", lineCounter, writtenLines);
        }
        long hash = longHash(l);
        // Only the occurrence whose line number was recorded in the index is written.
        if (index.get(hash) == lineCounter) {
          if (writeCounts) {
            writer.println(histogram.get(hash) + " " + line);
          } else {
            writer.println(line);
          }
          writtenLines++;
        }
      }
      if (count != -1 && lineCounter > count) {
        break;
      }
    }
    Log.info("Total lines read: %d. Lines Written: %d", lineCounter, writtenLines);
    // PrintWriter swallows IOExceptions; checkError() flushes and reports whether
    // any write failed, so a truncated output is not silently treated as success.
    if (writer.checkError()) {
      throw new IOException("Error while writing to " + output);
    }
  }
}
Use of zemberek.core.text.BlockTextLoader in project zemberek-nlp by ahmetaa.
Class RemoveDuplicateLines, method findDuplicates.
/**
 * Scans the corpus and records, per line hash, where the hash was first seen and
 * how many times it occurs.
 *
 * <p>For every line (processed first when {@code normalizeLines} is set) the
 * hash is looked up in {@code index}: an unseen hash is stored together with the
 * current {@code totalCount}, a known one increments {@code duplicateCount}.
 * {@code histogram} is incremented for the hash in both cases. Progress is
 * logged every {@code PROGRESS} lines and a summary is logged at the end.
 *
 * <p>The {@code count} limit (ignored when -1) is only checked after a whole
 * block has been handled.
 */
private void findDuplicates() {
  BlockTextLoader chunks = BlockTextLoader.fromPath(corpus, 10_000);
  int linesRead = 0;
  for (TextChunk chunk : chunks) {
    for (String rawLine : chunk.getData()) {
      String hashedText = rawLine;
      if (normalizeLines) {
        hashedText = process(rawLine);
      }
      totalCount++;
      linesRead++;
      if (totalCount % PROGRESS == 0) {
        Log.info("Total lines read: %d. Duplicates: %d", totalCount, duplicateCount);
      }
      long lineHash = longHash(hashedText);
      if (!index.containsKey(lineHash)) {
        // First occurrence: remember the running line number for this hash.
        index.put(lineHash, totalCount);
      } else {
        duplicateCount++;
      }
      histogram.increment(lineHash);
    }
    // The limit is evaluated per block, not per line.
    boolean limitExceeded = count != -1 && linesRead > count;
    if (limitExceeded) {
      break;
    }
  }
  Log.info("Total lines read: %d. Duplicates: %d", totalCount, duplicateCount);
  Log.info("Duplicate Ratio: %.3f", duplicateCount * 100.0d / totalCount);
}
Aggregations