Usage of zemberek.core.text.BlockTextLoader in the project zemberek-nlp by ahmetaa:
class ProcessTwnertcData, method main.
// Reads the TWNERTC dump (tab-separated: category, NER label sequence, sentence),
// writes fastText-style classification lines and a category histogram.
// NOTE(review): this looks like a partial extraction — nerOut and nerLines are declared
// but never written, and the `ranges` list below is built and then discarded. The
// label loop also only handles the "O" tag; handling for entity tags is presumably
// missing from this snippet. Confirm against the full source before relying on it.
public static void main(String[] args) throws IOException {
Path corpus = Paths.get("/media/ahmetaa/depo/ner/TWNERTC_All_Versions/TWNERTC_TC_Coarse_Grained_NER_DomainDependent_NoiseReduction.DUMP");
Path nerOut = Paths.get("/media/ahmetaa/depo/ner/ner-coarse");
Path categoryOut = Paths.get("/media/ahmetaa/depo/classification/twnertc-data");
// Stream the corpus in 10k-line chunks to bound memory use.
BlockTextLoader loader = BlockTextLoader.fromPath(corpus, 10_000);
List<String> nerLines = new ArrayList<>();
List<String> categoryLines = new ArrayList<>();
Histogram<String> categories = new Histogram<>();
for (TextChunk chunk : loader) {
for (String line : chunk) {
// parts: [0]=category, [1]=space-separated NER labels, [2]=sentence text.
List<String> parts = TextUtil.TAB_SPLITTER.splitToList(line);
// fastText convention: "__label__<category> <text>".
categoryLines.add("__label__" + parts.get(0) + " " + parts.get(2));
categories.add(parts.get(0));
List<String> nerLabels = TextUtil.SPACE_SPLITTER.splitToList(parts.get(1));
List<String> nerWords = TextUtil.SPACE_SPLITTER.splitToList(parts.get(2));
// Skip malformed rows where the label count does not match the token count.
if (nerLabels.size() != nerWords.size()) {
continue;
}
List<NerRange> ranges = new ArrayList<>();
NerRange range = new NerRange();
for (int i = 0; i < nerLabels.size(); i++) {
String lbl = nerLabels.get(i);
String word = nerWords.get(i);
// Only the "O" (outside-entity) tag is handled here; consecutive "O" words are
// accumulated into one range, and a non-"O" range in progress is flushed first.
if (lbl.equals("O")) {
if (range.type == null) {
range.type = "O";
} else {
if (range.type.equals("O")) {
range.seq.add(word);
} else {
ranges.add(range);
range = new NerRange();
range.type = "O";
range.seq.add(word);
}
}
}
}
}
// Progress log: approximate number of lines processed so far.
Log.info(chunk.index * loader.getBlockSize());
}
Files.write(categoryOut, categoryLines);
// Persist category frequencies, most frequent first, space-separated.
categories.saveSortedByCounts(Paths.get("/media/ahmetaa/depo/classification/categories"), " ");
}
Usage of zemberek.core.text.BlockTextLoader in the project zemberek-nlp by ahmetaa:
class NormalizationScripts, method cleanTwitterData.
/**
 * Cleans a Twitter corpus: loads {@code in} in 20k-line blocks and submits each block
 * to a bounded thread pool as a {@link TwitterTask}; results are written by the shared
 * {@link TwitterSaver} to {@code out} (foreign-language lines go to {@code out + ".foreign"}).
 *
 * @param in  path of the raw Twitter corpus to clean
 * @param out path of the cleaned output; a sibling ".foreign" file is also produced
 * @throws Exception if morphology setup, a worker task, or executor shutdown fails
 */
static void cleanTwitterData(Path in, Path out) throws Exception {
  AnalysisCache cache = AnalysisCache.builder().dynamicCacheSize(300_000, 500_000).build();
  TurkishMorphology morphology = TurkishMorphology.builder()
      .setCache(cache)
      .setLexicon(RootLexicon.getDefault())
      .disableUnidentifiedTokenAnalyzer()
      .build();
  // Use half the cores, capped at 20 threads.
  int threadCount = Math.min(Runtime.getRuntime().availableProcessors() / 2, 20);
  // BlockingExecutor applies back-pressure so submission cannot outrun the workers.
  ExecutorService executorService = new BlockingExecutor(threadCount);
  CompletionService<TwitterSaver> service = new ExecutorCompletionService<>(executorService);
  int blockSize = 20_000;
  BlockTextLoader loader = BlockTextLoader.fromPath(in, blockSize);
  Path foreign = Paths.get(out.toString() + ".foreign");
  TwitterSaver saver = new TwitterSaver(out, foreign, blockSize);
  int bc = 0;
  for (TextChunk block : loader) {
    service.submit(new TwitterTask(morphology, saver, block, bc));
    bc++;
  }
  // BUG FIX: the submitted futures were never retrieved, so exceptions thrown inside
  // TwitterTask workers were silently discarded. Drain the completion service so any
  // worker failure propagates to the caller (get() rethrows as ExecutionException).
  for (int i = 0; i < bc; i++) {
    service.take().get();
  }
  executorService.shutdown();
  executorService.awaitTermination(1, TimeUnit.DAYS);
}
Usage of zemberek.core.text.BlockTextLoader in the project zemberek-nlp by ahmetaa:
class QuestionClassifier, method collectCoarseData.
/**
 * Collects coarse question-classification training data from a subtitle corpus.
 * Lines ending with '?' become "__label__question" examples (about one third of them
 * with the '?' stripped, so the model does not rely on punctuation alone); other lines
 * become "__label__not_question" examples. Writes the two raw sets and a shuffled
 * combined file.
 *
 * @throws IOException if the corpus cannot be read or an output file cannot be written
 */
private static void collectCoarseData() throws IOException {
  Path root = Paths.get("/media/ahmetaa/depo/corpora/open-subtitles-tr-2018-small");
  BlockTextLoader corpusProvider = BlockTextLoader.fromDirectory(root);
  // LinkedHashSet: de-duplicates while keeping corpus order.
  LinkedHashSet<String> questions = new LinkedHashSet<>();
  LinkedHashSet<String> notQuestions = new LinkedHashSet<>();
  Random rnd = new Random();
  int quesCount = 200_000;
  int noQuesCount = 300_000;
  for (TextChunk chunk : corpusProvider) {
    for (String line : chunk) {
      // Skip overly long lines; short sentences are preferred for this task.
      if (line.length() > 80) {
        continue;
      }
      if (line.endsWith("?") && questions.size() < quesCount) {
        // ~1/3 of question examples lose their '?' so the classifier learns
        // from wording, not just punctuation.
        int r = rnd.nextInt(3);
        if (r < 2) {
          questions.add("__label__question " + line);
        } else {
          questions.add("__label__question " + line.replaceAll("\\?", "").trim());
        }
      } else {
        if (notQuestions.size() < noQuesCount) {
          notQuestions.add("__label__not_question " + line);
        }
      }
    }
    if (questions.size() == quesCount && notQuestions.size() == noQuesCount) {
      break;
    }
  }
  Path outQ = Paths.get("/media/ahmetaa/depo/classification/question/coarse/questions-raw");
  Files.write(outQ, questions, StandardCharsets.UTF_8);
  Path outNotQ = Paths.get("/media/ahmetaa/depo/classification/question/coarse/not_questions-raw");
  // BUG FIX: this previously wrote `questions` again, so the not-question file
  // contained question data and the collected notQuestions set was never saved.
  Files.write(outNotQ, notQuestions, StandardCharsets.UTF_8);
  List<String> all = new ArrayList<>(questions);
  all.addAll(notQuestions);
  Collections.shuffle(all);
  Path allData = Paths.get("/media/ahmetaa/depo/classification/question/coarse/all-raw");
  Files.write(allData, all, StandardCharsets.UTF_8);
}
Usage of zemberek.core.text.BlockTextLoader in the project zemberek-nlp by ahmetaa:
class ProcessNormalizationCorpus, method main.
/**
 * Entry point: normalizes a corpus collection. Builds a TurkishSentenceNormalizer
 * from the normalization data and language model, then processes every corpus file
 * listed in "clean-list" under the corpora root, writing cleaned output to outRoot.
 *
 * @param args unused
 * @throws Exception if morphology loading, directory creation, or processing fails
 */
public static void main(String[] args) throws Exception {
  // Paths for normalization resources and the compressed language model.
  Path dataRoot = Paths.get("/home/aaa/data/normalization/test-large");
  Path languageModel = Paths.get("/home/aaa/data/normalization/lm.slm");
  TurkishMorphology turkishMorphology = getTurkishMorphology();
  TurkishSentenceNormalizer normalizer =
      new TurkishSentenceNormalizer(turkishMorphology, dataRoot, languageModel);
  ProcessNormalizationCorpus corpusProcessor = new ProcessNormalizationCorpus(normalizer);

  Path corporaRoot = Paths.get("/home/aaa/data/corpora");
  Path outRoot = Paths.get("/home/aaa/data/normalization/corpus/clean");
  Files.createDirectories(outRoot);
  // "clean-list" enumerates which corpus directories to include.
  BlockTextLoader corpusProvider =
      BlockTextLoader.fromDirectoryRoot(corporaRoot, corporaRoot.resolve("clean-list"), BLOCK_SIZE);

  // create vocabularies
  // Half the available cores, but never more than 10 worker threads.
  int workers = Math.min(Runtime.getRuntime().availableProcessors() / 2, 10);
  corpusProcessor.process(corpusProvider, workers, outRoot);
  Log.info("Done.");
}
Usage of zemberek.core.text.BlockTextLoader in the project zemberek-nlp by ahmetaa:
class NormalizationVocabularyGenerator, method main.
/**
 * Entry point: builds normalization vocabularies. Loads every corpus file listed in
 * "clean-list" under the corpora root in 30k-line blocks and hands them to
 * NormalizationVocabularyGenerator, writing vocabulary files into outRoot.
 *
 * @param args unused
 * @throws Exception if morphology loading, directory creation, or generation fails
 */
public static void main(String[] args) throws Exception {
  NormalizationVocabularyGenerator vocabularyGenerator =
      new NormalizationVocabularyGenerator(getTurkishMorphology());

  Path corporaRoot = Paths.get("/home/aaa/data/normalization/corpus");
  Path outRoot = Paths.get("/home/aaa/data/normalization/vocab-clean");
  Files.createDirectories(outRoot);
  // "clean-list" selects which corpus directories participate; 30k-line blocks.
  BlockTextLoader corpusProvider =
      BlockTextLoader.fromDirectoryRoot(corporaRoot, corporaRoot.resolve("clean-list"), 30_000);

  // create vocabularies
  // Half the available cores, but never more than 22 worker threads.
  int workers = Math.min(Runtime.getRuntime().availableProcessors() / 2, 22);
  vocabularyGenerator.createVocabulary(corpusProvider, workers, outRoot);
}
Aggregations