Search in sources :

Example 11 with BlockTextLoader

use of zemberek.core.text.BlockTextLoader in project zemberek-nlp by ahmetaa.

From the class ProcessTwnertcData, method main.

/**
 * Reads the TWNERTC dump block by block and derives classification data from it.
 *
 * <p>Every input line is split on tabs; field 0 is used as the category, field 1 as the
 * space-separated NER labels and field 2 as the space-separated sentence. For each line a
 * {@code __label__<category> <sentence>} entry is collected and the category is counted.
 * The collected lines are written to {@code categoryOut}, and the category histogram is saved
 * sorted by count.
 *
 * <p>NOTE(review): the NER range grouping below only handles the "O" label and its result
 * ({@code ranges}) is never consumed; {@code nerOut} and {@code nerLines} are also unused in
 * this method. This looks like unfinished work and is kept as-is.
 *
 * @param args ignored
 * @throws IOException if writing the output files fails
 */
public static void main(String[] args) throws IOException {
    Path corpus = Paths.get("/media/ahmetaa/depo/ner/TWNERTC_All_Versions/TWNERTC_TC_Coarse_Grained_NER_DomainDependent_NoiseReduction.DUMP");
    Path nerOut = Paths.get("/media/ahmetaa/depo/ner/ner-coarse");
    Path categoryOut = Paths.get("/media/ahmetaa/depo/classification/twnertc-data");
    BlockTextLoader loader = BlockTextLoader.fromPath(corpus, 10_000);
    List<String> nerLines = new ArrayList<>();
    List<String> categoryLines = new ArrayList<>();
    Histogram<String> categories = new Histogram<>();
    for (TextChunk chunk : loader) {
        for (String line : chunk) {
            List<String> fields = TextUtil.TAB_SPLITTER.splitToList(line);
            String category = fields.get(0);
            String sentence = fields.get(2);
            categoryLines.add("__label__" + category + " " + sentence);
            categories.add(category);

            List<String> labels = TextUtil.SPACE_SPLITTER.splitToList(fields.get(1));
            List<String> words = TextUtil.SPACE_SPLITTER.splitToList(sentence);
            // Labels and words must align one-to-one; otherwise the line is skipped for NER.
            if (labels.size() != words.size()) {
                continue;
            }

            List<NerRange> ranges = new ArrayList<>();
            NerRange current = new NerRange();
            for (int idx = 0; idx < labels.size(); idx++) {
                // Only the outside ("O") label is handled here.
                if (!labels.get(idx).equals("O")) {
                    continue;
                }
                String token = words.get(idx);
                if (current.type == null) {
                    // First "O" seen on a fresh range: only the type is set, the token is not stored.
                    current.type = "O";
                    continue;
                }
                if (!current.type.equals("O")) {
                    // Close the running non-"O" range and start a new "O" range.
                    ranges.add(current);
                    current = new NerRange();
                    current.type = "O";
                }
                current.seq.add(token);
            }
        }
        // Progress report after each chunk.
        Log.info(chunk.index * loader.getBlockSize());
    }
    Files.write(categoryOut, categoryLines);
    categories.saveSortedByCounts(Paths.get("/media/ahmetaa/depo/classification/categories"), " ");
}
Also used : Path(java.nio.file.Path) Histogram(zemberek.core.collections.Histogram) BlockTextLoader(zemberek.core.text.BlockTextLoader) ArrayList(java.util.ArrayList) TextChunk(zemberek.core.text.TextChunk)

Example 12 with BlockTextLoader

use of zemberek.core.text.BlockTextLoader in project zemberek-nlp by ahmetaa.

From the class NormalizationScripts, method cleanTwitterData.

/**
 * Cleans a Twitter corpus by splitting it into blocks and handing each block to a worker pool.
 *
 * <p>The input is loaded with a block size of 20_000. Each block is wrapped in a
 * {@code TwitterTask} together with a shared {@code TwitterSaver} that is constructed with
 * {@code out} and a sibling path {@code out + ".foreign"}. After all blocks are submitted the
 * executor is shut down and this method waits up to one day for the tasks to finish.
 *
 * <p>NOTE(review): results submitted to the completion service are never retrieved, so an
 * exception thrown inside a task will not surface here — confirm this is intended.
 *
 * @param in  path of the raw corpus to read
 * @param out path handed to the saver for the cleaned output
 * @throws Exception if loading, task submission or waiting for termination fails
 */
static void cleanTwitterData(Path in, Path out) throws Exception {
    AnalysisCache cache = AnalysisCache.builder().dynamicCacheSize(300_000, 500_000).build();
    TurkishMorphology morphology = TurkishMorphology.builder().setCache(cache).setLexicon(RootLexicon.getDefault()).disableUnidentifiedTokenAnalyzer().build();
    // Use half of the available processors, capped at 20. availableProcessors() / 2 is 0 on a
    // single-CPU host, which is not a usable pool size, so never go below one worker.
    int threadCount = Math.max(1, Math.min(20, Runtime.getRuntime().availableProcessors() / 2));
    ExecutorService executorService = new BlockingExecutor(threadCount);
    CompletionService<TwitterSaver> service = new ExecutorCompletionService<>(executorService);
    int blockSize = 20_000;
    BlockTextLoader loader = BlockTextLoader.fromPath(in, blockSize);
    Path foreign = Paths.get(out.toString() + ".foreign");
    TwitterSaver saver = new TwitterSaver(out, foreign, blockSize);
    // bc is the running block counter passed to each task.
    int bc = 0;
    for (TextChunk block : loader) {
        service.submit(new TwitterTask(morphology, saver, block, bc));
        bc++;
    }
    executorService.shutdown();
    executorService.awaitTermination(1, TimeUnit.DAYS);
}
Also used : AnalysisCache(zemberek.morphology.analysis.AnalysisCache) Path(java.nio.file.Path) BlockTextLoader(zemberek.core.text.BlockTextLoader) ExecutorService(java.util.concurrent.ExecutorService) ExecutorCompletionService(java.util.concurrent.ExecutorCompletionService) TextChunk(zemberek.core.text.TextChunk) TurkishMorphology(zemberek.morphology.TurkishMorphology) BlockingExecutor(zemberek.core.concurrency.BlockingExecutor)

Example 13 with BlockTextLoader

use of zemberek.core.text.BlockTextLoader in project zemberek-nlp by ahmetaa.

From the class QuestionClassifier, method collectCoarseData.

/**
 * Collects raw training data for a coarse question / not-question classifier from a subtitle
 * corpus.
 *
 * <p>Lines longer than 80 characters are ignored. A line ending with {@code ?} is stored as a
 * question (up to 200_000 unique entries); roughly one third of them have their question marks
 * removed. Every other line is stored as a not-question (up to 300_000 unique entries).
 * Iteration stops after the chunk in which both quotas are reached.
 *
 * <p>Three files are written in UTF-8: the questions, the not-questions, and a shuffled
 * concatenation of both.
 *
 * @throws IOException if reading the corpus or writing an output file fails
 */
private static void collectCoarseData() throws IOException {
    Path root = Paths.get("/media/ahmetaa/depo/corpora/open-subtitles-tr-2018-small");
    BlockTextLoader corpusProvider = BlockTextLoader.fromDirectory(root);
    LinkedHashSet<String> questions = new LinkedHashSet<>();
    LinkedHashSet<String> notQuestions = new LinkedHashSet<>();
    Random rnd = new Random();
    int quesCount = 200_000;
    int noQuesCount = 300_000;
    for (TextChunk chunk : corpusProvider) {
        for (String line : chunk) {
            if (line.length() > 80) {
                continue;
            }
            if (line.endsWith("?")) {
                // A question is only ever stored as a question. Once the quota is full it is
                // dropped rather than falling through and being labelled as a not-question.
                if (questions.size() < quesCount) {
                    int r = rnd.nextInt(3);
                    if (r < 2) {
                        questions.add("__label__question " + line);
                    } else {
                        // About one in three questions is kept without its question marks.
                        questions.add("__label__question " + line.replace("?", "").trim());
                    }
                }
            } else if (notQuestions.size() < noQuesCount) {
                notQuestions.add("__label__not_question " + line);
            }
        }
        if (questions.size() >= quesCount && notQuestions.size() >= noQuesCount) {
            break;
        }
    }
    Path outQ = Paths.get("/media/ahmetaa/depo/classification/question/coarse/questions-raw");
    Files.write(outQ, questions, StandardCharsets.UTF_8);
    Path outNotQ = Paths.get("/media/ahmetaa/depo/classification/question/coarse/not_questions-raw");
    // The not-questions file must contain the not-question set, not the questions again.
    Files.write(outNotQ, notQuestions, StandardCharsets.UTF_8);
    List<String> all = new ArrayList<>(questions);
    all.addAll(notQuestions);
    Collections.shuffle(all);
    Path allData = Paths.get("/media/ahmetaa/depo/classification/question/coarse/all-raw");
    Files.write(allData, all, StandardCharsets.UTF_8);
}
Also used : Path(java.nio.file.Path) LinkedHashSet(java.util.LinkedHashSet) BlockTextLoader(zemberek.core.text.BlockTextLoader) Random(java.util.Random) ArrayList(java.util.ArrayList) TextChunk(zemberek.core.text.TextChunk)

Example 14 with BlockTextLoader

use of zemberek.core.text.BlockTextLoader in project zemberek-nlp by ahmetaa.

From the class ProcessNormalizationCorpus, method main.

/**
 * Runs the normalization corpus processing over the corpora listed in {@code clean-list}.
 *
 * <p>Builds a {@code TurkishSentenceNormalizer} from the normalization data root and language
 * model path, creates the output directory, loads the corpora through a
 * {@code BlockTextLoader} and passes them to {@code ProcessNormalizationCorpus.process} with a
 * bounded thread count.
 *
 * @param args ignored
 * @throws Exception if set-up, directory creation or processing fails
 */
public static void main(String[] args) throws Exception {
    TurkishMorphology morphology = getTurkishMorphology();
    Path normalizationDataRoot = Paths.get("/home/aaa/data/normalization/test-large");
    Path lmPath = Paths.get("/home/aaa/data/normalization/lm.slm");
    TurkishSentenceNormalizer normalizationPreprocessor = new TurkishSentenceNormalizer(morphology, normalizationDataRoot, lmPath);
    ProcessNormalizationCorpus processor = new ProcessNormalizationCorpus(normalizationPreprocessor);
    Path corporaRoot = Paths.get("/home/aaa/data/corpora");
    Path outRoot = Paths.get("/home/aaa/data/normalization/corpus/clean");
    Path rootList = corporaRoot.resolve("clean-list");
    Files.createDirectories(outRoot);
    BlockTextLoader corpusProvider = BlockTextLoader.fromDirectoryRoot(corporaRoot, rootList, BLOCK_SIZE);
    // Use half of the available processors, capped at 10. availableProcessors() / 2 is 0 on a
    // single-CPU host, so never go below one worker.
    int threadCount = Math.max(1, Math.min(10, Runtime.getRuntime().availableProcessors() / 2));
    processor.process(corpusProvider, threadCount, outRoot);
    Log.info("Done.");
}
Also used : Path(java.nio.file.Path) BlockTextLoader(zemberek.core.text.BlockTextLoader) TurkishMorphology(zemberek.morphology.TurkishMorphology) NormalizationVocabularyGenerator.getTurkishMorphology(zemberek.normalization.NormalizationVocabularyGenerator.getTurkishMorphology)

Example 15 with BlockTextLoader

use of zemberek.core.text.BlockTextLoader in project zemberek-nlp by ahmetaa.

From the class NormalizationVocabularyGenerator, method main.

/**
 * Generates normalization vocabularies from the corpora listed in {@code clean-list}.
 *
 * <p>Loads the corpora under the normalization corpus root with a block size of 30_000,
 * creates the output directory and calls {@code createVocabulary} with a bounded thread count.
 *
 * @param args ignored
 * @throws Exception if set-up, directory creation or vocabulary generation fails
 */
public static void main(String[] args) throws Exception {
    TurkishMorphology morphology = getTurkishMorphology();
    NormalizationVocabularyGenerator generator = new NormalizationVocabularyGenerator(morphology);
    Path corporaRoot = Paths.get("/home/aaa/data/normalization/corpus");
    Path outRoot = Paths.get("/home/aaa/data/normalization/vocab-clean");
    Path rootList = corporaRoot.resolve("clean-list");
    BlockTextLoader corpusProvider = BlockTextLoader.fromDirectoryRoot(corporaRoot, rootList, 30_000);
    Files.createDirectories(outRoot);
    // Create vocabularies using half of the available processors, capped at 22.
    // availableProcessors() / 2 is 0 on a single-CPU host, so never go below one worker.
    int threadCount = Math.max(1, Math.min(22, Runtime.getRuntime().availableProcessors() / 2));
    generator.createVocabulary(corpusProvider, threadCount, outRoot);
}
Also used : Path(java.nio.file.Path) BlockTextLoader(zemberek.core.text.BlockTextLoader) TurkishMorphology(zemberek.morphology.TurkishMorphology)

Aggregations

BlockTextLoader (zemberek.core.text.BlockTextLoader)15 Path (java.nio.file.Path)12 TextChunk (zemberek.core.text.TextChunk)11 ArrayList (java.util.ArrayList)7 LinkedHashSet (java.util.LinkedHashSet)6 TurkishMorphology (zemberek.morphology.TurkishMorphology)6 IOException (java.io.IOException)4 PrintWriter (java.io.PrintWriter)4 List (java.util.List)4 Collectors (java.util.stream.Collectors)4 Log (zemberek.core.logging.Log)4 Random (java.util.Random)3 BlockingExecutor (zemberek.core.concurrency.BlockingExecutor)3 SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis)3 DataInputStream (java.io.DataInputStream)2 DataOutputStream (java.io.DataOutputStream)2 Files (java.nio.file.Files)2 Arrays (java.util.Arrays)2 HashSet (java.util.HashSet)2 Set (java.util.Set)2