Search in sources :

Example 6 with Histogram

use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.

the class UnsupervisedKeyPhraseExtractor method collectCorpusStatisticsForLemmas.

/**
 * Collects per-lemma term frequencies and document frequencies over the documents of a corpus.
 *
 * <p>Each document is split into sentences, every sentence is analyzed and disambiguated, and
 * for each accepted, non-stop-word analysis the last entry of {@code getLemmas()} is counted in
 * a per-document histogram. The per-document histogram is then merged into
 * {@code statistics.termFrequencies}, and each distinct lemma of the document is counted once
 * in {@code statistics.documentFrequencies}.
 *
 * @param corpus corpus whose documents are processed in iteration order
 * @param analyzer morphological analyzer used to analyze and disambiguate each sentence
 * @param count maximum number of documents to process; a value {@code <= 0} means no limit
 * @return the collected statistics, with {@code documentCount} set to
 *     {@code min(count, corpus.documentCount())} when a limit is given, otherwise to
 *     {@code corpus.documentCount()}
 * @throws IOException declared by the signature; presumably raised by corpus access
 */
static CorpusStatistics collectCorpusStatisticsForLemmas(WebCorpus corpus, TurkishMorphology analyzer, int count) throws IOException {
    CorpusStatistics statistics = new CorpusStatistics(1_000_000);
    int docCount = 0;
    for (WebDocument document : corpus.getDocuments()) {
        Histogram<String> docHistogram = new Histogram<>();
        List<String> sentences = extractor.fromParagraphs(document.getLines());
        for (String sentence : sentences) {
            List<SingleAnalysis> analysis = analyzer.analyzeAndDisambiguate(sentence).bestAnalysis();
            for (SingleAnalysis w : analysis) {
                if (!analysisAcceptable(w)) {
                    continue;
                }
                // Stop words are matched on the string built from the stem and ending,
                // not on the lemma.
                String s = w.getStemAndEnding().concat();
                if (TurkishStopWords.DEFAULT.contains(s)) {
                    continue;
                }
                List<String> lemmas = w.getLemmas();
                docHistogram.add(lemmas.get(lemmas.size() - 1));
            }
        }
        statistics.termFrequencies.add(docHistogram);
        // Iterating the histogram visits each distinct lemma once, so this counts documents.
        for (String s : docHistogram) {
            statistics.documentFrequencies.add(s);
        }
        if (docCount++ % 500 == 0) {
            Log.info("Doc count = %d", docCount);
        }
        // docCount now equals the number of documents processed. Using ">=" stops after
        // exactly `count` documents; the previous ">" processed one document too many,
        // which disagreed with the documentCount reported below.
        if (count > 0 && docCount >= count) {
            break;
        }
    }
    statistics.documentCount = count > 0 ? Math.min(count, corpus.documentCount()) : corpus.documentCount();
    return statistics;
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) Histogram(zemberek.core.collections.Histogram) WebDocument(zemberek.corpus.WebDocument)

Example 7 with Histogram

use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.

the class UnsupervisedKeyPhraseExtractor method lemmaNgrams.

/**
 * Builds lemma n-gram histograms (orders 1 through {@code order}) for the given paragraphs.
 *
 * <p>The paragraphs are split into sentences and each sentence is analyzed and disambiguated.
 * For every order and every start position, an n-gram is formed from the last entry of
 * {@code getLemmas()} of each consecutive analysis. An n-gram is discarded if any of its
 * analyses is rejected by {@code analysisAcceptable} or if its stem-and-ending string is in
 * {@code TurkishStopWords.DEFAULT}.
 *
 * @param paragraphs the paragraphs to extract n-grams from
 * @return a list whose element at index {@code i} is the histogram of {@code (i + 1)}-grams
 */
private List<Histogram<Term>> lemmaNgrams(List<String> paragraphs) {
    List<Histogram<Term>> ngrams = new ArrayList<>(order + 1);
    for (int i = 0; i < order; i++) {
        ngrams.add(new Histogram<>(100));
    }
    // Number of tokens in the sentences that precede the current one.
    int tokenCount = 0;
    List<String> sentences = extractor.fromParagraphs(paragraphs);
    for (String sentence : sentences) {
        List<SingleAnalysis> analysis = morphology.analyzeAndDisambiguate(sentence).bestAnalysis();
        for (int i = 0; i < order; i++) {
            int currentOrder = i + 1;
            // "<=" so that the n-gram ending at the last token of the sentence is included;
            // the previous "<" bound silently dropped it (for unigrams, the last token).
            for (int j = 0; j <= analysis.size() - currentOrder; j++) {
                String[] words = new String[currentOrder];
                boolean fail = false;
                for (int k = 0; k < currentOrder; k++) {
                    SingleAnalysis a = analysis.get(j + k);
                    if (!analysisAcceptable(a)) {
                        fail = true;
                        break;
                    }
                    String surface = a.getStemAndEnding().concat();
                    if (TurkishStopWords.DEFAULT.contains(surface)) {
                        fail = true;
                        break;
                    }
                    List<String> lemmas = a.getLemmas();
                    words[k] = lemmas.get(lemmas.size() - 1);
                }
                if (!fail) {
                    Term term = new Term(words);
                    int count = ngrams.get(i).add(term);
                    if (count == 1) {
                        // If this is the first time, set the first occurrence index:
                        // tokens in earlier sentences plus the offset within this sentence.
                        term.setFirstOccurrenceIndex(tokenCount + j);
                    }
                }
            }
        }
        // Advance once per sentence. Previously this ran inside the start-position loop,
        // inflating the offset by the sentence length for every position and every order.
        tokenCount += analysis.size();
    }
    return ngrams;
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) Histogram(zemberek.core.collections.Histogram) ArrayList(java.util.ArrayList)

Example 8 with Histogram

use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.

the class GenerateDataWithRules method extractHighlyAmbigiousWordSentences.

/**
 * Scans the regular files directly under {@code inputRoot}, collects words whose analyses
 * contain more than {@code minCount} distinct stems, and writes them with their analyses to
 * {@code <outRoot>/<inputRoot name>-amb.txt}.
 *
 * <p>Sentences are read per file, processed in groups of 5000, filtered through
 * {@code getAccpetableSentences}, then analyzed and disambiguated. Sentences whose analysis
 * throws are logged and skipped. After each file, processing stops once the number of
 * collected words exceeds {@code wordCount}.
 *
 * @param inputRoot directory whose immediate files are processed
 * @param outRoot directory the output file is written into
 * @param minCount a word is collected once it has more than this many distinct stems
 * @param wordCount soft limit on collected words; checked only between files
 * @throws IOException if listing the input directory or writing the output fails
 */
private void extractHighlyAmbigiousWordSentences(Path inputRoot, Path outRoot, int minCount, int wordCount) throws IOException {
    List<Path> files;
    // Files.walk keeps the directory open until the stream is closed, so close it explicitly.
    try (java.util.stream.Stream<Path> paths = Files.walk(inputRoot, 1)) {
        files = paths.filter(s -> s.toFile().isFile()).collect(Collectors.toList());
    }
    Histogram<WordAnalysis> wordAnalyses = new Histogram<>();
    for (Path file : files) {
        Log.info("Processing %s", file);
        LinkedHashSet<String> sentences = getSentences(file);
        List<List<String>> group = group(new ArrayList<>(sentences), 5000);
        for (List<String> lines : group) {
            Log.info("Collected %d words.", wordAnalyses.size());
            LinkedHashSet<String> toProcess = getAccpetableSentences(lines);
            for (String sentence : toProcess) {
                try {
                    SentenceAnalysis sentenceAnalysis = morphology.analyzeAndDisambiguate(sentence);
                    for (SentenceWordAnalysis analysis : sentenceAnalysis) {
                        HashSet<String> stems = new HashSet<>(4);
                        for (SingleAnalysis s : analysis.getWordAnalysis()) {
                            stems.add(s.getStem());
                            // Enough distinct stems seen: record the word once and move on.
                            if (stems.size() > minCount) {
                                wordAnalyses.add(analysis.getWordAnalysis());
                                break;
                            }
                        }
                    }
                } catch (Exception e) {
                    // Best effort: a sentence that fails analysis is logged and skipped.
                    Log.warn("Error in sentence %s", sentence);
                }
            }
        }
        // The limit is only checked between files, so the final size may exceed wordCount.
        if (wordAnalyses.size() > wordCount) {
            break;
        }
    }
    String s = inputRoot.toFile().getName();
    Path amb = outRoot.resolve(s + "-amb.txt");
    try (PrintWriter pwa = new PrintWriter(amb.toFile(), "utf-8")) {
        // One record per word: the input, one line per analysis, then a blank separator line.
        for (WordAnalysis wa : wordAnalyses.getSortedList()) {
            pwa.println(wa.getInput());
            for (SingleAnalysis analysis : wa) {
                pwa.println(analysis.formatLong());
            }
            pwa.println();
        }
    }
}
Also used : Path(java.nio.file.Path) PrintWriter(java.io.PrintWriter) ResultSentence(zemberek.morphology.ambiguity.RuleBasedDisambiguator.ResultSentence) Files(java.nio.file.Files) TextUtil(zemberek.core.text.TextUtil) Predicate(java.util.function.Predicate) AnalysisDecision(zemberek.morphology.ambiguity.RuleBasedDisambiguator.AnalysisDecision) Collection(java.util.Collection) IOException(java.io.IOException) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) Collectors(java.util.stream.Collectors) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) List(java.util.List) AmbiguityAnalysis(zemberek.morphology.ambiguity.RuleBasedDisambiguator.AmbiguityAnalysis) Lists(com.google.common.collect.Lists) Paths(java.nio.file.Paths) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) Log(zemberek.core.logging.Log) Path(java.nio.file.Path) Collections(java.util.Collections) LinkedHashSet(java.util.LinkedHashSet) Histogram(zemberek.core.collections.Histogram) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) Histogram(zemberek.core.collections.Histogram) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) IOException(java.io.IOException) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis) ArrayList(java.util.ArrayList) List(java.util.List) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) PrintWriter(java.io.PrintWriter)

Example 9 with Histogram

use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.

the class ZemberekNlpScripts method unknownZemberek.

/**
 * Utility script (not a real test): loads the word frequency histogram from
 * {@code vocab.all.freq}, removes every word listed in {@code out/zemberek-parsed-words.txt},
 * and writes the remaining words twice under {@code out}: once in the histogram's default
 * sorted order and once ordered by {@code turkishCollator}.
 *
 * @throws IOException if any of the input or output files cannot be read or written
 */
@Test
@Ignore("Not a Test.")
public void unknownZemberek() throws IOException {
    Path frequencyFile = DATA_PATH.resolve("vocab.all.freq");
    Log.info("Loading histogram.");
    Histogram<String> wordCounts = Histogram.loadFromUtf8File(frequencyFile, ' ');

    Path outDir = DATA_PATH.resolve("out");
    Log.info("Loading parseable.");
    Path parsedWordsFile = outDir.resolve("zemberek-parsed-words.txt");
    List<String> parsedWords = Files.readAllLines(parsedWordsFile);
    wordCounts.removeAll(parsedWords);
    // Optional extra filter, currently disabled: wordCounts.removeSmaller(10);

    Log.info("Saving.");
    Path defaultOrderOut = outDir.resolve("no-parse-zemberek-freq.txt");
    Path collatedOut = outDir.resolve("no-parse-zemberek-tr.txt");
    Files.write(defaultOrderOut, wordCounts.getSortedList());
    Files.write(collatedOut, wordCounts.getSortedList((left, right) -> turkishCollator.compare(left, right)));
}
Also used : Path(java.nio.file.Path) TurkishMorphotactics(zemberek.morphology.morphotactics.TurkishMorphotactics) URLDecoder(java.net.URLDecoder) Stopwatch(com.google.common.base.Stopwatch) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) Multimap(com.google.common.collect.Multimap) TurkishDictionaryLoader(zemberek.morphology.lexicon.tr.TurkishDictionaryLoader) ArrayList(java.util.ArrayList) DictionaryItem(zemberek.morphology.lexicon.DictionaryItem) HashSet(java.util.HashSet) Turkish(zemberek.core.turkish.Turkish) Token(zemberek.tokenization.Token) HashMultimap(com.google.common.collect.HashMultimap) Lists(com.google.common.collect.Lists) Locale(java.util.Locale) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) PrimaryPos(zemberek.core.turkish.PrimaryPos) TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) Log(zemberek.core.logging.Log) Splitter(com.google.common.base.Splitter) Path(java.nio.file.Path) LinkedHashMultimap(com.google.common.collect.LinkedHashMultimap) LinkedHashSet(java.util.LinkedHashSet) Collator(java.text.Collator) Histogram(zemberek.core.collections.Histogram) SecondaryPos(zemberek.core.turkish.SecondaryPos) PrintWriter(java.io.PrintWriter) Morpheme(zemberek.morphology.morphotactics.Morpheme) Files(java.nio.file.Files) TurkishMorphology(zemberek.morphology.TurkishMorphology) Collection(java.util.Collection) Set(java.util.Set) IOException(java.io.IOException) Test(org.junit.Test) Collectors(java.util.stream.Collectors) StandardCharsets(java.nio.charset.StandardCharsets) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) OflazerAnalyzerRunner(zemberek.morphology.external.OflazerAnalyzerRunner) TimeUnit(java.util.concurrent.TimeUnit) List(java.util.List) Ignore(org.junit.Ignore) Paths(java.nio.file.Paths) TextIO(zemberek.core.text.TextIO) TurkishAlphabet(zemberek.core.turkish.TurkishAlphabet) RootLexicon(zemberek.morphology.lexicon.RootLexicon) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 10 with Histogram

use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.

the class DocumentSimilarityExperiment method removeDuplicates.

/**
 * Copies {@code input} to {@code output}, keeping at most {@code k} occurrences of each
 * distinct line. Lines are written in their original order, so the first {@code k}
 * occurrences of a line survive.
 *
 * @param input file whose lines are read
 * @param output file the surviving lines are written to, encoded as UTF-8
 * @param k maximum number of occurrences kept per distinct line
 * @throws IOException if reading the input or opening the output fails
 */
public void removeDuplicates(Path input, Path output, int k) throws IOException {
    List<String> lines = Files.readAllLines(input);
    Log.info("Sentence count = %d", lines.size());

    // Remaining "budget" per distinct line: starts at its frequency, capped at k below.
    Histogram<String> remaining = new Histogram<>(10_000_000);
    remaining.add(lines);
    for (String line : remaining.getSortedList()) {
        if (remaining.getCount(line) > k) {
            remaining.set(line, k);
        }
    }

    int written = 0;
    try (PrintWriter writer = new PrintWriter(output.toFile(), "utf-8")) {
        for (String line : lines) {
            if (remaining.getCount(line) <= 0) {
                // Budget for this line is used up; drop the extra occurrence.
                continue;
            }
            writer.println(line);
            remaining.decrementIfPositive(line);
            written++;
        }
    }
    Log.info("New count = %d", written);
}
Also used : Histogram(zemberek.core.collections.Histogram) PrintWriter(java.io.PrintWriter)

Aggregations

Histogram (zemberek.core.collections.Histogram): 39, WordAnalysis (zemberek.morphology.analysis.WordAnalysis): 17, Path (java.nio.file.Path): 15, SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis): 14, ArrayList (java.util.ArrayList): 13, TurkishMorphology (zemberek.morphology.TurkishMorphology): 12, Token (zemberek.tokenization.Token): 12, Stopwatch (com.google.common.base.Stopwatch): 11, PrintWriter (java.io.PrintWriter): 11, LinkedHashSet (java.util.LinkedHashSet): 11, SentenceAnalysis (zemberek.morphology.analysis.SentenceAnalysis): 11, IOException (java.io.IOException): 10, Files (java.nio.file.Files): 10, Paths (java.nio.file.Paths): 10, List (java.util.List): 10, Collectors (java.util.stream.Collectors): 10, TurkishTokenizer (zemberek.tokenization.TurkishTokenizer): 10, StandardCharsets (java.nio.charset.StandardCharsets): 9, HashSet (java.util.HashSet): 9, Log (zemberek.core.logging.Log): 9