Search in sources :

Example 16 with Token

use of zemberek.tokenization.Token in project zemberek-nlp by ahmetaa.

The method generateSets of the class CategoryPredictionExperiment.

/**
 * Builds fastText-style labeled train/test sets for category prediction from a web corpus.
 * <p>
 * Documents without a category, or whose category occurs fewer than 20 times in the corpus,
 * are skipped. Each surviving document becomes one line of the form
 * {@code #<id> __label__<category> <content>}. Non-word tokens (numbers, punctuation, etc.)
 * are removed; if {@code useLemmas} is set, words are replaced by the last lemma of their
 * best morphological analysis.
 *
 * @param input        path of the serialized {@code WebCorpus} documents to load.
 * @param train        output path for the training set.
 * @param test         output path for the test set.
 * @param useOnlyTitle if true, only document titles are used (title-less documents skipped).
 * @param useLemmas    if true, content words are reduced to lemmas via disambiguation.
 * @throws IOException if the corpus cannot be read or the sets cannot be written.
 */
private void generateSets(Path input, Path train, Path test, boolean useOnlyTitle, boolean useLemmas) throws IOException {
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    WebCorpus corpus = new WebCorpus("category", "category");
    Log.info("Loading corpus from %s", input);
    corpus.addDocuments(WebCorpus.loadDocuments(input));
    List<String> set = new ArrayList<>(corpus.documentCount());
    TurkishTokenizer lexer = TurkishTokenizer.DEFAULT;
    // Count category frequencies so that rare labels can be dropped.
    Histogram<String> categoryCounts = new Histogram<>();
    for (WebDocument document : corpus.getDocuments()) {
        String category = document.getCategory();
        if (!category.isEmpty()) {
            categoryCounts.add(category);
        }
    }
    Log.info("All category count = %d", categoryCounts.size());
    // Labels seen fewer than 20 times carry too little training signal; remove them.
    categoryCounts.removeSmaller(20);
    for (String c : categoryCounts.getSortedList()) {
        System.out.println(c + " " + categoryCounts.getCount(c));
    }
    Log.info("Reduced label count = %d", categoryCounts.size());
    Log.info("Extracting data from %d documents ", corpus.documentCount());
    int c = 0;
    for (WebDocument document : corpus.getDocuments()) {
        String category = document.getCategory();
        // Guard clauses: skip unlabeled documents, title-less documents in title mode,
        // and documents whose label was removed as too rare.
        if (category.isEmpty()) {
            continue;
        }
        if (useOnlyTitle && document.getTitle().isEmpty()) {
            continue;
        }
        if (!categoryCounts.contains(category)) {
            continue;
        }
        // fastText label convention: "__label__" prefix, spaces collapsed to underscores.
        String label = "__label__" + category.replaceAll("[ ]+", "_").toLowerCase(Turkish.LOCALE);
        String text = useOnlyTitle ? document.getTitle() : document.getContentAsString();
        List<Token> docTokens = lexer.tokenize(text);
        List<String> reduced = new ArrayList<>(docTokens.size());
        for (Token token : docTokens) {
            // Keep only tokens that may carry categorization signal.
            if (isNonWord(token)) {
                continue;
            }
            reduced.add(token.getText());
        }
        String join = String.join(" ", reduced);
        if (join.trim().isEmpty()) {
            continue;
        }
        if (useLemmas) {
            // Replace every word with the last (most specific) lemma of its best analysis;
            // unknown words are kept as-is.
            SentenceAnalysis analysis = morphology.analyzeAndDisambiguate(join);
            List<String> res = new ArrayList<>();
            for (SentenceWordAnalysis e : analysis) {
                SingleAnalysis best = e.getBestAnalysis();
                if (best.isUnknown()) {
                    res.add(e.getWordAnalysis().getInput());
                    continue;
                }
                List<String> lemmas = best.getLemmas();
                if (lemmas.isEmpty()) {
                    continue;
                }
                res.add(lemmas.get(lemmas.size() - 1));
            }
            join = String.join(" ", res);
        }
        // Strip apostrophes (Turkish proper-noun suffix separator) and lowercase.
        set.add("#" + document.getId() + " " + label + " " + join.replaceAll("[']", "").toLowerCase(Turkish.LOCALE));
        // Pre-increment so the log reports the true processed count (1000, 2000, ...),
        // fixing the previous off-by-one ("1 of N", "1001 of N").
        if (++c % 1000 == 0) {
            Log.info("%d of %d processed.", c, corpus.documentCount());
        }
    }
    Log.info("Generate train and test set.");
    saveSets(train, test, new LinkedHashSet<>(set));
}

/**
 * Returns true for token types that carry no categorization signal
 * (numbers, punctuation, time expressions and unknown tokens).
 */
private static boolean isNonWord(Token token) {
    switch (token.getType()) {
        case PercentNumeral:
        case Number:
        case Punctuation:
        case RomanNumeral:
        case Time:
        case UnknownWord:
        case Unknown:
            return true;
        default:
            return false;
    }
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) Histogram(zemberek.core.collections.Histogram) ArrayList(java.util.ArrayList) Token(zemberek.tokenization.Token) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) TurkishMorphology(zemberek.morphology.TurkishMorphology) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis) WebDocument(zemberek.corpus.WebDocument) TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) WebCorpus(zemberek.corpus.WebCorpus)

Example 17 with Token

use of zemberek.tokenization.Token in project zemberek-nlp by ahmetaa.

The method generateHistograms of the class WordHistogram.

/**
 * Computes word and root frequency histograms over a list of paragraphs and writes
 * them (sorted by count and by key) under {@code outRoot}. Numerals, punctuation,
 * unknown words and numeric runtime forms are excluded. Proper nouns are capitalized,
 * all other surface forms lowercased, before counting.
 *
 * @param paragraphs input text, one paragraph per element.
 * @param outRoot    output directory; created if missing.
 * @throws IOException if the output files cannot be written.
 */
static void generateHistograms(List<String> paragraphs, Path outRoot) throws IOException {
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    Histogram<String> rootHistogram = new Histogram<>(1000_000);
    Histogram<String> wordHistogram = new Histogram<>(1000_000);
    int paragraphCounter = 0;
    int sentenceCounter = 0;
    int tokenCounter = 0;
    for (String paragraph : paragraphs) {
        List<String> sentenceList = TurkishSentenceExtractor.DEFAULT.fromParagraph(paragraph);
        sentenceCounter += sentenceList.size();
        for (String sentence : sentenceList) {
            tokenCounter += TurkishTokenizer.DEFAULT.tokenize(sentence).size();
            for (SentenceWordAnalysis wordAnalysis : morphology.analyzeAndDisambiguate(sentence)) {
                SingleAnalysis top = wordAnalysis.getBestAnalysis();
                String surface = wordAnalysis.getWordAnalysis().getInput();
                // Skip tokens that should not be counted: numerals, punctuation,
                // unknown words, digit-containing runtime forms, and lemma-less analyses.
                boolean skip = top.getPos() == PrimaryPos.Numeral
                    || top.getPos() == PrimaryPos.Punctuation
                    || top.isUnknown()
                    || (top.isRuntime() && !Strings.containsNone(surface, "01234567890"))
                    || top.getLemmas().size() == 0;
                if (skip) {
                    continue;
                }
                rootHistogram.add(top.getDictionaryItem().lemma);
                // Normalize casing: capitalize proper nouns, lowercase everything else.
                String normalized = top.getDictionaryItem().secondaryPos == SecondaryPos.ProperNoun
                    ? Turkish.capitalize(surface)
                    : surface.toLowerCase(Turkish.LOCALE);
                wordHistogram.add(normalized);
            }
        }
        if (++paragraphCounter % 1000 == 0) {
            System.out.println(paragraphCounter + " of " + paragraphs.size());
        }
    }
    System.out.println("tokenCounter = " + tokenCounter);
    System.out.println("sentenceCounter = " + sentenceCounter);
    Files.createDirectories(outRoot);
    rootHistogram.saveSortedByCounts(outRoot.resolve("roots.freq.txt"), " ");
    rootHistogram.saveSortedByKeys(outRoot.resolve("roots.keys.txt"), " ", Turkish.STRING_COMPARATOR_ASC);
    wordHistogram.saveSortedByCounts(outRoot.resolve("words.freq.txt"), " ");
    wordHistogram.saveSortedByKeys(outRoot.resolve("words.keys.txt"), " ", Turkish.STRING_COMPARATOR_ASC);
    // Drop words seen fewer than 10 times and write the reduced histograms as well.
    wordHistogram.removeSmaller(10);
    wordHistogram.saveSortedByCounts(outRoot.resolve("words10.freq.txt"), " ");
    wordHistogram.saveSortedByKeys(outRoot.resolve("words10.keys.txt"), " ", Turkish.STRING_COMPARATOR_ASC);
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) Histogram(zemberek.core.collections.Histogram) Token(zemberek.tokenization.Token) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) TurkishMorphology(zemberek.morphology.TurkishMorphology) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis)

Example 18 with Token

use of zemberek.tokenization.Token in project zemberek-nlp by ahmetaa.

The method removeNonWords of the class ClassificationConsole.

/**
 * Removes non-word tokens (numbers, punctuation, time expressions, unknowns) from a
 * sentence, keeping any token whose text contains "__" (label markers) regardless of type.
 *
 * @param sentence input sentence.
 * @return space-joined string of the surviving token texts.
 */
private String removeNonWords(String sentence) {
    List<Token> tokens = TurkishTokenizer.DEFAULT.tokenize(sentence);
    List<String> kept = new ArrayList<>(tokens.size());
    for (Token t : tokens) {
        boolean nonWord;
        switch (t.getType()) {
            case PercentNumeral:
            case Number:
            case Punctuation:
            case RomanNumeral:
            case Time:
            case UnknownWord:
            case Unknown:
                nonWord = true;
                break;
            default:
                nonWord = false;
        }
        // Non-word tokens are dropped unless they carry a "__" marker.
        if (nonWord && !t.getText().contains("__")) {
            continue;
        }
        kept.add(t.getText());
    }
    return String.join(" ", kept);
}
Also used : ArrayList(java.util.ArrayList) Token(zemberek.tokenization.Token)

Example 19 with Token

use of zemberek.tokenization.Token in project zemberek-nlp by ahmetaa.

The method tokenIterator of the class TurkishTokenizationExample.

/**
 * Demonstrates low-level, lazy tokenization of a fixed Turkish sentence by iterating
 * over the tokenizer's token iterator and printing each token.
 */
public static void tokenIterator() {
    System.out.println("Low level tokenization iterator using Ant-lr Lexer.");
    String input = "İstanbul'a, merhaba!";
    System.out.println("Input = " + input);
    // Consume the iterator lazily; each token is printed as it is produced.
    for (Iterator<Token> it = tokenizer.getTokenIterator(input); it.hasNext(); ) {
        System.out.println(it.next());
    }
}
Also used : Token(zemberek.tokenization.Token)

Example 20 with Token

use of zemberek.tokenization.Token in project zemberek-nlp by ahmetaa.

The method normalize of the class TurkishSentenceNormalizer.

/**
 * Normalizes a possibly noisy/informal Turkish sentence.
 * <p>
 * For each token, a set of candidate replacements is collected from several sources
 * (manual lookup, random-walk graph lookup, ascii-equivalent lookup, informal-to-formal
 * morphological conversion, and — for unanalyzable words — spelling suggestions). The
 * final sentence is then chosen by Viterbi decoding over the candidate lattice.
 *
 * @param sentence input sentence; returned unchanged if blank.
 * @return the normalized sentence, tokens joined with single spaces.
 */
public String normalize(String sentence) {
    // Blank input: nothing to normalize.
    if (sentence.trim().length() == 0) {
        return sentence;
    }
    String processed = preProcess(sentence);
    List<Token> tokens = TurkishTokenizer.DEFAULT.tokenize(processed);
    List<Candidates> candidatesList = new ArrayList<>();
    for (int i = 0; i < tokens.size(); i++) {
        Token currentToken = tokens.get(i);
        String current = currentToken.getText();
        // Neighboring surface forms provide context for the spell checker;
        // null at sentence boundaries.
        String next = i == tokens.size() - 1 ? null : tokens.get(i + 1).getText();
        String previous = i == 0 ? null : tokens.get(i - 1).getText();
        // LinkedHashSet keeps candidate insertion order (lookup sources are tried
        // in decreasing-priority order) while de-duplicating.
        LinkedHashSet<String> candidates = new LinkedHashSet<>(2);
        // add matches from manual lookup
        candidates.addAll(lookupManual.get(current));
        // add matches from random walk
        candidates.addAll(lookupFromGraph.get(current));
        // add matches from ascii equivalents.
        // TODO: this may decrease accuracy. Also, this can be eliminated with ascii tolerant analyzer.
        candidates.addAll(lookupFromAscii.get(current));
        // add matches from informal analysis to formal surface conversion.
        WordAnalysis analyses = informalAsciiTolerantMorphology.analyze(current);
        for (SingleAnalysis analysis : analyses) {
            if (analysis.containsInformalMorpheme()) {
                // Informal morphemes present: convert the analysis to its formal
                // surface form (convert may fail, hence the null check).
                WordGenerator.Result result = analysisConverter.convert(current, analysis);
                if (result != null) {
                    candidates.add(result.surface);
                }
            } else {
                // Formal analysis: regenerate surface forms from the dictionary item
                // and morpheme sequence and add all of them as candidates.
                List<WordGenerator.Result> results = morphology.getWordGenerator().generate(analysis.getDictionaryItem(), analysis.getMorphemes());
                for (Result result : results) {
                    candidates.add(result.surface);
                }
            }
        }
        // get top 3 1 distance matches.
        // Only consult the spell checker when the word has no analysis at all and is
        // long enough (>3 chars) for suggestions to be meaningful.
        if ((analyses.analysisCount() == 0) && current.length() > 3) {
            List<String> spellCandidates = spellChecker.suggestForWord(current, previous, next, lm);
            if (spellCandidates.size() > 3) {
                spellCandidates = new ArrayList<>(spellCandidates.subList(0, 3));
            }
            candidates.addAll(spellCandidates);
        }
        // if still there is no match, add the word itself.
        // NOTE(review): the word itself is also kept whenever it analyzes as correct
        // under the formal morphology, so already-correct words stay in the lattice.
        if (candidates.isEmpty() || morphology.analyze(current).isCorrect()) {
            candidates.add(current);
        }
        Candidates result = new Candidates(currentToken.getText(), candidates.stream().map(Candidate::new).collect(Collectors.toList()));
        candidatesList.add(result);
    }
    // Apply Viterbi decoding and return result.
    return String.join(" ", decode(candidatesList));
}
Also used : LinkedHashSet(java.util.LinkedHashSet) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) Result(zemberek.morphology.generator.WordGenerator.Result) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) Token(zemberek.tokenization.Token) WordGenerator(zemberek.morphology.generator.WordGenerator) Result(zemberek.morphology.generator.WordGenerator.Result)

Aggregations

Token (zemberek.tokenization.Token)25 ArrayList (java.util.ArrayList)13 TurkishTokenizer (zemberek.tokenization.TurkishTokenizer)8 Histogram (zemberek.core.collections.Histogram)7 TurkishMorphology (zemberek.morphology.TurkishMorphology)6 SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis)5 Stopwatch (com.google.common.base.Stopwatch)4 Path (java.nio.file.Path)4 SentenceAnalysis (zemberek.morphology.analysis.SentenceAnalysis)4 WordAnalysis (zemberek.morphology.analysis.WordAnalysis)4 IOException (java.io.IOException)3 LinkedHashSet (java.util.LinkedHashSet)3 Ignore (org.junit.Ignore)3 Test (org.junit.Test)3 SentenceWordAnalysis (zemberek.morphology.analysis.SentenceWordAnalysis)3 TurkishSentenceExtractor (zemberek.tokenization.TurkishSentenceExtractor)3 StandardCharsets (java.nio.charset.StandardCharsets)2 Files (java.nio.file.Files)2 Paths (java.nio.file.Paths)2 List (java.util.List)2