Search in sources :

Example 11 with Token

use of zemberek.tokenization.Token in project zemberek-nlp by ahmetaa.

the class ClassificationExampleBase method removeNonWords.

/**
 * Tokenizes {@code sentence} with the default Turkish tokenizer and rebuilds it,
 * separated by single spaces, from the tokens worth keeping.
 *
 * <p>A token whose text starts with {@code "_"} or contains {@code "__"} (label and
 * ending words) is always kept and its type is never inspected. Any other token is
 * dropped when its type is Mention, HashTag, URL, Punctuation, RomanNumeral, Time,
 * UnknownWord or Unknown.
 *
 * @param sentence the raw sentence to filter
 * @return the surviving token texts joined with a single space
 */
protected String removeNonWords(String sentence) {
    List<Token> tokens = TurkishTokenizer.DEFAULT.tokenize(sentence);
    List<String> kept = new ArrayList<>(tokens.size());
    for (Token current : tokens) {
        String content = current.getText();
        boolean marker = content.startsWith("_") || content.contains("__");
        if (!marker) {
            // Only non-marker tokens are subject to the type filter.
            Token.Type kind = current.getType();
            boolean nonWord = kind == Token.Type.Mention
                || kind == Token.Type.HashTag
                || kind == Token.Type.URL
                || kind == Token.Type.Punctuation
                || kind == Token.Type.RomanNumeral
                || kind == Token.Type.Time
                || kind == Token.Type.UnknownWord
                || kind == Token.Type.Unknown;
            if (nonWord) {
                continue;
            }
        }
        kept.add(content);
    }
    return String.join(" ", kept);
}
Also used : Type(zemberek.tokenization.Token.Type) ArrayList(java.util.ArrayList) Token(zemberek.tokenization.Token)

Example 12 with Token

use of zemberek.tokenization.Token in project zemberek-nlp by ahmetaa.

the class UnsupervisedKeyPhraseExtractor method collectCorpusStatistics.

/**
 * Builds term and document frequency statistics for every document in {@code corpus}.
 *
 * <p>For each document a fresh histogram is filled with the normalized text of every
 * token that passes {@code tokenTypeAccpetable} and is not a Turkish stop word. That
 * histogram is added to {@code termFrequencies}, and each entry yielded by iterating
 * it is added once to {@code documentFrequencies}.
 *
 * @param corpus the corpus whose documents are scanned
 * @return the populated statistics, with {@code documentCount} taken from the corpus
 * @throws IOException declared by the signature; presumably raised by a helper — confirm
 */
static CorpusStatistics collectCorpusStatistics(WebCorpus corpus) throws IOException {
    CorpusStatistics stats = new CorpusStatistics(1_000_000);
    for (WebDocument doc : corpus.getDocuments()) {
        Histogram<String> termCounts = new Histogram<>();
        for (String sentence : extractor.fromParagraphs(doc.getLines())) {
            for (Token token : lexer.tokenize(sentence)) {
                if (tokenTypeAccpetable(token)) {
                    String term = normalize(token.getText());
                    if (!TurkishStopWords.DEFAULT.contains(term)) {
                        termCounts.add(term);
                    }
                }
            }
        }
        stats.termFrequencies.add(termCounts);
        // One document-frequency increment per entry of this document's histogram.
        for (String term : termCounts) {
            stats.documentFrequencies.add(term);
        }
    }
    stats.documentCount = corpus.documentCount();
    return stats;
}
Also used : Histogram(zemberek.core.collections.Histogram) WebDocument(zemberek.corpus.WebDocument) Token(zemberek.tokenization.Token)

Example 13 with Token

use of zemberek.tokenization.Token in project zemberek-nlp by ahmetaa.

the class UnsupervisedKeyPhraseExtractor method collectGrams.

/**
 * Collects every n-gram of length {@code order} from {@code tokens} into {@code grams}.
 *
 * <p>A window is skipped as soon as one of its tokens fails {@code tokenTypeAccpetable}
 * or normalizes to a Turkish stop word. When a term is seen for the first time (the
 * histogram reports a count of 1), its first occurrence index is set to
 * {@code offset + i}, where {@code i} is the window's start index in {@code tokens}.
 *
 * @param tokens tokens to scan
 * @param grams  histogram receiving the collected terms
 * @param order  number of tokens per gram; assumed to be at least 1 — TODO confirm with callers
 * @param offset value added to the window start index when recording first occurrence
 */
void collectGrams(List<Token> tokens, Histogram<Term> grams, int order, int offset) {
    // The last valid window starts at size - order, so the bound is inclusive.
    // The previous strict "<" silently dropped the gram ending on the final token.
    int lastStart = tokens.size() - order;
    for (int i = 0; i <= lastStart; i++) {
        String[] words = new String[order];
        boolean fail = false;
        for (int j = 0; j < order; j++) {
            Token token = tokens.get(i + j);
            if (!tokenTypeAccpetable(token)) {
                fail = true;
                break;
            }
            String word = normalize(token.getText());
            if (TurkishStopWords.DEFAULT.contains(word)) {
                fail = true;
                break;
            }
            words[j] = word;
        }
        if (!fail) {
            Term term = new Term(words);
            int count = grams.add(term);
            if (count == 1) {
                // if this is the first time, set the first occurrence index.
                term.setFirstOccurrenceIndex(offset + i);
            }
        }
    }
}
Also used : Token(zemberek.tokenization.Token)

Example 14 with Token

use of zemberek.tokenization.Token in project zemberek-nlp by ahmetaa.

the class AutomaticLabelingExperiment method processContent.

/**
 * Produces a lower-cased, apostrophe-free version of {@code content} with selected
 * token types removed.
 *
 * <p>Tokens of type PercentNumeral, Number, Punctuation, RomanNumeral, Time,
 * UnknownWord or Unknown are dropped and the rest are joined with single spaces.
 * When {@code useRoots} is true the joined text is analyzed and disambiguated, and
 * each word is replaced by the last lemma of its best analysis. A word whose best
 * analysis is unknown keeps its input form, and a word with no lemmas is omitted.
 *
 * @param analyzer morphology used only when {@code useRoots} is true
 * @param content  raw text to process
 * @param useRoots whether to replace words with lemmas
 * @return the processed text, lower-cased with {@code Turkish.LOCALE}
 */
public String processContent(TurkishMorphology analyzer, String content, boolean useRoots) {
    List<Token> tokens = lexer.tokenize(content);
    List<String> kept = new ArrayList<>(tokens.size());
    for (Token token : tokens) {
        Token.Type type = token.getType();
        boolean excluded = type == Token.Type.PercentNumeral
            || type == Token.Type.Number
            || type == Token.Type.Punctuation
            || type == Token.Type.RomanNumeral
            || type == Token.Type.Time
            || type == Token.Type.UnknownWord
            || type == Token.Type.Unknown;
        if (!excluded) {
            kept.add(token.getText());
        }
    }
    String text = String.join(" ", kept);
    if (useRoots) {
        SentenceAnalysis sentenceAnalysis = analyzer.analyzeAndDisambiguate(text);
        List<String> roots = new ArrayList<>();
        for (SentenceWordAnalysis wordResult : sentenceAnalysis) {
            SingleAnalysis best = wordResult.getBestAnalysis();
            if (best.isUnknown()) {
                // No usable analysis: fall back to the word as it was given.
                roots.add(wordResult.getWordAnalysis().getInput());
            } else {
                List<String> lemmas = best.getLemmas();
                if (!lemmas.isEmpty()) {
                    roots.add(lemmas.get(lemmas.size() - 1));
                }
            }
        }
        text = String.join(" ", roots);
    }
    return text.replaceAll("[']", "").toLowerCase(Turkish.LOCALE);
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) ArrayList(java.util.ArrayList) Token(zemberek.tokenization.Token) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis)

Example 15 with Token

use of zemberek.tokenization.Token in project zemberek-nlp by ahmetaa.

the class _MorphologicalAmbiguityResolverExperiment method collect.

/**
 * Reads sentences from {@code p} and returns those in which every token can be
 * analyzed within the allowed ambiguity.
 *
 * <p>Each sentence is normalized first: whitespace runs and U+00A0 become a single
 * space, U+00AD is removed, and the ellipsis character becomes {@code "..."}. Each
 * token is capitalized if its first character is upper case and lower-cased otherwise,
 * then analyzed, with results stored in {@code cache}. A sentence is rejected as soon
 * as a token has no analysis, has more than {@code maxAnalysisCount} analyses, or
 * loses every analysis to the proper-noun filter. That filter removes ProperNoun
 * analyses for tokens starting with a lower-case character. Un-analyzable words for
 * which {@code Strings.containsNone(word, "0123456789-.")} holds are added to
 * {@code failedWords}. Progress is logged every 2000 sentences.
 *
 * @param p                path handed to {@code getSentences}
 * @param maxAnalysisCount maximum number of analyses a token may have
 * @return the accepted sentences with their per-word analyses
 * @throws IOException declared by the signature; presumably from reading {@code p} — confirm
 */
private List<SingleAnalysisSentence> collect(Path p, int maxAnalysisCount) throws IOException {
    List<String> sentences = getSentences(p);
    TurkishMorphology analyzer = TurkishMorphology.createWithDefaults();
    int tokenCount = 0;
    int sentenceCount = 0;
    List<SingleAnalysisSentence> accepted = new ArrayList<>();
    for (String sentence : sentences) {
        sentence = sentence.replaceAll("\\s+|\\u00a0", " ");
        sentence = sentence.replaceAll("[\\u00ad]", "");
        sentence = sentence.replaceAll("[…]", "...");
        List<Single> words = new ArrayList<>();
        boolean sentenceOk = true;
        for (Token token : TurkishTokenizer.DEFAULT.tokenize(sentence)) {
            tokenCount++;
            String rawWord = token.getText();
            String word;
            if (Character.isUpperCase(rawWord.charAt(0))) {
                word = Turkish.capitalize(rawWord);
            } else {
                word = rawWord.toLowerCase(Turkish.LOCALE);
            }
            WordAnalysis results;
            if (!cache.containsKey(word)) {
                results = analyzer.analyze(word);
                cache.put(word, results);
            } else {
                results = cache.get(word);
            }
            int analysisCount = results.analysisCount();
            if (analysisCount == 0 && Strings.containsNone(word, "0123456789-.")) {
                failedWords.add(word);
            }
            if (analysisCount < 1 || analysisCount > maxAnalysisCount) {
                sentenceOk = false;
                break;
            }
            // Keep an analysis unless it is a proper noun for a lower-case token.
            boolean startsLower = Character.isLowerCase(rawWord.charAt(0));
            List<SingleAnalysis> usable = results.stream()
                .filter(s -> s.getDictionaryItem().secondaryPos != SecondaryPos.ProperNoun || !startsLower)
                .collect(Collectors.toList());
            if (usable.isEmpty()) {
                sentenceOk = false;
                break;
            }
            // The word index equals the number of words accepted so far.
            words.add(new Single(word, words.size(), results.copyFor(usable)));
        }
        if (sentenceOk) {
            accepted.add(new SingleAnalysisSentence(sentence, words));
        }
        sentenceCount++;
        if (sentenceCount % 2000 == 0) {
            Log.info("%d sentences %d tokens analyzed. %d found", sentenceCount, tokenCount, accepted.size());
        }
    }
    return accepted;
}
Also used : Strings(zemberek.core.io.Strings) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) Turkish(zemberek.core.turkish.Turkish) Token(zemberek.tokenization.Token) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) Map(java.util.Map) TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) Log(zemberek.core.logging.Log) Path(java.nio.file.Path) LinkedHashSet(java.util.LinkedHashSet) Histogram(zemberek.core.collections.Histogram) SecondaryPos(zemberek.core.turkish.SecondaryPos) PrintWriter(java.io.PrintWriter) Files(java.nio.file.Files) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) StandardCharsets(java.nio.charset.StandardCharsets) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) Objects(java.util.Objects) List(java.util.List) Paths(java.nio.file.Paths) TurkishSentenceExtractor(zemberek.tokenization.TurkishSentenceExtractor) LanguageIdentifier(zemberek.langid.LanguageIdentifier) Pattern(java.util.regex.Pattern) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) Token(zemberek.tokenization.Token)

Aggregations

Token (zemberek.tokenization.Token): 25, ArrayList (java.util.ArrayList): 13, TurkishTokenizer (zemberek.tokenization.TurkishTokenizer): 8, Histogram (zemberek.core.collections.Histogram): 7, TurkishMorphology (zemberek.morphology.TurkishMorphology): 6, SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis): 5, Stopwatch (com.google.common.base.Stopwatch): 4, Path (java.nio.file.Path): 4, SentenceAnalysis (zemberek.morphology.analysis.SentenceAnalysis): 4, WordAnalysis (zemberek.morphology.analysis.WordAnalysis): 4, IOException (java.io.IOException): 3, LinkedHashSet (java.util.LinkedHashSet): 3, Ignore (org.junit.Ignore): 3, Test (org.junit.Test): 3, SentenceWordAnalysis (zemberek.morphology.analysis.SentenceWordAnalysis): 3, TurkishSentenceExtractor (zemberek.tokenization.TurkishSentenceExtractor): 3, StandardCharsets (java.nio.charset.StandardCharsets): 2, Files (java.nio.file.Files): 2, Paths (java.nio.file.Paths): 2, List (java.util.List): 2