Search in sources :

Example 1 with SecondaryPos

use of zemberek.core.turkish.SecondaryPos in project zemberek-nlp by ahmetaa.

the class _MorphologicalAmbiguityResolverExperiment method collect.

/**
 * Reads the sentences of a file and keeps those in which every token has between one and
 * {@code maxAnalysisCount} morphological analyses.
 *
 * <p>A sentence is dropped as soon as one of its tokens has no analysis, has more than
 * {@code maxAnalysisCount} analyses, or has no analysis left after proper-noun analyses are
 * discarded for a token whose first character is lower case. Analysis results are looked up in
 * and stored to {@code cache}; words without any analysis that contain none of the characters
 * {@code 0123456789-.} are recorded in {@code failedWords}.
 *
 * @param p file handed to {@code getSentences}
 * @param maxAnalysisCount largest number of analyses a token may have for its sentence to be kept
 * @return the accepted sentences, each paired with the per-token analyses that survived filtering
 * @throws IOException declared for the sentence loading step
 */
private List<SingleAnalysisSentence> collect(Path p, int maxAnalysisCount) throws IOException {
    List<String> lines = getSentences(p);
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    List<SingleAnalysisSentence> accepted = new ArrayList<>();
    int tokensSeen = 0;
    int sentencesSeen = 0;
    for (String line : lines) {
        // Whitespace runs and no-break spaces become a plain space, soft hyphens are dropped,
        // and the single-character ellipsis is expanded to three dots.
        String sentence = line
            .replaceAll("\\s+|\\u00a0", " ")
            .replaceAll("[\\u00ad]", "")
            .replaceAll("[…]", "...");
        List<Single> words = new ArrayList<>();
        boolean usable = true;
        for (Token token : TurkishTokenizer.DEFAULT.tokenize(sentence)) {
            tokensSeen++;
            String rawWord = token.getText();
            String word;
            if (Character.isUpperCase(rawWord.charAt(0))) {
                word = Turkish.capitalize(rawWord);
            } else {
                word = rawWord.toLowerCase(Turkish.LOCALE);
            }

            WordAnalysis analysis;
            if (!cache.containsKey(word)) {
                analysis = morphology.analyze(word);
                cache.put(word, analysis);
            } else {
                analysis = cache.get(word);
            }

            int count = analysis.analysisCount();
            if (count == 0 && Strings.containsNone(word, "0123456789-.")) {
                failedWords.add(word);
            }
            if (count < 1 || count > maxAnalysisCount) {
                usable = false;
                break;
            }

            // Keep an analysis unless it is a proper noun for a token written with a lower case
            // initial.
            boolean lowerInitial = Character.isLowerCase(rawWord.charAt(0));
            List<SingleAnalysis> kept = analysis.stream()
                .filter(a -> a.getDictionaryItem().secondaryPos != SecondaryPos.ProperNoun || !lowerInitial)
                .collect(Collectors.toList());
            if (kept.isEmpty()) {
                usable = false;
                break;
            }
            // The position of a word is the number of words accepted before it.
            words.add(new Single(word, words.size(), analysis.copyFor(kept)));
        }
        if (usable) {
            accepted.add(new SingleAnalysisSentence(sentence, words));
        }
        sentencesSeen++;
        if (sentencesSeen % 2000 == 0) {
            Log.info("%d sentences %d tokens analyzed. %d found", sentencesSeen, tokensSeen, accepted.size());
        }
    }
    return accepted;
}
Also used : Strings(zemberek.core.io.Strings) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) Turkish(zemberek.core.turkish.Turkish) Token(zemberek.tokenization.Token) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) Map(java.util.Map) TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) Log(zemberek.core.logging.Log) Path(java.nio.file.Path) LinkedHashSet(java.util.LinkedHashSet) Histogram(zemberek.core.collections.Histogram) SecondaryPos(zemberek.core.turkish.SecondaryPos) PrintWriter(java.io.PrintWriter) Files(java.nio.file.Files) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) StandardCharsets(java.nio.charset.StandardCharsets) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) Objects(java.util.Objects) List(java.util.List) Paths(java.nio.file.Paths) TurkishSentenceExtractor(zemberek.tokenization.TurkishSentenceExtractor) LanguageIdentifier(zemberek.langid.LanguageIdentifier) Pattern(java.util.regex.Pattern) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) Token(zemberek.tokenization.Token)

Example 2 with SecondaryPos

use of zemberek.core.turkish.SecondaryPos in project zemberek-nlp by ahmetaa.

the class UnidentifiedTokenAnalyzer method tryWordWithApostrophe.

/**
 * Tries to analyze a word that contains an apostrophe by splitting it into a stem (before the
 * apostrophe) and an ending (after it) and analyzing the two parts joined without the apostrophe.
 *
 * <p>If the lexicon does not already contain a matching dictionary item, a temporary item marked
 * {@link RootAttribute#Runtime} is registered with the analyzer's stem transitions for the
 * duration of the analysis. The temporary item is removed again even when the analysis throws.
 *
 * @param word the surface form to analyze
 * @param secondaryPos secondary part of speech assigned to the generated dictionary item
 * @return an empty list when, after apostrophe normalization, there is no apostrophe or it is the
 *     first or last character; a single dummy analysis when the guessed pronunciation contains
 *     no vowel; otherwise the analyses whose stem equals the normalized stem
 */
private List<SingleAnalysis> tryWordWithApostrophe(String word, SecondaryPos secondaryPos) {
    String normalized = TurkishAlphabet.INSTANCE.normalizeApostrophe(word);
    int index = normalized.indexOf('\'');
    // Covers "no apostrophe" (-1), a leading apostrophe (0) and a trailing apostrophe.
    if (index <= 0 || index == normalized.length() - 1) {
        return Collections.emptyList();
    }
    String stem = normalized.substring(0, index);
    String ending = normalized.substring(index + 1);
    StemAndEnding se = new StemAndEnding(stem, ending);
    // TODO: should we remove dots with normalization?
    String stemNormalized = TurkishAlphabet.INSTANCE.normalize(se.stem).replaceAll("[.]", "");
    String endingNormalized = TurkishAlphabet.INSTANCE.normalize(se.ending);
    String pronunciation = guessPronunciation(stemNormalized);
    boolean capitalize = secondaryPos == SecondaryPos.ProperNoun || secondaryPos == SecondaryPos.Abbreviation;
    boolean pronunciationPossible = alphabet.containsVowel(pronunciation);
    DictionaryItem item = new DictionaryItem(capitalize ? Turkish.capitalize(normalized) : (pronunciationPossible ? stem : word), stemNormalized, pronunciation, PrimaryPos.Noun, secondaryPos);
    if (!pronunciationPossible) {
        // Without a vowel in the pronunciation no real analysis is attempted; return a
        // placeholder analysis.
        List<SingleAnalysis> result = new ArrayList<>(1);
        result.add(SingleAnalysis.dummy(word, item));
        return result;
    }
    boolean itemDoesNotExist = !lexicon.containsItem(item);
    if (itemDoesNotExist) {
        item.attributes.add(RootAttribute.Runtime);
        analyzer.getStemTransitions().addDictionaryItem(item);
    }
    String toParse = stemNormalized + endingNormalized;
    List<SingleAnalysis> noQuotesParses;
    try {
        noQuotesParses = analyzer.analyze(toParse);
    } finally {
        // Always undo the temporary registration so a failed analysis cannot leave the runtime
        // item behind in the shared stem transitions.
        if (itemDoesNotExist) {
            analyzer.getStemTransitions().removeDictionaryItem(item);
        }
    }
    return noQuotesParses.stream()
        .filter(noQuotesParse -> noQuotesParse.getStem().equals(stemNormalized))
        .collect(Collectors.toList());
}
Also used : StemAndEnding(zemberek.core.turkish.StemAndEnding) StemAndEnding(zemberek.core.turkish.StemAndEnding) RootAttribute(zemberek.core.turkish.RootAttribute) Collectors(java.util.stream.Collectors) ArrayList(java.util.ArrayList) DictionaryItem(zemberek.morphology.lexicon.DictionaryItem) Turkish(zemberek.core.turkish.Turkish) List(java.util.List) Token(zemberek.tokenization.Token) Lists(com.google.common.collect.Lists) Matcher(java.util.regex.Matcher) PronunciationGuesser(zemberek.morphology.analysis.tr.PronunciationGuesser) TurkishAlphabet(zemberek.core.turkish.TurkishAlphabet) TurkishNumbers(zemberek.morphology.analysis.tr.TurkishNumbers) Map(java.util.Map) PrimaryPos(zemberek.core.turkish.PrimaryPos) Pattern(java.util.regex.Pattern) RootLexicon(zemberek.morphology.lexicon.RootLexicon) Collections(java.util.Collections) SecondaryPos(zemberek.core.turkish.SecondaryPos) TurkishNumeralEndingMachine(zemberek.morphology.analysis.tr.TurkishNumeralEndingMachine) DictionaryItem(zemberek.morphology.lexicon.DictionaryItem) ArrayList(java.util.ArrayList)

Example 3 with SecondaryPos

use of zemberek.core.turkish.SecondaryPos in project zemberek-nlp by ahmetaa.

the class UnidentifiedTokenAnalyzer method analyze.

/**
 * Analyzes a token, choosing the strategy from the secondary part of speech guessed for it.
 *
 * <ul>
 *   <li>{@code None}: a word containing {@code ?} yields no analyses; a word containing a digit
 *       is handled by {@code tryNumeral}; any other word goes to {@code analyzeWord} as an
 *       abbreviation if it contains a dot and as a proper noun otherwise.</li>
 *   <li>{@code RomanNumeral}: handled by {@code getForRomanNumeral}.</li>
 *   <li>{@code Date} and {@code Clock}: handled by {@code tryNumeral}.</li>
 *   <li>{@code HashTag}, {@code Email}, {@code Url}, {@code Mention}: handled by
 *       {@code analyzeWord} with the guessed type.</li>
 *   <li>Anything else: a dictionary item for the word is registered temporarily (if the lexicon
 *       does not already contain it), the word is analyzed, and the item is removed again.</li>
 * </ul>
 *
 * @param token the token to analyze
 * @return the analyses found for the token, possibly empty
 */
public synchronized List<SingleAnalysis> analyze(Token token) {
    SecondaryPos sPos = guessSecondaryPosType(token);
    String word = token.getText();
    // TODO: for now, for regular words and numbers etc, use the analyze method.
    if (sPos == SecondaryPos.None) {
        if (word.contains("?")) {
            return Collections.emptyList();
        }
        if (alphabet.containsDigit(word)) {
            return tryNumeral(token);
        } else {
            return analyzeWord(word, word.contains(".") ? SecondaryPos.Abbreviation : SecondaryPos.ProperNoun);
        }
    }
    if (sPos == SecondaryPos.RomanNumeral) {
        return getForRomanNumeral(token);
    }
    if (sPos == SecondaryPos.Date || sPos == SecondaryPos.Clock) {
        return tryNumeral(token);
    }
    if (sPos == SecondaryPos.HashTag || sPos == SecondaryPos.Email || sPos == SecondaryPos.Url || sPos == SecondaryPos.Mention) {
        return analyzeWord(word, sPos);
    }
    // TODO: consider returning analysis results without interfering with analyzer.
    // The dictionary item is only needed on this path, so it is built after the early returns.
    String normalized = nonLettersPattern.matcher(word).replaceAll("");
    DictionaryItem item = new DictionaryItem(word, word, normalized, PrimaryPos.Noun, sPos);
    boolean itemDoesNotExist = !lexicon.containsItem(item);
    if (itemDoesNotExist) {
        item.attributes.add(RootAttribute.Runtime);
        analyzer.getStemTransitions().addDictionaryItem(item);
    }
    try {
        return analyzer.analyze(word);
    } finally {
        // Always undo the temporary registration so a failed analysis cannot leave the runtime
        // item behind in the shared stem transitions.
        if (itemDoesNotExist) {
            analyzer.getStemTransitions().removeDictionaryItem(item);
        }
    }
}
Also used : DictionaryItem(zemberek.morphology.lexicon.DictionaryItem) SecondaryPos(zemberek.core.turkish.SecondaryPos)

Aggregations

SecondaryPos (zemberek.core.turkish.SecondaryPos)3 ArrayList (java.util.ArrayList)2 List (java.util.List)2 Map (java.util.Map)2 Pattern (java.util.regex.Pattern)2 Collectors (java.util.stream.Collectors)2 Turkish (zemberek.core.turkish.Turkish)2 DictionaryItem (zemberek.morphology.lexicon.DictionaryItem)2 Token (zemberek.tokenization.Token)2 Lists (com.google.common.collect.Lists)1 IOException (java.io.IOException)1 PrintWriter (java.io.PrintWriter)1 StandardCharsets (java.nio.charset.StandardCharsets)1 Files (java.nio.file.Files)1 Path (java.nio.file.Path)1 Paths (java.nio.file.Paths)1 Collections (java.util.Collections)1 HashMap (java.util.HashMap)1 LinkedHashSet (java.util.LinkedHashSet)1 Objects (java.util.Objects)1