Search in sources :

Example 1 with Token

use of zemberek.tokenization.Token in project zemberek-nlp by ahmetaa.

the class TurkishSentenceNormalizer method combineNecessaryWords.

/**
 * Merges adjacent word tokens where {@code combineCommon} yields a combined form and
 * returns all resulting texts joined with single spaces.
 *
 * <p>Tokens are examined pairwise. When both tokens of a pair satisfy {@code isWord} and
 * {@code combineCommon} returns a non-empty string for their texts, that string is emitted
 * in place of the two tokens. Otherwise the first token's text is emitted unchanged.
 *
 * @param tokens tokens to process, in order
 * @return the emitted texts separated by a single space; an empty string when
 *     {@code tokens} is empty
 */
String combineNecessaryWords(List<Token> tokens) {
    // Nothing to combine; also avoids indexing the last element of an empty list below.
    if (tokens.isEmpty()) {
        return "";
    }
    List<String> result = new ArrayList<>();
    // True when the token at the current index was already consumed by the previous merge.
    boolean combined = false;
    for (int i = 0; i < tokens.size() - 1; i++) {
        Token first = tokens.get(i);
        Token second = tokens.get(i + 1);
        // This check must come before the word check: a token that was merged into its
        // predecessor must never be emitted again, even if the next token is not a word.
        if (combined) {
            combined = false;
            continue;
        }
        String firstS = first.getText();
        if (!isWord(first) || !isWord(second)) {
            result.add(firstS);
            continue;
        }
        String c = combineCommon(firstS, second.getText());
        if (c.length() > 0) {
            result.add(c);
            combined = true;
        } else {
            result.add(firstS);
        }
    }
    // The last token is only ever seen as "second"; emit it unless the final pair was merged.
    if (!combined) {
        result.add(tokens.get(tokens.size() - 1).getText());
    }
    return String.join(" ", result);
}
Also used : ArrayList(java.util.ArrayList) Token(zemberek.tokenization.Token)

Example 2 with Token

use of zemberek.tokenization.Token in project zemberek-nlp by ahmetaa.

the class TurkishSentenceNormalizer method replaceCommon.

/**
 * Looks up each token's text in {@code replacements}, substituting the mapped value when
 * one exists and keeping the original text otherwise, then joins the results with single
 * spaces.
 *
 * @param tokens tokens to process, in order
 * @return the (possibly replaced) token texts separated by a single space
 */
String replaceCommon(List<Token> tokens) {
    StringBuilder joined = new StringBuilder();
    boolean firstToken = true;
    for (Token current : tokens) {
        // Separator goes between entries only, never before the first one.
        if (!firstToken) {
            joined.append(" ");
        }
        firstToken = false;
        String original = current.getText();
        joined.append(replacements.getOrDefault(original, original));
    }
    return joined.toString();
}
Also used : ArrayList(java.util.ArrayList) Token(zemberek.tokenization.Token)

Example 3 with Token

use of zemberek.tokenization.Token in project zemberek-nlp by ahmetaa.

the class SpeedTest method testNewsCorpus.

/**
 * Manual speed test (ignored by default). Tokenizes every sentence of the corpus, runs
 * morphological analysis on each non-punctuation token, logs timing and parse-ratio
 * figures, and writes the tokens that were not analyzed correctly to the files
 * {@code unk.freq} and {@code unk}.
 */
@Test
@Ignore(value = "Speed Test.")
public void testNewsCorpus() throws IOException {
    // Alternative corpus location used during development:
    // Paths.get("/media/aaa/Data/corpora/me-sentences/www.aljazeera.com.tr/2018-02-22")
    Path corpusPath = Paths.get("src/test/resources/corpora/cnn-turk-10k");
    List<String> sentences = getSentences(corpusPath);
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    Stopwatch stopwatch = Stopwatch.createStarted();
    int tokenCount = 0;
    int noAnalysis = 0;
    int sentenceCount = 0;
    Histogram<String> failedWords = new Histogram<>(100000);
    for (String sentence : sentences) {
        for (Token token : TurkishTokenizer.DEFAULT.tokenize(sentence)) {
            // Punctuation is excluded from both the count and the analysis.
            if (token.getType() != Token.Type.Punctuation) {
                String word = token.getText();
                tokenCount++;
                WordAnalysis analysis = morphology.analyze(word);
                if (!analysis.isCorrect()) {
                    noAnalysis++;
                    failedWords.add(word);
                }
            }
        }
        sentenceCount++;
        // Progress report every 2000 sentences.
        if (sentenceCount % 2000 == 0) {
            Log.info("%d tokens analyzed.", tokenCount);
        }
    }
    long elapsedMillis = stopwatch.stop().elapsed(TimeUnit.MILLISECONDS);
    double seconds = elapsedMillis / 1000d;
    double speed = tokenCount / seconds;
    double parseRatio = 100 - (noAnalysis * 100d / tokenCount);
    Log.info("%nElapsed = %.2f seconds", seconds);
    Log.info("%nToken Count (No Punc) = %d %nParse Ratio = %.4f%nSpeed = %.2f tokens/sec%n", tokenCount, parseRatio, speed);
    Log.info("Saving Unknown Tokens");
    failedWords.saveSortedByCounts(Paths.get("unk.freq"), " ");
    failedWords.saveSortedByKeys(Paths.get("unk"), " ", Turkish.STRING_COMPARATOR_ASC);
}
Also used : Path(java.nio.file.Path) Histogram(zemberek.core.collections.Histogram) Stopwatch(com.google.common.base.Stopwatch) Token(zemberek.tokenization.Token) TurkishMorphology(zemberek.morphology.TurkishMorphology) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 4 with Token

use of zemberek.tokenization.Token in project zemberek-nlp by ahmetaa.

the class SpeedTest method testForVisualVm.

/**
 * Profiling helper: tokenizes every sentence read from {@code p}, analyzes each token with
 * {@code analyzer}, then prints the analyzer's cache and logs timing and parse-ratio
 * figures.
 *
 * <p>Every token returned by the tokenizer is counted and analyzed here, punctuation
 * included, so the logged token count is labelled accordingly.
 *
 * @param p location passed to {@code getSentences} to obtain the sentences
 * @param analyzer morphology instance used to analyze each token
 * @throws IOException declared for the sentence-loading step
 */
private static void testForVisualVm(Path p, TurkishMorphology analyzer) throws IOException {
    List<String> sentences = getSentences(p);
    Stopwatch sw = Stopwatch.createStarted();
    int tokenCount = 0;
    int noAnalysis = 0;
    int sentenceCount = 0;
    for (String sentence : sentences) {
        List<Token> tokens = TurkishTokenizer.DEFAULT.tokenize(sentence);
        for (Token token : tokens) {
            tokenCount++;
            WordAnalysis results = analyzer.analyze(token.getText());
            if (!results.isCorrect()) {
                noAnalysis++;
            }
        }
        sentenceCount++;
        // Progress report every 2000 sentences.
        if (sentenceCount % 2000 == 0) {
            Log.info("%d tokens analyzed.", tokenCount);
        }
    }
    double seconds = sw.stop().elapsed(TimeUnit.MILLISECONDS) / 1000d;
    double speed = tokenCount / seconds;
    double parseRatio = 100 - (noAnalysis * 100d / tokenCount);
    System.out.println(analyzer.getCache());
    Log.info("%nElapsed = %.2f seconds", seconds);
    // Label no longer claims "(No Punc)": punctuation tokens are not filtered in this method.
    Log.info("%nToken Count = %d %nParse Ratio = %.4f%nSpeed = %.2f tokens/sec%n", tokenCount, parseRatio, speed);
}
Also used : Stopwatch(com.google.common.base.Stopwatch) Token(zemberek.tokenization.Token)

Example 5 with Token

use of zemberek.tokenization.Token in project zemberek-nlp by ahmetaa.

the class NerDataSet method normalizeForNer.

/**
 * Lower-cases {@code input} with the Turkish locale, tokenizes it, and concatenates the
 * token texts with no separator. Tokens typed as date, number or time have every ASCII
 * digit replaced by {@code D} and are wrapped in asterisks.
 *
 * @param input text to normalize
 * @return the concatenated, normalized token texts
 */
public static String normalizeForNer(String input) {
    String lowered = input.toLowerCase(Turkish.LOCALE);
    StringBuilder normalized = new StringBuilder();
    for (Token token : TurkishTokenizer.DEFAULT.tokenize(lowered)) {
        String text = token.getText();
        boolean numericLike = token.getType() == Token.Type.Date
                || token.getType() == Type.Number
                || token.getType() == Token.Type.Time;
        if (numericLike) {
            // Mask the digits so only the shape of the value remains.
            normalized.append("*").append(text.replaceAll("[0-9]", "D")).append("*");
        } else {
            normalized.append(text);
        }
    }
    return normalized.toString();
}
Also used : ArrayList(java.util.ArrayList) Token(zemberek.tokenization.Token)

Aggregations

Token (zemberek.tokenization.Token): 25, ArrayList (java.util.ArrayList): 13, TurkishTokenizer (zemberek.tokenization.TurkishTokenizer): 8, Histogram (zemberek.core.collections.Histogram): 7, TurkishMorphology (zemberek.morphology.TurkishMorphology): 6, SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis): 5, Stopwatch (com.google.common.base.Stopwatch): 4, Path (java.nio.file.Path): 4, SentenceAnalysis (zemberek.morphology.analysis.SentenceAnalysis): 4, WordAnalysis (zemberek.morphology.analysis.WordAnalysis): 4, IOException (java.io.IOException): 3, LinkedHashSet (java.util.LinkedHashSet): 3, Ignore (org.junit.Ignore): 3, Test (org.junit.Test): 3, SentenceWordAnalysis (zemberek.morphology.analysis.SentenceWordAnalysis): 3, TurkishSentenceExtractor (zemberek.tokenization.TurkishSentenceExtractor): 3, StandardCharsets (java.nio.charset.StandardCharsets): 2, Files (java.nio.file.Files): 2, Paths (java.nio.file.Paths): 2, List (java.util.List): 2