Search in sources:

Example 6 with TurkishTokenizer

use of zemberek.tokenization.TurkishTokenizer in project zemberek-nlp by ahmetaa.

In class TurkishTokenizationExample, method customTokenizer.

/**
 * Demonstrates a tokenizer built through {@code TurkishTokenizer.builder()} that is
 * configured to ignore the Punctuation, NewLine and SpaceTab token types.
 *
 * <p>Tokenizes a fixed sample sentence and prints every returned token on its own line.
 */
public static void customTokenizer() {
    TurkishTokenizer withoutPunctuationAndWhitespace = TurkishTokenizer.builder()
        .ignoreTypes(Token.Type.Punctuation, Token.Type.NewLine, Token.Type.SpaceTab)
        .build();
    withoutPunctuationAndWhitespace.tokenize("Saat, 12:00.").forEach(System.out::println);
}
Also used : TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) Token(zemberek.tokenization.Token)

Example 7 with TurkishTokenizer

use of zemberek.tokenization.TurkishTokenizer in project zemberek-nlp by ahmetaa.

In class TurkishTokenizationExample, method simpleTokenization.

/**
 * Demonstrates simple tokenization with the default tokenizer instance.
 *
 * <p>Prints a short description, echoes the sample input, then prints the token strings
 * returned by {@code tokenizeToStrings} joined with {@code '|'}.
 */
public static void simpleTokenization() {
    System.out.println("Simple tokenization returns a list of token strings.");
    TurkishTokenizer tokenizer = TurkishTokenizer.DEFAULT;
    String input = "İstanbul'a, merhaba!";
    System.out.println("Input = " + input);
    // Tokenize the very same variable that was echoed above instead of a second copy of
    // the literal, so the printed input and the tokenized text cannot drift apart.
    System.out.println("Tokenization list = " + Joiner.on("|").join(tokenizer.tokenizeToStrings(input)));
}
Also used : TurkishTokenizer(zemberek.tokenization.TurkishTokenizer)

Example 8 with TurkishTokenizer

use of zemberek.tokenization.TurkishTokenizer in project zemberek-nlp by ahmetaa.

In class TurkishTokenizationExample, method simpleTokenization.

/**
 * Shows the simplest way to tokenize text: the shared default tokenizer and its
 * string-returning API.
 *
 * <p>Output is three lines on standard output: a description, the sample input, and the
 * token strings from {@code tokenizeToStrings} separated by {@code '|'}.
 */
public static void simpleTokenization() {
    System.out.println("Simple tokenization returns a list of token strings.");
    TurkishTokenizer tokenizer = TurkishTokenizer.DEFAULT;
    String input = "İstanbul'a, merhaba!";
    System.out.println("Input = " + input);
    // Reuse `input` rather than repeating the literal: a single source of truth keeps the
    // echoed text and the tokenized text identical if the sample sentence is ever edited.
    System.out.println("Tokenization list = " + Joiner.on("|").join(tokenizer.tokenizeToStrings(input)));
}
Also used : TurkishTokenizer(zemberek.tokenization.TurkishTokenizer)

Example 9 with TurkishTokenizer

use of zemberek.tokenization.TurkishTokenizer in project zemberek-nlp by ahmetaa.

In class ZemberekNlpScripts, method performance.

/**
 * Manual throughput benchmark, not an automated test (it is marked {@code @Ignore}).
 *
 * <p>Reads every line of a corpus file from a hard-coded absolute path and times three
 * passes over those lines:
 * <ol>
 *   <li>tokenization with {@code TurkishTokenizer.DEFAULT},</li>
 *   <li>{@code analyzeSentence} on each line,</li>
 *   <li>{@code analyzeAndDisambiguate} on each line.</li>
 * </ol>
 * After each pass the elapsed milliseconds and derived tokens/sec figures are logged.
 * The token counts used in all speed figures are the ones collected during pass 1.
 *
 * @throws IOException if the corpus file cannot be read
 */
@Test
@Ignore("Not a Test.")
public void performance() throws IOException {
    // The corpus location is machine specific; the commented-out Paths.get(...) is an
    // alternative location kept from the original author.
    List<String> lines = Files.readAllLines(// Paths.get("/media/depo/data/aaa/corpora/dunya.100k")
    Paths.get("/home/ahmetaa/data/nlp/corpora/dunya.100k"));
    // Analyzer is built on the default root lexicon with the unidentified-token analyzer
    // and the cache switched off through the builder.
    TurkishMorphology analyzer = TurkishMorphology.builder().setLexicon(RootLexicon.getDefault()).disableUnidentifiedTokenAnalyzer().disableCache().build();
    Log.info(lines.size() + " lines will be processed.");
    Log.info("Dictionary has " + analyzer.getLexicon().size() + " items.");
    // tokenCount: all tokens except SpaceTab. tokenCountNoPunct: additionally excludes Punctuation.
    long tokenCount = 0;
    long tokenCountNoPunct = 0;
    Stopwatch clock = Stopwatch.createStarted();
    TurkishTokenizer lexer = TurkishTokenizer.DEFAULT;
    // Pass 1: tokenization. Note that the two counting streams below run inside the timed
    // region, so their cost is included in the reported tokenization time.
    for (String line : lines) {
        List<Token> tokens = lexer.tokenize(line);
        tokenCount += tokens.stream().filter(s -> (s.getType() != Token.Type.SpaceTab)).count();
        tokenCountNoPunct += tokens.stream().filter(s -> (s.getType() != Token.Type.Punctuation && s.getType() != Token.Type.SpaceTab)).count();
    }
    long elapsed = clock.elapsed(TimeUnit.MILLISECONDS);
    Log.info("Elapsed Time = " + elapsed);
    Log.info("Token Count = " + tokenCount);
    Log.info("Token Count (No Punctuation) = " + tokenCountNoPunct);
    // elapsed is in milliseconds, hence the factor 1000 to get a per-second rate. The
    // division is done in double arithmetic, so elapsed == 0 does not throw.
    Log.info("Tokenization Speed = %.1f tokens/sec", tokenCount * 1000d / elapsed);
    Log.info("Tokenization Speed (No Punctuation) = %.1f tokens/sec ", tokenCountNoPunct * 1000d / elapsed);
    Log.info("");
    Log.info("Sentence word analysis test:");
    // counter accumulates result sizes across pass 2 and pass 3 and is logged at the end.
    int counter = 0;
    clock.reset().start();
    // Pass 2: sentence analysis. A failure on one line is reported (the line is logged and
    // the stack trace printed) and the loop continues with the next line.
    for (String line : lines) {
        try {
            List<WordAnalysis> res = analyzer.analyzeSentence(line);
            // for preventing VM optimizations.
            counter += res.size();
        } catch (Exception e) {
            Log.info(line);
            e.printStackTrace();
        }
    }
    elapsed = clock.elapsed(TimeUnit.MILLISECONDS);
    Log.info("Elapsed Time = " + elapsed);
    // Rates reuse the token counts from pass 1 divided by the time of this pass only.
    // NOTE(review): the "Tokenization + Analysis" label presumably relies on
    // analyzeSentence tokenizing internally - confirm.
    Log.info("Tokenization + Analysis speed = %.1f tokens/sec", tokenCount * 1000d / elapsed);
    Log.info("Tokenization + Analysis speed (no punctuation) = %.1f tokens/sec", tokenCountNoPunct * 1000d / elapsed);
    Log.info(analyzer.toString());
    Log.info("");
    Log.info("Disambiguation Test:");
    // NOTE(review): the analyzer was built with disableCache(), so this call presumably
    // has nothing to clear - confirm against TurkishMorphology.
    analyzer.invalidateCache();
    clock.reset().start();
    // Pass 3: analysis plus disambiguation, with the same per-line error handling as pass 2.
    for (String line : lines) {
        try {
            SentenceAnalysis results = analyzer.analyzeAndDisambiguate(line);
            // for preventing VM optimizations.
            counter += results.size();
        } catch (Exception e) {
            Log.info(line);
            e.printStackTrace();
        }
    }
    elapsed = clock.elapsed(TimeUnit.MILLISECONDS);
    Log.info("Elapsed Time = " + elapsed);
    Log.info("Tokenization + Analysis + Disambiguation speed = %.1f tokens/sec", tokenCount * 1000d / elapsed);
    Log.info("Tokenization + Analysis + Disambiguation speed (no punctuation) = %.1f tokens/sec", tokenCountNoPunct * 1000d / elapsed);
    // Emit the accumulated counter so the work done in the loops has an observable use.
    Log.info(counter);
}
Also used : WordAnalysis(zemberek.morphology.analysis.WordAnalysis) Stopwatch(com.google.common.base.Stopwatch) Token(zemberek.tokenization.Token) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) TurkishMorphology(zemberek.morphology.TurkishMorphology) IOException(java.io.IOException) TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 10 with TurkishTokenizer

use of zemberek.tokenization.TurkishTokenizer in project zemberek-nlp by ahmetaa.

In class DocumentSimilarityExperiment, method normalizeLine.

/**
 * Reduces a line of text to a lower-cased, space-separated string of its tokens.
 *
 * <p>The line is tokenized with {@code TurkishTokenizer.DEFAULT}. Tokens typed as
 * PercentNumeral, Number, RomanNumeral, Time, UnknownWord or Unknown are dropped. For
 * every remaining token, straight ({@code '}) and typographic ({@code ’}) apostrophes
 * are removed and the text is lower-cased with {@code Turkish.LOCALE}.
 *
 * @param input the line to normalize
 * @return the kept, cleaned tokens joined with single spaces; empty if nothing is kept
 */
public String normalizeLine(String input) {
    TurkishTokenizer lexer = TurkishTokenizer.DEFAULT;
    List<Token> tokens = lexer.tokenize(input);
    List<String> reduced = new ArrayList<>();
    for (Token token : tokens) {
        Token.Type type = token.getType();
        // Punctuation is deliberately NOT part of this filter; punctuation tokens are kept.
        if (type == Token.Type.PercentNumeral
                || type == Token.Type.Number
                || type == Token.Type.RomanNumeral
                || type == Token.Type.Time
                || type == Token.Type.UnknownWord
                || type == Token.Type.Unknown) {
            continue;
        }
        String tokenStr = token.getText();
        // The pattern must be a character class: the previous regex "'’" only matched the
        // two-character sequence '’ and therefore left ordinary apostrophes in place.
        reduced.add(tokenStr.replaceAll("['’]", "").toLowerCase(Turkish.LOCALE));
    }
    return String.join(" ", reduced);
}
Also used : TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) ArrayList(java.util.ArrayList) Token(zemberek.tokenization.Token)

Aggregations

TurkishTokenizer (zemberek.tokenization.TurkishTokenizer): 11; Token (zemberek.tokenization.Token): 6; TurkishMorphology (zemberek.morphology.TurkishMorphology): 4; ArrayList (java.util.ArrayList): 3; Stopwatch (com.google.common.base.Stopwatch): 2; Path (java.nio.file.Path): 2; Ignore (org.junit.Ignore): 2; Test (org.junit.Test): 2; Histogram (zemberek.core.collections.Histogram): 2; SentenceAnalysis (zemberek.morphology.analysis.SentenceAnalysis): 2; File (java.io.File): 1; IOException (java.io.IOException): 1; PrintWriter (java.io.PrintWriter): 1; LinkedHashSet (java.util.LinkedHashSet): 1; WebCorpus (zemberek.corpus.WebCorpus): 1; WebDocument (zemberek.corpus.WebDocument): 1; SentenceWordAnalysis (zemberek.morphology.analysis.SentenceWordAnalysis): 1; SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis): 1; WordAnalysis (zemberek.morphology.analysis.WordAnalysis): 1; TurkishSpellChecker (zemberek.normalization.TurkishSpellChecker): 1