Search in sources :

Example 21 with Token

use of zemberek.tokenization.Token in project zemberek-nlp by ahmetaa.

the class TurkishSpellChecker method tokenizeForSpelling.

// TODO: this does not cover all token types.
/**
 * Tokenizes a sentence and returns the token texts that are kept for spell checking.
 *
 * <p>Tokens typed {@code Unknown}, {@code UnknownWord} or {@code Punctuation} are dropped.
 * {@code Word} tokens are lower-cased with {@code Turkish.LOCALE}, {@code WordWithSymbol}
 * tokens are passed through {@code Turkish.capitalize}, and every other token is kept verbatim.
 *
 * @param sentence the text to tokenize
 * @return the retained token texts, in tokenizer order
 */
public static List<String> tokenizeForSpelling(String sentence) {
    List<Token> allTokens = tokenizer.tokenize(sentence);
    List<String> words = new ArrayList<>(allTokens.size());
    for (Token current : allTokens) {
        Token.Type type = current.getType();
        boolean ignored = type == Token.Type.Unknown
                || type == Token.Type.UnknownWord
                || type == Token.Type.Punctuation;
        if (ignored) {
            continue;
        }
        String text = current.getText();
        if (type == Token.Type.Word) {
            words.add(text.toLowerCase(Turkish.LOCALE));
        } else if (type == Token.Type.WordWithSymbol) {
            words.add(Turkish.capitalize(text));
        } else {
            // Any other token type is passed through unchanged.
            words.add(text);
        }
    }
    return words;
}
Also used : ArrayList(java.util.ArrayList) Token(zemberek.tokenization.Token)

Example 22 with Token

use of zemberek.tokenization.Token in project zemberek-nlp by ahmetaa.

the class ZemberekNlpScripts method performance.

/**
 * Manual throughput benchmark (disabled as a unit test via {@code @Ignore}).
 *
 * <p>Reads all lines of a corpus file from a hard-coded local path and times three phases over
 * the same lines: (1) tokenization only, (2) {@code analyzeSentence}, and
 * (3) {@code analyzeAndDisambiguate}. For every phase the elapsed milliseconds and derived
 * tokens/sec figures are logged. The token counts used for all speed figures are the ones
 * gathered in phase 1.
 *
 * @throws IOException if the corpus file cannot be read
 */
@Test
@Ignore("Not a Test.")
public void performance() throws IOException {
    // Corpus location is machine specific; the commented-out path is an alternative location.
    List<String> lines = Files.readAllLines(// Paths.get("/media/depo/data/aaa/corpora/dunya.100k")
    Paths.get("/home/ahmetaa/data/nlp/corpora/dunya.100k"));
    // Analyzer is built from the default lexicon with the unidentified-token analyzer and the
    // cache switched off through the builder.
    TurkishMorphology analyzer = TurkishMorphology.builder().setLexicon(RootLexicon.getDefault()).disableUnidentifiedTokenAnalyzer().disableCache().build();
    Log.info(lines.size() + " lines will be processed.");
    Log.info("Dictionary has " + analyzer.getLexicon().size() + " items.");
    // Phase 1: tokenization only. Two counts are kept: all tokens except SpaceTab, and
    // all tokens except SpaceTab and Punctuation.
    long tokenCount = 0;
    long tokenCountNoPunct = 0;
    Stopwatch clock = Stopwatch.createStarted();
    TurkishTokenizer lexer = TurkishTokenizer.DEFAULT;
    for (String line : lines) {
        List<Token> tokens = lexer.tokenize(line);
        tokenCount += tokens.stream().filter(s -> (s.getType() != Token.Type.SpaceTab)).count();
        tokenCountNoPunct += tokens.stream().filter(s -> (s.getType() != Token.Type.Punctuation && s.getType() != Token.Type.SpaceTab)).count();
    }
    long elapsed = clock.elapsed(TimeUnit.MILLISECONDS);
    Log.info("Elapsed Time = " + elapsed);
    Log.info("Token Count = " + tokenCount);
    Log.info("Token Count (No Punctuation) = " + tokenCountNoPunct);
    // elapsed is in milliseconds, hence the factor of 1000 to get tokens per second.
    Log.info("Tokenization Speed = %.1f tokens/sec", tokenCount * 1000d / elapsed);
    Log.info("Tokenization Speed (No Punctuation) = %.1f tokens/sec ", tokenCountNoPunct * 1000d / elapsed);
    Log.info("");
    // Phase 2: sentence analysis on the raw lines; the stopwatch is restarted first.
    Log.info("Sentence word analysis test:");
    int counter = 0;
    clock.reset().start();
    for (String line : lines) {
        try {
            List<WordAnalysis> res = analyzer.analyzeSentence(line);
            // for preventing VM optimizations.
            counter += res.size();
        } catch (Exception e) {
            // A failing line is logged and skipped so the benchmark keeps running.
            Log.info(line);
            e.printStackTrace();
        }
    }
    elapsed = clock.elapsed(TimeUnit.MILLISECONDS);
    Log.info("Elapsed Time = " + elapsed);
    // Speeds reuse the token counts measured in phase 1.
    Log.info("Tokenization + Analysis speed = %.1f tokens/sec", tokenCount * 1000d / elapsed);
    Log.info("Tokenization + Analysis speed (no punctuation) = %.1f tokens/sec", tokenCountNoPunct * 1000d / elapsed);
    Log.info(analyzer.toString());
    Log.info("");
    // Phase 3: analysis plus disambiguation; cache is invalidated and the stopwatch restarted.
    Log.info("Disambiguation Test:");
    analyzer.invalidateCache();
    clock.reset().start();
    for (String line : lines) {
        try {
            SentenceAnalysis results = analyzer.analyzeAndDisambiguate(line);
            // for preventing VM optimizations.
            counter += results.size();
        } catch (Exception e) {
            // A failing line is logged and skipped so the benchmark keeps running.
            Log.info(line);
            e.printStackTrace();
        }
    }
    elapsed = clock.elapsed(TimeUnit.MILLISECONDS);
    Log.info("Elapsed Time = " + elapsed);
    Log.info("Tokenization + Analysis + Disambiguation speed = %.1f tokens/sec", tokenCount * 1000d / elapsed);
    Log.info("Tokenization + Analysis + Disambiguation speed (no punctuation) = %.1f tokens/sec", tokenCountNoPunct * 1000d / elapsed);
    // Logging the accumulated result sizes keeps the analysis results observably used.
    Log.info(counter);
}
Also used : WordAnalysis(zemberek.morphology.analysis.WordAnalysis) Stopwatch(com.google.common.base.Stopwatch) Token(zemberek.tokenization.Token) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) TurkishMorphology(zemberek.morphology.TurkishMorphology) IOException(java.io.IOException) TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 23 with Token

use of zemberek.tokenization.Token in project zemberek-nlp by ahmetaa.

the class DocumentSimilarityExperiment method normalizeLine.

/**
 * Normalizes a line of text for document-similarity comparison.
 *
 * <p>The line is tokenized with {@code TurkishTokenizer.DEFAULT}. Tokens typed
 * {@code PercentNumeral}, {@code Number}, {@code RomanNumeral}, {@code Time},
 * {@code UnknownWord} or {@code Unknown} are dropped (punctuation is kept). From each remaining
 * token the ASCII apostrophe ({@code '}) and the typographic apostrophe ({@code ’}) are removed
 * and the text is lower-cased with {@code Turkish.LOCALE}.
 *
 * @param input the raw line
 * @return the normalized tokens joined by single spaces
 */
public String normalizeLine(String input) {
    TurkishTokenizer lexer = TurkishTokenizer.DEFAULT;
    List<Token> tokens = lexer.tokenize(input);
    List<String> reduced = new ArrayList<>(tokens.size());
    for (Token token : tokens) {
        Token.Type type = token.getType();
        if (type == Token.Type.PercentNumeral
                || type == Token.Type.Number
                || type == Token.Type.RomanNumeral
                || type == Token.Type.Time
                || type == Token.Type.UnknownWord
                || type == Token.Type.Unknown) {
            continue;
        }
        // A character class is required here: the regex "'’" without brackets only matches
        // the two-character sequence '’ and would leave single apostrophes in place.
        String normalized = token.getText().replaceAll("['’]", "").toLowerCase(Turkish.LOCALE);
        // A token made up solely of apostrophes becomes empty; skip it so the joined
        // result does not contain doubled spaces.
        if (normalized.isEmpty()) {
            continue;
        }
        reduced.add(normalized);
    }
    return String.join(" ", reduced);
}
Also used : TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) ArrayList(java.util.ArrayList) Token(zemberek.tokenization.Token)

Example 24 with Token

use of zemberek.tokenization.Token in project zemberek-nlp by ahmetaa.

the class _WordCollector method extracData.

/**
 * Collects a word histogram from the {@code .corpus} files located directly under {@code p}
 * and writes four report files into {@code outRoot}.
 *
 * <p>For every corpus file, lines starting with {@code <} are ignored, the rest is split into
 * sentences and tokenized. Tokens containing a digit or an underscore are skipped. A token whose
 * first character is upper case is stored via {@code Turkish.capitalize}; any other token is
 * stored lower-cased with {@code Turkish.LOCALE}.
 *
 * <p>Output files are named after the last path element of {@code p}, with the suffixes
 * {@code -counts-sorted-name.txt}, {@code -counts-sorted-freq.txt},
 * {@code -words-sorted-freq.txt} and {@code -words-sorted-name.txt}.
 *
 * @param p directory that contains the corpus files (only its direct children are scanned)
 * @param outRoot directory the report files are written to
 * @param resultLimit not read by this method; kept for interface compatibility
 * @return the histogram of collected words
 * @throws IOException if listing, reading or writing files fails
 */
public Histogram<String> extracData(Path p, Path outRoot, int resultLimit) throws IOException {
    Histogram<String> words = new Histogram<>(5_000_000);
    List<Path> files;
    // Files.walk keeps directory handles open until the stream is closed, so it must be
    // used inside try-with-resources.
    try (java.util.stream.Stream<Path> walk = Files.walk(p, 1)) {
        files = walk
                .filter(f -> f.toFile().isFile() && f.toFile().getName().endsWith(".corpus"))
                .collect(Collectors.toList());
    }
    for (Path file : files) {
        Log.info("Processing %s", file);
        List<String> lines = Files.readAllLines(file, StandardCharsets.UTF_8).stream().filter(l -> !l.startsWith("<")).collect(Collectors.toList());
        List<String> sentences = TurkishSentenceExtractor.DEFAULT.fromParagraphs(lines);
        for (String sentence : sentences) {
            // Collapse runs of whitespace, slashes, hyphens and no-break spaces to one space.
            sentence = sentence.replaceAll("[\\s/\\-\\u00a0]+", " ");
            // Drop soft hyphens.
            sentence = sentence.replaceAll("[\\u00ad]", "");
            List<Token> tokens = TurkishTokenizer.DEFAULT.tokenize(sentence);
            for (Token token : tokens) {
                String rawWord = token.getText();
                // Skip tokens that contain a digit or an underscore.
                if (!Strings.containsNone(rawWord, "0123456789_")) {
                    continue;
                }
                String word = Character.isUpperCase(rawWord.charAt(0)) ? Turkish.capitalize(rawWord) : rawWord.toLowerCase(Turkish.LOCALE);
                words.add(word);
            }
        }
        Log.info("Count = %d", words.size());
    }
    String s = p.toFile().getName();
    Log.info("Saving words.");
    // Words with counts, ordered by key using Turkish collation.
    words.saveSortedByKeys(outRoot.resolve(s + "-counts-sorted-name.txt"), " ", Turkish.STRING_COMPARATOR_ASC);
    // Words with counts, ordered by frequency.
    words.saveSortedByCounts(outRoot.resolve(s + "-counts-sorted-freq.txt"), " ");
    // Words only, in the histogram's default sorted order and in Turkish key order.
    Files.write(outRoot.resolve(s + "-words-sorted-freq.txt"), words.getSortedList());
    Files.write(outRoot.resolve(s + "-words-sorted-name.txt"), words.getSortedList(Turkish.STRING_COMPARATOR_ASC));
    return words;
}
Also used : Path(java.nio.file.Path) Files(java.nio.file.Files) Strings(zemberek.core.io.Strings) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) StandardCharsets(java.nio.charset.StandardCharsets) SingleAnalysisSentence(zemberek.morphology._MorphologicalAmbiguityResolverExperiment.SingleAnalysisSentence) Turkish(zemberek.core.turkish.Turkish) List(java.util.List) Token(zemberek.tokenization.Token) Paths(java.nio.file.Paths) TurkishSentenceExtractor(zemberek.tokenization.TurkishSentenceExtractor) TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) Log(zemberek.core.logging.Log) Path(java.nio.file.Path) LinkedHashSet(java.util.LinkedHashSet) Histogram(zemberek.core.collections.Histogram) LinkedHashSet(java.util.LinkedHashSet) Histogram(zemberek.core.collections.Histogram) SingleAnalysisSentence(zemberek.morphology._MorphologicalAmbiguityResolverExperiment.SingleAnalysisSentence) Token(zemberek.tokenization.Token)

Example 25 with Token

use of zemberek.tokenization.Token in project zemberek-nlp by ahmetaa.

the class WordHistogram method stats.

/**
 * Prints sentence and token statistics for the given paragraphs to standard output.
 *
 * <p>Each paragraph is split into sentences and each sentence is tokenized. Four totals are
 * reported: sentences, all tokens, tokens that are not punctuation, and tokens that are not one
 * of the listed non-word types. A progress line is printed after every 1000 paragraphs.
 *
 * @param paragraphs the paragraphs to analyze
 */
static void stats(List<String> paragraphs) {
    int processedParagraphs = 0;
    int totalSentences = 0;
    int totalTokens = 0;
    int totalNonPunctuation = 0;
    int totalWords = 0;
    for (String paragraph : paragraphs) {
        List<String> sentenceList = TurkishSentenceExtractor.DEFAULT.fromParagraph(paragraph);
        totalSentences += sentenceList.size();
        for (String sentence : sentenceList) {
            for (Token token : TurkishTokenizer.DEFAULT.tokenize(sentence)) {
                Token.Type type = token.getType();
                // Every token counts towards the overall total.
                totalTokens++;
                if (type != Token.Type.Punctuation) {
                    totalNonPunctuation++;
                }
                boolean nonWord = type == Token.Type.PercentNumeral
                        || type == Token.Type.Number
                        || type == Token.Type.Punctuation
                        || type == Token.Type.RomanNumeral
                        || type == Token.Type.Email
                        || type == Token.Type.HashTag
                        || type == Token.Type.Emoticon
                        || type == Token.Type.Time
                        || type == Token.Type.Date
                        || type == Token.Type.URL
                        || type == Token.Type.UnknownWord
                        || type == Token.Type.Unknown;
                if (!nonWord) {
                    totalWords++;
                }
            }
        }
        processedParagraphs++;
        if (processedParagraphs % 1000 == 0) {
            System.out.println(processedParagraphs + " of " + paragraphs.size());
        }
    }
    System.out.println("sentenceCounter = " + totalSentences);
    System.out.println("tokenCounter = " + totalTokens);
    System.out.println("tokenNoPunctCounter = " + totalNonPunctuation);
    System.out.println("tokenWordCounter = " + totalWords);
}
Also used : Token(zemberek.tokenization.Token)

Aggregations

Token (zemberek.tokenization.Token)25 ArrayList (java.util.ArrayList)13 TurkishTokenizer (zemberek.tokenization.TurkishTokenizer)8 Histogram (zemberek.core.collections.Histogram)7 TurkishMorphology (zemberek.morphology.TurkishMorphology)6 SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis)5 Stopwatch (com.google.common.base.Stopwatch)4 Path (java.nio.file.Path)4 SentenceAnalysis (zemberek.morphology.analysis.SentenceAnalysis)4 WordAnalysis (zemberek.morphology.analysis.WordAnalysis)4 IOException (java.io.IOException)3 LinkedHashSet (java.util.LinkedHashSet)3 Ignore (org.junit.Ignore)3 Test (org.junit.Test)3 SentenceWordAnalysis (zemberek.morphology.analysis.SentenceWordAnalysis)3 TurkishSentenceExtractor (zemberek.tokenization.TurkishSentenceExtractor)3 StandardCharsets (java.nio.charset.StandardCharsets)2 Files (java.nio.file.Files)2 Paths (java.nio.file.Paths)2 List (java.util.List)2