Usage of zemberek.tokenization.TurkishTokenizer in the zemberek-nlp project by ahmetaa.
From class TurkishTokenizationExample, method customTokenizer:
/**
 * Demonstrates a customized {@code TurkishTokenizer} that is configured to
 * skip punctuation, new-line and whitespace tokens, printing every token
 * that remains after filtering.
 */
public static void customTokenizer() {
  TurkishTokenizer tokenizer =
      TurkishTokenizer.builder()
          .ignoreTypes(Token.Type.Punctuation, Token.Type.NewLine, Token.Type.SpaceTab)
          .build();
  // Print each surviving token of the sample sentence on its own line.
  for (Token token : tokenizer.tokenize("Saat, 12:00.")) {
    System.out.println(token);
  }
}
Usage of zemberek.tokenization.TurkishTokenizer in the zemberek-nlp project by ahmetaa.
From class TurkishTokenizationExample, method simpleTokenization:
/**
 * Demonstrates simple tokenization with the default tokenizer: the input
 * sentence is split into token strings which are printed joined by '|'.
 */
public static void simpleTokenization() {
  System.out.println("Simple tokenization returns a list of token strings.");
  TurkishTokenizer tokenizer = TurkishTokenizer.DEFAULT;
  String input = "İstanbul'a, merhaba!";
  System.out.println("Input = " + input);
  // Fix: tokenize the `input` variable instead of repeating the string
  // literal, so the printed input and the tokenized text cannot drift apart.
  // String.join (stdlib) replaces the Guava Joiner with identical output.
  System.out.println(
      "Tokenization list = " + String.join("|", tokenizer.tokenizeToStrings(input)));
}
Usage of zemberek.tokenization.TurkishTokenizer in the zemberek-nlp project by ahmetaa.
From class TurkishTokenizationExample, method simpleTokenization (duplicate listing of the snippet above):
/**
 * Demonstrates simple tokenization with the default tokenizer: the input
 * sentence is split into token strings which are printed joined by '|'.
 */
public static void simpleTokenization() {
  System.out.println("Simple tokenization returns a list of token strings.");
  TurkishTokenizer tokenizer = TurkishTokenizer.DEFAULT;
  String input = "İstanbul'a, merhaba!";
  System.out.println("Input = " + input);
  // Fix: tokenize the `input` variable instead of repeating the string
  // literal, so the printed input and the tokenized text cannot drift apart.
  // String.join (stdlib) replaces the Guava Joiner with identical output.
  System.out.println(
      "Tokenization list = " + String.join("|", tokenizer.tokenizeToStrings(input)));
}
Usage of zemberek.tokenization.TurkishTokenizer in the zemberek-nlp project by ahmetaa.
From class ZemberekNlpScripts, method performance:
/**
 * Manual benchmark (not a real unit test) measuring throughput over a 100k-line
 * corpus in three phases: (1) tokenization only, (2) tokenization + morphological
 * analysis, (3) tokenization + analysis + disambiguation. Token counts from
 * phase 1 are reused as the denominator for all three speed figures.
 *
 * NOTE(review): reads a hard-coded local corpus path — only runs on the
 * author's machine; kept @Ignore'd for that reason.
 *
 * @throws IOException if the corpus file cannot be read
 */
@Test
@Ignore("Not a Test.")
public void performance() throws IOException {
List<String> lines = Files.readAllLines(// Paths.get("/media/depo/data/aaa/corpora/dunya.100k")
Paths.get("/home/ahmetaa/data/nlp/corpora/dunya.100k"));
// Cache and unidentified-token analysis are disabled so the benchmark measures
// the raw analyzer, not cache hits or fallback heuristics.
TurkishMorphology analyzer = TurkishMorphology.builder().setLexicon(RootLexicon.getDefault()).disableUnidentifiedTokenAnalyzer().disableCache().build();
Log.info(lines.size() + " lines will be processed.");
Log.info("Dictionary has " + analyzer.getLexicon().size() + " items.");
long tokenCount = 0;
long tokenCountNoPunct = 0;
Stopwatch clock = Stopwatch.createStarted();
TurkishTokenizer lexer = TurkishTokenizer.DEFAULT;
// Phase 1: tokenization only. Whitespace tokens are excluded from both counts;
// the second count additionally excludes punctuation.
for (String line : lines) {
List<Token> tokens = lexer.tokenize(line);
tokenCount += tokens.stream().filter(s -> (s.getType() != Token.Type.SpaceTab)).count();
tokenCountNoPunct += tokens.stream().filter(s -> (s.getType() != Token.Type.Punctuation && s.getType() != Token.Type.SpaceTab)).count();
}
long elapsed = clock.elapsed(TimeUnit.MILLISECONDS);
Log.info("Elapsed Time = " + elapsed);
Log.info("Token Count = " + tokenCount);
Log.info("Token Count (No Punctuation) = " + tokenCountNoPunct);
Log.info("Tokenization Speed = %.1f tokens/sec", tokenCount * 1000d / elapsed);
Log.info("Tokenization Speed (No Punctuation) = %.1f tokens/sec ", tokenCountNoPunct * 1000d / elapsed);
Log.info("");
Log.info("Sentence word analysis test:");
int counter = 0;
// Phase 2: tokenization + morphological analysis. The stopwatch is reset so
// `elapsed` covers only this phase.
clock.reset().start();
for (String line : lines) {
try {
List<WordAnalysis> res = analyzer.analyzeSentence(line);
// for preventing VM optimizations.
counter += res.size();
} catch (Exception e) {
// Log the offending line so a failing input can be reproduced, then continue.
Log.info(line);
e.printStackTrace();
}
}
elapsed = clock.elapsed(TimeUnit.MILLISECONDS);
Log.info("Elapsed Time = " + elapsed);
Log.info("Tokenization + Analysis speed = %.1f tokens/sec", tokenCount * 1000d / elapsed);
Log.info("Tokenization + Analysis speed (no punctuation) = %.1f tokens/sec", tokenCountNoPunct * 1000d / elapsed);
Log.info(analyzer.toString());
Log.info("");
Log.info("Disambiguation Test:");
// Phase 3: tokenization + analysis + disambiguation, again from a cold cache
// and a freshly restarted stopwatch.
analyzer.invalidateCache();
clock.reset().start();
for (String line : lines) {
try {
SentenceAnalysis results = analyzer.analyzeAndDisambiguate(line);
// for preventing VM optimizations.
counter += results.size();
} catch (Exception e) {
Log.info(line);
e.printStackTrace();
}
}
elapsed = clock.elapsed(TimeUnit.MILLISECONDS);
Log.info("Elapsed Time = " + elapsed);
Log.info("Tokenization + Analysis + Disambiguation speed = %.1f tokens/sec", tokenCount * 1000d / elapsed);
Log.info("Tokenization + Analysis + Disambiguation speed (no punctuation) = %.1f tokens/sec", tokenCountNoPunct * 1000d / elapsed);
// Logging `counter` keeps the accumulated results observably used, so the JIT
// cannot eliminate the analysis work above as dead code.
Log.info(counter);
}
Usage of zemberek.tokenization.TurkishTokenizer in the zemberek-nlp project by ahmetaa.
From class DocumentSimilarityExperiment, method normalizeLine:
/**
 * Normalizes a line of text for similarity comparison: tokenizes it with the
 * default tokenizer, drops numeral/time/unknown tokens, strips apostrophe
 * characters, lower-cases with the Turkish locale, and joins the remaining
 * tokens with single spaces.
 *
 * @param input raw line of text
 * @return space-joined, normalized token string (empty if nothing survives)
 */
public String normalizeLine(String input) {
  TurkishTokenizer lexer = TurkishTokenizer.DEFAULT;
  List<String> reduced = new ArrayList<>();
  for (Token token : lexer.tokenize(input)) {
    Token.Type type = token.getType();
    if (type == Token.Type.PercentNumeral
        || type == Token.Type.Number
        // || type == Token.Type.Punctuation
        || type == Token.Type.RomanNumeral
        || type == Token.Type.Time
        || type == Token.Type.UnknownWord
        || type == Token.Type.Unknown) {
      continue;
    }
    // Fix: replaceAll("'’", "") treated the two characters as a literal regex
    // SEQUENCE (straight apostrophe immediately followed by curly apostrophe),
    // which virtually never occurs, so nothing was stripped. A character class
    // removes either apostrophe variant wherever it appears.
    reduced.add(token.getText().replaceAll("['’]", "").toLowerCase(Turkish.LOCALE));
  }
  return String.join(" ", reduced);
}
Aggregations