Example usage of zemberek.tokenization.Token in the project zemberek-nlp by ahmetaa: class TurkishSpellChecker, method tokenizeForSpelling.
// TODO: this does not cover all token types.
/**
 * Splits {@code sentence} into tokens suitable for spell checking.
 * Unknown tokens and punctuation are dropped; plain words are lower-cased
 * with the Turkish locale, and words containing symbols are capitalized.
 */
public static List<String> tokenizeForSpelling(String sentence) {
List<Token> tokens = tokenizer.tokenize(sentence);
List<String> normalized = new ArrayList<>(tokens.size());
for (Token t : tokens) {
Token.Type type = t.getType();
// Tokens with no spelling information are skipped entirely.
if (type == Token.Type.Unknown
|| type == Token.Type.UnknownWord
|| type == Token.Type.Punctuation) {
continue;
}
String text = t.getText();
if (type == Token.Type.Word) {
text = text.toLowerCase(Turkish.LOCALE);
} else if (type == Token.Type.WordWithSymbol) {
text = Turkish.capitalize(text);
}
normalized.add(text);
}
return normalized;
}
Example usage of zemberek.tokenization.Token in the project zemberek-nlp by ahmetaa: class ZemberekNlpScripts, method performance.
@Test
@Ignore("Not a Test.")
// Benchmark over a 100k-line corpus: measures (1) tokenization alone,
// (2) tokenization + per-word morphological analysis, and
// (3) tokenization + analysis + disambiguation, reporting tokens/sec for each.
public void performance() throws IOException {
List<String> lines = Files.readAllLines(// Paths.get("/media/depo/data/aaa/corpora/dunya.100k")
Paths.get("/home/ahmetaa/data/nlp/corpora/dunya.100k"));
// Cache and unidentified-token analysis are disabled so timings reflect
// the core analyzer only.
TurkishMorphology analyzer = TurkishMorphology.builder().setLexicon(RootLexicon.getDefault()).disableUnidentifiedTokenAnalyzer().disableCache().build();
Log.info(lines.size() + " lines will be processed.");
Log.info("Dictionary has " + analyzer.getLexicon().size() + " items.");
long tokenCount = 0;
long tokenCountNoPunct = 0;
Stopwatch clock = Stopwatch.createStarted();
TurkishTokenizer lexer = TurkishTokenizer.DEFAULT;
// Pass 1: pure tokenization. Token counts are reused as the denominator
// for the later analysis/disambiguation throughput figures.
for (String line : lines) {
List<Token> tokens = lexer.tokenize(line);
tokenCount += tokens.stream().filter(s -> (s.getType() != Token.Type.SpaceTab)).count();
tokenCountNoPunct += tokens.stream().filter(s -> (s.getType() != Token.Type.Punctuation && s.getType() != Token.Type.SpaceTab)).count();
}
long elapsed = clock.elapsed(TimeUnit.MILLISECONDS);
Log.info("Elapsed Time = " + elapsed);
Log.info("Token Count = " + tokenCount);
Log.info("Token Count (No Punctuation) = " + tokenCountNoPunct);
Log.info("Tokenization Speed = %.1f tokens/sec", tokenCount * 1000d / elapsed);
Log.info("Tokenization Speed (No Punctuation) = %.1f tokens/sec ", tokenCountNoPunct * 1000d / elapsed);
Log.info("");
Log.info("Sentence word analysis test:");
int counter = 0;
clock.reset().start();
// Pass 2: sentence-level morphological analysis.
for (String line : lines) {
try {
List<WordAnalysis> res = analyzer.analyzeSentence(line);
// for preventing VM optimizations.
counter += res.size();
} catch (Exception e) {
// Log the offending line but keep the benchmark running.
Log.info(line);
e.printStackTrace();
}
}
elapsed = clock.elapsed(TimeUnit.MILLISECONDS);
Log.info("Elapsed Time = " + elapsed);
Log.info("Tokenization + Analysis speed = %.1f tokens/sec", tokenCount * 1000d / elapsed);
Log.info("Tokenization + Analysis speed (no punctuation) = %.1f tokens/sec", tokenCountNoPunct * 1000d / elapsed);
Log.info(analyzer.toString());
Log.info("");
Log.info("Disambiguation Test:");
analyzer.invalidateCache();
clock.reset().start();
// Pass 3: analysis + disambiguation per sentence.
for (String line : lines) {
try {
SentenceAnalysis results = analyzer.analyzeAndDisambiguate(line);
// for preventing VM optimizations.
counter += results.size();
} catch (Exception e) {
Log.info(line);
e.printStackTrace();
}
}
elapsed = clock.elapsed(TimeUnit.MILLISECONDS);
Log.info("Elapsed Time = " + elapsed);
Log.info("Tokenization + Analysis + Disambiguation speed = %.1f tokens/sec", tokenCount * 1000d / elapsed);
Log.info("Tokenization + Analysis + Disambiguation speed (no punctuation) = %.1f tokens/sec", tokenCountNoPunct * 1000d / elapsed);
// Printing the accumulator keeps the JIT from eliding the analysis work.
Log.info(counter);
}
Example usage of zemberek.tokenization.Token in the project zemberek-nlp by ahmetaa: class DocumentSimilarityExperiment, method normalizeLine.
/**
 * Normalizes a line for similarity comparison: drops numeral, time and unknown
 * tokens, strips apostrophes and lower-cases the remaining tokens with the
 * Turkish locale, then re-joins them with single spaces.
 *
 * @param input raw input line.
 * @return normalized, space-joined token string.
 */
public String normalizeLine(String input) {
TurkishTokenizer lexer = TurkishTokenizer.DEFAULT;
List<Token> tokens = lexer.tokenize(input);
List<String> reduced = new ArrayList<>();
for (Token token : tokens) {
if (token.getType() == Token.Type.PercentNumeral || token.getType() == Token.Type.Number || // token.getType() == TurkishLexer.Punctuation ||
token.getType() == Token.Type.RomanNumeral || token.getType() == Token.Type.Time || token.getType() == Token.Type.UnknownWord || token.getType() == Token.Type.Unknown) {
continue;
}
String tokenStr = token.getText();
// Bug fix: replaceAll takes a regex; "'’" matched only the two characters
// appearing consecutively. A character class removes either apostrophe variant.
reduced.add(tokenStr.replaceAll("['’]", "").toLowerCase(Turkish.LOCALE));
}
return String.join(" ", reduced);
}
Example usage of zemberek.tokenization.Token in the project zemberek-nlp by ahmetaa: class _WordCollector, method extracData.
/**
 * Builds a word-frequency histogram from all ".corpus" files directly under
 * {@code p} and saves four sorted views of it under {@code outRoot}.
 * Words containing digits or underscores are skipped; words are case-normalized
 * (capitalized if originally upper-case initial, otherwise Turkish lower case).
 *
 * @param p           directory containing ".corpus" files (searched depth 1).
 * @param outRoot     output directory for the saved word lists.
 * @param resultLimit currently unused — kept for interface compatibility.
 * @return the collected word histogram.
 * @throws IOException on any file read/write failure.
 */
public Histogram<String> extracData(Path p, Path outRoot, int resultLimit) throws IOException {
Histogram<String> words = new Histogram<>(5_000_000);
// Files.walk returns a Stream backed by open directory handles; it must be
// closed, so collect the paths inside try-with-resources (fixes a leak).
List<Path> files;
try (java.util.stream.Stream<Path> walk = Files.walk(p, 1)) {
files = walk.filter(s -> s.toFile().isFile() && s.toFile().getName().endsWith(".corpus")).collect(Collectors.toList());
}
for (Path file : files) {
Log.info("Processing %s", file);
// Lines starting with '<' are markup/meta lines; drop them.
List<String> lines = Files.readAllLines(file, StandardCharsets.UTF_8).stream().filter(s -> !s.startsWith("<")).collect(Collectors.toList());
List<String> sentences = TurkishSentenceExtractor.DEFAULT.fromParagraphs(lines);
for (String sentence : sentences) {
// Collapse whitespace/slashes/hyphens/NBSP runs; remove soft hyphens.
sentence = sentence.replaceAll("[\\s/\\-\\u00a0]+", " ");
sentence = sentence.replaceAll("[\\u00ad]", "");
List<Token> tokens = TurkishTokenizer.DEFAULT.tokenize(sentence);
for (Token token : tokens) {
String rawWord = token.getText();
// Skip tokens containing digits or underscores.
if (!Strings.containsNone(rawWord, "0123456789_")) {
continue;
}
String word = Character.isUpperCase(rawWord.charAt(0)) ? Turkish.capitalize(rawWord) : rawWord.toLowerCase(Turkish.LOCALE);
words.add(word);
}
}
Log.info("Count = %d", words.size());
}
String s = p.toFile().getName();
Log.info("Saving words.");
// saving failed words.
words.saveSortedByKeys(outRoot.resolve(s + "-counts-sorted-name.txt"), " ", Turkish.STRING_COMPARATOR_ASC);
// saving failed words by frequency.
words.saveSortedByCounts(outRoot.resolve(s + "-counts-sorted-freq.txt"), " ");
Files.write(outRoot.resolve(s + "-words-sorted-freq.txt"), words.getSortedList());
Files.write(outRoot.resolve(s + "-words-sorted-name.txt"), words.getSortedList(Turkish.STRING_COMPARATOR_ASC));
return words;
}
Example usage of zemberek.tokenization.Token in the project zemberek-nlp by ahmetaa: class WordHistogram, method stats.
/**
 * Prints sentence/token statistics for a list of paragraphs: total sentences,
 * total tokens, tokens excluding punctuation, and "word-like" tokens
 * (everything except numerals, punctuation, emails, hashtags, emoticons,
 * times, dates, URLs and unknowns).
 *
 * @param paragraphs input paragraphs to analyze.
 */
static void stats(List<String> paragraphs) {
int paragraphCounter = 0;
int sentenceCounter = 0;
int tokenCounter = 0;
int tokenNoPunctCounter = 0;
int tokenWordCounter = 0;
for (String paragraph : paragraphs) {
List<String> sentences = TurkishSentenceExtractor.DEFAULT.fromParagraph(paragraph);
sentenceCounter += sentences.size();
for (String sentence : sentences) {
List<Token> tokens = TurkishTokenizer.DEFAULT.tokenize(sentence);
for (Token token : tokens) {
Token.Type type = token.getType();
// Every token counts; the original duplicated this increment in both
// branches of the if/else below.
tokenCounter++;
if (type != Token.Type.Punctuation) {
tokenNoPunctCounter++;
}
boolean nonWord = type == Token.Type.PercentNumeral || type == Token.Type.Number || type == Token.Type.Punctuation || type == Token.Type.RomanNumeral || type == Token.Type.Email || type == Token.Type.HashTag || type == Token.Type.Emoticon || type == Token.Type.Time || type == Token.Type.Date || type == Token.Type.URL || type == Token.Type.UnknownWord || type == Token.Type.Unknown;
if (!nonWord) {
tokenWordCounter++;
}
}
}
paragraphCounter++;
// Progress report every 1000 paragraphs.
if (paragraphCounter % 1000 == 0) {
System.out.println(paragraphCounter + " of " + paragraphs.size());
}
}
System.out.println("sentenceCounter = " + sentenceCounter);
System.out.println("tokenCounter = " + tokenCounter);
System.out.println("tokenNoPunctCounter = " + tokenNoPunctCounter);
System.out.println("tokenWordCounter = " + tokenWordCounter);
}
Aggregations