Use of zemberek.tokenization.Token in the project zemberek-nlp by ahmetaa.
Class TurkishSentenceNormalizer, method combineNecessaryWords.
// Walks the token list pairwise and merges adjacent word tokens when combineCommon
// produces a joined form; otherwise the first token of the pair is kept as-is.
String combineNecessaryWords(List<Token> tokens) {
  List<String> result = new ArrayList<>();
  boolean combined = false;
  for (int i = 0; i < tokens.size() - 1; i++) {
    Token first = tokens.get(i);
    Token second = tokens.get(i + 1);
    String firstS = first.getText();
    String secondS = second.getText();
    if (!isWord(first) || !isWord(second)) {
      combined = false;
      result.add(firstS);
      continue;
    }
    if (combined) {
      combined = false;
      continue;
    }
    String c = combineCommon(firstS, secondS);
    if (c.length() > 0) {
      result.add(c);
      combined = true;
    } else {
      result.add(first.getText());
      combined = false;
    }
  }
  // The last token is appended only if it was not already consumed by a merge.
  if (!combined) {
    result.add(tokens.get(tokens.size() - 1).getText());
  }
  return String.join(" ", result);
}
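A minimal usage sketch for the method above, assuming the call site is inside TurkishSentenceNormalizer itself or another class in the same package (the method has default access). The sample sentence is illustrative only, and which pairs actually get merged depends on the combineCommon helper, which is not shown on this page.

// Tokenize a raw sentence first (the same way the SpeedTest examples below obtain
// Token lists), then let the method merge adjacent word tokens where possible.
List<Token> tokens = TurkishTokenizer.DEFAULT.tokenize("yapabilir misin bir şey");
String combinedSentence = combineNecessaryWords(tokens);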
Use of zemberek.tokenization.Token in the project zemberek-nlp by ahmetaa.
Class TurkishSentenceNormalizer, method replaceCommon.
// Replaces each token with its entry in the replacements lookup table
// (informal or misspelled form -> preferred form); unknown tokens pass through unchanged.
String replaceCommon(List<Token> tokens) {
  List<String> result = new ArrayList<>();
  for (Token token : tokens) {
    String text = token.getText();
    result.add(replacements.getOrDefault(text, text));
  }
  return String.join(" ", result);
}
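The replacements map is a field of TurkishSentenceNormalizer and is not shown on this page; in the project it is loaded from normalization data files. A minimal sketch of the same lookup pattern with a hypothetical, hand-built map; the sample entries are illustrative only.

// Hypothetical stand-in for the normalizer's replacement table.
Map<String, String> replacements = new HashMap<>();
replacements.put("slm", "selam");      // illustrative entry, not from the project data
replacements.put("nbr", "ne haber");   // illustrative entry, not from the project data

List<Token> tokens = TurkishTokenizer.DEFAULT.tokenize("slm nbr");
List<String> out = new ArrayList<>();
for (Token token : tokens) {
  String text = token.getText();
  out.add(replacements.getOrDefault(text, text));
}
String replaced = String.join(" ", out); // "selam ne haber"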
Use of zemberek.tokenization.Token in the project zemberek-nlp by ahmetaa.
Class SpeedTest, method testNewsCorpus.
@Test
@Ignore(value = "Speed Test.")
public void testNewsCorpus() throws IOException {
  // Path p = Paths.get("/media/aaa/Data/corpora/me-sentences/www.aljazeera.com.tr/2018-02-22");
  Path p = Paths.get("src/test/resources/corpora/cnn-turk-10k");
  List<String> sentences = getSentences(p);
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  Stopwatch sw = Stopwatch.createStarted();
  int tokenCount = 0;
  int noAnalysis = 0;
  int sentenceCount = 0;
  Histogram<String> failedWords = new Histogram<>(100000);
  for (String sentence : sentences) {
    List<Token> tokens = TurkishTokenizer.DEFAULT.tokenize(sentence);
    for (Token token : tokens) {
      if (token.getType() == Token.Type.Punctuation) {
        continue;
      }
      tokenCount++;
      WordAnalysis results = morphology.analyze(token.getText());
      if (!results.isCorrect()) {
        noAnalysis++;
        failedWords.add(token.getText());
      }
    }
    sentenceCount++;
    if (sentenceCount % 2000 == 0) {
      Log.info("%d tokens analyzed.", tokenCount);
    }
  }
  double seconds = sw.stop().elapsed(TimeUnit.MILLISECONDS) / 1000d;
  double speed = tokenCount / seconds;
  double parseRatio = 100 - (noAnalysis * 100d / tokenCount);
  Log.info("%nElapsed = %.2f seconds", seconds);
  Log.info("%nToken Count (No Punc) = %d %nParse Ratio = %.4f%nSpeed = %.2f tokens/sec%n",
      tokenCount, parseRatio, speed);
  Log.info("Saving Unknown Tokens");
  failedWords.saveSortedByCounts(Paths.get("unk.freq"), " ");
  failedWords.saveSortedByKeys(Paths.get("unk"), " ", Turkish.STRING_COMPARATOR_ASC);
}
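getSentences(p) is a private helper of SpeedTest that does not appear on this page. A minimal sketch of one plausible implementation, assuming the corpus file simply contains one sentence per line; the real helper may filter or preprocess the lines differently.

// Hypothetical stand-in for SpeedTest.getSentences: one sentence per line, UTF-8.
private static List<String> getSentences(Path p) throws IOException {
  return Files.readAllLines(p, StandardCharsets.UTF_8);
}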
Use of zemberek.tokenization.Token in the project zemberek-nlp by ahmetaa.
Class SpeedTest, method testForVisualVm.
private static void testForVisualVm(Path p, TurkishMorphology analyzer) throws IOException {
  // Path p = Paths.get("/media/aaa/Data/corpora/me-sentences/www.aljazeera.com.tr/2018-02-22");
  List<String> sentences = getSentences(p);
  Stopwatch sw = Stopwatch.createStarted();
  int tokenCount = 0;
  int noAnalysis = 0;
  int sentenceCount = 0;
  for (String sentence : sentences) {
    List<Token> tokens = TurkishTokenizer.DEFAULT.tokenize(sentence);
    for (Token token : tokens) {
      tokenCount++;
      WordAnalysis results = analyzer.analyze(token.getText());
      if (!results.isCorrect()) {
        noAnalysis++;
      }
    }
    sentenceCount++;
    if (sentenceCount % 2000 == 0) {
      Log.info("%d tokens analyzed.", tokenCount);
    }
  }
  double seconds = sw.stop().elapsed(TimeUnit.MILLISECONDS) / 1000d;
  double speed = tokenCount / seconds;
  double parseRatio = 100 - (noAnalysis * 100d / tokenCount);
  System.out.println(analyzer.getCache());
  Log.info("%nElapsed = %.2f seconds", seconds);
  Log.info("%nToken Count (No Punc) = %d %nParse Ratio = %.4f%nSpeed = %.2f tokens/sec%n",
      tokenCount, parseRatio, speed);
}
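A minimal driver sketch for running this method under a profiler such as VisualVM. The main method, the corpus path, and the repeat count are not part of the project code shown on this page; they are assumptions for illustration.

// Hypothetical entry point for attaching VisualVM; path and loop count are illustrative.
public static void main(String[] args) throws IOException {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  Path corpus = Paths.get("src/test/resources/corpora/cnn-turk-10k");
  // Run several passes so the JIT-compiled steady state dominates the profile.
  for (int i = 0; i < 3; i++) {
    testForVisualVm(corpus, morphology);
  }
}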
Use of zemberek.tokenization.Token in the project zemberek-nlp by ahmetaa.
Class NerDataSet, method normalizeForNer.
public static String normalizeForNer(String input) {
  input = input.toLowerCase(Turkish.LOCALE);
  List<String> result = new ArrayList<>();
  for (Token t : TurkishTokenizer.DEFAULT.tokenize(input)) {
    String s = t.getText();
    // Mask digits in date, number and time tokens so NER features generalize.
    if (t.getType() == Token.Type.Date || t.getType() == Token.Type.Number
        || t.getType() == Token.Type.Time) {
      s = "*" + s.replaceAll("[0-9]", "D") + "*";
    }
    result.add(s);
  }
  return String.join("", result);
}
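The digit-masking step in isolation, as implied by the code above; the sample token is illustrative, while the "D" substitution and the surrounding "*" markers come directly from the snippet.

// A date token such as "12.03.2018" becomes "*DD.DD.DDDD*" after masking.
String s = "12.03.2018";
String masked = "*" + s.replaceAll("[0-9]", "D") + "*"; // -> "*DD.DD.DDDD*"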