Use of zemberek.tokenization.Token in project zemberek-nlp by ahmetaa.
From the class ClassificationExampleBase, method removeNonWords:
protected String removeNonWords(String sentence) {
  List<Token> docTokens = TurkishTokenizer.DEFAULT.tokenize(sentence);
  List<String> reduced = new ArrayList<>(docTokens.size());
  for (Token token : docTokens) {
    String text = token.getText();
    // Keep label tokens (e.g. "__label__...") as they are.
    if (text.startsWith("_") || text.contains("__")) {
      reduced.add(text);
      continue;
    }
    // Skip tokens that are not actual words.
    Token.Type type = token.getType();
    if (type == Token.Type.Mention
        || type == Token.Type.HashTag
        || type == Token.Type.URL
        || type == Token.Type.Punctuation
        || type == Token.Type.RomanNumeral
        || type == Token.Type.Time
        || type == Token.Type.UnknownWord
        || type == Token.Type.Unknown) {
      continue;
    }
    reduced.add(text);
  }
  return String.join(" ", reduced);
}
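A minimal usage sketch of the same filtering idea, assuming the zemberek-nlp dependency is on the classpath; the demo class and the input sentence are hypothetical:

import java.util.ArrayList;
import java.util.List;
import zemberek.tokenization.Token;
import zemberek.tokenization.TurkishTokenizer;

public class RemoveNonWordsDemo {
  public static void main(String[] args) {
    // The hashtag, URL and punctuation tokens should be dropped by the type check.
    String sentence = "Merhaba dünya! #zemberek http://example.com";
    List<Token> tokens = TurkishTokenizer.DEFAULT.tokenize(sentence);
    List<String> kept = new ArrayList<>();
    for (Token token : tokens) {
      Token.Type type = token.getType();
      if (type == Token.Type.HashTag
          || type == Token.Type.URL
          || type == Token.Type.Punctuation) {
        continue;
      }
      kept.add(token.getText());
    }
    System.out.println(String.join(" ", kept)); // expected: "Merhaba dünya"
  }
}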
Use of zemberek.tokenization.Token in project zemberek-nlp by ahmetaa.
From the class UnsupervisedKeyPhraseExtractor, method collectCorpusStatistics:
static CorpusStatistics collectCorpusStatistics(WebCorpus corpus) throws IOException {
  CorpusStatistics statistics = new CorpusStatistics(1_000_000);
  for (WebDocument document : corpus.getDocuments()) {
    Histogram<String> docHistogram = new Histogram<>();
    List<String> sentences = extractor.fromParagraphs(document.getLines());
    for (String sentence : sentences) {
      List<Token> tokens = lexer.tokenize(sentence);
      for (Token token : tokens) {
        // Skip token types that cannot be key-phrase words.
        if (!tokenTypeAccpetable(token)) {
          continue;
        }
        String s = normalize(token.getText());
        if (TurkishStopWords.DEFAULT.contains(s)) {
          continue;
        }
        docHistogram.add(s);
      }
    }
    statistics.termFrequencies.add(docHistogram);
    // A term contributes to document frequency only once per document.
    for (String s : docHistogram) {
      statistics.documentFrequencies.add(s);
    }
  }
  statistics.documentCount = corpus.documentCount();
  return statistics;
}
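The counts gathered here are exactly what a TF-IDF style scorer needs. A minimal sketch, assuming Histogram.getCount(...) from zemberek.core.collections and the CorpusStatistics field layout shown above; the tfIdf helper itself is hypothetical:

// Hypothetical helper: score a term for one document using the
// corpus-wide statistics gathered by collectCorpusStatistics.
static double tfIdf(String term, Histogram<String> docHistogram, CorpusStatistics statistics) {
  int tf = docHistogram.getCount(term);                    // term count in this document
  int df = statistics.documentFrequencies.getCount(term);  // documents containing the term
  if (tf == 0 || df == 0) {
    return 0;
  }
  // Standard log-scaled inverse document frequency.
  return tf * Math.log((double) statistics.documentCount / df);
}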
Use of zemberek.tokenization.Token in project zemberek-nlp by ahmetaa.
From the class UnsupervisedKeyPhraseExtractor, method collectGrams:
void collectGrams(List<Token> tokens, Histogram<Term> grams, int order, int offset) {
  for (int i = 0; i < tokens.size() - order; i++) {
    String[] words = new String[order];
    boolean fail = false;
    for (int j = 0; j < order; j++) {
      Token t = tokens.get(i + j);
      if (!tokenTypeAccpetable(t)) {
        fail = true;
        break;
      }
      String word = normalize(t.getText());
      if (TurkishStopWords.DEFAULT.contains(word)) {
        fail = true;
        break;
      }
      words[j] = word;
    }
    if (!fail) {
      Term t = new Term(words);
      int count = grams.add(t);
      if (count == 1) {
        // If this is the first time the gram is seen, record its first occurrence index.
        t.setFirstOccurrenceIndex(offset + i);
      }
    }
  }
}
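A sketch of how collectGrams might be driven over a whole document so that first-occurrence indices are document-global; this driver is hypothetical and assumes TurkishTokenizer.DEFAULT as in the first snippet:

// Hypothetical driver: collect 1-gram and 2-gram terms over a document,
// advancing 'offset' so first-occurrence indices refer to document positions.
void collectDocumentGrams(List<String> sentences, Histogram<Term> grams) {
  int offset = 0;
  for (String sentence : sentences) {
    List<Token> tokens = TurkishTokenizer.DEFAULT.tokenize(sentence);
    collectGrams(tokens, grams, 1, offset);
    collectGrams(tokens, grams, 2, offset);
    offset += tokens.size();
  }
}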
Use of zemberek.tokenization.Token in project zemberek-nlp by ahmetaa.
From the class AutomaticLabelingExperiment, method processContent:
public String processContent(TurkishMorphology analyzer, String content, boolean useRoots) {
  List<Token> docTokens = lexer.tokenize(content);
  List<String> reduced = new ArrayList<>(docTokens.size());
  for (Token token : docTokens) {
    // Skip numbers, punctuation and tokens the tokenizer cannot classify.
    Token.Type type = token.getType();
    if (type == Token.Type.PercentNumeral
        || type == Token.Type.Number
        || type == Token.Type.Punctuation
        || type == Token.Type.RomanNumeral
        || type == Token.Type.Time
        || type == Token.Type.UnknownWord
        || type == Token.Type.Unknown) {
      continue;
    }
    reduced.add(token.getText());
  }
  String joined = String.join(" ", reduced);
  if (useRoots) {
    SentenceAnalysis analysis = analyzer.analyzeAndDisambiguate(joined);
    List<String> res = new ArrayList<>();
    for (SentenceWordAnalysis e : analysis) {
      SingleAnalysis best = e.getBestAnalysis();
      if (best.isUnknown()) {
        // Keep the surface form when morphological analysis fails.
        res.add(e.getWordAnalysis().getInput());
        continue;
      }
      List<String> lemmas = best.getLemmas();
      if (lemmas.isEmpty()) {
        continue;
      }
      // Use the last (most derived) lemma as the root form.
      res.add(lemmas.get(lemmas.size() - 1));
    }
    joined = String.join(" ", res);
  }
  return joined.replaceAll("[']", "").toLowerCase(Turkish.LOCALE);
}
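A short usage sketch, assuming default morphology resources; the input string and the expected output are illustrative, not taken from the project:

TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
// With useRoots=true, "%50" and "!" should be dropped as PercentNumeral/Punctuation
// tokens and the remaining words reduced to lemmas, so the result should be
// roughly "kitap indirim".
String processed = processContent(morphology, "Kitaplardan %50 indirim!", true);
System.out.println(processed);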
Use of zemberek.tokenization.Token in project zemberek-nlp by ahmetaa.
From the class _MorphologicalAmbiguityResolverExperiment, method collect:
private List<SingleAnalysisSentence> collect(Path p, int maxAnalysisCount) throws IOException {
  List<String> sentences = getSentences(p);
  TurkishMorphology analyzer = TurkishMorphology.createWithDefaults();
  int tokenCount = 0;
  int sentenceCount = 0;
  List<SingleAnalysisSentence> result = new ArrayList<>();
  for (String sentence : sentences) {
    // Normalize whitespace, soft hyphens and ellipsis characters.
    sentence = sentence.replaceAll("\\s+|\\u00a0", " ");
    sentence = sentence.replaceAll("[\\u00ad]", "");
    sentence = sentence.replaceAll("[…]", "...");
    List<Single> singleAnalysisWords = new ArrayList<>();
    List<Token> tokens = TurkishTokenizer.DEFAULT.tokenize(sentence);
    boolean failed = false;
    int i = 0;
    for (Token token : tokens) {
      tokenCount++;
      String rawWord = token.getText();
      String word = Character.isUpperCase(rawWord.charAt(0))
          ? Turkish.capitalize(rawWord)
          : rawWord.toLowerCase(Turkish.LOCALE);
      // Cache analyses; the same surface form is usually seen many times.
      WordAnalysis results;
      if (cache.containsKey(word)) {
        results = cache.get(word);
      } else {
        results = analyzer.analyze(word);
        cache.put(word, results);
      }
      if (results.analysisCount() == 0) {
        if (Strings.containsNone(word, "0123456789-.")) {
          failedWords.add(word);
        }
      }
      if (results.analysisCount() < 1 || results.analysisCount() > maxAnalysisCount) {
        failed = true;
        break;
      } else {
        // Drop proper-noun analyses for words that start with a lowercase letter.
        List<SingleAnalysis> filtered = results.stream()
            .filter(s -> !(s.getDictionaryItem().secondaryPos == SecondaryPos.ProperNoun
                && Character.isLowerCase(rawWord.charAt(0))))
            .collect(Collectors.toList());
        if (filtered.isEmpty()) {
          failed = true;
          break;
        }
        singleAnalysisWords.add(new Single(word, i, results.copyFor(filtered)));
        i++;
      }
    }
    if (!failed) {
      result.add(new SingleAnalysisSentence(sentence, singleAnalysisWords));
    }
    sentenceCount++;
    if (sentenceCount % 2000 == 0) {
      Log.info("%d sentences %d tokens analyzed. %d found", sentenceCount, tokenCount, result.size());
    }
  }
  return result;
}
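The method relies on cache and failedWords fields defined elsewhere in the class; plausible declarations, inferred from the usage above rather than copied from the zemberek source:

// Inferred from usage: word -> cached morphological analyses, plus the set
// of words the analyzer produced no analysis for.
private Map<String, WordAnalysis> cache = new HashMap<>();
private Set<String> failedWords = new HashSet<>();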