Use of zemberek.tokenization.Token.Type in the project zemberek-nlp by ahmetaa.
From the class TurkishTokenizer, method getAllTokens.
/**
 * Reads tokens from the given lexer until it reports EOF and returns the converted ones.
 *
 * <p>Each ANTLR token is first mapped to a {@code Token.Type} via {@code convertType}; tokens
 * whose type is reported as ignored by {@code typeIgnored} are left out of the result.
 *
 * @param lexer the ANTLR lexer to pull tokens from
 * @return the converted tokens, in the order the lexer produced them
 */
private List<Token> getAllTokens(Lexer lexer) {
  List<Token> collected = new ArrayList<>();
  org.antlr.v4.runtime.Token raw = lexer.nextToken();
  while (raw.getType() != org.antlr.v4.runtime.Token.EOF) {
    // Only tokens whose converted type is not ignored end up in the result.
    if (!typeIgnored(convertType(raw))) {
      collected.add(convert(raw));
    }
    raw = lexer.nextToken();
  }
  return collected;
}
Use of zemberek.tokenization.Token.Type in the project zemberek-nlp by ahmetaa.
From the class ClassificationExampleBase, method removeNonWords.
/**
 * Tokenizes the sentence with {@code TurkishTokenizer.DEFAULT} and rebuilds it from the tokens
 * that are kept, joined by single spaces.
 *
 * <p>A token whose text starts with "_" or contains "__" is always kept as-is (the original
 * author describes these as label and ending words). Any other token is dropped when its type is
 * one of Mention, HashTag, URL, Punctuation, RomanNumeral, Time, UnknownWord or Unknown.
 *
 * @param sentence the text to filter
 * @return the texts of the kept tokens joined with a single space
 */
protected String removeNonWords(String sentence) {
  List<Token> tokens = TurkishTokenizer.DEFAULT.tokenize(sentence);
  List<String> kept = new ArrayList<>(tokens.size());
  for (Token current : tokens) {
    String surface = current.getText();
    // Marker tokens bypass the type filter entirely and are preserved verbatim.
    boolean isMarker = surface.startsWith("_") || surface.contains("__");
    if (!isMarker) {
      Token.Type kind = current.getType();
      boolean nonWord = kind == Token.Type.Mention
          || kind == Token.Type.HashTag
          || kind == Token.Type.URL
          || kind == Token.Type.Punctuation
          || kind == Token.Type.RomanNumeral
          || kind == Token.Type.Time
          || kind == Token.Type.UnknownWord
          || kind == Token.Type.Unknown;
      if (nonWord) {
        continue;
      }
    }
    kept.add(surface);
  }
  return String.join(" ", kept);
}